mirror of
https://github.com/prometheus/prometheus.git
synced 2026-05-28 04:02:21 -04:00
#14173 introduced an optimisation to better handle regex patterns like .*-.*-.*. It identifies strings the pattern cannot possibly match (because they do not contain all of the literal values) and returns false from MatchString early. However, if the string does contain all literal values, then the Go regex engine is used to confirm that the string does match the pattern. But this is not necessary in the case where the start and end of the pattern is .* and everything in between is either a literal or .*: if the string contains all of the literals in order, then it matches the pattern, and invoking Go's regex engine to confirm this is unnecessary and quite slow. * Add some more test cases * Add benchmark, since existing benchmark doesn't show much impact given most of the random test strings will not match the patterns. Signed-off-by: Charles Korn <charles.korn@grafana.com>
1143 lines
34 KiB
Go
1143 lines
34 KiB
Go
// Copyright The Prometheus Authors
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package labels
|
|
|
|
import (
|
|
"slices"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
|
|
"github.com/grafana/regexp"
|
|
"github.com/grafana/regexp/syntax"
|
|
"golang.org/x/text/unicode/norm"
|
|
)
|
|
|
|
const (
|
|
maxSetMatches = 256
|
|
|
|
// The minimum number of alternate values a regex should have to trigger
|
|
// the optimization done by optimizeEqualOrPrefixStringMatchers() and so use a map
|
|
// to match values instead of iterating over a list. This value has
|
|
// been computed running BenchmarkOptimizeEqualStringMatchers.
|
|
minEqualMultiStringMatcherMapThreshold = 16
|
|
)
|
|
|
|
type FastRegexMatcher struct {
|
|
// Under some conditions, re is nil because the expression is never parsed.
|
|
// We store the original string to be able to return it in GetRegexString().
|
|
reString string
|
|
re *regexp.Regexp
|
|
|
|
setMatches []string
|
|
stringMatcher StringMatcher
|
|
prefix string
|
|
suffix string
|
|
contains []string
|
|
|
|
// matchString is the "compiled" function to run by MatchString().
|
|
matchString func(string) bool
|
|
}
|
|
|
|
func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
|
|
m := &FastRegexMatcher{
|
|
reString: v,
|
|
}
|
|
|
|
m.stringMatcher, m.setMatches = optimizeAlternatingLiterals(v)
|
|
if m.stringMatcher != nil {
|
|
// If we already have a string matcher, we don't need to parse the regex
|
|
// or compile the matchString function. This also avoids the behavior in
|
|
// compileMatchStringFunction where it prefers to use setMatches when
|
|
// available, even if the string matcher is faster.
|
|
m.matchString = m.stringMatcher.Matches
|
|
} else {
|
|
parsed, err := syntax.Parse(v, syntax.Perl|syntax.DotNL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
m.re, err = regexp.Compile("^(?s:" + parsed.String() + ")$")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if parsed.Op == syntax.OpConcat {
|
|
m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed)
|
|
}
|
|
if matches, caseSensitive := findSetMatches(parsed); caseSensitive {
|
|
m.setMatches = matches
|
|
}
|
|
|
|
// Check if we have a pattern like .*-.*-.*.
|
|
// If so, then we can rely on the containsInOrder check in compileMatchStringFunction,
|
|
// so no further inspection of the string is required.
|
|
// We can't do this in stringMatcherFromRegexpInternal as we only want to apply this
|
|
// if the top-level pattern satisfies this requirement.
|
|
if isSimpleConcatenationPattern(parsed) {
|
|
m.stringMatcher = trueMatcher{}
|
|
} else {
|
|
m.stringMatcher = stringMatcherFromRegexp(parsed)
|
|
}
|
|
|
|
m.matchString = m.compileMatchStringFunction()
|
|
}
|
|
|
|
return m, nil
|
|
}
|
|
|
|
// compileMatchStringFunction returns the function to run by MatchString().
|
|
func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool {
|
|
// If the only optimization available is the string matcher, then we can just run it.
|
|
if len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && len(m.contains) == 0 && m.stringMatcher != nil {
|
|
return m.stringMatcher.Matches
|
|
}
|
|
|
|
return func(s string) bool {
|
|
if len(m.setMatches) != 0 {
|
|
return slices.Contains(m.setMatches, s)
|
|
}
|
|
if m.prefix != "" && !strings.HasPrefix(s, m.prefix) {
|
|
return false
|
|
}
|
|
if m.suffix != "" && !strings.HasSuffix(s, m.suffix) {
|
|
return false
|
|
}
|
|
if len(m.contains) > 0 && !containsInOrder(s, m.contains) {
|
|
return false
|
|
}
|
|
if m.stringMatcher != nil {
|
|
return m.stringMatcher.Matches(s)
|
|
}
|
|
return m.re.MatchString(s)
|
|
}
|
|
}
|
|
|
|
// IsOptimized returns true if any fast-path optimization is applied to the
|
|
// regex matcher.
|
|
func (m *FastRegexMatcher) IsOptimized() bool {
|
|
return len(m.setMatches) > 0 || m.stringMatcher != nil || m.prefix != "" || m.suffix != "" || len(m.contains) > 0
|
|
}
|
|
|
|
// findSetMatches extract equality matches from a regexp.
|
|
// Returns nil if we can't replace the regexp by only equality matchers or the regexp contains
|
|
// a mix of case sensitive and case insensitive matchers.
|
|
func findSetMatches(re *syntax.Regexp) (matches []string, caseSensitive bool) {
|
|
clearBeginEndText(re)
|
|
|
|
return findSetMatchesInternal(re, "")
|
|
}
|
|
|
|
func findSetMatchesInternal(re *syntax.Regexp, base string) (matches []string, caseSensitive bool) {
|
|
switch re.Op {
|
|
case syntax.OpBeginText:
|
|
// Correctly handling the begin text operator inside a regex is tricky,
|
|
// so in this case we fallback to the regex engine.
|
|
return nil, false
|
|
case syntax.OpEndText:
|
|
// Correctly handling the end text operator inside a regex is tricky,
|
|
// so in this case we fallback to the regex engine.
|
|
return nil, false
|
|
case syntax.OpLiteral:
|
|
return []string{base + string(re.Rune)}, isCaseSensitive(re)
|
|
case syntax.OpEmptyMatch:
|
|
if base != "" {
|
|
return []string{base}, isCaseSensitive(re)
|
|
}
|
|
case syntax.OpAlternate:
|
|
return findSetMatchesFromAlternate(re, base)
|
|
case syntax.OpCapture:
|
|
clearCapture(re)
|
|
return findSetMatchesInternal(re, base)
|
|
case syntax.OpConcat:
|
|
return findSetMatchesFromConcat(re, base)
|
|
case syntax.OpCharClass:
|
|
if len(re.Rune)%2 != 0 {
|
|
return nil, false
|
|
}
|
|
var matches []string
|
|
var totalSet int
|
|
for i := 0; i+1 < len(re.Rune); i += 2 {
|
|
totalSet += int(re.Rune[i+1]-re.Rune[i]) + 1
|
|
}
|
|
// limits the total characters that can be used to create matches.
|
|
// In some case like negation [^0-9] a lot of possibilities exists and that
|
|
// can create thousands of possible matches at which points we're better off using regexp.
|
|
if totalSet > maxSetMatches {
|
|
return nil, false
|
|
}
|
|
for i := 0; i+1 < len(re.Rune); i += 2 {
|
|
lo, hi := re.Rune[i], re.Rune[i+1]
|
|
for c := lo; c <= hi; c++ {
|
|
matches = append(matches, base+string(c))
|
|
}
|
|
}
|
|
return matches, isCaseSensitive(re)
|
|
default:
|
|
return nil, false
|
|
}
|
|
return nil, false
|
|
}
|
|
|
|
func findSetMatchesFromConcat(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) {
|
|
if len(re.Sub) == 0 {
|
|
return nil, false
|
|
}
|
|
clearCapture(re.Sub...)
|
|
|
|
matches = []string{base}
|
|
|
|
for i := 0; i < len(re.Sub); i++ {
|
|
var newMatches []string
|
|
for j, b := range matches {
|
|
m, caseSensitive := findSetMatchesInternal(re.Sub[i], b)
|
|
if m == nil {
|
|
return nil, false
|
|
}
|
|
if tooManyMatches(newMatches, m...) {
|
|
return nil, false
|
|
}
|
|
|
|
// All matches must have the same case sensitivity. If it's the first set of matches
|
|
// returned, we store its sensitivity as the expected case, and then we'll check all
|
|
// other ones.
|
|
if i == 0 && j == 0 {
|
|
matchesCaseSensitive = caseSensitive
|
|
}
|
|
if matchesCaseSensitive != caseSensitive {
|
|
return nil, false
|
|
}
|
|
|
|
newMatches = append(newMatches, m...)
|
|
}
|
|
matches = newMatches
|
|
}
|
|
|
|
return matches, matchesCaseSensitive
|
|
}
|
|
|
|
func findSetMatchesFromAlternate(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) {
|
|
for i, sub := range re.Sub {
|
|
found, caseSensitive := findSetMatchesInternal(sub, base)
|
|
if found == nil {
|
|
return nil, false
|
|
}
|
|
if tooManyMatches(matches, found...) {
|
|
return nil, false
|
|
}
|
|
|
|
// All matches must have the same case sensitivity. If it's the first set of matches
|
|
// returned, we store its sensitivity as the expected case, and then we'll check all
|
|
// other ones.
|
|
if i == 0 {
|
|
matchesCaseSensitive = caseSensitive
|
|
}
|
|
if matchesCaseSensitive != caseSensitive {
|
|
return nil, false
|
|
}
|
|
|
|
matches = append(matches, found...)
|
|
}
|
|
|
|
return matches, matchesCaseSensitive
|
|
}
|
|
|
|
// clearCapture removes capture operation as they are not used for matching.
|
|
func clearCapture(regs ...*syntax.Regexp) {
|
|
for _, r := range regs {
|
|
// Iterate on the regexp because capture groups could be nested.
|
|
for r.Op == syntax.OpCapture {
|
|
*r = *r.Sub[0]
|
|
}
|
|
}
|
|
}
|
|
|
|
// clearBeginEndText removes the begin and end text from the regexp. Prometheus regexp are anchored to the beginning and end of the string.
|
|
func clearBeginEndText(re *syntax.Regexp) {
|
|
// Do not clear begin/end text from an alternate operator because it could
|
|
// change the actual regexp properties.
|
|
if re.Op == syntax.OpAlternate {
|
|
return
|
|
}
|
|
|
|
if len(re.Sub) == 0 {
|
|
return
|
|
}
|
|
if len(re.Sub) == 1 {
|
|
if re.Sub[0].Op == syntax.OpBeginText || re.Sub[0].Op == syntax.OpEndText {
|
|
// We need to remove this element. Since it's the only one, we convert into a matcher of an empty string.
|
|
// OpEmptyMatch is regexp's nop operator.
|
|
re.Op = syntax.OpEmptyMatch
|
|
re.Sub = nil
|
|
return
|
|
}
|
|
}
|
|
if re.Sub[0].Op == syntax.OpBeginText {
|
|
re.Sub = re.Sub[1:]
|
|
}
|
|
if re.Sub[len(re.Sub)-1].Op == syntax.OpEndText {
|
|
re.Sub = re.Sub[:len(re.Sub)-1]
|
|
}
|
|
}
|
|
|
|
// isCaseInsensitive tells if a regexp is case insensitive.
|
|
// The flag should be check at each level of the syntax tree.
|
|
func isCaseInsensitive(reg *syntax.Regexp) bool {
|
|
return (reg.Flags & syntax.FoldCase) != 0
|
|
}
|
|
|
|
// isCaseSensitive tells if a regexp is case sensitive.
|
|
// The flag should be check at each level of the syntax tree.
|
|
func isCaseSensitive(reg *syntax.Regexp) bool {
|
|
return !isCaseInsensitive(reg)
|
|
}
|
|
|
|
// tooManyMatches guards against creating too many set matches.
|
|
func tooManyMatches(matches []string, added ...string) bool {
|
|
return len(matches)+len(added) > maxSetMatches
|
|
}
|
|
|
|
func (m *FastRegexMatcher) MatchString(s string) bool {
|
|
return m.matchString(s)
|
|
}
|
|
|
|
func (m *FastRegexMatcher) SetMatches() []string {
|
|
// IMPORTANT: always return a copy, otherwise if the caller manipulate this slice it will
|
|
// also get manipulated in the cached FastRegexMatcher instance.
|
|
return slices.Clone(m.setMatches)
|
|
}
|
|
|
|
func (m *FastRegexMatcher) GetRegexString() string {
|
|
return m.reString
|
|
}
|
|
|
|
// optimizeAlternatingLiterals optimizes a regex of the form
|
|
//
|
|
// `literal1|literal2|literal3|...`
|
|
//
|
|
// this function returns an optimized StringMatcher or nil if the regex
|
|
// cannot be optimized in this way, and a list of setMatches up to maxSetMatches.
|
|
func optimizeAlternatingLiterals(s string) (StringMatcher, []string) {
|
|
if len(s) == 0 {
|
|
return emptyStringMatcher{}, nil
|
|
}
|
|
|
|
estimatedAlternates := strings.Count(s, "|") + 1
|
|
|
|
// If there are no alternates, check if the string is a literal
|
|
if estimatedAlternates == 1 {
|
|
if regexp.QuoteMeta(s) == s {
|
|
return &equalStringMatcher{s: s, caseSensitive: true}, []string{s}
|
|
}
|
|
return nil, nil
|
|
}
|
|
|
|
multiMatcher := newEqualMultiStringMatcher(true, estimatedAlternates, 0, 0)
|
|
|
|
for end := strings.IndexByte(s, '|'); end > -1; end = strings.IndexByte(s, '|') {
|
|
// Split the string into the next literal and the remainder
|
|
subMatch := s[:end]
|
|
s = s[end+1:]
|
|
|
|
// break if any of the submatches are not literals
|
|
if regexp.QuoteMeta(subMatch) != subMatch {
|
|
return nil, nil
|
|
}
|
|
|
|
multiMatcher.add(subMatch)
|
|
}
|
|
|
|
// break if the remainder is not a literal
|
|
if regexp.QuoteMeta(s) != s {
|
|
return nil, nil
|
|
}
|
|
multiMatcher.add(s)
|
|
|
|
return multiMatcher, multiMatcher.setMatches()
|
|
}
|
|
|
|
// optimizeConcatRegex returns literal prefix/suffix text that can be safely
|
|
// checked against the label value before running the regexp matcher.
|
|
func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix string, contains []string) {
|
|
sub := r.Sub
|
|
clearCapture(sub...)
|
|
|
|
// We can safely remove begin and end text matchers respectively
|
|
// at the beginning and end of the regexp.
|
|
if len(sub) > 0 && sub[0].Op == syntax.OpBeginText {
|
|
sub = sub[1:]
|
|
}
|
|
if len(sub) > 0 && sub[len(sub)-1].Op == syntax.OpEndText {
|
|
sub = sub[:len(sub)-1]
|
|
}
|
|
|
|
if len(sub) == 0 {
|
|
return prefix, suffix, contains
|
|
}
|
|
|
|
// Given Prometheus regex matchers are always anchored to the begin/end
|
|
// of the text, if the first/last operations are literals, we can safely
|
|
// treat them as prefix/suffix.
|
|
if sub[0].Op == syntax.OpLiteral && (sub[0].Flags&syntax.FoldCase) == 0 {
|
|
prefix = string(sub[0].Rune)
|
|
}
|
|
if last := len(sub) - 1; sub[last].Op == syntax.OpLiteral && (sub[last].Flags&syntax.FoldCase) == 0 {
|
|
suffix = string(sub[last].Rune)
|
|
}
|
|
|
|
// If contains any literal which is not a prefix/suffix, we keep track of
|
|
// all the ones which are case-sensitive.
|
|
for i := 1; i < len(sub)-1; i++ {
|
|
if sub[i].Op == syntax.OpLiteral && (sub[i].Flags&syntax.FoldCase) == 0 {
|
|
contains = append(contains, string(sub[i].Rune))
|
|
}
|
|
}
|
|
|
|
return prefix, suffix, contains
|
|
}
|
|
|
|
// StringMatcher is a matcher that matches a string in place of a regular expression.
|
|
type StringMatcher interface {
|
|
Matches(s string) bool
|
|
}
|
|
|
|
// stringMatcherFromRegexp attempts to replace a common regexp with a string matcher.
|
|
// It returns nil if the regexp is not supported.
|
|
func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
|
|
clearBeginEndText(re)
|
|
|
|
m := stringMatcherFromRegexpInternal(re)
|
|
m = optimizeEqualOrPrefixStringMatchers(m, minEqualMultiStringMatcherMapThreshold)
|
|
|
|
return m
|
|
}
|
|
|
|
func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
|
|
clearCapture(re)
|
|
|
|
switch re.Op {
|
|
case syntax.OpBeginText:
|
|
// Correctly handling the begin text operator inside a regex is tricky,
|
|
// so in this case we fallback to the regex engine.
|
|
return nil
|
|
case syntax.OpEndText:
|
|
// Correctly handling the end text operator inside a regex is tricky,
|
|
// so in this case we fallback to the regex engine.
|
|
return nil
|
|
case syntax.OpPlus:
|
|
if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
|
|
return nil
|
|
}
|
|
return &anyNonEmptyStringMatcher{
|
|
matchNL: re.Sub[0].Op == syntax.OpAnyChar,
|
|
}
|
|
case syntax.OpStar:
|
|
if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
|
|
return nil
|
|
}
|
|
|
|
// If the newline is valid, than this matcher literally match any string (even empty).
|
|
if re.Sub[0].Op == syntax.OpAnyChar {
|
|
return trueMatcher{}
|
|
}
|
|
|
|
// Any string is fine (including an empty one), as far as it doesn't contain any newline.
|
|
return anyStringWithoutNewlineMatcher{}
|
|
case syntax.OpQuest:
|
|
// Only optimize for ".?".
|
|
if len(re.Sub) != 1 || (re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL) {
|
|
return nil
|
|
}
|
|
|
|
return &zeroOrOneCharacterStringMatcher{
|
|
matchNL: re.Sub[0].Op == syntax.OpAnyChar,
|
|
}
|
|
case syntax.OpEmptyMatch:
|
|
return emptyStringMatcher{}
|
|
|
|
case syntax.OpLiteral:
|
|
return &equalStringMatcher{
|
|
s: string(re.Rune),
|
|
caseSensitive: !isCaseInsensitive(re),
|
|
}
|
|
case syntax.OpAlternate:
|
|
or := make([]StringMatcher, 0, len(re.Sub))
|
|
for _, sub := range re.Sub {
|
|
m := stringMatcherFromRegexpInternal(sub)
|
|
if m == nil {
|
|
return nil
|
|
}
|
|
or = append(or, m)
|
|
}
|
|
return orStringMatcher(or)
|
|
case syntax.OpConcat:
|
|
clearCapture(re.Sub...)
|
|
|
|
if len(re.Sub) == 0 {
|
|
return emptyStringMatcher{}
|
|
}
|
|
if len(re.Sub) == 1 {
|
|
return stringMatcherFromRegexpInternal(re.Sub[0])
|
|
}
|
|
|
|
var left, right StringMatcher
|
|
|
|
// Let's try to find if there's a first and last any matchers.
|
|
if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar || re.Sub[0].Op == syntax.OpQuest {
|
|
left = stringMatcherFromRegexpInternal(re.Sub[0])
|
|
if left == nil {
|
|
return nil
|
|
}
|
|
re.Sub = re.Sub[1:]
|
|
}
|
|
if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar || re.Sub[len(re.Sub)-1].Op == syntax.OpQuest {
|
|
right = stringMatcherFromRegexpInternal(re.Sub[len(re.Sub)-1])
|
|
if right == nil {
|
|
return nil
|
|
}
|
|
re.Sub = re.Sub[:len(re.Sub)-1]
|
|
}
|
|
|
|
matches, matchesCaseSensitive := findSetMatchesInternal(re, "")
|
|
|
|
if len(matches) == 0 && len(re.Sub) == 2 {
|
|
// We have not find fixed set matches. We look for other known cases that
|
|
// we can optimize.
|
|
switch {
|
|
// Prefix is literal.
|
|
case right == nil && re.Sub[0].Op == syntax.OpLiteral:
|
|
right = stringMatcherFromRegexpInternal(re.Sub[1])
|
|
if right != nil {
|
|
matches = []string{string(re.Sub[0].Rune)}
|
|
matchesCaseSensitive = !isCaseInsensitive(re.Sub[0])
|
|
}
|
|
|
|
// Suffix is literal.
|
|
case left == nil && re.Sub[1].Op == syntax.OpLiteral:
|
|
left = stringMatcherFromRegexpInternal(re.Sub[0])
|
|
if left != nil {
|
|
matches = []string{string(re.Sub[1].Rune)}
|
|
matchesCaseSensitive = !isCaseInsensitive(re.Sub[1])
|
|
}
|
|
}
|
|
}
|
|
|
|
// Ensure we've found some literals to match (optionally with a left and/or right matcher).
|
|
// If not, then this optimization doesn't trigger.
|
|
if len(matches) == 0 {
|
|
return nil
|
|
}
|
|
|
|
// Use the right (and best) matcher based on what we've found.
|
|
switch {
|
|
// No left and right matchers (only fixed set matches).
|
|
case left == nil && right == nil:
|
|
// if there's no any matchers on both side it's a concat of literals
|
|
or := make([]StringMatcher, 0, len(matches))
|
|
for _, match := range matches {
|
|
or = append(or, &equalStringMatcher{
|
|
s: match,
|
|
caseSensitive: matchesCaseSensitive,
|
|
})
|
|
}
|
|
return orStringMatcher(or)
|
|
|
|
// Right matcher with 1 fixed set match.
|
|
case left == nil && len(matches) == 1:
|
|
return newLiteralPrefixStringMatcher(matches[0], matchesCaseSensitive, right)
|
|
|
|
// Left matcher with 1 fixed set match.
|
|
case right == nil && len(matches) == 1:
|
|
return &literalSuffixStringMatcher{
|
|
left: left,
|
|
suffix: matches[0],
|
|
suffixCaseSensitive: matchesCaseSensitive,
|
|
}
|
|
|
|
// We found literals in the middle. We can trigger the fast path only if
|
|
// the matches are case sensitive because containsStringMatcher doesn't
|
|
// support case insensitive.
|
|
case matchesCaseSensitive:
|
|
return &containsStringMatcher{
|
|
substrings: matches,
|
|
left: left,
|
|
right: right,
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// isSimpleConcatenationPattern returns true if re contains only literals or wildcard matchers,
|
|
// and starts and ends with a wildcard matcher (eg. .*-.*-.*).
|
|
func isSimpleConcatenationPattern(re *syntax.Regexp) bool {
|
|
if re.Op != syntax.OpConcat {
|
|
return false
|
|
}
|
|
|
|
if len(re.Sub) < 2 {
|
|
return false
|
|
}
|
|
|
|
first := re.Sub[0]
|
|
last := re.Sub[len(re.Sub)-1]
|
|
if !isMatchAny(first) || !isMatchAny(last) {
|
|
return false
|
|
}
|
|
|
|
for _, re := range re.Sub[1 : len(re.Sub)-1] {
|
|
if !isMatchAny(re) && !isCaseSensitiveLiteral(re) {
|
|
return false
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
func isMatchAny(re *syntax.Regexp) bool {
|
|
return re.Op == syntax.OpStar && re.Sub[0].Op == syntax.OpAnyChar
|
|
}
|
|
|
|
func isCaseSensitiveLiteral(re *syntax.Regexp) bool {
|
|
return re.Op == syntax.OpLiteral && isCaseSensitive(re)
|
|
}
|
|
|
|
// containsStringMatcher matches a string if it contains any of the substrings.
|
|
// If left and right are not nil, it's a contains operation where left and right must match.
|
|
// If left is nil, it's a hasPrefix operation and right must match.
|
|
// Finally, if right is nil it's a hasSuffix operation and left must match.
|
|
type containsStringMatcher struct {
|
|
// The matcher that must match the left side. Can be nil.
|
|
left StringMatcher
|
|
|
|
// At least one of these strings must match in the "middle", between left and right matchers.
|
|
substrings []string
|
|
|
|
// The matcher that must match the right side. Can be nil.
|
|
right StringMatcher
|
|
}
|
|
|
|
func (m *containsStringMatcher) Matches(s string) bool {
|
|
for _, substr := range m.substrings {
|
|
switch {
|
|
case m.right != nil && m.left != nil:
|
|
searchStartPos := 0
|
|
|
|
for {
|
|
pos := strings.Index(s[searchStartPos:], substr)
|
|
if pos < 0 {
|
|
break
|
|
}
|
|
|
|
// Since we started searching from searchStartPos, we have to add that offset
|
|
// to get the actual position of the substring inside the text.
|
|
pos += searchStartPos
|
|
|
|
// If both the left and right matchers match, then we can stop searching because
|
|
// we've found a match.
|
|
if m.left.Matches(s[:pos]) && m.right.Matches(s[pos+len(substr):]) {
|
|
return true
|
|
}
|
|
|
|
// Continue searching for another occurrence of the substring inside the text.
|
|
searchStartPos = pos + 1
|
|
}
|
|
case m.left != nil:
|
|
// If we have to check for characters on the left then we need to match a suffix.
|
|
if strings.HasSuffix(s, substr) && m.left.Matches(s[:len(s)-len(substr)]) {
|
|
return true
|
|
}
|
|
case m.right != nil:
|
|
if strings.HasPrefix(s, substr) && m.right.Matches(s[len(substr):]) {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func newLiteralPrefixStringMatcher(prefix string, prefixCaseSensitive bool, right StringMatcher) StringMatcher {
|
|
if prefixCaseSensitive {
|
|
return &literalPrefixSensitiveStringMatcher{
|
|
prefix: prefix,
|
|
right: right,
|
|
}
|
|
}
|
|
|
|
return &literalPrefixInsensitiveStringMatcher{
|
|
prefix: prefix,
|
|
right: right,
|
|
}
|
|
}
|
|
|
|
// literalPrefixSensitiveStringMatcher matches a string with the given literal case-sensitive prefix and right side matcher.
|
|
type literalPrefixSensitiveStringMatcher struct {
|
|
prefix string
|
|
|
|
// The matcher that must match the right side. Can be nil.
|
|
right StringMatcher
|
|
}
|
|
|
|
func (m *literalPrefixSensitiveStringMatcher) Matches(s string) bool {
|
|
if !strings.HasPrefix(s, m.prefix) {
|
|
return false
|
|
}
|
|
|
|
// Ensure the right side matches.
|
|
return m.right.Matches(s[len(m.prefix):])
|
|
}
|
|
|
|
// literalPrefixInsensitiveStringMatcher matches a string with the given literal case-insensitive prefix and right side matcher.
|
|
type literalPrefixInsensitiveStringMatcher struct {
|
|
prefix string
|
|
|
|
// The matcher that must match the right side. Can be nil.
|
|
right StringMatcher
|
|
}
|
|
|
|
func (m *literalPrefixInsensitiveStringMatcher) Matches(s string) bool {
|
|
if !hasPrefixCaseInsensitive(s, m.prefix) {
|
|
return false
|
|
}
|
|
|
|
// Ensure the right side matches.
|
|
return m.right.Matches(s[len(m.prefix):])
|
|
}
|
|
|
|
// literalSuffixStringMatcher matches a string with the given literal suffix and left side matcher.
|
|
type literalSuffixStringMatcher struct {
|
|
// The matcher that must match the left side. Can be nil.
|
|
left StringMatcher
|
|
|
|
suffix string
|
|
suffixCaseSensitive bool
|
|
}
|
|
|
|
func (m *literalSuffixStringMatcher) Matches(s string) bool {
|
|
// Ensure the suffix matches.
|
|
if m.suffixCaseSensitive && !strings.HasSuffix(s, m.suffix) {
|
|
return false
|
|
}
|
|
if !m.suffixCaseSensitive && !hasSuffixCaseInsensitive(s, m.suffix) {
|
|
return false
|
|
}
|
|
|
|
// Ensure the left side matches.
|
|
return m.left.Matches(s[:len(s)-len(m.suffix)])
|
|
}
|
|
|
|
// emptyStringMatcher matches an empty string.
|
|
type emptyStringMatcher struct{}
|
|
|
|
func (emptyStringMatcher) Matches(s string) bool {
|
|
return len(s) == 0
|
|
}
|
|
|
|
// orStringMatcher matches any of the sub-matchers.
|
|
type orStringMatcher []StringMatcher
|
|
|
|
func (m orStringMatcher) Matches(s string) bool {
|
|
for _, matcher := range m {
|
|
if matcher.Matches(s) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// equalStringMatcher matches a string exactly and support case insensitive.
|
|
type equalStringMatcher struct {
|
|
s string
|
|
caseSensitive bool
|
|
}
|
|
|
|
func (m *equalStringMatcher) Matches(s string) bool {
|
|
if m.caseSensitive {
|
|
return m.s == s
|
|
}
|
|
return strings.EqualFold(m.s, s)
|
|
}
|
|
|
|
type multiStringMatcherBuilder interface {
|
|
StringMatcher
|
|
add(s string)
|
|
addPrefix(prefix string, prefixCaseSensitive bool, matcher StringMatcher)
|
|
setMatches() []string
|
|
}
|
|
|
|
func newEqualMultiStringMatcher(caseSensitive bool, estimatedSize, estimatedPrefixes, minPrefixLength int) multiStringMatcherBuilder {
|
|
// If the estimated size is low enough, it's faster to use a slice instead of a map.
|
|
if estimatedSize < minEqualMultiStringMatcherMapThreshold && estimatedPrefixes == 0 {
|
|
return &equalMultiStringSliceMatcher{caseSensitive: caseSensitive, values: make([]string, 0, estimatedSize)}
|
|
}
|
|
|
|
return &equalMultiStringMapMatcher{
|
|
values: make(map[string]struct{}, estimatedSize),
|
|
prefixes: make(map[string][]StringMatcher, estimatedPrefixes),
|
|
minPrefixLen: minPrefixLength,
|
|
caseSensitive: caseSensitive,
|
|
}
|
|
}
|
|
|
|
// equalMultiStringSliceMatcher matches a string exactly against a slice of valid values.
|
|
type equalMultiStringSliceMatcher struct {
|
|
values []string
|
|
|
|
caseSensitive bool
|
|
}
|
|
|
|
func (m *equalMultiStringSliceMatcher) add(s string) {
|
|
m.values = append(m.values, s)
|
|
}
|
|
|
|
func (*equalMultiStringSliceMatcher) addPrefix(string, bool, StringMatcher) {
|
|
panic("not implemented")
|
|
}
|
|
|
|
func (m *equalMultiStringSliceMatcher) setMatches() []string {
|
|
return m.values
|
|
}
|
|
|
|
func (m *equalMultiStringSliceMatcher) Matches(s string) bool {
|
|
if m.caseSensitive {
|
|
return slices.Contains(m.values, s)
|
|
}
|
|
for _, v := range m.values {
|
|
if strings.EqualFold(s, v) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// equalMultiStringMapMatcher matches a string exactly against a map of valid values
|
|
// or against a set of prefix matchers.
|
|
type equalMultiStringMapMatcher struct {
|
|
// values contains values to match a string against. If the matching is case insensitive,
|
|
// the values here must be lowercase.
|
|
values map[string]struct{}
|
|
// prefixes maps strings, all of length minPrefixLen, to sets of matchers to check the rest of the string.
|
|
// If the matching is case insensitive, prefixes are all lowercase.
|
|
prefixes map[string][]StringMatcher
|
|
// minPrefixLen can be zero, meaning there are no prefix matchers.
|
|
minPrefixLen int
|
|
caseSensitive bool
|
|
}
|
|
|
|
func (m *equalMultiStringMapMatcher) add(s string) {
|
|
if !m.caseSensitive {
|
|
s = toNormalisedLower(s, nil) // Don't pass a stack buffer here - it will always escape to heap.
|
|
}
|
|
|
|
m.values[s] = struct{}{}
|
|
}
|
|
|
|
func (m *equalMultiStringMapMatcher) addPrefix(prefix string, prefixCaseSensitive bool, matcher StringMatcher) {
|
|
if m.minPrefixLen == 0 {
|
|
panic("addPrefix called when no prefix length defined")
|
|
}
|
|
if len(prefix) < m.minPrefixLen {
|
|
panic("addPrefix called with a too short prefix")
|
|
}
|
|
if m.caseSensitive != prefixCaseSensitive {
|
|
panic("addPrefix called with a prefix whose case sensitivity is different than the expected one")
|
|
}
|
|
|
|
s := prefix[:m.minPrefixLen]
|
|
if !m.caseSensitive {
|
|
s = strings.ToLower(s)
|
|
}
|
|
|
|
m.prefixes[s] = append(m.prefixes[s], matcher)
|
|
}
|
|
|
|
func (m *equalMultiStringMapMatcher) setMatches() []string {
|
|
if len(m.values) >= maxSetMatches || len(m.prefixes) > 0 {
|
|
return nil
|
|
}
|
|
|
|
matches := make([]string, 0, len(m.values))
|
|
for s := range m.values {
|
|
matches = append(matches, s)
|
|
}
|
|
return matches
|
|
}
|
|
|
|
func (m *equalMultiStringMapMatcher) Matches(s string) bool {
|
|
if len(m.values) > 0 {
|
|
sNorm := s
|
|
var a [32]byte
|
|
if !m.caseSensitive {
|
|
sNorm = toNormalisedLower(s, a[:])
|
|
}
|
|
if _, ok := m.values[sNorm]; ok {
|
|
return true
|
|
}
|
|
}
|
|
|
|
if m.minPrefixLen > 0 && len(s) >= m.minPrefixLen {
|
|
prefix := s[:m.minPrefixLen]
|
|
var a [32]byte
|
|
if !m.caseSensitive {
|
|
prefix = toNormalisedLower(s[:m.minPrefixLen], a[:])
|
|
}
|
|
for _, matcher := range m.prefixes[prefix] {
|
|
if matcher.Matches(s) {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// toNormalisedLower normalise the input string using "Unicode Normalization Form D" and then convert
|
|
// it to lower case.
|
|
func toNormalisedLower(s string, a []byte) string {
|
|
for i := 0; i < len(s); i++ {
|
|
c := s[i]
|
|
if c >= utf8.RuneSelf {
|
|
return strings.Map(unicode.ToLower, norm.NFKD.String(s))
|
|
}
|
|
if 'A' <= c && c <= 'Z' {
|
|
return toNormalisedLowerSlow(s, i, a)
|
|
}
|
|
}
|
|
return s
|
|
}
|
|
|
|
// toNormalisedLowerSlow is split from toNormalisedLower because having a call
|
|
// to `copy` slows it down even when it is not called.
|
|
func toNormalisedLowerSlow(s string, i int, a []byte) string {
|
|
var buf []byte
|
|
if cap(a) > len(s) {
|
|
buf = a[:len(s)]
|
|
copy(buf, s)
|
|
} else {
|
|
buf = []byte(s)
|
|
}
|
|
for ; i < len(s); i++ {
|
|
c := s[i]
|
|
if c >= utf8.RuneSelf {
|
|
return strings.Map(unicode.ToLower, norm.NFKD.String(s))
|
|
}
|
|
if 'A' <= c && c <= 'Z' {
|
|
buf[i] = c + 'a' - 'A'
|
|
}
|
|
}
|
|
return yoloString(buf)
|
|
}
|
|
|
|
// anyStringWithoutNewlineMatcher is a stringMatcher which matches any string
|
|
// (including an empty one) as far as it doesn't contain any newline character.
|
|
type anyStringWithoutNewlineMatcher struct{}
|
|
|
|
func (anyStringWithoutNewlineMatcher) Matches(s string) bool {
|
|
// We need to make sure it doesn't contain a newline. Since the newline is
|
|
// an ASCII character, we can use strings.IndexByte().
|
|
return strings.IndexByte(s, '\n') == -1
|
|
}
|
|
|
|
// anyNonEmptyStringMatcher is a stringMatcher which matches any non-empty string.
|
|
type anyNonEmptyStringMatcher struct {
|
|
matchNL bool
|
|
}
|
|
|
|
func (m *anyNonEmptyStringMatcher) Matches(s string) bool {
|
|
if m.matchNL {
|
|
// It's OK if the string contains a newline so we just need to make
|
|
// sure it's non-empty.
|
|
return len(s) > 0
|
|
}
|
|
|
|
// We need to make sure it non-empty and doesn't contain a newline.
|
|
// Since the newline is an ASCII character, we can use strings.IndexByte().
|
|
return len(s) > 0 && strings.IndexByte(s, '\n') == -1
|
|
}
|
|
|
|
// zeroOrOneCharacterStringMatcher is a StringMatcher which matches zero or one occurrence
|
|
// of any character. The newline character is matches only if matchNL is set to true.
|
|
type zeroOrOneCharacterStringMatcher struct {
|
|
matchNL bool
|
|
}
|
|
|
|
func (m *zeroOrOneCharacterStringMatcher) Matches(s string) bool {
|
|
// If there's more than one rune in the string, then it can't match.
|
|
if r, size := utf8.DecodeRuneInString(s); r == utf8.RuneError {
|
|
// Size is 0 for empty strings, 1 for invalid rune.
|
|
// Empty string matches, invalid rune matches if there isn't anything else.
|
|
return size == len(s)
|
|
} else if size < len(s) {
|
|
return false
|
|
}
|
|
|
|
// No need to check for the newline if the string is empty or matching a newline is OK.
|
|
if m.matchNL || len(s) == 0 {
|
|
return true
|
|
}
|
|
|
|
return s[0] != '\n'
|
|
}
|
|
|
|
// trueMatcher is a stringMatcher which matches any string (always returns true).
|
|
type trueMatcher struct{}
|
|
|
|
func (trueMatcher) Matches(string) bool {
|
|
return true
|
|
}
|
|
|
|
// optimizeEqualOrPrefixStringMatchers optimize a specific case where all matchers are made by an
|
|
// alternation (orStringMatcher) of strings checked for equality (equalStringMatcher) or
|
|
// with a literal prefix (literalPrefixSensitiveStringMatcher or literalPrefixInsensitiveStringMatcher).
|
|
//
|
|
// In this specific case, when we have many strings to match against we can use a map instead
|
|
// of iterating over the list of strings.
|
|
func optimizeEqualOrPrefixStringMatchers(input StringMatcher, threshold int) StringMatcher {
|
|
var (
|
|
caseSensitive bool
|
|
caseSensitiveSet bool
|
|
numValues int
|
|
numPrefixes int
|
|
minPrefixLength int
|
|
)
|
|
|
|
// Analyse the input StringMatcher to count the number of occurrences
|
|
// and ensure all of them have the same case sensitivity.
|
|
analyseEqualMatcherCallback := func(matcher *equalStringMatcher) bool {
|
|
// Ensure we don't have mixed case sensitivity.
|
|
if caseSensitiveSet && caseSensitive != matcher.caseSensitive {
|
|
return false
|
|
} else if !caseSensitiveSet {
|
|
caseSensitive = matcher.caseSensitive
|
|
caseSensitiveSet = true
|
|
}
|
|
|
|
numValues++
|
|
return true
|
|
}
|
|
|
|
analysePrefixMatcherCallback := func(prefix string, prefixCaseSensitive bool, _ StringMatcher) bool {
|
|
// Ensure we don't have mixed case sensitivity.
|
|
if caseSensitiveSet && caseSensitive != prefixCaseSensitive {
|
|
return false
|
|
} else if !caseSensitiveSet {
|
|
caseSensitive = prefixCaseSensitive
|
|
caseSensitiveSet = true
|
|
}
|
|
if numPrefixes == 0 || len(prefix) < minPrefixLength {
|
|
minPrefixLength = len(prefix)
|
|
}
|
|
|
|
numPrefixes++
|
|
return true
|
|
}
|
|
|
|
if !findEqualOrPrefixStringMatchers(input, analyseEqualMatcherCallback, analysePrefixMatcherCallback) {
|
|
return input
|
|
}
|
|
|
|
// If the number of values and prefixes found is less than the threshold, then we should skip the optimization.
|
|
if (numValues + numPrefixes) < threshold {
|
|
return input
|
|
}
|
|
|
|
// Parse again the input StringMatcher to extract all values and storing them.
|
|
// We can skip the case sensitivity check because we've already checked it and
|
|
// if the code reach this point then it means all matchers have the same case sensitivity.
|
|
multiMatcher := newEqualMultiStringMatcher(caseSensitive, numValues, numPrefixes, minPrefixLength)
|
|
|
|
// Ignore the return value because we already iterated over the input StringMatcher
|
|
// and it was all good.
|
|
findEqualOrPrefixStringMatchers(input, func(matcher *equalStringMatcher) bool {
|
|
multiMatcher.add(matcher.s)
|
|
return true
|
|
}, func(prefix string, _ bool, matcher StringMatcher) bool {
|
|
multiMatcher.addPrefix(prefix, caseSensitive, matcher)
|
|
return true
|
|
})
|
|
|
|
return multiMatcher
|
|
}
|
|
|
|
// findEqualOrPrefixStringMatchers analyze the input StringMatcher and calls the equalMatcherCallback for each
|
|
// equalStringMatcher found, and prefixMatcherCallback for each literalPrefixSensitiveStringMatcher and literalPrefixInsensitiveStringMatcher found.
|
|
//
|
|
// Returns true if and only if the input StringMatcher is *only* composed by an alternation of equalStringMatcher and/or
|
|
// literal prefix matcher. Returns false if prefixMatcherCallback is nil and a literal prefix matcher is encountered.
|
|
func findEqualOrPrefixStringMatchers(input StringMatcher, equalMatcherCallback func(matcher *equalStringMatcher) bool, prefixMatcherCallback func(prefix string, prefixCaseSensitive bool, matcher StringMatcher) bool) bool {
|
|
orInput, ok := input.(orStringMatcher)
|
|
if !ok {
|
|
return false
|
|
}
|
|
|
|
for _, m := range orInput {
|
|
switch casted := m.(type) {
|
|
case orStringMatcher:
|
|
if !findEqualOrPrefixStringMatchers(m, equalMatcherCallback, prefixMatcherCallback) {
|
|
return false
|
|
}
|
|
|
|
case *equalStringMatcher:
|
|
if !equalMatcherCallback(casted) {
|
|
return false
|
|
}
|
|
|
|
case *literalPrefixSensitiveStringMatcher:
|
|
if prefixMatcherCallback == nil || !prefixMatcherCallback(casted.prefix, true, casted) {
|
|
return false
|
|
}
|
|
|
|
case *literalPrefixInsensitiveStringMatcher:
|
|
if prefixMatcherCallback == nil || !prefixMatcherCallback(casted.prefix, false, casted) {
|
|
return false
|
|
}
|
|
|
|
default:
|
|
// It's not an equal or prefix string matcher, so we have to stop searching
|
|
// cause this optimization can't be applied.
|
|
return false
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
func hasPrefixCaseInsensitive(s, prefix string) bool {
|
|
return len(s) >= len(prefix) && strings.EqualFold(s[0:len(prefix)], prefix)
|
|
}
|
|
|
|
func hasSuffixCaseInsensitive(s, suffix string) bool {
|
|
return len(s) >= len(suffix) && strings.EqualFold(s[len(s)-len(suffix):], suffix)
|
|
}
|
|
|
|
func containsInOrder(s string, contains []string) bool {
|
|
// Optimization for the case we only have to look for 1 substring.
|
|
if len(contains) == 1 {
|
|
return strings.Contains(s, contains[0])
|
|
}
|
|
|
|
return containsInOrderMulti(s, contains)
|
|
}
|
|
|
|
func containsInOrderMulti(s string, contains []string) bool {
|
|
offset := 0
|
|
|
|
for _, substr := range contains {
|
|
at := strings.Index(s[offset:], substr)
|
|
if at == -1 {
|
|
return false
|
|
}
|
|
|
|
offset += at + len(substr)
|
|
}
|
|
|
|
return true
|
|
}
|