mirror of
https://github.com/postgres/postgres.git
synced 2026-03-10 18:28:35 -04:00
A bounded quantifier with m = n = 1 might be thought a no-op. But
according to our documentation (which traces back to Henry Spencer's
original man page) it still imposes greediness, or non-greediness in the
case of the non-greedy variant "{1,1}?", on whatever it's attached to.
This turns out not to work, though, because parseqatom() optimizes away
the m = n = 1 case without regard for whether it's supposed to change
the greediness of the argument RE.
We can fix this by just not applying the optimization when the greediness
needs to change; the subsequent general cases handle it fine.
The three cases in which we can still apply the optimization are
(a) no quantifier, or quantifier does not impose a preference;
(b) atom has no greediness property, implying it cannot match a
variable amount of text anyway; or
(c) quantifier's greediness is same as atom's.
Note that in most cases where one of these applies, we'd have exited
earlier in the "not a messy case" fast path. I think it's now only
possible to get to the optimization when the atom involves capturing
parentheses or a non-top-level backref.
Back-patch to all supported branches. I'd ordinarily be hesitant to
put a subtle behavioral change into back branches, but in this case
it's very hard to see a reason why somebody would write "{1,1}?" unless
they're trying to get the documented change-of-greediness behavior.
Discussion: https://postgr.es/m/5bb27a41-350d-37bf-901e-9d26f5592dd0@charter.net
141 lines
5.3 KiB
SQL
141 lines
5.3 KiB
SQL
--
-- Regular expression tests
--

-- Don't want to have to double backslashes in regexes
-- (with this setting a backslash inside an ordinary '...' literal is taken
-- literally, so escapes such as \1 and \w reach the regex engine unchanged)
set standard_conforming_strings = on;

-- Test simple quantified backrefs
-- \1 must repeat whatever single character ([bc]) captured; the column
-- alias (t or f) records the expected boolean result.
select 'bbbbb' ~ '^([bc])\1*$' as t;
select 'ccc' ~ '^([bc])\1*$' as t;
select 'xxx' ~ '^([bc])\1*$' as f;
select 'bbc' ~ '^([bc])\1*$' as f;
select 'b' ~ '^([bc])\1*$' as t;

-- Test quantified backref within a larger expression
-- The group ( \1) repeats "space + first word"; the same three inputs are
-- run with a \w+ capture and then with a .+ capture.  The column alias
-- (t or f) records the expected boolean result.
select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
select 'abc abc abd' ~ '^(.+)( \1)+$' as f;

-- Test some cases that crashed in 9.2beta1 due to pmatch[] array overrun
-- All three patterns nest capturing parentheses inside other capturing
-- parentheses; substring(... from pattern) returns the text matched by the
-- first parenthesized subexpression.
select substring('asd TO foo' from ' TO (([a-z0-9._]+|"([^"]+|"")+")+)');
select substring('a' from '((a))+');
select substring('a' from '((a)+)');

-- Test regexp_match()
-- Covers: empty pattern, plain match, no match (NULL result), capture
-- groups with the case-insensitive 'i' flag, and the unsupported 'g' flag.
select regexp_match('abc', '');
select regexp_match('abc', 'bc');
select regexp_match('abc', 'd') is null;
select regexp_match('abc', '(B)(c)', 'i');
select regexp_match('abc', 'Bd', 'ig'); -- error

-- Test lookahead constraints
-- (?=re) is a positive lookahead and (?!re) a negative one; neither
-- consumes input.  regexp_matches() returns no rows when nothing matches.
select regexp_matches('ab', 'a(?=b)b*');
select regexp_matches('a', 'a(?=b)b*');
select regexp_matches('abc', 'a(?=b)b*(?=c)c*');
select regexp_matches('ab', 'a(?=b)b*(?=c)c*');
select regexp_matches('ab', 'a(?!b)b*');
select regexp_matches('a', 'a(?!b)b*');
select regexp_matches('b', '(?=b)b');
select regexp_matches('a', '(?=b)b');

-- Test lookbehind constraints
-- (?<=re) is a positive lookbehind and (?<!re) a negative one; neither
-- consumes input.  The 'foobar' cases vary how much preceding text the
-- lookbehind requires immediately before the 'b'.
select regexp_matches('abb', '(?<=a)b*');
select regexp_matches('a', 'a(?<=a)b*');
select regexp_matches('abc', 'a(?<=a)b*(?<=b)c*');
select regexp_matches('ab', 'a(?<=a)b*(?<=b)c*');
select regexp_matches('ab', 'a*(?<!a)b*');
select regexp_matches('ab', 'a*(?<!a)b+');
select regexp_matches('b', 'a*(?<!a)b+');
select regexp_matches('a', 'a(?<!a)b*');
select regexp_matches('b', '(?<=b)b');
select regexp_matches('foobar', '(?<=f)b+');
select regexp_matches('foobar', '(?<=foo)b+');
select regexp_matches('foobar', '(?<=oo)b+');

-- Test optimization of single-chr-or-bracket-expression lookaround constraints
-- Every constraint here wraps just the bracket expression [xy]; the cases
-- cover positive and negative lookahead, then positive and negative
-- lookbehind, each with a matching and a non-matching neighbor character.
select 'xz' ~ 'x(?=[xy])';
select 'xy' ~ 'x(?=[xy])';
select 'xz' ~ 'x(?![xy])';
select 'xy' ~ 'x(?![xy])';
select 'x' ~ 'x(?![xy])';
select 'xyy' ~ '(?<=[xy])yy+';
select 'zyy' ~ '(?<=[xy])yy+';
select 'xyy' ~ '(?<![xy])yy+';
select 'zyy' ~ '(?<![xy])yy+';

-- Test conversion of regex patterns to indexable conditions
-- Only the plan is shown (explain does not execute the query); costs off
-- omits the cost estimates from the plan output.  The patterns range from
-- unanchored, to ^-anchored with a fixed prefix, to prefixes interrupted by
-- quantifiers, groups, alternation and a lookahead.
explain (costs off) select * from pg_proc where proname ~ 'abc';
explain (costs off) select * from pg_proc where proname ~ '^abc';
explain (costs off) select * from pg_proc where proname ~ '^abc$';
explain (costs off) select * from pg_proc where proname ~ '^abcd*e';
explain (costs off) select * from pg_proc where proname ~ '^abc+d';
explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)';
explain (costs off) select * from pg_proc where proname ~ '^(abc)$';
explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
explain (costs off) select * from pg_proc where proname ~ '^abcd(x|(?=\w\w)q)';

-- Test for infinite loop in pullback() (CVE-2007-4772)
-- The quantified group contains only the anchors $ and ^, so it can never
-- consume any input.
select 'a' ~ '($|^)*';

-- These cases expose a bug in the original fix for CVE-2007-4772
select 'a' ~ '(^)+^';
select 'a' ~ '$($$)+';

-- More cases of infinite loop in pullback(), not fixed by CVE-2007-4772 fix
-- Each quantified group holds only zero-width constraints (anchors and
-- negative lookaheads).  The last four statements run one pattern against
-- inputs starting with 'aa', 'bb', 'cc' (each excluded by a lookahead) and
-- 'dd' (excluded by none).
select 'a' ~ '($^)+';
select 'a' ~ '(^$)*';
select 'aa bb cc' ~ '(^(?!aa))+';
select 'aa x' ~ '(^(?!aa)(?!bb)(?!cc))+';
select 'bb x' ~ '(^(?!aa)(?!bb)(?!cc))+';
select 'cc x' ~ '(^(?!aa)(?!bb)(?!cc))+';
select 'dd x' ~ '(^(?!aa)(?!bb)(?!cc))+';

-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)
-- Both patterns nest six levels of quantified groups, each of which can
-- match the empty string.
select 'a' ~ '((((((a)*)*)*)*)*)*';
select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';

-- These cases used to give too-many-states failures
-- The patterns combine quantified groups with zero-width constraints
-- (^, $, \m, \M, \Y); what matters is that each one now compiles and
-- returns a result rather than raising an error.
select 'x' ~ 'abcd(\m)+xyz';
select 'a' ~ '^abcd*(((((^(a c(e?d)a+|)+|)+|)+|)+|a)+|)';
select 'x' ~ 'a^(^)bcd*xy(((((($a+|)+|)+|)+$|)+|)+|)^$';
select 'x' ~ 'xyz(\Y\Y)+';
select 'x' ~ 'x|(?:\M)+';

-- This generates O(N) states but O(N^2) arcs, so it causes problems
-- if arc count is not constrained
-- (the pattern is 'x*y*z*' repeated 1000 times, built with repeat())
select 'x' ~ repeat('x*y*z*', 1000);

-- Test backref in combination with non-greedy quantifier
-- https://core.tcl.tk/tcl/tktview/6585b21ca8fa6f3678d442b97241fdd43dba2ec0
-- The first statement only checks that a match exists (alias t = expected
-- true); the second uses the 'g' flag to return every match, with the
-- non-greedy .*? stopping at the nearest repeat of the captured character.
select 'Programmer' ~ '(\w).*?\1' as t;
select regexp_matches('Programmer', '(\w)(.*?\1)', 'g');

-- Test for proper matching of non-greedy iteration (bug #11478)
-- Three non-greedy [^/]+? captures, the last inside an optional
-- non-capturing group, must split the input at its slashes.
select regexp_matches('foo/bar/baz',
                      '^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', '');

-- Test that greediness can be overridden by outer quantifier
-- A {1,1} quantifier imposes greediness, and {1,1}? non-greediness, on the
-- atom it is attached to, even though it does not change the repeat count.
-- First four statements: the atom (l*) starts out greedy.
select regexp_matches('llmmmfff', '^(l*)(.*)(f*)$');
select regexp_matches('llmmmfff', '^(l*){1,1}(.*)(f*)$');
select regexp_matches('llmmmfff', '^(l*){1,1}?(.*)(f*)$');
select regexp_matches('llmmmfff', '^(l*){1,1}?(.*){1,1}?(f*)$');
-- Last four statements: the same sequence with a non-greedy atom (l*?).
select regexp_matches('llmmmfff', '^(l*?)(.*)(f*)$');
select regexp_matches('llmmmfff', '^(l*?){1,1}(.*)(f*)$');
select regexp_matches('llmmmfff', '^(l*?){1,1}?(.*)(f*)$');
select regexp_matches('llmmmfff', '^(l*?){1,1}?(.*){1,1}?(f*)$');

-- Test for infinite loop in cfindloop with zero-length possible match
-- but no actual match (can only happen in the presence of backrefs)
-- Every pattern pairs an empty capture group () with a \1 backref to it.
select 'a' ~ '$()|^\1';
select 'a' ~ '.. ()|\1';
select 'a' ~ '()*\1';
select 'a' ~ '()+\1';

-- Error conditions
-- Each statement here is expected to be rejected when the pattern is
-- compiled.  LACON is the regex engine's name for a lookaround constraint
-- such as (?=...); the second statement wraps the backref in an extra
-- capture group inside the constraint.
select 'xyz' ~ 'x(\w)(?=\1)'; -- no backrefs in LACONs
select 'xyz' ~ 'x(\w)(?=(\1))';
select 'a' ~ '\x7fffffff'; -- invalid chr code