mirror of
https://github.com/opnsense/src.git
synced 2026-06-09 08:43:19 -04:00
regex: mixed sets are misidentified as singletons
Fix the "singleton" function used by regcomp() to turn character set matches
into exact character matches when a character set has exactly one
element.
The underlying cset representation is complex; most critically, it
records "small" characters (codepoints less than either 128
or 256, depending on locale) in a bit vector, and "wide" characters in
a secondary array.
Unfortunately, the "singleton" function used to identify singleton sets
treated a cset as a singleton if either the "small" or the "wide" set
had exactly one element (it would then ignore the other set).
The easiest way to demonstrate this bug:
$ export LANG=C.UTF-8
$ echo 'a' | grep '[abà]'
It should match (and print "a"), but instead it does not match, because the
set is misidentified as a singleton containing only the accented character.
PR: 281710
Reviewed by: kevans, yuripv
Obtained from: illumos
(cherry picked from commit 8f7ed58a15)
This commit is contained in:
parent
39f39a9656
commit
4f4860c9b0
2 changed files with 62 additions and 6 deletions
|
|
@ -1591,17 +1591,32 @@ singleton(cset *cs)
|
|||
{
|
||||
wint_t i, s, n;
|
||||
|
||||
/* Exclude the complicated cases we don't want to deal with */
|
||||
if (cs->nranges != 0 || cs->ntypes != 0 || cs->icase != 0)
|
||||
return (OUT);
|
||||
|
||||
if (cs->nwides > 1)
|
||||
return (OUT);
|
||||
|
||||
/* Count the number of characters present in the bitmap */
|
||||
for (i = n = 0; i < NC; i++)
|
||||
if (CHIN(cs, i)) {
|
||||
n++;
|
||||
s = i;
|
||||
}
|
||||
if (n == 1)
|
||||
return (s);
|
||||
if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
|
||||
cs->icase == 0)
|
||||
|
||||
if (n > 1)
|
||||
return (OUT);
|
||||
|
||||
if (n == 1) {
|
||||
if (cs->nwides == 0)
|
||||
return (s);
|
||||
else
|
||||
return (OUT);
|
||||
}
|
||||
if (cs->nwides == 1)
|
||||
return (cs->wides[0]);
|
||||
/* Don't bother handling the other cases. */
|
||||
|
||||
return (OUT);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
atf_test_case bmpat
|
||||
bmpat_head()
|
||||
{
|
||||
|
|
@ -45,8 +44,50 @@ icase_body()
|
|||
echo $c | atf_check -o "inline:$c\n" sed -ne "/$a/Ip"
|
||||
}
|
||||
|
||||
atf_test_case mbset cleanup
|
||||
mbset_head()
|
||||
{
|
||||
atf_set "descr" "Check multibyte sets matching"
|
||||
}
|
||||
mbset_body()
|
||||
{
|
||||
export LC_CTYPE="C.UTF-8"
|
||||
|
||||
# This involved an erroneously implemented optimization which reduces
|
||||
# single-element sets to an exact match with a single codepoint.
|
||||
# Match sets record small-codepoint characters in a bitmap and
|
||||
# large-codepoint characters in an array; the optimization would falsely
|
||||
# trigger if either the bitmap or the array was a singleton, ignoring
|
||||
# the members of the other side of the set.
|
||||
#
|
||||
# To exercise this, we construct sets which have one member of one side
|
||||
# and one or more of the other, and verify that all members can be
|
||||
# found.
|
||||
printf "a" > mbset; atf_check -o not-empty sed -ne '/[aà]/p' mbset
|
||||
printf "à" > mbset; atf_check -o not-empty sed -ne '/[aà]/p' mbset
|
||||
printf "a" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
|
||||
printf "à" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
|
||||
printf "á" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
|
||||
printf "à" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
|
||||
printf "a" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
|
||||
printf "b" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
|
||||
printf "a" > mbset; atf_check -o not-empty sed -Ene '/[aà]/p' mbset
|
||||
printf "à" > mbset; atf_check -o not-empty sed -Ene '/[aà]/p' mbset
|
||||
printf "a" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
|
||||
printf "à" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
|
||||
printf "á" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
|
||||
printf "à" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
|
||||
printf "a" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
|
||||
printf "b" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
|
||||
}
|
||||
mbset_cleanup()
|
||||
{
|
||||
rm -f mbset
|
||||
}
|
||||
|
||||
atf_init_test_cases()
|
||||
{
|
||||
atf_add_test_case bmpat
|
||||
atf_add_test_case icase
|
||||
atf_add_test_case mbset
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue