Remove 'charlen' argument from make_trigrams()

The function assumed that if charlen == bytelen, there are no
multibyte characters in the string. That's sensible, but the callers
were a little careless in how they calculated the lengths. The callers
converted the string to lowercase before calling make_trigrams(), and
the 'charlen' value was calculated *before* the conversion to
lowercase while 'bytelen' was calculated after the conversion. If the
lowercased string had a different number of characters than the
original, make_trigrams() might incorrectly apply the fast path and
treat all the bytes as single-byte characters, or fail to apply the
fast path (which is harmless), or it might hit the "Assert(bytelen ==
charlen)" assertion. I'm not aware of any locale / character
combinations where you could hit that assertion in practice,
i.e. where a string converted to lowercase would have fewer characters
than the original, but it seems best to avoid making that assumption.

To fix, remove the 'charlen' argument. To keep the performance when
there are no multibyte characters, always try the fast path first, but
check the input for multibyte characters as we go. The check on each
byte adds some overhead, but it's close enough. And to compensate, the
find_word() function no longer needs to count the characters.

This fixes one small bug in make_trigrams(): in the multibyte
codepath, it peeked at the byte just after the end of the input
string. When compiled with IGNORECASE, that was harmless because there
is always a NUL byte or blank after the input string. But with
!IGNORECASE, the call from generate_wildcard_trgm() doesn't guarantee
that.

Backpatch to v18, but no further. In previous versions lower-casing was
done character by character, and thus the assumption that lower-casing
doesn't change the character length was valid. That was changed in v18,
commit fb1a18810f.

Security: CVE-2026-2007
Reviewed-by: Noah Misch <noah@leadboat.com>
This commit is contained in:
Heikki Linnakangas 2026-01-20 14:34:32 +02:00 committed by Thomas Munro
parent 379695d3cc
commit 54598670fe

View file

@ -220,7 +220,7 @@ comp_trgm(const void *a, const void *b)
* endword points to the character after word
*/
static char *
find_word(char *str, int lenstr, char **endword, int *charlen)
find_word(char *str, int lenstr, char **endword)
{
char *beginword = str;
@ -231,12 +231,8 @@ find_word(char *str, int lenstr, char **endword, int *charlen)
return NULL;
*endword = beginword;
*charlen = 0;
while (*endword - str < lenstr && ISWORDCHR(*endword))
{
*endword += pg_mblen(*endword);
(*charlen)++;
}
return beginword;
}
@ -269,47 +265,84 @@ compact_trigram(trgm *tptr, char *str, int bytelen)
}
/*
* Adds trigrams from words (already padded).
* Adds trigrams from the word in 'str' (already padded if necessary).
*/
static trgm *
make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
make_trigrams(trgm *tptr, char *str, int bytelen)
{
char *ptr = str;
if (charlen < 3)
if (bytelen < 3)
return tptr;
if (bytelen > charlen)
if (pg_encoding_max_length(GetDatabaseEncoding()) == 1)
{
/* Find multibyte character boundaries and apply compact_trigram */
int lenfirst = pg_mblen(str),
lenmiddle = pg_mblen(str + lenfirst),
lenlast = pg_mblen(str + lenfirst + lenmiddle);
while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
{
compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
ptr += lenfirst;
tptr++;
lenfirst = lenmiddle;
lenmiddle = lenlast;
lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
}
}
else
{
/* Fast path when there are no multibyte characters */
Assert(bytelen == charlen);
while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
while (ptr < str + bytelen - 2)
{
CPTRGM(tptr, ptr);
ptr++;
tptr++;
}
}
else
{
int lenfirst,
lenmiddle,
lenlast;
char *endptr;
/*
* Fast path as long as there are no multibyte characters
*/
if (!IS_HIGHBIT_SET(ptr[0]) && !IS_HIGHBIT_SET(ptr[1]))
{
while (!IS_HIGHBIT_SET(ptr[2]))
{
CPTRGM(tptr, ptr);
ptr++;
tptr++;
if (ptr == str + bytelen - 2)
return tptr;
}
lenfirst = 1;
lenmiddle = 1;
lenlast = pg_mblen(ptr + 2);
}
else
{
lenfirst = pg_mblen(ptr);
if (ptr + lenfirst >= str + bytelen)
return tptr;
lenmiddle = pg_mblen(ptr + lenfirst);
if (ptr + lenfirst + lenmiddle >= str + bytelen)
return tptr;
lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
}
/*
* Slow path to handle any remaining multibyte characters
*
* As we go, 'ptr' points to the beginning of the current
* three-character string and 'endptr' points to just past it.
*/
endptr = ptr + lenfirst + lenmiddle + lenlast;
while (endptr <= str + bytelen)
{
compact_trigram(tptr, ptr, endptr - ptr);
tptr++;
/* Advance to the next character */
if (endptr == str + bytelen)
break;
ptr += lenfirst;
lenfirst = lenmiddle;
lenmiddle = lenlast;
lenlast = pg_mblen(endptr);
endptr += lenlast;
}
}
return tptr;
}
@ -328,8 +361,7 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
{
trgm *tptr;
char *buf;
int charlen,
bytelen;
int bytelen;
char *bword,
*eword;
@ -349,7 +381,7 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
}
eword = str;
while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
while ((bword = find_word(eword, slen - (eword - str), &eword)) != NULL)
{
#ifdef IGNORECASE
bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
@ -370,8 +402,7 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
/* Calculate trigrams marking their bounds if needed */
if (bounds)
bounds[tptr - trg] |= TRGM_BOUND_LEFT;
tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
charlen + LPADDING + RPADDING);
tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING);
if (bounds)
bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
}
@ -761,17 +792,16 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
* str: source string, of length lenstr bytes (need not be null-terminated)
* buf: where to return the substring (must be long enough)
* *bytelen: receives byte length of the found substring
* *charlen: receives character length of the found substring
*
* Returns pointer to end+1 of the found substring in the source string.
* Returns NULL if no word found (in which case buf, bytelen, charlen not set)
* Returns NULL if no word found (in which case buf, bytelen is not set)
*
* If the found word is bounded by non-word characters or string boundaries
* then this function will include corresponding padding spaces into buf.
*/
static const char *
get_wildcard_part(const char *str, int lenstr,
char *buf, int *bytelen, int *charlen)
char *buf, int *bytelen)
{
const char *beginword = str;
const char *endword;
@ -820,18 +850,13 @@ get_wildcard_part(const char *str, int lenstr,
* Add left padding spaces if preceding character wasn't wildcard
* meta-character.
*/
*charlen = 0;
if (!in_leading_wildcard_meta)
{
if (LPADDING > 0)
{
*s++ = ' ';
(*charlen)++;
if (LPADDING > 1)
{
*s++ = ' ';
(*charlen)++;
}
}
}
@ -848,7 +873,6 @@ get_wildcard_part(const char *str, int lenstr,
if (ISWORDCHR(endword))
{
memcpy(s, endword, clen);
(*charlen)++;
s += clen;
}
else
@ -876,7 +900,6 @@ get_wildcard_part(const char *str, int lenstr,
else if (ISWORDCHR(endword))
{
memcpy(s, endword, clen);
(*charlen)++;
s += clen;
}
else
@ -894,12 +917,8 @@ get_wildcard_part(const char *str, int lenstr,
if (RPADDING > 0)
{
*s++ = ' ';
(*charlen)++;
if (RPADDING > 1)
{
*s++ = ' ';
(*charlen)++;
}
}
}
@ -922,7 +941,6 @@ generate_wildcard_trgm(const char *str, int slen)
*buf2;
trgm *tptr;
int len,
charlen,
bytelen;
const char *eword;
@ -945,7 +963,7 @@ generate_wildcard_trgm(const char *str, int slen)
*/
eword = str;
while ((eword = get_wildcard_part(eword, slen - (eword - str),
buf, &bytelen, &charlen)) != NULL)
buf, &bytelen)) != NULL)
{
#ifdef IGNORECASE
buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
@ -957,7 +975,7 @@ generate_wildcard_trgm(const char *str, int slen)
/*
* count trigrams
*/
tptr = make_trigrams(tptr, buf2, bytelen, charlen);
tptr = make_trigrams(tptr, buf2, bytelen);
#ifdef IGNORECASE
pfree(buf2);