Remove 'charlen' argument from make_trigrams()

The function assumed that if charlen == bytelen, there are no multibyte characters in the string. That's sensible, but the callers were a little careless in how they calculated the lengths. The callers converted the string to lowercase before calling make_trigram(), and the 'charlen' value was calculated *before* the conversion to lowercase while 'bytelen' was calculated after the conversion. If the lowercased string had a different number of characters than the original, make_trigram() might incorrectly apply the fastpath and treat all the bytes as single-byte characters, or fail to apply the fastpath (which is harmless), or it might hit the "Assert(bytelen == charlen)" assertion. I'm not aware of any locale / character combinations where you could hit that assertion in practice, i.e. where a string converted to lowercase would have fewer characters than the original, but it seems best to avoid making that assumption. To fix, remove the 'charlen' argument. To keep the performance when there are no multibyte characters, always try the fast path first, but check the input for multibyte characters as we go. The check on each byte adds some overhead, but it's close enough. And to compensate, the find_word() function no longer needs to count the characters. This fixes one small bug in make_trigrams(): in the multibyte codepath, it peeked at the byte just after the end of the input string. When compiled with IGNORECASE, that was harmless because there is always a NUL byte or blank after the input string. But with !IGNORECASE, the call from generate_wildcard_trgm() doesn't guarantee that. Backpatch to v18, but no further. In previous versions lower-casing was done character by character, and thus the assumption that lower-casing doesn't change the character length was valid. That was changed in v18, commit fb1a18810f. Security: CVE-2026-2007 Reviewed-by: Noah Misch <noah@leadboat.com>
2026-04-22 06:37:06 -04:00 · 2026-01-20 14:34:32 +02:00 · 2026-01-20 14:34:32 +02:00 · 54598670fe
commit 54598670fe
parent 379695d3cc
1 changed files with 72 additions and 54 deletions
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@ -220,7 +220,7 @@ comp_trgm(const void *a, const void *b)
 * endword points to the character after word
 */
 static char *
-find_word(char *str, int lenstr, char **endword, int *charlen)
+find_word(char *str, int lenstr, char **endword)
 {
 	char	   *beginword = str;

@ -231,12 +231,8 @@ find_word(char *str, int lenstr, char **endword, int *charlen)
 		return NULL;

 	*endword = beginword;
-	*charlen = 0;
 	while (*endword - str < lenstr && ISWORDCHR(*endword))
-	{
 		*endword += pg_mblen(*endword);
-		(*charlen)++;
-	}

 	return beginword;
 }
@ -269,47 +265,84 @@ compact_trigram(trgm *tptr, char *str, int bytelen)
 }

 /*
- * Adds trigrams from words (already padded).
+ * Adds trigrams from the word in 'str' (already padded if necessary).
 */
 static trgm *
-make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
+make_trigrams(trgm *tptr, char *str, int bytelen)
 {
 	char	   *ptr = str;

-	if (charlen < 3)
+	if (bytelen < 3)
 		return tptr;

-	if (bytelen > charlen)
+	if (pg_encoding_max_length(GetDatabaseEncoding()) == 1)
 	{
-		/* Find multibyte character boundaries and apply compact_trigram */
-		int			lenfirst = pg_mblen(str),
-					lenmiddle = pg_mblen(str + lenfirst),
-					lenlast = pg_mblen(str + lenfirst + lenmiddle);
-
-		while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
-		{
-			compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
-
-			ptr += lenfirst;
-			tptr++;
-
-			lenfirst = lenmiddle;
-			lenmiddle = lenlast;
-			lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
-		}
-	}
-	else
-	{
-		/* Fast path when there are no multibyte characters */
-		Assert(bytelen == charlen);
-
-		while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
+		while (ptr < str + bytelen - 2)
 		{
 			CPTRGM(tptr, ptr);
 			ptr++;
 			tptr++;
 		}
 	}
+	else
+	{
+		int			lenfirst,
+					lenmiddle,
+					lenlast;
+		char	   *endptr;
+
+		/*
+		 * Fast path as long as there are no multibyte characters
+		 */
+		if (!IS_HIGHBIT_SET(ptr[0]) && !IS_HIGHBIT_SET(ptr[1]))
+		{
+			while (!IS_HIGHBIT_SET(ptr[2]))
+			{
+				CPTRGM(tptr, ptr);
+				ptr++;
+				tptr++;
+
+				if (ptr == str + bytelen - 2)
+					return tptr;
+			}
+
+			lenfirst = 1;
+			lenmiddle = 1;
+			lenlast = pg_mblen(ptr + 2);
+		}
+		else
+		{
+			lenfirst = pg_mblen(ptr);
+			if (ptr + lenfirst >= str + bytelen)
+				return tptr;
+			lenmiddle = pg_mblen(ptr + lenfirst);
+			if (ptr + lenfirst + lenmiddle >= str + bytelen)
+				return tptr;
+			lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
+		}
+
+		/*
+		 * Slow path to handle any remaining multibyte characters
+		 *
+		 * As we go, 'ptr' points to the beginning of the current
+		 * three-character string and 'endptr' points to just past it.
+		 */
+		endptr = ptr + lenfirst + lenmiddle + lenlast;
+		while (endptr <= str + bytelen)
+		{
+			compact_trigram(tptr, ptr, endptr - ptr);
+			tptr++;
+
+			/* Advance to the next character */
+			if (endptr == str + bytelen)
+				break;
+			ptr += lenfirst;
+			lenfirst = lenmiddle;
+			lenmiddle = lenlast;
+			lenlast = pg_mblen(endptr);
+			endptr += lenlast;
+		}
+	}

 	return tptr;
 }
@ -328,8 +361,7 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
 {
 	trgm	   *tptr;
 	char	   *buf;
-	int			charlen,
-				bytelen;
+	int			bytelen;
 	char	   *bword,
 			   *eword;

@ -349,7 +381,7 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
 	}

 	eword = str;
-	while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
+	while ((bword = find_word(eword, slen - (eword - str), &eword)) != NULL)
 	{
 #ifdef IGNORECASE
 		bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
@ -370,8 +402,7 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
 		/* Calculate trigrams marking their bounds if needed */
 		if (bounds)
 			bounds[tptr - trg] |= TRGM_BOUND_LEFT;
-		tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
-							 charlen + LPADDING + RPADDING);
+		tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING);
 		if (bounds)
 			bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
 	}
@ -761,17 +792,16 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
 * str: source string, of length lenstr bytes (need not be null-terminated)
 * buf: where to return the substring (must be long enough)
 * *bytelen: receives byte length of the found substring
- * *charlen: receives character length of the found substring
 *
 * Returns pointer to end+1 of the found substring in the source string.
- * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
+ * Returns NULL if no word found (in which case buf, bytelen is not set)
 *
 * If the found word is bounded by non-word characters or string boundaries
 * then this function will include corresponding padding spaces into buf.
 */
 static const char *
 get_wildcard_part(const char *str, int lenstr,
-				  char *buf, int *bytelen, int *charlen)
+				  char *buf, int *bytelen)
 {
 	const char *beginword = str;
 	const char *endword;
@ -820,18 +850,13 @@ get_wildcard_part(const char *str, int lenstr,
 	 * Add left padding spaces if preceding character wasn't wildcard
 	 * meta-character.
 	 */
-	*charlen = 0;
 	if (!in_leading_wildcard_meta)
 	{
 		if (LPADDING > 0)
 		{
 			*s++ = ' ';
-			(*charlen)++;
 			if (LPADDING > 1)
-			{
 				*s++ = ' ';
-				(*charlen)++;
-			}
 		}
 	}

@ -848,7 +873,6 @@ get_wildcard_part(const char *str, int lenstr,
 			if (ISWORDCHR(endword))
 			{
 				memcpy(s, endword, clen);
-				(*charlen)++;
 				s += clen;
 			}
 			else
@ -876,7 +900,6 @@ get_wildcard_part(const char *str, int lenstr,
 			else if (ISWORDCHR(endword))
 			{
 				memcpy(s, endword, clen);
-				(*charlen)++;
 				s += clen;
 			}
 			else
@ -894,12 +917,8 @@ get_wildcard_part(const char *str, int lenstr,
 		if (RPADDING > 0)
 		{
 			*s++ = ' ';
-			(*charlen)++;
 			if (RPADDING > 1)
-			{
 				*s++ = ' ';
-				(*charlen)++;
-			}
 		}
 	}

@ -922,7 +941,6 @@ generate_wildcard_trgm(const char *str, int slen)
 			   *buf2;
 	trgm	   *tptr;
 	int			len,
-				charlen,
 				bytelen;
 	const char *eword;

@ -945,7 +963,7 @@ generate_wildcard_trgm(const char *str, int slen)
 	 */
 	eword = str;
 	while ((eword = get_wildcard_part(eword, slen - (eword - str),
-									  buf, &bytelen, &charlen)) != NULL)
+									  buf, &bytelen)) != NULL)
 	{
 #ifdef IGNORECASE
 		buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
@ -957,7 +975,7 @@ generate_wildcard_trgm(const char *str, int slen)
 		/*
 		 * count trigrams
 		 */
-		tptr = make_trigrams(tptr, buf2, bytelen, charlen);
+		tptr = make_trigrams(tptr, buf2, bytelen);

 #ifdef IGNORECASE
 		pfree(buf2);