Use the new UTF8bv* functions everywhere, getting rid of UTF8normalize() and
UTF8normcmp().
2026-06-11 09:40:11 -04:00 · 2002-02-27 16:11:24 +00:00 · 2002-02-27 16:11:24 +00:00 · c476878fde
commit c476878fde
parent 94983da942
3 changed files with 14 additions and 274 deletions
--- a/include/ldap_pvt_uc.h
+++ b/include/ldap_pvt_uc.h
@ -143,20 +143,11 @@ LDAP_LUNICODE_F(void) ucstr2upper(
 #define LDAP_UTF8_ARG2NFC	0x4U
 #define LDAP_UTF8_APPROX	0x8U

-LDAP_LUNICODE_F(char *) UTF8normalize(
-	struct berval *,
-	unsigned );
-
 LDAP_LUNICODE_F(struct berval *) UTF8bvnormalize(
 	struct berval *,
 	struct berval *,
 	unsigned );

-LDAP_LUNICODE_F(int) UTF8normcmp(
-	const char *,
-	const char *,
-	unsigned );
-
 LDAP_LUNICODE_F(int) UTF8bvnormcmp(
 	struct berval *,
 	struct berval *,
--- a/libraries/liblunicode/ucstr.c
+++ b/libraries/liblunicode/ucstr.c
@ -92,156 +92,6 @@ void ucstr2upper(
 	}
 }

-char * UTF8normalize(
-	struct berval *bv,
-	unsigned casefold )
-{
-	int i, j, len, clen, outpos, ucsoutlen, outsize, last;
-	char *out, *s;
-	unsigned long *ucs, *p, *ucsout;
-
-	static unsigned char mask[] = {
-                0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
-
-	if ( bv == NULL ) {
-		return NULL;
-	}
-
-	s = bv->bv_val;
-	len = bv->bv_len;
-
-	/* See if the string is pure ASCII so we can shortcut */
-	for ( i=0; i<len; i++ ) {
-		if ( s[i] & 0x80 )	/* non-ASCII */
-			break;
-	}
-
-	/* It's pure ASCII or zero-len */
-	if ( i == len ) {
-		out = malloc( len + 1 );
-		if ( i && !casefold ) {
-			strncpy( out, bv->bv_val, len );
-		} else {
-			for ( j=0; j<i; j++ )
-				out[j] = TOUPPER( s[j] );
-		}
-		out[len] = '\0';
-		return out;
-	}
-
-	outsize = len + 7;
-	out = (char *) malloc( outsize );
-	if ( out == NULL ) {
-		return NULL;
-	}
-
-	/* FIXME: Should first check to see if string is already in
-	 * proper normalized form.
-	 */
-
-	outpos = 0;
-
-	/* finish off everything up to character before first non-ascii */
-	if ( LDAP_UTF8_ISASCII( s ) ) {
-		for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
-			out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1];
-		}
-		if ( i == len ) {
-			out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1];
-			out[outpos] = '\0';
-			return out;
-		}
-	} else {
-		i = 0;
-	}
-
-	p = ucs = (long *) malloc( len * sizeof(*ucs) );
-	if ( ucs == NULL ) {
-		free(out);
-		return NULL;
-	}
-
-	/* convert character before first non-ascii to ucs-4 */
-	if ( i > 0 ) {
-		*p = casefold ? TOUPPER( s[i - 1] ) : s[i - 1];
-		p++;
-	}
-
-	/* s[i] is now first non-ascii character */
-	for (;;) {
-		/* s[i] is non-ascii */
-		/* convert everything up to next ascii to ucs-4 */
-		while ( i < len ) {
-			clen = LDAP_UTF8_CHARLEN2( s + i, clen );
-			if ( clen == 0 ) {
-				free( ucs );
-				free( out );
-				return NULL;
-			}
-			if ( clen == 1 ) {
-				/* ascii */
-				break;
-			}
-			*p = s[i] & mask[clen];
-			i++;
-			for( j = 1; j < clen; j++ ) {
-				if ( (s[i] & 0xc0) != 0x80 ) {
-					free( ucs );
-					free( out );
-					return NULL;
-				}
-				*p <<= 6;
-				*p |= s[i] & 0x3f;
-				i++;
-			}
-			if ( casefold ) {
-				*p = uctoupper( *p );
-			}
-			p++;
-                }
-		/* normalize ucs of length p - ucs */
-		uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );    
-		ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
-		/* convert ucs to utf-8 and store in out */
-		for ( j = 0; j < ucsoutlen; j++ ) {
-			/* allocate more space if not enough room for
-			   6 bytes and terminator */
-			if ( outsize - outpos < 7 ) {
-				outsize = ucsoutlen - j + outpos + 6;
-				out = (char *) realloc( out, outsize );
-				if ( out == NULL ) {
-					free( ucs );
-					return NULL;
-				}
-			}
-			outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
-		}
-		
-		if ( i == len ) {
-			break;
-		}
-
-		last = i;
-
-		/* s[i] is ascii */
-		/* finish off everything up to char before next non-ascii */
-		for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
-			out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1];
-		}
-		if ( i == len ) {
-			out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1];
-			break;
-		}
-
-		/* convert character before next non-ascii to ucs-4 */
-		*ucs = casefold ? TOUPPER( s[i - 1] ) : s[i - 1];
-		p = ucs + 1;
-	}		
-	free( ucs );
-	out[outpos] = '\0';
-	return out;
-}
-
 struct berval * UTF8bvnormalize(
 	struct berval *bv,
 	struct berval *newbv,
@ -412,102 +262,6 @@ struct berval * UTF8bvnormalize(
 	return ber_str2bv( out, outpos, 0, newbv );
 }

-/* compare UTF8-strings, optionally ignore casing, string pointers must not be NULL */
-/* slow, should be optimized */
-int UTF8normcmp(
-	const char *s1,
-	const char *s2,
-	unsigned casefold )
-{
-	int i, l1, l2, len, ulen, res;
-	unsigned long *ucs, *ucsout1, *ucsout2;
-
-	l1 = strlen( s1 );
-	l2 = strlen( s2 );
-
-	if ( ( l1 == 0 ) || ( l2 == 0 ) ) {
-		if ( l1 == l2 ) {
-			return 0;
-		}
-		return *s1 - *s2 > 0 ? 1 : -1;
-	}
-	
-	/* See if we can get away with a straight ASCII compare */
-	len = (l1 < l2) ? l1 : l2;
-	for ( i = 0; i<len; i++ ) {
-		/* Is either char non-ASCII? */
-		if ((s1[i] & 0x80) || (s2[i] & 0x80))
-			break;
-		if (casefold) {
-			char c1 = TOUPPER(s1[i]);
-			char c2 = TOUPPER(s2[i]);
-		    	res = c1 - c2;
-		} else {
-			res = s1[i] - s2[i];
-		}
-		if (res)
-			return res;
-	}
-	/* Strings were ASCII, equal up to minlen */
-	if (i == len)
-		return l1 - l2;
-		
-	/* FIXME: Should first check to see if strings are already in
-	 * proper normalized form.
-	 */
-
-	ucs = (long *) malloc( ( l1 > l2 ? l1 : l2 ) * sizeof(*ucs) );
-	if ( ucs == NULL ) {
-		return l1 > l2 ? 1 : -1; /* what to do??? */
-	}
-	
-	/*
-	 * XXYYZ: we convert to ucs4 even though -llunicode
-	 * expects ucs2 in an unsigned long
-	 */
-	
-	/* convert and normalize 1st string */
-	for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) {
-                ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i );
-                if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
-			free( ucs );
-                        return -1; /* what to do??? */
-                }
-		len = LDAP_UTF8_CHARLEN( s1 + i );
-	}
-	uccanondecomp( ucs, ulen, &ucsout1, &l1 );
-	l1 = uccanoncomp( ucsout1, l1 );
-
-	/* convert and normalize 2nd string */
-	for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) {
-                ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i );
-                if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
-			free( ucsout1 );
-			free( ucs );
-                        return 1; /* what to do??? */
-                }
-		len = LDAP_UTF8_CHARLEN( s2 + i );
-	}
-	uccanondecomp( ucs, ulen, &ucsout2, &l2 );
-	l2 = uccanoncomp( ucsout2, l2 );
-
-	free( ucs );
-
-	res = casefold
-		? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 )
-		: ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );
-	free( ucsout1 );
-	free( ucsout2 );
-
-	if ( res != 0 ) {
-		return res;
-	}
-	if ( l1 == l2 ) {
-		return 0;
-	}
-	return l1 > l2 ? 1 : -1;
-}
-
 /* compare UTF8-strings, optionally ignore casing */
 /* slow, should be optimized */
 int UTF8bvnormcmp(
--- a/servers/slapd/schema_init.c
+++ b/servers/slapd/schema_init.c
@ -776,16 +776,17 @@ approxIndexer(
 {
 	char *c;
 	int i,j, len, wordcount, keycount=0;
-	struct berval *val, *newkeys;
+	struct berval *newkeys;
 	BerVarray keys=NULL;

 	for( j=0; values[j].bv_val != NULL; j++ ) {
+		struct berval val = { 0, NULL };
 		/* Yes, this is necessary */
-		val = UTF8bvnormalize( &values[j], NULL, LDAP_UTF8_APPROX );
-		assert( val != NULL && val->bv_val != NULL );
+		UTF8bvnormalize( &values[j], &val, LDAP_UTF8_APPROX );
+		assert( val.bv_val != NULL );

 		/* Isolate how many words there are. There will be a key for each */
-		for( wordcount = 0, c = val->bv_val; *c; c++) {
+		for( wordcount = 0, c = val.bv_val; *c; c++) {
 			len = strcspn(c, SLAPD_APPROX_DELIMITER);
 			if( len >= SLAPD_APPROX_WORDLEN ) wordcount++;
 			c+= len;
@ -801,7 +802,7 @@ approxIndexer(
 		keys = newkeys;

 		/* Get a phonetic copy of each word */
-		for( c = val->bv_val, i = 0; i < wordcount; c += len + 1 ) {
+		for( c = val.bv_val, i = 0; i < wordcount; c += len + 1 ) {
 			len = strlen( c );
 			if( len < SLAPD_APPROX_WORDLEN ) continue;
 			ber_str2bv( phonetic( c ), 0, 0, &keys[keycount] );
@ -809,7 +810,7 @@ approxIndexer(
 			i++;
 		}

-		ber_bvfree( val );
+		ber_memfree( val.bv_val );
 	}
 	keys[keycount].bv_val = NULL;
 	*keysp = keys;
@ -997,9 +998,7 @@ caseExactMatch(
 	struct berval *value,
 	void *assertedValue )
 {
-	*matchp = UTF8normcmp( value->bv_val,
-		((struct berval *) assertedValue)->bv_val,
-		LDAP_UTF8_NOCASEFOLD );
+	*matchp = UTF8bvnormcmp( value, (struct berval *) assertedValue, LDAP_UTF8_NOCASEFOLD );
 	return LDAP_SUCCESS;
 }

@ -1195,8 +1194,7 @@ static int caseExactIgnoreIndexer(

 	for( i=0; values[i].bv_val != NULL; i++ ) {
 		struct berval value;
-		ber_str2bv( UTF8normalize( &values[i], casefold ), 0, 0,
-			&value );
+		UTF8bvnormalize( &values[i], &value, casefold );

 		HASH_Init( &HASHcontext );
 		if( prefix != NULL && prefix->bv_len > 0 ) {
@ -1236,8 +1234,9 @@ static int caseExactIgnoreFilter(
 	BerVarray keys;
 	HASH_CONTEXT   HASHcontext;
 	unsigned char	HASHdigest[HASH_BYTES];
-	struct berval value;
+	struct berval value = { 0, NULL };
 	struct berval digest;
+
 	digest.bv_val = HASHdigest;
 	digest.bv_len = sizeof(HASHdigest);

@ -1247,8 +1246,7 @@ static int caseExactIgnoreFilter(
 	casefold = strcmp( mr->smr_oid, caseExactMatchOID )
 		? LDAP_UTF8_CASEFOLD : LDAP_UTF8_NOCASEFOLD;

-	ber_str2bv( UTF8normalize( ((struct berval *) assertValue), casefold ),
-		0, 0, &value );
+	UTF8bvnormalize( (struct berval *) assertValue, &value, casefold );
 	/* This usually happens if filter contains bad UTF8 */
 	if( value.bv_val == NULL ) {
 		keys = ch_malloc( sizeof( struct berval ) );
@ -1316,8 +1314,7 @@ static int caseExactIgnoreSubstringsIndexer(

 	nvalues = ch_malloc( sizeof( struct berval ) * (i+1) );
 	for( i=0; values[i].bv_val != NULL; i++ ) {
-		ber_str2bv( UTF8normalize( &values[i], casefold ),
-			0, 0, &nvalues[i] );
+		UTF8bvnormalize( &values[i], &nvalues[i], casefold );
 	}
 	nvalues[i].bv_val = NULL;
 	values = nvalues;
@ -1647,9 +1644,7 @@ caseIgnoreMatch(
 	struct berval *value,
 	void *assertedValue )
 {
-	*matchp = UTF8normcmp( value->bv_val,
-		((struct berval *) assertedValue)->bv_val,
-		LDAP_UTF8_CASEFOLD );
+	*matchp = UTF8bvnormcmp( value, (struct berval *) assertedValue, LDAP_UTF8_CASEFOLD );
 	return LDAP_SUCCESS;
 }