Add support for compatibility decomposition to the ucdata library, and switch
from NFC to NFKC in UTF8bvnormalize() and UTF8bvnormcmp()
This commit is contained in:
Stig Venaas 2002-04-19 12:59:57 +00:00
parent 5528772f23
commit e08bc054c7
5 changed files with 293 additions and 52 deletions

View file

@ -23,7 +23,7 @@ ucgendat: $(XLIBS) ucgendat.o
$(LTLINK) -o $@ ucgendat.o $(LIBS)
./ucgendat $(srcdir)/UnicodeData.txt -x $(srcdir)/CompositionExclusions.txt
DATFILES = case.dat cmbcl.dat comp.dat ctype.dat decomp.dat num.dat
DATFILES = case.dat cmbcl.dat comp.dat ctype.dat decomp.dat num.dat kdecomp.dat
install-local: $(PROGRAMS) FORCE
-$(MKDIR) $(DESTDIR)$(datadir)/ucdata

View file

@ -674,6 +674,10 @@ static unsigned long _ucdcmp_size;
static unsigned long *_ucdcmp_nodes;
static unsigned long *_ucdcmp_decomp;
static unsigned long _uckdcmp_size;
static unsigned long *_uckdcmp_nodes;
static unsigned long *_uckdcmp_decomp;
/*
* Return -1 on error, 0 if okay
*/
@ -729,6 +733,61 @@ _ucdcmp_load(char *paths, int reload)
return 0;
}
/*
 * Load the compatibility decomposition table from "kdecomp.dat".
 *
 * The file begins with a _ucheader_t (byte order mark, number of table
 * entries, byte count of the data that follows).  The data is read into a
 * single allocation: (cnt * 2) + 1 node words -- one code/offset pair per
 * entry plus a trailing sentinel offset -- followed directly by the
 * decomposition code points, which _uckdcmp_decomp is pointed at.
 *
 * If a table is already loaded it is kept unless `reload' is non-zero, in
 * which case it is released before the file is read again; a failed reload
 * therefore leaves no table loaded.  The module state is only updated after
 * the whole file has been read and validated.
 *
 * A file that declares zero entries is accepted and leaves no table loaded.
 *
 * Return -1 on error, 0 if okay
 */
static int
_uckdcmp_load(char *paths, int reload)
{
    FILE *in;
    unsigned long size, nnodes, i;
    unsigned long *nodes;
    _ucheader_t hdr;

    if (_uckdcmp_size > 0) {
        if (!reload)
            /*
             * The decompositions have already been loaded.
             */
            return 0;

        free((char *) _uckdcmp_nodes);
        _uckdcmp_nodes = 0;
        _uckdcmp_decomp = 0;
        _uckdcmp_size = 0;
    }

    if ((in = _ucopenfile(paths, "kdecomp.dat", "rb")) == 0)
        return -1;

    /*
     * Load the header.  A short read would leave hdr uninitialised, so
     * treat it as a corrupt file.
     */
    if (fread((char *) &hdr, sizeof(_ucheader_t), 1, in) != 1) {
        fclose(in);
        return -1;
    }

    if (hdr.bom == 0xfffe) {
        hdr.cnt = endian_short(hdr.cnt);
        hdr.size.bytes = endian_long(hdr.size.bytes);
    }

    nnodes = hdr.cnt << 1;
    size = hdr.size.bytes / sizeof(unsigned long);

    if (nnodes == 0) {
        /*
         * Empty table: nothing to allocate or read.
         */
        fclose(in);
        return 0;
    }

    /*
     * The data must at least hold the code/offset pairs and the sentinel,
     * otherwise the decomposition pointer would lie outside the buffer.
     */
    if (size < nnodes + 1) {
        fclose(in);
        return -1;
    }

    nodes = (unsigned long *) malloc(hdr.size.bytes);
    if (nodes == 0) {
        fclose(in);
        return -1;
    }

    /*
     * Read the decomposition data in.
     */
    if (fread((char *) nodes, sizeof(unsigned long), size, in) != size) {
        free((char *) nodes);
        fclose(in);
        return -1;
    }

    fclose(in);

    /*
     * Do an endian swap if necessary.
     */
    if (hdr.bom == 0xfffe) {
        for (i = 0; i < size; i++)
            nodes[i] = endian_long(nodes[i]);
    }

    /*
     * Everything was read; publish the new table.
     */
    _uckdcmp_nodes = nodes;
    _uckdcmp_decomp = nodes + (nnodes + 1);
    _uckdcmp_size = nnodes;

    return 0;
}
static void
_ucdcmp_unload(void)
{
@ -743,11 +802,29 @@ _ucdcmp_unload(void)
_ucdcmp_size = 0;
}
/*
 * Release the compatibility decomposition table, if one is loaded.
 */
static void
_uckdcmp_unload(void)
{
    if (_uckdcmp_size != 0) {
        /*
         * The node table and the decomposition data live in one allocation
         * that starts at _uckdcmp_nodes, so a single free() releases both.
         */
        free((char *) _uckdcmp_nodes);
        _uckdcmp_size = 0;
    }
}
int
ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
{
long l, r, m;
if (code < _ucdcmp_nodes[0]) {
return 0;
}
l = 0;
r = _ucdcmp_nodes[_ucdcmp_size] - 1;
@ -771,6 +848,38 @@ ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
return 0;
}
/*
 * Look up the compatibility decomposition of `code'.
 *
 * _uckdcmp_nodes holds _uckdcmp_size words of code+offset pairs sorted by
 * code (codes at even indices, offsets into _uckdcmp_decomp at odd
 * indices), followed by one sentinel word at index _uckdcmp_size that
 * holds the end offset of the last decomposition.
 *
 * On a match, *num receives the number of code points in the decomposition
 * and *decomp is pointed at them inside the loaded table (the caller must
 * not free it), and 1 is returned.  Returns 0 when `code' has no entry or
 * when no table is loaded.
 */
int
uckdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
{
    long l, r, m;

    /*
     * Nothing loaded, or the code sorts before the first entry.
     */
    if (_uckdcmp_size == 0 || code < _uckdcmp_nodes[0])
        return 0;

    /*
     * Search only the code+offset pairs.  The sentinel stored after them is
     * an offset into the decomposition data, not a node index, so it must
     * not be used as the upper bound.
     */
    l = 0;
    r = (long) _uckdcmp_size - 1;

    while (l <= r) {
        /*
         * Determine a "mid" point and adjust to make sure the mid point is at
         * the beginning of a code+offset pair.
         */
        m = (l + r) >> 1;
        m -= (m & 1);
        if (code > _uckdcmp_nodes[m])
            l = m + 2;
        else if (code < _uckdcmp_nodes[m])
            r = m - 2;
        else {
            unsigned long start = _uckdcmp_nodes[m + 1];
            unsigned long end;

            /*
             * The decomposition ends where the next one starts.  The last
             * pair has no successor; its end is the sentinel, which sits
             * directly after the pairs rather than in an offset slot.
             */
            if ((unsigned long) m + 2 < _uckdcmp_size)
                end = _uckdcmp_nodes[m + 3];
            else
                end = _uckdcmp_nodes[_uckdcmp_size];

            *num = end - start;
            *decomp = &_uckdcmp_decomp[start];
            return 1;
        }
    }
    return 0;
}
int
ucdecomp_hangul(unsigned long code, unsigned long *num, unsigned long decomp[])
{
@ -786,9 +895,10 @@ ucdecomp_hangul(unsigned long code, unsigned long *num, unsigned long decomp[])
return 1;
}
int
uccanondecomp(const unsigned long *in, int inlen,
unsigned long **out, int *outlen)
/* mode == 0 for canonical, mode == 1 for compatibility */
static int
uccanoncompatdecomp(const unsigned long *in, int inlen,
unsigned long **out, int *outlen, short mode)
{
int l, size;
unsigned i, j, k;
@ -801,7 +911,7 @@ uccanondecomp(const unsigned long *in, int inlen,
i = 0;
for (j = 0; j < (unsigned) inlen; j++) {
if (ucdecomp(in[j], &num, &decomp)) {
if (mode ? uckdecomp(in[j], &num, &decomp) : ucdecomp(in[j], &num, &decomp)) {
if ( size - i < num) {
size = inlen + i - j + num - 1;
*out = (unsigned long *) realloc(*out, size * sizeof(**out));
@ -855,6 +965,20 @@ uccanondecomp(const unsigned long *in, int inlen,
return *outlen = i;
}
/*
 * Decompose the `inlen' code points at `in' using canonical mappings only.
 *
 * This is uccanoncompatdecomp() with its mode argument fixed to 0
 * (canonical); the result of that call is returned unchanged.
 */
int
uccanondecomp(const unsigned long *in, int inlen,
              unsigned long **out, int *outlen)
{
    const short canonical_mode = 0;

    return uccanoncompatdecomp(in, inlen, out, outlen, canonical_mode);
}
/*
 * Decompose the `inlen' code points at `in', applying compatibility
 * mappings as well as canonical ones.
 *
 * This is uccanoncompatdecomp() with its mode argument fixed to 1
 * (compatibility); the result of that call is returned unchanged.
 */
int
uccompatdecomp(const unsigned long *in, int inlen,
               unsigned long **out, int *outlen)
{
    const short compat_mode = 1;

    return uccanoncompatdecomp(in, inlen, out, outlen, compat_mode);
}
/**************************************************************************
*
* Support for combining classes.
@ -1152,6 +1276,8 @@ ucdata_load(char *paths, int masks)
error |= _ucnumb_load(paths, 0) < 0 ? UCDATA_NUM : 0;
if (masks & UCDATA_COMP)
error |= _uccomp_load(paths, 0) < 0 ? UCDATA_COMP : 0;
if (masks & UCDATA_KDECOMP)
error |= _uckdcmp_load(paths, 0) < 0 ? UCDATA_KDECOMP : 0;
return -error;
}
@ -1171,6 +1297,8 @@ ucdata_unload(int masks)
_ucnumb_unload();
if (masks & UCDATA_COMP)
_uccomp_unload();
if (masks & UCDATA_KDECOMP)
_uckdcmp_unload();
}
/*
@ -1193,6 +1321,8 @@ ucdata_reload(char *paths, int masks)
error |= _ucnumb_load(paths, 1) < 0 ? UCDATA_NUM : 0;
if (masks & UCDATA_COMP)
error |= _uccomp_load(paths, 1) < 0 ? UCDATA_COMP : 0;
if (masks & UCDATA_KDECOMP)
error |= _uckdcmp_load(paths, 1) < 0 ? UCDATA_KDECOMP : 0;
return -error;
}

View file

@ -249,6 +249,14 @@ LDAP_LUNICODE_F (int)
ucdecomp LDAP_P((unsigned long code, unsigned long *num,
unsigned long **decomp));
/*
* Equivalent to ucdecomp() except that it includes compatibility
* decompositions.
*/
LDAP_LUNICODE_F (int)
uckdecomp LDAP_P((unsigned long code, unsigned long *num,
unsigned long **decomp));
/*
* If the code is a Hangul syllable, this routine decomposes it into the array
* passed. The array size should be at least 3.
@ -267,6 +275,14 @@ LDAP_LUNICODE_F (int)
uccanondecomp LDAP_P((const unsigned long *in, int inlen,
unsigned long **out, int *outlen));
/*
* Equivalent to uccanondecomp() except that it includes compatibility
* decompositions.
*/
LDAP_LUNICODE_F (int)
uccompatdecomp LDAP_P((const unsigned long *in, int inlen,
unsigned long **out, int *outlen));
/**************************************************************************
*
* Functions for getting combining classes.
@ -318,9 +334,10 @@ LDAP_LUNICODE_F (int) ucgetdigit LDAP_P((unsigned long code));
#define UCDATA_CMBCL 0x08
#define UCDATA_NUM 0x10
#define UCDATA_COMP 0x20
#define UCDATA_KDECOMP 0x40
#define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\
UCDATA_CMBCL|UCDATA_NUM|UCDATA_COMP)
UCDATA_CMBCL|UCDATA_NUM|UCDATA_COMP|UCDATA_KDECOMP)
/*
* Functions to load, unload, and reload specific data files.

View file

@ -108,12 +108,17 @@ typedef struct {
/*
* List of decomposition. Created and expanded in order as the characters are
* encountered.
* encountered. First list contains canonical mappings, second also includes
* compatibility mappings.
*/
static _decomp_t *decomps;
static unsigned long decomps_used;
static unsigned long decomps_size;
static _decomp_t *kdecomps;
static unsigned long kdecomps_used;
static unsigned long kdecomps_size;
/*
* Composition exclusion table stuff.
*/
@ -420,41 +425,56 @@ ordered_range_insert(unsigned long c, char *name, int len)
}
static void
add_decomp(unsigned long code)
add_decomp(unsigned long code, short compat)
{
unsigned long i, j, size;
_decomp_t **pdecomps;
unsigned long *pdecomps_used;
unsigned long *pdecomps_size;
if (compat) {
pdecomps = &kdecomps;
pdecomps_used = &kdecomps_used;
pdecomps_size = &kdecomps_size;
} else {
pdecomps = &decomps;
pdecomps_used = &decomps_used;
pdecomps_size = &decomps_size;
}
/*
* Add the code to the composite property.
*/
ordered_range_insert(code, "Cm", 2);
if (!compat) {
ordered_range_insert(code, "Cm", 2);
}
/*
* Locate the insertion point for the code.
*/
for (i = 0; i < decomps_used && code > decomps[i].code; i++) ;
for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ;
/*
* Allocate space for a new decomposition.
*/
if (decomps_used == decomps_size) {
if (decomps_size == 0)
decomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
if (*pdecomps_used == *pdecomps_size) {
if (*pdecomps_size == 0)
*pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
else
decomps = (_decomp_t *)
realloc((char *) decomps,
sizeof(_decomp_t) * (decomps_size + 8));
(void) memset((char *) (decomps + decomps_size), '\0',
*pdecomps = (_decomp_t *)
realloc((char *) *pdecomps,
sizeof(_decomp_t) * (*pdecomps_size + 8));
(void) memset((char *) (*pdecomps + *pdecomps_size), '\0',
sizeof(_decomp_t) << 3);
decomps_size += 8;
*pdecomps_size += 8;
}
if (i < decomps_used && code != decomps[i].code) {
if (i < *pdecomps_used && code != (*pdecomps)[i].code) {
/*
* Shift the decomps up by one if the codes don't match.
*/
for (j = decomps_used; j > i; j--)
(void) AC_MEMCPY((char *) &decomps[j], (char *) &decomps[j - 1],
for (j = *pdecomps_used; j > i; j--)
(void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1],
sizeof(_decomp_t));
}
@ -462,30 +482,30 @@ add_decomp(unsigned long code)
* Insert or replace a decomposition.
*/
size = dectmp_size + (4 - (dectmp_size & 3));
if (decomps[i].size < size) {
if (decomps[i].size == 0)
decomps[i].decomp = (unsigned long *)
if ((*pdecomps)[i].size < size) {
if ((*pdecomps)[i].size == 0)
(*pdecomps)[i].decomp = (unsigned long *)
malloc(sizeof(unsigned long) * size);
else
decomps[i].decomp = (unsigned long *)
realloc((char *) decomps[i].decomp,
(*pdecomps)[i].decomp = (unsigned long *)
realloc((char *) (*pdecomps)[i].decomp,
sizeof(unsigned long) * size);
decomps[i].size = size;
(*pdecomps)[i].size = size;
}
if (decomps[i].code != code)
decomps_used++;
if ((*pdecomps)[i].code != code)
(*pdecomps_used)++;
decomps[i].code = code;
decomps[i].used = dectmp_size;
(void) AC_MEMCPY((char *) decomps[i].decomp, (char *) dectmp,
(*pdecomps)[i].code = code;
(*pdecomps)[i].used = dectmp_size;
(void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp,
sizeof(unsigned long) * dectmp_size);
/*
* NOTICE: This needs changing later so it is more general than simply
* pairs. This calculation is done here to simplify allocation elsewhere.
*/
if (dectmp_size == 2)
if (!compat && dectmp_size == 2)
comps_used++;
}
@ -780,7 +800,7 @@ static void
read_cdata(FILE *in)
{
unsigned long i, lineno, skip, code, ccl_code;
short wnum, neg, number[2];
short wnum, neg, number[2], compat;
char line[512], *s, *e;
lineno = skip = 0;
@ -933,7 +953,14 @@ read_cdata(FILE *in)
* Check for a decomposition.
*/
s = ++e;
if (*s != ';' && *s != '<') {
if (*s != ';') {
compat = *s == '<';
if (compat) {
/*
* Skip compatibility formatting tag.
*/
while (*s++ != '>');
}
/*
* Collect the codes of the decomposition.
*/
@ -942,7 +969,7 @@ read_cdata(FILE *in)
* Skip all leading non-hex digits.
*/
while (!ishdigit(*s))
s++;
s++;
for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
dectmp[dectmp_size] <<= 4;
@ -960,8 +987,12 @@ read_cdata(FILE *in)
* If there are any codes in the temporary decomposition array,
* then add the character with its decomposition.
*/
if (dectmp_size > 0)
add_decomp(code);
if (dectmp_size > 0) {
if (!compat) {
add_decomp(code, 0);
}
add_decomp(code, 1);
}
}
/*
@ -1052,33 +1083,35 @@ read_cdata(FILE *in)
}
static _decomp_t *
find_decomp(unsigned long code)
find_decomp(unsigned long code, short compat)
{
long l, r, m;
_decomp_t *decs;
l = 0;
r = decomps_used - 1;
r = (compat ? kdecomps_used : decomps_used) - 1;
decs = compat ? kdecomps : decomps;
while (l <= r) {
m = (l + r) >> 1;
if (code > decomps[m].code)
if (code > decs[m].code)
l = m + 1;
else if (code < decomps[m].code)
else if (code < decs[m].code)
r = m - 1;
else
return &decomps[m];
return &decs[m];
}
return 0;
}
static void
decomp_it(_decomp_t *d)
decomp_it(_decomp_t *d, short compat)
{
unsigned long i;
_decomp_t *dp;
for (i = 0; i < d->used; i++) {
if ((dp = find_decomp(d->decomp[i])) != 0)
decomp_it(dp);
if ((dp = find_decomp(d->decomp[i], compat)) != 0)
decomp_it(dp, compat);
else
dectmp[dectmp_size++] = d->decomp[i];
}
@ -1095,9 +1128,16 @@ expand_decomp(void)
for (i = 0; i < decomps_used; i++) {
dectmp_size = 0;
decomp_it(&decomps[i]);
decomp_it(&decomps[i], 0);
if (dectmp_size > 0)
add_decomp(decomps[i].code);
add_decomp(decomps[i].code, 0);
}
for (i = 0; i < kdecomps_used; i++) {
dectmp_size = 0;
decomp_it(&kdecomps[i], 1);
if (dectmp_size > 0)
add_decomp(kdecomps[i].code, 1);
}
}
@ -1402,6 +1442,60 @@ write_cdata(char *opath)
fclose(out);
}
/*
* Open the kdecomp.dat file.
*/
sprintf(path, "%s%skdecomp.dat", opath, LDAP_DIRSEP);
if ((out = fopen(path, "wb")) == 0)
return;
hdr[1] = kdecomps_used;
/*
* Write the header.
*/
fwrite((char *) hdr, sizeof(unsigned short), 2, out);
/*
* Write a temporary byte count which will be calculated as the
* decompositions are written out.
*/
bytes = 0;
fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
if (kdecomps_used) {
/*
* Write the list of kdecomp nodes.
*/
for (i = idx = 0; i < kdecomps_used; i++) {
fwrite((char *) &kdecomps[i].code, sizeof(unsigned long), 1, out);
fwrite((char *) &idx, sizeof(unsigned long), 1, out);
idx += kdecomps[i].used;
}
/*
* Write the sentinel index as the last decomp node.
*/
fwrite((char *) &idx, sizeof(unsigned long), 1, out);
/*
* Write the decompositions themselves.
*/
for (i = 0; i < kdecomps_used; i++)
fwrite((char *) kdecomps[i].decomp, sizeof(unsigned long),
kdecomps[i].used, out);
/*
* Seek back to the beginning and write the byte count.
*/
bytes = (sizeof(unsigned long) * idx) +
(sizeof(unsigned long) * ((hdr[1] << 1) + 1));
fseek(out, sizeof(unsigned short) << 1, 0L);
fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
fclose(out);
}
/*****************************************************************
*
* Generate the combining class data.

View file

@ -212,7 +212,7 @@ struct berval * UTF8bvnormalize(
p++;
}
/* normalize ucs of length p - ucs */
uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );
uccompatdecomp( ucs, p - ucs, &ucsout, &ucsoutlen );
if ( approx ) {
for ( j = 0; j < ucsoutlen; j++ ) {
if ( ucsout[j] < 0x80 ) {
@ -370,7 +370,7 @@ int UTF8bvnormcmp(
return l1 > l2 ? 1 : -1; /* what to do??? */
}
} else {
uccanondecomp( ucs, ulen, &ucsout1, &l1 );
uccompatdecomp( ucs, ulen, &ucsout1, &l1 );
l1 = uccanoncomp( ucsout1, l1 );
}
@ -389,7 +389,7 @@ int UTF8bvnormcmp(
ucsout2 = ucs;
l2 = ulen;
} else {
uccanondecomp( ucs, ulen, &ucsout2, &l2 );
uccompatdecomp( ucs, ulen, &ucsout2, &l2 );
l2 = uccanoncomp( ucsout2, l2 );
free( ucs );
}