mirror of
https://git.openldap.org/openldap/openldap.git
synced 2025-12-31 03:59:34 -05:00
Upgrading to UCData-2.5, this is just the docs
This commit is contained in:
parent
8478db9c08
commit
a92da36ed8
4 changed files with 128 additions and 13 deletions
|
|
@ -1,8 +1,8 @@
|
|||
#
|
||||
# $Id: README,v 1.32 1999/11/29 16:41:05 mleisher Exp $
|
||||
# $Id: README,v 1.33 2001/01/02 18:46:19 mleisher Exp $
|
||||
#
|
||||
|
||||
MUTT UCData Package 2.4
|
||||
MUTT UCData Package 2.5
|
||||
-----------------------
|
||||
|
||||
This is a package that supports ctype-like operations for Unicode UCS-2 text
|
||||
|
|
@ -17,9 +17,10 @@ The character information portion of the package consists of three parts:
|
|||
|
||||
A. case.dat - the case mappings.
|
||||
B. ctype.dat - the character property tables.
|
||||
C. decomp.dat - the character decompositions.
|
||||
D. cmbcl.dat - the non-zero combining classes.
|
||||
E. num.dat - the codes representing numbers.
|
||||
C. comp.dat - the character composition pairs.
|
||||
D. decomp.dat - the character decompositions.
|
||||
E. cmbcl.dat - the non-zero combining classes.
|
||||
F. num.dat - the codes representing numbers.
|
||||
|
||||
2. The "ucdata.[ch]" files which implement the functions needed to
|
||||
check to see if a character matches groups of properties, to map between
|
||||
|
|
@ -49,10 +50,10 @@ all decompositions that do not have tags such as "<compat>" or "<font>".
|
|||
|
||||
The data is almost all stored as unsigned longs (32-bits assumed) and the
|
||||
routines that load the data take care of endian swaps when necessary. This
|
||||
also means that surrogates (>= 0x10000) can be placed in the data files the
|
||||
"ucgendat" program parses.
|
||||
also means that supplementary characters (>= 0x10000) can be placed in the
|
||||
data files the "ucgendat" program parses.
|
||||
|
||||
The data is written as external files and broken into five parts so it can be
|
||||
The data is written as external files and broken into six parts so it can be
|
||||
selectively updated at runtime if necessary.
|
||||
|
||||
The data files currently generated from the "ucgendat" program total about 56K
|
||||
|
|
@ -83,6 +84,14 @@ Mark Leisher <mleisher@crl.nmsu.edu>
|
|||
|
||||
CHANGES
|
||||
=======
|
||||
Version 2.5
|
||||
-----------
|
||||
1. Changed the number lookup to set the denominator to 1 in cases of digits.
|
||||
This restores functional compatibility with John Cowan's UCType package.
|
||||
|
||||
2. Added support for the AL property.
|
||||
|
||||
3. Modified load and reload functions to return error codes.
|
||||
|
||||
Version 2.4
|
||||
-----------
|
||||
|
|
@ -298,3 +307,7 @@ incomplete decompositions to be generated by the "ucgendat" program.
|
|||
|
||||
Thanks go to Valeriy E. Ushakov <uwe@ptc.spbu.ru> for spotting an allocation
|
||||
error and an initialization error.
|
||||
|
||||
Thanks go to Stig Venaas <Stig.Venaas@uninett.no> for providing a patch to
|
||||
support return types on load and reload, and for major updates to handle
|
||||
canonical composition and decomposition.
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# $Id: api.txt,v 1.2 1999/11/19 15:24:29 mleisher Exp $
|
||||
# $Id: api.txt,v 1.3 2001/01/02 18:46:20 mleisher Exp $
|
||||
#
|
||||
|
||||
The MUTT UCData API
|
||||
|
|
@ -18,8 +18,9 @@ and ucdata_reload().
|
|||
#define UCDATA_DECOMP 0x04
|
||||
#define UCDATA_CMBCL 0x08
|
||||
#define UCDATA_NUM 0x10
|
||||
#define UCDATA_COMP 0x20
|
||||
#define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\
|
||||
UCDATA_CMBCL|UCDATA_NUM)
|
||||
UCDATA_CMBCL|UCDATA_NUM|UCDATA_COMP)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
void ucdata_load(char *paths, int masks)
|
||||
|
|
@ -79,6 +80,19 @@ int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
|
|||
putchar('\n');
|
||||
}
|
||||
|
||||
int uccanondecomp(const unsigned long *in, int inlen, unsigned long **out,
|
||||
int *outlen)
|
||||
|
||||
This function decomposes an input string and does canonical reordering of
|
||||
the characters at the same time.
|
||||
|
||||
If a -1 is returned, memory allocation was not successful. If a zero is
|
||||
returned, no decomposition occurred. Any other value means the output string
|
||||
contains the fully decomposed string in canonical order.
|
||||
|
||||
If the "outlen" parameter comes back with a value > 0, then the string
|
||||
returned in the "out" parameter needs to be deallocated by the caller.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
int ucdecomp_hangul(unsigned long code, unsigned long *num,
|
||||
|
|
@ -106,6 +120,30 @@ int ucdecomp_hangul(unsigned long code, unsigned long *num,
|
|||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
int uccomp(unsigned long ch1, unsigned long ch2, unsigned long *comp)
|
||||
|
||||
This function takes a pair of characters and determines if they combine to
|
||||
form another character.
|
||||
|
||||
If a zero is returned, no composition is formed by the character pair. Any
|
||||
other value indicates the "comp" parameter has a value.
|
||||
|
||||
int uccomp_hangul(unsigned long *str, int len)
|
||||
|
||||
This function composes the Hangul Jamo in the string. The composition is
|
||||
done in-place.
|
||||
|
||||
The return value provides the new length of the string. This will be
|
||||
smaller than "len" if compositions occurred.
|
||||
|
||||
int uccanoncomp(unsigned long *str, int len)
|
||||
|
||||
This function does a canonical composition of characters in the string.
|
||||
|
||||
The return value is the new length of the string.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
struct ucnumber {
|
||||
int numerator;
|
||||
int denominator;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# $Id: format.txt,v 1.1 1998/07/24 15:17:21 mleisher Exp $
|
||||
# $Id: format.txt,v 1.2 2001/01/02 18:46:20 mleisher Exp $
|
||||
#
|
||||
|
||||
CHARACTER DATA
|
||||
|
|
@ -159,6 +159,30 @@ The format for the binary form of these tables is:
|
|||
distribution of mappings may be more or less than 21845 per table, but only
|
||||
65536 are allowed.
|
||||
|
||||
COMPOSITIONS
|
||||
============
|
||||
|
||||
This data file is called "comp.dat" and contains data that tracks character
|
||||
pairs that have a single Unicode value representing the combination of the two
|
||||
characters.
|
||||
|
||||
The format for the binary form of this table is:
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short NumCompositionNodes, count of composition nodes
|
||||
unsigned long Bytes, total number of bytes used for composition nodes
|
||||
unsigned long CompositionNodes[NumCompositionNodes * 4]
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
||||
same way as described in the CHARACTER PROPERTIES section.
|
||||
|
||||
The CompositionNodes[] array consists of groups of 4 unsigned longs. The
|
||||
first of these is the character code representing the combination of two
|
||||
other character codes, the second records the number of character codes that
|
||||
make up the composition (not currently used), and the last two are the pair
|
||||
of character codes whose combination is represented by the character code in
|
||||
the first field.
|
||||
|
||||
DECOMPOSITIONS
|
||||
==============
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
.\"
|
||||
.\" $Id: ucdata.man,v 1.4 1999/11/19 16:08:33 mleisher Exp $
|
||||
.\" $Id: ucdata.man,v 1.5 2001/01/02 18:46:20 mleisher Exp $
|
||||
.\"
|
||||
.TH ucdata 3 "19 November 1999"
|
||||
.TH ucdata 3 "03 January 2001"
|
||||
.SH NAME
|
||||
ucdata \- package for providing Unicode/ISO10646 character information
|
||||
|
||||
|
|
@ -16,9 +16,17 @@ void ucdata_reload(char * paths, int masks)
|
|||
.sp
|
||||
int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
|
||||
.sp
|
||||
int uccanondecomp(const unsigned long *in, int inlen, unsigned long **out,
|
||||
int *outlen)
|
||||
.sp
|
||||
int ucdecomp_hangul(unsigned long code, unsigned long *num,
|
||||
unsigned long decomp[])
|
||||
.sp
|
||||
int uccomp(unsigned long ch1, unsigned long ch2, unsigned long *comp)
|
||||
.sp
|
||||
int uccomp_hangul(unsigned long *str, int len)
|
||||
.sp
|
||||
int uccanoncomp(unsigned long *str, int len)
|
||||
.nf
|
||||
struct ucnumber {
|
||||
int numerator;
|
||||
|
|
@ -203,6 +211,17 @@ Example call:
|
|||
putchar('\n');
|
||||
}
|
||||
.TP 4
|
||||
.BR uccanondecomp()
|
||||
This function will decompose a string, ensuring the characters are in
|
||||
canonical order for comparison.
|
||||
.sp
|
||||
If a decomposed string is returned, the caller is responsible for deallocating
|
||||
the string.
|
||||
.sp
|
||||
If a -1 is returned, memory allocation failed. If a zero is returned, no
|
||||
decomposition was done. Any other value means a decomposition string was
|
||||
created and returned in the `out' and `outlen' parameters.
|
||||
.TP 4
|
||||
.BR ucdecomp_hangul()
|
||||
This function determines if a Hangul syllable has a
|
||||
decomposition and returns the decomposition information.
|
||||
|
|
@ -226,6 +245,25 @@ Example call:
|
|||
putchar('\n');
|
||||
}
|
||||
.TP 4
|
||||
.BR uccomp()
|
||||
This function determines if a pair of characters have a composition, and
|
||||
returns that composition if one exists.
|
||||
.sp
|
||||
A zero is returned if no composition exists for the character pair. Any other
|
||||
value indicates the `comp' field holds the character code representing the
|
||||
composition of the two character codes.
|
||||
.TP 4
|
||||
.BR uccomp_hangul()
|
||||
This composes the Hangul Jamo in-place in the string.
|
||||
.sp
|
||||
The returned value is the new length of the string.
|
||||
.TP 4
|
||||
.BR uccanoncomp()
|
||||
This function does a full composition in-place in the string, including the
|
||||
Hangul composition.
|
||||
.sp
|
||||
The returned value is the new length of the string.
|
||||
.TP 4
|
||||
.BR ucnumber_lookup()
|
||||
This function determines if the code is a number and
|
||||
fills in the `num' field with the numerator and
|
||||
|
|
@ -453,6 +491,8 @@ Christophe Pierret <cpierret@businessobjects.com>
|
|||
Kent Johnson <kent@pondview.mv.com>
|
||||
.br
|
||||
Valeriy E. Ushakov <uwe@ptc.spbu.ru>
|
||||
.br
|
||||
Stig Venaas <Stig.Venaas@uninett.no>
|
||||
|
||||
.SH AUTHOR
|
||||
Mark Leisher
|
||||
|
|
|
|||
Loading…
Reference in a new issue