Initial revision

This commit is contained in:
Kurt Zeilenga 2000-01-25 22:38:34 +00:00
parent 9fec129997
commit fe98d9fa7b
22 changed files with 10372 additions and 0 deletions

View file

@ -0,0 +1,303 @@
#
# $Id: MUTTUCData.txt,v 1.3 1999/10/29 00:04:35 mleisher Exp $
#
# Copyright 1999 Computing Research Labs, New Mexico State University
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#
# Implementation specific character properties.
#
#
# Space, other.
#
0009;;Ss;;;;;;;;;;;;
000A;;Ss;;;;;;;;;;;;
000B;;Ss;;;;;;;;;;;;
000C;;Ss;;;;;;;;;;;;
000D;;Ss;;;;;;;;;;;;
#
# Non-breaking.
#
00A0;;Nb;;;;;;;;;;;;
2007;;Nb;;;;;;;;;;;;
2011;;Nb;;;;;;;;;;;;
FEFF;;Nb;;;;;;;;;;;;
#
# Symmetric.
#
0028;;Sy;;;;;;;;;;;;
0029;;Sy;;;;;;;;;;;;
005B;;Sy;;;;;;;;;;;;
005D;;Sy;;;;;;;;;;;;
007B;;Sy;;;;;;;;;;;;
007D;;Sy;;;;;;;;;;;;
00AB;;Sy;;;;;;;;;;;;
00BB;;Sy;;;;;;;;;;;;
0F3A;;Sy;;;;;;;;;;;;
0F3B;;Sy;;;;;;;;;;;;
0F3C;;Sy;;;;;;;;;;;;
0F3D;;Sy;;;;;;;;;;;;
0F3E;;Sy;;;;;;;;;;;;
0F3F;;Sy;;;;;;;;;;;;
2018;;Sy;;;;;;;;;;;;
2019;;Sy;;;;;;;;;;;;
201A;;Sy;;;;;;;;;;;;
201B;;Sy;;;;;;;;;;;;
201C;;Sy;;;;;;;;;;;;
201D;;Sy;;;;;;;;;;;;
201E;;Sy;;;;;;;;;;;;
201F;;Sy;;;;;;;;;;;;
2039;;Sy;;;;;;;;;;;;
203A;;Sy;;;;;;;;;;;;
2045;;Sy;;;;;;;;;;;;
2046;;Sy;;;;;;;;;;;;
207D;;Sy;;;;;;;;;;;;
207E;;Sy;;;;;;;;;;;;
208D;;Sy;;;;;;;;;;;;
208E;;Sy;;;;;;;;;;;;
2329;;Sy;;;;;;;;;;;;
232A;;Sy;;;;;;;;;;;;
3008;;Sy;;;;;;;;;;;;
3009;;Sy;;;;;;;;;;;;
300A;;Sy;;;;;;;;;;;;
300B;;Sy;;;;;;;;;;;;
300C;;Sy;;;;;;;;;;;;
300D;;Sy;;;;;;;;;;;;
300E;;Sy;;;;;;;;;;;;
300F;;Sy;;;;;;;;;;;;
3010;;Sy;;;;;;;;;;;;
3011;;Sy;;;;;;;;;;;;
3014;;Sy;;;;;;;;;;;;
3015;;Sy;;;;;;;;;;;;
3016;;Sy;;;;;;;;;;;;
3017;;Sy;;;;;;;;;;;;
3018;;Sy;;;;;;;;;;;;
3019;;Sy;;;;;;;;;;;;
301A;;Sy;;;;;;;;;;;;
301B;;Sy;;;;;;;;;;;;
301D;;Sy;;;;;;;;;;;;
301E;;Sy;;;;;;;;;;;;
301F;;Sy;;;;;;;;;;;;
FD3E;;Sy;;;;;;;;;;;;
FD3F;;Sy;;;;;;;;;;;;
FE35;;Sy;;;;;;;;;;;;
FE36;;Sy;;;;;;;;;;;;
FE37;;Sy;;;;;;;;;;;;
FE38;;Sy;;;;;;;;;;;;
FE39;;Sy;;;;;;;;;;;;
FE3A;;Sy;;;;;;;;;;;;
FE3B;;Sy;;;;;;;;;;;;
FE3C;;Sy;;;;;;;;;;;;
FE3D;;Sy;;;;;;;;;;;;
FE3E;;Sy;;;;;;;;;;;;
FE3F;;Sy;;;;;;;;;;;;
FE40;;Sy;;;;;;;;;;;;
FE41;;Sy;;;;;;;;;;;;
FE42;;Sy;;;;;;;;;;;;
FE43;;Sy;;;;;;;;;;;;
FE44;;Sy;;;;;;;;;;;;
FE59;;Sy;;;;;;;;;;;;
FE5A;;Sy;;;;;;;;;;;;
FE5B;;Sy;;;;;;;;;;;;
FE5C;;Sy;;;;;;;;;;;;
FE5D;;Sy;;;;;;;;;;;;
FE5E;;Sy;;;;;;;;;;;;
FF08;;Sy;;;;;;;;;;;;
FF09;;Sy;;;;;;;;;;;;
FF3B;;Sy;;;;;;;;;;;;
FF3D;;Sy;;;;;;;;;;;;
FF5B;;Sy;;;;;;;;;;;;
FF5D;;Sy;;;;;;;;;;;;
FF62;;Sy;;;;;;;;;;;;
FF63;;Sy;;;;;;;;;;;;
#
# Hex digit.
#
0030;;Hd;;;;;;;;;;;;
0031;;Hd;;;;;;;;;;;;
0032;;Hd;;;;;;;;;;;;
0033;;Hd;;;;;;;;;;;;
0034;;Hd;;;;;;;;;;;;
0035;;Hd;;;;;;;;;;;;
0036;;Hd;;;;;;;;;;;;
0037;;Hd;;;;;;;;;;;;
0038;;Hd;;;;;;;;;;;;
0039;;Hd;;;;;;;;;;;;
0041;;Hd;;;;;;;;;;;;
0042;;Hd;;;;;;;;;;;;
0043;;Hd;;;;;;;;;;;;
0044;;Hd;;;;;;;;;;;;
0045;;Hd;;;;;;;;;;;;
0046;;Hd;;;;;;;;;;;;
0061;;Hd;;;;;;;;;;;;
0062;;Hd;;;;;;;;;;;;
0063;;Hd;;;;;;;;;;;;
0064;;Hd;;;;;;;;;;;;
0065;;Hd;;;;;;;;;;;;
0066;;Hd;;;;;;;;;;;;
FF10;;Hd;;;;;;;;;;;;
FF11;;Hd;;;;;;;;;;;;
FF12;;Hd;;;;;;;;;;;;
FF13;;Hd;;;;;;;;;;;;
FF14;;Hd;;;;;;;;;;;;
FF15;;Hd;;;;;;;;;;;;
FF16;;Hd;;;;;;;;;;;;
FF17;;Hd;;;;;;;;;;;;
FF18;;Hd;;;;;;;;;;;;
FF19;;Hd;;;;;;;;;;;;
FF21;;Hd;;;;;;;;;;;;
FF22;;Hd;;;;;;;;;;;;
FF23;;Hd;;;;;;;;;;;;
FF24;;Hd;;;;;;;;;;;;
FF25;;Hd;;;;;;;;;;;;
FF26;;Hd;;;;;;;;;;;;
FF41;;Hd;;;;;;;;;;;;
FF42;;Hd;;;;;;;;;;;;
FF43;;Hd;;;;;;;;;;;;
FF44;;Hd;;;;;;;;;;;;
FF45;;Hd;;;;;;;;;;;;
FF46;;Hd;;;;;;;;;;;;
#
# Quote marks.
#
0022;;Qm;;;;;;;;;;;;
0027;;Qm;;;;;;;;;;;;
00AB;;Qm;;;;;;;;;;;;
00BB;;Qm;;;;;;;;;;;;
2018;;Qm;;;;;;;;;;;;
2019;;Qm;;;;;;;;;;;;
201A;;Qm;;;;;;;;;;;;
201B;;Qm;;;;;;;;;;;;
201C;;Qm;;;;;;;;;;;;
201D;;Qm;;;;;;;;;;;;
201E;;Qm;;;;;;;;;;;;
201F;;Qm;;;;;;;;;;;;
2039;;Qm;;;;;;;;;;;;
203A;;Qm;;;;;;;;;;;;
300C;;Qm;;;;;;;;;;;;
300D;;Qm;;;;;;;;;;;;
300E;;Qm;;;;;;;;;;;;
300F;;Qm;;;;;;;;;;;;
301D;;Qm;;;;;;;;;;;;
301E;;Qm;;;;;;;;;;;;
301F;;Qm;;;;;;;;;;;;
FE41;;Qm;;;;;;;;;;;;
FE42;;Qm;;;;;;;;;;;;
FE43;;Qm;;;;;;;;;;;;
FE44;;Qm;;;;;;;;;;;;
FF02;;Qm;;;;;;;;;;;;
FF07;;Qm;;;;;;;;;;;;
FF62;;Qm;;;;;;;;;;;;
FF63;;Qm;;;;;;;;;;;;
#
# Special Devanagari forms
#
E900;DEVANAGARI KSHA LIGATURE;Lo;0;L;0915 094D 0937;;;;N;;;;;
E901;DEVANAGARI GNYA LIGATURE;Lo;0;L;091C 094D 091E;;;;N;;;;;
E902;DEVANAGARI TTA LIGATURE;Lo;0;L;0924 094D 0924;;;;N;;;;;
E903;DEVANAGARI TRA LIGATURE;Lo;0;L;0924 094D 0930;;;;N;;;;;
E904;DEVANAGARI SHCHA LIGATURE;Lo;0;L;0936 094D 091B;;;;N;;;;;
E905;DEVANAGARI SHRA LIGATURE;Lo;0;L;0936 094D 0930;;;;N;;;;;
E906;DEVANAGARI SHVA LIGATURE;Lo;0;L;0936 094D 0935;;;;N;;;;;
E907;DEVANAGARI KRA LIGATURE;Lo;0;L;;;;;N;;;;;
E908;DEVANAGARI JRA LIGATURE;Lo;0;L;;;;;N;;;;;
E909;DEVANAGARI ZRA LIGATURE;Lo;0;L;;;;;N;;;;;
E90A;DEVANAGARI PHRA LIGATURE;Lo;0;L;;;;;N;;;;;
E90B;DEVANAGARI FRA LIGATURE;Lo;0;L;;;;;N;;;;;
E90C;DEVANAGARI PRA LIGATURE;Lo;0;L;;;;;N;;;;;
E90D;DEVANAGARI SRA LIGATURE;Lo;0;L;;;;;N;;;;;
E90E;DEVANAGARI RU LIGATURE;Lo;0;L;;;;;N;;;;;
E90F;DEVANAGARI RUU LIGATURE;Lo;0;L;;;;;N;;;;;
E915;DEVANAGARI HALF LETTER KA;Lo;0;L;;;;;N;;;;;
E916;DEVANAGARI HALF LETTER KHA;Lo;0;L;;;;;N;;;;;
E917;DEVANAGARI HALF LETTER GA;Lo;0;L;;;;;N;;;;;
E918;DEVANAGARI HALF LETTER GHA;Lo;0;L;;;;;N;;;;;
E919;DEVANAGARI HALF LETTER NGA;Lo;0;L;;;;;N;;;;;
E91A;DEVANAGARI HALF LETTER CA;Lo;0;L;;;;;N;;;;;
E91B;DEVANAGARI HALF LETTER CHA;Lo;0;L;;;;;N;;;;;
E91C;DEVANAGARI HALF LETTER JA;Lo;0;L;;;;;N;;;;;
E91D;DEVANAGARI HALF LETTER JHA;Lo;0;L;;;;;N;;;;;
E91E;DEVANAGARI HALF LETTER NYA;Lo;0;L;;;;;N;;;;;
E91F;DEVANAGARI HALF LETTER TTA;Lo;0;L;;;;;N;;;;;
E920;DEVANAGARI HALF LETTER TTHA;Lo;0;L;;;;;N;;;;;
E921;DEVANAGARI HALF LETTER DDA;Lo;0;L;;;;;N;;;;;
E922;DEVANAGARI HALF LETTER DDHA;Lo;0;L;;;;;N;;;;;
E923;DEVANAGARI HALF LETTER NNA;Lo;0;L;;;;;N;;;;;
E924;DEVANAGARI HALF LETTER TA;Lo;0;L;;;;;N;;;;;
E925;DEVANAGARI HALF LETTER THA;Lo;0;L;;;;;N;;;;;
E926;DEVANAGARI HALF LETTER DA;Lo;0;L;;;;;N;;;;;
E927;DEVANAGARI HALF LETTER DHA;Lo;0;L;;;;;N;;;;;
E928;DEVANAGARI HALF LETTER NA;Lo;0;L;;;;;N;;;;;
E929;DEVANAGARI HALF LETTER NNNA;Lo;0;L;0928 093C;;;;N;;;;;
E92A;DEVANAGARI HALF LETTER PA;Lo;0;L;;;;;N;;;;;
E92B;DEVANAGARI HALF LETTER PHA;Lo;0;L;;;;;N;;;;;
E92C;DEVANAGARI HALF LETTER BA;Lo;0;L;;;;;N;;;;;
E92D;DEVANAGARI HALF LETTER BHA;Lo;0;L;;;;;N;;;;;
E92E;DEVANAGARI HALF LETTER MA;Lo;0;L;;;;;N;;;;;
E92F;DEVANAGARI HALF LETTER YA;Lo;0;L;;;;;N;;;;;
E930;DEVANAGARI HALF LETTER RA;Lo;0;L;;;;;N;;;;;
E931;DEVANAGARI HALF LETTER RRA;Lo;0;L;0930 093C;;;;N;;;;;
E932;DEVANAGARI HALF LETTER LA;Lo;0;L;;;;;N;;;;;
E933;DEVANAGARI HALF LETTER LLA;Lo;0;L;;;;;N;;;;;
E934;DEVANAGARI HALF LETTER LLLA;Lo;0;L;0933 093C;;;;N;;;;;
E935;DEVANAGARI HALF LETTER VA;Lo;0;L;;;;;N;;;;;
E936;DEVANAGARI HALF LETTER SHA;Lo;0;L;;;;;N;;;;;
E937;DEVANAGARI HALF LETTER SSA;Lo;0;L;;;;;N;;;;;
E938;DEVANAGARI HALF LETTER SA;Lo;0;L;;;;;N;;;;;
E939;DEVANAGARI HALF LETTER HA;Lo;0;L;;;;;N;;;;;
E940;DEVANAGARI KKA LIGATURE;Lo;0;L;0915 094D 0915;;;;N;;;;;
E941;DEVANAGARI KTA LIGATURE;Lo;0;L;0915 094D 0924;;;;N;;;;;
E942;DEVANAGARI NGKA LIGATURE;Lo;0;L;0919 094D 0915;;;;N;;;;;
E943;DEVANAGARI NGKHA LIGATURE;Lo;0;L;0919 094D 0916;;;;N;;;;;
E944;DEVANAGARI NGGA LIGATURE;Lo;0;L;0919 094D 0917;;;;N;;;;;
E945;DEVANAGARI NGGHA LIGATURE;Lo;0;L;0919 094D 0918;;;;N;;;;;
E946;DEVANAGARI NYJA LIGATURE;Lo;0;L;091E 094D 091C;;;;N;;;;;
E947;DEVANAGARI DGHA LIGATURE;Lo;0;L;0926 094D 0918;;;;N;;;;;
E948;DEVANAGARI DDA LIGATURE;Lo;0;L;0926 094D 0926;;;;N;;;;;
E949;DEVANAGARI DDHA LIGATURE;Lo;0;L;0926 094D 0927;;;;N;;;;;
E94A;DEVANAGARI DBA LIGATURE;Lo;0;L;0926 094D 092C;;;;N;;;;;
E94B;DEVANAGARI DBHA LIGATURE;Lo;0;L;0926 094D 092D;;;;N;;;;;
E94C;DEVANAGARI DMA LIGATURE;Lo;0;L;0926 094D 092E;;;;N;;;;;
E94D;DEVANAGARI DYA LIGATURE;Lo;0;L;0926 094D 092F;;;;N;;;;;
E94E;DEVANAGARI DVA LIGATURE;Lo;0;L;0926 094D 0935;;;;N;;;;;
E94F;DEVANAGARI TT-TTA LIGATURE;Lo;0;L;091F 094D 091F;;;;N;;;;;
E950;DEVANAGARI TT-TTHA LIGATURE;Lo;0;L;091F 094D 0920;;;;N;;;;;
E951;DEVANAGARI TTH-TTHA LIGATURE;Lo;0;L;0920 094D 0920;;;;N;;;;;
E952;DEVANAGARI DD-GA LIGATURE;Lo;0;L;0921 094D 0917;;;;N;;;;;
E953;DEVANAGARI DD-DDA LIGATURE;Lo;0;L;0921 094D 0921;;;;N;;;;;
E954;DEVANAGARI DD-DDHA LIGATURE;Lo;0;L;0921 094D 0922;;;;N;;;;;
E955;DEVANAGARI NNA LIGATURE;Lo;0;L;0928 094D 0928;;;;N;;;;;
E956;DEVANAGARI HMA LIGATURE;Lo;0;L;0939 094D 092E;;;;N;;;;;
E957;DEVANAGARI HYA LIGATURE;Lo;0;L;0939 094D 092F;;;;N;;;;;
E958;DEVANAGARI HLA LIGATURE;Lo;0;L;0939 094D 0932;;;;N;;;;;
E959;DEVANAGARI HVA LIGATURE;Lo;0;L;0939 094D 0935;;;;N;;;;;
E95A;DEVANAGARI STRA LIGATURE;Lo;0;L;0938 094D 0924 094D 0930;;;;N;;;;;
E970;DEVANAGARI HALF KSHA LIGATURE;Lo;0;L;0915 094D 0937;;;;N;;;;;
E971;DEVANAGARI HALF GNYA LIGATURE;Lo;0;L;091C 094D 091E;;;;N;;;;;
E972;DEVANAGARI HALF TTA LIGATURE;Lo;0;L;0924 094D 0924;;;;N;;;;;
E973;DEVANAGARI HALF TRA LIGATURE;Lo;0;L;0924 094D 0930;;;;N;;;;;
E974;DEVANAGARI HALF SHCHA LIGATURE;Lo;0;L;0936 094D 091B;;;;N;;;;;
E975;DEVANAGARI HALF SHRA LIGATURE;Lo;0;L;0936 094D 0930;;;;N;;;;;
E976;DEVANAGARI HALF SHVA LIGATURE;Lo;0;L;0936 094D 0935;;;;N;;;;;
E97B;DEVANAGARI SIGN RRA-REPHA;Mn;36;L;;;;;N;;;;;
E97C;DEVANAGARI HAR LIGATURE;Lo;0;L;0939 0943;;;;N;;;;;
E97D;DEVANAGARI SIGN EYELASH RA;Lo;0;L;;;;;N;;;;;
E97E;DEVANAGARI SIGN REPHA;Mn;36;L;;;;;N;;;;;
E97F;DEVANAGARI SIGN SUBJOINED RA;Mn;36;L;;;;;N;;;;;

View file

@ -0,0 +1,300 @@
#
# $Id: README,v 1.32 1999/11/29 16:41:05 mleisher Exp $
#
MUTT UCData Package 2.4
-----------------------
This is a package that supports ctype-like operations for Unicode UCS-2 text
(and surrogates), case mapping, decomposition lookup, and provides a
bidirectional reordering algorithm. To use it, you will need to get the
latest "UnicodeData-*.txt" file from the Unicode Web or FTP site.
The character information portion of the package consists of three parts:
1. A program called "ucgendat" which generates five data files from the
UnicodeData-*.txt file. The files are:
A. case.dat - the case mappings.
B. ctype.dat - the character property tables.
C. decomp.dat - the character decompositions.
D. cmbcl.dat - the non-zero combining classes.
E. num.dat - the codes representing numbers.
2. The "ucdata.[ch]" files which implement the functions needed to
check to see if a character matches groups of properties, to map between
upper, lower, and title case, to look up the decomposition of a
character, look up the combining class of a character, and get the number
value of a character.
3. The UCData.java class which provides the same API (with minor changes for
the numbers) and loads the same binary data files as the C code.
A short reference to the functions available is in the "api.txt" file.
Techie Details
==============
The "ucgendat" program parses files from the command line which are all in the
Unicode Character Database (UCDB) format. An additional properties file,
"MUTTUCData.txt", provides some extra properties for some characters.
The program looks for the two character properties fields (2 and 4), the
combining class field (3), the decomposition field (5), the numeric value
field (8), and the case mapping fields (12, 13, and 14). The decompositions
are recursively expanded before being written out.
The decomposition table contains all the canonical decompositions. This means
all decompositions that do not have tags such as "<compat>" or "<font>".
The data is almost all stored as unsigned longs (32-bits assumed) and the
routines that load the data take care of endian swaps when necessary. This
also means that surrogates (>= 0x10000) can be placed in the data files the
"ucgendat" program parses.
The data is written as external files and broken into five parts so it can be
selectively updated at runtime if necessary.
The data files currently generated from the "ucgendat" program total about 56K
in size all together.
The format of the binary data files is documented in the "format.txt" file.
==========================================================================
The "Pretty Good Bidi Algorithm"
--------------------------------
This routine provides an alternative to the Unicode Bidi algorithm. The
difference is that this version of the PGBA does not handle the explicit
directional codes (LRE, RLE, LRO, RLO, PDF). It should now produce the same
results as the Unicode BiDi algorithm for implicit reordering. Included are
functions for doing cursor motion in both logical and visual order.
This implementation is provided to demonstrate an effective alternate method
for implicit reordering. To make this useful for an application, it probably
needs some changes to the memory allocation and deallocation, as well as data
structure additions for rendering.
Mark Leisher <mleisher@crl.nmsu.edu>
19 November 1999
-----------------------------------------------------------------------------
CHANGES
=======
Version 2.4
-----------
1. Improved some bidi algorithm documentation in the code.
2. Fixed a code mixup that produced a non-working version.
Version 2.3
-----------
1. Fixed a misspelling in the ucpgba.h header file.
2. Fixed a bug which caused trailing weak non-digit sequences to be left out of
the reordered string in the bidi algorithm.
3. Fixed a problem with weak sequences containing non-spacing marks in the
bidi algorithm.
4. Fixed a problem with text runs of the opposite direction of the string
surrounding a weak + neutral text run appearing in the wrong order in the
bidi algorithm.
5. Added a default overall direction parameter to the reordering function for
cases of strings with no strong directional characters in the bidi
algorithm.
6. The bidi API documentation was improved.
7. Added a man page for the bidi API.
Version 2.2
-----------
1. Fixed a problem with the bidi algorithm locating directional section
boundaries.
2. Fixed a problem with the bidi algorithm starting the reordering correctly.
3. Fixed a problem with the bidi algorithm determining end boundaries for LTR
segments.
4. Fixed a problem with the bidi algorithm reordering weak (digits and number
separators) segments.
5. Added automatic switching of symmetrically paired characters when
reversing RTL segments.
6. Added a missing symmetric character to the extra character properties in
MUTTUCData.txt.
7. Added support for doing logical and visual cursor traversal.
Version 2.1
-----------
1. Updated the ucgendat program to handle the Unicode 3.0 character database
properties. The AL and BN bidi properties get marked as strong RTL and
Other Neutral respectively; the NSM, LRE, RLE, PDF, LRO, and RLO controls all
get marked as Other Neutral.
2. Fixed some problems with testing against signed values in the UCData.java
code and some minor cleanup.
3. Added the "Pretty Good Bidi Algorithm."
Version 2.0
-----------
1. Removed the old Java stuff for a new class that loads directly from the
same data files as the C code does.
2. Fixed a problem with choosing the correct field when mapping case.
3. Adjusted some search routines to start their search in the correct position.
4. Moved the copyright year to 1999.
Version 1.9
-----------
1. Fixed a problem with an incorrect amount of storage being allocated for the
combining class nodes.
2. Fixed an invalid initialization in the number code.
3. Changed the Java template file formatting a bit.
4. Added tables and function for getting decompositions in the Java class.
Version 1.8
-----------
1. Fixed a problem with adding certain ranges.
2. Added two more macros for testing for identifiers.
3. Tested with the UnicodeData-2.1.5.txt file.
Version 1.7
-----------
1. Fixed a problem with looking up decompositions in "ucgendat."
Version 1.6
-----------
1. Added two new properties introduced with UnicodeData-2.1.4.txt.
2. Changed the "ucgendat.c" program a little to automatically align the
property data on a 4-byte boundary when new properties are added.
3. Changed the "ucgendat.c" program to only generate canonical
decompositions.
4. Added two new macros ucisinitialpunct() and ucisfinalpunct() to check for
initial and final punctuation characters.
5. Minor additions and changes to the documentation.
Version 1.5
-----------
1. Changed all file open calls to include binary mode with "b" for DOS/WIN
platforms.
2. Wrapped the unistd.h include so it won't be included when compiled under
Win32.
3. Fixed a bad range check for hex digits in ucgendat.c.
4. Fixed a bad endian swap for combining classes.
5. Added code to make a number table and associated lookup functions.
Functions added are ucnumber(), ucdigit(), and ucgetnumber(). The last
function is to maintain compatibility with John Cowan's "uctype" package.
Version 1.4
-----------
1. Fixed a bug with adding a range.
2. Fixed a bug with inserting a range in order.
3. Fixed incorrectly specified ucisdefined() and ucisundefined() macros.
4. Added the missing unload for the combining class data.
5. Fixed a bad macro placement in ucisweak().
Version 1.3
-----------
1. Bug with case mapping calculations fixed.
2. Bug with empty character property entries fixed.
3. Bug with incorrect type in the combining class lookup fixed.
4. Some corrections done to api.txt.
5. Bug in certain character property lookups fixed.
6. Added a character property table that records the defined characters.
7. Replaced ucisunknown() with ucisdefined() and ucisundefined().
Version 1.2
-----------
1. Added code to ucgendat to generate a combining class table.
2. Fixed an endian problem with the byte count of decompositions.
3. Fixed some minor problems in the "format.txt" file.
4. Removed some bogus "Ss" values from MUTTUCData.txt file.
5. Added API function to get combining class.
6. Changed the open mode to "rb" so binary data files will be opened correctly
on DOS/WIN as well as other platforms.
7. Added the "api.txt" file.
Version 1.1
-----------
1. Added ucisxdigit() which I overlooked.
2. Added UC_LT to the ucisalpha() macro which I overlooked.
3. Change uciscntrl() to include UC_CF.
4. Added ucisisocntrl() and ucisfmtcntrl() macros.
5. Added a ucisblank() which I overlooked.
6. Added missing properties to ucissymbol() and ucisnumber().
7. Added ucisgraph() and ucisprint().
8. Changed the "Mr" property to "Sy" to mark this subset of mirroring
characters as symmetric to avoid trampling the Unicode/ISO10646 sense of
mirroring.
9. Added another property called "Ss" which includes control characters
traditionally seen as spaces in the isspace() macro.
10. Added a bunch of macros to be API compatible with John Cowan's package.
ACKNOWLEDGEMENTS
================
Thanks go to John Cowan <cowan@locke.ccil.org> for pointing out lots of
missing things and giving me stuff, particularly a bunch of new macros.
Thanks go to Bob Verbrugge <bob_verbrugge@nl.compuware.com> for pointing out
various bugs.
Thanks go to Christophe Pierret <cpierret@businessobjects.com> for pointing
out that file modes need to have "b" for DOS/WIN machines, pointing out
unistd.h is not a Win 32 header, and pointing out a problem with ucisalnum().
Thanks go to Kent Johnson <kent@pondview.mv.com> for finding a bug that caused
incomplete decompositions to be generated by the "ucgendat" program.
Thanks go to Valeriy E. Ushakov <uwe@ptc.spbu.ru> for spotting an allocation
error and an initialization error.

View file

@ -0,0 +1,935 @@
/*
* $Id: UCData.java,v 1.2 1999/10/07 20:49:56 mleisher Exp $
*
* Copyright 1999 Computing Research Labs, New Mexico State University
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
import java.io.*;
import java.net.*;
public class UCData {
//
// Scratch state shared by all the loaders: load_file() fills buffer and sets
// bytes, endian and buffpos; getInt()/getShort() then decode values from
// buffer starting at buffpos.
//
private static byte[] buffer;
// When true, getInt()/getShort() treat the first byte read as the low byte.
private static boolean endian;
// bytes: number of bytes obtained by load_file(); buffpos: next index to decode.
private static int bytes, buffpos;
//
// Do the static initialization.
//
static {
// Initial 24576-byte buffer, reused for every data file that is loaded.
buffer = new byte[24576];
}
//
// Read the complete stream into the shared buffer and work out the byte
// order of the data from the two-byte mark at the start.
//
// Returns true when at least the two mark bytes were read, false on an I/O
// error or when the stream is shorter than that.  On success "bytes" holds
// the number of bytes read and "buffpos" is positioned just past the mark.
//
private static boolean load_file(InputStream in) {
    buffpos = 0;
    bytes = 0;
    try {
        int got;
        //
        // A single read() is allowed to return fewer bytes than the stream
        // holds, so keep reading until end of stream.  The buffer is doubled
        // whenever it fills up so the requested length is never zero.
        //
        while ((got = in.read(buffer, bytes, buffer.length - bytes)) != -1) {
            bytes += got;
            if (bytes == buffer.length) {
                byte[] bigger = new byte[buffer.length << 1];
                System.arraycopy(buffer, 0, bigger, 0, bytes);
                buffer = bigger;
            }
        }
    } catch (IOException e) {
        return false;
    }
    //
    // Do not look at the mark unless it was actually read.
    //
    if (bytes < 2)
        return false;
    //
    // getInt()/getShort() read the low byte first when "endian" is set, so
    // it has to be set when the mark 0xFEFF is stored low byte first, i.e.
    // as the bytes FF FE.  The previous test compared both bytes with FE,
    // which can never match a byte order mark.
    // NOTE(review): assumes the generator writes 0xFEFF in its native byte
    // order -- confirm against the data file format description.
    //
    endian = (buffer[0] == -1 && buffer[1] == -2);
    buffpos = 2;
    return true;
}
//
// Decode the next four bytes of the buffer as a 32-bit value and advance
// buffpos past them.  When "endian" is set the first byte is the least
// significant one, otherwise it is the most significant one.
//
private static int getInt() {
    int first = buffer[buffpos++] & 0xff;
    int second = buffer[buffpos++] & 0xff;
    int third = buffer[buffpos++] & 0xff;
    int fourth = buffer[buffpos++] & 0xff;

    if (endian)
        return (fourth << 24) | (third << 16) | (second << 8) | first;
    return (first << 24) | (second << 16) | (third << 8) | fourth;
}
//
// Reposition the buffer at "from" and decode a 32-bit value there.
//
private static int getInt(int from) {
    buffpos = from;
    int value = getInt();
    return value;
}
//
// Decode the next two bytes of the buffer as a 16-bit value and advance
// buffpos past them.  When "endian" is set the first byte is the low byte,
// otherwise it is the high byte.  The result is a signed short, so the bit
// pattern 0xFFFF comes back as -1.
//
private static short getShort() {
    int first = buffer[buffpos++] & 0xff;
    int second = buffer[buffpos++] & 0xff;

    int combined = endian ? ((second << 8) | first) : ((first << 8) | second);
    return (short) combined;
}
//
// Reposition the buffer at "from" and decode a 16-bit value there.
//
private static short getShort(int from) {
    buffpos = from;
    short value = getShort();
    return value;
}
/**********************************************************************
*
* Character type info section.
*
**********************************************************************/
//
// Single-bit masks indexed by bit position.  ucisprop() tests
// masks32[i & 31] against a caller mask to decide whether property number i
// was requested.
//
private static int masks32[] = {
0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020,
0x00000040, 0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800,
0x00001000, 0x00002000, 0x00004000, 0x00008000, 0x00010000, 0x00020000,
0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000,
0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000,
0x40000000, 0x80000000
};
//
// The arrays with the character property info.
//
// _ucprop_offsets[n] is the index in _ucprop_ranges where the ranges of
// property n start, or -1 when the property has no ranges; the last entry
// holds the total number of range values.  _ucprop_ranges is searched by
// uclookup() as consecutive (first, last) pairs of code values.
//
private static short[] _ucprop_offsets = null;
private static int[] _ucprop_ranges = null;
//
// Masks for the FIRST mask argument of ucisprop(): bit i selects property
// number i (0 through 31).
//
public static final int UC_MN = 0x00000001;
public static final int UC_MC = 0x00000002;
public static final int UC_ME = 0x00000004;
public static final int UC_ND = 0x00000008;
public static final int UC_NL = 0x00000010;
public static final int UC_NO = 0x00000020;
public static final int UC_ZS = 0x00000040;
public static final int UC_ZL = 0x00000080;
public static final int UC_ZP = 0x00000100;
public static final int UC_CC = 0x00000200;
public static final int UC_CF = 0x00000400;
public static final int UC_OS = 0x00000800;
public static final int UC_CO = 0x00001000;
public static final int UC_CN = 0x00002000;
public static final int UC_LU = 0x00004000;
public static final int UC_LL = 0x00008000;
public static final int UC_LT = 0x00010000;
public static final int UC_LM = 0x00020000;
public static final int UC_LO = 0x00040000;
public static final int UC_PC = 0x00080000;
public static final int UC_PD = 0x00100000;
public static final int UC_PS = 0x00200000;
public static final int UC_PE = 0x00400000;
public static final int UC_PO = 0x00800000;
public static final int UC_SM = 0x01000000;
public static final int UC_SC = 0x02000000;
public static final int UC_SK = 0x04000000;
public static final int UC_SO = 0x08000000;
public static final int UC_L = 0x10000000;
public static final int UC_R = 0x20000000;
public static final int UC_EN = 0x40000000;
public static final int UC_ES = 0x80000000;
//
// Masks for the SECOND mask argument of ucisprop(): bit i selects property
// number 32 + i.  The bit values restart at 1, so these constants share
// values with the first group and must never be passed as the first mask.
//
public static final int UC_ET = 0x00000001;
public static final int UC_AN = 0x00000002;
public static final int UC_CS = 0x00000004;
public static final int UC_B = 0x00000008;
public static final int UC_S = 0x00000010;
public static final int UC_WS = 0x00000020;
public static final int UC_ON = 0x00000040;
public static final int UC_CM = 0x00000080;
public static final int UC_NB = 0x00000100;
public static final int UC_SY = 0x00000200;
public static final int UC_HD = 0x00000400;
public static final int UC_QM = 0x00000800;
public static final int UC_MR = 0x00001000;
public static final int UC_SS = 0x00002000;
public static final int UC_CP = 0x00004000;
public static final int UC_PI = 0x00008000;
public static final int UC_PF = 0x00010000;
//
// Load the character property tables from "where" into _ucprop_offsets and
// _ucprop_ranges.  Returns true when the tables are available (including
// when they were loaded by an earlier call) and false when the URL cannot
// be opened or read.
//
private static boolean _ucprop_load(URL where) {
    //
    // A non-null offsets array means the file was loaded earlier.
    //
    if (_ucprop_offsets != null)
        return true;

    InputStream stream;
    try {
        stream = where.openStream();
    } catch (IOException openFailure) {
        return false;
    }

    boolean loaded = load_file(stream);
    try {
        stream.close();
    } catch (IOException ignored) {}
    if (!loaded)
        return false;

    //
    // The header count is followed by a four byte size that is not needed.
    //
    int count = getShort();

    //
    // The offsets take (count + 1) shorts, padded to a multiple of four
    // bytes.  Add the eight header bytes in front of them to get the
    // position at which the ranges begin.
    //
    int rangeStart = (count + 1) << 1;
    if ((rangeStart & 3) != 0)
        rangeStart += 4 - (rangeStart & 3);
    rangeStart += 8;

    _ucprop_offsets = new short[count + 1];
    buffpos += 4;
    for (int k = 0; k <= count; k++)
        _ucprop_offsets[k] = getShort();

    //
    // The final offset is the number of range values to read.
    //
    int total = _ucprop_offsets[count];
    _ucprop_ranges = new int[total];
    buffpos = rangeStart;
    for (int k = 0; k < total; k++)
        _ucprop_ranges[k] = getInt();
    return true;
}
//
// Drop the property tables so that the next _ucprop_load() reads them again.
//
private static void _ucprop_unload() {
    _ucprop_ranges = null;
    _ucprop_offsets = null;
}
//
// Report whether "code" lies inside one of the ranges recorded for property
// number "n".  The ranges are (first, last) pairs and are located with a
// binary search.
//
private static boolean uclookup(int code, int n) {
    int lo = _ucprop_offsets[n];
    //
    // An offset of -1 means the property has no ranges at all.
    //
    if (lo == -1)
        return false;

    //
    // The ranges end where those of the next non-empty property begin.
    //
    int next = n + 1;
    while (next < _ucprop_offsets.length && _ucprop_offsets[next] == -1)
        next++;
    int hi = _ucprop_offsets[next] - 1;

    while (lo <= hi) {
        //
        // Clear the low bit so the probe lands on the start of a pair.
        //
        int mid = ((lo + hi) >> 1) & ~1;
        if (code > _ucprop_ranges[mid + 1])
            lo = mid + 2;
        else if (code < _ucprop_ranges[mid])
            hi = mid - 2;
        else
            return true;
    }
    return false;
}
//
// Report whether "code" has at least one of the requested properties.
//
// mask1 selects among properties 0-31 and mask2 among properties 32-63;
// bit i of a mask stands for property i (mask1) or 32 + i (mask2).
// Returns false when both masks are zero.  The property tables must have
// been loaded before this is called.
//
public static boolean ucisprop(int code, int mask1, int mask2) {
    if (mask1 == 0 && mask2 == 0)
        return false;

    //
    // The last slot of the offsets array is an end marker, not a property.
    // Handing it to uclookup() makes that routine index past the array, so
    // it is excluded here.
    //
    int nprops = _ucprop_offsets.length - 1;

    if (mask1 != 0) {
        int limit = (nprops < 32) ? nprops : 32;
        for (int i = 0; i < limit; i++) {
            if ((mask1 & masks32[i]) != 0 && uclookup(code, i))
                return true;
        }
    }
    if (mask2 != 0) {
        //
        // Stop at 64: beyond that (i & 31) would wrap around and test a
        // property the caller did not ask for.
        //
        int limit = (nprops < 64) ? nprops : 64;
        for (int i = 32; i < limit; i++) {
            if ((mask2 & masks32[i & 31]) != 0 && uclookup(code, i))
                return true;
        }
    }
    return false;
}
// True when code has any of the letter properties LU, LL, LM, LO or LT.
public static boolean ucisalpha(int code) {
return ucisprop(code, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT, 0);
}
// True when code has the ND property.
public static boolean ucisdigit(int code) {
return ucisprop(code, UC_ND, 0);
}
// True when code has any property tested by ucisalpha() or ucisdigit().
public static boolean ucisalnum(int code) {
return ucisprop(code, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT|UC_ND, 0);
}
// True when code has the CC or the CF property.
public static boolean uciscntrl(int code) {
return ucisprop(code, UC_CC|UC_CF, 0);
}
//
// True when code has the ZS property or the SS property.
//
// UC_SS belongs to the second group of property masks and therefore has to
// be passed as the second mask.  In the first mask its bit value is that of
// UC_CN, so the previous code tested UC_CN instead and never consulted SS.
//
public static boolean ucisspace(int code) {
    return ucisprop(code, UC_ZS, UC_SS);
}
// True when code has the ZS property.
public static boolean ucisblank(int code) {
return ucisprop(code, UC_ZS, 0);
}
// True when code has any of PD, PS, PE, PO (first mask) or PI, PF (second
// mask).  UC_PC is not part of this test.
public static boolean ucispunct(int code) {
return ucisprop(code, UC_PD|UC_PS|UC_PE|UC_PO, UC_PI|UC_PF);
}
//
// True when code has any mark, number, letter, punctuation or symbol
// property, i.e. everything ucisprint() accepts except UC_ZS.
//
public static boolean ucisgraph(int code) {
    final int marks = UC_MN|UC_MC|UC_ME;
    final int numbers = UC_ND|UC_NL|UC_NO;
    final int letters = UC_LU|UC_LL|UC_LT|UC_LM|UC_LO;
    final int punctuation = UC_PC|UC_PD|UC_PS|UC_PE|UC_PO;
    final int symbols = UC_SM|UC_SC|UC_SK|UC_SO;

    return ucisprop(code, marks|numbers|letters|punctuation|symbols,
                    UC_PI|UC_PF);
}
//
// True when code has any mark, number, letter, punctuation or symbol
// property, or the ZS property.
//
public static boolean ucisprint(int code) {
    final int marks = UC_MN|UC_MC|UC_ME;
    final int numbers = UC_ND|UC_NL|UC_NO;
    final int letters = UC_LU|UC_LL|UC_LT|UC_LM|UC_LO;
    final int punctuation = UC_PC|UC_PD|UC_PS|UC_PE|UC_PO;
    final int symbols = UC_SM|UC_SC|UC_SK|UC_SO;

    return ucisprop(code,
                    marks|numbers|letters|punctuation|symbols|UC_ZS,
                    UC_PI|UC_PF);
}
// True when code has the LU property.
public static boolean ucisupper(int code) {
return ucisprop(code, UC_LU, 0);
}
// True when code has the LL property.
public static boolean ucislower(int code) {
return ucisprop(code, UC_LL, 0);
}
// True when code has the LT property.
public static boolean ucistitle(int code) {
return ucisprop(code, UC_LT, 0);
}
// True when code has the HD property (second mask); same test as ucishex().
public static boolean ucisxdigit(int code) {
return ucisprop(code, 0, UC_HD);
}
// True when code has the CC property.
public static boolean ucisisocntrl(int code) {
return ucisprop(code, UC_CC, 0);
}
// True when code has the CF property.
public static boolean ucisfmtcntrl(int code) {
return ucisprop(code, UC_CF, 0);
}
// True when code has any of the SM, SC, SO or SK properties.
public static boolean ucissymbol(int code) {
return ucisprop(code, UC_SM|UC_SC|UC_SO|UC_SK, 0);
}
// True when code has any of the ND, NO or NL properties.
public static boolean ucisnumber(int code) {
return ucisprop(code, UC_ND|UC_NO|UC_NL, 0);
}
// True when code has the MN property; same test as ucisnsmark().
public static boolean ucisnonspacing(int code) {
return ucisprop(code, UC_MN, 0);
}
// True when code has the PS property.
public static boolean ucisopenpunct(int code) {
return ucisprop(code, UC_PS, 0);
}
// True when code has the PE property.
public static boolean ucisclosepunct(int code) {
return ucisprop(code, UC_PE, 0);
}
// True when code has the PI property (second mask).
public static boolean ucisinitialpunct(int code) {
return ucisprop(code, 0, UC_PI);
}
// True when code has the PF property (second mask).
public static boolean ucisfinalpunct(int code) {
return ucisprop(code, 0, UC_PF);
}
// True when code has the CM property (second mask).
public static boolean uciscomposite(int code) {
return ucisprop(code, 0, UC_CM);
}
// True when code has the HD property (second mask); same test as
// ucisxdigit().
public static boolean ucishex(int code) {
return ucisprop(code, 0, UC_HD);
}
// True when code has the QM property (second mask).
public static boolean ucisquote(int code) {
return ucisprop(code, 0, UC_QM);
}
// True when code has the SY property (second mask).
public static boolean ucissymmetric(int code) {
return ucisprop(code, 0, UC_SY);
}
// True when code has the MR property (second mask).
public static boolean ucismirroring(int code) {
return ucisprop(code, 0, UC_MR);
}
// True when code has the NB property (second mask).
public static boolean ucisnonbreaking(int code) {
return ucisprop(code, 0, UC_NB);
}
public static boolean ucisrtl(int code) {
return ucisprop(code, UC_R, 0);
}
public static boolean ucisltr(int code) {
return ucisprop(code, UC_L, 0);
}
public static boolean ucisstrong(int code) {
return ucisprop(code, UC_L|UC_R, 0);
}
public static boolean ucisweak(int code) {
return ucisprop(code, UC_EN|UC_ES, UC_ET|UC_AN|UC_CS);
}
public static boolean ucisneutral(int code) {
return ucisprop(code, 0, UC_B|UC_S|UC_WS|UC_ON);
}
public static boolean ucisseparator(int code) {
return ucisprop(code, 0, UC_B|UC_S);
}
public static boolean ucismark(int code) {
return ucisprop(code, UC_MN|UC_MC|UC_ME, 0);
}
public static boolean ucismodif(int code) {
return ucisprop(code, UC_LM, 0);
}
public static boolean ucisletnum(int code) {
return ucisprop(code, UC_NL, 0);
}
public static boolean ucisconnect(int code) {
return ucisprop(code, UC_PC, 0);
}
public static boolean ucisdash(int code) {
return ucisprop(code, UC_PD, 0);
}
public static boolean ucismath(int code) {
return ucisprop(code, UC_SM, 0);
}
public static boolean uciscurrency(int code) {
return ucisprop(code, UC_SC, 0);
}
public static boolean ucismodifsymbol(int code) {
return ucisprop(code, UC_SK, 0);
}
public static boolean ucisnsmark(int code) {
return ucisprop(code, UC_MN, 0);
}
public static boolean ucisspmark(int code) {
return ucisprop(code, UC_MC, 0);
}
public static boolean ucisenclosing(int code) {
return ucisprop(code, UC_ME, 0);
}
public static boolean ucisprivate(int code) {
return ucisprop(code, UC_CO, 0);
}
public static boolean ucissurrogate(int code) {
return ucisprop(code, UC_OS, 0);
}
public static boolean ucislsep(int code) {
return ucisprop(code, UC_ZL, 0);
}
public static boolean ucispsep(int code) {
return ucisprop(code, UC_ZP, 0);
}
public static boolean ucisidentstart(int code) {
return ucisprop(code, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL, 0);
}
//
// Delegates to ucisprop() with the five flags used by ucisidentstart()
// plus UC_MN, UC_MC, UC_ND, UC_PC and UC_CF, all in the first mask argument.
// NOTE(review): judging by the name this selects characters allowed inside
// an identifier -- confirm the intended identifier rules.
//
public static boolean ucisidentpart(int code) {
return ucisprop(code, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL|
UC_MN|UC_MC|UC_ND|UC_PC|UC_CF, 0);
}
//
// Defined-character test: delegates to ucisprop() with UC_CP in the second
// mask argument.  ucisundefined() is the negation of the same query.
//
public static boolean ucisdefined(int code) {
return ucisprop(code, 0, UC_CP);
}
//
// Negation of the UC_CP query: true exactly when ucisprop() does not report
// the code as a defined character.
//
public static boolean ucisundefined(int code) {
    return !ucisprop(code, 0, UC_CP);
}
//
// Han ideograph test by code range only: true for 0x4e00-0x9fff and for
// 0xf900-0xfaff (both ends inclusive).  No data file is consulted.
//
public static boolean ucishan(int code) {
    if (code >= 0x4e00 && code <= 0x9fff)
        return true;
    return code >= 0xf900 && code <= 0xfaff;
}
//
// Pre-composed Hangul syllable test by code range only.
//
// There are 19 * 21 * 28 = 11172 pre-composed syllables starting at 0xac00,
// so the last one is 0xd7a3.  Codes from 0xd7a4 up to 0xd7ff are not
// syllables and cannot be decomposed arithmetically, so they are rejected.
//
public static boolean ucishangul(int code) {
    return 0xac00 <= code && code <= 0xd7a3;
}
/**********************************************************************
*
* Case mapping section.
*
**********************************************************************/
//
// _uccase_len holds the two section sizes read by _uccase_load(); each is
// stored already multiplied by 3 so it can be used directly as an offset
// into _uccase_map.  _uccase_map holds the mapping data in groups of three
// ints and stays null until the case data has been loaded.
//
private static int[] _uccase_len = {0, 0};
private static int[] _uccase_map = null;
//
// Load the case mapping data from the given URL into _uccase_len and
// _uccase_map.  Returns true when the data is available (including when it
// was already loaded) and false when the stream cannot be opened or
// load_file() reports failure.
//
private static boolean _uccase_load(URL where) {
    //
    // A non-null map means an earlier call already loaded the file.
    //
    if (_uccase_map != null)
        return true;

    InputStream stream;
    try {
        stream = where.openStream();
    } catch (IOException openFailed) {
        return false;
    }

    boolean loaded = load_file(stream);

    //
    // The stream is closed whether or not the load worked; a failure to
    // close is ignored.
    //
    try {
        stream.close();
    } catch (IOException closeFailed) {}

    if (!loaded)
        return false;

    //
    // Header: total node count, then the sizes of the first two sections.
    // All three are scaled by 3 because every node occupies three ints.
    //
    int total = getShort(2) * 3;
    _uccase_len[0] = getShort() * 3;
    _uccase_len[1] = getShort() * 3;

    _uccase_map = new int[total];
    for (int k = 0; k < total; k++)
        _uccase_map[k] = getInt();

    return true;
}
//
// Drop the case mapping data so that a later _uccase_load() reads the file
// again.
//
private static void _uccase_unload() {
    _uccase_len[1] = 0;
    _uccase_len[0] = 0;
    _uccase_map = null;
}
//
// Binary search of _uccase_map between the node offsets l and r (both
// inclusive, both expected to be multiples of 3).  Each node is three ints
// and the search key is the first of them; on a match the value `field'
// slots after the key is returned.  Returns -1 when no node matches.
//
// NOTE(review): the UCData API notes say the case functions return the code
// unchanged when there is no mapping, while this returns -1 -- confirm
// which behaviour callers of the Java version expect.
//
private static int _uccase_lookup(int code, int l, int r, int field) {
    int lo = l;
    int hi = r;
    while (lo <= hi) {
        //
        // Pull the midpoint back to the start of its three-int node.
        //
        int mid = (lo + hi) >> 1;
        mid -= (mid % 3);

        int key = _uccase_map[mid];
        if (code == key)
            return _uccase_map[mid + field];
        if (code > key)
            lo = mid + 3;
        else
            hi = mid - 3;
    }
    return -1;
}
//
// Upper case conversion.  A code that ucisupper() accepts is returned as
// is.  A code that ucislower() accepts is searched for in the second
// section of the case map; anything else is treated as title case and
// searched for in the last section.  The result is whatever
// _uccase_lookup() returns, which is -1 when the code is not in the
// section searched.
//
// NOTE(review): the data format notes list the lower case node layout as
// lower, upper, title, which would make slot 2 the title mapping rather
// than the upper mapping -- confirm against the generated case.dat.
//
public static int uctoupper(int code) {
    if (ucisupper(code))
        return code;

    int first, last, slot;
    if (ucislower(code)) {
        //
        // Lower case: the section that follows the first one.
        //
        slot = 2;
        first = _uccase_len[0];
        last = (first + _uccase_len[1]) - 3;
    } else {
        //
        // Title case: everything after the first two sections.
        //
        slot = 1;
        first = _uccase_len[0] + _uccase_len[1];
        last = _uccase_map.length - 3;
    }
    return _uccase_lookup(code, first, last, slot);
}
//
// Lower case conversion.  A code that ucislower() accepts is returned as
// is.  A code that ucisupper() accepts is searched for in the first section
// of the case map; anything else is treated as title case and searched for
// in the last section.  The result is whatever _uccase_lookup() returns,
// which is -1 when the code is not in the section searched.
//
public static int uctolower(int code) {
    int l, r, field;
    if (ucislower(code))
        return code;
    if (ucisupper(code)) {
        //
        // Upper case.
        //
        field = 1;
        l = 0;
        r = _uccase_len[0] - 3;
    } else {
        //
        // Title case.  The upper bound is the offset of the last
        // three-int node, the same bound uctoupper() uses for this
        // section, so that r always sits on a node boundary.
        //
        field = 2;
        l = _uccase_len[0] + _uccase_len[1];
        r = _uccase_map.length - 3;
    }
    return _uccase_lookup(code, l, r, field);
}
//
// Title case conversion.  A code that ucistitle() accepts is returned as
// is.  A code that ucisupper() accepts is searched for in the first section
// of the case map; anything else is treated as lower case and searched for
// in the second section.  In both sections the value two slots after the
// key is returned, or -1 when _uccase_lookup() finds no node.
//
public static int uctotitle(int code) {
    if (ucistitle(code))
        return code;

    final int slot = 2;
    int first, last;
    if (ucisupper(code)) {
        //
        // Upper case: first section.
        //
        first = 0;
        last = _uccase_len[0] - 3;
    } else {
        //
        // Lower case: second section.
        //
        first = _uccase_len[0];
        last = (first + _uccase_len[1]) - 3;
    }
    return _uccase_lookup(code, first, last, slot);
}
/**********************************************************************
*
* Character decomposition section.
*
**********************************************************************/
//
// _ucdcmp_node_count is the number of ints taken up by the code+offset
// pairs at the front of _ucdcmp_data (_ucdcmp_load() stores the node count
// from the file shifted left by one).  _ucdcmp_data holds those pairs
// followed by the decomposition codes; it stays null until loaded.
//
static int _ucdcmp_node_count = 0;
static int[] _ucdcmp_data = null;
//
// Load the decomposition data from the given URL into _ucdcmp_node_count
// and _ucdcmp_data.  Returns true when the data is available (including
// when it was already loaded) and false when the stream cannot be opened or
// load_file() reports failure.
//
private static boolean _ucdcmp_load(URL where) {
    //
    // A non-null array means an earlier call already loaded the file.
    //
    if (_ucdcmp_data != null)
        return true;

    InputStream stream;
    try {
        stream = where.openStream();
    } catch (IOException openFailed) {
        return false;
    }

    boolean loaded = load_file(stream);
    try {
        stream.close();
    } catch (IOException closeFailed) {}

    if (!loaded)
        return false;

    //
    // The node count says how many of the _ucdcmp_data elements belong to
    // the code+offset pairs (two ints per node); the rest of the array is
    // decomposition data.  The byte count from the file is turned into a
    // count of 4-byte ints.
    //
    _ucdcmp_node_count = getShort() << 1;
    int words = getInt() >> 2;

    _ucdcmp_data = new int[words];
    for (int k = 0; k < words; k++)
        _ucdcmp_data[k] = getInt();

    return true;
}
//
// Drop the decomposition data so that a later _ucdcmp_load() reads the file
// again.
//
private static void _ucdcmp_unload() {
    _ucdcmp_data = null;
    _ucdcmp_node_count = 0;
}
//
// Look up the decomposition of a character.
//
// Returns a new array holding the decomposition codes, or null when the
// code has no entry in the decomposition table or when the decomposition
// data has not been loaded.
//
// Layout of _ucdcmp_data, as used below:
//   [0 .. _ucdcmp_node_count - 1]  code+offset pairs, ordered by code
//   [_ucdcmp_node_count]           sentinel: end offset of the last entry
//   [_ucdcmp_node_count + 1 ..]    the decomposition codes themselves
//
public static int[] ucdecomp(int code) {
    if (_ucdcmp_data == null)
        return null;

    //
    // The search must stay inside the pair area, so the upper bound is the
    // index of the last pair and not a value read from the table.
    //
    int l = 0;
    int r = _ucdcmp_node_count - 2;
    while (l <= r) {
        //
        // Determine a "mid" point and adjust to make sure the mid point
        // is at the beginning of a code+offset pair.
        //
        int m = (l + r) >> 1;
        m -= (m & 1);
        if (code > _ucdcmp_data[m])
            l = m + 2;
        else if (code < _ucdcmp_data[m])
            r = m - 2;
        else {
            //
            // The entry ends where the next one starts.  The next pair's
            // offset is at m + 3, except for the last pair, which is
            // followed directly by the single sentinel at m + 2.
            //
            int start = _ucdcmp_data[m + 1];
            int end = (m + 2 == _ucdcmp_node_count) ?
                _ucdcmp_data[m + 2] : _ucdcmp_data[m + 3];
            int[] out = new int[end - start];
            System.arraycopy(_ucdcmp_data, _ucdcmp_node_count + 1 + start,
                             out, 0, out.length);
            return out;
        }
    }
    return null;
}
//
// Arithmetic decomposition of a pre-composed Hangul syllable.
//
// Returns null when ucishangul() rejects the code.  Otherwise returns the
// leading consonant and vowel, followed by the trailing consonant when the
// syllable has one, so the result has either two or three elements.
//
public static int[] ucdecomp_hangul(int code) {
    if (!ucishangul(code))
        return null;

    //
    // Syllables are laid out as 588 (= 21 * 28) per leading consonant and
    // 28 per vowel within that.
    //
    code -= 0xac00;
    int lead = 0x1100 + (code / 588);
    int vowel = 0x1161 + ((code % 588) / 28);
    int trail = 0x11a7 + (code % 28);

    //
    // A trailing value of exactly 0x11a7 means "no trailing consonant".
    // It is the trailing slot that has to be tested here: testing the
    // leading slot is always true and overruns the two-element result.
    //
    if (trail == 0x11a7)
        return new int[] {lead, vowel};
    return new int[] {lead, vowel, trail};
}
/**********************************************************************
*
* Combining class section.
*
**********************************************************************/
//
// Combining class table in groups of three ints: range start, range end,
// combining class (see uccombining_class()).  Null until loaded.
//
private static int[] _uccmbcl_nodes = null;
//
// Load the combining class data from the given URL into _uccmbcl_nodes.
// Returns true when the data is available (including when it was already
// loaded) and false when the stream cannot be opened or load_file()
// reports failure.
//
private static boolean _uccmbcl_load(URL where) {
    //
    // A non-null array means an earlier call already loaded the file.
    //
    if (_uccmbcl_nodes != null)
        return true;

    InputStream stream;
    try {
        stream = where.openStream();
    } catch (IOException openFailed) {
        return false;
    }

    boolean loaded = load_file(stream);
    try {
        stream.close();
    } catch (IOException closeFailed) {}

    if (!loaded)
        return false;

    //
    // Three ints per node.  The 4 bytes that follow the node count are
    // skipped without being read.
    //
    int total = getShort() * 3;
    buffpos += 4;

    _uccmbcl_nodes = new int[total];
    for (int k = 0; k < total; k++)
        _uccmbcl_nodes[k] = getInt();

    return true;
}
//
// Drop the combining class data so that a later _uccmbcl_load() reads the
// file again.
//
private static void _uccmbcl_unload() {
_uccmbcl_nodes = null;
}
//
// Return the combining class of a character, or 0 when the code does not
// fall inside any range in _uccmbcl_nodes.  The table is searched as
// groups of three ints: range start, range end, combining class.
//
public static int uccombining_class(int code) {
    int lo = 0;
    int hi = _uccmbcl_nodes.length - 3;
    while (lo <= hi) {
        //
        // Pull the midpoint back to the start of its three-int node.
        //
        int mid = (lo + hi) >> 1;
        mid -= (mid % 3);

        if (code > _uccmbcl_nodes[mid + 1]) {
            lo = mid + 3;
            continue;
        }
        if (code < _uccmbcl_nodes[mid]) {
            hi = mid - 3;
            continue;
        }
        //
        // Neither above the range end nor below the range start, so the
        // code lies inside this range.
        //
        return _uccmbcl_nodes[mid + 2];
    }
    return 0;
}
/**********************************************************************
*
* Number section.
*
**********************************************************************/
//
// _ucnum_nodes holds pairs of ints: a character code followed by an index
// into _ucnum_vals.  _ucnum_vals holds pairs of shorts, read by the lookup
// functions as numerator and denominator.  Both stay null until loaded.
//
private static short[] _ucnum_vals;
private static int[] _ucnum_nodes;
//
// Load the number data from the given URL into _ucnum_nodes and
// _ucnum_vals.  Returns true when the data is available (including when it
// was already loaded) and false when the stream cannot be opened or
// load_file() reports failure.
//
private static boolean _ucnumb_load(URL where) {
    //
    // A non-null array means an earlier call already loaded the file.
    //
    if (_ucnum_nodes != null)
        return true;

    InputStream stream;
    try {
        stream = where.openStream();
    } catch (IOException openFailed) {
        return false;
    }

    boolean loaded = load_file(stream);
    try {
        stream.close();
    } catch (IOException closeFailed) {}

    if (!loaded)
        return false;

    //
    // The node array takes 4 bytes per element; whatever is left of the
    // byte count belongs to the 2-byte values.
    //
    int nodeCount = getShort();
    int valueCount = (getInt() - (nodeCount << 2)) >> 1;

    _ucnum_nodes = new int[nodeCount];
    for (int k = 0; k < nodeCount; k++)
        _ucnum_nodes[k] = getInt();

    _ucnum_vals = new short[valueCount];
    for (int k = 0; k < valueCount; k++)
        _ucnum_vals[k] = getShort();

    return true;
}
//
// Drop the number data so that a later _ucnumb_load() reads the file again.
//
private static void _ucnumb_unload() {
    _ucnum_nodes = null;
    _ucnum_vals = null;
}
//
// Look up the numeric value of a character.
//
// `result' must have room for two elements.  Both are set to 0 first; when
// the code is found they receive the numerator (result[0]) and denominator
// (result[1]) and true is returned.  Returns false when the code is not in
// the number table.
//
public static boolean ucnumber_lookup(int code, int[] result) {
    result[1] = 0;
    result[0] = 0;

    int lo = 0;
    int hi = _ucnum_nodes.length - 1;
    while (lo <= hi) {
        //
        // Pull the midpoint back to the start of its code+index pair.
        //
        int mid = (lo + hi) >> 1;
        mid -= (mid & 1);

        int key = _ucnum_nodes[mid];
        if (code > key) {
            lo = mid + 2;
        } else if (code < key) {
            hi = mid - 2;
        } else {
            int at = _ucnum_nodes[mid + 1];
            result[0] = _ucnum_vals[at];
            result[1] = _ucnum_vals[at + 1];
            return true;
        }
    }
    return false;
}
//
// Look up the digit value of a character.
//
// result[0] is set to -1 first.  When the code is found and its two stored
// values are equal, result[0] receives that value and true is returned.
// Returns false when the code is not in the number table or its two values
// differ.
//
public static boolean ucdigit_lookup(int code, int[] result) {
    result[0] = -1;

    int lo = 0;
    int hi = _ucnum_nodes.length - 1;
    while (lo <= hi) {
        //
        // Pull the midpoint back to the start of its code+index pair.
        //
        int mid = (lo + hi) >> 1;
        mid -= (mid & 1);

        int key = _ucnum_nodes[mid];
        if (code > key) {
            lo = mid + 2;
        } else if (code < key) {
            hi = mid - 2;
        } else {
            int at = _ucnum_nodes[mid + 1];
            short numerator = _ucnum_vals[at];
            short denominator = _ucnum_vals[at + 1];
            if (numerator != denominator)
                return false;
            result[0] = numerator;
            return true;
        }
    }
    return false;
}
/**********************************************************************
*
* File loading and unloading routines.
*
**********************************************************************/
//
// Masks that combine to load and unload files using a base URL.
//
// One bit per data file; UCDATA_ALL is the OR of the five bits above it.
public final static int UCDATA_CASE = 0x01;
public final static int UCDATA_CTYPE = 0x02;
public final static int UCDATA_DECOMP = 0x04;
public final static int UCDATA_CMBCL = 0x08;
public final static int UCDATA_NUM = 0x10;
public final static int UCDATA_ALL = 0x1f;
//
// Load the data files selected by `masks' (an OR of the UCDATA_* bits) from
// below the base URL.  The file names are fixed: ctype.dat, case.dat,
// decomp.dat, cmbcl.dat and num.dat, tried in that order.  A file whose URL
// cannot be formed is skipped, and the results of the individual loaders
// are not reported to the caller.
//
public static void ucdata_load(URL base, int masks) {
    //
    // The file names are appended directly, so the base must end in '/'.
    //
    String prefix = base.toString();
    if (prefix.lastIndexOf('/') != prefix.length() - 1)
        prefix = prefix + "/";

    if ((masks & UCDATA_CTYPE) != 0) {
        try {
            URL target = new URL(prefix + "ctype.dat");
            _ucprop_load(target);
        } catch (MalformedURLException mue) {}
    }

    if ((masks & UCDATA_CASE) != 0) {
        try {
            URL target = new URL(prefix + "case.dat");
            _uccase_load(target);
        } catch (MalformedURLException mue) {}
    }

    if ((masks & UCDATA_DECOMP) != 0) {
        try {
            URL target = new URL(prefix + "decomp.dat");
            _ucdcmp_load(target);
        } catch (MalformedURLException mue) {}
    }

    if ((masks & UCDATA_CMBCL) != 0) {
        try {
            URL target = new URL(prefix + "cmbcl.dat");
            _uccmbcl_load(target);
        } catch (MalformedURLException mue) {}
    }

    if ((masks & UCDATA_NUM) != 0) {
        try {
            URL target = new URL(prefix + "num.dat");
            _ucnumb_load(target);
        } catch (MalformedURLException mue) {}
    }
}
//
// Unload the data tables selected by `masks' (an OR of the UCDATA_* bits).
// Tables whose bit is not set are left alone.
//
public static void ucdata_unload(int masks) {
    if ((masks & UCDATA_CTYPE) != 0) {
        _ucprop_unload();
    }
    if ((masks & UCDATA_CASE) != 0) {
        _uccase_unload();
    }
    if ((masks & UCDATA_DECOMP) != 0) {
        _ucdcmp_unload();
    }
    if ((masks & UCDATA_CMBCL) != 0) {
        _uccmbcl_unload();
    }
    if ((masks & UCDATA_NUM) != 0) {
        _ucnumb_unload();
    }
}
}

View file

@ -0,0 +1,94 @@
/*
* $Id: UCDataTest.java,v 1.1 1999/08/23 16:14:08 mleisher Exp $
*
* Copyright 1999 Computing Research Labs, New Mexico State University
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
import java.io.*;
import java.net.*;
import UCData.*;
public class UCDataTest {
    //
    // Data directory used when no base URL is given on the command line.
    //
    private static final String DEFAULT_BASE =
        "file:/home/mleisher/unicode/textutils/ucdata";

    //
    // Print the value of a number character: as a fraction when numerator
    // and denominator differ, as a digit when they are equal.  Prints
    // nothing when the code is not in the number table.
    //
    private static void report_number(int code, String label) {
        int num[] = {0, 0};
        if (!UCData.ucnumber_lookup(code, num))
            return;
        if (num[0] != num[1])
            System.out.println(label + " is fraction " + num[0] + "/" + num[1]);
        else
            System.out.println(label + " is digit " + num[0]);
    }

    /**********************************************************************
     *
     * Main.
     *
     * Usage: UCDataTest [base-url]
     *
     * Loads every data file from the base URL and prints the result of a
     * few sample queries.  Without an argument the built-in default
     * location is used.
     *
     **********************************************************************/
    public static void main(String[] args) {
        String location = (args.length > 0) ? args[0] : DEFAULT_BASE;

        //
        // Without a usable URL nothing can be loaded, so report the problem
        // and stop instead of passing null on to ucdata_load().
        //
        URL url;
        try {
            url = new URL(location);
        } catch (MalformedURLException mue) {
            System.err.println("UCDataTest: malformed data URL: " + location);
            return;
        }

        UCData.ucdata_load(url, UCData.UCDATA_ALL);

        if (UCData.ucisalpha(0x1d5))
            System.out.println("0x1d5 is alpha");
        else
            System.out.println("0x1d5 is not alpha");

        int c;
        c = UCData.uctolower(0x1f1);
        System.out.println("0x1f1 lower is 0x"+Integer.toHexString(c));
        c = UCData.uctotitle(0x1f1);
        System.out.println("0x1f1 title is 0x"+Integer.toHexString(c));
        c = UCData.uctolower(0xff3a);
        System.out.println("0xff3a lower is 0x"+Integer.toHexString(c));
        c = UCData.uctotitle(0xff3a);
        System.out.println("0xff3a title is 0x"+Integer.toHexString(c));

        int[] decomp = UCData.ucdecomp(0x1d5);
        if (decomp != null) {
            System.out.print("0x1d5 decomposition :");
            for (int i = 0; i < decomp.length; i++)
                System.out.print("0x"+Integer.toHexString(decomp[i])+" ");
            System.out.println("");
        }

        int ccl = UCData.uccombining_class(0x41);
        System.out.println("0x41 combining class " + ccl);
        ccl = UCData.uccombining_class(0xfe23);
        System.out.println("0xfe23 combining class " + ccl);

        report_number(0x30, "0x30");
        report_number(0xbc, "0xbc");

        int num[] = {0,0};
        if (UCData.ucdigit_lookup(0x6f9, num))
            System.out.println("0x6f9 is digit " + num[0]);
        else
            System.out.println("0x6f9 is not a digit");
    }
}

View file

@ -0,0 +1,343 @@
#
# $Id: api.txt,v 1.2 1999/11/19 15:24:29 mleisher Exp $
#
The MUTT UCData API
-------------------
-----------------------------------------------------------------------------
Macros that combine to select data tables for ucdata_load(), ucdata_unload(),
and ucdata_reload().
#define UCDATA_CASE 0x01
#define UCDATA_CTYPE 0x02
#define UCDATA_DECOMP 0x04
#define UCDATA_CMBCL 0x08
#define UCDATA_NUM 0x10
#define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\
UCDATA_CMBCL|UCDATA_NUM)
-----------------------------------------------------------------------------
void ucdata_load(char *paths, int masks)
This function initializes the UCData library by locating the data files in
one of the colon-separated directories in the `paths' parameter. The data
files to be loaded are specified in the `masks' parameter as a bitwise
combination of the macros listed above.
This should be called before using any of the other functions.
NOTE: the ucdata_setup(char *paths) function is now a macro that expands
into this function at compile time.
-----------------------------------------------------------------------------
void ucdata_unload(int masks)
This function unloads the data tables specified in the `masks' parameter.
This function should be called when the application is done using the UCData
package.
NOTE: the ucdata_cleanup() function is now a macro that expands into this
function at compile time.
-----------------------------------------------------------------------------
void ucdata_reload(char *paths, int masks)
This function reloads the data files from one of the colon-separated
directories in the `paths' parameter. The data files to be reloaded are
specified in the `masks' parameter as a bitwise combination of the macros
listed above.
If the data files have already been loaded, they are unloaded before the
data files are loaded again.
-----------------------------------------------------------------------------
int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
This function determines if a character has a decomposition and returns the
decomposition information if it exists.
If a zero is returned, there is no decomposition. If a non-zero is
returned, then the `num' and `decomp' variables are filled in with the
appropriate values.
Example call:
unsigned long i, num, *decomp;
if (ucdecomp(0x1d5, &num, &decomp) != 0) {
for (i = 0; i < num; i++)
printf("0x%08lX,", decomp[i]);
putchar('\n');
}
-----------------------------------------------------------------------------
int ucdecomp_hangul(unsigned long code, unsigned long *num,
unsigned long decomp[])
This function determines if a Hangul syllable has a decomposition and
returns the decomposition information.
An array of at least size 3 should be passed to the function for the
decomposition of the syllable.
If a zero is returned, the character is not a Hangul syllable. If a
non-zero is returned, the `num' field will be 2 or 3 and the syllable will
be decomposed into the `decomp' array arithmetically.
Example call:
unsigned long i, num, decomp[3];
if (ucdecomp_hangul(0xb1ba, &num, decomp) != 0) {
for (i = 0; i < num; i++)
printf("0x%08lX,", decomp[i]);
putchar('\n');
}
-----------------------------------------------------------------------------
struct ucnumber {
int numerator;
int denominator;
};
int ucnumber_lookup(unsigned long code, struct ucnumber *num)
This function determines if the code is a number and fills in the `num'
field with the numerator and denominator. If the code happens to be a
single digit, the numerator and denominator fields will be the same.
If the function returns 0, the code is not a number. Any other return
value means the code is a number.
int ucdigit_lookup(unsigned long code, int *digit)
This function determines if the code is a digit and fills in the `digit'
field with the digit value.
If the function returns 0, the code is not a digit. Any other return
value means the code is a digit.
struct ucnumber ucgetnumber(unsigned long code)
This is a compatibility function with John Cowan's "uctype" package. It
uses ucnumber_lookup().
int ucgetdigit(unsigned long code)
This is a compatibility function with John Cowan's "uctype" package. It
uses ucdigit_lookup().
-----------------------------------------------------------------------------
unsigned long uctoupper(unsigned long code)
This function returns the code unchanged if it is already upper case or has
no upper case equivalent. Otherwise the upper case equivalent is returned.
-----------------------------------------------------------------------------
unsigned long uctolower(unsigned long code)
This function returns the code unchanged if it is already lower case or has
no lower case equivalent. Otherwise the lower case equivalent is returned.
-----------------------------------------------------------------------------
unsigned long uctotitle(unsigned long code)
This function returns the code unchanged if it is already title case or has
no title case equivalent. Otherwise the title case equivalent is returned.
-----------------------------------------------------------------------------
int ucisalpha(unsigned long code)
int ucisalnum(unsigned long code)
int ucisdigit(unsigned long code)
int uciscntrl(unsigned long code)
int ucisspace(unsigned long code)
int ucisblank(unsigned long code)
int ucispunct(unsigned long code)
int ucisgraph(unsigned long code)
int ucisprint(unsigned long code)
int ucisxdigit(unsigned long code)
int ucisupper(unsigned long code)
int ucislower(unsigned long code)
int ucistitle(unsigned long code)
These functions (actually macros) determine if a character has these
properties. These behave in a fashion very similar to the venerable ctype
package.
-----------------------------------------------------------------------------
int ucisisocntrl(unsigned long code)
Is the character a C0 control character (< 32) ?
int ucisfmtcntrl(unsigned long code)
Is the character a format control character?
int ucissymbol(unsigned long code)
Is the character a symbol?
int ucisnumber(unsigned long code)
Is the character a number or digit?
int ucisnonspacing(unsigned long code)
Is the character non-spacing?
int ucisopenpunct(unsigned long code)
Is the character an open/left punctuation (e.g. '[')?
int ucisclosepunct(unsigned long code)
Is the character a close/right punctuation (e.g. ']')?
int ucisinitialpunct(unsigned long code)
Is the character an initial punctuation (i.e. U+2018 LEFT SINGLE QUOTATION
MARK)
int ucisfinalpunct(unsigned long code)
Is the character a final punctuation (i.e. U+2019 RIGHT SINGLE QUOTATION
MARK)
int uciscomposite(unsigned long code)
Can the character be decomposed into a set of other characters?
int ucisquote(unsigned long code)
Is the character one of the many quotation marks?
int ucissymmetric(unsigned long code)
Is the character one that has an opposite form (i.e. <>)
int ucismirroring(unsigned long code)
Is the character mirroring (superset of symmetric)?
int ucisnonbreaking(unsigned long code)
Is the character non-breaking (i.e. non-breaking space)?
int ucisrtl(unsigned long code)
Does the character have strong right-to-left directionality (i.e. Arabic
letters)?
int ucisltr(unsigned long code)
Does the character have strong left-to-right directionality (i.e. Latin
letters)?
int ucisstrong(unsigned long code)
Does the character have strong directionality?
int ucisweak(unsigned long code)
Does the character have weak directionality (i.e. numbers)?
int ucisneutral(unsigned long code)
Does the character have neutral directionality (i.e. whitespace)?
int ucisseparator(unsigned long code)
Is the character a block or segment separator?
int ucislsep(unsigned long code)
Is the character a line separator?
int ucispsep(unsigned long code)
Is the character a paragraph separator?
int ucismark(unsigned long code)
Is the character a mark of some kind?
int ucisnsmark(unsigned long code)
Is the character a non-spacing mark?
int ucisspmark(unsigned long code)
Is the character a spacing mark?
int ucismodif(unsigned long code)
Is the character a modifier letter?
int ucismodifsymbol(unsigned long code)
Is the character a modifier symbol?
int ucisletnum(unsigned long code)
Is the character a number represented by a letter?
int ucisconnect(unsigned long code)
Is the character connecting punctuation?
int ucisdash(unsigned long code)
Is the character dash punctuation?
int ucismath(unsigned long code)
Is the character a math character?
int uciscurrency(unsigned long code)
Is the character a currency character?
int ucisenclosing(unsigned long code)
Is the character enclosing (i.e. enclosing box)?
int ucisprivate(unsigned long code)
Is the character from the Private Use Area?
int ucissurrogate(unsigned long code)
Is the character one of the surrogate codes?
int ucisdefined(unsigned long code)
Is the character defined (appeared in one of the data files)?
int ucisundefined(unsigned long code)
Is the character not defined (non-Unicode)?
int ucishan(unsigned long code)
Is the character a Han ideograph?
int ucishangul(unsigned long code)
Is the character a pre-composed Hangul syllable?

View file

@ -0,0 +1,84 @@
#
# $Id: bidiapi.txt,v 1.2 1999/11/19 15:24:29 mleisher Exp $
#
"Pretty Good Bidi Algorithm" API
The PGBA (Pretty Good Bidi Algorithm) is an effective alternative to the
Unicode BiDi algorithm. It currently provides only implicit reordering and
does not yet support explicit reordering codes that the Unicode BiDi algorithm
supports. In addition to reordering, the PGBA includes cursor movement
support for both visual and logical navigation.
-----------------------------------------------------------------------------
#define UCPGBA_LTR 0
#define UCPGBA_RTL 1
These macros appear in the `direction' field of the data structures.
#define UCPGBA_CURSOR_VISUAL 0
#define UCPGBA_CURSOR_LOGICAL 1
These macros are used to set the cursor movement for each reordered string.
-----------------------------------------------------------------------------
ucstring_t *ucstring_create(unsigned long *source, unsigned long start,
unsigned long end, int default_direction,
int cursor_motion)
This function will create a reordered string by using the implicit
directionality of the characters in the specified substring.
The `default_direction' parameter should be one of UCPGBA_LTR or UCPGBA_RTL
and is used only in cases where a string contains no characters with strong
directionality.
The `cursor_motion' parameter should be one of UCPGBA_CURSOR_VISUAL or
UCPGBA_CURSOR_LOGICAL, and is used to specify the initial cursor motion
behavior. This behavior can be switched at any time using
ucstring_set_cursor_motion().
-----------------------------------------------------------------------------
void ucstring_free(ucstring_t *string)
This function will deallocate the memory used by the string, including the
string itself.
-----------------------------------------------------------------------------
void ucstring_cursor_info(ucstring_t *string, int *direction,
unsigned long *position)
This function will return the text position of the internal cursor and the
directionality of the text at that position. The position returned is the
original text position of the character.
-----------------------------------------------------------------------------
int ucstring_set_cursor_motion(ucstring_t *string, int cursor_motion)
This function will change the cursor motion type and return the previous
cursor motion type.
-----------------------------------------------------------------------------
int ucstring_cursor_right(ucstring_t *string, int count)
This function will move the internal cursor to the right according to the
type of cursor motion set for the string.
If no cursor motion is performed, it returns 0. Otherwise it will return a
1.
-----------------------------------------------------------------------------
int ucstring_cursor_left(ucstring_t *string, int count)
This function will move the internal cursor to the left according to the
type of cursor motion set for the string.
If no cursor motion is performed, it returns 0. Otherwise it will return a
1.

View file

@ -0,0 +1,243 @@
#
# $Id: format.txt,v 1.1 1998/07/24 15:17:21 mleisher Exp $
#
CHARACTER DATA
==============
This package generates some data files that contain character properties useful
for text processing.
CHARACTER PROPERTIES
====================
The first data file is called "ctype.dat" and contains a compressed form of
the character properties found in the Unicode Character Database (UCDB).
Additional properties can be specified in limited UCDB format in another file
to avoid modifying the original UCDB.
The following is a property name and code table to be used with the character
data:
NAME CODE DESCRIPTION
---------------------
Mn 0 Mark, Non-Spacing
Mc 1 Mark, Spacing Combining
Me 2 Mark, Enclosing
Nd 3 Number, Decimal Digit
Nl 4 Number, Letter
No 5 Number, Other
Zs 6 Separator, Space
Zl 7 Separator, Line
Zp 8 Separator, Paragraph
Cc 9 Other, Control
Cf 10 Other, Format
Cs 11 Other, Surrogate
Co 12 Other, Private Use
Cn 13 Other, Not Assigned
Lu 14 Letter, Uppercase
Ll 15 Letter, Lowercase
Lt 16 Letter, Titlecase
Lm 17 Letter, Modifier
Lo 18 Letter, Other
Pc 19 Punctuation, Connector
Pd 20 Punctuation, Dash
Ps 21 Punctuation, Open
Pe 22 Punctuation, Close
Po 23 Punctuation, Other
Sm 24 Symbol, Math
Sc 25 Symbol, Currency
Sk 26 Symbol, Modifier
So 27 Symbol, Other
L 28 Left-To-Right
R 29 Right-To-Left
EN 30 European Number
ES 31 European Number Separator
ET 32 European Number Terminator
AN 33 Arabic Number
CS 34 Common Number Separator
B 35 Block Separator
S 36 Segment Separator
WS 37 Whitespace
ON 38 Other Neutrals
Pi 47 Punctuation, Initial
Pf 48 Punctuation, Final
#
# Implementation specific properties.
#
Cm 39 Composite
Nb 40 Non-Breaking
Sy 41 Symmetric (characters which are part of open/close pairs)
Hd 42 Hex Digit
Qm 43 Quote Mark
Mr 44 Mirroring
Ss 45 Space, Other (controls viewed as spaces in ctype isspace())
Cp 46 Defined character
The actual binary data is formatted as follows:
Assumptions: unsigned short is at least 16-bits in size and unsigned long
is at least 32-bits in size.
unsigned short ByteOrderMark
unsigned short OffsetArraySize
unsigned long Bytes
unsigned short Offsets[OffsetArraySize + 1]
unsigned long Ranges[N], N = value of Offsets[OffsetArraySize]
The Bytes field provides the total byte count used for the Offsets[] and
Ranges[] arrays. The Offsets[] array is aligned on a 4-byte boundary and
there is always one extra node on the end to hold the final index of the
Ranges[] array. The Ranges[] array contains pairs of 4-byte values
representing a range of Unicode characters. The pairs are arranged in
increasing order by the first character code in the range.
Determining if a particular character is in the property list requires a
simple binary search to determine if a character is in any of the ranges
for the property.
If the ByteOrderMark is equal to 0xFFFE, then the data was generated on a
machine with a different endian order and the values must be byte-swapped.
To swap a 16-bit value:
c = (c >> 8) | ((c & 0xff) << 8)
To swap a 32-bit value:
c = ((c & 0xff) << 24) | (((c >> 8) & 0xff) << 16) |
(((c >> 16) & 0xff) << 8) | (c >> 24)
CASE MAPPINGS
=============
The next data file is called "case.dat" and contains three case mapping tables
in the following order: upper, lower, and title case. Each table is in
increasing order by character code and each mapping contains 3 unsigned longs
which represent the possible mappings.
The format for the binary form of these tables is:
unsigned short ByteOrderMark
unsigned short NumMappingNodes, count of all mapping nodes
unsigned short CaseTableSizes[2], upper and lower mapping node counts
unsigned long CaseTables[NumMappingNodes]
The starting indexes of the case tables are calculated as follows:
UpperIndex = 0;
LowerIndex = CaseTableSizes[0] * 3;
TitleIndex = LowerIndex + CaseTableSizes[1] * 3;
The order of the fields for the three tables are:
Upper case
----------
unsigned long upper;
unsigned long lower;
unsigned long title;
Lower case
----------
unsigned long lower;
unsigned long upper;
unsigned long title;
Title case
----------
unsigned long title;
unsigned long upper;
unsigned long lower;
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
same way as described in the CHARACTER PROPERTIES section.
Because the tables are in increasing order by character code, locating a
mapping requires a simple binary search on one of the 3 codes that make up
each node.
It is important to note that there can only be 65536 mapping nodes which
divided into 3 portions allows 21845 nodes for each case mapping table. The
distribution of mappings may be more or less than 21845 per table, but only
65536 are allowed.
DECOMPOSITIONS
==============
The next data file is called "decomp.dat" and contains the decomposition data
for all characters whose decompositions contain more than one character and
are *not* compatibility decompositions. Compatibility decompositions are
signaled in the UCDB format by the use of the <compat> tag in the
decomposition field. Each list of character codes represents a full
decomposition of a composite character. The nodes are arranged in increasing
order by character code.
The format for the binary form of this table is:
unsigned short ByteOrderMark
unsigned short NumDecompNodes, count of all decomposition nodes
unsigned long Bytes
unsigned long DecompNodes[(NumDecompNodes * 2) + 1]
unsigned long Decomp[N], N = sum of all counts in DecompNodes[]
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
same way as described in the CHARACTER PROPERTIES section.
The DecompNodes[] array consists of pairs of unsigned longs, the first of
which is the character code and the second is the initial index of the list
of character codes representing the decomposition.
Locating the decomposition of a composite character requires a binary search
for a character code in the DecompNodes[] array and using its index to
locate the start of the decomposition. The length of the decomposition list
is the index in the following element in DecompNodes[] minus the current
index.
COMBINING CLASSES
=================
The fourth data file is called "cmbcl.dat" and contains the characters with
non-zero combining classes.
The format for the binary form of this table is:
unsigned short ByteOrderMark
unsigned short NumCCLNodes
unsigned long Bytes
unsigned long CCLNodes[NumCCLNodes * 3]
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
same way as described in the CHARACTER PROPERTIES section.
The CCLNodes[] array consists of groups of three unsigned longs. The first
and second are the beginning and ending of a range and the third is the
combining class of that range.
If a character is not found in this table, then the combining class is
assumed to be 0.
It is important to note that only 65536 distinct ranges plus combining class
can be specified because the NumCCLNodes is usually a 16-bit number.
NUMBER TABLE
============
The final data file is called "num.dat" and contains the characters that have
a numeric value associated with them.
The format for the binary form of the table is:
unsigned short ByteOrderMark
unsigned short NumNumberNodes
unsigned long Bytes
unsigned long NumberNodes[NumNumberNodes]
unsigned short ValueNodes[(Bytes - (NumNumberNodes * sizeof(unsigned long)))
/ sizeof(short)]
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
same way as described in the CHARACTER PROPERTIES section.
The NumberNodes array contains pairs of values, the first of which is the
character code and the second an index into the ValueNodes array. The
ValueNodes array contains pairs of integers which represent the numerator
and denominator of the numeric value of the character. If the character
happens to map to an integer, both the values in ValueNodes will be the
same.

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,306 @@
/*
* Copyright 1999 Computing Research Labs, New Mexico State University
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _h_ucdata
#define _h_ucdata
/*
* $Id: ucdata.h,v 1.5 1999/11/19 15:24:29 mleisher Exp $
*/
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Prototype wrapper. __(args) expands to the parenthesised parameter list
 * when the compiler defines __STDC__ and to an empty `()' otherwise, so the
 * declarations in this header can be read by both ANSI and pre-ANSI
 * compilers. Any earlier definition of `__' is discarded first.
 */
#undef __
#ifdef __STDC__
#define __(x) x
#else
#define __(x) ()
#endif
/*
 * Version string of this UCData package.
 */
#define UCDATA_VERSION "2.3"
/**************************************************************************
 *
 * Masks and macros for character properties.
 *
 * Every property is a single bit. The properties are split across two mask
 * words: the first word uses every bit from 0x00000001 up to 0x80000000,
 * and the second word starts again at 0x00000001. Because the two words
 * reuse the same numeric values, a UC_* constant is only meaningful when it
 * is passed in the ucisprop() parameter named in the comment above it.
 *
 **************************************************************************/
/*
 * Values that can appear in the `mask1' parameter of the ucisprop()
 * function.
 */
#define UC_MN 0x00000001 /* Mark, Non-Spacing */
#define UC_MC 0x00000002 /* Mark, Spacing Combining */
#define UC_ME 0x00000004 /* Mark, Enclosing */
#define UC_ND 0x00000008 /* Number, Decimal Digit */
#define UC_NL 0x00000010 /* Number, Letter */
#define UC_NO 0x00000020 /* Number, Other */
#define UC_ZS 0x00000040 /* Separator, Space */
#define UC_ZL 0x00000080 /* Separator, Line */
#define UC_ZP 0x00000100 /* Separator, Paragraph */
#define UC_CC 0x00000200 /* Other, Control */
#define UC_CF 0x00000400 /* Other, Format */
#define UC_OS 0x00000800 /* Other, Surrogate */
#define UC_CO 0x00001000 /* Other, Private Use */
#define UC_CN 0x00002000 /* Other, Not Assigned */
#define UC_LU 0x00004000 /* Letter, Uppercase */
#define UC_LL 0x00008000 /* Letter, Lowercase */
#define UC_LT 0x00010000 /* Letter, Titlecase */
#define UC_LM 0x00020000 /* Letter, Modifier */
#define UC_LO 0x00040000 /* Letter, Other */
#define UC_PC 0x00080000 /* Punctuation, Connector */
#define UC_PD 0x00100000 /* Punctuation, Dash */
#define UC_PS 0x00200000 /* Punctuation, Open */
#define UC_PE 0x00400000 /* Punctuation, Close */
#define UC_PO 0x00800000 /* Punctuation, Other */
#define UC_SM 0x01000000 /* Symbol, Math */
#define UC_SC 0x02000000 /* Symbol, Currency */
#define UC_SK 0x04000000 /* Symbol, Modifier */
#define UC_SO 0x08000000 /* Symbol, Other */
#define UC_L 0x10000000 /* Left-To-Right */
#define UC_R 0x20000000 /* Right-To-Left */
#define UC_EN 0x40000000 /* European Number */
#define UC_ES 0x80000000 /* European Number Separator */
/*
 * Values that can appear in the `mask2' parameter of the ucisprop()
 * function.
 */
#define UC_ET 0x00000001 /* European Number Terminator */
#define UC_AN 0x00000002 /* Arabic Number */
#define UC_CS 0x00000004 /* Common Number Separator */
#define UC_B 0x00000008 /* Block Separator */
#define UC_S 0x00000010 /* Segment Separator */
#define UC_WS 0x00000020 /* Whitespace */
#define UC_ON 0x00000040 /* Other Neutrals */
/*
 * Implementation specific character properties (also `mask2' values; they
 * continue the bit sequence started above).
 */
#define UC_CM 0x00000080 /* Composite */
#define UC_NB 0x00000100 /* Non-Breaking */
#define UC_SY 0x00000200 /* Symmetric */
#define UC_HD 0x00000400 /* Hex Digit */
#define UC_QM 0x00000800 /* Quote Mark */
#define UC_MR 0x00001000 /* Mirroring */
#define UC_SS 0x00002000 /* Space, other */
#define UC_CP 0x00004000 /* Defined */
/*
 * Added for UnicodeData-2.1.3 (also `mask2' values).
 */
#define UC_PI 0x00008000 /* Punctuation, Initial */
#define UC_PF 0x00010000 /* Punctuation, Final */
/*
 * This is the primary function for testing to see if a character has some set
 * of properties. The macros that test for various character properties all
 * call this function with some set of masks.
 *
 * code  - the character code to test.
 * mask1 - bitwise OR of the UC_* values documented for `mask1'.
 * mask2 - bitwise OR of the UC_* values documented for `mask2'.
 *
 * NOTE(review): the predicate macros pass several mutually exclusive
 * categories in a single call, which suggests the result is non-zero when
 * the character has ANY of the requested properties; confirm against the
 * implementation.
 */
extern int ucisprop __((unsigned long code, unsigned long mask1,
unsigned long mask2));
/*
 * Character classification macros in the style of <ctype.h>. Each one
 * expands to a single ucisprop() call, so `cc' is evaluated exactly once.
 * The argument is parenthesised in every expansion.
 *
 * NOTE(review): ucispunct() does not include UC_PC (Punctuation, Connector)
 * although ucisgraph() and ucisprint() do; confirm whether that is
 * intentional.
 */
#define ucisalpha(cc) ucisprop((cc), UC_LU|UC_LL|UC_LM|UC_LO|UC_LT, 0)
#define ucisdigit(cc) ucisprop((cc), UC_ND, 0)
#define ucisalnum(cc) ucisprop((cc), UC_LU|UC_LL|UC_LM|UC_LO|UC_LT|UC_ND, 0)
#define uciscntrl(cc) ucisprop((cc), UC_CC|UC_CF, 0)
#define ucisspace(cc) ucisprop((cc), UC_ZS|UC_SS, 0)
#define ucisblank(cc) ucisprop((cc), UC_ZS, 0)
#define ucispunct(cc) ucisprop((cc), UC_PD|UC_PS|UC_PE|UC_PO, UC_PI|UC_PF)
/*
 * Graphic characters: every mark, number, letter, punctuation and symbol
 * category. ucisprint() is the same set plus UC_ZS (space separators).
 */
#define ucisgraph(cc) ucisprop((cc), UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\
                               UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\
                               UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|\
                               UC_SO, UC_PI|UC_PF)
#define ucisprint(cc) ucisprop((cc), UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\
                               UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\
                               UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|\
                               UC_SO|UC_ZS, UC_PI|UC_PF)
#define ucisupper(cc) ucisprop((cc), UC_LU, 0)
#define ucislower(cc) ucisprop((cc), UC_LL, 0)
#define ucistitle(cc) ucisprop((cc), UC_LT, 0)
#define ucisxdigit(cc) ucisprop((cc), 0, UC_HD)
#define ucisisocntrl(cc) ucisprop((cc), UC_CC, 0)
#define ucisfmtcntrl(cc) ucisprop((cc), UC_CF, 0)
#define ucissymbol(cc) ucisprop((cc), UC_SM|UC_SC|UC_SO|UC_SK, 0)
#define ucisnumber(cc) ucisprop((cc), UC_ND|UC_NO|UC_NL, 0)
#define ucisnonspacing(cc) ucisprop((cc), UC_MN, 0)
#define ucisopenpunct(cc) ucisprop((cc), UC_PS, 0)
#define ucisclosepunct(cc) ucisprop((cc), UC_PE, 0)
#define ucisinitialpunct(cc) ucisprop((cc), 0, UC_PI)
#define ucisfinalpunct(cc) ucisprop((cc), 0, UC_PF)
#define uciscomposite(cc) ucisprop((cc), 0, UC_CM)
/* ucishex() tests the same property as ucisxdigit(). */
#define ucishex(cc) ucisprop((cc), 0, UC_HD)
#define ucisquote(cc) ucisprop((cc), 0, UC_QM)
#define ucissymmetric(cc) ucisprop((cc), 0, UC_SY)
#define ucismirroring(cc) ucisprop((cc), 0, UC_MR)
#define ucisnonbreaking(cc) ucisprop((cc), 0, UC_NB)
/*
 * Directionality predicates.
 *
 * Strong   : UC_L or UC_R.
 * Weak     : UC_EN, UC_ES (first mask word) or UC_ET, UC_AN, UC_CS
 *            (second mask word).
 * Neutral  : UC_B, UC_S, UC_WS or UC_ON.
 * Separator: UC_B or UC_S.
 */
#define ucisrtl(cc)       ucisprop((cc), UC_R, 0)
#define ucisltr(cc)       ucisprop((cc), UC_L, 0)
#define ucisstrong(cc)    ucisprop((cc), UC_L | UC_R, 0)
#define ucisweak(cc)      ucisprop((cc), UC_EN | UC_ES, UC_ET | UC_AN | UC_CS)
#define ucisneutral(cc)   ucisprop((cc), 0, UC_B | UC_S | UC_WS | UC_ON)
#define ucisseparator(cc) ucisprop((cc), 0, UC_B | UC_S)
/*
 * Other macros inspired by John Cowan.
 *
 * Each macro expands to one ucisprop() call, so `cc' is evaluated once.
 * ucisnsmark() tests the same property as ucisnonspacing().
 */
#define ucismark(cc) ucisprop((cc), UC_MN|UC_MC|UC_ME, 0)
#define ucismodif(cc) ucisprop((cc), UC_LM, 0)
#define ucisletnum(cc) ucisprop((cc), UC_NL, 0)
#define ucisconnect(cc) ucisprop((cc), UC_PC, 0)
#define ucisdash(cc) ucisprop((cc), UC_PD, 0)
#define ucismath(cc) ucisprop((cc), UC_SM, 0)
#define uciscurrency(cc) ucisprop((cc), UC_SC, 0)
#define ucismodifsymbol(cc) ucisprop((cc), UC_SK, 0)
#define ucisnsmark(cc) ucisprop((cc), UC_MN, 0)
#define ucisspmark(cc) ucisprop((cc), UC_MC, 0)
#define ucisenclosing(cc) ucisprop((cc), UC_ME, 0)
#define ucisprivate(cc) ucisprop((cc), UC_CO, 0)
#define ucissurrogate(cc) ucisprop((cc), UC_OS, 0)
#define ucislsep(cc) ucisprop((cc), UC_ZL, 0)
#define ucispsep(cc) ucisprop((cc), UC_ZP, 0)
/*
 * Identifier tests: a start character is a letter (upper, lower, title,
 * other) or a letter-number; a part character may additionally be a
 * non-spacing or spacing mark, a decimal digit, connector punctuation or
 * a format control.
 */
#define ucisidentstart(cc) ucisprop((cc), UC_LU|UC_LL|UC_LT|UC_LO|UC_NL, 0)
#define ucisidentpart(cc) ucisprop((cc), UC_LU|UC_LL|UC_LT|UC_LO|UC_NL|\
                                   UC_MN|UC_MC|UC_ND|UC_PC|UC_CF, 0)
#define ucisdefined(cc) ucisprop((cc), 0, UC_CP)
/*
 * The whole replacement list is parenthesised so the negation cannot be
 * regrouped by operators surrounding the macro invocation.
 */
#define ucisundefined(cc) (!ucisprop((cc), 0, UC_CP))
/*
 * Other miscellaneous character property macros.
 *
 * These are pure range tests on the code value and do not consult the
 * loaded data tables. Note that `cc' is evaluated more than once.
 *
 * ucishan    - 0x4e00..0x9fff or 0xf900..0xfaff.
 * ucishangul - precomposed Hangul syllables, 0xac00..0xd7a3. The upper
 *              bound is the last precomposed syllable; codes above it are
 *              not syllables and cannot be decomposed arithmetically.
 */
#define ucishan(cc) (((cc) >= 0x4e00 && (cc) <= 0x9fff) ||\
                     ((cc) >= 0xf900 && (cc) <= 0xfaff))
#define ucishangul(cc) ((cc) >= 0xac00 && (cc) <= 0xd7a3)
/**************************************************************************
 *
 * Functions for case conversion.
 *
 * Each function takes a character code and returns a character code. Per
 * the package manual page, the code is returned unchanged when it is
 * already in the requested case or has no equivalent in that case;
 * otherwise the equivalent is returned.
 *
 **************************************************************************/
extern unsigned long uctoupper __((unsigned long code));
extern unsigned long uctolower __((unsigned long code));
extern unsigned long uctotitle __((unsigned long code));
/**************************************************************************
 *
 * Functions for getting decompositions.
 *
 **************************************************************************/
/*
 * This routine determines if the code has a decomposition. If it returns 0,
 * there is no decomposition. Any other value indicates a decomposition was
 * returned.
 *
 * On a non-zero return `*num' holds the number of codes in the
 * decomposition and `*decomp' points at the first of them.
 *
 * NOTE(review): `*decomp' presumably points into the library's own tables
 * and should be neither modified nor freed by the caller; confirm against
 * the implementation.
 */
extern int ucdecomp __((unsigned long code, unsigned long *num,
unsigned long **decomp));
/*
 * If the code is a Hangul syllable, this routine decomposes it into the array
 * passed. The array size should be at least 3.
 *
 * Per the manual page, a zero return means the code is not a Hangul
 * syllable; otherwise `*num' is set to 2 or 3 and that many entries of
 * `decomp' are filled in.
 */
extern int ucdecomp_hangul __((unsigned long code, unsigned long *num,
unsigned long decomp[]));
/**************************************************************************
 *
 * Functions for getting combining classes.
 *
 **************************************************************************/
/*
 * This will return the combining class for a character to be used with the
 * Canonical Ordering algorithm.
 *
 * The format documentation for the combining class table states that a
 * character not found in the table is assumed to have class 0.
 */
extern unsigned long uccombining_class __((unsigned long code));
/**************************************************************************
 *
 * Functions for getting numbers and digits.
 *
 **************************************************************************/
/*
 * Numeric value of a character expressed as a fraction. Per the manual
 * page, both fields hold the same value when the character is a single
 * digit.
 */
struct ucnumber {
int numerator;
int denominator;
};
/*
 * ucnumber_lookup() fills in `*num' and returns non-zero if the code has a
 * numeric value; it returns 0 if the code is not a number.
 *
 * ucdigit_lookup() fills in `*digit' and returns non-zero if the code is a
 * digit; it returns 0 otherwise.
 */
extern int ucnumber_lookup __((unsigned long code, struct ucnumber *num));
extern int ucdigit_lookup __((unsigned long code, int *digit));
/*
 * For compatibility with John Cowan's "uctype" package.
 *
 * Per the manual page these are built on ucnumber_lookup() and
 * ucdigit_lookup() respectively. NOTE(review): the value returned for a
 * code that is not a number or digit is not specified here; check the
 * implementation before relying on it.
 */
extern struct ucnumber ucgetnumber __((unsigned long code));
extern int ucgetdigit __((unsigned long code));
/**************************************************************************
 *
 * Functions library initialization and cleanup.
 *
 **************************************************************************/
/*
 * Macros for specifying the data tables to be loaded, unloaded, or reloaded
 * by the ucdata_load(), ucdata_unload(), and ucdata_reload() routines.
 * Each macro is a distinct bit so they can be combined with bitwise OR;
 * UCDATA_ALL is the combination of all five.
 */
#define UCDATA_CASE 0x01
#define UCDATA_CTYPE 0x02
#define UCDATA_DECOMP 0x04
#define UCDATA_CMBCL 0x08
#define UCDATA_NUM 0x10
#define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\
UCDATA_CMBCL|UCDATA_NUM)
/*
 * Functions to load, unload, and reload specific data files.
 *
 * paths - per the manual page, a colon-separated list of directories in
 *         which the data files are looked for.
 * mask  - bitwise combination of the UCDATA_* macros above selecting the
 *         tables to act on.
 *
 * The manual page says ucdata_load() should be called before any other
 * function of the package and ucdata_unload() when the application is done
 * with it. None of the three reports success or failure to the caller.
 */
extern void ucdata_load __((char *paths, int mask));
extern void ucdata_unload __((int mask));
extern void ucdata_reload __((char *paths, int mask));
/*
 * Deprecated functions, now just compatibility macros that load or unload
 * every table.
 */
#define ucdata_setup(p) ucdata_load(p, UCDATA_ALL)
#define ucdata_cleanup() ucdata_unload(UCDATA_ALL)
#undef __
#ifdef __cplusplus
}
#endif
#endif /* _h_ucdata */

View file

@ -0,0 +1,464 @@
.\"
.\" $Id: ucdata.man,v 1.4 1999/11/19 16:08:33 mleisher Exp $
.\"
.TH ucdata 3 "19 November 1999"
.SH NAME
ucdata \- package for providing Unicode/ISO10646 character information
.SH SYNOPSIS
#include <ucdata.h>
.sp
void ucdata_load(char * paths, int masks)
.sp
void ucdata_unload(int masks)
.sp
void ucdata_reload(char * paths, int masks)
.sp
int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
.sp
int ucdecomp_hangul(unsigned long code, unsigned long *num,
unsigned long decomp[])
.sp
.nf
struct ucnumber {
int numerator;
int denominator;
};
.sp
int ucnumber_lookup(unsigned long code, struct ucnumber *num)
.sp
int ucdigit_lookup(unsigned long code, int *digit)
.sp
struct ucnumber ucgetnumber(unsigned long code)
.sp
int ucgetdigit(unsigned long code)
.sp
unsigned long uctoupper(unsigned long code)
.sp
unsigned long uctolower(unsigned long code)
.sp
unsigned long uctotitle(unsigned long code)
.sp
int ucisalpha(unsigned long code)
.sp
int ucisalnum(unsigned long code)
.sp
int ucisdigit(unsigned long code)
.sp
int uciscntrl(unsigned long code)
.sp
int ucisspace(unsigned long code)
.sp
int ucisblank(unsigned long code)
.sp
int ucispunct(unsigned long code)
.sp
int ucisgraph(unsigned long code)
.sp
int ucisprint(unsigned long code)
.sp
int ucisxdigit(unsigned long code)
.sp
int ucisupper(unsigned long code)
.sp
int ucislower(unsigned long code)
.sp
int ucistitle(unsigned long code)
.sp
int ucisisocntrl(unsigned long code)
.sp
int ucisfmtcntrl(unsigned long code)
.sp
int ucissymbol(unsigned long code)
.sp
int ucisnumber(unsigned long code)
.sp
int ucisnonspacing(unsigned long code)
.sp
int ucisopenpunct(unsigned long code)
.sp
int ucisclosepunct(unsigned long code)
.sp
int ucisinitialpunct(unsigned long code)
.sp
int ucisfinalpunct(unsigned long code)
.sp
int uciscomposite(unsigned long code)
.sp
int ucisquote(unsigned long code)
.sp
int ucissymmetric(unsigned long code)
.sp
int ucismirroring(unsigned long code)
.sp
int ucisnonbreaking(unsigned long code)
.sp
int ucisrtl(unsigned long code)
.sp
int ucisltr(unsigned long code)
.sp
int ucisstrong(unsigned long code)
.sp
int ucisweak(unsigned long code)
.sp
int ucisneutral(unsigned long code)
.sp
int ucisseparator(unsigned long code)
.sp
int ucislsep(unsigned long code)
.sp
int ucispsep(unsigned long code)
.sp
int ucismark(unsigned long code)
.sp
int ucisnsmark(unsigned long code)
.sp
int ucisspmark(unsigned long code)
.sp
int ucismodif(unsigned long code)
.sp
int ucismodifsymbol(unsigned long code)
.sp
int ucisletnum(unsigned long code)
.sp
int ucisconnect(unsigned long code)
.sp
int ucisdash(unsigned long code)
.sp
int ucismath(unsigned long code)
.sp
int uciscurrency(unsigned long code)
.sp
int ucisenclosing(unsigned long code)
.sp
int ucisprivate(unsigned long code)
.sp
int ucissurrogate(unsigned long code)
.sp
int ucisidentstart(unsigned long code)
.sp
int ucisidentpart(unsigned long code)
.sp
int ucisdefined(unsigned long code)
.sp
int ucisundefined(unsigned long code)
.sp
int ucishan(unsigned long code)
.sp
int ucishangul(unsigned long code)
.SH DESCRIPTION
.TP 4
.BR Macros
.br
UCDATA_CASE
.br
UCDATA_CTYPE
.br
UCDATA_DECOMP
.br
UCDATA_CMBCL
.br
UCDATA_NUM
.br
UCDATA_ALL
.br
.TP 4
.BR ucdata_load()
This function initializes the UCData library by locating the data files in one
of the colon-separated directories in the `paths' parameter. The data files
to be loaded are specified in the `masks' parameter as a bitwise combination
of the macros listed above.
.sp
This should be called before using any of the other functions.
.TP 4
.BR ucdata_unload()
This function unloads the data tables specified in the `masks' parameter.
.sp
This function should be called when the application is done using the UCData
package.
.TP 4
.BR ucdata_reload()
This function reloads the data files from one of the colon-separated
directories in the `paths' parameter. The data files to be reloaded are
specified in the `masks' parameter as a bitwise combination of the macros
listed above.
.TP 4
.BR ucdecomp()
This function determines if a character has a decomposition and returns the
decomposition information if it exists.
.sp
If a zero is returned, there is no decomposition. If a non-zero is
returned, then the `num' and `decomp' variables are filled in with the
appropriate values.
.sp
Example call:
.sp
.nf
unsigned long i, num, *decomp;
if (ucdecomp(0x1d5, &num, &decomp) != 0) {
for (i = 0; i < num; i++)
printf("0x%08lX,", decomp[i]);
putchar('\n');
}
.TP 4
.BR ucdecomp_hangul()
This function determines if a Hangul syllable has a
decomposition and returns the decomposition information.
.sp
An array of at least size 3 should be passed to the function
for the decomposition of the syllable.
.sp
If a zero is returned, the character is not a Hangul
syllable. If a non-zero is returned, the `num' field
will be 2 or 3 and the syllable will be decomposed into
the `decomp' array arithmetically.
.sp
Example call:
.sp
.nf
unsigned long i, num, decomp[3];
if (ucdecomp_hangul(0xb1ba, &num, decomp) != 0) {
for (i = 0; i < num; i++)
printf("0x%08lX,", decomp[i]);
putchar('\n');
}
.TP 4
.BR ucnumber_lookup()
This function determines if the code is a number and
fills in the `num' field with the numerator and
denominator. If the code happens to be a single digit,
the numerator and denominator fields will be the same.
.sp
If the function returns 0, the code is not a number.
Any other return value means the code is a number.
.TP 4
.BR ucdigit_lookup()
This function determines if the code is a digit and
fills in the `digit' field with the digit value.
.sp
If the function returns 0, the code is not a digit.
Any other return value means the code is a digit.
.TP 4
.BR ucgetnumber()
This is a compatibility function with John Cowan's
"uctype" package. It uses ucnumber_lookup().
.TP 4
.BR ucgetdigit()
This is a compatibility function with John Cowan's
"uctype" package. It uses ucdigit_lookup().
.TP 4
.BR uctoupper()
This function returns the code unchanged if it is
already upper case or has no upper case equivalent.
Otherwise the upper case equivalent is returned.
.TP 4
.BR uctolower()
This function returns the code unchanged if it is
already lower case or has no lower case equivalent.
Otherwise the lower case equivalent is returned.
.TP 4
.BR uctotitle()
This function returns the code unchanged if it is
already title case or has no title case equivalent.
Otherwise the title case equivalent is returned.
.TP 4
.BR ucisalpha()
Test if \fIcode\fR is an alpha character.
.TP 4
.BR ucisalnum()
Test if \fIcode\fR is an alpha or digit character.
.TP 4
.BR ucisdigit()
Test if \fIcode\fR is a digit character.
.TP 4
.BR uciscntrl()
Test if \fIcode\fR is a control character.
.TP 4
.BR ucisspace()
Test if \fIcode\fR is a space character.
.TP 4
.BR ucisblank()
Test if \fIcode\fR is a blank character.
.TP 4
.BR ucispunct()
Test if \fIcode\fR is a punctuation character.
.TP 4
.BR ucisgraph()
Test if \fIcode\fR is a graphical (visible) character.
.TP 4
.BR ucisprint()
Test if \fIcode\fR is a printable character.
.TP 4
.BR ucisxdigit()
Test if \fIcode\fR is a hexadecimal digit character.
.TP 4
.BR ucisupper()
Test if \fIcode\fR is an upper case character.
.TP 4
.BR ucislower()
Test if \fIcode\fR is a lower case character.
.TP 4
.BR ucistitle()
Test if \fIcode\fR is a title case character.
.TP 4
.BR ucisisocntrl()
Is the character a C0 control character (< 32)?
.TP 4
.BR ucisfmtcntrl()
Is the character a format control character?
.TP 4
.BR ucissymbol()
Is the character a symbol?
.TP 4
.BR ucisnumber()
Is the character a number or digit?
.TP 4
.BR ucisnonspacing()
Is the character non-spacing?
.TP 4
.BR ucisopenpunct()
Is the character an open/left punctuation (e.g. '[')?
.TP 4
.BR ucisclosepunct()
Is the character a close/right punctuation (e.g. ']')?
.TP 4
.BR ucisinitialpunct()
Is the character an initial punctuation (e.g. U+2018 LEFT
SINGLE QUOTATION MARK)?
.TP 4
.BR ucisfinalpunct()
Is the character a final punctuation (e.g. U+2019 RIGHT
SINGLE QUOTATION MARK)?
.TP 4
.BR uciscomposite()
Can the character be decomposed into a set of other
characters?
.TP 4
.BR ucisquote()
Is the character one of the many quotation marks?
.TP 4
.BR ucissymmetric()
Is the character one that has an opposite form
(e.g. '(' and ')')?
.TP 4
.BR ucismirroring()
Is the character mirroring (superset of symmetric)?
.TP 4
.BR ucisnonbreaking()
Is the character non-breaking (i.e. non-breaking
space)?
.TP 4
.BR ucisrtl()
Does the character have strong right-to-left
directionality (i.e. Arabic letters)?
.TP 4
.BR ucisltr()
Does the character have strong left-to-right
directionality (i.e. Latin letters)?
.TP 4
.BR ucisstrong()
Does the character have strong directionality?
.TP 4
.BR ucisweak()
Does the character have weak directionality
(i.e. numbers)?
.TP 4
.BR ucisneutral()
Does the character have neutral directionality
(i.e. whitespace)?
.TP 4
.BR ucisseparator()
Is the character a block or segment separator?
.TP 4
.BR ucislsep()
Is the character a line separator?
.TP 4
.BR ucispsep()
Is the character a paragraph separator?
.TP 4
.BR ucismark()
Is the character a mark of some kind?
.TP 4
.BR ucisnsmark()
Is the character a non-spacing mark?
.TP 4
.BR ucisspmark()
Is the character a spacing mark?
.TP 4
.BR ucismodif()
Is the character a modifier letter?
.TP 4
.BR ucismodifsymbol()
Is the character a modifier symbol?
.TP 4
.BR ucisletnum()
Is the character a number represented by a letter?
.TP 4
.BR ucisconnect()
Is the character connecting punctuation?
.TP 4
.BR ucisdash()
Is the character dash punctuation?
.TP 4
.BR ucismath()
Is the character a math character?
.TP 4
.BR uciscurrency()
Is the character a currency character?
.TP 4
.BR ucisenclosing()
Is the character enclosing (i.e. enclosing box)?
.TP 4
.BR ucisprivate()
Is the character from the Private Use Area?
.TP 4
.BR ucissurrogate()
Is the character one of the surrogate codes?
.TP 4
.BR ucisidentstart()
Is the character a legal initial character of an identifier?
.TP 4
.BR ucisidentpart()
Is the character a legal identifier character?
.TP 4
.BR ucisdefined()
Is the character defined (appeared in one of the data
files)?
.TP 4
.BR ucisundefined()
Is the character not defined (non-Unicode)?
.TP 4
.BR ucishan()
Is the character a Han ideograph?
.TP 4
.BR ucishangul()
Is the character a pre-composed Hangul syllable?
.SH "SEE ALSO"
ctype(3)
.SH ACKNOWLEDGMENTS
These are people who have helped with patches or
alerted me about problems.
.sp
John Cowan <cowan@locke.ccil.org>
.br
Bob Verbrugge <bob_verbrugge@nl.compuware.com>
.br
Christophe Pierret <cpierret@businessobjects.com>
.br
Kent Johnson <kent@pondview.mv.com>
.br
Valeriy E. Ushakov <uwe@ptc.spbu.ru>
.SH AUTHOR
Mark Leisher
.br
Computing Research Lab
.br
New Mexico State University
.br
Email: mleisher@crl.nmsu.edu

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,813 @@
/*
* Copyright 1999 Computing Research Labs, New Mexico State University
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/*
 * RCS identification string for this file. It is left out entirely when
 * `lint' is defined; when compiling with GCC it is tagged with the
 * `unused' attribute.
 */
#ifndef lint
#ifdef __GNUC__
static char rcsid[] __attribute__ ((unused)) = "$Id: ucpgba.c,v 1.4 1999/11/29 16:41:06 mleisher Exp $";
#else
static char rcsid[] = "$Id: ucpgba.c,v 1.4 1999/11/29 16:41:06 mleisher Exp $";
#endif
#endif
#include <stdio.h>
#include <stdlib.h>
#include "ucdata.h"
#include "ucpgba.h"
/*
 * Helper predicates used when reordering RTL runs, for the special case
 * of non-spacing characters occurring inside weakly directional text.
 *
 * ISWEAKSPECIAL  - weak (EN, ES, ET, AN, CS) or non-spacing.
 * ISDIGITSPECIAL - decimal digit or non-spacing.
 */
#define ISWEAKSPECIAL(cc) \
    ucisprop((cc), UC_EN | UC_ES | UC_MN, UC_ET | UC_AN | UC_CS)
#define ISDIGITSPECIAL(cc) \
    ucisprop((cc), UC_ND | UC_MN, 0)
/*
 * Predicates used while splitting a string into runs of text with
 * different directions:
 *
 * ISLTR_LTR      - member of an LTR run in an LTR context: ltr,
 *                  non-spacing, weak or neutral.
 *
 * ISRTL_RTL      - member of an RTL run in an RTL context: rtl,
 *                  non-spacing, weak or neutral.
 *
 * ISRTL_NEUTRAL  - rtl or neutral.
 *
 * ISWEAK_NEUTRAL - weak or neutral.
 */
#define ISLTR_LTR(cc) \
    ucisprop((cc), UC_L | UC_MN | UC_EN | UC_ES, \
             UC_ET | UC_AN | UC_CS | UC_B | UC_S | UC_WS | UC_ON)
#define ISRTL_RTL(cc) \
    ucisprop((cc), UC_R | UC_MN | UC_EN | UC_ES, \
             UC_ET | UC_AN | UC_CS | UC_B | UC_S | UC_WS | UC_ON)
#define ISRTL_NEUTRAL(cc) \
    ucisprop((cc), UC_R, UC_B | UC_S | UC_WS | UC_ON)
#define ISWEAK_NEUTRAL(cc) \
    ucisprop((cc), UC_EN | UC_ES, \
             UC_B | UC_S | UC_WS | UC_ON | UC_ET | UC_AN | UC_CS)
/*
 * Hard-coded table of symmetric characters, kept here until it can be
 * generated automatically. Entries are stored as consecutive
 * (character, counterpart) pairs, and every pairing appears in both
 * directions; each row below holds one such two-way pairing.
 */
static unsigned long _symmetric_pairs[] = {
    0x0028, 0x0029,  0x0029, 0x0028,
    0x003C, 0x003E,  0x003E, 0x003C,
    0x005B, 0x005D,  0x005D, 0x005B,
    0x007B, 0x007D,  0x007D, 0x007B,
    0x2045, 0x2046,  0x2046, 0x2045,
    0x207D, 0x207E,  0x207E, 0x207D,
    0x208D, 0x208E,  0x208E, 0x208D,
    0x3008, 0x3009,  0x3009, 0x3008,
    0x300A, 0x300B,  0x300B, 0x300A,
    0x300C, 0x300D,  0x300D, 0x300C,
    0x300E, 0x300F,  0x300F, 0x300E,
    0x3010, 0x3011,  0x3011, 0x3010,
    0x3014, 0x3015,  0x3015, 0x3014,
    0x3016, 0x3017,  0x3017, 0x3016,
    0x3018, 0x3019,  0x3019, 0x3018,
    0x301A, 0x301B,  0x301B, 0x301A,
    0xFD3E, 0xFD3F,  0xFD3F, 0xFD3E,
    0xFE59, 0xFE5A,  0xFE5A, 0xFE59,
    0xFE5B, 0xFE5C,  0xFE5C, 0xFE5B,
    0xFE5D, 0xFE5E,  0xFE5E, 0xFE5D,
    0xFF08, 0xFF09,  0xFF09, 0xFF08,
    0xFF3B, 0xFF3D,  0xFF3D, 0xFF3B,
    0xFF5B, 0xFF5D,  0xFF5D, 0xFF5B,
    0xFF62, 0xFF63,  0xFF63, 0xFF62,
};

/*
 * Number of unsigned long entries in the table (two per pair).
 */
static int _symmetric_pairs_size =
    sizeof(_symmetric_pairs)/sizeof(_symmetric_pairs[0]);

/*
 * Return the counterpart of `c' from the symmetric pair table, or `c'
 * itself when it has no entry. The table is scanned linearly, one pair
 * at a time.
 */
static unsigned long
#ifdef __STDC__
_ucsymmetric_pair(unsigned long c)
#else
_ucsymmetric_pair(c)
unsigned long c;
#endif
{
    unsigned long *pair, *limit;

    pair = _symmetric_pairs;
    limit = _symmetric_pairs + _symmetric_pairs_size;

    while (pair < limit) {
        if (pair[0] == c)
            return pair[1];
        pair += 2;
    }

    return c;
}
/*
 * This routine creates a new run for source[start..end), copies the text
 * into it, appends it to the logical text order chain of `str' and returns
 * it to the caller to be linked into the visual text order chain.
 *
 * str       - string whose logical chain receives the run.
 * src       - source text; the run keeps this pointer, it does not copy
 *             the array itself.
 * start/end - half-open range of `src' covered by the run.
 * direction - UCPGBA_RTL stores the text reversed, substituting the other
 *             form of symmetric characters and of '<' and '>'; any other
 *             value stores the text as is.
 *
 * A single allocation holds both per-character arrays: `chars' occupies
 * the first half and `positions' the second half, so releasing `chars'
 * releases both.
 *
 * All four chain links of the new run are initialised; the visual links
 * and the logical links that are not set below stay 0.
 *
 * Returns the new run, or 0 if memory could not be obtained, in which case
 * `str' is left untouched.
 */
static ucrun_t *
#ifdef __STDC__
_add_run(ucstring_t *str, unsigned long *src,
         unsigned long start, unsigned long end, int direction)
#else
_add_run(str, src, start, end, direction)
ucstring_t *str;
unsigned long *src, start, end;
int direction;
#endif
{
    unsigned long i, len, pos;
    ucrun_t *run;

    /*
     * An inverted range is treated as empty rather than as a huge one.
     */
    len = (end > start) ? end - start : 0;

    run = (ucrun_t *) malloc(sizeof(ucrun_t));
    if (run == 0)
        return 0;

    run->chars = (unsigned long *) malloc(sizeof(unsigned long) * (len << 1));
    if (run->chars == 0 && len > 0) {
        /*
         * A null result is only an error for a non-empty run, since a
         * zero-sized request may legitimately yield a null pointer.
         */
        free(run);
        return 0;
    }

    run->visual_next = run->visual_prev = 0;
    run->logical_next = run->logical_prev = 0;
    run->direction = direction;
    run->cursor = ~0;
    run->positions = run->chars + len;
    run->source = src;
    run->start = start;
    run->end = end;

    if (direction == UCPGBA_RTL) {
        /*
         * Copy the source text into the run in reverse order and select
         * replacements for the pairwise punctuation and the <> characters.
         */
        for (i = 0, pos = end; i < len; i++) {
            pos--;
            run->positions[i] = pos;
            if (ucissymmetric(src[pos]) || src[pos] == '<' || src[pos] == '>')
                run->chars[i] = _ucsymmetric_pair(src[pos]);
            else
                run->chars[i] = src[pos];
        }
    } else {
        /*
         * Copy the source text into the run directly.
         */
        for (i = 0; i < len; i++) {
            run->positions[i] = start + i;
            run->chars[i] = src[start + i];
        }
    }

    /*
     * Add the run to the end of the logical list for cursor traversal.
     */
    if (str->logical_first == 0)
        str->logical_first = str->logical_last = run;
    else {
        run->logical_prev = str->logical_last;
        str->logical_last->logical_next = run;
        str->logical_last = run;
    }

    return run;
}
static void
#ifdef __STDC__
_ucadd_rtl_segment(ucstring_t *str, unsigned long *source, unsigned long start,
unsigned long end)
#else
_ucadd_rtl_segment(str, source, start, end)
ucstring_t *str;
unsigned long *source, start, end;
#endif
{
unsigned long s, e;
ucrun_t *run, *lrun;
/*
* This is used to splice runs into strings with overall LTR direction.
* The `lrun' variable will never be NULL because at least one LTR run was
* added before this RTL run.
*/
lrun = str->visual_last;
for (e = s = start; s < end;) {
for (; e < end && ISRTL_NEUTRAL(source[e]); e++) ;
if (e > s) {
run = _add_run(str, source, s, e, UCPGBA_RTL);
/*
* Add the run to the visual list for cursor traversal.
*/
if (str->visual_first != 0) {
if (str->direction == UCPGBA_LTR) {
run->visual_prev = lrun;
run->visual_next = lrun->visual_next;
if (lrun->visual_next != 0)
lrun->visual_next->visual_prev = run;
lrun->visual_next = run;
if (lrun == str->visual_last)
str->visual_last = run;
} else {
run->visual_next = str->visual_first;
str->visual_first->visual_prev = run;
str->visual_first = run;
}
} else
str->visual_first = str->visual_last = run;
}
/*
* Now handle the weak sequences such that multiple non-digit groups
* are kept together appropriately and added as RTL sequences.
*/
for (s = e; e < end && ISWEAKSPECIAL(source[e]); e++) {
if (!ISDIGITSPECIAL(source[e]) &&
(e + 1 == end || !ISDIGITSPECIAL(source[e + 1])))
break;
}
if (e > s) {
run = _add_run(str, source, s, e, UCPGBA_LTR);
/*
* Add the run to the visual list for cursor traversal.
*/
if (str->visual_first != 0) {
if (str->direction == UCPGBA_LTR) {
run->visual_prev = lrun;
run->visual_next = lrun->visual_next;
if (lrun->visual_next != 0)
lrun->visual_next->visual_prev = run;
lrun->visual_next = run;
if (lrun == str->visual_last)
str->visual_last = run;
} else {
run->visual_next = str->visual_first;
str->visual_first->visual_prev = run;
str->visual_first = run;
}
} else
str->visual_first = str->visual_last = run;
}
/*
* Collect all weak non-digit sequences for an RTL segment. These
* will appear as part of the next RTL segment or will be added as
* an RTL segment by themselves.
*/
for (s = e; e < end && ucisweak(source[e]) && !ucisdigit(source[e]);
e++) ;
}
/*
* Capture any weak non-digit sequences that occur at the end of the RTL
* run.
*/
if (e > s) {
run = _add_run(str, source, s, e, UCPGBA_RTL);
/*
* Add the run to the visual list for cursor traversal.
*/
if (str->visual_first != 0) {
if (str->direction == UCPGBA_LTR) {
run->visual_prev = lrun;
run->visual_next = lrun->visual_next;
if (lrun->visual_next != 0)
lrun->visual_next->visual_prev = run;
lrun->visual_next = run;
if (lrun == str->visual_last)
str->visual_last = run;
} else {
run->visual_next = str->visual_first;
str->visual_first->visual_prev = run;
str->visual_first = run;
}
} else
str->visual_first = str->visual_last = run;
}
}
/*
 * Create one LTR run for source[start..end) and add it to the visual text
 * order chain of `str' for cursor traversal.
 *
 * - If no run was produced there is nothing to link and the visual chain
 *   is left as it was.
 * - If the visual chain is empty the run becomes its only member.
 * - In a string with overall LTR direction the run is appended after the
 *   current last visual run.
 * - Otherwise the run is placed at the front of the visual chain.
 */
static void
#ifdef __STDC__
_ucadd_ltr_segment(ucstring_t *str, unsigned long *source, unsigned long start,
                   unsigned long end)
#else
_ucadd_ltr_segment(str, source, start, end)
ucstring_t *str;
unsigned long *source, start, end;
#endif
{
    ucrun_t *run;

    run = _add_run(str, source, start, end, UCPGBA_LTR);
    if (run == 0)
        return;

    if (str->visual_first == 0) {
        str->visual_first = str->visual_last = run;
        return;
    }

    if (str->direction == UCPGBA_LTR) {
        run->visual_prev = str->visual_last;
        str->visual_last->visual_next = run;
        str->visual_last = run;
    } else {
        run->visual_next = str->visual_first;
        str->visual_first->visual_prev = run;
        str->visual_first = run;
    }
}
/*
 * Build a reordered string for source[start, end).
 *
 * The overall direction is taken from the first character with strong
 * directionality in the range; if there is none, `default_direction' is
 * used.  The range is then split into alternating LTR and RTL runs which
 * are handed to _ucadd_ltr_segment() and _ucadd_rtl_segment().
 *
 * The `source' pointer is stored in the result, not copied here.
 *
 * Returns the new string, or 0 if the string structure could not be
 * allocated.  For an empty range (start == end) a string with no runs and
 * a null cursor is returned.  Release the result with ucstring_free().
 */
ucstring_t *
#ifdef __STDC__
ucstring_create(unsigned long *source, unsigned long start, unsigned long end,
                int default_direction, int cursor_motion)
#else
ucstring_create(source, start, end, default_direction, cursor_motion)
unsigned long *source, start, end;
int default_direction, cursor_motion;
#endif
{
    int rtl_first;
    unsigned long s, e;
    ucstring_t *str;

    str = (ucstring_t *) malloc(sizeof(ucstring_t));
    if (str == 0)
      return 0;

    /*
     * Set the initial values.  The direction starts out as the default so
     * that it is defined even for an empty string.
     */
    str->direction = default_direction;
    str->cursor_motion = cursor_motion;
    str->logical_first = str->logical_last = 0;
    str->visual_first = str->visual_last = str->cursor = 0;
    str->source = source;
    str->start = start;
    str->end = end;

    /*
     * If the length of the string is 0, then just return it at this point.
     */
    if (start == end)
      return str;

    /*
     * This flag indicates whether the collection loop for RTL is called
     * before the LTR loop the first time.
     */
    rtl_first = 0;

    /*
     * Look for the first character in the string that has strong
     * directionality.
     */
    for (s = start; s < end && !ucisstrong(source[s]); s++) ;

    if (s == end)
      /*
       * If the string contains no characters with strong directionality, use
       * the default direction.
       */
      str->direction = default_direction;
    else
      str->direction = ucisrtl(source[s]) ? UCPGBA_RTL : UCPGBA_LTR;

    if (str->direction == UCPGBA_RTL)
      /*
       * Set the flag that causes the RTL collection loop to run first.
       */
      rtl_first = 1;

    /*
     * This loop now separates the string into runs based on directionality.
     * It begins at `start' so that text before the requested range is never
     * included in a run.
     */
    for (s = e = start; s < end; s = e) {
        if (!rtl_first) {
            /*
             * Determine the next run of LTR text.
             */
            while (e < end && ISLTR_LTR(source[e]))
              e++;
            if (str->direction != UCPGBA_LTR) {
                /*
                 * In a non-LTR string, trailing weak/neutral characters are
                 * given back so the following RTL pass can pick them up.
                 */
                while (e > s && ISWEAK_NEUTRAL(source[e - 1]))
                  e--;
            }

            /*
             * Add the LTR segment to the string.
             */
            if (e > s)
              _ucadd_ltr_segment(str, source, s, e);
        }

        /*
         * Determine the next run of RTL text.
         */
        s = e;
        while (e < end && ISRTL_RTL(source[e]))
          e++;
        if (str->direction != UCPGBA_RTL) {
            /*
             * In a non-RTL string, trailing weak/neutral characters are
             * given back so the following LTR pass can pick them up.
             */
            while (e > s && ISWEAK_NEUTRAL(source[e - 1]))
              e--;
        }

        /*
         * Add the RTL segment to the string.
         */
        if (e > s)
          _ucadd_rtl_segment(str, source, s, e);

        /*
         * Clear the flag that allowed the RTL collection loop to run first
         * for strings with overall RTL directionality.
         */
        rtl_first = 0;
    }

    /*
     * Set up the initial cursor run.  The cursor run itself is what has to
     * be checked here: it is null if no run was created.
     */
    str->cursor = str->logical_first;
    if (str->cursor != 0)
      str->cursor->cursor = (str->cursor->direction == UCPGBA_RTL) ?
          str->cursor->end - str->cursor->start : 0;

    return str;
}
/*
 * Release a string and every run on its visual list.
 *
 * A run's character storage is released only when the run is non-empty
 * (end > start).  Passing a null string is a no-op.
 */
void
#ifdef __STDC__
ucstring_free(ucstring_t *s)
#else
ucstring_free(s)
ucstring_t *s;
#endif
{
    ucrun_t *run, *following;

    if (s == 0)
      return;

    run = s->visual_first;
    while (run != 0) {
        /*
         * Remember the successor before the run itself is released.
         */
        following = run->visual_next;
        if (run->end > run->start)
          free((char *) run->chars);
        free((char *) run);
        run = following;
    }

    free((char *) s);
}
/*
 * Change the cursor motion flag of a string.
 *
 * Returns the previous cursor motion value, or -1 if `str' is null.
 */
int
#ifdef __STDC__
ucstring_set_cursor_motion(ucstring_t *str, int cursor_motion)
#else
ucstring_set_cursor_motion(str, cursor_motion)
ucstring_t *str;
int cursor_motion;
#endif
{
    int n;

    if (str == 0)
      return -1;

    n = str->cursor_motion;
    str->cursor_motion = cursor_motion;
    return n;
}
/*
 * Move the cursor `count' steps to the right in visual order.
 *
 * Within a run the run-local cursor is incremented.  When the cursor is at
 * the right edge of its run, the cursor moves to the next visual run and is
 * placed at -1 for an RTL run or 0 for an LTR run.
 *
 * Returns 0 if `str' is null.  If the last visual run is exhausted before
 * all steps are taken, returns 1 when at least one step was taken and 0
 * otherwise.  Returns 1 when the loop runs to completion.
 */
static int
#ifdef __STDC__
_ucstring_visual_cursor_right(ucstring_t *str, int count)
#else
_ucstring_visual_cursor_right(str, count)
ucstring_t *str;
int count;
#endif
{
    int remaining;
    ucrun_t *run;

    if (str == 0)
      return 0;

    run = str->cursor;
    for (remaining = count; remaining > 0; remaining--) {
        unsigned long len;
        int at_right_edge;

        len = run->end - run->start;
        at_right_edge =
            (run->direction == UCPGBA_RTL && run->cursor + 1 == len) ||
            run->cursor + 1 > len;

        if (!at_right_edge) {
            /*
             * Still room inside the current run.
             */
            run->cursor++;
            continue;
        }

        /*
         * No run to the right: the cursor is already at the far right end.
         * Report whether any movement occurred.
         */
        if (run->visual_next == 0)
          return (remaining != count);

        /*
         * Step into the next visual run.
         */
        run = run->visual_next;
        str->cursor = run;
        run->cursor = (run->direction == UCPGBA_RTL) ? -1 : 0;
    }
    return 1;
}
/*
 * Move the cursor `count' steps to the right using logical cursor motion.
 *
 * In a string whose overall direction is RTL, moving right walks toward the
 * logical beginning of the string; otherwise it walks toward the logical
 * end.  Inside a run, the run-local cursor is incremented or decremented
 * depending on the combination of string and run direction.
 *
 * Returns 0 if `str' is null.  If the logical edge of the string is reached
 * before all steps are taken, returns 1 when at least one step was taken
 * and 0 otherwise.  Returns 1 when the loop runs to completion.
 */
static int
#ifdef __STDC__
_ucstring_logical_cursor_right(ucstring_t *str, int count)
#else
_ucstring_logical_cursor_right(str, count)
ucstring_t *str;
int count;
#endif
{
    int cnt = count;
    unsigned long size;
    ucrun_t *cursor;

    if (str == 0)
      return 0;

    cursor = str->cursor;
    while (cnt > 0) {
        size = cursor->end - cursor->start;
        if (str->direction == UCPGBA_RTL) {
            if (cursor->direction == UCPGBA_RTL) {
                if (cursor->cursor + 1 == size) {
                    if (cursor == str->logical_first)
                      /*
                       * Already at the beginning of the string.
                       */
                      return (cnt != count);

                    str->cursor = cursor = cursor->logical_prev;
                    size = cursor->end - cursor->start;
                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
                        size : 0;
                } else
                  cursor->cursor++;
            } else {
                if (cursor->cursor == 0) {
                    if (cursor == str->logical_first)
                      /*
                       * At the beginning of the string already.
                       */
                      return (cnt != count);

                    str->cursor = cursor = cursor->logical_prev;
                    size = cursor->end - cursor->start;
                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
                        size : 0;
                } else
                  cursor->cursor--;
            }
        } else {
            if (cursor->direction == UCPGBA_RTL) {
                if (cursor->cursor == 0) {
                    if (cursor == str->logical_last)
                      /*
                       * Already at the end of the string.
                       */
                      return (cnt != count);

                    str->cursor = cursor = cursor->logical_next;
                    size = cursor->end - cursor->start;
                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
                        0 : size - 1;
                } else
                  cursor->cursor--;
            } else {
                if (cursor->cursor + 1 > size) {
                    if (cursor == str->logical_last)
                      /*
                       * Already at the end of the string.
                       */
                      return (cnt != count);

                    str->cursor = cursor = cursor->logical_next;
                    /*
                     * The size must describe the run just entered, not the
                     * one just left, before it is used to place the cursor.
                     */
                    size = cursor->end - cursor->start;
                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
                        0 : size - 1;
                } else
                  cursor->cursor++;
            }
        }
        cnt--;
    }
    return 1;
}
/*
 * Move the cursor `count' steps to the right, using visual or logical
 * motion according to the string's cursor motion flag.
 *
 * Returns 0 if `str' is null or has no cursor run (a string created from an
 * empty range has none); otherwise returns the result of the selected
 * motion routine.
 */
int
#ifdef __STDC__
ucstring_cursor_right(ucstring_t *str, int count)
#else
ucstring_cursor_right(str, count)
ucstring_t *str;
int count;
#endif
{
    /*
     * The motion routines dereference the cursor run unconditionally, so an
     * empty string has to be rejected here.
     */
    if (str == 0 || str->cursor == 0)
      return 0;

    if (str->cursor_motion == UCPGBA_CURSOR_VISUAL)
      return _ucstring_visual_cursor_right(str, count);

    return _ucstring_logical_cursor_right(str, count);
}
/*
 * Move the cursor `count' steps to the left in visual order.
 *
 * Within a run the run-local cursor is decremented.  When the cursor is at
 * the left edge of its run, the cursor moves to the previous visual run and
 * is placed at the run length for an RTL run or at length - 1 for an LTR
 * run.
 *
 * Returns 0 if `str' is null.  If the first visual run is exhausted before
 * all steps are taken, returns 1 when at least one step was taken and 0
 * otherwise.  Returns 1 when the loop runs to completion.
 */
static int
#ifdef __STDC__
_ucstring_visual_cursor_left(ucstring_t *str, int count)
#else
_ucstring_visual_cursor_left(str, count)
ucstring_t *str;
int count;
#endif
{
    int remaining;
    ucrun_t *run;

    if (str == 0)
      return 0;

    run = str->cursor;
    for (remaining = count; remaining > 0; remaining--) {
        unsigned long len;
        int at_left_edge;

        at_left_edge =
            (run->direction == UCPGBA_LTR && run->cursor == 0) ||
            run->cursor - 1 < -1;

        if (!at_left_edge) {
            /*
             * Still room inside the current run.
             */
            run->cursor--;
            continue;
        }

        /*
         * No run to the left: the cursor is already at the far left end.
         * Report whether any movement occurred.
         */
        if (run->visual_prev == 0)
          return (remaining != count);

        /*
         * Step into the previous visual run.
         */
        run = run->visual_prev;
        str->cursor = run;
        len = run->end - run->start;
        run->cursor = (run->direction == UCPGBA_RTL) ? len : len - 1;
    }
    return 1;
}
/*
 * Move the cursor `count' steps to the left using logical cursor motion.
 *
 * In a string whose overall direction is RTL, moving left walks toward the
 * logical end of the string; otherwise it walks toward the logical
 * beginning.  Inside a run, the run-local cursor is incremented or
 * decremented depending on the combination of string and run direction.
 *
 * Returns 0 if `str' is null.  If the logical edge of the string is reached
 * before all steps are taken, returns 1 when at least one step was taken
 * and 0 otherwise.  Returns 1 when the loop runs to completion.
 */
static int
#ifdef __STDC__
_ucstring_logical_cursor_left(ucstring_t *str, int count)
#else
_ucstring_logical_cursor_left(str, count)
ucstring_t *str;
int count;
#endif
{
    int cnt = count;
    unsigned long size;
    ucrun_t *cursor;

    if (str == 0)
      return 0;

    cursor = str->cursor;
    while (cnt > 0) {
        size = cursor->end - cursor->start;
        if (str->direction == UCPGBA_RTL) {
            if (cursor->direction == UCPGBA_RTL) {
                if (cursor->cursor == -1) {
                    if (cursor == str->logical_last)
                      /*
                       * Already at the end of the string.
                       */
                      return (cnt != count);

                    str->cursor = cursor = cursor->logical_next;
                    size = cursor->end - cursor->start;
                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
                        0 : size - 1;
                } else
                  cursor->cursor--;
            } else {
                if (cursor->cursor + 1 > size) {
                    if (cursor == str->logical_last)
                      /*
                       * At the end of the string already.
                       */
                      return (cnt != count);

                    str->cursor = cursor = cursor->logical_next;
                    size = cursor->end - cursor->start;
                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
                        0 : size - 1;
                } else
                  cursor->cursor++;
            }
        } else {
            if (cursor->direction == UCPGBA_RTL) {
                if (cursor->cursor + 1 == size) {
                    if (cursor == str->logical_first)
                      /*
                       * Already at the beginning of the string.
                       */
                      return (cnt != count);

                    str->cursor = cursor = cursor->logical_prev;
                    size = cursor->end - cursor->start;
                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
                        size : 0;
                } else
                  cursor->cursor++;
            } else {
                if (cursor->cursor == 0) {
                    if (cursor == str->logical_first)
                      /*
                       * Already at the beginning of the string.
                       */
                      return (cnt != count);

                    str->cursor = cursor = cursor->logical_prev;
                    /*
                     * The size must describe the run just entered, not the
                     * one just left, before it is used to place the cursor.
                     */
                    size = cursor->end - cursor->start;
                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
                        size : 0;
                } else
                  cursor->cursor--;
            }
        }
        cnt--;
    }
    return 1;
}
/*
 * Move the cursor `count' steps to the left, using visual or logical
 * motion according to the string's cursor motion flag.
 *
 * Returns 0 if `str' is null or has no cursor run (a string created from an
 * empty range has none); otherwise returns the result of the selected
 * motion routine.
 */
int
#ifdef __STDC__
ucstring_cursor_left(ucstring_t *str, int count)
#else
ucstring_cursor_left(str, count)
ucstring_t *str;
int count;
#endif
{
    /*
     * The motion routines dereference the cursor run unconditionally, so an
     * empty string has to be rejected here.
     */
    if (str == 0 || str->cursor == 0)
      return 0;

    if (str->cursor_motion == UCPGBA_CURSOR_VISUAL)
      return _ucstring_visual_cursor_left(str, count);

    return _ucstring_logical_cursor_left(str, count);
}
/*
 * Report the direction of the run holding the cursor and the cursor's
 * position in the original source text.
 *
 * `*direction' receives the direction of the cursor run.  `*position'
 * receives:
 *   - the run's recorded source position for the cursor slot, when the
 *     cursor is inside the run;
 *   - for a cursor equal to the run length, the run start (RTL run) or the
 *     recorded position of the last slot (LTR run);
 *   - for a cursor of -1, the run end (RTL run) or the run start (LTR run).
 *
 * Nothing is written if any argument is null or if the string has no cursor
 * run (a string created from an empty range has none).
 */
void
#ifdef __STDC__
ucstring_cursor_info(ucstring_t *str, int *direction, unsigned long *position)
#else
ucstring_cursor_info(str, direction, position)
ucstring_t *str;
int *direction;
unsigned long *position;
#endif
{
    long c;
    unsigned long size;
    ucrun_t *cursor;

    if (str == 0 || direction == 0 || position == 0)
      return;

    cursor = str->cursor;
    if (cursor == 0)
      return;

    *direction = cursor->direction;

    c = cursor->cursor;
    size = cursor->end - cursor->start;

    if (c >= 0 && (unsigned long) c == size)
      *position = (cursor->direction == UCPGBA_RTL) ?
          cursor->start : cursor->positions[c - 1];
    else if (c == -1)
      *position = (cursor->direction == UCPGBA_RTL) ?
          cursor->end : cursor->start;
    else
      *position = cursor->positions[c];
}

View file

@ -0,0 +1,162 @@
/*
* Copyright 1999 Computing Research Labs, New Mexico State University
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _h_ucpgba
#define _h_ucpgba
/*
* $Id: ucpgba.h,v 1.4 1999/11/19 15:24:30 mleisher Exp $
*/
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Prototype helper: with a standard C compiler __(x) expands to the full
 * parameter list, otherwise to an empty list, so the declarations below
 * work with both kinds of compiler.  It is undefined again at the end of
 * this header.
 */
#undef __
#ifdef __STDC__
#define __(x) x
#else
#define __(x) ()
#endif
/***************************************************************************
*
* Macros and types.
*
***************************************************************************/
/*
* These are the direction values that can appear in render runs and render
* strings.
*/
#define UCPGBA_LTR 0 /* Left-to-right. */
#define UCPGBA_RTL 1 /* Right-to-left. */
/*
 * These are the flags for cursor motion, as passed to ucstring_create()
 * and ucstring_set_cursor_motion().
 */
#define UCPGBA_CURSOR_VISUAL 0 /* Move through runs in visual order. */
#define UCPGBA_CURSOR_LOGICAL 1 /* Move through runs in logical order. */
/*
* This structure is used to contain runs of text in a particular direction.
*/
typedef struct _ucrun_t {
struct _ucrun_t *visual_prev; /* Pointer to the previous visual run. */
struct _ucrun_t *visual_next; /* Pointer to the next visual run. */
struct _ucrun_t *logical_prev; /* Pointer to the previous logical run. */
struct _ucrun_t *logical_next; /* Pointer to the next logical run. */
int direction; /* Direction of the run. */
long cursor; /* Position of "cursor" in the string. */
unsigned long *chars; /* List of characters for the run. */
unsigned long *positions; /* List of original positions in source. */
unsigned long *source; /* The source string. */
unsigned long start; /* Beginning offset in the source string. */
unsigned long end; /* Ending offset in the source string. */
} ucrun_t;
/*
* This represents a string of runs rendered up to a point that is not
* platform specific.
*/
typedef struct _ucstring_t {
int direction; /* Overall direction: UCPGBA_LTR or UCPGBA_RTL. */
int cursor_motion; /* UCPGBA_CURSOR_VISUAL or UCPGBA_CURSOR_LOGICAL. */
ucrun_t *cursor; /* The run containing the "cursor"; null for an empty string. */
ucrun_t *logical_first; /* First run in the logical order. */
ucrun_t *logical_last; /* Last run in the logical order. */
ucrun_t *visual_first; /* First run in the visual order. */
ucrun_t *visual_last; /* Last run in the visual order. */
unsigned long *source; /* The source string (pointer as given to ucstring_create()). */
unsigned long start; /* The beginning offset in the source. */
unsigned long end; /* The ending offset in the source. */
} ucstring_t;
/***************************************************************************
*
* API
*
***************************************************************************/
/*
 * This creates and reorders the specified substring, source[start, end),
 * using the "Pretty Good Bidi Algorithm."
 *
 * `default_direction' (UCPGBA_LTR or UCPGBA_RTL) is used only when the
 * substring contains no characters with strong directionality.
 * `cursor_motion' (UCPGBA_CURSOR_VISUAL or UCPGBA_CURSOR_LOGICAL) selects
 * the initial cursor motion.
 *
 * The `source' pointer is recorded in the returned string.  If start equals
 * end, a string with no runs is returned.  Release the result with
 * ucstring_free().
 */
extern ucstring_t *ucstring_create __((unsigned long *source,
unsigned long start,
unsigned long end,
int default_direction,
int cursor_motion));
/*
 * This releases the string and its runs.  A null string is ignored.
 */
extern void ucstring_free __((ucstring_t *string));
/*
 * This changes the cursor motion flag for the string.
 *
 * The previous cursor motion flag is returned, or -1 if the string is null.
 */
extern int ucstring_set_cursor_motion __((ucstring_t *string,
int cursor_motion));
/*
 * This function will move the cursor up to `count' positions to the right
 * depending on the type of cursor motion that was specified for the string.
 * Motion stops early if the edge of the string is reached.
 *
 * A 0 is returned if the string is null or the cursor could not move at
 * all; a 1 is returned if at least one step was taken.
 */
extern int ucstring_cursor_right __((ucstring_t *string, int count));
/*
 * This function will move the cursor up to `count' positions to the left
 * depending on the type of cursor motion that was specified for the string.
 * Motion stops early if the edge of the string is reached.
 *
 * A 0 is returned if the string is null or the cursor could not move at
 * all; a 1 is returned if at least one step was taken.
 */
extern int ucstring_cursor_left __((ucstring_t *string, int count));
/*
 * This routine retrieves the direction of the run containing the cursor
 * and the actual position in the original text string.
 *
 * Nothing is stored if any of the three arguments is null.
 */
extern void ucstring_cursor_info __((ucstring_t *string, int *direction,
unsigned long *position));
#undef __
#ifdef __cplusplus
}
#endif
#endif /* _h_ucpgba */

View file

@ -0,0 +1,97 @@
.\"
.\" $Id: ucpgba.man,v 1.1 1999/11/19 16:08:34 mleisher Exp $
.\"
.TH ucpgba 3 "19 November 1999"
.SH NAME
ucpgba \- functions for doing bidirectional reordering of Unicode text and
logical and visual cursor motion
.SH SYNOPSIS
.nf
#include <ucdata.h>
#include <ucpgba.h>
ucstring_t *ucstring_create(unsigned long *source, unsigned long start,
unsigned long end, int default_direction,
int cursor_motion)
.sp
void ucstring_free(ucstring_t *string)
.sp
int ucstring_set_cursor_motion(ucstring_t *string, int cursor_motion)
.sp
int ucstring_cursor_right(ucstring_t *string, int count)
.sp
int ucstring_cursor_left(ucstring_t *string, int count)
.sp
void ucstring_cursor_info(ucstring_t *string, int *direction,
unsigned long *position)
.SH DESCRIPTION
.TP 4
.BR Macros
UCPGBA_LTR
.br
UCPGBA_RTL
.br
UCPGBA_CURSOR_VISUAL
.br
UCPGBA_CURSOR_LOGICAL
.TP 4
.BR ucstring_create()
This function will create a reordered string by using the implicit
directionality of the characters in the specified substring.
.sp
The `default_direction' parameter should be one of UCPGBA_LTR or UCPGBA_RTL
and is used only in cases where a string contains no characters with strong
directionality.
.sp
The `cursor_motion' parameter should be one of UCPGBA_CURSOR_VISUAL or
UCPGBA_CURSOR_LOGICAL, and is used to specify the initial cursor motion
behavior. This behavior can be switched at any time using
ucstring_set_cursor_motion().
.TP 4
.BR ucstring_free()
This function will deallocate the memory used by the string, including the
string itself.
.TP 4
.BR ucstring_cursor_info()
This function will return the text position of the internal cursor and the
directionality of the text at that position. The position returned is the
original text position of the character.
.TP 4
.BR ucstring_set_cursor_motion()
This function will change the cursor motion type and return the previous
cursor motion type.
.TP 4
.BR ucstring_cursor_right()
This function will move the internal cursor to the right according to the
type of cursor motion set for the string.
.sp
If no cursor motion is performed, it returns 0. Otherwise it will return a 1.
.TP 4
.BR ucstring_cursor_left()
This function will move the internal cursor to the left according to the
type of cursor motion set for the string.
.sp
If no cursor motion is performed, it returns 0. Otherwise it will return a 1.
.SH "SEE ALSO"
ucdata(3)
.SH ACKNOWLEDGMENTS
These are people who have helped with patches or alerted me about problems.
.SH AUTHOR
Mark Leisher
.br
Computing Research Lab
.br
New Mexico State University
.br
Email: mleisher@crl.nmsu.edu

View file

@ -0,0 +1,212 @@
#
# $Id: README,v 1.3 1999/09/21 15:47:43 mleisher Exp $
#
# Copyright 1997, 1998, 1999 Computing Research Labs,
# New Mexico State University
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
Unicode and Regular Expressions
Version 0.5
This is a simple regular expression package for matching against Unicode text
in UCS2 form. The implementation of this URE package is a variation on the
RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu). Mark
Hopkins' algorithm had the virtue of being very simple, so it was used as a
model.
---------------------------------------------------------------------------
Assumptions:
o Regular expression and text already normalized.
o Conversion to lower case assumes a 1-1 mapping.
Definitions:
Separator - any one of U+2028, U+2029, '\n', '\r'.
Operators:
. - match any character.
* - match zero or more of the last subexpression.
+ - match one or more of the last subexpression.
? - match zero or one of the last subexpression.
() - subexpression grouping.
Notes:
o The "." operator normally does not match separators, but a flag is
available for the ure_exec() function that will allow this operator to
match a separator.
Literals and Constants:
c - literal UCS2 character.
\x.... - hexadecimal number of up to 4 digits.
\X.... - hexadecimal number of up to 4 digits.
\u.... - hexadecimal number of up to 4 digits.
\U.... - hexadecimal number of up to 4 digits.
Character classes:
[...] - Character class.
[^...] - Negated character class.
\pN1,N2,...,Nn - Character properties class.
\PN1,N2,...,Nn - Negated character properties class.
POSIX character classes recognized:
:alnum:
:alpha:
:cntrl:
:digit:
:graph:
:lower:
:print:
:punct:
:space:
:upper:
:xdigit:
Notes:
o Character property classes are \p or \P followed by a comma separated
list of integers between 1 and 32. These integers are references to
the following character properties:
N Character Property
--------------------------
1 _URE_NONSPACING
2 _URE_COMBINING
3 _URE_NUMDIGIT
4 _URE_NUMOTHER
5 _URE_SPACESEP
6 _URE_LINESEP
7 _URE_PARASEP
8 _URE_CNTRL
9 _URE_PUA
10 _URE_UPPER
11 _URE_LOWER
12 _URE_TITLE
13 _URE_MODIFIER
14 _URE_OTHERLETTER
15 _URE_DASHPUNCT
16 _URE_OPENPUNCT
17 _URE_CLOSEPUNCT
18 _URE_OTHERPUNCT
19 _URE_MATHSYM
20 _URE_CURRENCYSYM
21 _URE_OTHERSYM
22 _URE_LTR
23 _URE_RTL
24 _URE_EURONUM
25 _URE_EURONUMSEP
26 _URE_EURONUMTERM
27 _URE_ARABNUM
28 _URE_COMMONSEP
29 _URE_BLOCKSEP
30 _URE_SEGMENTSEP
31 _URE_WHITESPACE
32 _URE_OTHERNEUT
o Character classes can contain literals, constants, and character
property classes. Example:
[abc\U10A\p1,3,4]
---------------------------------------------------------------------------
Before using URE
----------------
Before URE is used, two functions need to be created. One to check if a
character matches a set of URE character properties, and one to convert a
character to lower case.
Stubs for these functions are located in the urestubs.c file.
Using URE
---------
Sample pseudo-code fragment.
ure_buffer_t rebuf;
ure_dfa_t dfa;
ucs2_t *re, *text;
unsigned long relen, textlen;
unsigned long match_start, match_end;
/*
* Allocate the dynamic storage needed to compile regular expressions.
*/
rebuf = ure_buffer_create();
for each regular expression in a list {
re = next regular expression;
relen = length(re);
/*
* Compile the regular expression with the case insensitive flag
* turned on.
*/
dfa = ure_compile(re, relen, 1, rebuf);
/*
* Look for the first match in some text. The matching will be done
* in a case insensitive manner because the expression was compiled
* with the case insensitive flag on.
*/
if (ure_exec(dfa, 0, text, textlen, &match_start, &match_end))
printf("MATCH: %ld %ld\n", match_start, match_end);
/*
* Look for the first match in some text, ignoring non-spacing
* characters.
*/
if (ure_exec(dfa, URE_IGNORE_NONSPACING, text, textlen,
&match_start, &match_end))
printf("MATCH: %ld %ld\n", match_start, match_end);
/*
* Free the DFA.
*/
ure_dfa_free(dfa);
}
/*
* Free the dynamic storage used for compiling the expressions.
*/
ure_buffer_free(rebuf);
---------------------------------------------------------------------------
Mark Leisher <mleisher@crl.nmsu.edu>
29 March 1997
===========================================================================
CHANGES
-------
Version: 0.5
Date : 21 September 1999
==========================
1. Added copyright stuff and put in CVS.

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,150 @@
/*
* Copyright 1997, 1998, 1999 Computing Research Labs,
* New Mexico State University
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _h_ure
#define _h_ure
/*
* $Id: ure.h,v 1.2 1999/09/21 15:47:44 mleisher Exp $
*/
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Prototype helper: with a standard C compiler __(x) expands to the full
 * parameter list, otherwise to an empty list, so the declarations below
 * work with both kinds of compiler.  It is undefined again at the end of
 * this header.
 */
#undef __
#ifdef __STDC__
#define __(x) x
#else
#define __(x) ()
#endif
/*
 * Set of character class flags.
 *
 * Each flag is a single bit.  The flags are listed in the same order as the
 * property numbers 1 through 32 accepted by the \p and \P character
 * property classes described in the README, so property number N
 * corresponds to bit N - 1.
 */
#define _URE_NONSPACING 0x00000001
#define _URE_COMBINING 0x00000002
#define _URE_NUMDIGIT 0x00000004
#define _URE_NUMOTHER 0x00000008
#define _URE_SPACESEP 0x00000010
#define _URE_LINESEP 0x00000020
#define _URE_PARASEP 0x00000040
#define _URE_CNTRL 0x00000080
#define _URE_PUA 0x00000100
#define _URE_UPPER 0x00000200
#define _URE_LOWER 0x00000400
#define _URE_TITLE 0x00000800
#define _URE_MODIFIER 0x00001000
#define _URE_OTHERLETTER 0x00002000
#define _URE_DASHPUNCT 0x00004000
#define _URE_OPENPUNCT 0x00008000
#define _URE_CLOSEPUNCT 0x00010000
#define _URE_OTHERPUNCT 0x00020000
#define _URE_MATHSYM 0x00040000
#define _URE_CURRENCYSYM 0x00080000
#define _URE_OTHERSYM 0x00100000
#define _URE_LTR 0x00200000
#define _URE_RTL 0x00400000
#define _URE_EURONUM 0x00800000
#define _URE_EURONUMSEP 0x01000000
#define _URE_EURONUMTERM 0x02000000
#define _URE_ARABNUM 0x04000000
#define _URE_COMMONSEP 0x08000000
#define _URE_BLOCKSEP 0x10000000
#define _URE_SEGMENTSEP 0x20000000
#define _URE_WHITESPACE 0x40000000
#define _URE_OTHERNEUT 0x80000000
/*
 * Error codes.
 *
 * NOTE(review): where these are reported (return value or a field of the
 * compile buffer) is not visible in this header -- confirm against ure.c.
 */
#define _URE_OK 0
#define _URE_UNEXPECTED_EOS -1
#define _URE_CCLASS_OPEN -2
#define _URE_UNBALANCED_GROUP -3
#define _URE_INVALID_PROPERTY -4
/*
 * Options that can be combined for searching.  These are passed as the
 * `flags' argument of ure_exec():
 *
 *   URE_IGNORE_NONSPACING      - ignore non-spacing characters when matching.
 *   URE_DOT_MATCHES_SEPARATORS - allow the "." operator to match separators,
 *                                which it normally does not.
 */
#define URE_IGNORE_NONSPACING 0x01
#define URE_DOT_MATCHES_SEPARATORS 0x02
/*
 * Character types.  These are plain C integer types; their actual width is
 * whatever the platform gives `unsigned long' and `unsigned short'.
 */
typedef unsigned long ucs4_t;
typedef unsigned short ucs2_t;
/*
 * Opaque type for memory used when compiling expressions.
 */
typedef struct _ure_buffer_t *ure_buffer_t;
/*
 * Opaque type for the minimal DFA used when matching.
 */
typedef struct _ure_dfa_t *ure_dfa_t;
/*************************************************************************
*
* API.
*
*************************************************************************/
/*
 * Allocate and release the dynamic storage needed to compile regular
 * expressions.  One buffer can be reused for compiling many expressions.
 */
extern ure_buffer_t ure_buffer_create __((void));
extern void ure_buffer_free __((ure_buffer_t buf));
/*
 * Compile the regular expression `re' of length `relen' using `buf' as
 * working storage.  A non-zero `casefold' requests case insensitive
 * matching.  Release the result with ure_dfa_free().
 */
extern ure_dfa_t ure_compile __((ucs2_t *re, unsigned long relen,
int casefold, ure_buffer_t buf));
extern void ure_dfa_free __((ure_dfa_t dfa));
/*
 * Write the DFA to the stream `out'.
 * NOTE(review): presumably a debugging aid; the output format is not
 * visible from this header.
 */
extern void ure_write_dfa __((ure_dfa_t dfa, FILE *out));
/*
 * Look for the first match of `dfa' in `text' of length `textlen'.  `flags'
 * is 0 or a combination of the URE_* search options.  The return value is
 * non-zero when a match is found, in which case the match is reported
 * through `match_start' and `match_end'.
 */
extern int ure_exec __((ure_dfa_t dfa, int flags,
ucs2_t *text, unsigned long textlen,
unsigned long *match_start, unsigned long *match_end));
/*************************************************************************
*
* Prototypes for stub functions used for URE. These need to be rewritten to
* use the Unicode support available on the system.
*
* _ure_tolower() returns the lower case equivalent of a character, or the
* character itself if there is none.  _ure_matches_properties() tests
* whether a character has one or more of the given _URE_* property flags.
*
*************************************************************************/
extern ucs4_t _ure_tolower __((ucs4_t c));
extern int _ure_matches_properties __((unsigned long props, ucs4_t c));
#undef __
#ifdef __cplusplus
}
#endif
#endif /* _h_ure */

View file

@ -0,0 +1,64 @@
/*
* Copyright 1997, 1998, 1999 Computing Research Labs,
* New Mexico State University
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef lint
static char rcsid[] = "$Id: urestubs.c,v 1.2 1999/09/21 15:47:44 mleisher Exp $";
#endif
#include "ure.h"
/*
* This file contains stub routines needed by the URE package to test
* character properties and other Unicode implementation specific details.
*/
/*
 * Stub: a real implementation should return the lower case equivalent of
 * the character or, if there is no lower case equivalent, the character
 * itself.  This placeholder maps every character to itself.
 */
ucs4_t
#ifdef __STDC__
_ure_tolower(ucs4_t c)
#else
_ure_tolower(c)
ucs4_t c;
#endif
{
    ucs4_t lowered;

    /*
     * Identity mapping until platform Unicode support is hooked in.
     */
    lowered = c;
    return lowered;
}
/*
 * Stub: a real implementation takes a set of URE character property flags
 * (see ure.h) and a character, and reports whether the character has one or
 * more of those properties.  This placeholder ignores both arguments and
 * always reports a match.
 */
int
#ifdef __STDC__
_ure_matches_properties(unsigned long props, ucs4_t c)
#else
_ure_matches_properties(props, c)
unsigned long props;
ucs4_t c;
#endif
{
    /*
     * Neither argument is consulted by the placeholder.
     */
    (void) props;
    (void) c;

    return 1;
}

View file

@ -0,0 +1,121 @@
#
# $Id: README,v 1.1 1999/09/21 15:45:17 mleisher Exp $
#
# Copyright 1997, 1998, 1999 Computing Research Labs,
# New Mexico State University
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
Unicode and Boyer-Moore Searching
Version 0.2
UTBM (Unicode Tuned Boyer-Moore) is a simple package that provides tuned
Boyer-Moore searches on Unicode UCS2 text (handles high and low surrogates).
---------------------------------------------------------------------------
Assumptions:
o Search pattern and text already normalized in some fashion.
o Upper, lower, and title case conversions are one-to-one.
o For conversions between upper, lower, and title case, UCS2 characters
always convert to other UCS2 characters, and UTF-16 characters always
convert to other UTF-16 characters.
Flags:
UTBM provides three processing flags:
o UTBM_CASEFOLD - search in a case-insensitive manner.
o UTBM_IGNORE_NONSPACING - ignore non-spacing characters in the pattern and
the text.
o UTBM_SPACE_COMPRESS - view as a *single space*, sequential groups of
U+2028, U+2029, '\n', '\r', '\t', and any
character identified as a space by the Unicode
support on the platform.
This flag also causes all characters identified
as control by the Unicode support on the
platform to be ignored (except for '\n', '\r',
and '\t').
---------------------------------------------------------------------------
Before using UTBM
-----------------
Before UTBM is used, some functions need to be created. The "utbmstub.c" file
contains stubs that need to be rewritten so they work with the Unicode support
on the platform on which this package is being used.
Using UTBM
----------
Sample pseudo-code fragment.
utbm_pattern_t pat;
ucs2_t *pattern, *text;
unsigned long patternlen, textlen;
unsigned long flags, match_start, match_end;
/*
* Allocate the dynamic storage needed for a search pattern.
*/
pat = utbm_create_pattern();
/*
* Set the search flags desired.
*/
flags = UTBM_CASEFOLD|UTBM_IGNORE_NONSPACING;
/*
* Compile the search pattern.
*/
utbm_compile(pattern, patternlen, flags, pat);
/*
* Find the first occurrence of the search pattern in the text.
*/
if (utbm_exec(pat, text, textlen, &match_start, &match_end))
printf("MATCH: %ld %ld\n", match_start, match_end);
/*
* Free the dynamic storage used for the search pattern.
*/
utbm_free_pattern(pat);
---------------------------------------------------------------------------
Mark Leisher <mleisher@crl.nmsu.edu>
2 May 1997
===========================================================================
CHANGES
-------
Version: 0.2
Date : 21 September 1999
==========================
1. Added copyright stuff and put in CVS.

View file

@ -0,0 +1,497 @@
/*
* Copyright 1997, 1998, 1999 Computing Research Labs,
* New Mexico State University
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef lint
/*
 * RCS "$Id$" keyword string identifying this revision of utbm.c.  It is
 * compiled out when `lint' is defined.
 */
static char rcsid[] = "$Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp $";
#endif
/*
* Assumptions:
* 1. Case conversions of UTF-16 characters must also be UTF-16 characters.
* 2. Case conversions are all one-to-one.
* 3. Text and pattern have already been normalized in some fashion.
*/
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include "utbm.h"
/*
 * Single pattern character, stored as its lower, upper and title case forms.
 * utbm_compile() fills the three fields from _utbm_tolower(),
 * _utbm_toupper() and _utbm_totitle() when UTBM_CASEFOLD is set; otherwise
 * all three fields hold the same code point.
 */
typedef struct {
ucs4_t lc;
ucs4_t uc;
ucs4_t tc;
} _utbm_char_t;
/*
 * One entry of the skip table: a pattern character and the distance the
 * search position may advance when that character is seen in the text.
 */
typedef struct {
/* Points at an element of the owning pattern's `pat' array. */
_utbm_char_t *ch;
/* Advance, in UCS2 units, assigned by utbm_compile(). */
unsigned long skip;
} _utbm_skip_t;
/*
 * Compiled search pattern; this is the structure behind the opaque
 * utbm_pattern_t handle.
 */
typedef struct _utbm_pattern_t {
/* UTBM_* flags passed to the most recent utbm_compile() call. */
unsigned long flags;
/* Pattern characters, `pat_used' of `pat_size' allocated elements in use. */
_utbm_char_t *pat;
unsigned long pat_used;
unsigned long pat_size;
/*
 * Pattern length in UCS2 units: utbm_compile() counts 2 for a code point
 * at or above 0x10000 and 1 for everything else, including a compressed
 * run of spaces.
 */
unsigned long patlen;
/* Skip table, `skip_used' of `skip_size' allocated elements in use. */
_utbm_skip_t *skip;
unsigned long skip_used;
unsigned long skip_size;
/*
 * Distance utbm_exec() advances after a candidate match fails.  It starts
 * at 1 and is raised by utbm_compile() when the final pattern character
 * also occurs earlier in the pattern.
 */
unsigned long md4;
} _utbm_pattern_t;
/*************************************************************************
*
* Support functions.
*
*************************************************************************/
/*
 * Look up how far the search position may advance for the text character
 * at `start'.
 *
 * p      compiled pattern whose skip table is consulted
 * start  position of the text character to look up
 * end    one past the last UCS2 unit of the text
 *
 * A high surrogate followed by a low surrogate is combined into a single
 * code point before the lookup.  A character matches a table entry when it
 * equals the entry's upper, lower or title case form.
 *
 * Returns 0 when `start' is not before `end'.  Otherwise returns the skip of
 * the matching entry, or the pattern length when no entry matches; either
 * value is limited to the number of UCS2 units left before `end' so the
 * caller never advances past the end of the text.
 */
static unsigned long
#ifdef __STDC__
_utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end)
#else
_utbm_skip(p, start, end)
utbm_pattern_t p;
ucs2_t *start, *end;
#endif
{
    unsigned long i, remaining;
    ucs4_t c1, c2;
    _utbm_skip_t *sp;

    if (start >= end)
      return 0;

    remaining = (unsigned long) (end - start);

    c1 = *start;
    c2 = (start + 1 < end) ? *(start + 1) : ~0;
    if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
      c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));

    for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) {
        /*
         * Compare against each case form explicitly.  AND-ing the XOR
         * differences together is not an equality test: the result can be
         * zero for a character that equals none of the three forms.
         */
        if (c1 == sp->ch->uc || c1 == sp->ch->lc || c1 == sp->ch->tc)
          return (remaining < sp->skip) ? remaining : sp->skip;
    }

    /*
     * The character does not occur in the pattern.
     */
    return (remaining < p->patlen) ? remaining : p->patlen;
}
/*
 * Verify a candidate match by comparing the compiled pattern backward from
 * the text character at `start'.
 *
 * pat          compiled pattern
 * text         beginning of the text being searched
 * start        position of the text character aligned with the final
 *              pattern character
 * end          one past the last UCS2 unit of the text
 * match_start  receives the index of the first UCS2 unit of the match
 * match_end    receives the index one past the last UCS2 unit of the match
 *
 * Returns 1 on a match and 0 otherwise.  `*match_end' is written even when
 * no match is found.
 */
static int
#ifdef __STDC__
_utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end,
            unsigned long *match_start, unsigned long *match_end)
#else
_utbm_match(pat, text, start, end, match_start, match_end)
utbm_pattern_t pat;
ucs2_t *text, *start, *end;
unsigned long *match_start, *match_end;
#endif
{
    int check_space;
    ucs4_t c1, c2;
    unsigned long count;
    _utbm_char_t *cp;
    ucs2_t *space_start;

    /*
     * Set the potential match endpoint first.
     */
    *match_end = (start - text) + 1;

    c1 = *start;
    c2 = (start + 1 < end) ? *(start + 1) : ~0;
    if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
        c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
        /*
         * Adjust the match end point to occur after the UTF-16 character.
         */
        *match_end = *match_end + 1;
    }

    /*
     * A one character pattern is accepted without comparison: the caller
     * only gets here after the skip lookup reported a skip of zero.
     */
    if (pat->pat_used == 1) {
        *match_start = start - text;
        return 1;
    }

    /*
     * Compare backward.
     *
     * NOTE(review): the loop ends as soon as `start' reaches `text', so a
     * character fetched at index 0 by the step at the bottom of the loop is
     * not compared.  utbm_exec() in this file never begins a comparison
     * before text + patlen; confirm before calling this from elsewhere.
     *
     * NOTE(review): the look-behind tests use `start - 1 > text', so a high
     * surrogate stored at text[0] is never combined with the low surrogate
     * that follows it -- confirm whether `>=' was intended.
     */
    cp = pat->pat + (pat->pat_used - 1);
    for (count = pat->patlen; start > text && count > 0;) {
        /*
         * Ignore non-spacing characters if indicated.
         */
        if (pat->flags & UTBM_IGNORE_NONSPACING) {
            while (start > text && _utbm_nonspacing(c1)) {
                c2 = *--start;
                c1 = (start - 1 > text) ? *(start - 1) : ~0;
                if (0xdc00 <= c2 && c2 <= 0xdfff &&
                    0xd800 <= c1 && c1 <= 0xdbff) {
                    c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
                    start--;
                } else
                  c1 = c2;
            }
        }

        /*
         * Handle space compression if indicated.
         */
        if (pat->flags & UTBM_SPACE_COMPRESS) {
            check_space = 0;
            space_start = start;
            while (start > text &&
                   (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) {
                check_space = _utbm_isspace(c1, 1);
                /*
                 * Remember the character just consumed; when `check_space'
                 * ends up set this is the first space of the run.
                 */
                space_start = start;
                c2 = *--start;
                c1 = (start - 1 > text) ? *(start - 1) : ~0;
                if (0xdc00 <= c2 && c2 <= 0xdfff &&
                    0xd800 <= c1 && c1 <= 0xdbff) {
                    c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
                    start--;
                } else
                  c1 = c2;
            }

            /*
             * Handle things if space compression was indicated and one or
             * more member characters were found.
             */
            if (check_space) {
                if (cp->uc != ' ')
                  return 0;
                count--;
                /*
                 * The space was the first pattern character, so the whole
                 * pattern has matched.  Stop here: continuing would wrap
                 * `count' below zero and move `cp' before the pattern.
                 */
                if (count == 0) {
                    *match_start = space_start - text;
                    return 1;
                }
                cp--;
            }
        }

        /*
         * Handle the normal comparison cases.  The character must equal one
         * of the three case forms of the pattern character.
         */
        if (count > 0 && c1 != cp->uc && c1 != cp->lc && c1 != cp->tc)
          return 0;

        count -= (c1 >= 0x10000) ? 2 : 1;
        if (count > 0) {
            cp--;

            /*
             * Get the next preceding character.
             */
            if (start > text) {
                c2 = *--start;
                c1 = (start - 1 > text) ? *(start - 1) : ~0;
                if (0xdc00 <= c2 && c2 <= 0xdfff &&
                    0xd800 <= c1 && c1 <= 0xdbff) {
                    c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
                    start--;
                } else
                  c1 = c2;
            }
        }
    }

    /*
     * Set the match start position.
     */
    *match_start = start - text;
    return 1;
}
/*************************************************************************
*
* API.
*
*************************************************************************/
/*
 * Allocate an empty search pattern with every field set to zero.
 *
 * Returns the new pattern, or 0 when the memory cannot be allocated.  The
 * pattern is released with utbm_free_pattern().
 */
utbm_pattern_t
#ifdef __STDC__
utbm_create_pattern(void)
#else
utbm_create_pattern()
#endif
{
    utbm_pattern_t p;

    p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t));

    /*
     * Only clear the structure when the allocation succeeded; the other API
     * functions already treat a null pattern as "nothing to do".
     */
    if (p != 0)
      (void) memset((char *) p, 0, sizeof(_utbm_pattern_t));

    return p;
}
/*
 * Release a pattern together with its character and skip arrays.
 *
 * pattern  pattern to release; a null pattern is ignored
 *
 * Each array is freed only when its recorded size is non-zero.
 */
void
#ifdef __STDC__
utbm_free_pattern(utbm_pattern_t pattern)
#else
utbm_free_pattern(pattern)
utbm_pattern_t pattern;
#endif
{
    if (pattern != 0) {
        if (pattern->pat_size != 0)
          free((char *) pattern->pat);

        if (pattern->skip_size != 0)
          free((char *) pattern->skip);

        free((char *) pattern);
    }
}
/*
 * Compile a search pattern.
 *
 * pat     pattern text as UCS2 units (surrogate pairs are combined)
 * patlen  number of UCS2 units in `pat'
 * flags   combination of the UTBM_* flags
 * p       pattern object that receives the compiled form
 *
 * Nothing is done when `p' or `pat' is null, when the first unit of `pat'
 * is zero or when `patlen' is zero.  If memory for the pattern cannot be
 * obtained the pattern is left empty, and utbm_exec() reports no match for
 * an empty pattern.
 */
void
#ifdef __STDC__
utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags,
             utbm_pattern_t p)
#else
utbm_compile(pat, patlen, flags, p)
ucs2_t *pat;
unsigned long patlen, flags;
utbm_pattern_t p;
#endif
{
    int have_space;
    unsigned long i, j, k, last;
    _utbm_char_t *cp;
    _utbm_skip_t *sp;
    ucs4_t c1, c2, sentinel;

    if (p == 0 || pat == 0 || *pat == 0 || patlen == 0)
      return;

    /*
     * Reset the pattern buffer.
     */
    p->patlen = p->pat_used = p->skip_used = 0;

    /*
     * Set the flags.
     */
    p->flags = flags;

    /*
     * Initialize the extra skip flag.
     */
    p->md4 = 1;

    /*
     * Allocate more storage if necessary.  Each array is grown through a
     * temporary so that a failed allocation neither loses the old block nor
     * records a size that was never obtained.
     */
    if (patlen > p->pat_size) {
        if (p->pat_size == 0)
          cp = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen);
        else
          cp = (_utbm_char_t *)
              realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen);
        if (cp == 0)
          return;
        p->pat = cp;
        p->pat_size = patlen;
    }
    if (patlen > p->skip_size) {
        if (p->skip_size == 0)
          sp = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen);
        else
          sp = (_utbm_skip_t *)
              realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen);
        if (sp == 0)
          return;
        p->skip = sp;
        p->skip_size = patlen;
    }

    /*
     * Preprocess the pattern to remove controls (if specified) and determine
     * case.
     */
    sentinel = 0;
    for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) {
        c1 = pat[i];
        c2 = (i + 1 < patlen) ? pat[i + 1] : ~0;
        if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
            c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
            /*
             * Consume the low surrogate right away so that it is skipped
             * even when the character is dropped by one of the `continue'
             * statements below.
             */
            i++;
        }

        /*
         * Make sure the `have_space' flag is turned off if the character
         * is not an appropriate one.
         */
        if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS))
          have_space = 0;

        /*
         * If non-spacing characters should be ignored, do it here.
         */
        if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1))
          continue;

        /*
         * Check if spaces and controls need to be compressed.
         */
        if (flags & UTBM_SPACE_COMPRESS) {
            if (_utbm_isspace(c1, 1)) {
                if (!have_space) {
                    /*
                     * Add a space and set the flag.
                     */
                    cp->uc = cp->lc = cp->tc = ' ';
                    cp++;

                    /*
                     * Increase the real pattern length.
                     */
                    p->patlen++;
                    sentinel = ' ';
                    have_space = 1;
                }
                continue;
            }

            /*
             * Ignore all control characters.
             */
            if (_utbm_iscntrl(c1))
              continue;
        }

        /*
         * Add the character.
         */
        if (flags & UTBM_CASEFOLD) {
            cp->uc = _utbm_toupper(c1);
            cp->lc = _utbm_tolower(c1);
            cp->tc = _utbm_totitle(c1);
        } else
          cp->uc = cp->lc = cp->tc = c1;

        /*
         * Set the sentinel character (the final pattern character once the
         * loop is done).
         */
        sentinel = cp->uc;

        /*
         * Move to the next character.
         */
        cp++;

        /*
         * Increase the real pattern length appropriately.
         */
        p->patlen += (c1 >= 0x10000) ? 2 : 1;
    }

    /*
     * Set the number of characters actually used.
     */
    p->pat_used = cp - p->pat;
    if (p->pat_used == 0)
      return;

    /*
     * Index, in UCS2 units, of the first unit of the final pattern
     * character.  The search position always rests on the first unit of a
     * character, so every skip distance is measured to this index.
     */
    last = p->patlen - ((p->pat[p->pat_used - 1].uc >= 0x10000) ? 2 : 1);

    /*
     * Go through and construct the skip array.  `k' is the UCS2 index of
     * the current pattern character.
     */
    cp = p->pat;
    for (i = k = 0; i < p->pat_used; i++, cp++) {
        /*
         * Locate the character in the skip array.
         */
        for (sp = p->skip, j = 0;
             j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ;

        /*
         * If the character is not found, set the new skip element and
         * increase the number of skip elements.
         */
        if (j == p->skip_used) {
            sp->ch = cp;
            p->skip_used++;
        }

        /*
         * Set the updated skip value: the distance that lines this
         * occurrence up with the search position.  Later occurrences
         * overwrite earlier ones and the final character ends up with a
         * skip of zero.
         */
        sp->skip = last - k;

        /*
         * Set the new extra skip for the sentinel character when it also
         * occurs before the end of the pattern.
         */
        if (k < last && cp->uc == sentinel)
          p->md4 = last - k;

        /*
         * Increase the actual index.
         */
        k += (cp->uc >= 0x10000) ? 2 : 1;
    }
}
/*
 * Return the position `n' UCS2 units after `start', limited to `end'.  When
 * that position is the low half of a surrogate pair the position of the
 * high half is returned instead, so a pair is never split.  `start' must be
 * before `end' and `n' at least 1.
 */
static ucs2_t *
#ifdef __STDC__
_utbm_advance(ucs2_t *start, ucs2_t *end, unsigned long n)
#else
_utbm_advance(start, end, n)
ucs2_t *start, *end;
unsigned long n;
#endif
{
    ucs2_t *next;

    next = ((unsigned long) (end - start) < n) ? end : start + n;
    if (next < end && 0xdc00 <= *next && *next <= 0xdfff &&
        0xd800 <= *(next - 1) && *(next - 1) <= 0xdbff)
      next--;
    return next;
}

/*
 * Search `text' for the compiled pattern.
 *
 * pat          compiled pattern
 * text         text to search, as UCS2 units
 * textlen      number of UCS2 units in `text'
 * match_start  receives the index of the first UCS2 unit of the match
 * match_end    receives the index one past the last UCS2 unit of the match
 *
 * Returns 1 when a match is found and 0 otherwise.  0 is also returned when
 * `pat' or `text' is null, the pattern is empty, or the text is shorter than
 * the pattern.
 *
 * NOTE(review): the first position examined is text + patlen, so a match
 * that ends before that position (for example one beginning at index 0) is
 * not examined.  _utbm_match() does not compare the character at index 0
 * either, so both functions would have to change together.
 */
int
#ifdef __STDC__
utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen,
          unsigned long *match_start, unsigned long *match_end)
#else
utbm_exec(pat, text, textlen, match_start, match_end)
utbm_pattern_t pat;
ucs2_t *text;
unsigned long textlen, *match_start, *match_end;
#endif
{
    unsigned long k;
    ucs2_t *start, *end, *next;

    if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 ||
        textlen < pat->patlen)
      return 0;

    start = text + pat->patlen;
    end = text + textlen;

    /*
     * Adjust the start point if it points to a low surrogate.  `start' is
     * equal to `end' when the text is exactly as long as the pattern, so it
     * must be checked before it is dereferenced.
     */
    if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
        0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
      start--;

    while (start < end) {
        while ((k = _utbm_skip(pat, start, end)) != 0) {
            next = _utbm_advance(start, end, k);
            if (next == start) {
                /*
                 * A skip of one from the high half of a surrogate pair
                 * lands on its low half and is stepped straight back, which
                 * would repeat forever.  A pattern of one UCS2 unit cannot
                 * contain the pair, so move past it; otherwise stop
                 * skipping and let _utbm_match() decide.
                 */
                if (pat->patlen > 1)
                  break;
                next = _utbm_advance(start, end, 2);
            }
            start = next;
        }

        if (start >= end)
          break;

        if (_utbm_match(pat, text, start, end, match_start, match_end))
          return 1;

        /*
         * Move on after a failed candidate, again making sure the position
         * really advances when `start' is the high half of a pair.
         */
        next = _utbm_advance(start, end, pat->md4);
        if (next == start)
          next = _utbm_advance(start, end, 2);
        start = next;
    }
    return 0;
}

View file

@ -0,0 +1,109 @@
/*
* Copyright 1997, 1998, 1999 Computing Research Labs,
* New Mexico State University
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _h_utbm
#define _h_utbm
/*
* $Id: utbm.h,v 1.1 1999/09/21 15:45:18 mleisher Exp $
*/
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Prototype helper: `__(x)' expands to the parameter list `x' when the
 * compiler defines __STDC__ and to an empty list `()' otherwise, so the
 * declarations below can be read by both kinds of compiler.
 */
#undef __
#ifdef __STDC__
#define __(x) x
#else
#define __(x) ()
#endif
/*************************************************************************
*
* Types.
*
*************************************************************************/
/*
 * Fundamental character types.  `ucs4_t' holds a complete code point,
 * including the values at or above 0x10000 formed from surrogate pairs;
 * `ucs2_t' is the unit in which patterns and texts are passed to the API.
 */
typedef unsigned long ucs4_t;
typedef unsigned short ucs2_t;
/*
 * An opaque type used for the search pattern.  Objects are obtained from
 * utbm_create_pattern() and released with utbm_free_pattern().
 */
typedef struct _utbm_pattern_t *utbm_pattern_t;
/*************************************************************************
*
* Flags.
*
*************************************************************************/
/* Search in a case-insensitive manner. */
#define UTBM_CASEFOLD 0x01
/* Ignore non-spacing characters in the pattern and the text. */
#define UTBM_IGNORE_NONSPACING 0x02
/*
 * Treat a run of space characters as a single space and ignore control
 * characters.
 */
#define UTBM_SPACE_COMPRESS 0x04
/*************************************************************************
*
* API.
*
*************************************************************************/
/*
 * Allocate a new pattern object with all fields cleared.
 */
extern utbm_pattern_t utbm_create_pattern __((void));
/*
 * Release a pattern object and its buffers.  A null pattern is ignored.
 */
extern void utbm_free_pattern __((utbm_pattern_t pattern));
/*
 * Compile the `patlen' UCS2 units at `pat' into `pattern' using the UTBM_*
 * `flags'.  Nothing is done when `pattern' or `pat' is null, when the first
 * unit of `pat' is zero or when `patlen' is zero.
 */
extern void utbm_compile __((ucs2_t *pat, unsigned long patlen,
unsigned long flags, utbm_pattern_t pattern));
/*
 * Search the `textlen' UCS2 units at `text' for the compiled pattern.
 * Returns 1 and stores the match position in `*match_start' (index of the
 * first unit) and `*match_end' (index one past the last unit) when a match
 * is found; returns 0 otherwise, including when `pat' or `text' is null,
 * the pattern is empty or the text is shorter than the pattern.
 */
extern int utbm_exec __((utbm_pattern_t pat, ucs2_t *text,
unsigned long textlen, unsigned long *match_start,
unsigned long *match_end));
/*************************************************************************
*
* Prototypes for the stub functions needed.
*
*************************************************************************/
/*
 * Character classification.  Each function returns non-zero when `c' has
 * the property and zero otherwise.  For _utbm_isspace(), a non-zero
 * `compress' selects the wider definition of a space used for space
 * compression.
 */
extern int _utbm_isspace __((ucs4_t c, int compress));
extern int _utbm_iscntrl __((ucs4_t c));
extern int _utbm_nonspacing __((ucs4_t c));
/*
 * Case conversion.  Each function returns the lower, upper or title case
 * form of `c'.
 */
extern ucs4_t _utbm_tolower __((ucs4_t c));
extern ucs4_t _utbm_toupper __((ucs4_t c));
extern ucs4_t _utbm_totitle __((ucs4_t c));
#undef __
#ifdef __cplusplus
}
#endif
#endif /* _h_utbm */

View file

@ -0,0 +1,125 @@
/*
* Copyright 1997, 1998, 1999 Computing Research Labs,
* New Mexico State University
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef lint
/*
 * RCS "$Id$" keyword string identifying this revision of utbmstub.c.  It is
 * compiled out when `lint' is defined.
 */
static char rcsid[] = "$Id: utbmstub.c,v 1.1 1999/09/21 15:45:18 mleisher Exp $";
#endif
#include "utbm.h"
/*
 * This should be redefined to use the `isspace' function available in the
 * Unicode support on the platform where this is being used.
 *
 * As shipped the macro ignores its argument and evaluates to 0, so no
 * character is classified as a space by the platform.
 */
#define _platform_isspace(x) 0
/*
 * Decide whether `c' counts as a space character.
 *
 * c         code point to classify
 * compress  non-zero to use the space compression definition
 *
 * With `compress' set the result is 1 for U+0009, U+000A, U+000D, U+2028,
 * U+2029 and for anything _platform_isspace() accepts, and 0 for all other
 * characters.  Without it the value of _platform_isspace() is returned
 * unchanged.
 */
int
#ifdef __STDC__
_utbm_isspace(ucs4_t c, int compress)
#else
_utbm_isspace(c, compress)
ucs4_t c;
int compress;
#endif
{
    if (!compress)
      return _platform_isspace(c);

    switch (c) {
      case 0x09:
      case 0x0a:
      case 0x0d:
      case 0x2028:
      case 0x2029:
        return 1;
      default:
        break;
    }

    if (_platform_isspace(c))
      return 1;
    return 0;
}
/*
 * Stub: report whether `c' is a control character.
 *
 * This placeholder classifies no character as a control and returns zero
 * for every argument.
 */
int
#ifdef __STDC__
_utbm_iscntrl(ucs4_t c)
#else
_utbm_iscntrl(c)
ucs4_t c;
#endif
{
    (void) c;

    return 0;
}
/*
 * Stub: report whether `c' is a non-spacing character.
 *
 * This placeholder classifies no character as non-spacing and returns zero
 * for every argument.
 */
int
#ifdef __STDC__
_utbm_nonspacing(ucs4_t c)
#else
_utbm_nonspacing(c)
ucs4_t c;
#endif
{
    (void) c;

    return 0;
}
/*
 * Stub: map `c' to its lower case form.
 *
 * This placeholder performs no conversion and hands the argument back
 * unchanged.
 */
ucs4_t
#ifdef __STDC__
_utbm_tolower(ucs4_t c)
#else
_utbm_tolower(c)
ucs4_t c;
#endif
{
    ucs4_t lower;

    lower = c;
    return lower;
}
/*
 * Stub: map `c' to its upper case form.
 *
 * This placeholder performs no conversion and hands the argument back
 * unchanged.
 */
ucs4_t
#ifdef __STDC__
_utbm_toupper(ucs4_t c)
#else
_utbm_toupper(c)
ucs4_t c;
#endif
{
    ucs4_t upper;

    upper = c;
    return upper;
}
/*
 * Stub: map `c' to its title case form.
 *
 * This placeholder performs no conversion and hands the argument back
 * unchanged.
 */
ucs4_t
#ifdef __STDC__
_utbm_totitle(ucs4_t c)
#else
_utbm_totitle(c)
ucs4_t c;
#endif
{
    ucs4_t title;

    title = c;
    return title;
}