mirror of
https://git.openldap.org/openldap/openldap.git
synced 2025-12-24 00:29:35 -05:00
Initial revision
This commit is contained in:
parent
9fec129997
commit
fe98d9fa7b
22 changed files with 10372 additions and 0 deletions
303
libraries/liblunicode/ucdata/MUTTUCData.txt
Normal file
303
libraries/liblunicode/ucdata/MUTTUCData.txt
Normal file
|
|
@ -0,0 +1,303 @@
|
|||
#
|
||||
# $Id: MUTTUCData.txt,v 1.3 1999/10/29 00:04:35 mleisher Exp $
|
||||
#
|
||||
# Copyright 1999 Computing Research Labs, New Mexico State University
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
#
|
||||
# Implementation specific character properties.
|
||||
#
|
||||
#
|
||||
# Space, other.
|
||||
#
|
||||
0009;;Ss;;;;;;;;;;;;
|
||||
000A;;Ss;;;;;;;;;;;;
|
||||
000B;;Ss;;;;;;;;;;;;
|
||||
000C;;Ss;;;;;;;;;;;;
|
||||
000D;;Ss;;;;;;;;;;;;
|
||||
#
|
||||
# Non-breaking.
|
||||
#
|
||||
00A0;;Nb;;;;;;;;;;;;
|
||||
2007;;Nb;;;;;;;;;;;;
|
||||
2011;;Nb;;;;;;;;;;;;
|
||||
FEFF;;Nb;;;;;;;;;;;;
|
||||
#
|
||||
# Symmetric.
|
||||
#
|
||||
0028;;Sy;;;;;;;;;;;;
|
||||
0029;;Sy;;;;;;;;;;;;
|
||||
005B;;Sy;;;;;;;;;;;;
|
||||
005D;;Sy;;;;;;;;;;;;
|
||||
007B;;Sy;;;;;;;;;;;;
|
||||
007D;;Sy;;;;;;;;;;;;
|
||||
00AB;;Sy;;;;;;;;;;;;
|
||||
00BB;;Sy;;;;;;;;;;;;
|
||||
0F3A;;Sy;;;;;;;;;;;;
|
||||
0F3B;;Sy;;;;;;;;;;;;
|
||||
0F3C;;Sy;;;;;;;;;;;;
|
||||
0F3D;;Sy;;;;;;;;;;;;
|
||||
0F3E;;Sy;;;;;;;;;;;;
|
||||
0F3F;;Sy;;;;;;;;;;;;
|
||||
2018;;Sy;;;;;;;;;;;;
|
||||
2019;;Sy;;;;;;;;;;;;
|
||||
201A;;Sy;;;;;;;;;;;;
|
||||
201B;;Sy;;;;;;;;;;;;
|
||||
201C;;Sy;;;;;;;;;;;;
|
||||
201D;;Sy;;;;;;;;;;;;
|
||||
201E;;Sy;;;;;;;;;;;;
|
||||
201F;;Sy;;;;;;;;;;;;
|
||||
2039;;Sy;;;;;;;;;;;;
|
||||
203A;;Sy;;;;;;;;;;;;
|
||||
2045;;Sy;;;;;;;;;;;;
|
||||
2046;;Sy;;;;;;;;;;;;
|
||||
207D;;Sy;;;;;;;;;;;;
|
||||
207E;;Sy;;;;;;;;;;;;
|
||||
208D;;Sy;;;;;;;;;;;;
|
||||
208E;;Sy;;;;;;;;;;;;
|
||||
2329;;Sy;;;;;;;;;;;;
|
||||
232A;;Sy;;;;;;;;;;;;
|
||||
3008;;Sy;;;;;;;;;;;;
|
||||
3009;;Sy;;;;;;;;;;;;
|
||||
300A;;Sy;;;;;;;;;;;;
|
||||
300B;;Sy;;;;;;;;;;;;
|
||||
300C;;Sy;;;;;;;;;;;;
|
||||
300D;;Sy;;;;;;;;;;;;
|
||||
300E;;Sy;;;;;;;;;;;;
|
||||
300F;;Sy;;;;;;;;;;;;
|
||||
3010;;Sy;;;;;;;;;;;;
|
||||
3011;;Sy;;;;;;;;;;;;
|
||||
3014;;Sy;;;;;;;;;;;;
|
||||
3015;;Sy;;;;;;;;;;;;
|
||||
3016;;Sy;;;;;;;;;;;;
|
||||
3017;;Sy;;;;;;;;;;;;
|
||||
3018;;Sy;;;;;;;;;;;;
|
||||
3019;;Sy;;;;;;;;;;;;
|
||||
301A;;Sy;;;;;;;;;;;;
|
||||
301B;;Sy;;;;;;;;;;;;
|
||||
301D;;Sy;;;;;;;;;;;;
|
||||
301E;;Sy;;;;;;;;;;;;
|
||||
301F;;Sy;;;;;;;;;;;;
|
||||
FD3E;;Sy;;;;;;;;;;;;
|
||||
FD3F;;Sy;;;;;;;;;;;;
|
||||
FE35;;Sy;;;;;;;;;;;;
|
||||
FE36;;Sy;;;;;;;;;;;;
|
||||
FE37;;Sy;;;;;;;;;;;;
|
||||
FE38;;Sy;;;;;;;;;;;;
|
||||
FE39;;Sy;;;;;;;;;;;;
|
||||
FE3A;;Sy;;;;;;;;;;;;
|
||||
FE3B;;Sy;;;;;;;;;;;;
|
||||
FE3C;;Sy;;;;;;;;;;;;
|
||||
FE3D;;Sy;;;;;;;;;;;;
|
||||
FE3E;;Sy;;;;;;;;;;;;
|
||||
FE3F;;Sy;;;;;;;;;;;;
|
||||
FE40;;Sy;;;;;;;;;;;;
|
||||
FE41;;Sy;;;;;;;;;;;;
|
||||
FE42;;Sy;;;;;;;;;;;;
|
||||
FE43;;Sy;;;;;;;;;;;;
|
||||
FE44;;Sy;;;;;;;;;;;;
|
||||
FE59;;Sy;;;;;;;;;;;;
|
||||
FE5A;;Sy;;;;;;;;;;;;
|
||||
FE5B;;Sy;;;;;;;;;;;;
|
||||
FE5C;;Sy;;;;;;;;;;;;
|
||||
FE5D;;Sy;;;;;;;;;;;;
|
||||
FE5E;;Sy;;;;;;;;;;;;
|
||||
FF08;;Sy;;;;;;;;;;;;
|
||||
FF09;;Sy;;;;;;;;;;;;
|
||||
FF3B;;Sy;;;;;;;;;;;;
|
||||
FF3D;;Sy;;;;;;;;;;;;
|
||||
FF5B;;Sy;;;;;;;;;;;;
|
||||
FF5D;;Sy;;;;;;;;;;;;
|
||||
FF62;;Sy;;;;;;;;;;;;
|
||||
FF63;;Sy;;;;;;;;;;;;
|
||||
#
|
||||
# Hex digit.
|
||||
#
|
||||
0030;;Hd;;;;;;;;;;;;
|
||||
0031;;Hd;;;;;;;;;;;;
|
||||
0032;;Hd;;;;;;;;;;;;
|
||||
0033;;Hd;;;;;;;;;;;;
|
||||
0034;;Hd;;;;;;;;;;;;
|
||||
0035;;Hd;;;;;;;;;;;;
|
||||
0036;;Hd;;;;;;;;;;;;
|
||||
0037;;Hd;;;;;;;;;;;;
|
||||
0038;;Hd;;;;;;;;;;;;
|
||||
0039;;Hd;;;;;;;;;;;;
|
||||
0041;;Hd;;;;;;;;;;;;
|
||||
0042;;Hd;;;;;;;;;;;;
|
||||
0043;;Hd;;;;;;;;;;;;
|
||||
0044;;Hd;;;;;;;;;;;;
|
||||
0045;;Hd;;;;;;;;;;;;
|
||||
0046;;Hd;;;;;;;;;;;;
|
||||
0061;;Hd;;;;;;;;;;;;
|
||||
0062;;Hd;;;;;;;;;;;;
|
||||
0063;;Hd;;;;;;;;;;;;
|
||||
0064;;Hd;;;;;;;;;;;;
|
||||
0065;;Hd;;;;;;;;;;;;
|
||||
0066;;Hd;;;;;;;;;;;;
|
||||
FF10;;Hd;;;;;;;;;;;;
|
||||
FF11;;Hd;;;;;;;;;;;;
|
||||
FF12;;Hd;;;;;;;;;;;;
|
||||
FF13;;Hd;;;;;;;;;;;;
|
||||
FF14;;Hd;;;;;;;;;;;;
|
||||
FF15;;Hd;;;;;;;;;;;;
|
||||
FF16;;Hd;;;;;;;;;;;;
|
||||
FF17;;Hd;;;;;;;;;;;;
|
||||
FF18;;Hd;;;;;;;;;;;;
|
||||
FF19;;Hd;;;;;;;;;;;;
|
||||
FF21;;Hd;;;;;;;;;;;;
|
||||
FF22;;Hd;;;;;;;;;;;;
|
||||
FF23;;Hd;;;;;;;;;;;;
|
||||
FF24;;Hd;;;;;;;;;;;;
|
||||
FF25;;Hd;;;;;;;;;;;;
|
||||
FF26;;Hd;;;;;;;;;;;;
|
||||
FF41;;Hd;;;;;;;;;;;;
|
||||
FF42;;Hd;;;;;;;;;;;;
|
||||
FF43;;Hd;;;;;;;;;;;;
|
||||
FF44;;Hd;;;;;;;;;;;;
|
||||
FF45;;Hd;;;;;;;;;;;;
|
||||
FF46;;Hd;;;;;;;;;;;;
|
||||
#
|
||||
# Quote marks.
|
||||
#
|
||||
0022;;Qm;;;;;;;;;;;;
|
||||
0027;;Qm;;;;;;;;;;;;
|
||||
00AB;;Qm;;;;;;;;;;;;
|
||||
00BB;;Qm;;;;;;;;;;;;
|
||||
2018;;Qm;;;;;;;;;;;;
|
||||
2019;;Qm;;;;;;;;;;;;
|
||||
201A;;Qm;;;;;;;;;;;;
|
||||
201B;;Qm;;;;;;;;;;;;
|
||||
201C;;Qm;;;;;;;;;;;;
|
||||
201D;;Qm;;;;;;;;;;;;
|
||||
201E;;Qm;;;;;;;;;;;;
|
||||
201F;;Qm;;;;;;;;;;;;
|
||||
2039;;Qm;;;;;;;;;;;;
|
||||
203A;;Qm;;;;;;;;;;;;
|
||||
300C;;Qm;;;;;;;;;;;;
|
||||
300D;;Qm;;;;;;;;;;;;
|
||||
300E;;Qm;;;;;;;;;;;;
|
||||
300F;;Qm;;;;;;;;;;;;
|
||||
301D;;Qm;;;;;;;;;;;;
|
||||
301E;;Qm;;;;;;;;;;;;
|
||||
301F;;Qm;;;;;;;;;;;;
|
||||
FE41;;Qm;;;;;;;;;;;;
|
||||
FE42;;Qm;;;;;;;;;;;;
|
||||
FE43;;Qm;;;;;;;;;;;;
|
||||
FE44;;Qm;;;;;;;;;;;;
|
||||
FF02;;Qm;;;;;;;;;;;;
|
||||
FF07;;Qm;;;;;;;;;;;;
|
||||
FF62;;Qm;;;;;;;;;;;;
|
||||
FF63;;Qm;;;;;;;;;;;;
|
||||
#
|
||||
# Special Devanagari forms
|
||||
#
|
||||
E900;DEVANAGARI KSHA LIGATURE;Lo;0;L;0915 094D 0937;;;;N;;;;;
|
||||
E901;DEVANAGARI GNYA LIGATURE;Lo;0;L;091C 094D 091E;;;;N;;;;;
|
||||
E902;DEVANAGARI TTA LIGATURE;Lo;0;L;0924 094D 0924;;;;N;;;;;
|
||||
E903;DEVANAGARI TRA LIGATURE;Lo;0;L;0924 094D 0930;;;;N;;;;;
|
||||
E904;DEVANAGARI SHCHA LIGATURE;Lo;0;L;0936 094D 091B;;;;N;;;;;
|
||||
E905;DEVANAGARI SHRA LIGATURE;Lo;0;L;0936 094D 0930;;;;N;;;;;
|
||||
E906;DEVANAGARI SHVA LIGATURE;Lo;0;L;0936 094D 0935;;;;N;;;;;
|
||||
E907;DEVANAGARI KRA LIGATURE;Lo;0;L;;;;;N;;;;;
|
||||
E908;DEVANAGARI JRA LIGATURE;Lo;0;L;;;;;N;;;;;
|
||||
E909;DEVANAGARI ZRA LIGATURE;Lo;0;L;;;;;N;;;;;
|
||||
E90A;DEVANAGARI PHRA LIGATURE;Lo;0;L;;;;;N;;;;;
|
||||
E90B;DEVANAGARI FRA LIGATURE;Lo;0;L;;;;;N;;;;;
|
||||
E90C;DEVANAGARI PRA LIGATURE;Lo;0;L;;;;;N;;;;;
|
||||
E90D;DEVANAGARI SRA LIGATURE;Lo;0;L;;;;;N;;;;;
|
||||
E90E;DEVANAGARI RU LIGATURE;Lo;0;L;;;;;N;;;;;
|
||||
E90F;DEVANAGARI RUU LIGATURE;Lo;0;L;;;;;N;;;;;
|
||||
E915;DEVANAGARI HALF LETTER KA;Lo;0;L;;;;;N;;;;;
|
||||
E916;DEVANAGARI HALF LETTER KHA;Lo;0;L;;;;;N;;;;;
|
||||
E917;DEVANAGARI HALF LETTER GA;Lo;0;L;;;;;N;;;;;
|
||||
E918;DEVANAGARI HALF LETTER GHA;Lo;0;L;;;;;N;;;;;
|
||||
E919;DEVANAGARI HALF LETTER NGA;Lo;0;L;;;;;N;;;;;
|
||||
E91A;DEVANAGARI HALF LETTER CA;Lo;0;L;;;;;N;;;;;
|
||||
E91B;DEVANAGARI HALF LETTER CHA;Lo;0;L;;;;;N;;;;;
|
||||
E91C;DEVANAGARI HALF LETTER JA;Lo;0;L;;;;;N;;;;;
|
||||
E91D;DEVANAGARI HALF LETTER JHA;Lo;0;L;;;;;N;;;;;
|
||||
E91E;DEVANAGARI HALF LETTER NYA;Lo;0;L;;;;;N;;;;;
|
||||
E91F;DEVANAGARI HALF LETTER TTA;Lo;0;L;;;;;N;;;;;
|
||||
E920;DEVANAGARI HALF LETTER TTHA;Lo;0;L;;;;;N;;;;;
|
||||
E921;DEVANAGARI HALF LETTER DDA;Lo;0;L;;;;;N;;;;;
|
||||
E922;DEVANAGARI HALF LETTER DDHA;Lo;0;L;;;;;N;;;;;
|
||||
E923;DEVANAGARI HALF LETTER NNA;Lo;0;L;;;;;N;;;;;
|
||||
E924;DEVANAGARI HALF LETTER TA;Lo;0;L;;;;;N;;;;;
|
||||
E925;DEVANAGARI HALF LETTER THA;Lo;0;L;;;;;N;;;;;
|
||||
E926;DEVANAGARI HALF LETTER DA;Lo;0;L;;;;;N;;;;;
|
||||
E927;DEVANAGARI HALF LETTER DHA;Lo;0;L;;;;;N;;;;;
|
||||
E928;DEVANAGARI HALF LETTER NA;Lo;0;L;;;;;N;;;;;
|
||||
E929;DEVANAGARI HALF LETTER NNNA;Lo;0;L;0928 093C;;;;N;;;;;
|
||||
E92A;DEVANAGARI HALF LETTER PA;Lo;0;L;;;;;N;;;;;
|
||||
E92B;DEVANAGARI HALF LETTER PHA;Lo;0;L;;;;;N;;;;;
|
||||
E92C;DEVANAGARI HALF LETTER BA;Lo;0;L;;;;;N;;;;;
|
||||
E92D;DEVANAGARI HALF LETTER BHA;Lo;0;L;;;;;N;;;;;
|
||||
E92E;DEVANAGARI HALF LETTER MA;Lo;0;L;;;;;N;;;;;
|
||||
E92F;DEVANAGARI HALF LETTER YA;Lo;0;L;;;;;N;;;;;
|
||||
E930;DEVANAGARI HALF LETTER RA;Lo;0;L;;;;;N;;;;;
|
||||
E931;DEVANAGARI HALF LETTER RRA;Lo;0;L;0930 093C;;;;N;;;;;
|
||||
E932;DEVANAGARI HALF LETTER LA;Lo;0;L;;;;;N;;;;;
|
||||
E933;DEVANAGARI HALF LETTER LLA;Lo;0;L;;;;;N;;;;;
|
||||
E934;DEVANAGARI HALF LETTER LLLA;Lo;0;L;0933 093C;;;;N;;;;;
|
||||
E935;DEVANAGARI HALF LETTER VA;Lo;0;L;;;;;N;;;;;
|
||||
E936;DEVANAGARI HALF LETTER SHA;Lo;0;L;;;;;N;;;;;
|
||||
E937;DEVANAGARI HALF LETTER SSA;Lo;0;L;;;;;N;;;;;
|
||||
E938;DEVANAGARI HALF LETTER SA;Lo;0;L;;;;;N;;;;;
|
||||
E939;DEVANAGARI HALF LETTER HA;Lo;0;L;;;;;N;;;;;
|
||||
E940;DEVANAGARI KKA LIGATURE;Lo;0;L;0915 094D 0915;;;;N;;;;;
|
||||
E941;DEVANAGARI KTA LIGATURE;Lo;0;L;0915 094D 0924;;;;N;;;;;
|
||||
E942;DEVANAGARI NGKA LIGATURE;Lo;0;L;0919 094D 0915;;;;N;;;;;
|
||||
E943;DEVANAGARI NGKHA LIGATURE;Lo;0;L;0919 094D 0916;;;;N;;;;;
|
||||
E944;DEVANAGARI NGGA LIGATURE;Lo;0;L;0919 094D 0917;;;;N;;;;;
|
||||
E945;DEVANAGARI NGGHA LIGATURE;Lo;0;L;0919 094D 0918;;;;N;;;;;
|
||||
E946;DEVANAGARI NYJA LIGATURE;Lo;0;L;091E 094D 091C;;;;N;;;;;
|
||||
E947;DEVANAGARI DGHA LIGATURE;Lo;0;L;0926 094D 0918;;;;N;;;;;
|
||||
E948;DEVANAGARI DDA LIGATURE;Lo;0;L;0926 094D 0926;;;;N;;;;;
|
||||
E949;DEVANAGARI DDHA LIGATURE;Lo;0;L;0926 094D 0927;;;;N;;;;;
|
||||
E94A;DEVANAGARI DBA LIGATURE;Lo;0;L;0926 094D 092C;;;;N;;;;;
|
||||
E94B;DEVANAGARI DBHA LIGATURE;Lo;0;L;0926 094D 092D;;;;N;;;;;
|
||||
E94C;DEVANAGARI DMA LIGATURE;Lo;0;L;0926 094D 092E;;;;N;;;;;
|
||||
E94D;DEVANAGARI DYA LIGATURE;Lo;0;L;0926 094D 092F;;;;N;;;;;
|
||||
E94E;DEVANAGARI DVA LIGATURE;Lo;0;L;0926 094D 0935;;;;N;;;;;
|
||||
E94F;DEVANAGARI TT-TTA LIGATURE;Lo;0;L;091F 094D 091F;;;;N;;;;;
|
||||
E950;DEVANAGARI TT-TTHA LIGATURE;Lo;0;L;091F 094D 0920;;;;N;;;;;
|
||||
E951;DEVANAGARI TTH-TTHA LIGATURE;Lo;0;L;0920 094D 0920;;;;N;;;;;
|
||||
E952;DEVANAGARI DD-GA LIGATURE;Lo;0;L;0921 094D 0917;;;;N;;;;;
|
||||
E953;DEVANAGARI DD-DDA LIGATURE;Lo;0;L;0921 094D 0921;;;;N;;;;;
|
||||
E954;DEVANAGARI DD-DDHA LIGATURE;Lo;0;L;0921 094D 0922;;;;N;;;;;
|
||||
E955;DEVANAGARI NNA LIGATURE;Lo;0;L;0928 094D 0928;;;;N;;;;;
|
||||
E956;DEVANAGARI HMA LIGATURE;Lo;0;L;0939 094D 092E;;;;N;;;;;
|
||||
E957;DEVANAGARI HYA LIGATURE;Lo;0;L;0939 094D 092F;;;;N;;;;;
|
||||
E958;DEVANAGARI HLA LIGATURE;Lo;0;L;0939 094D 0932;;;;N;;;;;
|
||||
E959;DEVANAGARI HVA LIGATURE;Lo;0;L;0939 094D 0935;;;;N;;;;;
|
||||
E95A;DEVANAGARI STRA LIGATURE;Lo;0;L;0938 094D 0924 094D 0930;;;;N;;;;;
|
||||
E970;DEVANAGARI HALF KSHA LIGATURE;Lo;0;L;0915 094D 0937;;;;N;;;;;
|
||||
E971;DEVANAGARI HALF GNYA LIGATURE;Lo;0;L;091C 094D 091E;;;;N;;;;;
|
||||
E972;DEVANAGARI HALF TTA LIGATURE;Lo;0;L;0924 094D 0924;;;;N;;;;;
|
||||
E973;DEVANAGARI HALF TRA LIGATURE;Lo;0;L;0924 094D 0930;;;;N;;;;;
|
||||
E974;DEVANAGARI HALF SHCHA LIGATURE;Lo;0;L;0936 094D 091B;;;;N;;;;;
|
||||
E975;DEVANAGARI HALF SHRA LIGATURE;Lo;0;L;0936 094D 0930;;;;N;;;;;
|
||||
E976;DEVANAGARI HALF SHVA LIGATURE;Lo;0;L;0936 094D 0935;;;;N;;;;;
|
||||
E97B;DEVANAGARI SIGN RRA-REPHA;Mn;36;L;;;;;N;;;;;
|
||||
E97C;DEVANAGARI HAR LIGATURE;Lo;0;L;0939 0943;;;;N;;;;;
|
||||
E97D;DEVANAGARI SIGN EYELASH RA;Lo;0;L;;;;;N;;;;;
|
||||
E97E;DEVANAGARI SIGN REPHA;Mn;36;L;;;;;N;;;;;
|
||||
E97F;DEVANAGARI SIGN SUBJOINED RA;Mn;36;L;;;;;N;;;;;
|
||||
300
libraries/liblunicode/ucdata/README
Normal file
300
libraries/liblunicode/ucdata/README
Normal file
|
|
@ -0,0 +1,300 @@
|
|||
#
|
||||
# $Id: README,v 1.32 1999/11/29 16:41:05 mleisher Exp $
|
||||
#
|
||||
|
||||
MUTT UCData Package 2.4
|
||||
-----------------------
|
||||
|
||||
This is a package that supports ctype-like operations for Unicode UCS-2 text
|
||||
(and surrogates), case mapping, decomposition lookup, and provides a
|
||||
bidirectional reordering algorithm. To use it, you will need to get the
|
||||
latest "UnicodeData-*.txt" file from the Unicode Web or FTP site.
|
||||
|
||||
The character information portion of the package consists of three parts:
|
||||
|
||||
1. A program called "ucgendat" which generates five data files from the
|
||||
UnicodeData-*.txt file. The files are:
|
||||
|
||||
A. case.dat - the case mappings.
|
||||
B. ctype.dat - the character property tables.
|
||||
C. decomp.dat - the character decompositions.
|
||||
D. cmbcl.dat - the non-zero combining classes.
|
||||
E. num.dat - the codes representing numbers.
|
||||
|
||||
2. The "ucdata.[ch]" files which implement the functions needed to
|
||||
check to see if a character matches groups of properties, to map between
|
||||
upper, lower, and title case, to look up the decomposition of a
|
||||
character, look up the combining class of a character, and get the number
|
||||
value of a character.
|
||||
|
||||
3. The UCData.java class which provides the same API (with minor changes for
|
||||
the numbers) and loads the same binary data files as the C code.
|
||||
|
||||
A short reference to the functions available is in the "api.txt" file.
|
||||
|
||||
Techie Details
|
||||
==============
|
||||
|
||||
The "ucgendat" program parses files from the command line which are all in the
|
||||
Unicode Character Database (UCDB) format. An additional properties file,
|
||||
"MUTTUCData.txt", provides some extra properties for some characters.
|
||||
|
||||
The program looks for the two character properties fields (2 and 4), the
|
||||
combining class field (3), the decomposition field (5), the numeric value
|
||||
field (8), and the case mapping fields (12, 13, and 14). The decompositions
|
||||
are recursively expanded before being written out.
|
||||
|
||||
The decomposition table contains all the canonical decompositions. This means
|
||||
all decompositions that do not have tags such as "<compat>" or "<font>".
|
||||
|
||||
The data is almost all stored as unsigned longs (32-bits assumed) and the
|
||||
routines that load the data take care of endian swaps when necessary. This
|
||||
also means that surrogates (>= 0x10000) can be placed in the data files the
|
||||
"ucgendat" program parses.
|
||||
|
||||
The data is written as external files and broken into five parts so it can be
|
||||
selectively updated at runtime if necessary.
|
||||
|
||||
The data files currently generated from the "ucgendat" program total about 56K
|
||||
in size all together.
|
||||
|
||||
The format of the binary data files is documented in the "format.txt" file.
|
||||
|
||||
==========================================================================
|
||||
|
||||
The "Pretty Good Bidi Algorithm"
|
||||
--------------------------------
|
||||
|
||||
This routine provides an alternative to the Unicode Bidi algorithm. The
|
||||
difference is that this version of the PGBA does not handle the explicit
|
||||
directional codes (LRE, RLE, LRO, RLO, PDF). It should now produce the same
|
||||
results as the Unicode BiDi algorithm for implicit reordering. Included are
|
||||
functions for doing cursor motion in both logical and visual order.
|
||||
|
||||
This implementation is provided to demonstrate an effective alternate method
|
||||
for implicit reordering. To make this useful for an application, it probably
|
||||
needs some changes to the memory allocation and deallocation, as well as data
|
||||
structure additions for rendering.
|
||||
|
||||
Mark Leisher <mleisher@crl.nmsu.edu>
|
||||
19 November 1999
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
CHANGES
|
||||
=======
|
||||
|
||||
Version 2.4
|
||||
-----------
|
||||
1. Improved some bidi algorithm documentation in the code.
|
||||
|
||||
2. Fixed a code mixup that produced a non-working version.
|
||||
|
||||
Version 2.3
|
||||
-----------
|
||||
1. Fixed a misspelling in the ucpgba.h header file.
|
||||
|
||||
2. Fixed a bug which caused trailing weak non-digit sequences to be left out of
|
||||
the reordered string in the bidi algorithm.
|
||||
|
||||
3. Fixed a problem with weak sequences containing non-spacing marks in the
|
||||
bidi algorithm.
|
||||
|
||||
4. Fixed a problem with text runs of the opposite direction of the string
|
||||
surrounding a weak + neutral text run appearing in the wrong order in the
|
||||
bidi algorithm.
|
||||
|
||||
5. Added a default overall direction parameter to the reordering function for
|
||||
cases of strings with no strong directional characters in the bidi
|
||||
algorithm.
|
||||
|
||||
6. The bidi API documentation was improved.
|
||||
|
||||
7. Added a man page for the bidi API.
|
||||
|
||||
Version 2.2
|
||||
-----------
|
||||
1. Fixed a problem with the bidi algorithm locating directional section
|
||||
boundaries.
|
||||
|
||||
2. Fixed a problem with the bidi algorithm starting the reordering correctly.
|
||||
|
||||
3. Fixed a problem with the bidi algorithm determining end boundaries for LTR
|
||||
segments.
|
||||
|
||||
4. Fixed a problem with the bidi algorithm reordering weak (digits and number
|
||||
separators) segments.
|
||||
|
||||
5. Added automatic switching of symmetrically paired characters when
|
||||
reversing RTL segments.
|
||||
|
||||
6. Added a missing symmetric character to the extra character properties in
|
||||
MUTTUCData.txt.
|
||||
|
||||
7. Added support for doing logical and visual cursor traversal.
|
||||
|
||||
Version 2.1
|
||||
-----------
|
||||
1. Updated the ucgendat program to handle the Unicode 3.0 character database
|
||||
properties. The AL and BN bidi properties get marked as strong RTL and
|
||||
Other Neutral, the NSM, LRE, RLE, PDF, LRO, and RLO controls all get marked
|
||||
as Other Neutral.
|
||||
|
||||
2. Fixed some problems with testing against signed values in the UCData.java
|
||||
code and some minor cleanup.
|
||||
|
||||
3. Added the "Pretty Good Bidi Algorithm."
|
||||
|
||||
Version 2.0
|
||||
-----------
|
||||
1. Removed the old Java stuff for a new class that loads directly from the
|
||||
same data files as the C code does.
|
||||
|
||||
2. Fixed a problem with choosing the correct field when mapping case.
|
||||
|
||||
3. Adjusted some search routines to start their search in the correct position.
|
||||
|
||||
4. Moved the copyright year to 1999.
|
||||
|
||||
Version 1.9
|
||||
-----------
|
||||
1. Fixed a problem with an incorrect amount of storage being allocated for the
|
||||
combining class nodes.
|
||||
|
||||
2. Fixed an invalid initialization in the number code.
|
||||
|
||||
3. Changed the Java template file formatting a bit.
|
||||
|
||||
4. Added tables and function for getting decompositions in the Java class.
|
||||
|
||||
Version 1.8
|
||||
-----------
|
||||
1. Fixed a problem with adding certain ranges.
|
||||
|
||||
2. Added two more macros for testing for identifiers.
|
||||
|
||||
3. Tested with the UnicodeData-2.1.5.txt file.
|
||||
|
||||
Version 1.7
|
||||
-----------
|
||||
1. Fixed a problem with looking up decompositions in "ucgendat."
|
||||
|
||||
Version 1.6
|
||||
-----------
|
||||
1. Added two new properties introduced with UnicodeData-2.1.4.txt.
|
||||
|
||||
2. Changed the "ucgendat.c" program a little to automatically align the
|
||||
property data on a 4-byte boundary when new properties are added.
|
||||
|
||||
3. Changed the "ucgendat.c" program to only generate canonical
|
||||
decompositions.
|
||||
|
||||
4. Added two new macros ucisinitialpunct() and ucisfinalpunct() to check for
|
||||
initial and final punctuation characters.
|
||||
|
||||
5. Minor additions and changes to the documentation.
|
||||
|
||||
Version 1.5
|
||||
-----------
|
||||
1. Changed all file open calls to include binary mode with "b" for DOS/WIN
|
||||
platforms.
|
||||
|
||||
2. Wrapped the unistd.h include so it won't be included when compiled under
|
||||
Win32.
|
||||
|
||||
3. Fixed a bad range check for hex digits in ucgendat.c.
|
||||
|
||||
4. Fixed a bad endian swap for combining classes.
|
||||
|
||||
5. Added code to make a number table and associated lookup functions.
|
||||
Functions added are ucnumber(), ucdigit(), and ucgetnumber(). The last
|
||||
function is to maintain compatibility with John Cowan's "uctype" package.
|
||||
|
||||
Version 1.4
|
||||
-----------
|
||||
1. Fixed a bug with adding a range.
|
||||
|
||||
2. Fixed a bug with inserting a range in order.
|
||||
|
||||
3. Fixed incorrectly specified ucisdefined() and ucisundefined() macros.
|
||||
|
||||
4. Added the missing unload for the combining class data.
|
||||
|
||||
5. Fixed a bad macro placement in ucisweak().
|
||||
|
||||
Version 1.3
|
||||
-----------
|
||||
1. Bug with case mapping calculations fixed.
|
||||
|
||||
2. Bug with empty character property entries fixed.
|
||||
|
||||
3. Bug with incorrect type in the combining class lookup fixed.
|
||||
|
||||
4. Some corrections done to api.txt.
|
||||
|
||||
5. Bug in certain character property lookups fixed.
|
||||
|
||||
6. Added a character property table that records the defined characters.
|
||||
|
||||
7. Replaced ucisunknown() with ucisdefined() and ucisundefined().
|
||||
|
||||
Version 1.2
|
||||
-----------
|
||||
1. Added code to ucgendat to generate a combining class table.
|
||||
|
||||
2. Fixed an endian problem with the byte count of decompositions.
|
||||
|
||||
3. Fixed some minor problems in the "format.txt" file.
|
||||
|
||||
4. Removed some bogus "Ss" values from MUTTUCData.txt file.
|
||||
|
||||
5. Added API function to get combining class.
|
||||
|
||||
6. Changed the open mode to "rb" so binary data files will be opened correctly
|
||||
on DOS/WIN as well as other platforms.
|
||||
|
||||
7. Added the "api.txt" file.
|
||||
|
||||
Version 1.1
|
||||
-----------
|
||||
1. Added ucisxdigit() which I overlooked.
|
||||
|
||||
2. Added UC_LT to the ucisalpha() macro which I overlooked.
|
||||
|
||||
3. Changed uciscntrl() to include UC_CF.
|
||||
|
||||
4. Added ucisocntrl() and ucfntcntrl() macros.
|
||||
|
||||
5. Added a ucisblank() which I overlooked.
|
||||
|
||||
6. Added missing properties to ucissymbol() and ucisnumber().
|
||||
|
||||
7. Added ucisgraph() and ucisprint().
|
||||
|
||||
8. Changed the "Mr" property to "Sy" to mark this subset of mirroring
|
||||
characters as symmetric to avoid trampling the Unicode/ISO10646 sense of
|
||||
mirroring.
|
||||
|
||||
9. Added another property called "Ss" which includes control characters
|
||||
traditionally seen as spaces in the isspace() macro.
|
||||
|
||||
10. Added a bunch of macros to be API compatible with John Cowan's package.
|
||||
|
||||
ACKNOWLEDGEMENTS
|
||||
================
|
||||
|
||||
Thanks go to John Cowan <cowan@locke.ccil.org> for pointing out lots of
|
||||
missing things and giving me stuff, particularly a bunch of new macros.
|
||||
|
||||
Thanks go to Bob Verbrugge <bob_verbrugge@nl.compuware.com> for pointing out
|
||||
various bugs.
|
||||
|
||||
Thanks go to Christophe Pierret <cpierret@businessobjects.com> for pointing
|
||||
out that file modes need to have "b" for DOS/WIN machines, pointing out
|
||||
unistd.h is not a Win 32 header, and pointing out a problem with ucisalnum().
|
||||
|
||||
Thanks go to Kent Johnson <kent@pondview.mv.com> for finding a bug that caused
|
||||
incomplete decompositions to be generated by the "ucgendat" program.
|
||||
|
||||
Thanks go to Valeriy E. Ushakov <uwe@ptc.spbu.ru> for spotting an allocation
|
||||
error and an initialization error.
|
||||
935
libraries/liblunicode/ucdata/UCData.java
Normal file
935
libraries/liblunicode/ucdata/UCData.java
Normal file
|
|
@ -0,0 +1,935 @@
|
|||
/*
|
||||
* $Id: UCData.java,v 1.2 1999/10/07 20:49:56 mleisher Exp $
|
||||
*
|
||||
* Copyright 1999 Computing Research Labs, New Mexico State University
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
import java.io.*;
|
||||
import java.net.*;
|
||||
|
||||
public class UCData {
|
||||
private static byte[] buffer;
|
||||
private static boolean endian;
|
||||
private static int bytes, buffpos;
|
||||
|
||||
//
|
||||
// Do the static initialization.
|
||||
//
|
||||
static {
|
||||
buffer = new byte[24576];
|
||||
}
|
||||
|
||||
private static boolean load_file(InputStream in) {
|
||||
buffpos = 0;
|
||||
try {
|
||||
bytes = in.read(buffer);
|
||||
} catch (IOException e) {
|
||||
return false;
|
||||
}
|
||||
endian = (buffer[0] == -2 && buffer[1] == -2);
|
||||
buffpos = 2;
|
||||
return (bytes > 0);
|
||||
}
|
||||
|
||||
private static int getInt() {
|
||||
int b1, b2, b3, b4;
|
||||
|
||||
if (!endian) {
|
||||
b1 = buffer[buffpos++];
|
||||
b2 = buffer[buffpos++];
|
||||
b3 = buffer[buffpos++];
|
||||
b4 = buffer[buffpos++];
|
||||
} else {
|
||||
b4 = buffer[buffpos++];
|
||||
b3 = buffer[buffpos++];
|
||||
b2 = buffer[buffpos++];
|
||||
b1 = buffer[buffpos++];
|
||||
}
|
||||
if (b1 < 0)
|
||||
b1 += 256;
|
||||
if (b2 < 0)
|
||||
b2 += 256;
|
||||
if (b3 < 0)
|
||||
b3 += 256;
|
||||
if (b4 < 0)
|
||||
b4 += 256;
|
||||
return ((b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
|
||||
}
|
||||
|
||||
private static int getInt(int from) {
|
||||
buffpos = from;
|
||||
return getInt();
|
||||
}
|
||||
|
||||
private static short getShort() {
|
||||
int b1, b2;
|
||||
|
||||
if (!endian) {
|
||||
b1 = buffer[buffpos++];
|
||||
b2 = buffer[buffpos++];
|
||||
} else {
|
||||
b2 = buffer[buffpos++];
|
||||
b1 = buffer[buffpos++];
|
||||
}
|
||||
if (b1 < 0)
|
||||
b1 += 256;
|
||||
if (b2 < 0)
|
||||
b2 += 256;
|
||||
|
||||
return (short) ((b1 << 8) | b2);
|
||||
}
|
||||
|
||||
private static short getShort(int from) {
|
||||
buffpos = from;
|
||||
return getShort();
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
*
|
||||
* Character type info section.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
//
// masks32[i] has only bit i set; ucisprop() uses it to test the
// individual bits of its mask arguments.
//
private static int masks32[] = {
    1 << 0,  1 << 1,  1 << 2,  1 << 3,  1 << 4,  1 << 5,  1 << 6,  1 << 7,
    1 << 8,  1 << 9,  1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15,
    1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22, 1 << 23,
    1 << 24, 1 << 25, 1 << 26, 1 << 27, 1 << 28, 1 << 29, 1 << 30, 1 << 31
};
|
||||
|
||||
//
|
||||
// The arrays with the character property info.
|
||||
//
|
||||
private static short[] _ucprop_offsets = null;
|
||||
private static int[] _ucprop_ranges = null;
|
||||
|
||||
public static final int UC_MN = 0x00000001;
|
||||
public static final int UC_MC = 0x00000002;
|
||||
public static final int UC_ME = 0x00000004;
|
||||
public static final int UC_ND = 0x00000008;
|
||||
public static final int UC_NL = 0x00000010;
|
||||
public static final int UC_NO = 0x00000020;
|
||||
public static final int UC_ZS = 0x00000040;
|
||||
public static final int UC_ZL = 0x00000080;
|
||||
public static final int UC_ZP = 0x00000100;
|
||||
public static final int UC_CC = 0x00000200;
|
||||
public static final int UC_CF = 0x00000400;
|
||||
public static final int UC_OS = 0x00000800;
|
||||
public static final int UC_CO = 0x00001000;
|
||||
public static final int UC_CN = 0x00002000;
|
||||
public static final int UC_LU = 0x00004000;
|
||||
public static final int UC_LL = 0x00008000;
|
||||
public static final int UC_LT = 0x00010000;
|
||||
public static final int UC_LM = 0x00020000;
|
||||
public static final int UC_LO = 0x00040000;
|
||||
public static final int UC_PC = 0x00080000;
|
||||
public static final int UC_PD = 0x00100000;
|
||||
public static final int UC_PS = 0x00200000;
|
||||
public static final int UC_PE = 0x00400000;
|
||||
public static final int UC_PO = 0x00800000;
|
||||
public static final int UC_SM = 0x01000000;
|
||||
public static final int UC_SC = 0x02000000;
|
||||
public static final int UC_SK = 0x04000000;
|
||||
public static final int UC_SO = 0x08000000;
|
||||
public static final int UC_L = 0x10000000;
|
||||
public static final int UC_R = 0x20000000;
|
||||
public static final int UC_EN = 0x40000000;
|
||||
public static final int UC_ES = 0x80000000;
|
||||
public static final int UC_ET = 0x00000001;
|
||||
public static final int UC_AN = 0x00000002;
|
||||
public static final int UC_CS = 0x00000004;
|
||||
public static final int UC_B = 0x00000008;
|
||||
public static final int UC_S = 0x00000010;
|
||||
public static final int UC_WS = 0x00000020;
|
||||
public static final int UC_ON = 0x00000040;
|
||||
public static final int UC_CM = 0x00000080;
|
||||
public static final int UC_NB = 0x00000100;
|
||||
public static final int UC_SY = 0x00000200;
|
||||
public static final int UC_HD = 0x00000400;
|
||||
public static final int UC_QM = 0x00000800;
|
||||
public static final int UC_MR = 0x00001000;
|
||||
public static final int UC_SS = 0x00002000;
|
||||
public static final int UC_CP = 0x00004000;
|
||||
public static final int UC_PI = 0x00008000;
|
||||
public static final int UC_PF = 0x00010000;
|
||||
|
||||
private static boolean _ucprop_load(URL where) {
|
||||
int i, hsize, size = 0;
|
||||
boolean res;
|
||||
InputStream in = null;
|
||||
|
||||
//
|
||||
// If the offsets array is not null, then this file has been loaded.
|
||||
//
|
||||
if (_ucprop_offsets != null)
|
||||
return true;
|
||||
|
||||
try {
|
||||
in = where.openStream();
|
||||
} catch (IOException e1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
res = load_file(in);
|
||||
|
||||
try {
|
||||
in.close();
|
||||
} catch (IOException e) {}
|
||||
|
||||
if (res == false)
|
||||
return res;
|
||||
|
||||
hsize = getShort();
|
||||
|
||||
if (((size = (hsize + 1) << 1) & 3) != 0)
|
||||
size += 4 - (size & 3);
|
||||
|
||||
_ucprop_offsets = new short[hsize + 1];
|
||||
|
||||
//
|
||||
// Skip the byte count which won't be needed.
|
||||
//
|
||||
buffpos += 4;
|
||||
|
||||
//
|
||||
// Adjust the byte count used to position at the beginning of the
|
||||
// ranges to include the 4 bytes at the beginning and the byte count
|
||||
// which is unused.
|
||||
//
|
||||
size += 8;
|
||||
|
||||
for (i = 0; i <= hsize; i++)
|
||||
_ucprop_offsets[i] = getShort();
|
||||
|
||||
//
|
||||
// Now allocate the ranges.
|
||||
//
|
||||
_ucprop_ranges = new int[_ucprop_offsets[hsize]];
|
||||
for (i = 0, buffpos = size; i < _ucprop_offsets[hsize]; i++)
|
||||
_ucprop_ranges[i] = getInt();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static void _ucprop_unload() {
|
||||
_ucprop_offsets = null;
|
||||
_ucprop_ranges = null;
|
||||
}
|
||||
|
||||
private static boolean uclookup(int code, int n) {
|
||||
int l, r, m;
|
||||
|
||||
if ((l = _ucprop_offsets[n]) == -1)
|
||||
return false;
|
||||
|
||||
for (m = 1; n + m < _ucprop_offsets.length &&
|
||||
_ucprop_offsets[n + m] == -1; m++) ;
|
||||
|
||||
r = _ucprop_offsets[n + m] - 1;
|
||||
while (l <= r) {
|
||||
m = (l + r) >> 1;
|
||||
m -= (m & 1);
|
||||
if (code > _ucprop_ranges[m + 1])
|
||||
l = m + 2;
|
||||
else if (code < _ucprop_ranges[m])
|
||||
r = m - 2;
|
||||
else if (_ucprop_ranges[m] <= code && code <= _ucprop_ranges[m+1])
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public static boolean ucisprop(int code, int mask1, int mask2) {
|
||||
int i;
|
||||
|
||||
if (mask1 == 0 && mask2 == 0)
|
||||
return false;
|
||||
|
||||
if (mask1 != 0) {
|
||||
for (i = 0; i < 32; i++) {
|
||||
if ((mask1 & masks32[i]) != 0 && uclookup(code, i))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (mask2 != 0) {
|
||||
for (i = 32; i < _ucprop_offsets.length; i++) {
|
||||
if ((mask2 & masks32[i & 31]) != 0 && uclookup(code, i))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public static boolean ucisalpha(int code) {
|
||||
return ucisprop(code, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT, 0);
|
||||
}
|
||||
public static boolean ucisdigit(int code) {
|
||||
return ucisprop(code, UC_ND, 0);
|
||||
}
|
||||
public static boolean ucisalnum(int code) {
|
||||
return ucisprop(code, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT|UC_ND, 0);
|
||||
}
|
||||
public static boolean uciscntrl(int code) {
|
||||
return ucisprop(code, UC_CC|UC_CF, 0);
|
||||
}
|
||||
public static boolean ucisspace(int code) {
|
||||
return ucisprop(code, UC_ZS|UC_SS, 0);
|
||||
}
|
||||
public static boolean ucisblank(int code) {
|
||||
return ucisprop(code, UC_ZS, 0);
|
||||
}
|
||||
public static boolean ucispunct(int code) {
|
||||
return ucisprop(code, UC_PD|UC_PS|UC_PE|UC_PO, UC_PI|UC_PF);
|
||||
}
|
||||
public static boolean ucisgraph(int code) {
|
||||
return ucisprop(code, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|
|
||||
UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|
|
||||
UC_PS|UC_PE|UC_PO|UC_SM|UC_SM|UC_SC|UC_SK|
|
||||
UC_SO, UC_PI|UC_PF);
|
||||
}
|
||||
public static boolean ucisprint(int code) {
|
||||
return ucisprop(code, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|
|
||||
UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|
|
||||
UC_PS|UC_PE|UC_PO|UC_SM|UC_SM|UC_SC|UC_SK|
|
||||
UC_SO|UC_ZS, UC_PI|UC_PF);
|
||||
}
|
||||
public static boolean ucisupper(int code) {
|
||||
return ucisprop(code, UC_LU, 0);
|
||||
}
|
||||
public static boolean ucislower(int code) {
|
||||
return ucisprop(code, UC_LL, 0);
|
||||
}
|
||||
public static boolean ucistitle(int code) {
|
||||
return ucisprop(code, UC_LT, 0);
|
||||
}
|
||||
public static boolean ucisxdigit(int code) {
|
||||
return ucisprop(code, 0, UC_HD);
|
||||
}
|
||||
public static boolean ucisisocntrl(int code) {
|
||||
return ucisprop(code, UC_CC, 0);
|
||||
}
|
||||
public static boolean ucisfmtcntrl(int code) {
|
||||
return ucisprop(code, UC_CF, 0);
|
||||
}
|
||||
public static boolean ucissymbol(int code) {
|
||||
return ucisprop(code, UC_SM|UC_SC|UC_SO|UC_SK, 0);
|
||||
}
|
||||
public static boolean ucisnumber(int code) {
|
||||
return ucisprop(code, UC_ND|UC_NO|UC_NL, 0);
|
||||
}
|
||||
public static boolean ucisnonspacing(int code) {
|
||||
return ucisprop(code, UC_MN, 0);
|
||||
}
|
||||
public static boolean ucisopenpunct(int code) {
|
||||
return ucisprop(code, UC_PS, 0);
|
||||
}
|
||||
public static boolean ucisclosepunct(int code) {
|
||||
return ucisprop(code, UC_PE, 0);
|
||||
}
|
||||
public static boolean ucisinitialpunct(int code) {
|
||||
return ucisprop(code, 0, UC_PI);
|
||||
}
|
||||
public static boolean ucisfinalpunct(int code) {
|
||||
return ucisprop(code, 0, UC_PF);
|
||||
}
|
||||
public static boolean uciscomposite(int code) {
|
||||
return ucisprop(code, 0, UC_CM);
|
||||
}
|
||||
public static boolean ucishex(int code) {
|
||||
return ucisprop(code, 0, UC_HD);
|
||||
}
|
||||
public static boolean ucisquote(int code) {
|
||||
return ucisprop(code, 0, UC_QM);
|
||||
}
|
||||
public static boolean ucissymmetric(int code) {
|
||||
return ucisprop(code, 0, UC_SY);
|
||||
}
|
||||
public static boolean ucismirroring(int code) {
|
||||
return ucisprop(code, 0, UC_MR);
|
||||
}
|
||||
public static boolean ucisnonbreaking(int code) {
|
||||
return ucisprop(code, 0, UC_NB);
|
||||
}
|
||||
public static boolean ucisrtl(int code) {
|
||||
return ucisprop(code, UC_R, 0);
|
||||
}
|
||||
public static boolean ucisltr(int code) {
|
||||
return ucisprop(code, UC_L, 0);
|
||||
}
|
||||
public static boolean ucisstrong(int code) {
|
||||
return ucisprop(code, UC_L|UC_R, 0);
|
||||
}
|
||||
public static boolean ucisweak(int code) {
|
||||
return ucisprop(code, UC_EN|UC_ES, UC_ET|UC_AN|UC_CS);
|
||||
}
|
||||
public static boolean ucisneutral(int code) {
|
||||
return ucisprop(code, 0, UC_B|UC_S|UC_WS|UC_ON);
|
||||
}
|
||||
public static boolean ucisseparator(int code) {
|
||||
return ucisprop(code, 0, UC_B|UC_S);
|
||||
}
|
||||
public static boolean ucismark(int code) {
|
||||
return ucisprop(code, UC_MN|UC_MC|UC_ME, 0);
|
||||
}
|
||||
public static boolean ucismodif(int code) {
|
||||
return ucisprop(code, UC_LM, 0);
|
||||
}
|
||||
public static boolean ucisletnum(int code) {
|
||||
return ucisprop(code, UC_NL, 0);
|
||||
}
|
||||
public static boolean ucisconnect(int code) {
|
||||
return ucisprop(code, UC_PC, 0);
|
||||
}
|
||||
public static boolean ucisdash(int code) {
|
||||
return ucisprop(code, UC_PD, 0);
|
||||
}
|
||||
public static boolean ucismath(int code) {
|
||||
return ucisprop(code, UC_SM, 0);
|
||||
}
|
||||
public static boolean uciscurrency(int code) {
|
||||
return ucisprop(code, UC_SC, 0);
|
||||
}
|
||||
public static boolean ucismodifsymbol(int code) {
|
||||
return ucisprop(code, UC_SK, 0);
|
||||
}
|
||||
public static boolean ucisnsmark(int code) {
|
||||
return ucisprop(code, UC_MN, 0);
|
||||
}
|
||||
public static boolean ucisspmark(int code) {
|
||||
return ucisprop(code, UC_MC, 0);
|
||||
}
|
||||
public static boolean ucisenclosing(int code) {
|
||||
return ucisprop(code, UC_ME, 0);
|
||||
}
|
||||
public static boolean ucisprivate(int code) {
|
||||
return ucisprop(code, UC_CO, 0);
|
||||
}
|
||||
public static boolean ucissurrogate(int code) {
|
||||
return ucisprop(code, UC_OS, 0);
|
||||
}
|
||||
public static boolean ucislsep(int code) {
|
||||
return ucisprop(code, UC_ZL, 0);
|
||||
}
|
||||
public static boolean ucispsep(int code) {
|
||||
return ucisprop(code, UC_ZP, 0);
|
||||
}
|
||||
public static boolean ucisidentstart(int code) {
|
||||
return ucisprop(code, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL, 0);
|
||||
}
|
||||
public static boolean ucisidentpart(int code) {
|
||||
return ucisprop(code, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL|
|
||||
UC_MN|UC_MC|UC_ND|UC_PC|UC_CF, 0);
|
||||
}
|
||||
public static boolean ucisdefined(int code) {
|
||||
return ucisprop(code, 0, UC_CP);
|
||||
}
|
||||
public static boolean ucisundefined(int code) {
|
||||
return (ucisprop(code, 0, UC_CP) == true) ? false : true;
|
||||
}
|
||||
public static boolean ucishan(int code) {
|
||||
return ((0x4e00 <= code && code <= 0x9fff) ||
|
||||
(0xf900 <= code && code <= 0xfaff)) ? true : false;
|
||||
}
|
||||
public static boolean ucishangul(int code) {
|
||||
return (0xac00 <= code && code <= 0xd7ff) ? true : false;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
*
|
||||
* Case mapping section.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
private static int[] _uccase_len = {0, 0};
|
||||
private static int[] _uccase_map = null;
|
||||
|
||||
private static boolean _uccase_load(URL where) {
|
||||
int i, n;
|
||||
boolean res;
|
||||
InputStream in = null;
|
||||
|
||||
//
|
||||
// If this array exists, then the file has already been loaded.
|
||||
//
|
||||
if (_uccase_map != null)
|
||||
return true;
|
||||
|
||||
try {
|
||||
in = where.openStream();
|
||||
} catch (IOException e1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
res = load_file(in);
|
||||
|
||||
try {
|
||||
in.close();
|
||||
} catch (IOException e) {}
|
||||
|
||||
if (res == false)
|
||||
return res;
|
||||
|
||||
n = getShort(2) * 3;
|
||||
_uccase_len[0] = getShort() * 3;
|
||||
_uccase_len[1] = getShort() * 3;
|
||||
|
||||
_uccase_map = new int[n];
|
||||
for (i = 0; i < n; i++)
|
||||
_uccase_map[i] = getInt();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static void _uccase_unload() {
|
||||
_uccase_len[0] = _uccase_len[1] = 0;
|
||||
_uccase_map = null;
|
||||
}
|
||||
|
||||
private static int _uccase_lookup(int code, int l, int r, int field) {
|
||||
int m;
|
||||
|
||||
while (l <= r) {
|
||||
m = (l + r) >> 1;
|
||||
m -= (m % 3);
|
||||
if (code > _uccase_map[m])
|
||||
l = m + 3;
|
||||
else if (code < _uccase_map[m])
|
||||
r = m - 3;
|
||||
else
|
||||
return _uccase_map[m + field];
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
public static int uctoupper(int code) {
|
||||
int l, r, field;
|
||||
|
||||
if (ucisupper(code))
|
||||
return code;
|
||||
|
||||
if (ucislower(code)) {
|
||||
//
|
||||
// Lower case.
|
||||
//
|
||||
field = 2;
|
||||
l = _uccase_len[0];
|
||||
r = (l + _uccase_len[1]) - 3;
|
||||
} else {
|
||||
//
|
||||
// Title case.
|
||||
//
|
||||
field = 1;
|
||||
l = _uccase_len[0] + _uccase_len[1];
|
||||
r = _uccase_map.length - 3;
|
||||
}
|
||||
return _uccase_lookup(code, l, r, field);
|
||||
}
|
||||
|
||||
public static int uctolower(int code) {
|
||||
int l, r, field;
|
||||
|
||||
if (ucislower(code))
|
||||
return code;
|
||||
|
||||
if (ucisupper(code)) {
|
||||
//
|
||||
// Upper case.
|
||||
//
|
||||
field = 1;
|
||||
l = 0;
|
||||
r = _uccase_len[0] - 3;
|
||||
} else {
|
||||
//
|
||||
// Title case.
|
||||
//
|
||||
field = 2;
|
||||
l = _uccase_len[0] + _uccase_len[1];
|
||||
r = _uccase_map.length - 1;
|
||||
}
|
||||
return _uccase_lookup(code, l, r, field);
|
||||
}
|
||||
|
||||
public static int uctotitle(int code) {
|
||||
int l, r, field;
|
||||
|
||||
if (ucistitle(code))
|
||||
return code;
|
||||
|
||||
field = 2;
|
||||
if (ucisupper(code)) {
|
||||
//
|
||||
// Upper case.
|
||||
//
|
||||
l = 0;
|
||||
r = _uccase_len[0] - 3;
|
||||
} else {
|
||||
//
|
||||
// Lower case.
|
||||
//
|
||||
l = _uccase_len[0];
|
||||
r = (l + _uccase_len[1]) - 3;
|
||||
}
|
||||
return _uccase_lookup(code, l, r, field);
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
*
|
||||
* Character decomposition section.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
static int _ucdcmp_node_count = 0;
|
||||
static int[] _ucdcmp_data = null;
|
||||
|
||||
private static boolean _ucdcmp_load(URL where) {
|
||||
int i, bcnt;
|
||||
boolean res;
|
||||
InputStream in = null;
|
||||
|
||||
//
|
||||
// If this array is not null, then the file has already been loaded.
|
||||
//
|
||||
if (_ucdcmp_data != null)
|
||||
return true;
|
||||
|
||||
try {
|
||||
in = where.openStream();
|
||||
} catch (IOException e1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
res = load_file(in);
|
||||
|
||||
try {
|
||||
in.close();
|
||||
} catch (IOException e) {}
|
||||
|
||||
if (res == false)
|
||||
return res;
|
||||
|
||||
//
|
||||
// This specifies how many of the _ucdmp_data elements are nodes which
|
||||
// leaves the remaining number to be decompositions.
|
||||
//
|
||||
_ucdcmp_node_count = getShort() << 1;
|
||||
|
||||
bcnt = getInt() >> 2;
|
||||
|
||||
_ucdcmp_data = new int[bcnt];
|
||||
|
||||
for (i = 0; i < bcnt; i++)
|
||||
_ucdcmp_data[i] = getInt();
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
private static void _ucdcmp_unload() {
|
||||
_ucdcmp_node_count = 0;
|
||||
_ucdcmp_data = null;
|
||||
}
|
||||
|
||||
public static int[] ucdecomp(int code) {
|
||||
int l, r, m, out[];
|
||||
|
||||
l = 0;
|
||||
r = _ucdcmp_data[_ucdcmp_node_count] - 1;
|
||||
|
||||
while (l <= r) {
|
||||
//
|
||||
// Determine a "mid" point and adjust to make sure the mid point
|
||||
// is at the beginning of a code+offset pair.
|
||||
//
|
||||
m = (l + r) >> 1;
|
||||
m -= (m & 1);
|
||||
if (code > _ucdcmp_data[m])
|
||||
l = m + 2;
|
||||
else if (code < _ucdcmp_data[m])
|
||||
r = m - 2;
|
||||
else {
|
||||
l = _ucdcmp_data[m + 3] - _ucdcmp_data[m + 1];
|
||||
out = new int[l];
|
||||
for (r = 0; r < l; r++)
|
||||
out[r] = _ucdcmp_data[_ucdcmp_node_count + 1 +
|
||||
_ucdcmp_data[m + 1] + r];
|
||||
return out;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static int[] ucdecomp_hangul(int code) {
|
||||
int out[], decomp[] = {0, 0, 0};
|
||||
|
||||
if (!ucishangul(code))
|
||||
return null;
|
||||
|
||||
code -= 0xac00;
|
||||
decomp[0] = 0x1100 + (code / 588);
|
||||
decomp[1] = 0x1161 + ((code % 588) / 28);
|
||||
decomp[2] = 0x11a7 + (code % 28);
|
||||
|
||||
out = new int[(decomp[2] != 0x11a7) ? 3 : 2];
|
||||
out[0] = decomp[0];
|
||||
out[1] = decomp[1];
|
||||
if (decomp[0] != 0x11a7)
|
||||
out[2] = decomp[2];
|
||||
return out;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
*
|
||||
* Combining class section.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
private static int[] _uccmbcl_nodes = null;
|
||||
|
||||
private static boolean _uccmbcl_load(URL where) {
|
||||
int i, n;
|
||||
boolean res;
|
||||
InputStream in = null;
|
||||
|
||||
//
|
||||
// If this array is not null, the file has already been loaded.
|
||||
//
|
||||
if (_uccmbcl_nodes != null)
|
||||
return true;
|
||||
|
||||
try {
|
||||
in = where.openStream();
|
||||
} catch (IOException e1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
res = load_file(in);
|
||||
|
||||
try {
|
||||
in.close();
|
||||
} catch (IOException e) {}
|
||||
|
||||
if (res == false)
|
||||
return res;
|
||||
|
||||
n = getShort() * 3;
|
||||
|
||||
buffpos += 4;
|
||||
|
||||
_uccmbcl_nodes = new int[n];
|
||||
for (i = 0; i < n; i++)
|
||||
_uccmbcl_nodes[i] = getInt();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static void _uccmbcl_unload() {
|
||||
_uccmbcl_nodes = null;
|
||||
}
|
||||
|
||||
public static int uccombining_class(int code) {
|
||||
int l, r, m;
|
||||
|
||||
l = 0;
|
||||
r = _uccmbcl_nodes.length - 3;
|
||||
|
||||
while (l <= r) {
|
||||
m = (l + r) >> 1;
|
||||
m -= (m % 3);
|
||||
if (code > _uccmbcl_nodes[m + 1])
|
||||
l = m + 3;
|
||||
else if (code < _uccmbcl_nodes[m])
|
||||
r = m - 3;
|
||||
else if (_uccmbcl_nodes[m] <= code &&
|
||||
code <= _uccmbcl_nodes[m + 1])
|
||||
return _uccmbcl_nodes[m + 2];
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
*
|
||||
* Number section.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
private static short[] _ucnum_vals;
|
||||
private static int[] _ucnum_nodes;
|
||||
|
||||
private static boolean _ucnumb_load(URL where) {
|
||||
int i, n, b;
|
||||
boolean res;
|
||||
InputStream in = null;
|
||||
|
||||
//
|
||||
// If this array is not null, then the file has already been loaded.
|
||||
//
|
||||
if (_ucnum_nodes != null)
|
||||
return true;
|
||||
|
||||
try {
|
||||
in = where.openStream();
|
||||
} catch (IOException e1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
res = load_file(in);
|
||||
|
||||
try {
|
||||
in.close();
|
||||
} catch (IOException e) {}
|
||||
|
||||
if (res == false)
|
||||
return res;
|
||||
|
||||
n = getShort();
|
||||
b = (getInt() - (n << 2)) >> 1;
|
||||
|
||||
_ucnum_nodes = new int[n];
|
||||
for (i = 0; i < n; i++)
|
||||
_ucnum_nodes[i] = getInt();
|
||||
|
||||
_ucnum_vals = new short[b];
|
||||
for (i = 0; i < b; i++)
|
||||
_ucnum_vals[i] = getShort();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static void _ucnumb_unload() {
|
||||
_ucnum_vals = null;
|
||||
_ucnum_nodes = null;
|
||||
}
|
||||
|
||||
public static boolean ucnumber_lookup(int code, int[] result) {
|
||||
int l, r, m;
|
||||
|
||||
result[0] = result[1] = 0;
|
||||
|
||||
l = 0;
|
||||
r = _ucnum_nodes.length - 1;
|
||||
while (l <= r) {
|
||||
m = (l + r) >> 1;
|
||||
m -= (m & 1);
|
||||
if (code > _ucnum_nodes[m])
|
||||
l = m + 2;
|
||||
else if (code < _ucnum_nodes[m])
|
||||
r = m - 2;
|
||||
else {
|
||||
result[0] = _ucnum_vals[_ucnum_nodes[m + 1]];
|
||||
result[1] = _ucnum_vals[_ucnum_nodes[m + 1] + 1];
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public static boolean ucdigit_lookup(int code, int[] result) {
|
||||
int l, r, m;
|
||||
|
||||
result[0] = -1;
|
||||
|
||||
l = 0;
|
||||
r = _ucnum_nodes.length - 1;
|
||||
while (l <= r) {
|
||||
m = (l + r) >> 1;
|
||||
m -= (m & 1);
|
||||
if (code > _ucnum_nodes[m])
|
||||
l = m + 2;
|
||||
else if (code < _ucnum_nodes[m])
|
||||
r = m - 2;
|
||||
else {
|
||||
short d1 = _ucnum_vals[_ucnum_nodes[m + 1]];
|
||||
short d2 = _ucnum_vals[_ucnum_nodes[m + 1] + 1];
|
||||
if (d1 == d2) {
|
||||
result[0] = d1;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
*
|
||||
* File loading and unloading routines.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
//
|
||||
// Masks that combine to load and unload files using a base URL.
|
||||
//
|
||||
public final static int UCDATA_CASE = 0x01;
|
||||
public final static int UCDATA_CTYPE = 0x02;
|
||||
public final static int UCDATA_DECOMP = 0x04;
|
||||
public final static int UCDATA_CMBCL = 0x08;
|
||||
public final static int UCDATA_NUM = 0x10;
|
||||
public final static int UCDATA_ALL = 0x1f;
|
||||
|
||||
public static void ucdata_load(URL base, int masks) {
|
||||
//
|
||||
// Make sure the base has the trailing slash.
|
||||
//
|
||||
String url = base.toString();
|
||||
if (url.lastIndexOf('/') != url.length() - 1)
|
||||
url += "/";
|
||||
|
||||
if ((masks & UCDATA_CTYPE) != 0) {
|
||||
try {
|
||||
_ucprop_load(new URL(url + "ctype.dat"));
|
||||
} catch (MalformedURLException mue) {}
|
||||
}
|
||||
if ((masks & UCDATA_CASE) != 0) {
|
||||
try {
|
||||
_uccase_load(new URL(url + "case.dat"));
|
||||
} catch (MalformedURLException mue) {}
|
||||
}
|
||||
if ((masks & UCDATA_DECOMP) != 0) {
|
||||
try {
|
||||
_ucdcmp_load(new URL(url + "decomp.dat"));
|
||||
} catch (MalformedURLException mue) {}
|
||||
}
|
||||
if ((masks & UCDATA_CMBCL) != 0) {
|
||||
try {
|
||||
_uccmbcl_load(new URL(url + "cmbcl.dat"));
|
||||
} catch (MalformedURLException mue) {}
|
||||
}
|
||||
if ((masks & UCDATA_NUM) != 0) {
|
||||
try {
|
||||
_ucnumb_load(new URL(url + "num.dat"));
|
||||
} catch (MalformedURLException mue) {}
|
||||
}
|
||||
}
|
||||
|
||||
public static void ucdata_unload(int masks) {
|
||||
if ((masks & UCDATA_CTYPE) != 0)
|
||||
_ucprop_unload();
|
||||
if ((masks & UCDATA_CASE) != 0)
|
||||
_uccase_unload();
|
||||
if ((masks & UCDATA_DECOMP) != 0)
|
||||
_ucdcmp_unload();
|
||||
if ((masks & UCDATA_CMBCL) != 0)
|
||||
_uccmbcl_unload();
|
||||
if ((masks & UCDATA_NUM) != 0)
|
||||
_ucnumb_unload();
|
||||
}
|
||||
}
|
||||
94
libraries/liblunicode/ucdata/UCDataTest.java
Normal file
94
libraries/liblunicode/ucdata/UCDataTest.java
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
/*
|
||||
* $Id: UCDataTest.java,v 1.1 1999/08/23 16:14:08 mleisher Exp $
|
||||
*
|
||||
* Copyright 1999 Computing Research Labs, New Mexico State University
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
import java.io.*;
|
||||
import java.net.*;
|
||||
import UCData.*;
|
||||
|
||||
public class UCDataTest {
|
||||
/**********************************************************************
|
||||
*
|
||||
* Main.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
public static void main(String[] args) {
|
||||
URL url = null;
|
||||
|
||||
try {
|
||||
url = new URL("file:/home/mleisher/unicode/textutils/ucdata");
|
||||
} catch (MalformedURLException mue) {}
|
||||
|
||||
UCData.ucdata_load(url, UCData.UCDATA_ALL);
|
||||
|
||||
if (UCData.ucisalpha(0x1d5))
|
||||
System.out.println("0x1d5 is alpha");
|
||||
else
|
||||
System.out.println("0x1d5 is not alpha");
|
||||
|
||||
int c;
|
||||
|
||||
c = UCData.uctolower(0x1f1);
|
||||
System.out.println("0x1f1 lower is 0x"+Integer.toHexString(c));
|
||||
c = UCData.uctotitle(0x1f1);
|
||||
System.out.println("0x1f1 title is 0x"+Integer.toHexString(c));
|
||||
|
||||
c = UCData.uctolower(0xff3a);
|
||||
System.out.println("0xff3a lower is 0x"+Integer.toHexString(c));
|
||||
c = UCData.uctotitle(0xff3a);
|
||||
System.out.println("0xff3a title is 0x"+Integer.toHexString(c));
|
||||
|
||||
int[] decomp = UCData.ucdecomp(0x1d5);
|
||||
if (decomp != null) {
|
||||
System.out.print("0x1d5 decomposition :");
|
||||
for (int i = 0; i < decomp.length; i++)
|
||||
System.out.print("0x"+Integer.toHexString(decomp[i])+" ");
|
||||
System.out.println("");
|
||||
}
|
||||
|
||||
int ccl = UCData.uccombining_class(0x41);
|
||||
System.out.println("0x41 combining class " + ccl);
|
||||
ccl = UCData.uccombining_class(0xfe23);
|
||||
System.out.println("0xfe23 combining class " + ccl);
|
||||
|
||||
int num[] = {0,0};
|
||||
if (UCData.ucnumber_lookup(0x30, num)) {
|
||||
if (num[0] != num[1])
|
||||
System.out.println("0x30 is fraction "+num[0]+"/"+num[1]);
|
||||
else
|
||||
System.out.println("0x30 is digit "+num[0]);
|
||||
}
|
||||
|
||||
if (UCData.ucnumber_lookup(0xbc, num)) {
|
||||
if (num[0] != num[1])
|
||||
System.out.println("0xbc is fraction "+num[0]+"/"+num[1]);
|
||||
else
|
||||
System.out.println("0xbc is digit "+num[0]);
|
||||
}
|
||||
|
||||
if (UCData.ucdigit_lookup(0x6f9, num))
|
||||
System.out.println("0x6f9 is digit " + num[0]);
|
||||
else
|
||||
System.out.println("0x6f9 is not a digit");
|
||||
}
|
||||
}
|
||||
343
libraries/liblunicode/ucdata/api.txt
Normal file
343
libraries/liblunicode/ucdata/api.txt
Normal file
|
|
@ -0,0 +1,343 @@
|
|||
#
|
||||
# $Id: api.txt,v 1.2 1999/11/19 15:24:29 mleisher Exp $
|
||||
#
|
||||
|
||||
The MUTT UCData API
|
||||
-------------------
|
||||
|
||||
|
||||
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
Macros that combine to select data tables for ucdata_load(), ucdata_unload(),
|
||||
and ucdata_reload().
|
||||
|
||||
#define UCDATA_CASE 0x01
|
||||
#define UCDATA_CTYPE 0x02
|
||||
#define UCDATA_DECOMP 0x04
|
||||
#define UCDATA_CMBCL 0x08
|
||||
#define UCDATA_NUM 0x10
|
||||
#define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\
|
||||
UCDATA_CMBCL|UCDATA_NUM)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
void ucdata_load(char *paths, int masks)
|
||||
|
||||
This function initializes the UCData library by locating the data files in
|
||||
one of the colon-separated directories in the `paths' parameter. The data
|
||||
files to be loaded are specified in the `masks' parameter as a bitwise
|
||||
combination of the macros listed above.
|
||||
|
||||
This should be called before using any of the other functions.
|
||||
|
||||
NOTE: the ucdata_setup(char *paths) function is now a macro that expands
|
||||
into this function at compile time.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
void ucdata_unload(int masks)
|
||||
|
||||
This function unloads the data tables specified in the `masks' parameter.
|
||||
|
||||
This function should be called when the application is done using the UCData
|
||||
package.
|
||||
|
||||
NOTE: the ucdata_cleanup() function is now a macro that expands into this
|
||||
function at compile time.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
void ucdata_reload(char *paths, int masks)
|
||||
|
||||
This function reloads the data files from one of the colon-separated
|
||||
directories in the `paths' parameter. The data files to be reloaded are
|
||||
specified in the `masks' parameter as a bitwise combination of the macros
|
||||
listed above.
|
||||
|
||||
If the data files have already been loaded, they are unloaded before the
|
||||
data files are loaded again.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
|
||||
|
||||
This function determines if a character has a decomposition and returns the
|
||||
decomposition information if it exists.
|
||||
|
||||
If a zero is returned, there is no decomposition. If a non-zero is
|
||||
returned, then the `num' and `decomp' variables are filled in with the
|
||||
appropriate values.
|
||||
|
||||
Example call:
|
||||
|
||||
unsigned long i, num, *decomp;
|
||||
|
||||
if (ucdecomp(0x1d5, &num, &decomp) != 0) {
|
||||
for (i = 0; i < num; i++)
|
||||
printf("0x%08lX,", decomp[i]);
|
||||
putchar('\n');
|
||||
}
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
int ucdecomp_hangul(unsigned long code, unsigned long *num,
|
||||
unsigned long decomp[])
|
||||
|
||||
This function determines if a Hangul syllable has a decomposition and
|
||||
returns the decomposition information.
|
||||
|
||||
An array of at least size 3 should be passed to the function for the
|
||||
decomposition of the syllable.
|
||||
|
||||
If a zero is returned, the character is not a Hangul syllable. If a
|
||||
non-zero is returned, the `num' field will be 2 or 3 and the syllable will
|
||||
be decomposed into the `decomp' array arithmetically.
|
||||
|
||||
Example call:
|
||||
|
||||
unsigned long i, num, decomp[3];
|
||||
|
||||
if (ucdecomp_hangul(0xb1ba, &num, decomp) != 0) {
|
||||
for (i = 0; i < num; i++)
|
||||
printf("0x%08lX,", decomp[i]);
|
||||
putchar('\n');
|
||||
}
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
struct ucnumber {
|
||||
int numerator;
|
||||
int denominator;
|
||||
};
|
||||
|
||||
int ucnumber_lookup(unsigned long code, struct ucnumber *num)
|
||||
|
||||
This function determines if the code is a number and fills in the `num'
|
||||
field with the numerator and denominator. If the code happens to be a
|
||||
single digit, the numerator and denominator fields will be the same.
|
||||
|
||||
If the function returns 0, the code is not a number. Any other return
|
||||
value means the code is a number.
|
||||
|
||||
int ucdigit_lookup(unsigned long code, int *digit)
|
||||
|
||||
This function determines if the code is a digit and fills in the `digit'
|
||||
field with the digit value.
|
||||
|
||||
If the function returns 0, the code is not a digit. Any other return
|
||||
value means the code is a digit.
|
||||
|
||||
struct ucnumber ucgetnumber(unsigned long code)
|
||||
|
||||
This is a compatibility function with John Cowan's "uctype" package. It
|
||||
uses ucnumber_lookup().
|
||||
|
||||
int ucgetdigit(unsigned long code)
|
||||
|
||||
This is a compatibility function with John Cowan's "uctype" package. It
|
||||
uses ucdigit_lookup().
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
unsigned long uctoupper(unsigned long code)
|
||||
|
||||
This function returns the code unchanged if it is already upper case or has
|
||||
no upper case equivalent. Otherwise the upper case equivalent is returned.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
unsigned long uctolower(unsigned long code)
|
||||
|
||||
This function returns the code unchanged if it is already lower case or has
|
||||
no lower case equivalent. Otherwise the lower case equivalent is returned.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
unsigned long uctotitle(unsigned long code)
|
||||
|
||||
This function returns the code unchanged if it is already title case or has
|
||||
no title case equivalent. Otherwise the title case equivalent is returned.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
int ucisalpha(unsigned long code)
|
||||
int ucisalnum(unsigned long code)
|
||||
int ucisdigit(unsigned long code)
|
||||
int uciscntrl(unsigned long code)
|
||||
int ucisspace(unsigned long code)
|
||||
int ucisblank(unsigned long code)
|
||||
int ucispunct(unsigned long code)
|
||||
int ucisgraph(unsigned long code)
|
||||
int ucisprint(unsigned long code)
|
||||
int ucisxdigit(unsigned long code)
|
||||
|
||||
int ucisupper(unsigned long code)
|
||||
int ucislower(unsigned long code)
|
||||
int ucistitle(unsigned long code)
|
||||
|
||||
These functions (actually macros) determine if a character has these
|
||||
properties. These behave in a fashion very similar to the venerable ctype
|
||||
package.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
int ucisisocntrl(unsigned long code)
|
||||
|
||||
Is the character a C0 control character (< 32) ?
|
||||
|
||||
int ucisfmtcntrl(unsigned long code)
|
||||
|
||||
Is the character a format control character?
|
||||
|
||||
int ucissymbol(unsigned long code)
|
||||
|
||||
Is the character a symbol?
|
||||
|
||||
int ucisnumber(unsigned long code)
|
||||
|
||||
Is the character a number or digit?
|
||||
|
||||
int ucisnonspacing(unsigned long code)
|
||||
|
||||
Is the character non-spacing?
|
||||
|
||||
int ucisopenpunct(unsigned long code)
|
||||
|
||||
Is the character an open/left punctuation (e.g. '[')?
|
||||
|
||||
int ucisclosepunct(unsigned long code)
|
||||
|
||||
Is the character a close/right punctuation (e.g. ']')?
|
||||
|
||||
int ucisinitialpunct(unsigned long code)
|
||||
|
||||
Is the character an initial punctuation (i.e. U+2018 LEFT SINGLE QUOTATION
|
||||
MARK)
|
||||
|
||||
int ucisfinalpunct(unsigned long code)
|
||||
|
||||
Is the character a final punctuation (i.e. U+2019 RIGHT SINGLE QUOTATION
|
||||
MARK)
|
||||
|
||||
int uciscomposite(unsigned long code)
|
||||
|
||||
Can the character be decomposed into a set of other characters?
|
||||
|
||||
int ucisquote(unsigned long code)
|
||||
|
||||
Is the character one of the many quotation marks?
|
||||
|
||||
int ucissymmetric(unsigned long code)
|
||||
|
||||
Is the character one that has an opposite form (e.g. '<' and '>')?
|
||||
|
||||
int ucismirroring(unsigned long code)
|
||||
|
||||
Is the character mirroring (superset of symmetric)?
|
||||
|
||||
int ucisnonbreaking(unsigned long code)
|
||||
|
||||
Is the character non-breaking (i.e. non-breaking space)?
|
||||
|
||||
int ucisrtl(unsigned long code)
|
||||
|
||||
Does the character have strong right-to-left directionality (i.e. Arabic
|
||||
letters)?
|
||||
|
||||
int ucisltr(unsigned long code)
|
||||
|
||||
Does the character have strong left-to-right directionality (i.e. Latin
|
||||
letters)?
|
||||
|
||||
int ucisstrong(unsigned long code)
|
||||
|
||||
Does the character have strong directionality?
|
||||
|
||||
int ucisweak(unsigned long code)
|
||||
|
||||
Does the character have weak directionality (i.e. numbers)?
|
||||
|
||||
int ucisneutral(unsigned long code)
|
||||
|
||||
Does the character have neutral directionality (i.e. whitespace)?
|
||||
|
||||
int ucisseparator(unsigned long code)
|
||||
|
||||
Is the character a block or segment separator?
|
||||
|
||||
int ucislsep(unsigned long code)
|
||||
|
||||
Is the character a line separator?
|
||||
|
||||
int ucispsep(unsigned long code)
|
||||
|
||||
Is the character a paragraph separator?
|
||||
|
||||
int ucismark(unsigned long code)
|
||||
|
||||
Is the character a mark of some kind?
|
||||
|
||||
int ucisnsmark(unsigned long code)
|
||||
|
||||
Is the character a non-spacing mark?
|
||||
|
||||
int ucisspmark(unsigned long code)
|
||||
|
||||
Is the character a spacing mark?
|
||||
|
||||
int ucismodif(unsigned long code)
|
||||
|
||||
Is the character a modifier letter?
|
||||
|
||||
int ucismodifsymbol(unsigned long code)
|
||||
|
||||
Is the character a modifier symbol?
|
||||
|
||||
int ucisletnum(unsigned long code)
|
||||
|
||||
Is the character a number represented by a letter?
|
||||
|
||||
int ucisconnect(unsigned long code)
|
||||
|
||||
Is the character connecting punctuation?
|
||||
|
||||
int ucisdash(unsigned long code)
|
||||
|
||||
Is the character dash punctuation?
|
||||
|
||||
int ucismath(unsigned long code)
|
||||
|
||||
Is the character a math character?
|
||||
|
||||
int uciscurrency(unsigned long code)
|
||||
|
||||
Is the character a currency character?
|
||||
|
||||
int ucisenclosing(unsigned long code)
|
||||
|
||||
Is the character enclosing (i.e. enclosing box)?
|
||||
|
||||
int ucisprivate(unsigned long code)
|
||||
|
||||
Is the character from the Private Use Area?
|
||||
|
||||
int ucissurrogate(unsigned long code)
|
||||
|
||||
Is the character one of the surrogate codes?
|
||||
|
||||
int ucisdefined(unsigned long code)
|
||||
|
||||
Is the character defined (appeared in one of the data files)?
|
||||
|
||||
int ucisundefined(unsigned long code)
|
||||
|
||||
Is the character not defined (non-Unicode)?
|
||||
|
||||
int ucishan(unsigned long code)
|
||||
|
||||
Is the character a Han ideograph?
|
||||
|
||||
int ucishangul(unsigned long code)
|
||||
|
||||
Is the character a pre-composed Hangul syllable?
|
||||
84
libraries/liblunicode/ucdata/bidiapi.txt
Normal file
84
libraries/liblunicode/ucdata/bidiapi.txt
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
#
|
||||
# $Id: bidiapi.txt,v 1.2 1999/11/19 15:24:29 mleisher Exp $
|
||||
#
|
||||
|
||||
"Pretty Good Bidi Algorithm" API
|
||||
|
||||
The PGBA (Pretty Good Bidi Algorithm) is an effective alternative to the
|
||||
Unicode BiDi algorithm. It currently provides only implicit reordering and
|
||||
does not yet support explicit reordering codes that the Unicode BiDi algorithm
|
||||
supports. In addition to reordering, the PGBA includes cursor movement
|
||||
support for both visual and logical navigation.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
#define UCPGBA_LTR 0
|
||||
#define UCPGBA_RTL 1
|
||||
|
||||
These macros appear in the `direction' field of the data structures.
|
||||
|
||||
#define UCPGBA_CURSOR_VISUAL 0
|
||||
#define UCPGBA_CURSOR_LOGICAL 1
|
||||
|
||||
These macros are used to set the cursor movement for each reordered string.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
ucstring_t *ucstring_create(unsigned long *source, unsigned long start,
|
||||
unsigned long end, int default_direction,
|
||||
int cursor_motion)
|
||||
|
||||
This function will create a reordered string by using the implicit
|
||||
directionality of the characters in the specified substring.
|
||||
|
||||
The `default_direction' parameter should be one of UCPGBA_LTR or UCPGBA_RTL
|
||||
and is used only in cases where a string contains no characters with strong
|
||||
directionality.
|
||||
|
||||
The `cursor_motion' parameter should be one of UCPGBA_CURSOR_VISUAL or
|
||||
UCPGBA_CURSOR_LOGICAL, and is used to specify the initial cursor motion
|
||||
behavior. This behavior can be switched at any time using
|
||||
ucstring_set_cursor_motion().
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
void ucstring_free(ucstring_t *string)
|
||||
|
||||
This function will deallocate the memory used by the string, including the
|
||||
string itself.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
void ucstring_cursor_info(ucstring_t *string, int *direction,
|
||||
unsigned long *position)
|
||||
|
||||
This function will return the text position of the internal cursor and the
|
||||
directionality of the text at that position. The position returned is the
|
||||
original text position of the character.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
int ucstring_set_cursor_motion(ucstring_t *string, int cursor_motion)
|
||||
|
||||
This function will change the cursor motion type and return the previous
|
||||
cursor motion type.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
int ucstring_cursor_right(ucstring_t *string, int count)
|
||||
|
||||
This function will move the internal cursor to the right according to the
|
||||
type of cursor motion set for the string.
|
||||
|
||||
If no cursor motion is performed, it returns 0. Otherwise it will return a
|
||||
1.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
int ucstring_cursor_left(ucstring_t *string, int count)
|
||||
|
||||
This function will move the internal cursor to the left according to the
|
||||
type of cursor motion set for the string.
|
||||
|
||||
If no cursor motion is performed, it returns 0. Otherwise it will return a
|
||||
1.
|
||||
243
libraries/liblunicode/ucdata/format.txt
Normal file
243
libraries/liblunicode/ucdata/format.txt
Normal file
|
|
@ -0,0 +1,243 @@
|
|||
#
|
||||
# $Id: format.txt,v 1.1 1998/07/24 15:17:21 mleisher Exp $
|
||||
#
|
||||
|
||||
CHARACTER DATA
|
||||
==============
|
||||
|
||||
This package generates some data files that contain character properties useful
|
||||
for text processing.
|
||||
|
||||
CHARACTER PROPERTIES
|
||||
====================
|
||||
|
||||
The first data file is called "ctype.dat" and contains a compressed form of
|
||||
the character properties found in the Unicode Character Database (UCDB).
|
||||
Additional properties can be specified in limited UCDB format in another file
|
||||
to avoid modifying the original UCDB.
|
||||
|
||||
The following is a property name and code table to be used with the character
|
||||
data:
|
||||
|
||||
NAME CODE DESCRIPTION
|
||||
---------------------
|
||||
Mn 0 Mark, Non-Spacing
|
||||
Mc 1 Mark, Spacing Combining
|
||||
Me 2 Mark, Enclosing
|
||||
Nd 3 Number, Decimal Digit
|
||||
Nl 4 Number, Letter
|
||||
No 5 Number, Other
|
||||
Zs 6 Separator, Space
|
||||
Zl 7 Separator, Line
|
||||
Zp 8 Separator, Paragraph
|
||||
Cc 9 Other, Control
|
||||
Cf 10 Other, Format
|
||||
Cs 11 Other, Surrogate
|
||||
Co 12 Other, Private Use
|
||||
Cn 13 Other, Not Assigned
|
||||
Lu 14 Letter, Uppercase
|
||||
Ll 15 Letter, Lowercase
|
||||
Lt 16 Letter, Titlecase
|
||||
Lm 17 Letter, Modifier
|
||||
Lo 18 Letter, Other
|
||||
Pc 19 Punctuation, Connector
|
||||
Pd 20 Punctuation, Dash
|
||||
Ps 21 Punctuation, Open
|
||||
Pe 22 Punctuation, Close
|
||||
Po 23 Punctuation, Other
|
||||
Sm 24 Symbol, Math
|
||||
Sc 25 Symbol, Currency
|
||||
Sk 26 Symbol, Modifier
|
||||
So 27 Symbol, Other
|
||||
L 28 Left-To-Right
|
||||
R 29 Right-To-Left
|
||||
EN 30 European Number
|
||||
ES 31 European Number Separator
|
||||
ET 32 European Number Terminator
|
||||
AN 33 Arabic Number
|
||||
CS 34 Common Number Separator
|
||||
B 35 Block Separator
|
||||
S 36 Segment Separator
|
||||
WS 37 Whitespace
|
||||
ON 38 Other Neutrals
|
||||
Pi 47 Punctuation, Initial
|
||||
Pf 48 Punctuation, Final
|
||||
#
|
||||
# Implementation specific properties.
|
||||
#
|
||||
Cm 39 Composite
|
||||
Nb 40 Non-Breaking
|
||||
Sy 41 Symmetric (characters which are part of open/close pairs)
|
||||
Hd 42 Hex Digit
|
||||
Qm 43 Quote Mark
|
||||
Mr 44 Mirroring
|
||||
Ss 45 Space, Other (controls viewed as spaces in ctype isspace())
|
||||
Cp 46 Defined character
|
||||
|
||||
The actual binary data is formatted as follows:
|
||||
|
||||
Assumptions: unsigned short is at least 16-bits in size and unsigned long
|
||||
is at least 32-bits in size.
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short OffsetArraySize
|
||||
unsigned long Bytes
|
||||
unsigned short Offsets[OffsetArraySize + 1]
|
||||
unsigned long Ranges[N], N = value of Offsets[OffsetArraySize]
|
||||
|
||||
The Bytes field provides the total byte count used for the Offsets[] and
|
||||
Ranges[] arrays. The Offsets[] array is aligned on a 4-byte boundary and
|
||||
there is always one extra node on the end to hold the final index of the
|
||||
Ranges[] array. The Ranges[] array contains pairs of 4-byte values
|
||||
representing a range of Unicode characters. The pairs are arranged in
|
||||
increasing order by the first character code in the range.
|
||||
|
||||
Determining if a particular character is in the property list requires a
|
||||
simple binary search to determine if a character is in any of the ranges
|
||||
for the property.
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, then the data was generated on a
|
||||
machine with a different endian order and the values must be byte-swapped.
|
||||
|
||||
To swap a 16-bit value:
|
||||
c = (c >> 8) | ((c & 0xff) << 8)
|
||||
|
||||
To swap a 32-bit value:
|
||||
c = ((c & 0xff) << 24) | (((c >> 8) & 0xff) << 16) |
|
||||
(((c >> 16) & 0xff) << 8) | (c >> 24)
|
||||
|
||||
CASE MAPPINGS
|
||||
=============
|
||||
|
||||
The next data file is called "case.dat" and contains three case mapping tables
|
||||
in the following order: upper, lower, and title case. Each table is in
|
||||
increasing order by character code and each mapping contains 3 unsigned longs
|
||||
which represent the possible mappings.
|
||||
|
||||
The format for the binary form of these tables is:
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short NumMappingNodes, count of all mapping nodes
|
||||
unsigned short CaseTableSizes[2], upper and lower mapping node counts
|
||||
unsigned long CaseTables[NumMappingNodes * 3]
|
||||
|
||||
The starting indexes of the case tables are calculated as following:
|
||||
|
||||
UpperIndex = 0;
|
||||
LowerIndex = CaseTableSizes[0] * 3;
|
||||
TitleIndex = LowerIndex + CaseTableSizes[1] * 3;
|
||||
|
||||
The order of the fields for the three tables are:
|
||||
|
||||
Upper case
|
||||
----------
|
||||
unsigned long upper;
|
||||
unsigned long lower;
|
||||
unsigned long title;
|
||||
|
||||
Lower case
|
||||
----------
|
||||
unsigned long lower;
|
||||
unsigned long upper;
|
||||
unsigned long title;
|
||||
|
||||
Title case
|
||||
----------
|
||||
unsigned long title;
|
||||
unsigned long upper;
|
||||
unsigned long lower;
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
||||
same way as described in the CHARACTER PROPERTIES section.
|
||||
|
||||
Because the tables are in increasing order by character code, locating a
|
||||
mapping requires a simple binary search on one of the 3 codes that make up
|
||||
each node.
|
||||
|
||||
It is important to note that there can only be 65536 mapping nodes which
|
||||
divided into 3 portions allows 21845 nodes for each case mapping table. The
|
||||
distribution of mappings may be more or less than 21845 per table, but only
|
||||
65536 are allowed.
|
||||
|
||||
DECOMPOSITIONS
|
||||
==============
|
||||
|
||||
The next data file is called "decomp.dat" and contains the decomposition data
|
||||
for all characters with decompositions containing more than one character and
|
||||
that are *not* compatibility decompositions. Compatibility decompositions are
|
||||
signaled in the UCDB format by the use of the <compat> tag in the
|
||||
decomposition field. Each list of character codes represents a full
|
||||
decomposition of a composite character. The nodes are arranged in increasing
|
||||
order by character code.
|
||||
|
||||
The format for the binary form of this table is:
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short NumDecompNodes, count of all decomposition nodes
|
||||
unsigned long Bytes
|
||||
unsigned long DecompNodes[(NumDecompNodes * 2) + 1]
|
||||
unsigned long Decomp[N], N = sum of all counts in DecompNodes[]
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
||||
same way as described in the CHARACTER PROPERTIES section.
|
||||
|
||||
The DecompNodes[] array consists of pairs of unsigned longs, the first of
|
||||
which is the character code and the second is the initial index of the list
|
||||
of character codes representing the decomposition.
|
||||
|
||||
Locating the decomposition of a composite character requires a binary search
|
||||
for a character code in the DecompNodes[] array and using its index to
|
||||
locate the start of the decomposition. The length of the decomposition list
|
||||
is the index in the following element in DecompNodes[] minus the current
|
||||
index.
|
||||
|
||||
COMBINING CLASSES
|
||||
=================
|
||||
|
||||
The fourth data file is called "cmbcl.dat" and contains the characters with
|
||||
non-zero combining classes.
|
||||
|
||||
The format for the binary form of this table is:
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short NumCCLNodes
|
||||
unsigned long Bytes
|
||||
unsigned long CCLNodes[NumCCLNodes * 3]
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
||||
same way as described in the CHARACTER PROPERTIES section.
|
||||
|
||||
The CCLNodes[] array consists of groups of three unsigned longs. The first
|
||||
and second are the beginning and ending of a range and the third is the
|
||||
combining class of that range.
|
||||
|
||||
If a character is not found in this table, then the combining class is
|
||||
assumed to be 0.
|
||||
|
||||
It is important to note that only 65536 distinct ranges plus combining class
|
||||
can be specified because the NumCCLNodes is usually a 16-bit number.
|
||||
|
||||
NUMBER TABLE
|
||||
============
|
||||
|
||||
The final data file is called "num.dat" and contains the characters that have
|
||||
a numeric value associated with them.
|
||||
|
||||
The format for the binary form of the table is:
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short NumNumberNodes
|
||||
unsigned long Bytes
|
||||
unsigned long NumberNodes[NumNumberNodes]
|
||||
unsigned short ValueNodes[(Bytes - (NumNumberNodes * sizeof(unsigned long)))
|
||||
/ sizeof(short)]
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
||||
same way as described in the CHARACTER PROPERTIES section.
|
||||
|
||||
The NumberNodes array contains pairs of values, the first of which is the
|
||||
character code and the second an index into the ValueNodes array. The
|
||||
ValueNodes array contains pairs of integers which represent the numerator
|
||||
and denominator of the numeric value of the character. If the character
|
||||
happens to map to an integer, both the values in ValueNodes will be the
|
||||
same.
|
||||
1161
libraries/liblunicode/ucdata/ucdata.c
Normal file
1161
libraries/liblunicode/ucdata/ucdata.c
Normal file
File diff suppressed because it is too large
Load diff
306
libraries/liblunicode/ucdata/ucdata.h
Normal file
306
libraries/liblunicode/ucdata/ucdata.h
Normal file
|
|
@ -0,0 +1,306 @@
|
|||
/*
|
||||
* Copyright 1999 Computing Research Labs, New Mexico State University
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#ifndef _h_ucdata
|
||||
#define _h_ucdata
|
||||
|
||||
/*
|
||||
* $Id: ucdata.h,v 1.5 1999/11/19 15:24:29 mleisher Exp $
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#undef __
|
||||
#ifdef __STDC__
|
||||
#define __(x) x
|
||||
#else
|
||||
#define __(x) ()
|
||||
#endif
|
||||
|
||||
#define UCDATA_VERSION "2.3"
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Masks and macros for character properties.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
/*
|
||||
* Values that can appear in the `mask1' parameter of the ucisprop()
|
||||
* function.
|
||||
*/
|
||||
#define UC_MN 0x00000001 /* Mark, Non-Spacing */
|
||||
#define UC_MC 0x00000002 /* Mark, Spacing Combining */
|
||||
#define UC_ME 0x00000004 /* Mark, Enclosing */
|
||||
#define UC_ND 0x00000008 /* Number, Decimal Digit */
|
||||
#define UC_NL 0x00000010 /* Number, Letter */
|
||||
#define UC_NO 0x00000020 /* Number, Other */
|
||||
#define UC_ZS 0x00000040 /* Separator, Space */
|
||||
#define UC_ZL 0x00000080 /* Separator, Line */
|
||||
#define UC_ZP 0x00000100 /* Separator, Paragraph */
|
||||
#define UC_CC 0x00000200 /* Other, Control */
|
||||
#define UC_CF 0x00000400 /* Other, Format */
|
||||
#define UC_OS 0x00000800 /* Other, Surrogate */
|
||||
#define UC_CO 0x00001000 /* Other, Private Use */
|
||||
#define UC_CN 0x00002000 /* Other, Not Assigned */
|
||||
#define UC_LU 0x00004000 /* Letter, Uppercase */
|
||||
#define UC_LL 0x00008000 /* Letter, Lowercase */
|
||||
#define UC_LT 0x00010000 /* Letter, Titlecase */
|
||||
#define UC_LM 0x00020000 /* Letter, Modifier */
|
||||
#define UC_LO 0x00040000 /* Letter, Other */
|
||||
#define UC_PC 0x00080000 /* Punctuation, Connector */
|
||||
#define UC_PD 0x00100000 /* Punctuation, Dash */
|
||||
#define UC_PS 0x00200000 /* Punctuation, Open */
|
||||
#define UC_PE 0x00400000 /* Punctuation, Close */
|
||||
#define UC_PO 0x00800000 /* Punctuation, Other */
|
||||
#define UC_SM 0x01000000 /* Symbol, Math */
|
||||
#define UC_SC 0x02000000 /* Symbol, Currency */
|
||||
#define UC_SK 0x04000000 /* Symbol, Modifier */
|
||||
#define UC_SO 0x08000000 /* Symbol, Other */
|
||||
#define UC_L 0x10000000 /* Left-To-Right */
|
||||
#define UC_R 0x20000000 /* Right-To-Left */
|
||||
#define UC_EN 0x40000000 /* European Number */
|
||||
#define UC_ES 0x80000000 /* European Number Separator */
|
||||
|
||||
/*
|
||||
* Values that can appear in the `mask2' parameter of the ucisprop()
|
||||
* function.
|
||||
*/
|
||||
#define UC_ET 0x00000001 /* European Number Terminator */
|
||||
#define UC_AN 0x00000002 /* Arabic Number */
|
||||
#define UC_CS 0x00000004 /* Common Number Separator */
|
||||
#define UC_B 0x00000008 /* Block Separator */
|
||||
#define UC_S 0x00000010 /* Segment Separator */
|
||||
#define UC_WS 0x00000020 /* Whitespace */
|
||||
#define UC_ON 0x00000040 /* Other Neutrals */
|
||||
/*
|
||||
* Implementation specific character properties.
|
||||
*/
|
||||
#define UC_CM 0x00000080 /* Composite */
|
||||
#define UC_NB 0x00000100 /* Non-Breaking */
|
||||
#define UC_SY 0x00000200 /* Symmetric */
|
||||
#define UC_HD 0x00000400 /* Hex Digit */
|
||||
#define UC_QM 0x00000800 /* Quote Mark */
|
||||
#define UC_MR 0x00001000 /* Mirroring */
|
||||
#define UC_SS 0x00002000 /* Space, other */
|
||||
|
||||
#define UC_CP 0x00004000 /* Defined */
|
||||
|
||||
/*
|
||||
* Added for UnicodeData-2.1.3.
|
||||
*/
|
||||
#define UC_PI 0x00008000 /* Punctuation, Initial */
|
||||
#define UC_PF 0x00010000 /* Punctuation, Final */
|
||||
|
||||
/*
|
||||
* This is the primary function for testing to see if a character has some set
|
||||
* of properties. The macros that test for various character properties all
|
||||
* call this function with some set of masks.
|
||||
*/
|
||||
extern int ucisprop __((unsigned long code, unsigned long mask1,
|
||||
unsigned long mask2));
|
||||
|
||||
#define ucisalpha(cc) ucisprop(cc, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT, 0)
|
||||
#define ucisdigit(cc) ucisprop(cc, UC_ND, 0)
|
||||
#define ucisalnum(cc) ucisprop(cc, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT|UC_ND, 0)
|
||||
#define uciscntrl(cc) ucisprop(cc, UC_CC|UC_CF, 0)
|
||||
#define ucisspace(cc) ucisprop(cc, UC_ZS|UC_SS, 0)
|
||||
#define ucisblank(cc) ucisprop(cc, UC_ZS, 0)
|
||||
#define ucispunct(cc) ucisprop(cc, UC_PD|UC_PS|UC_PE|UC_PO, UC_PI|UC_PF)
|
||||
/*
 * Graphic characters: every mark, number, letter, punctuation and symbol
 * category, plus initial/final punctuation from the second mask.  Each
 * category bit is listed exactly once.
 */
#define ucisgraph(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\
                               UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\
                               UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|\
                               UC_SO, UC_PI|UC_PF)
|
||||
/*
 * Printable characters: the same categories as ucisgraph() with the space
 * separator (UC_ZS) added.  Each category bit is listed exactly once.
 */
#define ucisprint(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\
                               UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\
                               UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|\
                               UC_SO|UC_ZS, UC_PI|UC_PF)
|
||||
#define ucisupper(cc) ucisprop(cc, UC_LU, 0)
|
||||
#define ucislower(cc) ucisprop(cc, UC_LL, 0)
|
||||
#define ucistitle(cc) ucisprop(cc, UC_LT, 0)
|
||||
#define ucisxdigit(cc) ucisprop(cc, 0, UC_HD)
|
||||
|
||||
#define ucisisocntrl(cc) ucisprop(cc, UC_CC, 0)
|
||||
#define ucisfmtcntrl(cc) ucisprop(cc, UC_CF, 0)
|
||||
|
||||
#define ucissymbol(cc) ucisprop(cc, UC_SM|UC_SC|UC_SO|UC_SK, 0)
|
||||
#define ucisnumber(cc) ucisprop(cc, UC_ND|UC_NO|UC_NL, 0)
|
||||
#define ucisnonspacing(cc) ucisprop(cc, UC_MN, 0)
|
||||
#define ucisopenpunct(cc) ucisprop(cc, UC_PS, 0)
|
||||
#define ucisclosepunct(cc) ucisprop(cc, UC_PE, 0)
|
||||
#define ucisinitialpunct(cc) ucisprop(cc, 0, UC_PI)
|
||||
#define ucisfinalpunct(cc) ucisprop(cc, 0, UC_PF)
|
||||
|
||||
#define uciscomposite(cc) ucisprop(cc, 0, UC_CM)
|
||||
#define ucishex(cc) ucisprop(cc, 0, UC_HD)
|
||||
#define ucisquote(cc) ucisprop(cc, 0, UC_QM)
|
||||
#define ucissymmetric(cc) ucisprop(cc, 0, UC_SY)
|
||||
#define ucismirroring(cc) ucisprop(cc, 0, UC_MR)
|
||||
#define ucisnonbreaking(cc) ucisprop(cc, 0, UC_NB)
|
||||
|
||||
/*
|
||||
* Directionality macros.
|
||||
*/
|
||||
#define ucisrtl(cc) ucisprop(cc, UC_R, 0)
|
||||
#define ucisltr(cc) ucisprop(cc, UC_L, 0)
|
||||
#define ucisstrong(cc) ucisprop(cc, UC_L|UC_R, 0)
|
||||
#define ucisweak(cc) ucisprop(cc, UC_EN|UC_ES, UC_ET|UC_AN|UC_CS)
|
||||
#define ucisneutral(cc) ucisprop(cc, 0, UC_B|UC_S|UC_WS|UC_ON)
|
||||
#define ucisseparator(cc) ucisprop(cc, 0, UC_B|UC_S)
|
||||
|
||||
/*
|
||||
* Other macros inspired by John Cowan.
|
||||
*/
|
||||
#define ucismark(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME, 0)
|
||||
#define ucismodif(cc) ucisprop(cc, UC_LM, 0)
|
||||
#define ucisletnum(cc) ucisprop(cc, UC_NL, 0)
|
||||
#define ucisconnect(cc) ucisprop(cc, UC_PC, 0)
|
||||
#define ucisdash(cc) ucisprop(cc, UC_PD, 0)
|
||||
#define ucismath(cc) ucisprop(cc, UC_SM, 0)
|
||||
#define uciscurrency(cc) ucisprop(cc, UC_SC, 0)
|
||||
#define ucismodifsymbol(cc) ucisprop(cc, UC_SK, 0)
|
||||
#define ucisnsmark(cc) ucisprop(cc, UC_MN, 0)
|
||||
#define ucisspmark(cc) ucisprop(cc, UC_MC, 0)
|
||||
#define ucisenclosing(cc) ucisprop(cc, UC_ME, 0)
|
||||
#define ucisprivate(cc) ucisprop(cc, UC_CO, 0)
|
||||
#define ucissurrogate(cc) ucisprop(cc, UC_OS, 0)
|
||||
#define ucislsep(cc) ucisprop(cc, UC_ZL, 0)
|
||||
#define ucispsep(cc) ucisprop(cc, UC_ZP, 0)
|
||||
|
||||
#define ucisidentstart(cc) ucisprop(cc, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL, 0)
|
||||
#define ucisidentpart(cc) ucisprop(cc, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL|\
|
||||
UC_MN|UC_MC|UC_ND|UC_PC|UC_CF, 0)
|
||||
|
||||
#define ucisdefined(cc) ucisprop(cc, 0, UC_CP)
|
||||
/*
 * True when the code does NOT carry the "defined" property.  The whole
 * expansion is parenthesized so the negation stays a single operand
 * wherever the macro is used.
 */
#define ucisundefined(cc) (!ucisprop(cc, 0, UC_CP))
|
||||
|
||||
/*
|
||||
* Other miscellaneous character property macros.
|
||||
*/
|
||||
#define ucishan(cc) (((cc) >= 0x4e00 && (cc) <= 0x9fff) ||\
|
||||
((cc) >= 0xf900 && (cc) <= 0xfaff))
|
||||
/*
 * Precomposed Hangul syllables occupy U+AC00..U+D7A3 (19 * 21 * 28 = 11172
 * code points).  Code points after U+D7A3 are not syllables and are
 * rejected.
 */
#define ucishangul(cc) ((cc) >= 0xac00 && (cc) <= 0xd7a3)
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Functions for case conversion.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
extern unsigned long uctoupper __((unsigned long code));
|
||||
extern unsigned long uctolower __((unsigned long code));
|
||||
extern unsigned long uctotitle __((unsigned long code));
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Functions for getting decompositions.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
/*
|
||||
* This routine determines if the code has a decomposition. If it returns 0,
|
||||
* there is no decomposition. Any other value indicates a decomposition was
|
||||
* returned.
|
||||
*/
|
||||
extern int ucdecomp __((unsigned long code, unsigned long *num,
|
||||
|
||||
unsigned long **decomp));
|
||||
|
||||
/*
|
||||
* If the code is a Hangul syllable, this routine decomposes it into the array
|
||||
* passed. The array size should be at least 3.
|
||||
*/
|
||||
extern int ucdecomp_hangul __((unsigned long code, unsigned long *num,
|
||||
unsigned long decomp[]));
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Functions for getting combining classes.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
/*
|
||||
* This will return the combining class for a character to be used with the
|
||||
* Canonical Ordering algorithm.
|
||||
*/
|
||||
extern unsigned long uccombining_class __((unsigned long code));
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Functions for getting numbers and digits.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
struct ucnumber {
|
||||
int numerator;
|
||||
int denominator;
|
||||
};
|
||||
|
||||
extern int ucnumber_lookup __((unsigned long code, struct ucnumber *num));
|
||||
extern int ucdigit_lookup __((unsigned long code, int *digit));
|
||||
|
||||
/*
|
||||
* For compatibility with John Cowan's "uctype" package.
|
||||
*/
|
||||
extern struct ucnumber ucgetnumber __((unsigned long code));
|
||||
extern int ucgetdigit __((unsigned long code));
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Functions for library initialization and cleanup.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
/*
|
||||
* Macros for specifying the data tables to be loaded, unloaded, or reloaded
|
||||
* by the ucdata_load(), ucdata_unload(), and ucdata_reload() routines.
|
||||
*/
|
||||
#define UCDATA_CASE 0x01
|
||||
#define UCDATA_CTYPE 0x02
|
||||
#define UCDATA_DECOMP 0x04
|
||||
#define UCDATA_CMBCL 0x08
|
||||
#define UCDATA_NUM 0x10
|
||||
|
||||
#define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\
|
||||
UCDATA_CMBCL|UCDATA_NUM)
|
||||
|
||||
/*
|
||||
* Functions to load, unload, and reload specific data files.
|
||||
*/
|
||||
extern void ucdata_load __((char *paths, int mask));
|
||||
extern void ucdata_unload __((int mask));
|
||||
extern void ucdata_reload __((char *paths, int mask));
|
||||
|
||||
/*
|
||||
* Deprecated functions, now just compatibility macros.
|
||||
*/
|
||||
#define ucdata_setup(p) ucdata_load(p, UCDATA_ALL)
|
||||
#define ucdata_cleanup() ucdata_unload(UCDATA_ALL)
|
||||
|
||||
#undef __
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _h_ucdata */
|
||||
464
libraries/liblunicode/ucdata/ucdata.man
Normal file
464
libraries/liblunicode/ucdata/ucdata.man
Normal file
|
|
@ -0,0 +1,464 @@
|
|||
.\"
|
||||
.\" $Id: ucdata.man,v 1.4 1999/11/19 16:08:33 mleisher Exp $
|
||||
.\"
|
||||
.TH ucdata 3 "19 November 1999"
|
||||
.SH NAME
|
||||
ucdata \- package for providing Unicode/ISO10646 character information
|
||||
|
||||
.SH SYNOPSIS
|
||||
#include <ucdata.h>
|
||||
.sp
|
||||
void ucdata_load(char * paths, int masks)
|
||||
.sp
|
||||
void ucdata_unload(int masks)
|
||||
.sp
|
||||
void ucdata_reload(char * paths, int masks)
|
||||
.sp
|
||||
int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
|
||||
.sp
|
||||
int ucdecomp_hangul(unsigned long code, unsigned long *num,
|
||||
unsigned long decomp[])
|
||||
.sp
|
||||
.nf
|
||||
struct ucnumber {
|
||||
int numerator;
|
||||
int denominator;
|
||||
};
|
||||
.sp
|
||||
int ucnumber_lookup(unsigned long code, struct ucnumber *num)
|
||||
.sp
|
||||
int ucdigit_lookup(unsigned long code, int *digit)
|
||||
.sp
|
||||
struct ucnumber ucgetnumber(unsigned long code)
|
||||
.sp
|
||||
int ucgetdigit(unsigned long code)
|
||||
.sp
|
||||
unsigned long uctoupper(unsigned long code)
|
||||
.sp
|
||||
unsigned long uctolower(unsigned long code)
|
||||
.sp
|
||||
unsigned long uctotitle(unsigned long code)
|
||||
.sp
|
||||
int ucisalpha(unsigned long code)
|
||||
.sp
|
||||
int ucisalnum(unsigned long code)
|
||||
.sp
|
||||
int ucisdigit(unsigned long code)
|
||||
.sp
|
||||
int uciscntrl(unsigned long code)
|
||||
.sp
|
||||
int ucisspace(unsigned long code)
|
||||
.sp
|
||||
int ucisblank(unsigned long code)
|
||||
.sp
|
||||
int ucispunct(unsigned long code)
|
||||
.sp
|
||||
int ucisgraph(unsigned long code)
|
||||
.sp
|
||||
int ucisprint(unsigned long code)
|
||||
.sp
|
||||
int ucisxdigit(unsigned long code)
|
||||
.sp
|
||||
int ucisupper(unsigned long code)
|
||||
.sp
|
||||
int ucislower(unsigned long code)
|
||||
.sp
|
||||
int ucistitle(unsigned long code)
|
||||
.sp
|
||||
int ucisisocntrl(unsigned long code)
|
||||
.sp
|
||||
int ucisfmtcntrl(unsigned long code)
|
||||
.sp
|
||||
int ucissymbol(unsigned long code)
|
||||
.sp
|
||||
int ucisnumber(unsigned long code)
|
||||
.sp
|
||||
int ucisnonspacing(unsigned long code)
|
||||
.sp
|
||||
int ucisopenpunct(unsigned long code)
|
||||
.sp
|
||||
int ucisclosepunct(unsigned long code)
|
||||
.sp
|
||||
int ucisinitialpunct(unsigned long code)
|
||||
.sp
|
||||
int ucisfinalpunct(unsigned long code)
|
||||
.sp
|
||||
int uciscomposite(unsigned long code)
|
||||
.sp
|
||||
int ucisquote(unsigned long code)
|
||||
.sp
|
||||
int ucissymmetric(unsigned long code)
|
||||
.sp
|
||||
int ucismirroring(unsigned long code)
|
||||
.sp
|
||||
int ucisnonbreaking(unsigned long code)
|
||||
.sp
|
||||
int ucisrtl(unsigned long code)
|
||||
.sp
|
||||
int ucisltr(unsigned long code)
|
||||
.sp
|
||||
int ucisstrong(unsigned long code)
|
||||
.sp
|
||||
int ucisweak(unsigned long code)
|
||||
.sp
|
||||
int ucisneutral(unsigned long code)
|
||||
.sp
|
||||
int ucisseparator(unsigned long code)
|
||||
.sp
|
||||
int ucislsep(unsigned long code)
|
||||
.sp
|
||||
int ucispsep(unsigned long code)
|
||||
.sp
|
||||
int ucismark(unsigned long code)
|
||||
.sp
|
||||
int ucisnsmark(unsigned long code)
|
||||
.sp
|
||||
int ucisspmark(unsigned long code)
|
||||
.sp
|
||||
int ucismodif(unsigned long code)
|
||||
.sp
|
||||
int ucismodifsymbol(unsigned long code)
|
||||
.sp
|
||||
int ucisletnum(unsigned long code)
|
||||
.sp
|
||||
int ucisconnect(unsigned long code)
|
||||
.sp
|
||||
int ucisdash(unsigned long code)
|
||||
.sp
|
||||
int ucismath(unsigned long code)
|
||||
.sp
|
||||
int uciscurrency(unsigned long code)
|
||||
.sp
|
||||
int ucisenclosing(unsigned long code)
|
||||
.sp
|
||||
int ucisprivate(unsigned long code)
|
||||
.sp
|
||||
int ucissurrogate(unsigned long code)
|
||||
.sp
|
||||
int ucisidentstart(unsigned long code)
|
||||
.sp
|
||||
int ucisidentpart(unsigned long code)
|
||||
.sp
|
||||
int ucisdefined(unsigned long code)
|
||||
.sp
|
||||
int ucisundefined(unsigned long code)
|
||||
.sp
|
||||
int ucishan(unsigned long code)
|
||||
.sp
|
||||
int ucishangul(unsigned long code)
|
||||
|
||||
.SH DESCRIPTION
|
||||
.TP 4
|
||||
.BR Macros
|
||||
.br
|
||||
UCDATA_CASE
|
||||
.br
|
||||
UCDATA_CTYPE
|
||||
.br
|
||||
UCDATA_DECOMP
|
||||
.br
|
||||
UCDATA_CMBCL
|
||||
.br
|
||||
UCDATA_NUM
|
||||
.br
|
||||
UCDATA_ALL
|
||||
.br
|
||||
.TP 4
|
||||
.BR ucdata_load()
|
||||
This function initializes the UCData library by locating the data files in one
|
||||
of the colon-separated directories in the `paths' parameter. The data files
|
||||
to be loaded are specified in the `masks' parameter as a bitwise combination
|
||||
of the macros listed above.
|
||||
.sp
|
||||
This should be called before using any of the other functions.
|
||||
.TP 4
|
||||
.BR ucdata_unload()
|
||||
This function unloads the data tables specified in the `masks' parameter.
|
||||
.sp
|
||||
This function should be called when the application is done using the UCData
|
||||
package.
|
||||
.TP 4
|
||||
.BR ucdata_reload()
|
||||
This function reloads the data files from one of the colon-separated
|
||||
directories in the `paths' parameter. The data files to be reloaded are
|
||||
specified in the `masks' parameter as a bitwise combination of the macros
|
||||
listed above.
|
||||
.TP 4
|
||||
.BR ucdecomp()
|
||||
This function determines if a character has a decomposition and returns the
|
||||
decomposition information if it exists.
|
||||
.sp
|
||||
If a zero is returned, there is no decomposition. If a non-zero is
|
||||
returned, then the `num' and `decomp' variables are filled in with the
|
||||
appropriate values.
|
||||
.sp
|
||||
Example call:
|
||||
.sp
|
||||
.nf
|
||||
unsigned long i, num, *decomp;
|
||||
|
||||
if (ucdecomp(0x1d5, &num, &decomp) != 0) {
|
||||
for (i = 0; i < num; i++)
|
||||
printf("0x%08lX,", decomp[i]);
|
||||
putchar('\n');
|
||||
}
|
||||
.TP 4
|
||||
.BR ucdecomp_hangul()
|
||||
This function determines if a Hangul syllable has a
|
||||
decomposition and returns the decomposition information.
|
||||
.sp
|
||||
An array of at least size 3 should be passed to the function
|
||||
for the decomposition of the syllable.
|
||||
.sp
|
||||
If a zero is returned, the character is not a Hangul
|
||||
syllable. If a non-zero is returned, the `num' field
|
||||
will be 2 or 3 and the syllable will be decomposed into
|
||||
the `decomp' array arithmetically.
|
||||
.sp
|
||||
Example call:
|
||||
.sp
|
||||
.nf
|
||||
unsigned long i, num, decomp[3];
|
||||
|
||||
if (ucdecomp_hangul(0xb1ba, &num, decomp) != 0) {
|
||||
for (i = 0; i < num; i++)
|
||||
printf("0x%08lX,", decomp[i]);
|
||||
putchar('\n');
|
||||
}
|
||||
.TP 4
|
||||
.BR ucnumber_lookup()
|
||||
This function determines if the code is a number and
|
||||
fills in the `num' field with the numerator and
|
||||
denominator. If the code happens to be a single digit,
|
||||
the numerator and denominator fields will be the same.
|
||||
.sp
|
||||
If the function returns 0, the code is not a number.
|
||||
Any other return value means the code is a number.
|
||||
.TP 4
|
||||
.BR ucdigit_lookup()
|
||||
This function determines if the code is a digit and
|
||||
fills in the `digit' field with the digit value.
|
||||
.sp
|
||||
If the function returns 0, the code is not a digit.
|
||||
Any other return value means the code is a digit.
|
||||
.TP 4
|
||||
.BR ucgetnumber()
|
||||
This is a compatibility function with John Cowan's
|
||||
"uctype" package. It uses ucnumber_lookup().
|
||||
.TP 4
|
||||
.BR ucgetdigit()
|
||||
This is a compatibility function with John Cowan's
|
||||
"uctype" package. It uses ucdigit_lookup().
|
||||
.TP 4
|
||||
.BR uctoupper()
|
||||
This function returns the code unchanged if it is
|
||||
already upper case or has no upper case equivalent.
|
||||
Otherwise the upper case equivalent is returned.
|
||||
.TP 4
|
||||
.BR uctolower()
|
||||
This function returns the code unchanged if it is
|
||||
already lower case or has no lower case equivalent.
|
||||
Otherwise the lower case equivalent is returned.
|
||||
.TP 4
|
||||
.BR uctotitle()
|
||||
This function returns the code unchanged if it is
|
||||
already title case or has no title case equivalent.
|
||||
Otherwise the title case equivalent is returned.
|
||||
.TP 4
|
||||
.BR ucisalpha()
|
||||
Test if \fIcode\fR is an alpha character.
|
||||
.TP 4
|
||||
.BR ucisalnum()
|
||||
Test if \fIcode\fR is an alpha or digit character.
|
||||
.TP 4
|
||||
.BR ucisdigit()
|
||||
Test if \fIcode\fR is a digit character.
|
||||
.TP 4
|
||||
.BR uciscntrl()
|
||||
Test if \fIcode\fR is a control character.
|
||||
.TP 4
|
||||
.BR ucisspace()
|
||||
Test if \fIcode\fR is a space character.
|
||||
.TP 4
|
||||
.BR ucisblank()
|
||||
Test if \fIcode\fR is a blank character.
|
||||
.TP 4
|
||||
.BR ucispunct()
|
||||
Test if \fIcode\fR is a punctuation character.
|
||||
.TP 4
|
||||
.BR ucisgraph()
|
||||
Test if \fIcode\fR is a graphical (visible) character.
|
||||
.TP 4
|
||||
.BR ucisprint()
|
||||
Test if \fIcode\fR is a printable character.
|
||||
.TP 4
|
||||
.BR ucisxdigit()
|
||||
Test if \fIcode\fR is a hexadecimal digit character.
|
||||
.TP 4
|
||||
.BR ucisupper()
|
||||
Test if \fIcode\fR is an upper case character.
|
||||
.TP 4
|
||||
.BR ucislower()
|
||||
Test if \fIcode\fR is a lower case character.
|
||||
.TP 4
|
||||
.BR ucistitle()
|
||||
Test if \fIcode\fR is a title case character.
|
||||
.TP 4
|
||||
.BR ucisisocntrl()
|
||||
Is the character a C0 control character (< 32)?
|
||||
.TP 4
|
||||
.BR ucisfmtcntrl()
|
||||
Is the character a format control character?
|
||||
.TP 4
|
||||
.BR ucissymbol()
|
||||
Is the character a symbol?
|
||||
.TP 4
|
||||
.BR ucisnumber()
|
||||
Is the character a number or digit?
|
||||
.TP 4
|
||||
.BR ucisnonspacing()
|
||||
Is the character non-spacing?
|
||||
.TP 4
|
||||
.BR ucisopenpunct()
|
||||
Is the character an open/left punctuation (e.g. '[')?
|
||||
.TP 4
|
||||
.BR ucisclosepunct()
|
||||
Is the character a close/right punctuation (e.g. ']')?
|
||||
.TP 4
|
||||
.BR ucisinitialpunct()
|
||||
Is the character an initial punctuation (i.e. U+2018 LEFT
|
||||
SINGLE QUOTATION MARK)
|
||||
.TP 4
|
||||
.BR ucisfinalpunct()
|
||||
Is the character a final punctuation (i.e. U+2019 RIGHT
|
||||
SINGLE QUOTATION MARK)
|
||||
.TP 4
|
||||
.BR uciscomposite()
|
||||
Can the character be decomposed into a set of other
|
||||
characters?
|
||||
.TP 4
|
||||
.BR ucisquote()
|
||||
Is the character one of the many quotation marks?
|
||||
.TP 4
|
||||
.BR ucissymmetric()
|
||||
Is the character one that has an opposite form
|
||||
(i.e. <>)
|
||||
.TP 4
|
||||
.BR ucismirroring()
|
||||
Is the character mirroring (superset of symmetric)?
|
||||
.TP 4
|
||||
.BR ucisnonbreaking()
|
||||
Is the character non-breaking (i.e. non-breaking
|
||||
space)?
|
||||
.TP 4
|
||||
.BR ucisrtl()
|
||||
Does the character have strong right-to-left
|
||||
directionality (i.e. Arabic letters)?
|
||||
.TP 4
|
||||
.BR ucisltr()
|
||||
Does the character have strong left-to-right
|
||||
directionality (i.e. Latin letters)?
|
||||
.TP 4
|
||||
.BR ucisstrong()
|
||||
Does the character have strong directionality?
|
||||
.TP 4
|
||||
.BR ucisweak()
|
||||
Does the character have weak directionality
|
||||
(i.e. numbers)?
|
||||
.TP 4
|
||||
.BR ucisneutral()
|
||||
Does the character have neutral directionality
|
||||
(i.e. whitespace)?
|
||||
.TP 4
|
||||
.BR ucisseparator()
|
||||
Is the character a block or segment separator?
|
||||
.TP 4
|
||||
.BR ucislsep()
|
||||
Is the character a line separator?
|
||||
.TP 4
|
||||
.BR ucispsep()
|
||||
Is the character a paragraph separator?
|
||||
.TP 4
|
||||
.BR ucismark()
|
||||
Is the character a mark of some kind?
|
||||
.TP 4
|
||||
.BR ucisnsmark()
|
||||
Is the character a non-spacing mark?
|
||||
.TP 4
|
||||
.BR ucisspmark()
|
||||
Is the character a spacing mark?
|
||||
.TP 4
|
||||
.BR ucismodif()
|
||||
Is the character a modifier letter?
|
||||
.TP 4
|
||||
.BR ucismodifsymbol()
|
||||
Is the character a modifier symbol?
|
||||
.TP 4
|
||||
.BR ucisletnum()
|
||||
Is the character a number represented by a letter?
|
||||
.TP 4
|
||||
.BR ucisconnect()
|
||||
Is the character connecting punctuation?
|
||||
.TP 4
|
||||
.BR ucisdash()
|
||||
Is the character dash punctuation?
|
||||
.TP 4
|
||||
.BR ucismath()
|
||||
Is the character a math character?
|
||||
.TP 4
|
||||
.BR uciscurrency()
|
||||
Is the character a currency character?
|
||||
.TP 4
|
||||
.BR ucisenclosing()
|
||||
Is the character enclosing (i.e. enclosing box)?
|
||||
.TP 4
|
||||
.BR ucisprivate()
|
||||
Is the character from the Private Use Area?
|
||||
.TP 4
|
||||
.BR ucissurrogate()
|
||||
Is the character one of the surrogate codes?
|
||||
.TP 4
|
||||
.BR ucisidentstart()
|
||||
Is the character a legal initial character of an identifier?
|
||||
.TP 4
|
||||
.BR ucisidentpart()
|
||||
Is the character a legal identifier character?
|
||||
.TP 4
|
||||
.BR ucisdefined()
|
||||
Is the character defined (appeared in one of the data
|
||||
files)?
|
||||
.TP 4
|
||||
.BR ucisundefined()
|
||||
Is the character not defined (non-Unicode)?
|
||||
.TP 4
|
||||
.BR ucishan()
|
||||
Is the character a Han ideograph?
|
||||
.TP 4
|
||||
.BR ucishangul()
|
||||
Is the character a pre-composed Hangul syllable?
|
||||
|
||||
.SH "SEE ALSO"
|
||||
ctype(3)
|
||||
|
||||
.SH ACKNOWLEDGMENTS
|
||||
These are people who have helped with patches or
|
||||
alerted me about problems.
|
||||
.sp
|
||||
John Cowan <cowan@locke.ccil.org>
|
||||
.br
|
||||
Bob Verbrugge <bob_verbrugge@nl.compuware.com>
|
||||
.br
|
||||
Christophe Pierret <cpierret@businessobjects.com>
|
||||
.br
|
||||
Kent Johnson <kent@pondview.mv.com>
|
||||
.br
|
||||
Valeriy E. Ushakov <uwe@ptc.spbu.ru>
|
||||
|
||||
.SH AUTHOR
|
||||
Mark Leisher
|
||||
.br
|
||||
Computing Research Lab
|
||||
.br
|
||||
New Mexico State University
|
||||
.br
|
||||
Email: mleisher@crl.nmsu.edu
|
||||
1485
libraries/liblunicode/ucdata/ucgendat.c
Normal file
1485
libraries/liblunicode/ucdata/ucgendat.c
Normal file
File diff suppressed because it is too large
Load diff
813
libraries/liblunicode/ucdata/ucpgba.c
Normal file
813
libraries/liblunicode/ucdata/ucpgba.c
Normal file
|
|
@ -0,0 +1,813 @@
|
|||
/*
|
||||
* Copyright 1999 Computing Research Labs, New Mexico State University
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#ifndef lint
|
||||
#ifdef __GNUC__
|
||||
static char rcsid[] __attribute__ ((unused)) = "$Id: ucpgba.c,v 1.4 1999/11/29 16:41:06 mleisher Exp $";
|
||||
#else
|
||||
static char rcsid[] = "$Id: ucpgba.c,v 1.4 1999/11/29 16:41:06 mleisher Exp $";
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "ucdata.h"
|
||||
#include "ucpgba.h"
|
||||
|
||||
/*
 * Helpers for reordering RTL runs when non-spacing characters sit inside
 * stretches of weakly directional text:
 *
 * ISWEAKSPECIAL  - the character is weak or non-spacing.
 * ISDIGITSPECIAL - the character is a digit or non-spacing.
 */
#define ISWEAKSPECIAL(cc) \
    ucisprop((cc), UC_EN|UC_ES|UC_MN, UC_ET|UC_AN|UC_CS)

#define ISDIGITSPECIAL(cc) \
    ucisprop((cc), UC_ND|UC_MN, 0)
|
||||
|
||||
/*
 * Classification macros used while splitting a string into runs of text
 * with different directions:
 *
 * ISLTR_LTR      - the character may belong to an LTR run in an LTR
 *                  context: it has an ltr, non-spacing, weak, or neutral
 *                  property.
 *
 * ISRTL_RTL      - the character may belong to an RTL run in an RTL
 *                  context: it has an rtl, non-spacing, weak, or neutral
 *                  property.
 *
 * ISRTL_NEUTRAL  - the character is RTL or neutral.
 *
 * ISWEAK_NEUTRAL - the character is weak or neutral.
 */
#define ISLTR_LTR(cc) \
    ucisprop((cc), UC_L|UC_MN|UC_EN|UC_ES, \
             UC_ET|UC_AN|UC_CS|UC_B|UC_S|UC_WS|UC_ON)

#define ISRTL_RTL(cc) \
    ucisprop((cc), UC_R|UC_MN|UC_EN|UC_ES, \
             UC_ET|UC_AN|UC_CS|UC_B|UC_S|UC_WS|UC_ON)

#define ISRTL_NEUTRAL(cc) \
    ucisprop((cc), UC_R, UC_B|UC_S|UC_WS|UC_ON)

#define ISWEAK_NEUTRAL(cc) \
    ucisprop((cc), UC_EN|UC_ES, \
             UC_B|UC_S|UC_WS|UC_ON|UC_ET|UC_AN|UC_CS)
|
||||
|
||||
/*
 * Mirror table for paired punctuation.  Entries are stored as
 * (character, counterpart) and every pair appears in both directions, so
 * a lookup only needs to examine the even-indexed slots.
 *
 * The table is hard-coded for now, until it can be generated
 * automatically.
 */
static unsigned long _symmetric_pairs[] = {
    0x0028, 0x0029,  0x0029, 0x0028,
    0x003C, 0x003E,  0x003E, 0x003C,
    0x005B, 0x005D,  0x005D, 0x005B,
    0x007B, 0x007D,  0x007D, 0x007B,
    0x2045, 0x2046,  0x2046, 0x2045,
    0x207D, 0x207E,  0x207E, 0x207D,
    0x208D, 0x208E,  0x208E, 0x208D,
    0x3008, 0x3009,  0x3009, 0x3008,
    0x300A, 0x300B,  0x300B, 0x300A,
    0x300C, 0x300D,  0x300D, 0x300C,
    0x300E, 0x300F,  0x300F, 0x300E,
    0x3010, 0x3011,  0x3011, 0x3010,
    0x3014, 0x3015,  0x3015, 0x3014,
    0x3016, 0x3017,  0x3017, 0x3016,
    0x3018, 0x3019,  0x3019, 0x3018,
    0x301A, 0x301B,  0x301B, 0x301A,
    0xFD3E, 0xFD3F,  0xFD3F, 0xFD3E,
    0xFE59, 0xFE5A,  0xFE5A, 0xFE59,
    0xFE5B, 0xFE5C,  0xFE5C, 0xFE5B,
    0xFE5D, 0xFE5E,  0xFE5E, 0xFE5D,
    0xFF08, 0xFF09,  0xFF09, 0xFF08,
    0xFF3B, 0xFF3D,  0xFF3D, 0xFF3B,
    0xFF5B, 0xFF5D,  0xFF5D, 0xFF5B,
    0xFF62, 0xFF63,  0xFF63, 0xFF62,
};

/* Number of entries (individual code points, not pairs) in the table. */
static int _symmetric_pairs_size =
    sizeof(_symmetric_pairs) / sizeof(_symmetric_pairs[0]);

/*
 * Return the counterpart of `c' from the mirror table, or `c' itself when
 * the table has no entry for it.
 */
static unsigned long
#ifdef __STDC__
_ucsymmetric_pair(unsigned long c)
#else
_ucsymmetric_pair(c)
unsigned long c;
#endif
{
    unsigned long *entry = _symmetric_pairs;
    unsigned long *limit = _symmetric_pairs + _symmetric_pairs_size;

    /* Step over the table one (character, counterpart) pair at a time. */
    while (entry < limit) {
        if (entry[0] == c)
          return entry[1];
        entry += 2;
    }

    return c;
}
|
||||
|
||||
/*
|
||||
* This routine creates a new run, copies the text into it, links it into the
|
||||
* logical text order chain and returns it to the caller to be linked into
|
||||
* the visual text order chain.
|
||||
*/
|
||||
static ucrun_t *
|
||||
#ifdef __STDC__
|
||||
_add_run(ucstring_t *str, unsigned long *src,
|
||||
unsigned long start, unsigned long end, int direction)
|
||||
#else
|
||||
_add_run(str, src, start, end, direction)
|
||||
ucstring_t *str;
|
||||
unsigned long *src, start, end;
|
||||
int direction;
|
||||
#endif
|
||||
{
|
||||
long i, t;
|
||||
ucrun_t *run;
|
||||
|
||||
run = (ucrun_t *) malloc(sizeof(ucrun_t));
|
||||
run->visual_next = run->visual_prev = 0;
|
||||
run->direction = direction;
|
||||
|
||||
run->cursor = ~0;
|
||||
|
||||
run->chars = (unsigned long *)
|
||||
malloc(sizeof(unsigned long) * ((end - start) << 1));
|
||||
run->positions = run->chars + (end - start);
|
||||
|
||||
run->source = src;
|
||||
run->start = start;
|
||||
run->end = end;
|
||||
|
||||
if (direction == UCPGBA_RTL) {
|
||||
/*
|
||||
* Copy the source text into the run in reverse order and select
|
||||
* replacements for the pairwise punctuation and the <> characters.
|
||||
*/
|
||||
for (i = 0, t = end - 1; start < end; start++, t--, i++) {
|
||||
run->positions[i] = t;
|
||||
if (ucissymmetric(src[t]) || src[t] == '<' || src[t] == '>')
|
||||
run->chars[i] = _ucsymmetric_pair(src[t]);
|
||||
else
|
||||
run->chars[i] = src[t];
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* Copy the source text into the run directly.
|
||||
*/
|
||||
for (i = start; i < end; i++) {
|
||||
run->positions[i - start] = i;
|
||||
run->chars[i - start] = src[i];
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Add the run to the logical list for cursor traversal.
|
||||
*/
|
||||
if (str->logical_first == 0)
|
||||
str->logical_first = str->logical_last = run;
|
||||
else {
|
||||
run->logical_prev = str->logical_last;
|
||||
str->logical_last->logical_next = run;
|
||||
str->logical_last = run;
|
||||
}
|
||||
|
||||
return run;
|
||||
}
|
||||
|
||||
static void
|
||||
#ifdef __STDC__
|
||||
_ucadd_rtl_segment(ucstring_t *str, unsigned long *source, unsigned long start,
|
||||
unsigned long end)
|
||||
#else
|
||||
_ucadd_rtl_segment(str, source, start, end)
|
||||
ucstring_t *str;
|
||||
unsigned long *source, start, end;
|
||||
#endif
|
||||
{
|
||||
unsigned long s, e;
|
||||
ucrun_t *run, *lrun;
|
||||
|
||||
/*
|
||||
* This is used to splice runs into strings with overall LTR direction.
|
||||
* The `lrun' variable will never be NULL because at least one LTR run was
|
||||
* added before this RTL run.
|
||||
*/
|
||||
lrun = str->visual_last;
|
||||
|
||||
for (e = s = start; s < end;) {
|
||||
for (; e < end && ISRTL_NEUTRAL(source[e]); e++) ;
|
||||
|
||||
if (e > s) {
|
||||
run = _add_run(str, source, s, e, UCPGBA_RTL);
|
||||
|
||||
/*
|
||||
* Add the run to the visual list for cursor traversal.
|
||||
*/
|
||||
if (str->visual_first != 0) {
|
||||
if (str->direction == UCPGBA_LTR) {
|
||||
run->visual_prev = lrun;
|
||||
run->visual_next = lrun->visual_next;
|
||||
if (lrun->visual_next != 0)
|
||||
lrun->visual_next->visual_prev = run;
|
||||
lrun->visual_next = run;
|
||||
if (lrun == str->visual_last)
|
||||
str->visual_last = run;
|
||||
} else {
|
||||
run->visual_next = str->visual_first;
|
||||
str->visual_first->visual_prev = run;
|
||||
str->visual_first = run;
|
||||
}
|
||||
} else
|
||||
str->visual_first = str->visual_last = run;
|
||||
}
|
||||
|
||||
/*
|
||||
* Now handle the weak sequences such that multiple non-digit groups
|
||||
* are kept together appropriately and added as RTL sequences.
|
||||
*/
|
||||
for (s = e; e < end && ISWEAKSPECIAL(source[e]); e++) {
|
||||
if (!ISDIGITSPECIAL(source[e]) &&
|
||||
(e + 1 == end || !ISDIGITSPECIAL(source[e + 1])))
|
||||
break;
|
||||
}
|
||||
|
||||
if (e > s) {
|
||||
run = _add_run(str, source, s, e, UCPGBA_LTR);
|
||||
|
||||
/*
|
||||
* Add the run to the visual list for cursor traversal.
|
||||
*/
|
||||
if (str->visual_first != 0) {
|
||||
if (str->direction == UCPGBA_LTR) {
|
||||
run->visual_prev = lrun;
|
||||
run->visual_next = lrun->visual_next;
|
||||
if (lrun->visual_next != 0)
|
||||
lrun->visual_next->visual_prev = run;
|
||||
lrun->visual_next = run;
|
||||
if (lrun == str->visual_last)
|
||||
str->visual_last = run;
|
||||
} else {
|
||||
run->visual_next = str->visual_first;
|
||||
str->visual_first->visual_prev = run;
|
||||
str->visual_first = run;
|
||||
}
|
||||
} else
|
||||
str->visual_first = str->visual_last = run;
|
||||
}
|
||||
|
||||
/*
|
||||
* Collect all weak non-digit sequences for an RTL segment. These
|
||||
* will appear as part of the next RTL segment or will be added as
|
||||
* an RTL segment by themselves.
|
||||
*/
|
||||
for (s = e; e < end && ucisweak(source[e]) && !ucisdigit(source[e]);
|
||||
e++) ;
|
||||
}
|
||||
|
||||
/*
|
||||
* Capture any weak non-digit sequences that occur at the end of the RTL
|
||||
* run.
|
||||
*/
|
||||
if (e > s) {
|
||||
run = _add_run(str, source, s, e, UCPGBA_RTL);
|
||||
|
||||
/*
|
||||
* Add the run to the visual list for cursor traversal.
|
||||
*/
|
||||
if (str->visual_first != 0) {
|
||||
if (str->direction == UCPGBA_LTR) {
|
||||
run->visual_prev = lrun;
|
||||
run->visual_next = lrun->visual_next;
|
||||
if (lrun->visual_next != 0)
|
||||
lrun->visual_next->visual_prev = run;
|
||||
lrun->visual_next = run;
|
||||
if (lrun == str->visual_last)
|
||||
str->visual_last = run;
|
||||
} else {
|
||||
run->visual_next = str->visual_first;
|
||||
str->visual_first->visual_prev = run;
|
||||
str->visual_first = run;
|
||||
}
|
||||
} else
|
||||
str->visual_first = str->visual_last = run;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
#ifdef __STDC__
|
||||
_ucadd_ltr_segment(ucstring_t *str, unsigned long *source, unsigned long start,
|
||||
unsigned long end)
|
||||
#else
|
||||
_ucadd_ltr_segment(str, source, start, end)
|
||||
ucstring_t *str;
|
||||
unsigned long *source, start, end;
|
||||
#endif
|
||||
{
|
||||
ucrun_t *run;
|
||||
|
||||
run = _add_run(str, source, start, end, UCPGBA_LTR);
|
||||
|
||||
/*
|
||||
* Add the run to the visual list for cursor traversal.
|
||||
*/
|
||||
if (str->visual_first != 0) {
|
||||
if (str->direction == UCPGBA_LTR) {
|
||||
run->visual_prev = str->visual_last;
|
||||
str->visual_last->visual_next = run;
|
||||
str->visual_last = run;
|
||||
} else {
|
||||
run->visual_next = str->visual_first;
|
||||
str->visual_first->visual_prev = run;
|
||||
str->visual_first = run;
|
||||
}
|
||||
} else
|
||||
str->visual_first = str->visual_last = run;
|
||||
}
|
||||
|
||||
ucstring_t *
|
||||
#ifdef __STDC__
|
||||
ucstring_create(unsigned long *source, unsigned long start, unsigned long end,
|
||||
int default_direction, int cursor_motion)
|
||||
#else
|
||||
ucstring_create(source, start, end, default_direction, cursor_motion)
|
||||
unsigned long *source, start, end;
|
||||
int default_direction, cursor_motion;
|
||||
#endif
|
||||
{
|
||||
int rtl_first;
|
||||
unsigned long s, e;
|
||||
ucstring_t *str;
|
||||
|
||||
str = (ucstring_t *) malloc(sizeof(ucstring_t));
|
||||
|
||||
/*
|
||||
* Set the initial values.
|
||||
*/
|
||||
str->cursor_motion = cursor_motion;
|
||||
str->logical_first = str->logical_last = 0;
|
||||
str->visual_first = str->visual_last = str->cursor = 0;
|
||||
str->source = source;
|
||||
str->start = start;
|
||||
str->end = end;
|
||||
|
||||
/*
|
||||
* If the length of the string is 0, then just return it at this point.
|
||||
*/
|
||||
if (start == end)
|
||||
return str;
|
||||
|
||||
/*
|
||||
* This flag indicates whether the collection loop for RTL is called
|
||||
* before the LTR loop the first time.
|
||||
*/
|
||||
rtl_first = 0;
|
||||
|
||||
/*
|
||||
* Look for the first character in the string that has strong
|
||||
* directionality.
|
||||
*/
|
||||
for (s = start; s < end && !ucisstrong(source[s]); s++) ;
|
||||
|
||||
if (s == end)
|
||||
/*
|
||||
* If the string contains no characters with strong directionality, use
|
||||
* the default direction.
|
||||
*/
|
||||
str->direction = default_direction;
|
||||
else
|
||||
str->direction = ucisrtl(source[s]) ? UCPGBA_RTL : UCPGBA_LTR;
|
||||
|
||||
if (str->direction == UCPGBA_RTL)
|
||||
/*
|
||||
* Set the flag that causes the RTL collection loop to run first.
|
||||
*/
|
||||
rtl_first = 1;
|
||||
|
||||
/*
|
||||
* This loop now separates the string into runs based on directionality.
|
||||
*/
|
||||
for (s = e = 0; s < end; s = e) {
|
||||
if (!rtl_first) {
|
||||
/*
|
||||
* Determine the next run of LTR text.
|
||||
*/
|
||||
|
||||
while (e < end && ISLTR_LTR(source[e]))
|
||||
e++;
|
||||
if (str->direction != UCPGBA_LTR) {
|
||||
while (e > s && ISWEAK_NEUTRAL(source[e - 1]))
|
||||
e--;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add the LTR segment to the string.
|
||||
*/
|
||||
if (e > s)
|
||||
_ucadd_ltr_segment(str, source, s, e);
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine the next run of RTL text.
|
||||
*/
|
||||
s = e;
|
||||
while (e < end && ISRTL_RTL(source[e]))
|
||||
e++;
|
||||
if (str->direction != UCPGBA_RTL) {
|
||||
while (e > s && ISWEAK_NEUTRAL(source[e - 1]))
|
||||
e--;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add the RTL segment to the string.
|
||||
*/
|
||||
if (e > s)
|
||||
_ucadd_rtl_segment(str, source, s, e);
|
||||
|
||||
/*
|
||||
* Clear the flag that allowed the RTL collection loop to run first
|
||||
* for strings with overall RTL directionality.
|
||||
*/
|
||||
rtl_first = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up the initial cursor run.
|
||||
*/
|
||||
str->cursor = str->logical_first;
|
||||
if (str != 0)
|
||||
str->cursor->cursor = (str->cursor->direction == UCPGBA_RTL) ?
|
||||
str->cursor->end - str->cursor->start : 0;
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
void
|
||||
#ifdef __STDC__
|
||||
ucstring_free(ucstring_t *s)
|
||||
#else
|
||||
ucstring_free(s)
|
||||
ucstring_t *s;
|
||||
#endif
|
||||
{
|
||||
ucrun_t *l, *r;
|
||||
|
||||
if (s == 0)
|
||||
return;
|
||||
|
||||
for (l = 0, r = s->visual_first; r != 0; r = r->visual_next) {
|
||||
if (r->end > r->start)
|
||||
free((char *) r->chars);
|
||||
if (l)
|
||||
free((char *) l);
|
||||
l = r;
|
||||
}
|
||||
if (l)
|
||||
free((char *) l);
|
||||
|
||||
free((char *) s);
|
||||
}
|
||||
|
||||
int
|
||||
#ifdef __STDC__
|
||||
ucstring_set_cursor_motion(ucstring_t *str, int cursor_motion)
|
||||
#else
|
||||
ucstring_set_cursor_motion(s, cursor_motion)
|
||||
ucstring_t *str;
|
||||
int cursor_motion;
|
||||
#endif
|
||||
{
|
||||
int n;
|
||||
|
||||
if (str == 0)
|
||||
return -1;
|
||||
|
||||
n = str->cursor_motion;
|
||||
str->cursor_motion = cursor_motion;
|
||||
return n;
|
||||
}
|
||||
|
||||
static int
|
||||
#ifdef __STDC__
|
||||
_ucstring_visual_cursor_right(ucstring_t *str, int count)
|
||||
#else
|
||||
_ucstring_visual_cursor_right(str, count)
|
||||
ucstring_t *str;
|
||||
int count;
|
||||
#endif
|
||||
{
|
||||
int cnt = count;
|
||||
unsigned long size;
|
||||
ucrun_t *cursor;
|
||||
|
||||
if (str == 0)
|
||||
return 0;
|
||||
|
||||
cursor = str->cursor;
|
||||
while (cnt > 0) {
|
||||
size = cursor->end - cursor->start;
|
||||
if ((cursor->direction == UCPGBA_RTL && cursor->cursor + 1 == size) ||
|
||||
cursor->cursor + 1 > size) {
|
||||
/*
|
||||
* If the next run is NULL, then the cursor is already on the
|
||||
* far right end already.
|
||||
*/
|
||||
if (cursor->visual_next == 0)
|
||||
/*
|
||||
* If movement occured, then report it.
|
||||
*/
|
||||
return (cnt != count);
|
||||
|
||||
/*
|
||||
* Move to the next run.
|
||||
*/
|
||||
str->cursor = cursor = cursor->visual_next;
|
||||
cursor->cursor = (cursor->direction == UCPGBA_RTL) ? -1 : 0;
|
||||
size = cursor->end - cursor->start;
|
||||
} else
|
||||
cursor->cursor++;
|
||||
cnt--;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int
|
||||
#ifdef __STDC__
|
||||
_ucstring_logical_cursor_right(ucstring_t *str, int count)
|
||||
#else
|
||||
_ucstring_logical_cursor_right(str, count)
|
||||
ucstring_t *str;
|
||||
int count;
|
||||
#endif
|
||||
{
|
||||
int cnt = count;
|
||||
unsigned long size;
|
||||
ucrun_t *cursor;
|
||||
|
||||
if (str == 0)
|
||||
return 0;
|
||||
|
||||
cursor = str->cursor;
|
||||
while (cnt > 0) {
|
||||
size = cursor->end - cursor->start;
|
||||
if (str->direction == UCPGBA_RTL) {
|
||||
if (cursor->direction == UCPGBA_RTL) {
|
||||
if (cursor->cursor + 1 == size) {
|
||||
if (cursor == str->logical_first)
|
||||
/*
|
||||
* Already at the beginning of the string.
|
||||
*/
|
||||
return (cnt != count);
|
||||
|
||||
str->cursor = cursor = cursor->logical_prev;
|
||||
size = cursor->end - cursor->start;
|
||||
cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
|
||||
size : 0;
|
||||
} else
|
||||
cursor->cursor++;
|
||||
} else {
|
||||
if (cursor->cursor == 0) {
|
||||
if (cursor == str->logical_first)
|
||||
/*
|
||||
* At the beginning of the string already.
|
||||
*/
|
||||
return (cnt != count);
|
||||
|
||||
str->cursor = cursor = cursor->logical_prev;
|
||||
size = cursor->end - cursor->start;
|
||||
cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
|
||||
size : 0;
|
||||
} else
|
||||
cursor->cursor--;
|
||||
}
|
||||
} else {
|
||||
if (cursor->direction == UCPGBA_RTL) {
|
||||
if (cursor->cursor == 0) {
|
||||
if (cursor == str->logical_last)
|
||||
/*
|
||||
* Already at the end of the string.
|
||||
*/
|
||||
return (cnt != count);
|
||||
|
||||
str->cursor = cursor = cursor->logical_next;
|
||||
size = cursor->end - cursor->start;
|
||||
cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
|
||||
0 : size - 1;
|
||||
} else
|
||||
cursor->cursor--;
|
||||
} else {
|
||||
if (cursor->cursor + 1 > size) {
|
||||
if (cursor == str->logical_last)
|
||||
/*
|
||||
* Already at the end of the string.
|
||||
*/
|
||||
return (cnt != count);
|
||||
|
||||
str->cursor = cursor = cursor->logical_next;
|
||||
cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
|
||||
0 : size - 1;
|
||||
} else
|
||||
cursor->cursor++;
|
||||
}
|
||||
}
|
||||
cnt--;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
int
|
||||
#ifdef __STDC__
|
||||
ucstring_cursor_right(ucstring_t *str, int count)
|
||||
#else
|
||||
ucstring_cursor_right(str, count)
|
||||
ucstring_t *str;
|
||||
int count;
|
||||
#endif
|
||||
{
|
||||
if (str == 0)
|
||||
return 0;
|
||||
return (str->cursor_motion == UCPGBA_CURSOR_VISUAL) ?
|
||||
_ucstring_visual_cursor_right(str, count) :
|
||||
_ucstring_logical_cursor_right(str, count);
|
||||
}
|
||||
|
||||
static int
|
||||
#ifdef __STDC__
|
||||
_ucstring_visual_cursor_left(ucstring_t *str, int count)
|
||||
#else
|
||||
_ucstring_visual_cursor_left(str, count)
|
||||
ucstring_t *str;
|
||||
int count;
|
||||
#endif
|
||||
{
|
||||
int cnt = count;
|
||||
unsigned long size;
|
||||
ucrun_t *cursor;
|
||||
|
||||
if (str == 0)
|
||||
return 0;
|
||||
|
||||
cursor = str->cursor;
|
||||
while (cnt > 0) {
|
||||
size = cursor->end - cursor->start;
|
||||
if ((cursor->direction == UCPGBA_LTR && cursor->cursor == 0) ||
|
||||
cursor->cursor - 1 < -1) {
|
||||
/*
|
||||
* If the preceding run is NULL, then the cursor is already on the
|
||||
* far left end already.
|
||||
*/
|
||||
if (cursor->visual_prev == 0)
|
||||
/*
|
||||
* If movement occured, then report it.
|
||||
*/
|
||||
return (cnt != count);
|
||||
|
||||
/*
|
||||
* Move to the previous run.
|
||||
*/
|
||||
str->cursor = cursor = cursor->visual_prev;
|
||||
size = cursor->end - cursor->start;
|
||||
cursor->cursor = (cursor->direction == UCPGBA_RTL) ?
|
||||
size : size - 1;
|
||||
} else
|
||||
cursor->cursor--;
|
||||
cnt--;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int
|
||||
#ifdef __STDC__
|
||||
_ucstring_logical_cursor_left(ucstring_t *str, int count)
|
||||
#else
|
||||
_ucstring_logical_cursor_left(str, count)
|
||||
ucstring_t *str;
|
||||
int count;
|
||||
#endif
|
||||
{
|
||||
int cnt = count;
|
||||
unsigned long size;
|
||||
ucrun_t *cursor;
|
||||
|
||||
if (str == 0)
|
||||
return 0;
|
||||
|
||||
cursor = str->cursor;
|
||||
while (cnt > 0) {
|
||||
size = cursor->end - cursor->start;
|
||||
if (str->direction == UCPGBA_RTL) {
|
||||
if (cursor->direction == UCPGBA_RTL) {
|
||||
if (cursor->cursor == -1) {
|
||||
if (cursor == str->logical_last)
|
||||
/*
|
||||
* Already at the end of the string.
|
||||
*/
|
||||
return (cnt != count);
|
||||
|
||||
str->cursor = cursor = cursor->logical_next;
|
||||
size = cursor->end - cursor->start;
|
||||
cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
|
||||
0 : size - 1;
|
||||
} else
|
||||
cursor->cursor--;
|
||||
} else {
|
||||
if (cursor->cursor + 1 > size) {
|
||||
if (cursor == str->logical_last)
|
||||
/*
|
||||
* At the end of the string already.
|
||||
*/
|
||||
return (cnt != count);
|
||||
|
||||
str->cursor = cursor = cursor->logical_next;
|
||||
size = cursor->end - cursor->start;
|
||||
cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
|
||||
0 : size - 1;
|
||||
} else
|
||||
cursor->cursor++;
|
||||
}
|
||||
} else {
|
||||
if (cursor->direction == UCPGBA_RTL) {
|
||||
if (cursor->cursor + 1 == size) {
|
||||
if (cursor == str->logical_first)
|
||||
/*
|
||||
* Already at the beginning of the string.
|
||||
*/
|
||||
return (cnt != count);
|
||||
|
||||
str->cursor = cursor = cursor->logical_prev;
|
||||
size = cursor->end - cursor->start;
|
||||
cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
|
||||
size : 0;
|
||||
} else
|
||||
cursor->cursor++;
|
||||
} else {
|
||||
if (cursor->cursor == 0) {
|
||||
if (cursor == str->logical_first)
|
||||
/*
|
||||
* Already at the beginning of the string.
|
||||
*/
|
||||
return (cnt != count);
|
||||
|
||||
str->cursor = cursor = cursor->logical_prev;
|
||||
cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
|
||||
size : 0;
|
||||
} else
|
||||
cursor->cursor--;
|
||||
}
|
||||
}
|
||||
cnt--;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
int
|
||||
#ifdef __STDC__
|
||||
ucstring_cursor_left(ucstring_t *str, int count)
|
||||
#else
|
||||
ucstring_cursor_left(str, count)
|
||||
ucstring_t *str;
|
||||
int count;
|
||||
#endif
|
||||
{
|
||||
if (str == 0)
|
||||
return 0;
|
||||
return (str->cursor_motion == UCPGBA_CURSOR_VISUAL) ?
|
||||
_ucstring_visual_cursor_left(str, count) :
|
||||
_ucstring_logical_cursor_left(str, count);
|
||||
}
|
||||
|
||||
void
|
||||
#ifdef __STDC__
|
||||
ucstring_cursor_info(ucstring_t *str, int *direction, unsigned long *position)
|
||||
#else
|
||||
ucstring_cursor_info(str, direction, position)
|
||||
ucstring_t *str, int *direction;
|
||||
unsigned long *position;
|
||||
#endif
|
||||
{
|
||||
long c;
|
||||
unsigned long size;
|
||||
ucrun_t *cursor;
|
||||
|
||||
if (str == 0 || direction == 0 || position == 0)
|
||||
return;
|
||||
|
||||
cursor = str->cursor;
|
||||
|
||||
*direction = cursor->direction;
|
||||
|
||||
c = cursor->cursor;
|
||||
size = cursor->end - cursor->start;
|
||||
|
||||
if (c == size)
|
||||
*position = (cursor->direction == UCPGBA_RTL) ?
|
||||
cursor->start : cursor->positions[c - 1];
|
||||
else if (c == -1)
|
||||
*position = (cursor->direction == UCPGBA_RTL) ?
|
||||
cursor->end : cursor->start;
|
||||
else
|
||||
*position = cursor->positions[c];
|
||||
}
|
||||
162
libraries/liblunicode/ucdata/ucpgba.h
Normal file
162
libraries/liblunicode/ucdata/ucpgba.h
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
/*
|
||||
* Copyright 1999 Computing Research Labs, New Mexico State University
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#ifndef _h_ucpgba
|
||||
#define _h_ucpgba
|
||||
|
||||
/*
|
||||
* $Id: ucpgba.h,v 1.4 1999/11/19 15:24:30 mleisher Exp $
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#undef __
|
||||
#ifdef __STDC__
|
||||
#define __(x) x
|
||||
#else
|
||||
#define __(x) ()
|
||||
#endif
|
||||
|
||||
/***************************************************************************
|
||||
*
|
||||
* Macros and types.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
/*
|
||||
* These are the direction values that can appear in render runs and render
|
||||
* strings.
|
||||
*/
|
||||
#define UCPGBA_LTR 0
|
||||
#define UCPGBA_RTL 1
|
||||
|
||||
/*
|
||||
* These are the flags for cursor motion.
|
||||
*/
|
||||
#define UCPGBA_CURSOR_VISUAL 0
|
||||
#define UCPGBA_CURSOR_LOGICAL 1
|
||||
|
||||
/*
|
||||
* This structure is used to contain runs of text in a particular direction.
|
||||
*/
|
||||
/*
 * One run of text with a single direction.  Every run is a member of two
 * doubly linked lists at once: the visual list and the logical list.  The
 * cursor routines compute the length of a run as `end - start'.
 */
typedef struct _ucrun_t {
|
||||
struct _ucrun_t *visual_prev; /* Pointer to the previous visual run. */
|
||||
struct _ucrun_t *visual_next; /* Pointer to the next visual run. */
|
||||
|
||||
struct _ucrun_t *logical_prev; /* Pointer to the previous logical run. */
|
||||
struct _ucrun_t *logical_next; /* Pointer to the next logical run. */
|
||||
|
||||
int direction; /* Direction of the run: UCPGBA_LTR or UCPGBA_RTL. */
|
||||
|
||||
long cursor; /* Position of "cursor" relative to this run.  It is signed because ucstring_cursor_info() treats -1 and the run length (end - start) as edge positions in addition to ordinary indexes. */
|
||||
|
||||
unsigned long *chars; /* List of characters for the run. */
|
||||
unsigned long *positions; /* List of original positions in source; ucstring_cursor_info() indexes it with the run-relative cursor. */
|
||||
|
||||
unsigned long *source; /* The source string. */
|
||||
unsigned long start; /* Beginning offset in the source string. */
|
||||
unsigned long end; /* Ending offset in the source string. */
|
||||
} ucrun_t;
|
||||
|
||||
/*
|
||||
* This represents a string of runs rendered up to a point that is not
|
||||
* platform specific.
|
||||
*/
|
||||
/*
 * A reordered string: the list of runs (reachable both in logical and in
 * visual order) together with the run that currently holds the cursor and
 * the flag that selects how the cursor moves.
 */
typedef struct _ucstring_t {
|
||||
int direction; /* Overall direction of the string: UCPGBA_LTR or UCPGBA_RTL. */
|
||||
|
||||
int cursor_motion; /* Logical or visual cursor motion flag: UCPGBA_CURSOR_VISUAL or UCPGBA_CURSOR_LOGICAL. */
|
||||
|
||||
ucrun_t *cursor; /* The run containing the "cursor." */
|
||||
|
||||
ucrun_t *logical_first; /* First run in the logical order. */
|
||||
ucrun_t *logical_last; /* Last run in the logical order. */
|
||||
|
||||
ucrun_t *visual_first; /* First run in the visual order. */
|
||||
ucrun_t *visual_last; /* Last run in the visual order. */
|
||||
|
||||
unsigned long *source; /* The source string. */
|
||||
unsigned long start; /* The beginning offset in the source. */
|
||||
unsigned long end; /* The ending offset in the source. */
|
||||
} ucstring_t;
|
||||
|
||||
/***************************************************************************
|
||||
*
|
||||
* API
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
/*
|
||||
* This creates and reorders the specified substring using the
|
||||
* "Pretty Good Bidi Algorithm." A default direction is provided for cases
|
||||
* of a string containing no strong direction characters and the default
|
||||
* cursor motion should be provided.
|
||||
*/
|
||||
extern ucstring_t *ucstring_create __((unsigned long *source,
|
||||
unsigned long start,
|
||||
unsigned long end,
|
||||
int default_direction,
|
||||
int cursor_motion));
|
||||
/*
|
||||
* This releases the string.
|
||||
*/
|
||||
extern void ucstring_free __((ucstring_t *string));
|
||||
|
||||
/*
|
||||
* This changes the cursor motion flag for the string.
|
||||
*/
|
||||
extern int ucstring_set_cursor_motion __((ucstring_t *string,
|
||||
int cursor_motion));
|
||||
|
||||
/*
|
||||
* This function will move the cursor to the right depending on the
|
||||
* type of cursor motion that was specified for the string.
|
||||
*
|
||||
* A 0 is returned if no cursor motion is performed, otherwise a
|
||||
* 1 is returned.
|
||||
*/
|
||||
extern int ucstring_cursor_right __((ucstring_t *string, int count));
|
||||
|
||||
/*
|
||||
* This function will move the cursor to the left depending on the
|
||||
* type of cursor motion that was specified for the string.
|
||||
*
|
||||
* A 0 is returned if no cursor motion is performed, otherwise a
|
||||
* 1 is returned.
|
||||
*/
|
||||
extern int ucstring_cursor_left __((ucstring_t *string, int count));
|
||||
|
||||
/*
|
||||
* This routine retrieves the direction of the run containing the cursor
|
||||
* and the actual position in the original text string.
|
||||
*/
|
||||
extern void ucstring_cursor_info __((ucstring_t *string, int *direction,
|
||||
unsigned long *position));
|
||||
|
||||
#undef __
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _h_ucpgba */
|
||||
97
libraries/liblunicode/ucdata/ucpgba.man
Normal file
97
libraries/liblunicode/ucdata/ucpgba.man
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
.\"
|
||||
.\" $Id: ucpgba.man,v 1.1 1999/11/19 16:08:34 mleisher Exp $
|
||||
.\"
|
||||
.TH ucpgba 3 "19 November 1999"
|
||||
.SH NAME
|
||||
ucpgba \- functions for doing bidirectional reordering of Unicode text and
|
||||
logical and visual cursor motion
|
||||
|
||||
.SH SYNOPSIS
|
||||
.nf
|
||||
#include <ucdata.h>
|
||||
#include <ucpgba.h>
|
||||
|
||||
ucstring_t *ucstring_create(unsigned long *source, unsigned long start,
|
||||
unsigned long end, int default_direction,
|
||||
int cursor_motion)
|
||||
.sp
|
||||
void ucstring_free(ucstring_t *string)
|
||||
.sp
|
||||
int ucstring_set_cursor_motion(ucstring_t *string, int cursor_motion)
|
||||
.sp
|
||||
int ucstring_cursor_right(ucstring_t *string, int count)
|
||||
.sp
|
||||
int ucstring_cursor_left(ucstring_t *string, int count)
|
||||
.sp
|
||||
void ucstring_cursor_info(ucstring_t *string, int *direction,
|
||||
unsigned long *position)
|
||||
|
||||
.SH DESCRIPTION
|
||||
.TP 4
|
||||
.BR Macros
|
||||
UCPGBA_LTR
|
||||
.br
|
||||
UCPGBA_RTL
|
||||
.br
|
||||
UCPGBA_CURSOR_VISUAL
|
||||
.br
|
||||
UCPGBA_CURSOR_LOGICAL
|
||||
|
||||
.TP 4
|
||||
.BR ucstring_create()
|
||||
This function will create a reordered string by using the implicit
|
||||
directionality of the characters in the specified substring.
|
||||
.sp
|
||||
The `default_direction' parameter should be one of UCPGBA_LTR or UCPGBA_RTL
|
||||
and is used only in cases where a string contains no characters with strong
|
||||
directionality.
|
||||
.sp
|
||||
The `cursor_motion' parameter should be one of UCPGBA_CURSOR_VISUAL or
|
||||
UCPGBA_CURSOR_LOGICAL, and is used to specify the initial cursor motion
|
||||
behavior. This behavior can be switched at any time using
|
||||
ucstring_set_cursor_motion().
|
||||
|
||||
.TP 4
|
||||
.BR ucstring_free()
|
||||
This function will deallocate the memory used by the string, including the
|
||||
string itself.
|
||||
|
||||
.TP 4
|
||||
.BR ucstring_cursor_info()
|
||||
This function will return the text position of the internal cursor and the
|
||||
directionality of the text at that position. The position returned is the
|
||||
original text position of the character.
|
||||
|
||||
.TP 4
|
||||
.BR ucstring_set_cursor_motion()
|
||||
This function will change the cursor motion type and return the previous
|
||||
cursor motion type.
|
||||
|
||||
.TP 4
|
||||
.BR ucstring_cursor_right()
|
||||
This function will move the internal cursor to the right according to the
|
||||
type of cursor motion set for the string.
|
||||
.sp
|
||||
If no cursor motion is performed, it returns 0. Otherwise it will return a 1.
|
||||
|
||||
.TP 4
|
||||
.BR ucstring_cursor_left()
|
||||
This function will move the internal cursor to the left according to the
|
||||
type of cursor motion set for the string.
|
||||
.sp
|
||||
If no cursor motion is performed, it returns 0. Otherwise it will return a 1.
|
||||
|
||||
.SH "SEE ALSO"
|
||||
ucdata(3)
|
||||
|
||||
.SH ACKNOWLEDGMENTS
|
||||
These are people who have helped with patches or alerted me about problems.
|
||||
|
||||
.SH AUTHOR
|
||||
Mark Leisher
|
||||
.br
|
||||
Computing Research Lab
|
||||
.br
|
||||
New Mexico State University
|
||||
.br
|
||||
Email: mleisher@crl.nmsu.edu
|
||||
212
libraries/liblunicode/ure/README
Normal file
212
libraries/liblunicode/ure/README
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
#
|
||||
# $Id: README,v 1.3 1999/09/21 15:47:43 mleisher Exp $
|
||||
#
|
||||
# Copyright 1997, 1998, 1999 Computing Research Labs,
|
||||
# New Mexico State University
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
|
||||
|
||||
Unicode and Regular Expressions
|
||||
Version 0.5
|
||||
|
||||
This is a simple regular expression package for matching against Unicode text
|
||||
in UCS2 form. The implementation of this URE package is a variation on the
|
||||
RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu). Mark
|
||||
Hopkins' algorithm had the virtue of being very simple, so it was used as a
|
||||
model.
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
Assumptions:
|
||||
|
||||
o Regular expression and text already normalized.
|
||||
|
||||
o Conversion to lower case assumes a 1-1 mapping.
|
||||
|
||||
Definitions:
|
||||
|
||||
Separator - any one of U+2028, U+2029, '\n', '\r'.
|
||||
|
||||
Operators:
|
||||
. - match any character.
|
||||
* - match zero or more of the last subexpression.
|
||||
+ - match one or more of the last subexpression.
|
||||
? - match zero or one of the last subexpression.
|
||||
() - subexpression grouping.
|
||||
|
||||
Notes:
|
||||
|
||||
o The "." operator normally does not match separators, but a flag is
|
||||
available for the ure_exec() function that will allow this operator to
|
||||
match a separator.
|
||||
|
||||
Literals and Constants:
|
||||
|
||||
c - literal UCS2 character.
|
||||
\x.... - hexadecimal number of up to 4 digits.
|
||||
\X.... - hexadecimal number of up to 4 digits.
|
||||
\u.... - hexadecimal number of up to 4 digits.
|
||||
\U.... - hexadecimal number of up to 4 digits.
|
||||
|
||||
Character classes:
|
||||
|
||||
[...] - Character class.
|
||||
[^...] - Negated character class.
|
||||
\pN1,N2,...,Nn - Character properties class.
|
||||
\PN1,N2,...,Nn - Negated character properties class.
|
||||
|
||||
POSIX character classes recognized:
|
||||
|
||||
:alnum:
|
||||
:alpha:
|
||||
:cntrl:
|
||||
:digit:
|
||||
:graph:
|
||||
:lower:
|
||||
:print:
|
||||
:punct:
|
||||
:space:
|
||||
:upper:
|
||||
:xdigit:
|
||||
|
||||
Notes:
|
||||
|
||||
o Character property classes are \p or \P followed by a comma separated
|
||||
list of integers between 1 and 32. These integers are references to
|
||||
the following character properties:
|
||||
|
||||
N Character Property
|
||||
--------------------------
|
||||
1 _URE_NONSPACING
|
||||
2 _URE_COMBINING
|
||||
3 _URE_NUMDIGIT
|
||||
4 _URE_NUMOTHER
|
||||
5 _URE_SPACESEP
|
||||
6 _URE_LINESEP
|
||||
7 _URE_PARASEP
|
||||
8 _URE_CNTRL
|
||||
9 _URE_PUA
|
||||
10 _URE_UPPER
|
||||
11 _URE_LOWER
|
||||
12 _URE_TITLE
|
||||
13 _URE_MODIFIER
|
||||
14 _URE_OTHERLETTER
|
||||
15 _URE_DASHPUNCT
|
||||
16 _URE_OPENPUNCT
|
||||
17 _URE_CLOSEPUNCT
|
||||
18 _URE_OTHERPUNCT
|
||||
19 _URE_MATHSYM
|
||||
20 _URE_CURRENCYSYM
|
||||
21 _URE_OTHERSYM
|
||||
22 _URE_LTR
|
||||
23 _URE_RTL
|
||||
24 _URE_EURONUM
|
||||
25 _URE_EURONUMSEP
|
||||
26 _URE_EURONUMTERM
|
||||
27 _URE_ARABNUM
|
||||
28 _URE_COMMONSEP
|
||||
29 _URE_BLOCKSEP
|
||||
30 _URE_SEGMENTSEP
|
||||
31 _URE_WHITESPACE
|
||||
32 _URE_OTHERNEUT
|
||||
|
||||
o Character classes can contain literals, constants, and character
|
||||
property classes. Example:
|
||||
|
||||
[abc\U10A\p1,3,4]
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
Before using URE
|
||||
----------------
|
||||
Before URE is used, two functions need to be created. One to check if a
|
||||
character matches a set of URE character properties, and one to convert a
|
||||
character to lower case.
|
||||
|
||||
Stubs for these functions are located in the urestubs.c file.
|
||||
|
||||
Using URE
|
||||
---------
|
||||
|
||||
Sample pseudo-code fragment.
|
||||
|
||||
ure_buffer_t rebuf;
|
||||
ure_dfa_t dfa;
|
||||
ucs2_t *re, *text;
|
||||
unsigned long relen, textlen;
|
||||
unsigned long match_start, match_end;
|
||||
|
||||
/*
|
||||
* Allocate the dynamic storage needed to compile regular expressions.
|
||||
*/
|
||||
rebuf = ure_buffer_create();
|
||||
|
||||
for each regular expression in a list {
|
||||
re = next regular expression;
|
||||
relen = length(re);
|
||||
|
||||
/*
|
||||
* Compile the regular expression with the case insensitive flag
|
||||
* turned on.
|
||||
*/
|
||||
dfa = ure_compile(re, relen, 1, rebuf);
|
||||
|
||||
/*
|
||||
* Look for the first match in some text. The matching will be done
|
||||
* in a case insensitive manner because the expression was compiled
|
||||
* with the case insensitive flag on.
|
||||
*/
|
||||
if (ure_exec(dfa, 0, text, textlen, &match_start, &match_end))
|
||||
printf("MATCH: %ld %ld\n", match_start, match_end);
|
||||
|
||||
/*
|
||||
* Look for the first match in some text, ignoring non-spacing
|
||||
* characters.
|
||||
*/
|
||||
if (ure_exec(dfa, URE_IGNORE_NONSPACING, text, textlen,
|
||||
&match_start, &match_end))
|
||||
printf("MATCH: %ld %ld\n", match_start, match_end);
|
||||
|
||||
/*
|
||||
* Free the DFA.
|
||||
*/
|
||||
ure_dfa_free(dfa);
|
||||
}
|
||||
|
||||
/*
|
||||
* Free the dynamic storage used for compiling the expressions.
|
||||
*/
|
||||
ure_buffer_free(rebuf);
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
Mark Leisher <mleisher@crl.nmsu.edu>
|
||||
29 March 1997
|
||||
|
||||
===========================================================================
|
||||
|
||||
CHANGES
|
||||
-------
|
||||
|
||||
Version: 0.5
|
||||
Date : 21 September 1999
|
||||
==========================
|
||||
1. Added copyright stuff and put in CVS.
|
||||
2304
libraries/liblunicode/ure/ure.c
Normal file
2304
libraries/liblunicode/ure/ure.c
Normal file
File diff suppressed because it is too large
Load diff
150
libraries/liblunicode/ure/ure.h
Normal file
150
libraries/liblunicode/ure/ure.h
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
/*
|
||||
* Copyright 1997, 1998, 1999 Computing Research Labs,
|
||||
* New Mexico State University
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#ifndef _h_ure
|
||||
#define _h_ure
|
||||
|
||||
/*
|
||||
* $Id: ure.h,v 1.2 1999/09/21 15:47:44 mleisher Exp $
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#undef __
|
||||
#ifdef __STDC__
|
||||
#define __(x) x
|
||||
#else
|
||||
#define __(x) ()
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Set of character class flags.
|
||||
*/
|
||||
#define _URE_NONSPACING 0x00000001
|
||||
#define _URE_COMBINING 0x00000002
|
||||
#define _URE_NUMDIGIT 0x00000004
|
||||
#define _URE_NUMOTHER 0x00000008
|
||||
#define _URE_SPACESEP 0x00000010
|
||||
#define _URE_LINESEP 0x00000020
|
||||
#define _URE_PARASEP 0x00000040
|
||||
#define _URE_CNTRL 0x00000080
|
||||
#define _URE_PUA 0x00000100
|
||||
|
||||
#define _URE_UPPER 0x00000200
|
||||
#define _URE_LOWER 0x00000400
|
||||
#define _URE_TITLE 0x00000800
|
||||
#define _URE_MODIFIER 0x00001000
|
||||
#define _URE_OTHERLETTER 0x00002000
|
||||
#define _URE_DASHPUNCT 0x00004000
|
||||
#define _URE_OPENPUNCT 0x00008000
|
||||
#define _URE_CLOSEPUNCT 0x00010000
|
||||
#define _URE_OTHERPUNCT 0x00020000
|
||||
#define _URE_MATHSYM 0x00040000
|
||||
#define _URE_CURRENCYSYM 0x00080000
|
||||
#define _URE_OTHERSYM 0x00100000
|
||||
|
||||
#define _URE_LTR 0x00200000
|
||||
#define _URE_RTL 0x00400000
|
||||
|
||||
#define _URE_EURONUM 0x00800000
|
||||
#define _URE_EURONUMSEP 0x01000000
|
||||
#define _URE_EURONUMTERM 0x02000000
|
||||
#define _URE_ARABNUM 0x04000000
|
||||
#define _URE_COMMONSEP 0x08000000
|
||||
|
||||
#define _URE_BLOCKSEP 0x10000000
|
||||
#define _URE_SEGMENTSEP 0x20000000
|
||||
|
||||
#define _URE_WHITESPACE 0x40000000
|
||||
#define _URE_OTHERNEUT 0x80000000
|
||||
|
||||
/*
|
||||
* Error codes.
|
||||
*/
|
||||
#define _URE_OK 0
|
||||
#define _URE_UNEXPECTED_EOS -1
|
||||
#define _URE_CCLASS_OPEN -2
|
||||
#define _URE_UNBALANCED_GROUP -3
|
||||
#define _URE_INVALID_PROPERTY -4
|
||||
|
||||
/*
|
||||
* Options that can be combined for searching.
|
||||
*/
|
||||
#define URE_IGNORE_NONSPACING 0x01
|
||||
#define URE_DOT_MATCHES_SEPARATORS 0x02
|
||||
|
||||
typedef unsigned long ucs4_t;
|
||||
typedef unsigned short ucs2_t;
|
||||
|
||||
/*
|
||||
* Opaque type for memory used when compiling expressions.
|
||||
*/
|
||||
typedef struct _ure_buffer_t *ure_buffer_t;
|
||||
|
||||
/*
|
||||
* Opaque type for the minimal DFA used when matching.
|
||||
*/
|
||||
typedef struct _ure_dfa_t *ure_dfa_t;
|
||||
|
||||
/*************************************************************************
|
||||
*
|
||||
* API.
|
||||
*
|
||||
*************************************************************************/
|
||||
|
||||
extern ure_buffer_t ure_buffer_create __((void));
|
||||
|
||||
extern void ure_buffer_free __((ure_buffer_t buf));
|
||||
|
||||
extern ure_dfa_t ure_compile __((ucs2_t *re, unsigned long relen,
|
||||
int casefold, ure_buffer_t buf));
|
||||
|
||||
extern void ure_dfa_free __((ure_dfa_t dfa));
|
||||
|
||||
extern void ure_write_dfa __((ure_dfa_t dfa, FILE *out));
|
||||
|
||||
extern int ure_exec __((ure_dfa_t dfa, int flags,
|
||||
ucs2_t *text, unsigned long textlen,
|
||||
unsigned long *match_start, unsigned long *match_end));
|
||||
|
||||
/*************************************************************************
|
||||
*
|
||||
* Prototypes for stub functions used for URE. These need to be rewritten to
|
||||
* use the Unicode support available on the system.
|
||||
*
|
||||
*************************************************************************/
|
||||
|
||||
extern ucs4_t _ure_tolower __((ucs4_t c));
|
||||
|
||||
extern int _ure_matches_properties __((unsigned long props, ucs4_t c));
|
||||
|
||||
#undef __
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _h_ure */
|
||||
64
libraries/liblunicode/ure/urestubs.c
Normal file
64
libraries/liblunicode/ure/urestubs.c
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
/*
|
||||
* Copyright 1997, 1998, 1999 Computing Research Labs,
|
||||
* New Mexico State University
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#ifndef lint
|
||||
static char rcsid[] = "$Id: urestubs.c,v 1.2 1999/09/21 15:47:44 mleisher Exp $";
|
||||
#endif
|
||||
|
||||
#include "ure.h"
|
||||
|
||||
/*
|
||||
* This file contains stub routines needed by the URE package to test
|
||||
* character properties and other Unicode implementation specific details.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This routine should return the lower case equivalent for the character or,
|
||||
* if there is no lower case quivalent, the character itself.
|
||||
*/
|
||||
ucs4_t
|
||||
#ifdef __STDC__
|
||||
_ure_tolower(ucs4_t c)
|
||||
#else
|
||||
_ure_tolower(c)
|
||||
ucs4_t c;
|
||||
#endif
|
||||
{
|
||||
return c;
|
||||
}
|
||||
|
||||
/*
|
||||
* This routine takes a set of URE character property flags (see ure.h) along
|
||||
* with a character and tests to see if the character has one or more of those
|
||||
* properties.
|
||||
*/
|
||||
int
|
||||
#ifdef __STDC__
|
||||
_ure_matches_properties(unsigned long props, ucs4_t c)
|
||||
#else
|
||||
_ure_matches_properties(props, c)
|
||||
unsigned long props;
|
||||
ucs4_t c;
|
||||
#endif
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
121
libraries/liblunicode/utbm/README
Normal file
121
libraries/liblunicode/utbm/README
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
#
|
||||
# $Id: README,v 1.1 1999/09/21 15:45:17 mleisher Exp $
|
||||
#
|
||||
# Copyright 1997, 1998, 1999 Computing Research Labs,
|
||||
# New Mexico State University
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
|
||||
Unicode and Boyer-Moore Searching
|
||||
Version 0.2
|
||||
|
||||
UTBM (Unicode Tuned Boyer-Moore) is a simple package that provides tuned
|
||||
Boyer-Moore searches on Unicode UCS2 text (handles high and low surrogates).
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
Assumptions:
|
||||
|
||||
o Search pattern and text already normalized in some fashion.
|
||||
|
||||
o Upper, lower, and title case conversions are one-to-one.
|
||||
|
||||
o For conversions between upper, lower, and title case, UCS2 characters
|
||||
always convert to other UCS2 characters, and UTF-16 characters always
|
||||
convert to other UTF-16 characters.
|
||||
|
||||
Flags:
|
||||
|
||||
UTBM provides three processing flags:
|
||||
|
||||
o UTBM_CASEFOLD - search in a case-insensitive manner.
|
||||
|
||||
o UTBM_IGNORE_NONSPACING - ignore non-spacing characters in the pattern and
|
||||
the text.
|
||||
|
||||
o UTBM_SPACE_COMPRESS - view as a *single space*, sequential groups of
|
||||
U+2028, U+2029, '\n', '\r', '\t', and any
|
||||
character identified as a space by the Unicode
|
||||
support on the platform.
|
||||
|
||||
This flag also causes all characters identified
|
||||
as control by the Unicode support on the
|
||||
platform to be ignored (except for '\n', '\r',
|
||||
and '\t').
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
Before using UTBM
|
||||
-----------------
|
||||
Before UTBM is used, some functions need to be created. The "utbmstub.c" file
|
||||
contains stubs that need to be rewritten so they work with the Unicode support
|
||||
on the platform on which this package is being used.
|
||||
|
||||
Using UTBM
|
||||
----------
|
||||
|
||||
Sample pseudo-code fragment.
|
||||
|
||||
utbm_pattern_t pat;
|
||||
ucs2_t *pattern, *text;
|
||||
unsigned long patternlen, textlen;
|
||||
unsigned long flags, match_start, match_end;
|
||||
|
||||
/*
|
||||
* Allocate the dynamic storage needed for a search pattern.
|
||||
*/
|
||||
pat = utbm_create_pattern();
|
||||
|
||||
/*
|
||||
* Set the search flags desired.
|
||||
*/
|
||||
flags = UTBM_CASEFOLD|UTBM_IGNORE_NONSPACING;
|
||||
|
||||
/*
|
||||
* Compile the search pattern.
|
||||
*/
|
||||
utbm_compile(pattern, patternlen, flags, pat);
|
||||
|
||||
/*
|
||||
* Find the first occurrence of the search pattern in the text.
|
||||
*/
|
||||
if (utbm_exec(pat, text, textlen, &match_start, &match_end))
|
||||
printf("MATCH: %lu %lu\n", match_start, match_end);
|
||||
|
||||
/*
|
||||
* Free the dynamic storage used for the search pattern.
|
||||
*/
|
||||
utbm_free_pattern(pat);
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
Mark Leisher <mleisher@crl.nmsu.edu>
|
||||
2 May 1997
|
||||
|
||||
===========================================================================
|
||||
|
||||
CHANGES
|
||||
-------
|
||||
|
||||
Version: 0.2
|
||||
Date : 21 September 1999
|
||||
==========================
|
||||
1. Added copyright stuff and put in CVS.
|
||||
|
||||
497
libraries/liblunicode/utbm/utbm.c
Normal file
497
libraries/liblunicode/utbm/utbm.c
Normal file
|
|
@ -0,0 +1,497 @@
|
|||
/*
|
||||
* Copyright 1997, 1998, 1999 Computing Research Labs,
|
||||
* New Mexico State University
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#ifndef lint
|
||||
static char rcsid[] = "$Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp $";
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Assumptions:
|
||||
* 1. Case conversions of UTF-16 characters must also be UTF-16 characters.
|
||||
* 2. Case conversions are all one-to-one.
|
||||
* 3. Text and pattern have already been normalized in some fashion.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include "utbm.h"
|
||||
|
||||
/*
|
||||
* Single pattern character.
|
||||
*/
|
||||
typedef struct {
|
||||
ucs4_t lc;
|
||||
ucs4_t uc;
|
||||
ucs4_t tc;
|
||||
} _utbm_char_t;
|
||||
|
||||
typedef struct {
|
||||
_utbm_char_t *ch;
|
||||
unsigned long skip;
|
||||
} _utbm_skip_t;
|
||||
|
||||
typedef struct _utbm_pattern_t {
|
||||
unsigned long flags;
|
||||
|
||||
_utbm_char_t *pat;
|
||||
unsigned long pat_used;
|
||||
unsigned long pat_size;
|
||||
unsigned long patlen;
|
||||
|
||||
_utbm_skip_t *skip;
|
||||
unsigned long skip_used;
|
||||
unsigned long skip_size;
|
||||
|
||||
unsigned long md4;
|
||||
} _utbm_pattern_t;
|
||||
|
||||
/*************************************************************************
|
||||
*
|
||||
* Support functions.
|
||||
*
|
||||
*************************************************************************/
|
||||
|
||||
/*
|
||||
* Routine to look up the skip value for a character.
|
||||
*/
|
||||
static unsigned long
|
||||
#ifdef __STDC__
|
||||
_utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end)
|
||||
#else
|
||||
_utbm_skip(p, start, end)
|
||||
utbm_pattern_t p;
|
||||
ucs2_t *start, *end;
|
||||
#endif
|
||||
{
|
||||
unsigned long i;
|
||||
ucs4_t c1, c2;
|
||||
_utbm_skip_t *sp;
|
||||
|
||||
if (start >= end)
|
||||
return 0;
|
||||
|
||||
c1 = *start;
|
||||
c2 = (start + 1 < end) ? *(start + 1) : ~0;
|
||||
if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
|
||||
c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
|
||||
|
||||
for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) {
|
||||
if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) {
|
||||
return ((unsigned long) (end - start) < sp->skip) ?
|
||||
end - start : sp->skip;
|
||||
}
|
||||
}
|
||||
return p->patlen;
|
||||
}
|
||||
|
||||
static int
|
||||
#ifdef __STDC__
|
||||
_utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end,
|
||||
unsigned long *match_start, unsigned long *match_end)
|
||||
#else
|
||||
_utbm_match(pat, text, start, end, match_start, match_end)
|
||||
utbm_pattern_t pat;
|
||||
ucs2_t *text, *start, *end;
|
||||
unsigned long *match_start, *match_end;
|
||||
#endif
|
||||
{
|
||||
int check_space;
|
||||
ucs4_t c1, c2;
|
||||
unsigned long count;
|
||||
_utbm_char_t *cp;
|
||||
|
||||
/*
|
||||
* Set the potential match endpoint first.
|
||||
*/
|
||||
*match_end = (start - text) + 1;
|
||||
|
||||
c1 = *start;
|
||||
c2 = (start + 1 < end) ? *(start + 1) : ~0;
|
||||
if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
|
||||
c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
|
||||
/*
|
||||
* Adjust the match end point to occur after the UTF-16 character.
|
||||
*/
|
||||
*match_end = *match_end + 1;
|
||||
}
|
||||
|
||||
if (pat->pat_used == 1) {
|
||||
*match_start = start - text;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compare backward.
|
||||
*/
|
||||
cp = pat->pat + (pat->pat_used - 1);
|
||||
|
||||
for (count = pat->patlen; start > text && count > 0;) {
|
||||
/*
|
||||
* Ignore non-spacing characters if indicated.
|
||||
*/
|
||||
if (pat->flags & UTBM_IGNORE_NONSPACING) {
|
||||
while (start > text && _utbm_nonspacing(c1)) {
|
||||
c2 = *--start;
|
||||
c1 = (start - 1 > text) ? *(start - 1) : ~0;
|
||||
if (0xdc00 <= c2 && c2 <= 0xdfff &&
|
||||
0xd800 <= c1 && c1 <= 0xdbff) {
|
||||
c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
|
||||
start--;
|
||||
} else
|
||||
c1 = c2;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle space compression if indicated.
|
||||
*/
|
||||
if (pat->flags & UTBM_SPACE_COMPRESS) {
|
||||
check_space = 0;
|
||||
while (start > text &&
|
||||
(_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) {
|
||||
check_space = _utbm_isspace(c1, 1);
|
||||
c2 = *--start;
|
||||
c1 = (start - 1 > text) ? *(start - 1) : ~0;
|
||||
if (0xdc00 <= c2 && c2 <= 0xdfff &&
|
||||
0xd800 <= c1 && c1 <= 0xdbff) {
|
||||
c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
|
||||
start--;
|
||||
} else
|
||||
c1 = c2;
|
||||
}
|
||||
/*
|
||||
* Handle things if space compression was indicated and one or
|
||||
* more member characters were found.
|
||||
*/
|
||||
if (check_space) {
|
||||
if (cp->uc != ' ')
|
||||
return 0;
|
||||
cp--;
|
||||
count--;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle the normal comparison cases.
|
||||
*/
|
||||
if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc)))
|
||||
return 0;
|
||||
|
||||
count -= (c1 >= 0x10000) ? 2 : 1;
|
||||
if (count > 0) {
|
||||
cp--;
|
||||
|
||||
/*
|
||||
* Get the next preceding character.
|
||||
*/
|
||||
if (start > text) {
|
||||
c2 = *--start;
|
||||
c1 = (start - 1 > text) ? *(start - 1) : ~0;
|
||||
if (0xdc00 <= c2 && c2 <= 0xdfff &&
|
||||
0xd800 <= c1 && c1 <= 0xdbff) {
|
||||
c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
|
||||
start--;
|
||||
} else
|
||||
c1 = c2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the match start position.
|
||||
*/
|
||||
*match_start = start - text;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*************************************************************************
|
||||
*
|
||||
* API.
|
||||
*
|
||||
*************************************************************************/
|
||||
|
||||
utbm_pattern_t
|
||||
#ifdef __STDC__
|
||||
utbm_create_pattern(void)
|
||||
#else
|
||||
utbm_create_pattern()
|
||||
#endif
|
||||
{
|
||||
utbm_pattern_t p;
|
||||
|
||||
p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t));
|
||||
(void) memset((char *) p, 0, sizeof(_utbm_pattern_t));
|
||||
return p;
|
||||
}
|
||||
|
||||
void
|
||||
#ifdef __STDC__
|
||||
utbm_free_pattern(utbm_pattern_t pattern)
|
||||
#else
|
||||
utbm_free_pattern(pattern)
|
||||
utbm_pattern_t pattern;
|
||||
#endif
|
||||
{
|
||||
if (pattern == 0)
|
||||
return;
|
||||
|
||||
if (pattern->pat_size > 0)
|
||||
free((char *) pattern->pat);
|
||||
|
||||
if (pattern->skip_size > 0)
|
||||
free((char *) pattern->skip);
|
||||
|
||||
free((char *) pattern);
|
||||
}
|
||||
|
||||
void
|
||||
#ifdef __STDC__
|
||||
utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags,
|
||||
utbm_pattern_t p)
|
||||
#else
|
||||
utbm_compile(pat, patlen, flags, p)
|
||||
ucs2_t *pat;
|
||||
unsigned long patlen, flags;
|
||||
utbm_pattern_t p;
|
||||
#endif
|
||||
{
|
||||
int have_space;
|
||||
unsigned long i, j, k, slen;
|
||||
_utbm_char_t *cp;
|
||||
_utbm_skip_t *sp;
|
||||
ucs4_t c1, c2, sentinel;
|
||||
|
||||
if (p == 0 || pat == 0 || *pat == 0 || patlen == 0)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Reset the pattern buffer.
|
||||
*/
|
||||
p->patlen = p->pat_used = p->skip_used = 0;
|
||||
|
||||
/*
|
||||
* Set the flags.
|
||||
*/
|
||||
p->flags = flags;
|
||||
|
||||
/*
|
||||
* Initialize the extra skip flag.
|
||||
*/
|
||||
p->md4 = 1;
|
||||
|
||||
/*
|
||||
* Allocate more storage if necessary.
|
||||
*/
|
||||
if (patlen > p->pat_size) {
|
||||
if (p->pat_size == 0) {
|
||||
p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen);
|
||||
p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen);
|
||||
} else {
|
||||
p->pat = (_utbm_char_t *)
|
||||
realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen);
|
||||
p->skip = (_utbm_skip_t *)
|
||||
realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen);
|
||||
}
|
||||
p->pat_size = p->skip_size = patlen;
|
||||
}
|
||||
|
||||
/*
|
||||
* Preprocess the pattern to remove controls (if specified) and determine
|
||||
* case.
|
||||
*/
|
||||
for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) {
|
||||
c1 = pat[i];
|
||||
c2 = (i + 1 < patlen) ? pat[i + 1] : ~0;
|
||||
if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
|
||||
c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
|
||||
|
||||
/*
|
||||
* Make sure the `have_space' flag is turned off if the character
|
||||
* is not an appropriate one.
|
||||
*/
|
||||
if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS))
|
||||
have_space = 0;
|
||||
|
||||
/*
|
||||
* If non-spacing characters should be ignored, do it here.
|
||||
*/
|
||||
if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Check if spaces and controls need to be compressed.
|
||||
*/
|
||||
if (flags & UTBM_SPACE_COMPRESS) {
|
||||
if (_utbm_isspace(c1, 1)) {
|
||||
if (!have_space) {
|
||||
/*
|
||||
* Add a space and set the flag.
|
||||
*/
|
||||
cp->uc = cp->lc = cp->tc = ' ';
|
||||
cp++;
|
||||
|
||||
/*
|
||||
* Increase the real pattern length.
|
||||
*/
|
||||
p->patlen++;
|
||||
sentinel = ' ';
|
||||
have_space = 1;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ignore all control characters.
|
||||
*/
|
||||
if (_utbm_iscntrl(c1))
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add the character.
|
||||
*/
|
||||
if (flags & UTBM_CASEFOLD) {
|
||||
cp->uc = _utbm_toupper(c1);
|
||||
cp->lc = _utbm_tolower(c1);
|
||||
cp->tc = _utbm_totitle(c1);
|
||||
} else
|
||||
cp->uc = cp->lc = cp->tc = c1;
|
||||
|
||||
/*
|
||||
* Set the sentinel character.
|
||||
*/
|
||||
sentinel = cp->uc;
|
||||
|
||||
/*
|
||||
* Move to the next character.
|
||||
*/
|
||||
cp++;
|
||||
|
||||
/*
|
||||
* Increase the real pattern length appropriately.
|
||||
*/
|
||||
p->patlen += (c1 >= 0x10000) ? 2 : 1;
|
||||
|
||||
/*
|
||||
* Increment the loop index for UTF-16 characters.
|
||||
*/
|
||||
i += (c1 >= 0x10000) ? 1 : 0;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the number of characters actually used.
|
||||
*/
|
||||
p->pat_used = cp - p->pat;
|
||||
|
||||
/*
|
||||
* Go through and construct the skip array and determine the actual length
|
||||
* of the pattern in UCS2 terms.
|
||||
*/
|
||||
slen = p->patlen - 1;
|
||||
cp = p->pat;
|
||||
for (i = k = 0; i < p->pat_used; i++, cp++) {
|
||||
/*
|
||||
* Locate the character in the skip array.
|
||||
*/
|
||||
for (sp = p->skip, j = 0;
|
||||
j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ;
|
||||
|
||||
/*
|
||||
* If the character is not found, set the new skip element and
|
||||
* increase the number of skip elements.
|
||||
*/
|
||||
if (j == p->skip_used) {
|
||||
sp->ch = cp;
|
||||
p->skip_used++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the updated skip value. If the character is UTF-16 and is
|
||||
* not the last one in the pattern, add one to its skip value.
|
||||
*/
|
||||
sp->skip = slen - k;
|
||||
if (cp->uc >= 0x10000 && k + 2 < slen)
|
||||
sp->skip++;
|
||||
|
||||
/*
|
||||
* Set the new extra skip for the sentinel character.
|
||||
*/
|
||||
if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) &&
|
||||
cp->uc == sentinel)
|
||||
p->md4 = slen - k;
|
||||
|
||||
/*
|
||||
* Increase the actual index.
|
||||
*/
|
||||
k += (cp->uc >= 0x10000) ? 2 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
#ifdef __STDC__
|
||||
utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen,
|
||||
unsigned long *match_start, unsigned long *match_end)
|
||||
#else
|
||||
utbm_exec(pat, text, textlen, match_start, match_end)
|
||||
utbm_pattern_t pat;
|
||||
ucs2_t *text;
|
||||
unsigned long textlen, *match_start, *match_end;
|
||||
#endif
|
||||
{
|
||||
unsigned long k;
|
||||
ucs2_t *start, *end;
|
||||
|
||||
if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 ||
|
||||
textlen < pat->patlen)
|
||||
return 0;
|
||||
|
||||
start = text + pat->patlen;
|
||||
end = text + textlen;
|
||||
|
||||
/*
|
||||
* Adjust the start point if it points to a low surrogate.
|
||||
*/
|
||||
if (0xdc00 <= *start && *start <= 0xdfff &&
|
||||
0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
|
||||
start--;
|
||||
|
||||
while (start < end) {
|
||||
while ((k = _utbm_skip(pat, start, end))) {
|
||||
start += k;
|
||||
if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
|
||||
0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
|
||||
start--;
|
||||
}
|
||||
|
||||
if (start < end &&
|
||||
_utbm_match(pat, text, start, end, match_start, match_end))
|
||||
return 1;
|
||||
|
||||
start += pat->md4;
|
||||
if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
|
||||
0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
|
||||
start--;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
109
libraries/liblunicode/utbm/utbm.h
Normal file
109
libraries/liblunicode/utbm/utbm.h
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
/*
|
||||
* Copyright 1997, 1998, 1999 Computing Research Labs,
|
||||
* New Mexico State University
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#ifndef _h_utbm
#define _h_utbm

/*
 * $Id: utbm.h,v 1.1 1999/09/21 15:45:18 mleisher Exp $
 */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * __() wraps prototype argument lists: ANSI compilers see the full
 * prototype, pre-ANSI compilers see an empty parameter list.
 */
#undef __
#ifdef __STDC__
#define __(x) x
#else
#define __(x) ()
#endif

/*************************************************************************
 *
 * Types.
 *
 *************************************************************************/

/*
 * Fundamental character types: a decoded character and a single UCS2 /
 * UTF-16 code unit.
 */
typedef unsigned long ucs4_t;
typedef unsigned short ucs2_t;

/*
 * An opaque type used for the search pattern.
 */
typedef struct _utbm_pattern_t *utbm_pattern_t;

/*************************************************************************
 *
 * Flags.
 *
 *************************************************************************/

/* Search in a case-insensitive manner. */
#define UTBM_CASEFOLD          0x01

/* Ignore non-spacing characters in the pattern and the text. */
#define UTBM_IGNORE_NONSPACING 0x02

/* Treat runs of space characters as a single space; ignore controls. */
#define UTBM_SPACE_COMPRESS    0x04

/*************************************************************************
 *
 * API.
 *
 *************************************************************************/

/*
 * Allocate the dynamic storage needed for a search pattern.
 */
extern utbm_pattern_t utbm_create_pattern __((void));

/*
 * Free the dynamic storage used for a search pattern.  A null pattern is
 * ignored.
 */
extern void utbm_free_pattern __((utbm_pattern_t pattern));

/*
 * Compile `patlen' UCS2 units of `pat' into `pattern' using the UTBM_*
 * `flags'.  The call does nothing when `pattern' or `pat' is null,
 * `patlen' is zero or the first unit of `pat' is zero.
 */
extern void utbm_compile __((ucs2_t *pat, unsigned long patlen,
                             unsigned long flags, utbm_pattern_t pattern));

/*
 * Search `textlen' UCS2 units of `text' for the compiled pattern.  Returns
 * non-zero on a match and stores the match boundaries, as offsets in UCS2
 * units from `text', in `*match_start' and `*match_end'; returns zero
 * otherwise.
 */
extern int utbm_exec __((utbm_pattern_t pat, ucs2_t *text,
                         unsigned long textlen, unsigned long *match_start,
                         unsigned long *match_end));

/*************************************************************************
 *
 * Prototypes for the stub functions needed.
 *
 *************************************************************************/

/* Non-zero if `c' counts as a space; `compress' selects the wider set. */
extern int _utbm_isspace __((ucs4_t c, int compress));

/* Non-zero if `c' is a control character. */
extern int _utbm_iscntrl __((ucs4_t c));

/* Non-zero if `c' is a non-spacing character. */
extern int _utbm_nonspacing __((ucs4_t c));

/* Case conversions. */
extern ucs4_t _utbm_tolower __((ucs4_t c));

extern ucs4_t _utbm_toupper __((ucs4_t c));

extern ucs4_t _utbm_totitle __((ucs4_t c));

#undef __

#ifdef __cplusplus
}
#endif

#endif /* _h_utbm */
|
||||
125
libraries/liblunicode/utbm/utbmstub.c
Normal file
125
libraries/liblunicode/utbm/utbmstub.c
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
/*
|
||||
* Copyright 1997, 1998, 1999 Computing Research Labs,
|
||||
* New Mexico State University
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#ifndef lint
|
||||
static char rcsid[] = "$Id: utbmstub.c,v 1.1 1999/09/21 15:45:18 mleisher Exp $";
|
||||
#endif
|
||||
|
||||
#include "utbm.h"
|
||||
|
||||
/*
|
||||
* This should be redefined to use the `isspace' function available in the
|
||||
* Unicode support on the platform where this is being used.
|
||||
*/
|
||||
#define _platform_isspace(x) 0
|
||||
|
||||
/*
|
||||
* Return non-zero for any character that should be considered the equivalent
|
||||
* of a space character. Return zero otherwise.
|
||||
*/
|
||||
int
|
||||
#ifdef __STDC__
|
||||
_utbm_isspace(ucs4_t c, int compress)
|
||||
#else
|
||||
_utbm_isspace(c, compress)
|
||||
ucs4_t c;
|
||||
int compress;
|
||||
#endif
|
||||
{
|
||||
if (compress)
|
||||
return (c == 0x09 || c == 0x0a || c == 0x0d ||
|
||||
c == 0x2028 || c == 0x2029 || _platform_isspace(c)) ? 1 : 0;
|
||||
|
||||
return _platform_isspace(c);
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Return non-zero if the character is a control character, or zero otherwise.
|
||||
*/
|
||||
int
|
||||
#ifdef __STDC__
|
||||
_utbm_iscntrl(ucs4_t c)
|
||||
#else
|
||||
_utbm_iscntrl(c)
|
||||
ucs4_t c;
|
||||
#endif
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return non-zero if the character is a non-spacing character, or zero
|
||||
* otherwise.
|
||||
*/
|
||||
int
|
||||
#ifdef __STDC__
|
||||
_utbm_nonspacing(ucs4_t c)
|
||||
#else
|
||||
_utbm_nonspacing(c)
|
||||
ucs4_t c;
|
||||
#endif
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert a character to lower case.
|
||||
*/
|
||||
ucs4_t
|
||||
#ifdef __STDC__
|
||||
_utbm_tolower(ucs4_t c)
|
||||
#else
|
||||
_utbm_tolower(c)
|
||||
ucs4_t c;
|
||||
#endif
|
||||
{
|
||||
return c;
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert a character to upper case.
|
||||
*/
|
||||
ucs4_t
|
||||
#ifdef __STDC__
|
||||
_utbm_toupper(ucs4_t c)
|
||||
#else
|
||||
_utbm_toupper(c)
|
||||
ucs4_t c;
|
||||
#endif
|
||||
{
|
||||
return c;
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert a character to title case.
|
||||
*/
|
||||
ucs4_t
|
||||
#ifdef __STDC__
|
||||
_utbm_totitle(ucs4_t c)
|
||||
#else
|
||||
_utbm_totitle(c)
|
||||
ucs4_t c;
|
||||
#endif
|
||||
{
|
||||
return c;
|
||||
}
|
||||
Loading…
Reference in a new issue