mirror of
https://github.com/postgres/postgres.git
synced 2026-06-09 00:32:10 -04:00
Fix off-by-one with NFC recomposition for Hangul U+11A7 (TBASE)
The NFC recomposition treated TBASE (U+11A7) as a valid T syllable, which is incorrect according to the Unicode specification: TBASE is one below the start of the valid range, which begins at U+11A8. This caused TBASE to be silently swallowed during normalization, leading to an incorrect result. A couple of regression tests are added to check more patterns of Hangul recomposition and decomposition, on top of a test covering the problem with TBASE. Diego has submitted the code fix, and I have written the tests. Author: Diego Frias <mail@dzfrias.dev> Co-authored-by: Michael Paquier <michael@paquier.xyz> Discussion: https://postgr.es/m/B92ED640-7D4A-4505-B09F-3548F58CBB16@dzfrias.dev Backpatch-through: 14
This commit is contained in:
parent
4ae3e98c02
commit
f2ff15e4c3
3 changed files with 99 additions and 1 deletions
|
|
@@ -236,7 +236,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
|
|||
/* Check if two current characters are LV and T */
|
||||
else if (start >= SBASE && start < (SBASE + SCOUNT) &&
|
||||
((start - SBASE) % TCOUNT) == 0 &&
|
||||
- code >= TBASE && code < (TBASE + TCOUNT))
|
||||
+ code > TBASE && code < (TBASE + TCOUNT))
|
||||
{
|
||||
/* make syllable of form LVT */
|
||||
uint32 tindex = code - TBASE;
|
||||
|
|
|
|||
|
|
@@ -105,3 +105,81 @@ ORDER BY num;
|
|||
|
||||
SELECT is_normalized('abc', 'def'); -- run-time error
|
||||
ERROR: invalid normalization form: def
|
||||
-- Hangul NFC recomposition tests
|
||||
-- L+V -> LV composition (first and last)
|
||||
SELECT normalize(U&'\1100\1161', NFC) = U&'\AC00' COLLATE "C" AS hangul_lv_first;
|
||||
hangul_lv_first
|
||||
-----------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT normalize(U&'\1112\1175', NFC) = U&'\D788' COLLATE "C" AS hangul_lv_last;
|
||||
hangul_lv_last
|
||||
----------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
-- LV+T -> LVT composition
|
||||
SELECT normalize(U&'\AC00\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_lvt_first_t;
|
||||
hangul_lvt_first_t
|
||||
--------------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT normalize(U&'\AC00\11C2', NFC) = U&'\AC1B' COLLATE "C" AS hangul_lvt_last_t;
|
||||
hangul_lvt_last_t
|
||||
-------------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT normalize(U&'\D788\11A8', NFC) = U&'\D789' COLLATE "C" AS hangul_lvt_last_lv;
|
||||
hangul_lvt_last_lv
|
||||
--------------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
-- L+V+T -> LVT composition
|
||||
SELECT normalize(U&'\1100\1161\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_full_lvt;
|
||||
hangul_full_lvt
|
||||
-----------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT normalize(U&'\1112\1175\11C2', NFC) = U&'\D7A3' COLLATE "C" AS hangul_full_lvt;
|
||||
hangul_full_lvt
|
||||
-----------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
-- TBASE invalid T syllable
|
||||
SELECT normalize(U&'\AC00\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_tbase_not_combined;
|
||||
hangul_tbase_not_combined
|
||||
---------------------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT normalize(U&'\1100\1161\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_lv_tbase_separate;
|
||||
hangul_lv_tbase_separate
|
||||
--------------------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
-- Hangul NFD decomposition tests
|
||||
SELECT normalize(U&'\AC00', NFD) = U&'\1100\1161' COLLATE "C" AS hangul_nfd_lv;
|
||||
hangul_nfd_lv
|
||||
---------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT normalize(U&'\AC01', NFD) = U&'\1100\1161\11A8' COLLATE "C" AS hangul_nfd_lvt;
|
||||
hangul_nfd_lvt
|
||||
----------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT normalize(U&'\D7A3', NFD) = U&'\1112\1175\11C2' COLLATE "C" AS hangul_nfd_last;
|
||||
hangul_nfd_last
|
||||
-----------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
|
|
|
|||
|
|
@@ -36,3 +36,23 @@ FROM
|
|||
ORDER BY num;
|
||||
|
||||
SELECT is_normalized('abc', 'def'); -- run-time error
|
||||
|
||||
-- Hangul NFC recomposition tests
|
||||
-- L+V -> LV composition (first and last)
|
||||
SELECT normalize(U&'\1100\1161', NFC) = U&'\AC00' COLLATE "C" AS hangul_lv_first;
|
||||
SELECT normalize(U&'\1112\1175', NFC) = U&'\D788' COLLATE "C" AS hangul_lv_last;
|
||||
-- LV+T -> LVT composition
|
||||
SELECT normalize(U&'\AC00\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_lvt_first_t;
|
||||
SELECT normalize(U&'\AC00\11C2', NFC) = U&'\AC1B' COLLATE "C" AS hangul_lvt_last_t;
|
||||
SELECT normalize(U&'\D788\11A8', NFC) = U&'\D789' COLLATE "C" AS hangul_lvt_last_lv;
|
||||
-- L+V+T -> LVT composition
|
||||
SELECT normalize(U&'\1100\1161\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_full_lvt;
|
||||
SELECT normalize(U&'\1112\1175\11C2', NFC) = U&'\D7A3' COLLATE "C" AS hangul_full_lvt;
|
||||
-- TBASE invalid T syllable
|
||||
SELECT normalize(U&'\AC00\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_tbase_not_combined;
|
||||
SELECT normalize(U&'\1100\1161\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_lv_tbase_separate;
|
||||
|
||||
-- Hangul NFD decomposition tests
|
||||
SELECT normalize(U&'\AC00', NFD) = U&'\1100\1161' COLLATE "C" AS hangul_nfd_lv;
|
||||
SELECT normalize(U&'\AC01', NFD) = U&'\1100\1161\11A8' COLLATE "C" AS hangul_nfd_lvt;
|
||||
SELECT normalize(U&'\D7A3', NFD) = U&'\1112\1175\11C2' COLLATE "C" AS hangul_nfd_last;
|
||||
|
|
|
|||
Loading…
Reference in a new issue