Fix off-by-one with NFC recomposition for Hangul U+11A7 (TBASE)

The NFC recomposition treated TBASE as a valid trailing consonant (T),
which is incorrect based on the Unicode specification (TBASE is one
below the start of the valid range, which begins at U+11A8).

This caused a TBASE following an LV syllable to be silently swallowed
by the normalization, leading to an incorrect result.

A couple of regression tests are added to check more patterns with
Hangul recomposition and decomposition, on top of a test to check the
problem with TBASE.  Diego has submitted the code fix, and I have
written the tests.

Author: Diego Frias <mail@dzfrias.dev>
Co-authored-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/B92ED640-7D4A-4505-B09F-3548F58CBB16@dzfrias.dev
Backpatch-through: 14
This commit is contained in:
Michael Paquier 2026-06-05 07:50:08 +09:00
parent 4ae3e98c02
commit f2ff15e4c3
3 changed files with 99 additions and 1 deletions

View file

@ -236,7 +236,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
/* Check if two current characters are LV and T */
else if (start >= SBASE && start < (SBASE + SCOUNT) &&
((start - SBASE) % TCOUNT) == 0 &&
code >= TBASE && code < (TBASE + TCOUNT))
code > TBASE && code < (TBASE + TCOUNT))
{
/* make syllable of form LVT */
uint32 tindex = code - TBASE;

View file

@ -105,3 +105,81 @@ ORDER BY num;
SELECT is_normalized('abc', 'def'); -- run-time error
ERROR: invalid normalization form: def
-- Hangul NFC recomposition tests
-- L+V -> LV composition (first and last)
SELECT normalize(U&'\1100\1161', NFC) = U&'\AC00' COLLATE "C" AS hangul_lv_first;
hangul_lv_first
-----------------
t
(1 row)
SELECT normalize(U&'\1112\1175', NFC) = U&'\D788' COLLATE "C" AS hangul_lv_last;
hangul_lv_last
----------------
t
(1 row)
-- LV+T -> LVT composition
SELECT normalize(U&'\AC00\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_lvt_first_t;
hangul_lvt_first_t
--------------------
t
(1 row)
SELECT normalize(U&'\AC00\11C2', NFC) = U&'\AC1B' COLLATE "C" AS hangul_lvt_last_t;
hangul_lvt_last_t
-------------------
t
(1 row)
SELECT normalize(U&'\D788\11A8', NFC) = U&'\D789' COLLATE "C" AS hangul_lvt_last_lv;
hangul_lvt_last_lv
--------------------
t
(1 row)
-- L+V+T -> LVT composition
SELECT normalize(U&'\1100\1161\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_full_lvt;
hangul_full_lvt
-----------------
t
(1 row)
SELECT normalize(U&'\1112\1175\11C2', NFC) = U&'\D7A3' COLLATE "C" AS hangul_full_lvt;
hangul_full_lvt
-----------------
t
(1 row)
-- TBASE invalid T syllable
SELECT normalize(U&'\AC00\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_tbase_not_combined;
hangul_tbase_not_combined
---------------------------
t
(1 row)
SELECT normalize(U&'\1100\1161\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_lv_tbase_separate;
hangul_lv_tbase_separate
--------------------------
t
(1 row)
-- Hangul NFD decomposition tests
SELECT normalize(U&'\AC00', NFD) = U&'\1100\1161' COLLATE "C" AS hangul_nfd_lv;
hangul_nfd_lv
---------------
t
(1 row)
SELECT normalize(U&'\AC01', NFD) = U&'\1100\1161\11A8' COLLATE "C" AS hangul_nfd_lvt;
hangul_nfd_lvt
----------------
t
(1 row)
SELECT normalize(U&'\D7A3', NFD) = U&'\1112\1175\11C2' COLLATE "C" AS hangul_nfd_last;
hangul_nfd_last
-----------------
t
(1 row)

View file

@ -36,3 +36,23 @@ FROM
ORDER BY num;
SELECT is_normalized('abc', 'def'); -- run-time error
-- Hangul NFC recomposition tests
-- Each query returns one boolean that is true when normalize() produces
-- exactly the expected code point sequence.
-- NOTE(review): COLLATE "C" presumably forces a plain byte-wise comparison
-- that does not depend on the database default collation; confirm.
-- L+V -> LV composition (first and last)
-- U+1100 and U+1161 are the first leading consonant (L) and vowel (V);
-- U+1112 and U+1175 are the last ones.  They compose to the first (U+AC00)
-- and last (U+D788) LV syllables.
SELECT normalize(U&'\1100\1161', NFC) = U&'\AC00' COLLATE "C" AS hangul_lv_first;
SELECT normalize(U&'\1112\1175', NFC) = U&'\D788' COLLATE "C" AS hangul_lv_last;
-- LV+T -> LVT composition
-- Valid trailing consonants (T) span U+11A8..U+11C2.  The composed syllable
-- is the LV syllable plus the offset of T from TBASE (U+11A7).
SELECT normalize(U&'\AC00\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_lvt_first_t;
SELECT normalize(U&'\AC00\11C2', NFC) = U&'\AC1B' COLLATE "C" AS hangul_lvt_last_t;
SELECT normalize(U&'\D788\11A8', NFC) = U&'\D789' COLLATE "C" AS hangul_lvt_last_lv;
-- L+V+T -> LVT composition
-- Fully decomposed input: exercises L+V and LV+T composition together, for
-- the first (U+AC01) and the last (U+D7A3) LVT syllables.
SELECT normalize(U&'\1100\1161\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_full_lvt;
SELECT normalize(U&'\1112\1175\11C2', NFC) = U&'\D7A3' COLLATE "C" AS hangul_full_lvt;
-- TBASE invalid T syllable
-- U+11A7 (TBASE) is one below the first valid trailing consonant.  It must
-- remain a separate code point after the LV syllable rather than being
-- absorbed into it.
SELECT normalize(U&'\AC00\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_tbase_not_combined;
SELECT normalize(U&'\1100\1161\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_lv_tbase_separate;
-- Hangul NFD decomposition tests
-- Reverse direction: precomposed syllables are split back into their L+V
-- or L+V+T jamo.
SELECT normalize(U&'\AC00', NFD) = U&'\1100\1161' COLLATE "C" AS hangul_nfd_lv;
SELECT normalize(U&'\AC01', NFD) = U&'\1100\1161\11A8' COLLATE "C" AS hangul_nfd_lvt;
SELECT normalize(U&'\D7A3', NFD) = U&'\1112\1175\11C2' COLLATE "C" AS hangul_nfd_last;