postgresql/src/test/regress/sql/collate.utf8.sql
Jeff Davis f69319f2f1 Support C.UTF-8 locale in the new builtin collation provider.
The builtin C.UTF-8 locale has similar semantics to the libc locale of
the same name. That is, code point sort order (fast, memcmp-based)
combined with Unicode semantics for character operations such as
pattern matching, regular expressions, and
LOWER()/INITCAP()/UPPER(). The character semantics are based on
Unicode simple case mappings.

The builtin provider's C.UTF-8 offers several important advantages
over libc:

 * faster sorting -- benefits from additional optimizations such as
   abbreviated keys and varstrfastcmp_c
 * faster case conversion, e.g. LOWER(), at least compared with some
   libc implementations
 * available on all platforms with identical semantics, and the
   semantics are stable, testable, and documentable within a given
   Postgres major version

Being based on memcmp, the builtin C.UTF-8 locale does not offer
natural language sort order. But it is an improvement for most use
cases that might otherwise use libc's "C.UTF-8" locale, as well as
many use cases that use libc's "C" locale.

Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider
2024-03-19 15:24:41 -07:00

67 lines
1.9 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* This test is for collations and character operations when using the
* builtin provider with the C.UTF-8 locale.
*/
/* skip test if not UTF8 server encoding */
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
\if :skip_test
\quit
\endif
SET client_encoding TO UTF8;
--
-- Test PG_C_UTF8
--
CREATE COLLATION regress_pg_c_utf8 (
provider = builtin, locale = 'C_UTF8'); -- fails
CREATE COLLATION regress_pg_c_utf8 (
provider = builtin, locale = 'C.UTF8');
DROP COLLATION regress_pg_c_utf8;
CREATE COLLATION regress_pg_c_utf8 (
provider = builtin, locale = 'C.UTF-8');
CREATE TABLE test_pg_c_utf8 (
t TEXT COLLATE PG_C_UTF8
);
INSERT INTO test_pg_c_utf8 VALUES
('abc DEF 123abc'),
('ábc sßs ßss DÉF'),
('DŽxxDŽ džxxDž Džxxdž'),
('ȺȺȺ'),
('ⱥⱥⱥ'),
('ⱥȺ');
SELECT
t, lower(t), initcap(t), upper(t),
length(convert_to(t, 'UTF8')) AS t_bytes,
length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
FROM test_pg_c_utf8;
DROP TABLE test_pg_c_utf8;
-- negative test: Final_Sigma not used for builtin locale C.UTF-8
SELECT lower('ΑΣ' COLLATE PG_C_UTF8);
SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8);
SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8);
-- properties
SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8;
SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8;
SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8;
SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix
SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8;
SELECT '' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix
-- case mapping
SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8;
SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8;
SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed