From d3d0983169130a9b81e3fe48d5c2ca4931480956 Mon Sep 17 00:00:00 2001
From: Jeff Davis
Date: Fri, 17 Jan 2025 15:56:30 -0800
Subject: Support PG_UNICODE_FAST locale in the builtin collation provider.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PG_UNICODE_FAST locale uses code point sort order (fast, memcmp-based) combined with Unicode character semantics. The character semantics are based on Unicode full case mapping. Full case mapping can map a single code point to multiple code points, such as "ß" uppercasing to "SS". Additionally, it handles context-sensitive mappings like the "final sigma", and it uses titlecase mappings such as "Dž" when titlecasing (rather than plain uppercase mappings).

Importantly, the uppercasing of "ß" as "SS" is specifically mentioned by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics for case mapping and pattern matching, so if we changed it to use the PG_UNICODE_FAST locale, it would offer better compliance with the standard. For now, though, do not change the behavior of UCS_BASIC.
Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org Reviewed-by: Peter Eisentraut, Daniel Verite --- src/backend/utils/adt/pg_locale.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'src/backend/utils/adt/pg_locale.c') diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 875cca6efc8..94444acd2c5 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1590,8 +1590,11 @@ builtin_locale_encoding(const char *locale) { if (strcmp(locale, "C") == 0) return -1; - if (strcmp(locale, "C.UTF-8") == 0) + else if (strcmp(locale, "C.UTF-8") == 0) return PG_UTF8; + else if (strcmp(locale, "PG_UNICODE_FAST") == 0) + return PG_UTF8; + ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -1616,6 +1619,8 @@ builtin_validate_locale(int encoding, const char *locale) canonical_name = "C"; else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0) canonical_name = "C.UTF-8"; + else if (strcmp(locale, "PG_UNICODE_FAST") == 0) + canonical_name = "PG_UNICODE_FAST"; if (!canonical_name) ereport(ERROR, -- cgit v1.2.3