diff options
author | Jeff Davis <jdavis@postgresql.org> | 2025-01-17 15:56:20 -0800 |
---|---|---|
committer | Jeff Davis <jdavis@postgresql.org> | 2025-01-17 15:56:20 -0800 |
commit | 286a365b9c25479f8ad82043ed136748733adfa6 (patch) | |
tree | 162996323c339d82211c10d2d93bf3470e226b1a /src/common/unicode/case_test.c | |
parent | 6a9b2a631aa3dbac5b351cd0c45631cbc6e8d19e (diff) |
Support Unicode full case mapping and conversion.
Generate tables from Unicode SpecialCasing.txt to support more
sophisticated case mapping behavior:

* support case mappings to multiple codepoints, such as "ß"
  uppercasing to "SS"
* support conditional case mappings, such as the "final sigma"
* support titlecase variants, such as "dž" uppercasing to "DŽ" but
  titlecasing to "Dž"
Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
Diffstat (limited to 'src/common/unicode/case_test.c')
-rw-r--r-- | src/common/unicode/case_test.c | 202 |
1 file changed, 191 insertions, 11 deletions
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c index 52c4a8bbc69..c4ba7e781be 100644 --- a/src/common/unicode/case_test.c +++ b/src/common/unicode/case_test.c @@ -18,12 +18,61 @@ #include <wctype.h> #ifdef USE_ICU +#include <unicode/ucasemap.h> #include <unicode/uchar.h> #endif #include "common/unicode_case.h" #include "common/unicode_category.h" #include "common/unicode_version.h" +/* enough to hold largest source or result string, including NUL */ +#define BUFSZ 256 + +#ifdef USE_ICU +static UCaseMap * casemap = NULL; +#endif + +typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src, + ssize_t srclen); + +/* simple boundary iterator copied from pg_locale_builtin.c */ +struct WordBoundaryState +{ + const char *str; + size_t len; + size_t offset; + bool init; + bool prev_alnum; +}; + +static size_t +initcap_wbnext(void *state) +{ + struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state; + + while (wbstate->offset < wbstate->len && + wbstate->str[wbstate->offset] != '\0') + { + pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str + + wbstate->offset); + bool curr_alnum = pg_u_isalnum(u, true); + + if (!wbstate->init || curr_alnum != wbstate->prev_alnum) + { + size_t prev_offset = wbstate->offset; + + wbstate->init = true; + wbstate->offset += unicode_utf8len(u); + wbstate->prev_alnum = curr_alnum; + return prev_offset; + } + + wbstate->offset += unicode_utf8len(u); + } + + return wbstate->len; +} + #ifdef USE_ICU static void @@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code) } } +static void +icu_test_full(char *str) +{ + char lower[BUFSZ]; + char title[BUFSZ]; + char upper[BUFSZ]; + char icu_lower[BUFSZ]; + char icu_title[BUFSZ]; + char icu_upper[BUFSZ]; + UErrorCode status; + struct WordBoundaryState wbstate = { + .str = str, + .len = strlen(str), + .offset = 0, + .init = false, + .prev_alnum = false, + }; + + unicode_strlower(lower, BUFSZ, str, -1, true); + unicode_strtitle(title, BUFSZ, str, -1, 
true, initcap_wbnext, &wbstate); + unicode_strupper(upper, BUFSZ, str, -1, true); + status = U_ZERO_ERROR; + ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status); + status = U_ZERO_ERROR; + ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status); + status = U_ZERO_ERROR; + ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status); + + if (strcmp(lower, icu_lower) != 0) + { + printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower, + icu_lower); + exit(1); + } + if (strcmp(title, icu_title) != 0) + { + printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title, + icu_title); + exit(1); + } + if (strcmp(upper, icu_upper) != 0) + { + printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper, + icu_upper); + exit(1); + } +} + /* * Exhaustively compare case mappings with the results from ICU. */ @@ -64,6 +161,7 @@ test_icu(void) if (category != PG_U_UNASSIGNED) { uint8_t icu_category = u_charType(code); + char code_str[5] = {0}; if (icu_category == PG_U_UNASSIGNED) { @@ -72,6 +170,9 @@ test_icu(void) } icu_test_simple(code); + unicode_to_utf8(code, (unsigned char *) code_str); + icu_test_full(code_str); + successful++; } } @@ -86,7 +187,7 @@ test_icu(void) #endif static void -test_strlower(const char *test_string, const char *expected) +test_convert(TestFunc tfunc, const char *test_string, const char *expected) { size_t src1len = strlen(test_string); size_t src2len = -1; /* NUL-terminated */ @@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected) /* neither source nor destination are NUL-terminated */ memset(dst1, 0x7F, dst1len); - needed = unicode_strlower(dst1, dst1len, src1, src1len); + needed = tfunc(dst1, dst1len, src1, src1len); if (needed != strlen(expected)) { - printf("case_test: convert_case test1 FAILURE: needed %zu\n", needed); + printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n", + test_string, needed, strlen(expected)); exit(1); } if 
(memcmp(dst1, expected, dst1len) != 0) @@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected) /* destination is NUL-terminated and source is not */ memset(dst2, 0x7F, dst2len); - needed = unicode_strlower(dst2, dst2len, src1, src1len); + needed = tfunc(dst2, dst2len, src1, src1len); if (needed != strlen(expected)) { - printf("case_test: convert_case test2 FAILURE: needed %zu\n", needed); + printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n", + test_string, needed, strlen(expected)); exit(1); } if (strcmp(dst2, expected) != 0) @@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected) /* source is NUL-terminated and destination is not */ memset(dst1, 0x7F, dst1len); - needed = unicode_strlower(dst1, dst1len, src2, src2len); + needed = tfunc(dst1, dst1len, src2, src2len); if (needed != strlen(expected)) { + printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n", + test_string, needed, strlen(expected)); printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed); exit(1); } @@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected) /* both source and destination are NUL-terminated */ memset(dst2, 0x7F, dst2len); - needed = unicode_strlower(dst2, dst2len, src2, src2len); + needed = tfunc(dst2, dst2len, src2, src2len); if (needed != strlen(expected)) { - printf("case_test: convert_case test4 FAILURE: needed %zu\n", needed); + printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n", + test_string, needed, strlen(expected)); exit(1); } if (strcmp(dst2, expected) != 0) @@ -166,15 +272,69 @@ test_strlower(const char *test_string, const char *expected) free(dst2); } +static size_t +tfunc_lower(char *dst, size_t dstsize, const char *src, + ssize_t srclen) +{ + return unicode_strlower(dst, dstsize, src, srclen, true); +} + +static size_t +tfunc_title(char *dst, size_t dstsize, const char *src, + ssize_t srclen) +{ + 
struct WordBoundaryState wbstate = { + .str = src, + .len = srclen, + .offset = 0, + .init = false, + .prev_alnum = false, + }; + + return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext, + &wbstate); +} + +static size_t +tfunc_upper(char *dst, size_t dstsize, const char *src, + ssize_t srclen) +{ + return unicode_strupper(dst, dstsize, src, srclen, true); +} + + static void test_convert_case() { /* test string with no case changes */ - test_strlower("√∞", "√∞"); + test_convert(tfunc_lower, "√∞", "√∞"); + /* test adjust-to-cased behavior */ + test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz"); /* test string with case changes */ - test_strlower("ABC", "abc"); + test_convert(tfunc_upper, "abc", "ABC"); /* test string with case changes and byte length changes */ - test_strlower("ȺȺȺ", "ⱥⱥⱥ"); + test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ"); + /* test special case conversions */ + test_convert(tfunc_upper, "ß", "SS"); + test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307"); + test_convert(tfunc_upper, "ıiIİ", "IIIİ"); + /* test final sigma */ + test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς"); + test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'"); + test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς"); + +#ifdef USE_ICU + icu_test_full(""); + icu_test_full("ȺȺȺ"); + icu_test_full("ßßß"); + icu_test_full("√∞"); + icu_test_full("a b"); + icu_test_full("abc 123xyz"); + icu_test_full("σςΣ ΣΣΣ"); + icu_test_full("ıiIİ"); + /* test <alpha><iota_subscript><acute> */ + icu_test_full("\u0391\u0345\u0301"); +#endif printf("case_test: convert_case: success\n"); } @@ -182,6 +342,22 @@ test_convert_case() int main(int argc, char **argv) { +#ifdef USE_ICU + UErrorCode status = U_ZERO_ERROR; + + /* + * Disable ICU's word break adjustment for titlecase to match the expected + * behavior of unicode_strtitle(). 
+ */ + casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status); + if (U_FAILURE(status)) + { + printf("case_test: failure opening UCaseMap: %s\n", + u_errorName(status)); + exit(1); + } +#endif + printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION); #ifdef USE_ICU printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION); @@ -191,5 +367,9 @@ main(int argc, char **argv) #endif test_convert_case(); + +#ifdef USE_ICU + ucasemap_close(casemap); +#endif exit(0); } |