summaryrefslogtreecommitdiff
path: root/src/common/unicode/case_test.c
diff options
context:
space:
mode:
author Jeff Davis <jdavis@postgresql.org> 2025-01-17 15:56:20 -0800
committer Jeff Davis <jdavis@postgresql.org> 2025-01-17 15:56:20 -0800
commit 286a365b9c25479f8ad82043ed136748733adfa6 (patch)
tree 162996323c339d82211c10d2d93bf3470e226b1a /src/common/unicode/case_test.c
parent 6a9b2a631aa3dbac5b351cd0c45631cbc6e8d19e (diff)
Support Unicode full case mapping and conversion.
Generate tables from Unicode SpecialCasing.txt to support more
sophisticated case mapping behavior:

* support case mappings to multiple codepoints, such as "ß" uppercasing to "SS"
* support conditional case mappings, such as the "final sigma"
* support titlecase variants, such as "dž" uppercasing to "DŽ" but titlecasing to "Dž"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
Diffstat (limited to 'src/common/unicode/case_test.c')
-rw-r--r--src/common/unicode/case_test.c202
1 files changed, 191 insertions, 11 deletions
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index 52c4a8bbc69..c4ba7e781be 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -18,12 +18,61 @@
#include <wctype.h>
#ifdef USE_ICU
+#include <unicode/ucasemap.h>
#include <unicode/uchar.h>
#endif
#include "common/unicode_case.h"
#include "common/unicode_category.h"
#include "common/unicode_version.h"
+/* enough to hold largest source or result string, including NUL */
+#define BUFSZ 256
+
+#ifdef USE_ICU
+static UCaseMap * casemap = NULL;
+#endif
+
+/*
+ * Common signature for the tfunc_* wrappers, so that test_convert() can run
+ * any of the conversions through the same checks.  test_convert() passes
+ * either the source length in bytes or -1 for a NUL-terminated source, and
+ * compares the return value against the length of the expected result.
+ */
+typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
+ ssize_t srclen);
+
+/* simple boundary iterator copied from pg_locale_builtin.c */
+struct WordBoundaryState
+{
+ const char *str; /* UTF-8 text being scanned */
+ size_t len; /* upper bound for offset; scan also stops at NUL */
+ size_t offset; /* byte offset of the next character to examine */
+ bool init; /* true once the first boundary was returned */
+ bool prev_alnum; /* pg_u_isalnum() result at the last boundary */
+};
+
+/*
+ * Word boundary callback handed to unicode_strtitle().
+ *
+ * "state" must point to a struct WordBoundaryState.  Each call resumes the
+ * scan at wbstate->offset and returns the byte offset of the next boundary:
+ * the first character examined, and after that every character whose
+ * pg_u_isalnum() result differs from the one recorded at the previous
+ * boundary.  When the scan reaches wbstate->len or a NUL byte without finding
+ * another boundary, wbstate->len is returned.
+ */
+static size_t
+initcap_wbnext(void *state)
+{
+ struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+
+ /* stop at the length limit or at a NUL byte, whichever comes first */
+ while (wbstate->offset < wbstate->len &&
+ wbstate->str[wbstate->offset] != '\0')
+ {
+ pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
+ wbstate->offset);
+ bool curr_alnum = pg_u_isalnum(u, true);
+
+ /*
+ * The first character examined is always reported; afterwards a
+ * boundary is a change between alphanumeric and non-alphanumeric.
+ */
+ if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+ {
+ size_t prev_offset = wbstate->offset;
+
+ /* step past this character so the next call resumes after it */
+ wbstate->init = true;
+ wbstate->offset += unicode_utf8len(u);
+ wbstate->prev_alnum = curr_alnum;
+ return prev_offset;
+ }
+
+ /* same class as the last boundary: keep scanning */
+ wbstate->offset += unicode_utf8len(u);
+ }
+
+ return wbstate->len;
+}
+
#ifdef USE_ICU
static void
@@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)
}
}
+/*
+ * Report an unusable ICU case-conversion result and exit.
+ *
+ * A failure status means the output buffer cannot be trusted, and
+ * U_STRING_NOT_TERMINATED_WARNING means the result filled the buffer without
+ * room for the terminating NUL.  Either way, calling strcmp() on the buffer
+ * would be unsafe, so stop here with a diagnostic instead.
+ */
+static void
+icu_check_status(const char *opname, const char *str, UErrorCode status)
+{
+	if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
+	{
+		printf("case_test: str='%s' ICU %s failed: %s\n", str, opname,
+			   u_errorName(status));
+		exit(1);
+	}
+}
+
+/*
+ * Convert the NUL-terminated UTF-8 string "str" to lower, title and upper
+ * case with unicode_strlower(), unicode_strtitle() and unicode_strupper(),
+ * and compare each result with what ICU produces through the global casemap.
+ *
+ * Prints a diagnostic and exits with status 1 on the first mismatch, or if
+ * an ICU conversion does not yield a usable NUL-terminated result.
+ */
+static void
+icu_test_full(char *str)
+{
+	char		lower[BUFSZ];
+	char		title[BUFSZ];
+	char		upper[BUFSZ];
+	char		icu_lower[BUFSZ];
+	char		icu_title[BUFSZ];
+	char		icu_upper[BUFSZ];
+	UErrorCode	status;
+	struct WordBoundaryState wbstate = {
+		.str = str,
+		.len = strlen(str),
+		.offset = 0,
+		.init = false,
+		.prev_alnum = false,
+	};
+
+	unicode_strlower(lower, BUFSZ, str, -1, true);
+	unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
+	unicode_strupper(upper, BUFSZ, str, -1, true);
+
+	/* ICU only acts when status is not a failure, so reset before each call */
+	status = U_ZERO_ERROR;
+	ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
+	icu_check_status("ucasemap_utf8ToLower", str, status);
+
+	status = U_ZERO_ERROR;
+	ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
+	icu_check_status("ucasemap_utf8ToTitle", str, status);
+
+	status = U_ZERO_ERROR;
+	ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
+	icu_check_status("ucasemap_utf8ToUpper", str, status);
+
+	if (strcmp(lower, icu_lower) != 0)
+	{
+		printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
+			   icu_lower);
+		exit(1);
+	}
+	if (strcmp(title, icu_title) != 0)
+	{
+		printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
+			   icu_title);
+		exit(1);
+	}
+	if (strcmp(upper, icu_upper) != 0)
+	{
+		printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
+			   icu_upper);
+		exit(1);
+	}
+}
+
+
/*
* Exhaustively compare case mappings with the results from ICU.
*/
@@ -64,6 +161,7 @@ test_icu(void)
if (category != PG_U_UNASSIGNED)
{
uint8_t icu_category = u_charType(code);
+ char code_str[5] = {0};
if (icu_category == PG_U_UNASSIGNED)
{
@@ -72,6 +170,9 @@ test_icu(void)
}
icu_test_simple(code);
+ unicode_to_utf8(code, (unsigned char *) code_str);
+ icu_test_full(code_str);
+
successful++;
}
}
@@ -86,7 +187,7 @@ test_icu(void)
#endif
static void
-test_strlower(const char *test_string, const char *expected)
+test_convert(TestFunc tfunc, const char *test_string, const char *expected)
{
size_t src1len = strlen(test_string);
size_t src2len = -1; /* NUL-terminated */
@@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)
/* neither source nor destination are NUL-terminated */
memset(dst1, 0x7F, dst1len);
- needed = unicode_strlower(dst1, dst1len, src1, src1len);
+ needed = tfunc(dst1, dst1len, src1, src1len);
if (needed != strlen(expected))
{
- printf("case_test: convert_case test1 FAILURE: needed %zu\n", needed);
+ printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
+ test_string, needed, strlen(expected));
exit(1);
}
if (memcmp(dst1, expected, dst1len) != 0)
@@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)
/* destination is NUL-terminated and source is not */
memset(dst2, 0x7F, dst2len);
- needed = unicode_strlower(dst2, dst2len, src1, src1len);
+ needed = tfunc(dst2, dst2len, src1, src1len);
if (needed != strlen(expected))
{
- printf("case_test: convert_case test2 FAILURE: needed %zu\n", needed);
+ printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
+ test_string, needed, strlen(expected));
exit(1);
}
if (strcmp(dst2, expected) != 0)
@@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected)
/* source is NUL-terminated and destination is not */
memset(dst1, 0x7F, dst1len);
- needed = unicode_strlower(dst1, dst1len, src2, src2len);
+ needed = tfunc(dst1, dst1len, src2, src2len);
if (needed != strlen(expected))
{
+ printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
+ test_string, needed, strlen(expected));
printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
exit(1);
}
@@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)
/* both source and destination are NUL-terminated */
memset(dst2, 0x7F, dst2len);
- needed = unicode_strlower(dst2, dst2len, src2, src2len);
+ needed = tfunc(dst2, dst2len, src2, src2len);
if (needed != strlen(expected))
{
- printf("case_test: convert_case test4 FAILURE: needed %zu\n", needed);
+ printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
+ test_string, needed, strlen(expected));
exit(1);
}
if (strcmp(dst2, expected) != 0)
@@ -166,15 +272,69 @@ test_strlower(const char *test_string, const char *expected)
free(dst2);
}
+/*
+ * TestFunc wrapper for unicode_strlower().  The trailing "true" argument
+ * presumably selects full case mapping -- confirm against unicode_case.h.
+ */
+static size_t
+tfunc_lower(char *dst, size_t dstsize, const char *src,
+ ssize_t srclen)
+{
+ return unicode_strlower(dst, dstsize, src, srclen, true);
+}
+
+/*
+ * TestFunc wrapper for unicode_strtitle(), using initcap_wbnext() with a
+ * fresh WordBoundaryState as the word boundary callback.
+ *
+ * NOTE(review): test_convert() also calls this with srclen = -1 for a
+ * NUL-terminated source.  Storing that in the size_t "len" field turns it
+ * into a very large value, so initcap_wbnext() then relies on its NUL check
+ * to end the scan, and its end-of-string return value is that large value
+ * rather than the actual string length.
+ */
+static size_t
+tfunc_title(char *dst, size_t dstsize, const char *src,
+ ssize_t srclen)
+{
+ struct WordBoundaryState wbstate = {
+ .str = src,
+ .len = srclen,
+ .offset = 0,
+ .init = false,
+ .prev_alnum = false,
+ };
+
+ return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
+ &wbstate);
+}
+
+/*
+ * TestFunc wrapper for unicode_strupper().  The trailing "true" argument
+ * presumably selects full case mapping -- confirm against unicode_case.h.
+ */
+static size_t
+tfunc_upper(char *dst, size_t dstsize, const char *src,
+ ssize_t srclen)
+{
+ return unicode_strupper(dst, dstsize, src, srclen, true);
+}
+
+
+/*
+ * Run the fixed lower/title/upper conversion cases through test_convert()
+ * and, when built with USE_ICU, cross-check a set of strings against ICU
+ * with icu_test_full().  Both helpers exit(1) on a failure, so the success
+ * line is printed only if every case passed.
+ */
static void
test_convert_case()
{
/* test string with no case changes */
- test_strlower("√∞", "√∞");
+ test_convert(tfunc_lower, "√∞", "√∞");
+ /* test adjust-to-cased behavior */
+ test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
/* test string with case changes */
- test_strlower("ABC", "abc");
+ test_convert(tfunc_upper, "abc", "ABC");
/* test string with case changes and byte length changes */
- test_strlower("ȺȺȺ", "ⱥⱥⱥ");
+ test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
+ /* test special case conversions */
+ test_convert(tfunc_upper, "ß", "SS");
+ test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
+ test_convert(tfunc_upper, "ıiIİ", "IIIİ");
+ /* test final sigma */
+ test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
+ test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
+ test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
+
+#ifdef USE_ICU
+ icu_test_full("");
+ icu_test_full("ȺȺȺ");
+ icu_test_full("ßßß");
+ icu_test_full("√∞");
+ icu_test_full("a b");
+ icu_test_full("abc 123xyz");
+ icu_test_full("σςΣ ΣΣΣ");
+ icu_test_full("ıiIİ");
+ /* test <alpha><iota_subscript><acute> */
+ icu_test_full("\u0391\u0345\u0301");
+#endif
printf("case_test: convert_case: success\n");
}
@@ -182,6 +342,22 @@ test_convert_case()
int
main(int argc, char **argv)
{
+#ifdef USE_ICU
+ UErrorCode status = U_ZERO_ERROR;
+
+ /*
+ * Disable ICU's word break adjustment for titlecase to match the expected
+ * behavior of unicode_strtitle().
+ */
+ casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status);
+ if (U_FAILURE(status))
+ {
+ printf("case_test: failure opening UCaseMap: %s\n",
+ u_errorName(status));
+ exit(1);
+ }
+#endif
+
printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
#ifdef USE_ICU
printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
@@ -191,5 +367,9 @@ main(int argc, char **argv)
#endif
test_convert_case();
+
+#ifdef USE_ICU
+ ucasemap_close(casemap);
+#endif
exit(0);
}