Support Unicode full case mapping and conversion.

Generate tables from Unicode SpecialCasing.txt to support more sophisticated case mapping behavior:

 * support case mappings to multiple codepoints, such as "ß" uppercasing to "SS"
 * support conditional case mappings, such as the "final sigma"
 * support titlecase variants, such as "ǆ" uppercasing to "Ǆ" but titlecasing to "ǅ"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
author: Jeff Davis <jdavis@postgresql.org> 2025-01-17 15:56:20 -0800
committer: Jeff Davis <jdavis@postgresql.org> 2025-01-17 15:56:20 -0800
commit: 286a365b9c25479f8ad82043ed136748733adfa6 (patch)
tree: 162996323c339d82211c10d2d93bf3470e226b1a /src/common/unicode/case_test.c
parent: 6a9b2a631aa3dbac5b351cd0c45631cbc6e8d19e (diff)
1 files changed, 191 insertions, 11 deletions
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index 52c4a8bbc69..c4ba7e781be 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -18,12 +18,61 @@
 #include <wctype.h>
 
 #ifdef USE_ICU
+#include <unicode/ucasemap.h>
 #include <unicode/uchar.h>
 #endif
 #include "common/unicode_case.h"
 #include "common/unicode_category.h"
 #include "common/unicode_version.h"
 
+/* enough to hold largest source or result string, including NUL */
+#define BUFSZ 256
+
+#ifdef USE_ICU
+static UCaseMap * casemap = NULL;	/* opened and closed in main() */
+#endif
+
+typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
+							ssize_t srclen);	/* signature of the tfunc_*() adapters taken by test_convert() */
+
+/*
+ * Simple boundary iterator copied from pg_locale_builtin.c.
+ *
+ * This is the state that initcap_wbnext() receives through its void pointer
+ * argument; callers in this file zero offset, init and prev_alnum before
+ * handing it to unicode_strtitle().
+ */
+struct WordBoundaryState
+{
+	const char *str;			/* UTF-8 text being scanned */
+	size_t		len;			/* scan limit in bytes; a NUL byte also ends the scan */
+	size_t		offset;			/* byte offset of the next character to examine */
+	bool		init;			/* false until the first boundary has been returned */
+	bool		prev_alnum;		/* alnum status recorded at the last boundary */
+};
+/*
+ * Word-boundary callback handed to unicode_strtitle() in this file.
+ *
+ * state points to a struct WordBoundaryState. Each call resumes scanning at
+ * wbstate->offset and returns the byte offset of the next character that is
+ * either the first one examined (init still false) or whose pg_u_isalnum()
+ * result differs from prev_alnum. The state is advanced past that character
+ * before returning. Once offset reaches len, or a NUL byte is found, the
+ * function returns wbstate->len.
+ */
+static size_t
+initcap_wbnext(void *state)
+{
+	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+
+	while (wbstate->offset < wbstate->len &&
+		   wbstate->str[wbstate->offset] != '\0')
+	{
+		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str +
+										wbstate->offset);
+		bool		curr_alnum = pg_u_isalnum(u, true);
+
+		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+		{
+			size_t		prev_offset = wbstate->offset;	/* boundary is the start of this character */
+
+			wbstate->init = true;
+			wbstate->offset += unicode_utf8len(u);	/* step past it so the next call resumes after */
+			wbstate->prev_alnum = curr_alnum;
+			return prev_offset;
+		}
+
+		wbstate->offset += unicode_utf8len(u);	/* same class as before: keep scanning */
+	}
+
+	return wbstate->len;		/* no further boundary within the scan limit */
+}
+
 #ifdef USE_ICU
 
 static void
@@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)
 	}
 }
 
+static void
+icu_test_full(char *str)
+{
+	char		lower[BUFSZ];
+	char		title[BUFSZ];
+	char		upper[BUFSZ];
+	char		icu_lower[BUFSZ];
+	char		icu_title[BUFSZ];
+	char		icu_upper[BUFSZ];
+	UErrorCode	status;
+	struct WordBoundaryState wbstate = {
+		.str = str,
+		.len = strlen(str),
+		.offset = 0,
+		.init = false,
+		.prev_alnum = false,
+	};
+	/*
+	 * Convert str to lower, title and upper case twice: once with the
+	 * unicode_str*() functions and once with ICU through the file-level
+	 * casemap. Every call is given a source length of -1. Each pair of
+	 * results must then compare equal, otherwise the mismatch is printed and
+	 * the program exits with status 1.
+	 *
+	 * NOTE(review): the unicode_str*() return values and the ICU status are
+	 * never inspected, so the strcmp() calls below presumably rely on every
+	 * result fitting in BUFSZ bytes with a terminating NUL -- confirm.
+	 */
+	unicode_strlower(lower, BUFSZ, str, -1, true);
+	unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
+	unicode_strupper(upper, BUFSZ, str, -1, true);
+	status = U_ZERO_ERROR;		/* reset before each ICU call */
+	ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
+	status = U_ZERO_ERROR;
+	ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
+	status = U_ZERO_ERROR;
+	ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
+
+	if (strcmp(lower, icu_lower) != 0)
+	{
+		printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
+			   icu_lower);
+		exit(1);
+	}
+	if (strcmp(title, icu_title) != 0)
+	{
+		printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
+			   icu_title);
+		exit(1);
+	}
+	if (strcmp(upper, icu_upper) != 0)
+	{
+		printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
+			   icu_upper);
+		exit(1);
+	}
+}
+
 /*
  * Exhaustively compare case mappings with the results from ICU.
  */
@@ -64,6 +161,7 @@ test_icu(void)
 		if (category != PG_U_UNASSIGNED)
 		{
 			uint8_t		icu_category = u_charType(code);
+			char		code_str[5] = {0};
 
 			if (icu_category == PG_U_UNASSIGNED)
 			{
@@ -72,6 +170,9 @@ test_icu(void)
 			}
 
 			icu_test_simple(code);
+			unicode_to_utf8(code, (unsigned char *) code_str);
+			icu_test_full(code_str);
+
 			successful++;
 		}
 	}
@@ -86,7 +187,7 @@ test_icu(void)
 #endif
 
 static void
-test_strlower(const char *test_string, const char *expected)
+test_convert(TestFunc tfunc, const char *test_string, const char *expected)
 {
 	size_t		src1len = strlen(test_string);
 	size_t		src2len = -1;	/* NUL-terminated */
@@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)
 
 	/* neither source nor destination are NUL-terminated */
 	memset(dst1, 0x7F, dst1len);
-	needed = unicode_strlower(dst1, dst1len, src1, src1len);
+	needed = tfunc(dst1, dst1len, src1, src1len);
 	if (needed != strlen(expected))
 	{
-		printf("case_test: convert_case test1 FAILURE: needed %zu\n", needed);
+		printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
+			   test_string, needed, strlen(expected));
 		exit(1);
 	}
 	if (memcmp(dst1, expected, dst1len) != 0)
@@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)
 
 	/* destination is NUL-terminated and source is not */
 	memset(dst2, 0x7F, dst2len);
-	needed = unicode_strlower(dst2, dst2len, src1, src1len);
+	needed = tfunc(dst2, dst2len, src1, src1len);
 	if (needed != strlen(expected))
 	{
-		printf("case_test: convert_case test2 FAILURE: needed %zu\n", needed);
+		printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
+			   test_string, needed, strlen(expected));
 		exit(1);
 	}
 	if (strcmp(dst2, expected) != 0)
@@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected)
 
 	/* source is NUL-terminated and destination is not */
 	memset(dst1, 0x7F, dst1len);
-	needed = unicode_strlower(dst1, dst1len, src2, src2len);
+	needed = tfunc(dst1, dst1len, src2, src2len);
 	if (needed != strlen(expected))
 	{
+		printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
+			   test_string, needed, strlen(expected));
 		printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
 		exit(1);
 	}
@@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)
 
 	/* both source and destination are NUL-terminated */
 	memset(dst2, 0x7F, dst2len);
-	needed = unicode_strlower(dst2, dst2len, src2, src2len);
+	needed = tfunc(dst2, dst2len, src2, src2len);
 	if (needed != strlen(expected))
 	{
-		printf("case_test: convert_case test4 FAILURE: needed %zu\n", needed);
+		printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
+			   test_string, needed, strlen(expected));
 		exit(1);
 	}
 	if (strcmp(dst2, expected) != 0)
@@ -166,15 +272,69 @@ test_strlower(const char *test_string, const char *expected)
 	free(dst2);
 }
 
+static size_t
+tfunc_lower(char *dst, size_t dstsize, const char *src,
+			ssize_t srclen)
+{
+	return unicode_strlower(dst, dstsize, src, srclen, true);	/* TestFunc adapter: final argument fixed to true */
+}
+/*
+ * TestFunc adapter for unicode_strtitle(): builds a fresh WordBoundaryState
+ * over src and supplies initcap_wbnext() as the word-boundary callback.
+ *
+ * srclen is stored unchanged into the size_t len field, so a srclen of -1
+ * turns into a very large limit; initcap_wbnext() then ends its scan at the
+ * NUL byte instead.
+ */
+static size_t
+tfunc_title(char *dst, size_t dstsize, const char *src,
+			ssize_t srclen)
+{
+	struct WordBoundaryState wbstate = {
+		.str = src,
+		.len = srclen,
+		.offset = 0,
+		.init = false,
+		.prev_alnum = false,
+	};
+
+	return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
+							&wbstate);
+}
+/*
+ * TestFunc adapter for unicode_strupper(); passes true as its final argument.
+ */
+static size_t
+tfunc_upper(char *dst, size_t dstsize, const char *src,
+			ssize_t srclen)
+{
+	return unicode_strupper(dst, dstsize, src, srclen, true);
+}
+
+/*
+ * Run the fixed conversion cases through test_convert() with the tfunc_*()
+ * adapters and, when built with USE_ICU, pass sample strings to
+ * icu_test_full(). Prints a success line once every case has returned.
+ *
+ * NOTE(review): the definition uses an empty parameter list rather than
+ * (void).
+ */
 static void
 test_convert_case()
 {
 	/* test string with no case changes */
-	test_strlower("√∞", "√∞");
+	test_convert(tfunc_lower, "√∞", "√∞");
+	/* test adjust-to-cased behavior */
+	test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
 	/* test string with case changes */
-	test_strlower("ABC", "abc");
+	test_convert(tfunc_upper, "abc", "ABC");
 	/* test string with case changes and byte length changes */
-	test_strlower("ȺȺȺ", "ⱥⱥⱥ");
+	test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
+	/* test special case conversions */
+	test_convert(tfunc_upper, "ß", "SS");
+	test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
+	test_convert(tfunc_upper, "ıiIİ", "IIIİ");
+	/* test final sigma */
+	test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
+	test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
+	test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
+
+#ifdef USE_ICU
+	icu_test_full("");
+	icu_test_full("ȺȺȺ");
+	icu_test_full("ßßß");
+	icu_test_full("√∞");
+	icu_test_full("a b");
+	icu_test_full("abc 123xyz");
+	icu_test_full("σςΣ ΣΣΣ");
+	icu_test_full("ıiIİ");
+	/* test <alpha><iota_subscript><acute> */
+	icu_test_full("\u0391\u0345\u0301");
+#endif
 
 	printf("case_test: convert_case: success\n");
 }
@@ -182,6 +342,22 @@ test_convert_case()
 int
 main(int argc, char **argv)
 {
+#ifdef USE_ICU
+	UErrorCode	status = U_ZERO_ERROR;
+
+	/*
+	 * Disable ICU's word break adjustment for titlecase to match the expected
+	 * behavior of unicode_strtitle().
+	 */
+	casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status);
+	if (U_FAILURE(status))
+	{
+		printf("case_test: failure opening UCaseMap: %s\n",
+			   u_errorName(status));
+		exit(1);
+	}
+#endif
+
 	printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
 #ifdef USE_ICU
 	printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
@@ -191,5 +367,9 @@ main(int argc, char **argv)
 #endif
 
 	test_convert_case();
+
+#ifdef USE_ICU
+	ucasemap_close(casemap);
+#endif
 	exit(0);
 }
author	Jeff Davis <jdavis@postgresql.org>	2025-01-17 15:56:20 -0800
committer	Jeff Davis <jdavis@postgresql.org>	2025-01-17 15:56:20 -0800
commit	286a365b9c25479f8ad82043ed136748733adfa6 (patch)
tree	162996323c339d82211c10d2d93bf3470e226b1a /src/common/unicode/case_test.c
parent	6a9b2a631aa3dbac5b351cd0c45631cbc6e8d19e (diff)