Use C11 char16_t and char32_t for Unicode code points.

Reviewed-by: Tatsuo Ishii <ishii@postgresql.org>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
Discussion: https://postgr.es/m/bedcc93d06203dfd89815b10f815ca2de8626e85.camel%40j-davis.com
author: Jeff Davis <jdavis@postgresql.org> 2025-10-29 14:17:13 -0700
committer: Jeff Davis <jdavis@postgresql.org> 2025-10-29 14:17:13 -0700
commit: 3853a6956c3e3bc7a6fa9bcdb205a2997f46bac2 (patch)
tree: d53058de062edbdbae7fadccf5f40b11147d2293 /src/common
parent: 16edc1b94fc2db6e6a376471e280b50a418907c2 (diff)
10 files changed, 131 insertions, 130 deletions
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 97beb47940b..101e8d65a4d 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -47,7 +47,7 @@
 
 /* Prototypes for local functions */
 static int	codepoint_range_cmp(const void *a, const void *b);
-static bool is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize);
+static bool is_code_in_table(char32_t code, const char32_t *map, int mapsize);
 static int	pg_utf8_string_len(const char *source);
 
 /*
@@ -64,7 +64,7 @@ static int	pg_utf8_string_len(const char *source);
  *
  * These are all mapped to the ASCII space character (U+00A0).
  */
-static const pg_wchar non_ascii_space_ranges[] =
+static const char32_t non_ascii_space_ranges[] =
 {
 	0x00A0, 0x00A0,
 	0x1680, 0x1680,
@@ -79,7 +79,7 @@ static const pg_wchar non_ascii_space_ranges[] =
  *
  * If any of these appear in the input, they are removed.
  */
-static const pg_wchar commonly_mapped_to_nothing_ranges[] =
+static const char32_t commonly_mapped_to_nothing_ranges[] =
 {
 	0x00AD, 0x00AD,
 	0x034F, 0x034F,
@@ -114,7 +114,7 @@ static const pg_wchar commonly_mapped_to_nothing_ranges[] =
  * tables, so one code might originate from multiple source tables.
  * Adjacent ranges have also been merged together, to save space.
  */
-static const pg_wchar prohibited_output_ranges[] =
+static const char32_t prohibited_output_ranges[] =
 {
 	0x0000, 0x001F,				/* C.2.1 */
 	0x007F, 0x00A0,				/* C.1.2, C.2.1, C.2.2 */
@@ -155,7 +155,7 @@ static const pg_wchar prohibited_output_ranges[] =
 };
 
 /* A.1 Unassigned code points in Unicode 3.2 */
-static const pg_wchar unassigned_codepoint_ranges[] =
+static const char32_t unassigned_codepoint_ranges[] =
 {
 	0x0221, 0x0221,
 	0x0234, 0x024F,
@@ -556,7 +556,7 @@ static const pg_wchar unassigned_codepoint_ranges[] =
 };
 
 /* D.1 Characters with bidirectional property "R" or "AL" */
-static const pg_wchar RandALCat_codepoint_ranges[] =
+static const char32_t RandALCat_codepoint_ranges[] =
 {
 	0x05BE, 0x05BE,
 	0x05C0, 0x05C0,
@@ -595,7 +595,7 @@ static const pg_wchar RandALCat_codepoint_ranges[] =
 };
 
 /* D.2 Characters with bidirectional property "L" */
-static const pg_wchar LCat_codepoint_ranges[] =
+static const char32_t LCat_codepoint_ranges[] =
 {
 	0x0041, 0x005A,
 	0x0061, 0x007A,
@@ -968,8 +968,8 @@ static const pg_wchar LCat_codepoint_ranges[] =
 static int
 codepoint_range_cmp(const void *a, const void *b)
 {
-	const pg_wchar *key = (const pg_wchar *) a;
-	const pg_wchar *range = (const pg_wchar *) b;
+	const char32_t *key = (const char32_t *) a;
+	const char32_t *range = (const char32_t *) b;
 
 	if (*key < range[0])
 		return -1;				/* less than lower bound */
@@ -980,14 +980,14 @@ codepoint_range_cmp(const void *a, const void *b)
 }
 
 static bool
-is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize)
+is_code_in_table(char32_t code, const char32_t *map, int mapsize)
 {
 	Assert(mapsize % 2 == 0);
 
 	if (code < map[0] || code > map[mapsize - 1])
 		return false;
 
-	if (bsearch(&code, map, mapsize / 2, sizeof(pg_wchar) * 2,
+	if (bsearch(&code, map, mapsize / 2, sizeof(char32_t) * 2,
 				codepoint_range_cmp))
 		return true;
 	else
@@ -1046,8 +1046,8 @@ pg_utf8_string_len(const char *source)
 pg_saslprep_rc
 pg_saslprep(const char *input, char **output)
 {
-	pg_wchar   *input_chars = NULL;
-	pg_wchar   *output_chars = NULL;
+	char32_t   *input_chars = NULL;
+	char32_t   *output_chars = NULL;
 	int			input_size;
 	char	   *result;
 	int			result_size;
@@ -1055,7 +1055,7 @@ pg_saslprep(const char *input, char **output)
 	int			i;
 	bool		contains_RandALCat;
 	unsigned char *p;
-	pg_wchar   *wp;
+	char32_t   *wp;
 
 	/* Ensure we return *output as NULL on failure */
 	*output = NULL;
@@ -1080,10 +1080,10 @@ pg_saslprep(const char *input, char **output)
 	input_size = pg_utf8_string_len(input);
 	if (input_size < 0)
 		return SASLPREP_INVALID_UTF8;
-	if (input_size >= MaxAllocSize / sizeof(pg_wchar))
+	if (input_size >= MaxAllocSize / sizeof(char32_t))
 		goto oom;
 
-	input_chars = ALLOC((input_size + 1) * sizeof(pg_wchar));
+	input_chars = ALLOC((input_size + 1) * sizeof(char32_t));
 	if (!input_chars)
 		goto oom;
 
@@ -1093,7 +1093,7 @@ pg_saslprep(const char *input, char **output)
 		input_chars[i] = utf8_to_unicode(p);
 		p += pg_utf_mblen(p);
 	}
-	input_chars[i] = (pg_wchar) '\0';
+	input_chars[i] = (char32_t) '\0';
 
 	/*
 	 * The steps below correspond to the steps listed in [RFC3454], Section
@@ -1107,7 +1107,7 @@ pg_saslprep(const char *input, char **output)
 	count = 0;
 	for (i = 0; i < input_size; i++)
 	{
-		pg_wchar	code = input_chars[i];
+		char32_t	code = input_chars[i];
 
 		if (IS_CODE_IN_TABLE(code, non_ascii_space_ranges))
 			input_chars[count++] = 0x0020;
@@ -1118,7 +1118,7 @@ pg_saslprep(const char *input, char **output)
 		else
 			input_chars[count++] = code;
 	}
-	input_chars[count] = (pg_wchar) '\0';
+	input_chars[count] = (char32_t) '\0';
 	input_size = count;
 
 	if (input_size == 0)
@@ -1138,7 +1138,7 @@ pg_saslprep(const char *input, char **output)
 	 */
 	for (i = 0; i < input_size; i++)
 	{
-		pg_wchar	code = input_chars[i];
+		char32_t	code = input_chars[i];
 
 		if (IS_CODE_IN_TABLE(code, prohibited_output_ranges))
 			goto prohibited;
@@ -1170,7 +1170,7 @@ pg_saslprep(const char *input, char **output)
 	contains_RandALCat = false;
 	for (i = 0; i < input_size; i++)
 	{
-		pg_wchar	code = input_chars[i];
+		char32_t	code = input_chars[i];
 
 		if (IS_CODE_IN_TABLE(code, RandALCat_codepoint_ranges))
 		{
@@ -1181,12 +1181,12 @@ pg_saslprep(const char *input, char **output)
 
 	if (contains_RandALCat)
 	{
-		pg_wchar	first = input_chars[0];
-		pg_wchar	last = input_chars[input_size - 1];
+		char32_t	first = input_chars[0];
+		char32_t	last = input_chars[input_size - 1];
 
 		for (i = 0; i < input_size; i++)
 		{
-			pg_wchar	code = input_chars[i];
+			char32_t	code = input_chars[i];
 
 			if (IS_CODE_IN_TABLE(code, LCat_codepoint_ranges))
 				goto prohibited;
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index fdfb62e8552..00d4f85e5a5 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -24,6 +24,7 @@
 #include "common/unicode_case.h"
 #include "common/unicode_category.h"
 #include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
 
 /* enough to hold largest source or result string, including NUL */
 #define BUFSZ 256
@@ -54,7 +55,7 @@ initcap_wbnext(void *state)
 	while (wbstate->offset < wbstate->len &&
 		   wbstate->str[wbstate->offset] != '\0')
 	{
-		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str +
+		char32_t	u = utf8_to_unicode((unsigned char *) wbstate->str +
 										wbstate->offset);
 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
@@ -77,16 +78,16 @@ initcap_wbnext(void *state)
 #ifdef USE_ICU
 
 static void
-icu_test_simple(pg_wchar code)
+icu_test_simple(char32_t code)
 {
-	pg_wchar	lower = unicode_lowercase_simple(code);
-	pg_wchar	title = unicode_titlecase_simple(code);
-	pg_wchar	upper = unicode_uppercase_simple(code);
-	pg_wchar	fold = unicode_casefold_simple(code);
-	pg_wchar	iculower = u_tolower(code);
-	pg_wchar	icutitle = u_totitle(code);
-	pg_wchar	icuupper = u_toupper(code);
-	pg_wchar	icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
+	char32_t	lower = unicode_lowercase_simple(code);
+	char32_t	title = unicode_titlecase_simple(code);
+	char32_t	upper = unicode_uppercase_simple(code);
+	char32_t	fold = unicode_casefold_simple(code);
+	char32_t	iculower = u_tolower(code);
+	char32_t	icutitle = u_totitle(code);
+	char32_t	icuupper = u_toupper(code);
+	char32_t	icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
 
 	if (lower != iculower || title != icutitle || upper != icuupper ||
 		fold != icufold)
@@ -172,7 +173,7 @@ test_icu(void)
 	int			successful = 0;
 	int			skipped_mismatch = 0;
 
-	for (pg_wchar code = 0; code <= 0x10ffff; code++)
+	for (char32_t code = 0; code <= 0x10ffff; code++)
 	{
 		pg_unicode_category category = unicode_category(code);
 
diff --git a/src/common/unicode/category_test.c b/src/common/unicode/category_test.c
index 5d37ba39196..1e8c1f7905f 100644
--- a/src/common/unicode/category_test.c
+++ b/src/common/unicode/category_test.c
@@ -22,6 +22,7 @@
 
 #include "common/unicode_category.h"
 #include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
 
 static int	pg_unicode_version = 0;
 #ifdef USE_ICU
@@ -59,7 +60,7 @@ icu_test()
 	int			pg_skipped_codepoints = 0;
 	int			icu_skipped_codepoints = 0;
 
-	for (pg_wchar code = 0; code <= 0x10ffff; code++)
+	for (char32_t code = 0; code <= 0x10ffff; code++)
 	{
 		uint8_t		pg_category = unicode_category(code);
 		uint8_t		icu_category = u_charType(code);
diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl
index 1b401be9409..1a8b908ff33 100644
--- a/src/common/unicode/generate-norm_test_table.pl
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -47,8 +47,8 @@ print $OUTPUT <<HEADER;
 typedef struct
 {
 	int			linenum;
-	pg_wchar	input[50];
-	pg_wchar	output[4][50];
+	char32_t	input[50];
+	char32_t	output[4][50];
 } pg_unicode_test;
 
 /* test table */
diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl
index 5d9ddd62803..f71eb25c94e 100644
--- a/src/common/unicode/generate-unicode_case_table.pl
+++ b/src/common/unicode/generate-unicode_case_table.pl
@@ -270,7 +270,6 @@ print $OT <<"EOS";
  */
 
 #include "common/unicode_case.h"
-#include "mb/pg_wchar.h"
 
 /*
  * The maximum number of codepoints that can result from case mapping
@@ -297,7 +296,7 @@ typedef enum
 typedef struct
 {
 	int16		conditions;
-	pg_wchar	map[NCaseKind][MAX_CASE_EXPANSION];
+	char32_t	map[NCaseKind][MAX_CASE_EXPANSION];
 } pg_special_case;
 
 /*
@@ -430,7 +429,7 @@ foreach my $kind ('lower', 'title', 'upper', 'fold')
  * The entry case_map_${kind}[case_index(codepoint)] is the mapping for the
  * given codepoint.
  */
-static const pg_wchar case_map_$kind\[$index\] =
+static const char32_t case_map_$kind\[$index\] =
 {
 EOS
 
@@ -502,7 +501,7 @@ print $OT <<"EOS";
  * the offset into the mapping tables.
  */
 static inline uint16
-case_index(pg_wchar cp)
+case_index(char32_t cp)
 {
 	/* Fast path for codepoints < $fastpath_limit */
 	if (cp < $fastpath_limit)
diff --git a/src/common/unicode/generate-unicode_category_table.pl b/src/common/unicode/generate-unicode_category_table.pl
index abab5cd9696..7e094b13720 100644
--- a/src/common/unicode/generate-unicode_category_table.pl
+++ b/src/common/unicode/generate-unicode_category_table.pl
@@ -366,15 +366,15 @@ print $OT <<"EOS";
  */
 typedef struct
 {
-	uint32		first;			/* Unicode codepoint */
-	uint32		last;			/* Unicode codepoint */
+	char32_t	first;			/* Unicode codepoint */
+	char32_t	last;			/* Unicode codepoint */
 	uint8		category;		/* General Category */
 } pg_category_range;
 
 typedef struct
 {
-	uint32		first;			/* Unicode codepoint */
-	uint32		last;			/* Unicode codepoint */
+	char32_t	first;			/* Unicode codepoint */
+	char32_t	last;			/* Unicode codepoint */
 } pg_unicode_range;
 
 typedef struct
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
index 25bc59463f2..058817f1719 100644
--- a/src/common/unicode/norm_test.c
+++ b/src/common/unicode/norm_test.c
@@ -20,7 +20,7 @@
 #include "norm_test_table.h"
 
 static char *
-print_wchar_str(const pg_wchar *s)
+print_wchar_str(const char32_t *s)
 {
 #define BUF_DIGITS 50
 	static char buf[BUF_DIGITS * 11 + 1];
@@ -41,7 +41,7 @@ print_wchar_str(const pg_wchar *s)
 }
 
 static int
-pg_wcscmp(const pg_wchar *s1, const pg_wchar *s2)
+pg_wcscmp(const char32_t *s1, const char32_t *s2)
 {
 	for (;;)
 	{
@@ -65,7 +65,7 @@ main(int argc, char **argv)
 	{
 		for (int form = 0; form < 4; form++)
 		{
-			pg_wchar   *result;
+			char32_t   *result;
 
 			result = unicode_normalize(form, test->input);
 
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 073faf6a0d5..e5e494db43c 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -30,7 +30,7 @@ enum CaseMapResult
 /*
  * Map for each case kind.
  */
-static const pg_wchar *const casekind_map[NCaseKind] =
+static const char32_t *const casekind_map[NCaseKind] =
 {
 	[CaseLower] = case_map_lower,
 	[CaseTitle] = case_map_title,
@@ -38,42 +38,42 @@ static const pg_wchar *const casekind_map[NCaseKind] =
 	[CaseFold] = case_map_fold,
 };
 
-static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
+static char32_t find_case_map(char32_t ucs, const char32_t *map);
 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 						   CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
 						   void *wbstate);
-static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
+static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
 								  const char *src, size_t srclen, size_t srcoff,
-								  pg_wchar *simple, const pg_wchar **special);
+								  char32_t *simple, const char32_t **special);
 
-pg_wchar
-unicode_lowercase_simple(pg_wchar code)
+char32_t
+unicode_lowercase_simple(char32_t code)
 {
-	pg_wchar	cp = find_case_map(code, case_map_lower);
+	char32_t	cp = find_case_map(code, case_map_lower);
 
 	return cp != 0 ? cp : code;
 }
 
-pg_wchar
-unicode_titlecase_simple(pg_wchar code)
+char32_t
+unicode_titlecase_simple(char32_t code)
 {
-	pg_wchar	cp = find_case_map(code, case_map_title);
+	char32_t	cp = find_case_map(code, case_map_title);
 
 	return cp != 0 ? cp : code;
 }
 
-pg_wchar
-unicode_uppercase_simple(pg_wchar code)
+char32_t
+unicode_uppercase_simple(char32_t code)
 {
-	pg_wchar	cp = find_case_map(code, case_map_upper);
+	char32_t	cp = find_case_map(code, case_map_upper);
 
 	return cp != 0 ? cp : code;
 }
 
-pg_wchar
-unicode_casefold_simple(pg_wchar code)
+char32_t
+unicode_casefold_simple(char32_t code)
 {
-	pg_wchar	cp = find_case_map(code, case_map_fold);
+	char32_t	cp = find_case_map(code, case_map_fold);
 
 	return cp != 0 ? cp : code;
 }
@@ -231,10 +231,10 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 
 	while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
 	{
-		pg_wchar	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
+		char32_t	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
 		int			u1len = unicode_utf8len(u1);
-		pg_wchar	simple = 0;
-		const pg_wchar *special = NULL;
+		char32_t	simple = 0;
+		const char32_t *special = NULL;
 		enum CaseMapResult casemap_result;
 
 		if (str_casekind == CaseTitle)
@@ -265,8 +265,8 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 			case CASEMAP_SIMPLE:
 				{
 					/* replace with single character */
-					pg_wchar	u2 = simple;
-					pg_wchar	u2len = unicode_utf8len(u2);
+					char32_t	u2 = simple;
+					char32_t	u2len = unicode_utf8len(u2);
 
 					Assert(special == NULL);
 					if (result_len + u2len <= dstsize)
@@ -280,7 +280,7 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 				Assert(simple == 0);
 				for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
 				{
-					pg_wchar	u2 = special[i];
+					char32_t	u2 = special[i];
 					size_t		u2len = unicode_utf8len(u2);
 
 					if (result_len + u2len <= dstsize)
@@ -320,7 +320,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			pg_wchar	curr = utf8_to_unicode(str + i);
+			char32_t	curr = utf8_to_unicode(str + i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
@@ -344,7 +344,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			pg_wchar	curr = utf8_to_unicode(str + i);
+			char32_t	curr = utf8_to_unicode(str + i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
@@ -394,9 +394,9 @@ check_special_conditions(int conditions, const char *str, size_t len,
  * character without modification.
  */
 static enum CaseMapResult
-casemap(pg_wchar u1, CaseKind casekind, bool full,
+casemap(char32_t u1, CaseKind casekind, bool full,
 		const char *src, size_t srclen, size_t srcoff,
-		pg_wchar *simple, const pg_wchar **special)
+		char32_t *simple, const char32_t **special)
 {
 	uint16		idx;
 
@@ -434,8 +434,8 @@ casemap(pg_wchar u1, CaseKind casekind, bool full,
  * Find entry in simple case map.
  * If the entry does not exist, 0 will be returned.
  */
-static pg_wchar
-find_case_map(pg_wchar ucs, const pg_wchar *map)
+static char32_t
+find_case_map(char32_t ucs, const char32_t *map)
 {
 	/* Fast path for codepoints < 0x80 */
 	if (ucs < 0x80)
diff --git a/src/common/unicode_category.c b/src/common/unicode_category.c
index 4136c4d4f92..aab667a7bb4 100644
--- a/src/common/unicode_category.c
+++ b/src/common/unicode_category.c
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  * unicode_category.c
  *		Determine general category and character properties of Unicode
- *		characters. Encoding must be UTF8, where we assume that the pg_wchar
+ *		characters. Encoding must be UTF8, where we assume that the char32_t
  *		representation is a code point.
  *
  * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
@@ -76,13 +76,13 @@
 #define PG_U_CHARACTER_TAB	0x09
 
 static bool range_search(const pg_unicode_range *tbl, size_t size,
-						 pg_wchar code);
+						 char32_t code);
 
 /*
  * Unicode general category for the given codepoint.
  */
 pg_unicode_category
-unicode_category(pg_wchar code)
+unicode_category(char32_t code)
 {
 	int			min = 0;
 	int			mid;
@@ -108,7 +108,7 @@ unicode_category(pg_wchar code)
 }
 
 bool
-pg_u_prop_alphabetic(pg_wchar code)
+pg_u_prop_alphabetic(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC;
@@ -119,7 +119,7 @@ pg_u_prop_alphabetic(pg_wchar code)
 }
 
 bool
-pg_u_prop_lowercase(pg_wchar code)
+pg_u_prop_lowercase(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE;
@@ -130,7 +130,7 @@ pg_u_prop_lowercase(pg_wchar code)
 }
 
 bool
-pg_u_prop_uppercase(pg_wchar code)
+pg_u_prop_uppercase(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE;
@@ -141,7 +141,7 @@ pg_u_prop_uppercase(pg_wchar code)
 }
 
 bool
-pg_u_prop_cased(pg_wchar code)
+pg_u_prop_cased(char32_t code)
 {
 	uint32		category_mask;
 
@@ -156,7 +156,7 @@ pg_u_prop_cased(pg_wchar code)
 }
 
 bool
-pg_u_prop_case_ignorable(pg_wchar code)
+pg_u_prop_case_ignorable(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE;
@@ -167,7 +167,7 @@ pg_u_prop_case_ignorable(pg_wchar code)
 }
 
 bool
-pg_u_prop_white_space(pg_wchar code)
+pg_u_prop_white_space(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE;
@@ -178,7 +178,7 @@ pg_u_prop_white_space(pg_wchar code)
 }
 
 bool
-pg_u_prop_hex_digit(pg_wchar code)
+pg_u_prop_hex_digit(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
@@ -189,7 +189,7 @@ pg_u_prop_hex_digit(pg_wchar code)
 }
 
 bool
-pg_u_prop_join_control(pg_wchar code)
+pg_u_prop_join_control(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL;
@@ -208,7 +208,7 @@ pg_u_prop_join_control(pg_wchar code)
  */
 
 bool
-pg_u_isdigit(pg_wchar code, bool posix)
+pg_u_isdigit(char32_t code, bool posix)
 {
 	if (posix)
 		return ('0' <= code && code <= '9');
@@ -217,19 +217,19 @@ pg_u_isdigit(pg_wchar code, bool posix)
 }
 
 bool
-pg_u_isalpha(pg_wchar code)
+pg_u_isalpha(char32_t code)
 {
 	return pg_u_prop_alphabetic(code);
 }
 
 bool
-pg_u_isalnum(pg_wchar code, bool posix)
+pg_u_isalnum(char32_t code, bool posix)
 {
 	return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
 }
 
 bool
-pg_u_isword(pg_wchar code)
+pg_u_isword(char32_t code)
 {
 	uint32		category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
 
@@ -240,32 +240,32 @@ pg_u_isword(pg_wchar code)
 }
 
 bool
-pg_u_isupper(pg_wchar code)
+pg_u_isupper(char32_t code)
 {
 	return pg_u_prop_uppercase(code);
 }
 
 bool
-pg_u_islower(pg_wchar code)
+pg_u_islower(char32_t code)
 {
 	return pg_u_prop_lowercase(code);
 }
 
 bool
-pg_u_isblank(pg_wchar code)
+pg_u_isblank(char32_t code)
 {
 	return code == PG_U_CHARACTER_TAB ||
 		unicode_category(code) == PG_U_SPACE_SEPARATOR;
 }
 
 bool
-pg_u_iscntrl(pg_wchar code)
+pg_u_iscntrl(char32_t code)
 {
 	return unicode_category(code) == PG_U_CONTROL;
 }
 
 bool
-pg_u_isgraph(pg_wchar code)
+pg_u_isgraph(char32_t code)
 {
 	uint32		category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
 
@@ -276,7 +276,7 @@ pg_u_isgraph(pg_wchar code)
 }
 
 bool
-pg_u_isprint(pg_wchar code)
+pg_u_isprint(char32_t code)
 {
 	pg_unicode_category category = unicode_category(code);
 
@@ -287,7 +287,7 @@ pg_u_isprint(pg_wchar code)
 }
 
 bool
-pg_u_ispunct(pg_wchar code, bool posix)
+pg_u_ispunct(char32_t code, bool posix)
 {
 	uint32		category_mask;
 
@@ -308,13 +308,13 @@ pg_u_ispunct(pg_wchar code, bool posix)
 }
 
 bool
-pg_u_isspace(pg_wchar code)
+pg_u_isspace(char32_t code)
 {
 	return pg_u_prop_white_space(code);
 }
 
 bool
-pg_u_isxdigit(pg_wchar code, bool posix)
+pg_u_isxdigit(char32_t code, bool posix)
 {
 	if (posix)
 		return (('0' <= code && code <= '9') ||
@@ -478,7 +478,7 @@ unicode_category_abbrev(pg_unicode_category category)
  * given table.
  */
 static bool
-range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code)
+range_search(const pg_unicode_range *tbl, size_t size, char32_t code)
 {
 	int			min = 0;
 	int			mid;
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index 6654b4cbc49..489d99cd5ab 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -69,7 +69,7 @@ conv_compare(const void *p1, const void *p2)
  * lookup, while the frontend version uses a binary search.
  */
 static const pg_unicode_decomposition *
-get_code_entry(pg_wchar code)
+get_code_entry(char32_t code)
 {
 #ifndef FRONTEND
 	int			h;
@@ -109,7 +109,7 @@ get_code_entry(pg_wchar code)
  * Get the combining class of the given codepoint.
  */
 static uint8
-get_canonical_class(pg_wchar code)
+get_canonical_class(char32_t code)
 {
 	const pg_unicode_decomposition *entry = get_code_entry(code);
 
@@ -130,15 +130,15 @@ get_canonical_class(pg_wchar code)
  * Note: the returned pointer can point to statically allocated buffer, and
  * is only valid until next call to this function!
  */
-static const pg_wchar *
+static const char32_t *
 get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
 {
-	static pg_wchar x;
+	static char32_t x;
 
 	if (DECOMPOSITION_IS_INLINE(entry))
 	{
 		Assert(DECOMPOSITION_SIZE(entry) == 1);
-		x = (pg_wchar) entry->dec_index;
+		x = (char32_t) entry->dec_index;
 		*dec_size = 1;
 		return &x;
 	}
@@ -156,7 +156,7 @@ get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
  * are, in turn, decomposable.
  */
 static int
-get_decomposed_size(pg_wchar code, bool compat)
+get_decomposed_size(char32_t code, bool compat)
 {
 	const pg_unicode_decomposition *entry;
 	int			size = 0;
@@ -318,7 +318,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
  * in the array result.
  */
 static void
-decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
+decompose_code(char32_t code, bool compat, char32_t **result, int *current)
 {
 	const pg_unicode_decomposition *entry;
 	int			i;
@@ -337,7 +337,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
 					v,
 					tindex,
 					sindex;
-		pg_wchar   *res = *result;
+		char32_t   *res = *result;
 
 		sindex = code - SBASE;
 		l = LBASE + sindex / (VCOUNT * TCOUNT);
@@ -369,7 +369,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
 	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
 		(!compat && DECOMPOSITION_IS_COMPAT(entry)))
 	{
-		pg_wchar   *res = *result;
+		char32_t   *res = *result;
 
 		res[*current] = code;
 		(*current)++;
@@ -382,7 +382,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
 	decomp = get_code_decomposition(entry, &dec_size);
 	for (i = 0; i < dec_size; i++)
 	{
-		pg_wchar	lcode = (pg_wchar) decomp[i];
+		char32_t	lcode = (char32_t) decomp[i];
 
 		/* Leave if no more decompositions */
 		decompose_code(lcode, compat, result, current);
@@ -398,17 +398,17 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
  * malloc. Or NULL if we run out of memory. In backend, the returned
  * string is palloc'd instead, and OOM is reported with ereport().
  */
-pg_wchar *
-unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
+char32_t *
+unicode_normalize(UnicodeNormalizationForm form, const char32_t *input)
 {
 	bool		compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
 	bool		recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
-	pg_wchar   *decomp_chars;
-	pg_wchar   *recomp_chars;
+	char32_t   *decomp_chars;
+	char32_t   *recomp_chars;
 	int			decomp_size,
 				current_size;
 	int			count;
-	const pg_wchar *p;
+	const char32_t *p;
 
 	/* variables for recomposition */
 	int			last_class;
@@ -425,7 +425,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 	for (p = input; *p; p++)
 		decomp_size += get_decomposed_size(*p, compat);
 
-	decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+	decomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
 	if (decomp_chars == NULL)
 		return NULL;
 
@@ -448,9 +448,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 	 */
 	for (count = 1; count < decomp_size; count++)
 	{
-		pg_wchar	prev = decomp_chars[count - 1];
-		pg_wchar	next = decomp_chars[count];
-		pg_wchar	tmp;
+		char32_t	prev = decomp_chars[count - 1];
+		char32_t	next = decomp_chars[count];
+		char32_t	tmp;
 		const uint8 prevClass = get_canonical_class(prev);
 		const uint8 nextClass = get_canonical_class(next);
 
@@ -487,7 +487,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 	 * longer than the decomposed one, so make the allocation of the output
 	 * string based on that assumption.
 	 */
-	recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+	recomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
 	if (!recomp_chars)
 	{
 		FREE(decomp_chars);
@@ -501,9 +501,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 
 	for (count = 1; count < decomp_size; count++)
 	{
-		pg_wchar	ch = decomp_chars[count];
+		char32_t	ch = decomp_chars[count];
 		int			ch_class = get_canonical_class(ch);
-		pg_wchar	composite;
+		char32_t	composite;
 
 		if (last_class < ch_class &&
 			recompose_code(starter_ch, ch, &composite))
@@ -524,7 +524,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 			recomp_chars[target_pos++] = ch;
 		}
 	}
-	recomp_chars[target_pos] = (pg_wchar) '\0';
+	recomp_chars[target_pos] = (char32_t) '\0';
 
 	FREE(decomp_chars);
 
@@ -540,7 +540,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 #ifndef FRONTEND
 
 static const pg_unicode_normprops *
-qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
+qc_hash_lookup(char32_t ch, const pg_unicode_norminfo *norminfo)
 {
 	int			h;
 	uint32		hashkey;
@@ -571,7 +571,7 @@ qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
  * Look up the normalization quick check character property
  */
 static UnicodeNormalizationQC
-qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
+qc_is_allowed(UnicodeNormalizationForm form, char32_t ch)
 {
 	const pg_unicode_normprops *found = NULL;
 
@@ -595,7 +595,7 @@ qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
 }
 
 UnicodeNormalizationQC
-unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
+unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input)
 {
 	uint8		lastCanonicalClass = 0;
 	UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
@@ -610,9 +610,9 @@ unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *
 	if (form == UNICODE_NFD || form == UNICODE_NFKD)
 		return UNICODE_NORM_QC_MAYBE;
 
-	for (const pg_wchar *p = input; *p; p++)
+	for (const char32_t *p = input; *p; p++)
 	{
-		pg_wchar	ch = *p;
+		char32_t	ch = *p;
 		uint8		canonicalClass;
 		UnicodeNormalizationQC check;
author	Jeff Davis <jdavis@postgresql.org>	2025-10-29 14:17:13 -0700
committer	Jeff Davis <jdavis@postgresql.org>	2025-10-29 14:17:13 -0700
commit	3853a6956c3e3bc7a6fa9bcdb205a2997f46bac2 (patch)
tree	d53058de062edbdbae7fadccf5f40b11147d2293 /src/common
parent	16edc1b94fc2db6e6a376471e280b50a418907c2 (diff)