6 files changed, 63 insertions, 47 deletions
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index 33a040506b4..a3679f8e86c 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -339,7 +339,7 @@ hexval(unsigned char c)
 
 /* is Unicode code point acceptable? */
 static void
-check_unicode_value(pg_wchar c)
+check_unicode_value(char32_t c)
 {
 	if (!is_valid_unicode_codepoint(c))
 		ereport(ERROR,
@@ -376,7 +376,7 @@ str_udeescape(const char *str, char escape,
 	char	   *new,
 			   *out;
 	size_t		new_len;
-	pg_wchar	pair_first = 0;
+	char16_t	pair_first = 0;
 	ScannerCallbackState scbstate;
 
 	/*
@@ -420,7 +420,7 @@ str_udeescape(const char *str, char escape,
 					 isxdigit((unsigned char) in[3]) &&
 					 isxdigit((unsigned char) in[4]))
 			{
-				pg_wchar	unicode;
+				char32_t	unicode;
 
 				unicode = (hexval(in[1]) << 12) +
 					(hexval(in[2]) << 8) +
@@ -457,7 +457,7 @@ str_udeescape(const char *str, char escape,
 					 isxdigit((unsigned char) in[6]) &&
 					 isxdigit((unsigned char) in[7]))
 			{
-				pg_wchar	unicode;
+				char32_t	unicode;
 
 				unicode = (hexval(in[2]) << 20) +
 					(hexval(in[3]) << 16) +
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index 08990831fe8..a67815339b7 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -121,7 +121,7 @@ static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
 static char *litbufdup(core_yyscan_t yyscanner);
 static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
 static int	process_integer_literal(const char *token, YYSTYPE *lval, int base);
-static void addunicode(pg_wchar c, yyscan_t yyscanner);
+static void addunicode(char32_t c, yyscan_t yyscanner);
 
 #define yyerror(msg)  scanner_yyerror(msg, yyscanner)
 
@@ -640,7 +640,7 @@ other			.
 					addlit(yytext, yyleng, yyscanner);
 				}
 <xe>{xeunicode} {
-					pg_wchar	c = strtoul(yytext + 2, NULL, 16);
+					char32_t	c = strtoul(yytext + 2, NULL, 16);
 
 					/*
 					 * For consistency with other productions, issue any
@@ -668,7 +668,7 @@ other			.
 					POP_YYLLOC();
 				}
 <xeu>{xeunicode} {
-					pg_wchar	c = strtoul(yytext + 2, NULL, 16);
+					char32_t	c = strtoul(yytext + 2, NULL, 16);
 
 					/* Remember start of overall string token ... */
 					PUSH_YYLLOC();
@@ -1376,7 +1376,7 @@ process_integer_literal(const char *token, YYSTYPE *lval, int base)
 }
 
 static void
-addunicode(pg_wchar c, core_yyscan_t yyscanner)
+addunicode(char32_t c, core_yyscan_t yyscanner)
 {
 	ScannerCallbackState scbstate;
 	char		buf[MAX_UNICODE_EQUIVALENT_STRING + 1];
diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l
index c7aab83eeb4..8c3a0a9c642 100644
--- a/src/backend/utils/adt/jsonpath_scan.l
+++ b/src/backend/utils/adt/jsonpath_scan.l
@@ -574,7 +574,7 @@ hexval(char c, int *result, struct Node *escontext, yyscan_t yyscanner)
 
 /* Add given unicode character to scanstring */
 static bool
-addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner)
+addUnicodeChar(char32_t ch, struct Node *escontext, yyscan_t yyscanner)
 {
 	if (ch == 0)
 	{
@@ -607,7 +607,7 @@ addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner)
 
 /* Add unicode character, processing any surrogate pairs */
 static bool
-addUnicode(int ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner)
+addUnicode(char32_t ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner)
 {
 	if (is_utf16_surrogate_first(ch))
 	{
@@ -655,7 +655,7 @@ parseUnicode(char *s, int l, struct Node *escontext, yyscan_t yyscanner)
 
 	for (i = 2; i < l; i += 2)	/* skip '\u' */
 	{
-		int			ch = 0;
+		char32_t	ch = 0;
 		int			j,
 					si;
 
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 3dc611b50e1..1021e0d129b 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -15,7 +15,6 @@
 #include "catalog/pg_collation.h"
 #include "common/unicode_case.h"
 #include "common/unicode_category.h"
-#include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "utils/builtins.h"
 #include "utils/pg_locale.h"
@@ -36,6 +35,23 @@ struct WordBoundaryState
 };
 
 /*
+ * When the database encoding is UTF-8, pg_wchar holds the code point value.
+ */
+static inline char32_t
+to_char32(pg_wchar wc)
+{
+	Assert(GetDatabaseEncoding() == PG_UTF8);
+	return (char32_t) wc;
+}
+
+static inline pg_wchar
+to_pg_wchar(char32_t c32)
+{
+	Assert(GetDatabaseEncoding() == PG_UTF8);
+	return (pg_wchar) c32;
+}
+
+/*
  * Simple word boundary iterator that draws boundaries each time the result of
  * pg_u_isalnum() changes.
  */
@@ -47,7 +63,7 @@ initcap_wbnext(void *state)
 	while (wbstate->offset < wbstate->len &&
 		   wbstate->str[wbstate->offset] != '\0')
 	{
-		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str +
+		char32_t	u = utf8_to_unicode((unsigned char *) wbstate->str +
 										wbstate->offset);
 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
@@ -112,61 +128,61 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 static bool
 wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isdigit(wc, !locale->builtin.casemap_full);
+	return pg_u_isdigit(to_char32(wc), !locale->builtin.casemap_full);
 }
 
 static bool
 wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isalpha(wc);
+	return pg_u_isalpha(to_char32(wc));
 }
 
 static bool
 wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isalnum(wc, !locale->builtin.casemap_full);
+	return pg_u_isalnum(to_char32(wc), !locale->builtin.casemap_full);
 }
 
 static bool
 wc_isupper_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isupper(wc);
+	return pg_u_isupper(to_char32(wc));
 }
 
 static bool
 wc_islower_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_islower(wc);
+	return pg_u_islower(to_char32(wc));
 }
 
 static bool
 wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isgraph(wc);
+	return pg_u_isgraph(to_char32(wc));
 }
 
 static bool
 wc_isprint_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isprint(wc);
+	return pg_u_isprint(to_char32(wc));
 }
 
 static bool
 wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_ispunct(wc, !locale->builtin.casemap_full);
+	return pg_u_ispunct(to_char32(wc), !locale->builtin.casemap_full);
 }
 
 static bool
 wc_isspace_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isspace(wc);
+	return pg_u_isspace(to_char32(wc));
 }
 
 static bool
 wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isxdigit(wc, !locale->builtin.casemap_full);
+	return pg_u_isxdigit(to_char32(wc), !locale->builtin.casemap_full);
 }
 
 static bool
@@ -179,13 +195,13 @@ char_is_cased_builtin(char ch, pg_locale_t locale)
 static pg_wchar
 wc_toupper_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return unicode_uppercase_simple(wc);
+	return to_pg_wchar(unicode_uppercase_simple(to_char32(wc)));
 }
 
 static pg_wchar
 wc_tolower_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return unicode_lowercase_simple(wc);
+	return to_pg_wchar(unicode_lowercase_simple(to_char32(wc)));
 }
 
 static const struct ctype_methods ctype_methods_builtin = {
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2c398cd9e5c..8d735786e51 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -5419,12 +5419,12 @@ unicode_assigned(PG_FUNCTION_ARGS)
 		ereport(ERROR,
 				(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
 
-	/* convert to pg_wchar */
+	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
 	p = (unsigned char *) VARDATA_ANY(input);
 	for (int i = 0; i < size; i++)
 	{
-		pg_wchar	uchar = utf8_to_unicode(p);
+		char32_t	uchar = utf8_to_unicode(p);
 		int			category = unicode_category(uchar);
 
 		if (category == PG_U_UNASSIGNED)
@@ -5443,24 +5443,24 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 	char	   *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
 	UnicodeNormalizationForm form;
 	int			size;
-	pg_wchar   *input_chars;
-	pg_wchar   *output_chars;
+	char32_t   *input_chars;
+	char32_t   *output_chars;
 	unsigned char *p;
 	text	   *result;
 	int			i;
 
 	form = unicode_norm_form_from_string(formstr);
 
-	/* convert to pg_wchar */
+	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
-	input_chars = palloc((size + 1) * sizeof(pg_wchar));
+	input_chars = palloc((size + 1) * sizeof(char32_t));
 	p = (unsigned char *) VARDATA_ANY(input);
 	for (i = 0; i < size; i++)
 	{
 		input_chars[i] = utf8_to_unicode(p);
 		p += pg_utf_mblen(p);
 	}
-	input_chars[i] = (pg_wchar) '\0';
+	input_chars[i] = (char32_t) '\0';
 	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
 
 	/* action */
@@ -5468,7 +5468,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 
 	/* convert back to UTF-8 string */
 	size = 0;
-	for (pg_wchar *wp = output_chars; *wp; wp++)
+	for (char32_t *wp = output_chars; *wp; wp++)
 	{
 		unsigned char buf[4];
 
@@ -5480,7 +5480,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 	SET_VARSIZE(result, size + VARHDRSZ);
 
 	p = (unsigned char *) VARDATA_ANY(result);
-	for (pg_wchar *wp = output_chars; *wp; wp++)
+	for (char32_t *wp = output_chars; *wp; wp++)
 	{
 		unicode_to_utf8(*wp, p);
 		p += pg_utf_mblen(p);
@@ -5509,8 +5509,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 	char	   *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
 	UnicodeNormalizationForm form;
 	int			size;
-	pg_wchar   *input_chars;
-	pg_wchar   *output_chars;
+	char32_t   *input_chars;
+	char32_t   *output_chars;
 	unsigned char *p;
 	int			i;
 	UnicodeNormalizationQC quickcheck;
@@ -5519,16 +5519,16 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 
 	form = unicode_norm_form_from_string(formstr);
 
-	/* convert to pg_wchar */
+	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
-	input_chars = palloc((size + 1) * sizeof(pg_wchar));
+	input_chars = palloc((size + 1) * sizeof(char32_t));
 	p = (unsigned char *) VARDATA_ANY(input);
 	for (i = 0; i < size; i++)
 	{
 		input_chars[i] = utf8_to_unicode(p);
 		p += pg_utf_mblen(p);
 	}
-	input_chars[i] = (pg_wchar) '\0';
+	input_chars[i] = (char32_t) '\0';
 	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
 
 	/* quick check (see UAX #15) */
@@ -5542,11 +5542,11 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 	output_chars = unicode_normalize(form, input_chars);
 
 	output_size = 0;
-	for (pg_wchar *wp = output_chars; *wp; wp++)
+	for (char32_t *wp = output_chars; *wp; wp++)
 		output_size++;
 
 	result = (size == output_size) &&
-		(memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
+		(memcmp(input_chars, output_chars, size * sizeof(char32_t)) == 0);
 
 	PG_RETURN_BOOL(result);
 }
@@ -5602,7 +5602,7 @@ unistr(PG_FUNCTION_ARGS)
 	int			len;
 	StringInfoData str;
 	text	   *result;
-	pg_wchar	pair_first = 0;
+	char16_t	pair_first = 0;
 	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
 
 	instr = VARDATA_ANY(input_text);
@@ -5626,7 +5626,7 @@ unistr(PG_FUNCTION_ARGS)
 			else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
 					 (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
 			{
-				pg_wchar	unicode;
+				char32_t	unicode;
 				int			offset = instr[1] == 'u' ? 2 : 1;
 
 				unicode = hexval_n(instr + offset, 4);
@@ -5662,7 +5662,7 @@ unistr(PG_FUNCTION_ARGS)
 			}
 			else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
 			{
-				pg_wchar	unicode;
+				char32_t	unicode;
 
 				unicode = hexval_n(instr + 2, 6);
 
@@ -5697,7 +5697,7 @@ unistr(PG_FUNCTION_ARGS)
 			}
 			else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
 			{
-				pg_wchar	unicode;
+				char32_t	unicode;
 
 				unicode = hexval_n(instr + 2, 8);
 
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 886ecbad871..fb629ed5c8f 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -862,7 +862,7 @@ perform_default_encoding_conversion(const char *src, int len,
  * may call this outside any transaction, or in an aborted transaction.
  */
 void
-pg_unicode_to_server(pg_wchar c, unsigned char *s)
+pg_unicode_to_server(char32_t c, unsigned char *s)
 {
 	unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
 	int			c_as_utf8_len;
@@ -924,7 +924,7 @@ pg_unicode_to_server(pg_wchar c, unsigned char *s)
  * but simply return false on conversion failure.
  */
 bool
-pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
+pg_unicode_to_server_noerror(char32_t c, unsigned char *s)
 {
 	unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
 	int			c_as_utf8_len;