downcase_identifier(): use method table from locale provider.

Previously, libc's tolower() was always used for lowercasing identifiers, regardless of the database locale (though only characters beyond 127 in single-byte encodings were affected). Refactor to allow each provider to supply its own implementation of identifier downcasing. For historical compatibility, when using a single-byte encoding, the ICU provider still relies on libc's tolower(). One minor behavior change is that, before the database default locale is initialized, downcase_identifier() uses ASCII semantics to downcase identifiers. Previously, it used the postmaster's LC_CTYPE setting from the environment. While that could have some effect during GUC processing, for example, it would have been fragile to rely on the environment setting anyway. (Also, the difference only matters when the encoding is single-byte.) Reviewed-by: Chao Li <li.evan.chao@gmail.com> Reviewed-by: Peter Eisentraut <peter@eisentraut.org> Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d.camel@j-davis.com
author: Jeff Davis <jdavis@postgresql.org> 2025-12-16 15:32:41 -0800
committer: Jeff Davis <jdavis@postgresql.org> 2025-12-16 15:32:41 -0800
commit: 87b2968df0f866aaccb6ba69adf284e3c4a79454 (patch)
tree: 83669f96e2198bc8dda0666e7e8b0805c77f95a9 /src/backend/utils
parent: 7f007e4a044a713df5320fca09621d6ba8e632ba (diff)
4 files changed, 90 insertions, 1 deletions
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 8a3796aa5d0..ee08ac045b7 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1353,6 +1353,26 @@ pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 }
 
 /*
+ * Lowercase an identifier using the database default locale; strlower_c() is
+ * used if the locale, its ctype methods, or downcase_ident is NULL. For
+ * historical reasons, does not use ordinary locale behavior. Should only be
+ * used for identifiers. XXX: can we make this equivalent to
+ * pg_strfold(..., default_locale)?
+ */
+size_t
+pg_downcase_ident(char *dst, size_t dstsize, const char *src, ssize_t srclen)
+{
+	pg_locale_t locale = default_locale;
+
+	if (locale == NULL || locale->ctype == NULL ||
+		locale->ctype->downcase_ident == NULL)
+		return strlower_c(dst, dstsize, src, srclen);
+	else
+		return locale->ctype->downcase_ident(dst, dstsize, src, srclen,
+											 locale);
+}
+
+/*
  * pg_strcoll
  *
  * Like pg_strncoll for NUL-terminated input strings.
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 0c2920112bb..145b4641b1b 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -208,6 +208,8 @@ static const struct ctype_methods ctype_methods_builtin = {
 	.strtitle = strtitle_builtin,
 	.strupper = strupper_builtin,
 	.strfold = strfold_builtin,
+	/* uses plain ASCII semantics for historical reasons */
+	.downcase_ident = NULL,
 	.wc_isdigit = wc_isdigit_builtin,
 	.wc_isalpha = wc_isalpha_builtin,
 	.wc_isalnum = wc_isalnum_builtin,
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index 18d026deda8..69f22b47a68 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -61,6 +61,8 @@ static size_t strupper_icu(char *dest, size_t destsize, const char *src,
 						   ssize_t srclen, pg_locale_t locale);
 static size_t strfold_icu(char *dest, size_t destsize, const char *src,
 						  ssize_t srclen, pg_locale_t locale);
+static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src,
+								 ssize_t srclen, pg_locale_t locale);
 static int	strncoll_icu(const char *arg1, ssize_t len1,
 						 const char *arg2, ssize_t len2,
 						 pg_locale_t locale);
@@ -123,7 +125,7 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
 
 /*
  * XXX: many of the functions below rely on casts directly from pg_wchar to
- * UChar32, which is correct for the UTF-8 encoding, but not in general.
+ * UChar32, which is correct for UTF-8 and LATIN1, but not in general.
  */
 
 static pg_wchar
@@ -227,6 +229,7 @@ static const struct ctype_methods ctype_methods_icu = {
 	.strtitle = strtitle_icu,
 	.strupper = strupper_icu,
 	.strfold = strfold_icu,
+	.downcase_ident = downcase_ident_icu,
 	.wc_isdigit = wc_isdigit_icu,
 	.wc_isalpha = wc_isalpha_icu,
 	.wc_isalnum = wc_isalnum_icu,
@@ -565,6 +568,37 @@ strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
 }
 
 /*
+ * For historical compatibility, not multibyte-aware: A-Z are downcased with
+ * pg_ascii_tolower() and, in single-byte encodings only, high-bit bytes with
+ * libc tolower(), which relies on the global LC_CTYPE setting; other bytes
+ * are copied as-is. Returns srclen; NUL-terminates dst only if room remains.
+ */
+static size_t
+downcase_ident_icu(char *dst, size_t dstsize, const char *src,
+				   ssize_t srclen, pg_locale_t locale)
+{
+	int			i;
+	bool		enc_is_single_byte;
+
+	enc_is_single_byte = pg_database_encoding_max_length() == 1;
+	for (i = 0; i < srclen && i < dstsize; i++)
+	{
+		unsigned char ch = (unsigned char) src[i];
+
+		if (ch >= 'A' && ch <= 'Z')
+			ch = pg_ascii_tolower(ch);
+		else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
+			ch = tolower(ch);
+		dst[i] = (char) ch;
+	}
+
+	if (i < dstsize)
+		dst[i] = '\0';
+
+	return srclen;
+}
+
+/*
  * strncoll_icu_utf8
  *
  * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 3baa5816b5f..ab6117aaace 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -318,12 +318,41 @@ tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
 		return wc;
 }
 
+/*
+ * Characters A..Z always downcase to a..z, even in the Turkish locale; bytes
+ * beyond 127 use tolower_l() with locale->lt. Returns srclen.
+ */
+static size_t
+downcase_ident_libc_sb(char *dst, size_t dstsize, const char *src,
+					   ssize_t srclen, pg_locale_t locale)
+{
+	locale_t	loc = locale->lt;
+	int			i;
+
+	for (i = 0; i < srclen && i < dstsize; i++)
+	{
+		unsigned char ch = (unsigned char) src[i];
+
+		if (ch >= 'A' && ch <= 'Z')
+			ch = pg_ascii_tolower(ch);
+		else if (IS_HIGHBIT_SET(ch) && isupper_l(ch, loc))
+			ch = tolower_l(ch, loc);
+		dst[i] = (char) ch;
+	}
+
+	if (i < dstsize)
+		dst[i] = '\0';
+
+	return srclen;
+}
+
 static const struct ctype_methods ctype_methods_libc_sb = {
 	.strlower = strlower_libc_sb,
 	.strtitle = strtitle_libc_sb,
 	.strupper = strupper_libc_sb,
 	/* in libc, casefolding is the same as lowercasing */
 	.strfold = strlower_libc_sb,
+	.downcase_ident = downcase_ident_libc_sb,
 	.wc_isdigit = wc_isdigit_libc_sb,
 	.wc_isalpha = wc_isalpha_libc_sb,
 	.wc_isalnum = wc_isalnum_libc_sb,
@@ -349,6 +378,8 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
 	.strupper = strupper_libc_mb,
 	/* in libc, casefolding is the same as lowercasing */
 	.strfold = strlower_libc_mb,
+	/* uses plain ASCII semantics for historical reasons */
+	.downcase_ident = NULL,
 	.wc_isdigit = wc_isdigit_libc_sb,
 	.wc_isalpha = wc_isalpha_libc_sb,
 	.wc_isalnum = wc_isalnum_libc_sb,
@@ -370,6 +401,8 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
 	.strupper = strupper_libc_mb,
 	/* in libc, casefolding is the same as lowercasing */
 	.strfold = strlower_libc_mb,
+	/* uses plain ASCII semantics for historical reasons */
+	.downcase_ident = NULL,
 	.wc_isdigit = wc_isdigit_libc_mb,
 	.wc_isalpha = wc_isalpha_libc_mb,
 	.wc_isalnum = wc_isalnum_libc_mb,
author	Jeff Davis <jdavis@postgresql.org>	2025-12-16 15:32:41 -0800
committer	Jeff Davis <jdavis@postgresql.org>	2025-12-16 15:32:41 -0800
commit	87b2968df0f866aaccb6ba69adf284e3c4a79454 (patch)
tree	83669f96e2198bc8dda0666e7e8b0805c77f95a9 /src/backend/utils
parent	7f007e4a044a713df5320fca09621d6ba8e632ba (diff)