From bfc5992069cf00b189af83d96a83ae5ebb65e938 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Fri, 24 Jan 2025 14:56:22 -0800 Subject: Add SQL function CASEFOLD(). Useful for caseless matching. Similar to LOWER(), but avoids edge-case problems with using LOWER() for caseless matching. For collations that support it, CASEFOLD() handles characters with more than two case variations or multi-character case variations. Some characters may fold to uppercase. The results of case folding are also more stable across Unicode versions than LOWER() or UPPER(). Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com Reviewed-by: Ian Lawrence Barwick --- src/backend/utils/adt/formatting.c | 69 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'src/backend/utils/adt/formatting.c') diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 7c4c4aa07d5..2720d3902ab 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1819,6 +1819,75 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) return result; } +/* + * collation-aware, wide-character-aware case folding + * + * We pass the number of bytes so we can pass varlena and char* + * to this function. The result is a palloc'd, null-terminated string. + */ +char * +str_casefold(const char *buff, size_t nbytes, Oid collid) +{ + char *result; + pg_locale_t mylocale; + + if (!buff) + return NULL; + + if (!OidIsValid(collid)) + { + /* + * This typically means that the parser could not resolve a conflict + * of implicit collations, so report it that way. + */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for %s function", + "lower()"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + } + + if (GetDatabaseEncoding() != PG_UTF8) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Unicode case folding can only be performed if server encoding is UTF8"))); + + mylocale = pg_newlocale_from_collation(collid); + + /* C/POSIX collations use this path regardless of database encoding */ + if (mylocale->ctype_is_c) + { + result = asc_tolower(buff, nbytes); + } + else + { + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = pg_strfold(dst, dstsize, src, srclen, mylocale); + if (needed + 1 > dstsize) + { + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = pg_strfold(dst, dstsize, src, srclen, mylocale); + Assert(needed + 1 <= dstsize); + } + + Assert(dst[needed] == '\0'); + result = dst; + } + + return result; +} + /* * ASCII-only lower function * -- cgit v1.2.3