summaryrefslogtreecommitdiff
path: root/src/common/unicode_case.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/common/unicode_case.c')
-rw-r--r--src/common/unicode_case.c158
1 files changed, 147 insertions, 11 deletions
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 8e7d4122d53..48521d83239 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -17,12 +17,15 @@
#include "common/unicode_case.h"
#include "common/unicode_case_table.h"
+#include "common/unicode_category.h"
#include "mb/pg_wchar.h"
static const pg_case_map *find_case_map(pg_wchar ucs);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
- CaseKind str_casekind, WordBoundaryNext wbnext,
+ CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
void *wbstate);
+static bool check_special_conditions(int conditions, const char *str,
+ size_t len, size_t offset);
pg_wchar
unicode_lowercase_simple(pg_wchar code)
@@ -63,11 +66,16 @@ unicode_uppercase_simple(pg_wchar code)
*
* If dstsize is zero, dst may be NULL. This is useful for calculating the
* required buffer size before allocating.
+ *
+ * If full is true, use special case mappings if available and if the
+ * conditions are satisfied.
*/
size_t
-unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
+unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+ bool full)
{
- return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
+ return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
+ NULL);
}
/*
@@ -86,6 +94,10 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
* If dstsize is zero, dst may be NULL. This is useful for calculating the
* required buffer size before allocating.
*
+ * If full is true, use special case mappings if available and if the
+ * conditions are satisfied. Otherwise, use only simple mappings and use
+ * uppercase instead of titlecase.
+ *
* Titlecasing requires knowledge about word boundaries, which is provided by
* the callback wbnext. A word boundary is the offset of the start of a word
* or the offset of the character immediately following a word.
@@ -97,9 +109,9 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
*/
size_t
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
- WordBoundaryNext wbnext, void *wbstate)
+ bool full, WordBoundaryNext wbnext, void *wbstate)
{
- return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
+ return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
wbstate);
}
@@ -118,23 +130,38 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
*
* If dstsize is zero, dst may be NULL. This is useful for calculating the
* required buffer size before allocating.
+ *
+ * If full is true, use special case mappings if available and if the
+ * conditions are satisfied.
*/
size_t
-unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
+unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+ bool full)
{
- return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
+ return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
+ NULL);
}
/*
+ * Implement Unicode Default Case Conversion algorithm.
+ *
* If str_casekind is CaseLower or CaseUpper, map each character in the string
* for which a mapping is available.
*
* If str_casekind is CaseTitle, maps characters found on a word boundary to
- * uppercase and other characters to lowercase.
+ * titlecase (or uppercase if full is false) and other characters to
+ * lowercase. NB: does not currently implement the Unicode behavior in which
+ * the word boundary is adjusted to the next Cased character. That behavior
+ * could be implemented as an option, but it doesn't match the default
+ * behavior of ICU, nor does it match the documented behavior of INITCAP().
+ *
+ * If full is true, use special mappings for relevant characters, which can
+ * map a single codepoint to multiple codepoints, or depend on conditions.
*/
static size_t
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
- CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
+ CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
+ void *wbstate)
{
/* character CaseKind varies while titlecasing */
CaseKind chr_casekind = str_casekind;
@@ -156,20 +183,53 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
int u1len = unicode_utf8len(u1);
const pg_case_map *casemap = find_case_map(u1);
+ const pg_special_case *special = NULL;
if (str_casekind == CaseTitle)
{
if (srcoff == boundary)
{
- chr_casekind = CaseUpper;
+ chr_casekind = full ? CaseTitle : CaseUpper;
boundary = wbnext(wbstate);
}
else
chr_casekind = CaseLower;
}
+ /*
+ * Find special case that matches the conditions, if any.
+ *
+ * Note: only a single special mapping per codepoint is currently
+ * supported, though Unicode allows for multiple special mappings for
+ * a single codepoint.
+ */
+ if (full && casemap && casemap->special_case)
+ {
+ int16 conditions = casemap->special_case->conditions;
+
+ Assert(casemap->special_case->codepoint == u1);
+ if (check_special_conditions(conditions, src, srclen, srcoff))
+ special = casemap->special_case;
+ }
+
/* perform mapping, update result_len, and write to dst */
- if (casemap)
+ if (special)
+ {
+ for (int i = 0; i < MAX_CASE_EXPANSION; i++)
+ {
+ pg_wchar u2 = special->map[chr_casekind][i];
+ size_t u2len = unicode_utf8len(u2);
+
+ if (u2 == '\0')
+ break;
+
+ if (result_len + u2len <= dstsize)
+ unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+
+ result_len += u2len;
+ }
+ }
+ else if (casemap)
{
pg_wchar u2 = casemap->simplemap[chr_casekind];
pg_wchar u2len = unicode_utf8len(u2);
@@ -197,6 +257,82 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
return result_len;
}
+/*
+ * Check that the condition matches Final_Sigma, described in Unicode Table
+ * 3-17. The character at the given offset must be preceded by a Cased
+ * character, and must not be followed by a Cased character. In both
+ * directions, intervening Case_Ignorable characters are skipped.
+ *
+ * NB: some characters may be both Cased and Case_Ignorable, in which case
+ * they are treated as Case_Ignorable and skipped.
+ *
+ * str is scanned as UTF-8 and offset is the byte offset of the character
+ * being tested. The forward scan stops after len bytes or at a NUL byte,
+ * whichever comes first.
+ */
+static bool
+check_final_sigma(const unsigned char *str, size_t len, size_t offset)
+{
+	bool		preceded_by_cased = false;
+
+	/*
+	 * Iterate backwards, looking for a Cased character. If offset is zero the
+	 * loop body never runs, so the start of the string counts as "not
+	 * preceded by a Cased character".
+	 */
+	for (size_t i = offset; i-- > 0;)
+	{
+		pg_wchar	curr;
+
+		/* continuation byte: the character starts at an earlier byte */
+		if ((str[i] & 0xC0) == 0x80)
+			continue;
+
+		/* ASCII byte or leading byte of a multibyte sequence */
+		curr = utf8_to_unicode(str + i);
+
+		if (pg_u_prop_case_ignorable(curr))
+			continue;
+
+		/* first non-ignorable character decides the outcome */
+		if (!pg_u_prop_cased(curr))
+			return false;
+
+		preceded_by_cased = true;
+		break;
+	}
+
+	/*
+	 * Reaching the start of the string after seeing only Case_Ignorable
+	 * characters does not satisfy the condition: a Cased character must
+	 * actually have been found.
+	 */
+	if (!preceded_by_cased)
+		return false;
+
+	/* end of string is not followed by a Cased character */
+	if (offset == len)
+		return true;
+
+	/*
+	 * Iterate forwards, looking for a Cased character. Starting at offset + 1
+	 * is enough even when the tested character is multibyte, because its
+	 * trailing bytes are continuation bytes and are skipped below.
+	 */
+	for (size_t i = offset + 1; i < len && str[i] != '\0'; i++)
+	{
+		pg_wchar	curr;
+
+		/* continuation byte: part of a character already examined */
+		if ((str[i] & 0xC0) == 0x80)
+			continue;
+
+		curr = utf8_to_unicode(str + i);
+
+		if (pg_u_prop_case_ignorable(curr))
+			continue;
+
+		/* first non-ignorable character decides the outcome */
+		return !pg_u_prop_cased(curr);
+	}
+
+	/* ran out of input without meeting a Cased character */
+	return true;
+}
+
+/*
+ * Decide whether the given special-casing conditions hold for the character
+ * at byte position offset within str (len bytes).
+ *
+ * A conditions value of zero means the mapping is unconditional. The only
+ * condition handled otherwise is PG_U_FINAL_SIGMA; anything else trips an
+ * assertion and is reported as not satisfied.
+ */
+static bool
+check_special_conditions(int conditions, const char *str, size_t len,
+						 size_t offset)
+{
+	/* unconditional mapping: always applies */
+	if (conditions == 0)
+		return true;
+
+	/* no conditions other than Final_Sigma are supported */
+	if (conditions != PG_U_FINAL_SIGMA)
+	{
+		Assert(false);
+		return false;
+	}
+
+	return check_final_sigma((const unsigned char *) str, len, offset);
+}
+
/* find entry in simple case map, if any */
static const pg_case_map *
find_case_map(pg_wchar ucs)