summary refs log tree commit diff
path: root/src/backend/regex/regc_locale.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/regex/regc_locale.c')
-rw-r--r-- src/backend/regex/regc_locale.c 119
1 files changed, 39 insertions, 80 deletions
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
index 6cf27958b15..c0414a24912 100644
--- a/src/backend/regex/regc_locale.c
+++ b/src/backend/regex/regc_locale.c
@@ -351,6 +351,16 @@ static const struct cname
/*
+ * We do not use the hard-wired Unicode classification tables that Tcl does.
+ * This is because (a) we need to deal with other encodings besides Unicode,
+ * and (b) we want to track the behavior of the libc locale routines as
+ * closely as possible. For example, it wouldn't be unreasonable for a
+ * locale to not consider every Unicode letter as a letter. So we build
+ * character classification cvecs by asking libc, even for Unicode.
+ */
+
+
+/*
* element - map collating-element name to celt
*/
static celt
@@ -489,7 +499,11 @@ eclass(struct vars * v, /* context */
/*
* cclass - supply cvec for a character class
*
- * Must include case counterparts on request.
+ * Must include case counterparts if "cases" is true.
+ *
+ * The returned cvec might be either a transient cvec gotten from getcvec(),
+ * or a permanently cached one from pg_ctype_get_cache(). This is okay
+ * because callers are not supposed to explicitly free the result either way.
*/
static struct cvec *
cclass(struct vars * v, /* context */
@@ -548,79 +562,54 @@ cclass(struct vars * v, /* context */
index = (int) CC_ALPHA;
/*
- * Now compute the character class contents.
- *
- * For the moment, assume that only char codes < 256 can be in these
- * classes.
+ * Now compute the character class contents. For classes that are
+ * based on the behavior of a <wctype.h> or <ctype.h> function, we use
+ * pg_ctype_get_cache so that we can cache the results. Other classes
+ * have definitions that are hard-wired here, and for those we just
+ * construct a transient cvec on the fly.
*/
switch ((enum classes) index)
{
case CC_PRINT:
- cv = getcvec(v, UCHAR_MAX, 0);
- if (cv)
- {
- for (i = 0; i <= UCHAR_MAX; i++)
- {
- if (pg_wc_isprint((chr) i))
- addchr(cv, (chr) i);
- }
- }
+ cv = pg_ctype_get_cache(pg_wc_isprint);
break;
case CC_ALNUM:
- cv = getcvec(v, UCHAR_MAX, 0);
- if (cv)
- {
- for (i = 0; i <= UCHAR_MAX; i++)
- {
- if (pg_wc_isalnum((chr) i))
- addchr(cv, (chr) i);
- }
- }
+ cv = pg_ctype_get_cache(pg_wc_isalnum);
break;
case CC_ALPHA:
- cv = getcvec(v, UCHAR_MAX, 0);
- if (cv)
- {
- for (i = 0; i <= UCHAR_MAX; i++)
- {
- if (pg_wc_isalpha((chr) i))
- addchr(cv, (chr) i);
- }
- }
+ cv = pg_ctype_get_cache(pg_wc_isalpha);
break;
case CC_ASCII:
+ /* hard-wired meaning */
cv = getcvec(v, 0, 1);
if (cv)
addrange(cv, 0, 0x7f);
break;
case CC_BLANK:
+ /* hard-wired meaning */
cv = getcvec(v, 2, 0);
addchr(cv, '\t');
addchr(cv, ' ');
break;
case CC_CNTRL:
+ /* hard-wired meaning */
cv = getcvec(v, 0, 2);
addrange(cv, 0x0, 0x1f);
addrange(cv, 0x7f, 0x9f);
break;
case CC_DIGIT:
- cv = getcvec(v, 0, 1);
- if (cv)
- addrange(cv, (chr) '0', (chr) '9');
+ cv = pg_ctype_get_cache(pg_wc_isdigit);
break;
case CC_PUNCT:
- cv = getcvec(v, UCHAR_MAX, 0);
- if (cv)
- {
- for (i = 0; i <= UCHAR_MAX; i++)
- {
- if (pg_wc_ispunct((chr) i))
- addchr(cv, (chr) i);
- }
- }
+ cv = pg_ctype_get_cache(pg_wc_ispunct);
break;
case CC_XDIGIT:
+ /*
+ * It's not clear how to define this in non-western locales, and
+ * even less clear that there's any particular use in trying.
+ * So just hard-wire the meaning.
+ */
cv = getcvec(v, 0, 3);
if (cv)
{
@@ -630,50 +619,20 @@ cclass(struct vars * v, /* context */
}
break;
case CC_SPACE:
- cv = getcvec(v, UCHAR_MAX, 0);
- if (cv)
- {
- for (i = 0; i <= UCHAR_MAX; i++)
- {
- if (pg_wc_isspace((chr) i))
- addchr(cv, (chr) i);
- }
- }
+ cv = pg_ctype_get_cache(pg_wc_isspace);
break;
case CC_LOWER:
- cv = getcvec(v, UCHAR_MAX, 0);
- if (cv)
- {
- for (i = 0; i <= UCHAR_MAX; i++)
- {
- if (pg_wc_islower((chr) i))
- addchr(cv, (chr) i);
- }
- }
+ cv = pg_ctype_get_cache(pg_wc_islower);
break;
case CC_UPPER:
- cv = getcvec(v, UCHAR_MAX, 0);
- if (cv)
- {
- for (i = 0; i <= UCHAR_MAX; i++)
- {
- if (pg_wc_isupper((chr) i))
- addchr(cv, (chr) i);
- }
- }
+ cv = pg_ctype_get_cache(pg_wc_isupper);
break;
case CC_GRAPH:
- cv = getcvec(v, UCHAR_MAX, 0);
- if (cv)
- {
- for (i = 0; i <= UCHAR_MAX; i++)
- {
- if (pg_wc_isgraph((chr) i))
- addchr(cv, (chr) i);
- }
- }
+ cv = pg_ctype_get_cache(pg_wc_isgraph);
break;
}
+
+ /* If cv is NULL now, the reason must be "out of memory" */
if (cv == NULL)
ERR(REG_ESPACE);
return cv;