diff options
Diffstat (limited to 'src/backend/regex/regc_locale.c')
-rw-r--r-- | src/backend/regex/regc_locale.c | 119 |
1 files changed, 39 insertions, 80 deletions
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
index 6cf27958b15..c0414a24912 100644
--- a/src/backend/regex/regc_locale.c
+++ b/src/backend/regex/regc_locale.c
@@ -351,6 +351,16 @@ static const struct cname
 
 
 /*
+ * We do not use the hard-wired Unicode classification tables that Tcl does.
+ * This is because (a) we need to deal with other encodings besides Unicode,
+ * and (b) we want to track the behavior of the libc locale routines as
+ * closely as possible. For example, it wouldn't be unreasonable for a
+ * locale to not consider every Unicode letter as a letter. So we build
+ * character classification cvecs by asking libc, even for Unicode.
+ */
+
+
+/*
  * element - map collating-element name to celt
  */
 static celt
@@ -489,7 +499,11 @@ eclass(struct vars * v, /* context */
 /*
  * cclass - supply cvec for a character class
  *
- * Must include case counterparts on request.
+ * Must include case counterparts if "cases" is true.
+ *
+ * The returned cvec might be either a transient cvec gotten from getcvec(),
+ * or a permanently cached one from pg_ctype_get_cache(). This is okay
+ * because callers are not supposed to explicitly free the result either way.
  */
 static struct cvec *
 cclass(struct vars * v, /* context */
@@ -548,79 +562,54 @@ cclass(struct vars * v, /* context */
 		index = (int) CC_ALPHA;
 
 	/*
-	 * Now compute the character class contents.
-	 *
-	 * For the moment, assume that only char codes < 256 can be in these
-	 * classes.
+	 * Now compute the character class contents. For classes that are
+	 * based on the behavior of a <wctype.h> or <ctype.h> function, we use
+	 * pg_ctype_get_cache so that we can cache the results. Other classes
+	 * have definitions that are hard-wired here, and for those we just
+	 * construct a transient cvec on the fly.
 	 */
 
 	switch ((enum classes) index)
 	{
 		case CC_PRINT:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_isprint((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_isprint);
 			break;
 		case CC_ALNUM:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_isalnum((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_isalnum);
 			break;
 		case CC_ALPHA:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_isalpha((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_isalpha);
 			break;
 		case CC_ASCII:
+			/* hard-wired meaning */
 			cv = getcvec(v, 0, 1);
 			if (cv)
 				addrange(cv, 0, 0x7f);
 			break;
 		case CC_BLANK:
+			/* hard-wired meaning */
 			cv = getcvec(v, 2, 0);
 			addchr(cv, '\t');
 			addchr(cv, ' ');
 			break;
 		case CC_CNTRL:
+			/* hard-wired meaning */
 			cv = getcvec(v, 0, 2);
 			addrange(cv, 0x0, 0x1f);
 			addrange(cv, 0x7f, 0x9f);
 			break;
 		case CC_DIGIT:
-			cv = getcvec(v, 0, 1);
-			if (cv)
-				addrange(cv, (chr) '0', (chr) '9');
+			cv = pg_ctype_get_cache(pg_wc_isdigit);
 			break;
 		case CC_PUNCT:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_ispunct((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_ispunct);
 			break;
 		case CC_XDIGIT:
+			/*
+			 * It's not clear how to define this in non-western locales, and
+			 * even less clear that there's any particular use in trying.
+			 * So just hard-wire the meaning.
+			 */
 			cv = getcvec(v, 0, 3);
 			if (cv)
 			{
@@ -630,50 +619,20 @@ cclass(struct vars * v, /* context */
 			}
 			break;
 		case CC_SPACE:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_isspace((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_isspace);
 			break;
 		case CC_LOWER:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_islower((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_islower);
 			break;
 		case CC_UPPER:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_isupper((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_isupper);
 			break;
 		case CC_GRAPH:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_isgraph((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_isgraph);
 			break;
 	}
+
+	/* If cv is NULL now, the reason must be "out of memory" */
 	if (cv == NULL)
 		ERR(REG_ESPACE);
 	return cv;