summaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/selfuncs.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2012-07-10 18:00:51 -0400
committerTom Lane <tgl@sss.pgh.pa.us>2012-07-10 18:00:51 -0400
commitb9edaa784e407d1b6e890b776c607a26f3aa7e49 (patch)
treea12b47469b4f61ba9037922eff25c7b9868ddf8d /src/backend/utils/adt/selfuncs.c
parentb84da909d925f0f109757a5c0645e6d10f9fc57b (diff)
Back-patch fix for extraction of fixed prefixes from regular expressions.
Back-patch of commits 628cbb50ba80c83917b07a7609ddec12cda172d0 and c6aae3042be5249e672b731ebeb21875b5343010. This has been broken since 7.3, so back-patch to all supported branches.
Diffstat (limited to 'src/backend/utils/adt/selfuncs.c')
-rw-r--r--src/backend/utils/adt/selfuncs.c204
1 files changed, 38 insertions, 166 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index ce7f7c487dd..39c43422a78 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -167,7 +167,8 @@ static Selectivity prefix_selectivity(VariableStatData *vardata,
static Selectivity like_selectivity(const char *patt, int pattlen,
bool case_insensitive);
static Selectivity regex_selectivity(const char *patt, int pattlen,
- bool case_insensitive);
+ bool case_insensitive,
+ int fixed_prefix_len);
static Datum string_to_datum(const char *str, Oid datatype);
static Const *string_to_const(const char *str, Oid datatype);
static Const *string_to_bytea_const(const char *str, size_t str_len);
@@ -4645,17 +4646,9 @@ static Pattern_Prefix_Status
regex_fixed_prefix(Const *patt_const, bool case_insensitive,
Const **prefix_const, Selectivity *rest_selec)
{
- char *match;
- int pos,
- match_pos,
- prev_pos,
- prev_match_pos;
- bool have_leading_paren;
- char *patt;
- char *rest;
Oid typeid = patt_const->consttype;
- bool is_basic = regex_flavor_is_basic();
- bool is_multibyte = (pg_database_encoding_max_length() > 1);
+ char *prefix;
+ bool exact;
/*
* Should be unnecessary, there are no bytea regex operators defined. As
@@ -4667,181 +4660,54 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("regular-expression matching not supported on type bytea")));
- /* the right-hand const is type text for all of these */
- patt = TextDatumGetCString(patt_const->constvalue);
+ /* Use the regexp machinery to extract the prefix, if any */
+ prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
+ case_insensitive,
+ &exact);
- /*
- * Check for ARE director prefix. It's worth our trouble to recognize
- * this because similar_escape() uses it.
- */
- pos = 0;
- if (strncmp(patt, "***:", 4) == 0)
- {
- pos = 4;
- is_basic = false;
- }
-
- /* Pattern must be anchored left */
- if (patt[pos] != '^')
+ if (prefix == NULL)
{
*prefix_const = NULL;
if (rest_selec != NULL)
- *rest_selec = regex_selectivity(patt, strlen(patt),
- case_insensitive);
-
- return Pattern_Prefix_None;
- }
- pos++;
-
- /*
- * If '|' is present in pattern, then there may be multiple alternatives
- * for the start of the string. (There are cases where this isn't so, for
- * instance if the '|' is inside parens, but detecting that reliably is
- * too hard.)
- */
- if (strchr(patt + pos, '|') != NULL)
- {
- *prefix_const = NULL;
+ {
+ char *patt = TextDatumGetCString(patt_const->constvalue);
- if (rest_selec != NULL)
*rest_selec = regex_selectivity(patt, strlen(patt),
- case_insensitive);
+ case_insensitive,
+ 0);
+ pfree(patt);
+ }
return Pattern_Prefix_None;
}
- /* OK, allocate space for pattern */
- match = palloc(strlen(patt) + 1);
- prev_match_pos = match_pos = 0;
+ *prefix_const = string_to_const(prefix, typeid);
- /*
- * We special-case the syntax '^(...)$' because psql uses it. But beware:
- * in BRE mode these parentheses are just ordinary characters. Also,
- * sequences beginning "(?" are not what they seem, unless they're "(?:".
- * (We should recognize that, too, because of similar_escape().)
- *
- * Note: it's a bit bogus to be depending on the current regex_flavor
- * setting here, because the setting could change before the pattern is
- * used. We minimize the risk by trusting the flavor as little as we can,
- * but perhaps it would be a good idea to get rid of the "basic" setting.
- */
- have_leading_paren = false;
- if (patt[pos] == '(' && !is_basic &&
- (patt[pos + 1] != '?' || patt[pos + 2] == ':'))
- {
- have_leading_paren = true;
- pos += (patt[pos + 1] != '?' ? 1 : 3);
- }
-
- /* Scan remainder of pattern */
- prev_pos = pos;
- while (patt[pos])
+ if (rest_selec != NULL)
{
- int len;
-
- /*
- * Check for characters that indicate multiple possible matches here.
- * Also, drop out at ')' or '$' so the termination test works right.
- */
- if (patt[pos] == '.' ||
- patt[pos] == '(' ||
- patt[pos] == ')' ||
- patt[pos] == '[' ||
- patt[pos] == '^' ||
- patt[pos] == '$')
- break;
-
- /*
- * XXX In multibyte character sets, we can't trust isalpha, so assume
- * any multibyte char is potentially case-varying.
- */
- if (case_insensitive)
+ if (exact)
{
- if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
- break;
- if (isalpha((unsigned char) patt[pos]))
- break;
- }
-
- /*
- * Check for quantifiers. Except for +, this means the preceding
- * character is optional, so we must remove it from the prefix too!
- * Note: in BREs, \{ is a quantifier.
- */
- if (patt[pos] == '*' ||
- patt[pos] == '?' ||
- patt[pos] == '{' ||
- (patt[pos] == '\\' && patt[pos + 1] == '{'))
- {
- match_pos = prev_match_pos;
- pos = prev_pos;
- break;
+ /* Exact match, so there's no additional selectivity */
+ *rest_selec = 1.0;
}
- if (patt[pos] == '+')
+ else
{
- pos = prev_pos;
- break;
- }
+ char *patt = TextDatumGetCString(patt_const->constvalue);
- /*
- * Normally, backslash quotes the next character. But in AREs,
- * backslash followed by alphanumeric is an escape, not a quoted
- * character. Must treat it as having multiple possible matches. In
- * BREs, \( is a parenthesis, so don't trust that either. Note: since
- * only ASCII alphanumerics are escapes, we don't have to be paranoid
- * about multibyte here.
- */
- if (patt[pos] == '\\')
- {
- if (isalnum((unsigned char) patt[pos + 1]) || patt[pos + 1] == '(')
- break;
- pos++;
- if (patt[pos] == '\0')
- break;
+ *rest_selec = regex_selectivity(patt, strlen(patt),
+ case_insensitive,
+ strlen(prefix));
+ pfree(patt);
}
- /* save position in case we need to back up on next loop cycle */
- prev_match_pos = match_pos;
- prev_pos = pos;
- /* must use encoding-aware processing here */
- len = pg_mblen(&patt[pos]);
- memcpy(&match[match_pos], &patt[pos], len);
- match_pos += len;
- pos += len;
}
- match[match_pos] = '\0';
- rest = &patt[pos];
-
- if (have_leading_paren && patt[pos] == ')')
- pos++;
-
- if (patt[pos] == '$' && patt[pos + 1] == '\0')
- {
- *prefix_const = string_to_const(match, typeid);
-
- if (rest_selec != NULL)
- *rest_selec = 1.0;
-
- pfree(patt);
- pfree(match);
+ pfree(prefix);
+ if (exact)
return Pattern_Prefix_Exact; /* pattern specifies exact match */
- }
-
- *prefix_const = string_to_const(match, typeid);
-
- if (rest_selec != NULL)
- *rest_selec = regex_selectivity(rest, strlen(rest),
- case_insensitive);
-
- pfree(patt);
- pfree(match);
-
- if (match_pos > 0)
+ else
return Pattern_Prefix_Partial;
-
- return Pattern_Prefix_None;
}
Pattern_Prefix_Status
@@ -5122,7 +4988,8 @@ regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
}
static Selectivity
-regex_selectivity(const char *patt, int pattlen, bool case_insensitive)
+regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
+ int fixed_prefix_len)
{
Selectivity sel;
@@ -5138,9 +5005,14 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive)
/* no trailing $ */
sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
sel *= FULL_WILDCARD_SEL;
- if (sel > 1.0)
- sel = 1.0;
}
+
+ /* If there's a fixed prefix, discount its selectivity */
+ if (fixed_prefix_len > 0)
+ sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len);
+
+ /* Make sure result stays in range */
+ CLAMP_PROBABILITY(sel);
return sel;
}