From b9edaa784e407d1b6e890b776c607a26f3aa7e49 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 10 Jul 2012 18:00:51 -0400 Subject: Back-patch fix for extraction of fixed prefixes from regular expressions. Back-patch of commits 628cbb50ba80c83917b07a7609ddec12cda172d0 and c6aae3042be5249e672b731ebeb21875b5343010. This has been broken since 7.3, so back-patch to all supported branches. --- src/backend/utils/adt/selfuncs.c | 204 ++++++++------------------------------- 1 file changed, 38 insertions(+), 166 deletions(-) (limited to 'src/backend/utils/adt/selfuncs.c') diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index ce7f7c487dd..39c43422a78 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -167,7 +167,8 @@ static Selectivity prefix_selectivity(VariableStatData *vardata, static Selectivity like_selectivity(const char *patt, int pattlen, bool case_insensitive); static Selectivity regex_selectivity(const char *patt, int pattlen, - bool case_insensitive); + bool case_insensitive, + int fixed_prefix_len); static Datum string_to_datum(const char *str, Oid datatype); static Const *string_to_const(const char *str, Oid datatype); static Const *string_to_bytea_const(const char *str, size_t str_len); @@ -4645,17 +4646,9 @@ static Pattern_Prefix_Status regex_fixed_prefix(Const *patt_const, bool case_insensitive, Const **prefix_const, Selectivity *rest_selec) { - char *match; - int pos, - match_pos, - prev_pos, - prev_match_pos; - bool have_leading_paren; - char *patt; - char *rest; Oid typeid = patt_const->consttype; - bool is_basic = regex_flavor_is_basic(); - bool is_multibyte = (pg_database_encoding_max_length() > 1); + char *prefix; + bool exact; /* * Should be unnecessary, there are no bytea regex operators defined. As @@ -4667,181 +4660,54 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("regular-expression matching not supported on type bytea"))); - /* the right-hand const is type text for all of these */ - patt = TextDatumGetCString(patt_const->constvalue); + /* Use the regexp machinery to extract the prefix, if any */ + prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue), + case_insensitive, + &exact); - /* - * Check for ARE director prefix. It's worth our trouble to recognize - * this because similar_escape() uses it. - */ - pos = 0; - if (strncmp(patt, "***:", 4) == 0) - { - pos = 4; - is_basic = false; - } - - /* Pattern must be anchored left */ - if (patt[pos] != '^') + if (prefix == NULL) { *prefix_const = NULL; if (rest_selec != NULL) - *rest_selec = regex_selectivity(patt, strlen(patt), - case_insensitive); - - return Pattern_Prefix_None; - } - pos++; - - /* - * If '|' is present in pattern, then there may be multiple alternatives - * for the start of the string. (There are cases where this isn't so, for - * instance if the '|' is inside parens, but detecting that reliably is - * too hard.) - */ - if (strchr(patt + pos, '|') != NULL) - { - *prefix_const = NULL; + { + char *patt = TextDatumGetCString(patt_const->constvalue); - if (rest_selec != NULL) *rest_selec = regex_selectivity(patt, strlen(patt), - case_insensitive); + case_insensitive, + 0); + pfree(patt); + } return Pattern_Prefix_None; } - /* OK, allocate space for pattern */ - match = palloc(strlen(patt) + 1); - prev_match_pos = match_pos = 0; + *prefix_const = string_to_const(prefix, typeid); - /* - * We special-case the syntax '^(...)$' because psql uses it. But beware: - * in BRE mode these parentheses are just ordinary characters. Also, - * sequences beginning "(?" are not what they seem, unless they're "(?:". - * (We should recognize that, too, because of similar_escape().) - * - * Note: it's a bit bogus to be depending on the current regex_flavor - * setting here, because the setting could change before the pattern is - * used. We minimize the risk by trusting the flavor as little as we can, - * but perhaps it would be a good idea to get rid of the "basic" setting. - */ - have_leading_paren = false; - if (patt[pos] == '(' && !is_basic && - (patt[pos + 1] != '?' || patt[pos + 2] == ':')) - { - have_leading_paren = true; - pos += (patt[pos + 1] != '?' ? 1 : 3); - } - - /* Scan remainder of pattern */ - prev_pos = pos; - while (patt[pos]) + if (rest_selec != NULL) { - int len; - - /* - * Check for characters that indicate multiple possible matches here. - * Also, drop out at ')' or '$' so the termination test works right. - */ - if (patt[pos] == '.' || - patt[pos] == '(' || - patt[pos] == ')' || - patt[pos] == '[' || - patt[pos] == '^' || - patt[pos] == '$') - break; - - /* - * XXX In multibyte character sets, we can't trust isalpha, so assume - * any multibyte char is potentially case-varying. - */ - if (case_insensitive) + if (exact) { - if (is_multibyte && (unsigned char) patt[pos] >= 0x80) - break; - if (isalpha((unsigned char) patt[pos])) - break; - } - - /* - * Check for quantifiers. Except for +, this means the preceding - * character is optional, so we must remove it from the prefix too! - * Note: in BREs, \{ is a quantifier. - */ - if (patt[pos] == '*' || - patt[pos] == '?' || - patt[pos] == '{' || - (patt[pos] == '\\' && patt[pos + 1] == '{')) - { - match_pos = prev_match_pos; - pos = prev_pos; - break; + /* Exact match, so there's no additional selectivity */ + *rest_selec = 1.0; } - if (patt[pos] == '+') + else { - pos = prev_pos; - break; - } + char *patt = TextDatumGetCString(patt_const->constvalue); - /* - * Normally, backslash quotes the next character. But in AREs, - * backslash followed by alphanumeric is an escape, not a quoted - * character. Must treat it as having multiple possible matches. In - * BREs, \( is a parenthesis, so don't trust that either. Note: since - * only ASCII alphanumerics are escapes, we don't have to be paranoid - * about multibyte here. - */ - if (patt[pos] == '\\') - { - if (isalnum((unsigned char) patt[pos + 1]) || patt[pos + 1] == '(') - break; - pos++; - if (patt[pos] == '\0') - break; + *rest_selec = regex_selectivity(patt, strlen(patt), + case_insensitive, + strlen(prefix)); + pfree(patt); } - /* save position in case we need to back up on next loop cycle */ - prev_match_pos = match_pos; - prev_pos = pos; - /* must use encoding-aware processing here */ - len = pg_mblen(&patt[pos]); - memcpy(&match[match_pos], &patt[pos], len); - match_pos += len; - pos += len; } - match[match_pos] = '\0'; - rest = &patt[pos]; - - if (have_leading_paren && patt[pos] == ')') - pos++; - - if (patt[pos] == '$' && patt[pos + 1] == '\0') - { - *prefix_const = string_to_const(match, typeid); - - if (rest_selec != NULL) - *rest_selec = 1.0; - - pfree(patt); - pfree(match); + pfree(prefix); + if (exact) return Pattern_Prefix_Exact; /* pattern specifies exact match */ - } - - *prefix_const = string_to_const(match, typeid); - - if (rest_selec != NULL) - *rest_selec = regex_selectivity(rest, strlen(rest), - case_insensitive); - - pfree(patt); - pfree(match); - - if (match_pos > 0) + else return Pattern_Prefix_Partial; - - return Pattern_Prefix_None; } Pattern_Prefix_Status @@ -5122,7 +4988,8 @@ regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive) } static Selectivity -regex_selectivity(const char *patt, int pattlen, bool case_insensitive) +regex_selectivity(const char *patt, int pattlen, bool case_insensitive, + int fixed_prefix_len) { Selectivity sel; @@ -5138,9 +5005,14 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive) /* no trailing $ */ sel = regex_selectivity_sub(patt, pattlen, case_insensitive); sel *= FULL_WILDCARD_SEL; - if (sel > 1.0) - sel = 1.0; } + + /* If there's a fixed prefix, discount its selectivity */ + if (fixed_prefix_len > 0) + sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len); + + /* Make sure result stays in range */ + CLAMP_PROBABILITY(sel); return sel; } -- cgit v1.2.3