diff options
-rw-r--r-- | src/backend/utils/adt/regexp.c | 97 | ||||
-rw-r--r-- | src/test/regress/expected/strings.out | 9 | ||||
-rw-r--r-- | src/test/regress/sql/strings.sql | 3 |
3 files changed, 82 insertions, 27 deletions
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 6e2864cbbda..b0cdef9b19f 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -774,10 +774,8 @@ similar_escape_internal(text *pat_text, text *esc_text) elen; bool afterescape = false; int nquotes = 0; - int charclass_depth = 0; /* Nesting level of character classes, - * encompassed by square brackets */ - int charclass_start = 0; /* State of the character class start, - * for carets */ + int bracket_depth = 0; /* square bracket nesting level */ + int charclass_pos = 0; /* position inside a character class */ p = VARDATA_ANY(pat_text); plen = VARSIZE_ANY_EXHDR(pat_text); @@ -836,6 +834,17 @@ similar_escape_internal(text *pat_text, text *esc_text) * the relevant part separators in the above expansion. If the result * of this function is used in a plain regexp match (SIMILAR TO), the * escape-double-quotes have no effect on the match behavior. + * + * While we don't fully validate character classes (bracket expressions), + * we do need to parse them well enough to know where they end. + * "charclass_pos" tracks where we are in a character class. + * Its value is uninteresting when bracket_depth is 0. + * But when bracket_depth > 0, it will be + * 1: right after the opening '[' (a following '^' will negate + * the class, while ']' is a literal character) + * 2: right after a '^' after the opening '[' (']' is still a literal + * character) + * 3 or more: further inside the character class (']' ends the class) *---------- */ @@ -907,7 +916,7 @@ similar_escape_internal(text *pat_text, text *esc_text) /* fast path */ if (afterescape) { - if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */ + if (pchar == '"' && bracket_depth < 1) /* escape-double-quote? */ { /* emit appropriate part separator, per notes above */ if (nquotes == 0) @@ -948,6 +957,12 @@ similar_escape_internal(text *pat_text, text *esc_text) */ *r++ = '\\'; *r++ = pchar; + + /* + * If we encounter an escaped character in a character class, + * we are no longer at the beginning. + */ + charclass_pos = 3; } afterescape = false; } @@ -956,41 +971,69 @@ similar_escape_internal(text *pat_text, text *esc_text) /* SQL escape character; do not send to output */ afterescape = true; } - else if (charclass_depth > 0) + else if (bracket_depth > 0) { + /* inside a character class */ if (pchar == '\\') + { + /* + * If we're here, backslash is not the SQL escape character, + * so treat it as a literal class element, which requires + * doubling it. (This matches our behavior for backslashes + * outside character classes.) + */ *r++ = '\\'; + } *r++ = pchar; - /* - * Ignore a closing bracket at the start of a character class. - * Such a bracket is taken literally rather than closing the - * class. "charclass_start" is 1 right at the beginning of a - * class and 2 after an initial caret. - */ - if (pchar == ']' && charclass_start > 2) - charclass_depth--; + /* parse the character class well enough to identify ending ']' */ + if (pchar == ']' && charclass_pos > 2) + { + /* found the real end of a bracket pair */ + bracket_depth--; + /* don't reset charclass_pos, this may be an inner bracket */ + } else if (pchar == '[') - charclass_depth++; + { + /* start of a nested bracket pair */ + bracket_depth++; - /* - * If there is a caret right after the opening bracket, it negates - * the character class, but a following closing bracket should - * still be treated as a normal character. That holds only for - * the first caret, so only the values 1 and 2 mean that closing - * brackets should be taken literally. - */ - if (pchar == '^') - charclass_start++; + /* + * We are no longer at the beginning of a character class. + * (The nested bracket pair is a collating element, not a + * character class in its own right.) + */ + charclass_pos = 3; + } + else if (pchar == '^') + { + /* + * A caret right after the opening bracket negates the + * character class. In that case, the following will + * increment charclass_pos from 1 to 2, so that a following + * ']' is still a literal character and does not end the + * character class. If we are further inside a character + * class, charclass_pos might get incremented past 3, which is + * fine. + */ + charclass_pos++; + } else - charclass_start = 3; /* definitely past the start */ + { + /* + * Anything else (including a backslash or leading ']') is an + * element of the character class, so we are no longer at the + * beginning of the class. + */ + charclass_pos = 3; + } } else if (pchar == '[') { /* start of a character class */ *r++ = pchar; - charclass_depth++; - charclass_start = 1; + bracket_depth = 1; + charclass_pos = 1; } else if (pchar == '%') { diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index ba302da51e7..2d6cb02ad60 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -693,6 +693,15 @@ EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^'; Filter: (f1 ~ '^(?:[^^]\^)$'::text) (2 rows) +-- Closing square bracket after an escape sequence at the beginning of +-- a character closes the character class +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[|a]%' ESCAPE '|'; + QUERY PLAN +--------------------------------------- + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:[\a].*)$'::text) +(2 rows) + -- Test backslash escapes in regexp_replace's replacement string SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3'); regexp_replace diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index b94004cc08c..5ed421d6205 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -218,6 +218,9 @@ EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%'; -- Closing square bracket effective after two carets at the beginning -- of character class. EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^'; +-- Closing square bracket after an escape sequence at the beginning of +-- a character closes the character class +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[|a]%' ESCAPE '|'; -- Test backslash escapes in regexp_replace's replacement string SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3'); |