From 2a0af7fe460eb46f9af996075972bf7c2e3f211d Mon Sep 17 00:00:00 2001
From: Tom Lane
Date: Thu, 25 Feb 2021 13:00:40 -0500
Subject: Allow complemented character class escapes within regex brackets.

The complement-class escapes \D, \S, \W are now allowed within
bracket expressions. There is no semantic difficulty with doing
that, but the rather hokey macro-expansion-based implementation
previously used here couldn't cope.

Also, invent "word" as an allowed character class name, thus "\w"
is now equivalent to "[[:word:]]" outside brackets, or "[:word:]"
within brackets. POSIX allows such implementation-specific
extensions, and the same name is used in e.g. bash.

One surprising compatibility issue this raises is that constructs
such as "[\w-_]" are now disallowed, as our documentation has always
said they should be: character classes can't be endpoints of a range.
Previously, because \w was just a macro for "[:alnum:]_", such a
construct was read as "[[:alnum:]_-_]", so it was accepted so long as
the character after "-" was numerically greater than or equal to "_".

Some implementation cleanup along the way:

* Remove the lexnest() hack, and in consequence clean up wordchrs()
to not interact with the lexer.

* Fix colorcomplement() to not be O(N^2) in the number of colors
involved.

* Get rid of useless-as-far-as-I-can-see calls of element()
on single-character character element names in brackpart().
element() always maps these to the character itself, and things
would be quite broken if it didn't --- should "[a]" match something
different than "a" does? Besides, the shortcut path in brackpart()
wasn't doing this anyway, making it even more inconsistent.

Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us --- src/backend/regex/regc_lex.c | 166 +++++-------------------------------------- 1 file changed, 16 insertions(+), 150 deletions(-) (limited to 'src/backend/regex/regc_lex.c') diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c index 16664531641..7673dab76f4 100644 --- a/src/backend/regex/regc_lex.c +++ b/src/backend/regex/regc_lex.c @@ -193,83 +193,6 @@ prefixes(struct vars *v) } } -/* - * lexnest - "call a subroutine", interpolating string at the lexical level - * - * Note, this is not a very general facility. There are a number of - * implicit assumptions about what sorts of strings can be subroutines. - */ -static void -lexnest(struct vars *v, - const chr *beginp, /* start of interpolation */ - const chr *endp) /* one past end of interpolation */ -{ - assert(v->savenow == NULL); /* only one level of nesting */ - v->savenow = v->now; - v->savestop = v->stop; - v->now = beginp; - v->stop = endp; -} - -/* - * string constants to interpolate as expansions of things like \d - */ -static const chr backd[] = { /* \d */ - CHR('['), CHR('['), CHR(':'), - CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr backD[] = { /* \D */ - CHR('['), CHR('^'), CHR('['), CHR(':'), - CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr brbackd[] = { /* \d within brackets */ - CHR('['), CHR(':'), - CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), - CHR(':'), CHR(']') -}; -static const chr backs[] = { /* \s */ - CHR('['), CHR('['), CHR(':'), - CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr backS[] = { /* \S */ - CHR('['), CHR('^'), CHR('['), CHR(':'), - CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr brbacks[] = { /* \s within 
brackets */ - CHR('['), CHR(':'), - CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), - CHR(':'), CHR(']') -}; -static const chr backw[] = { /* \w */ - CHR('['), CHR('['), CHR(':'), - CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_'), CHR(']') -}; -static const chr backW[] = { /* \W */ - CHR('['), CHR('^'), CHR('['), CHR(':'), - CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_'), CHR(']') -}; -static const chr brbackw[] = { /* \w within brackets */ - CHR('['), CHR(':'), - CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_') -}; - -/* - * lexword - interpolate a bracket expression for word characters - * Possibly ought to inquire whether there is a "word" character class. - */ -static void -lexword(struct vars *v) -{ - lexnest(v, backw, ENDOF(backw)); -} - /* * next - get next token */ @@ -292,14 +215,6 @@ next(struct vars *v) RETV(SBEGIN, 0); /* same as \A */ } - /* if we're nested and we've hit end, return to outer level */ - if (v->savenow != NULL && ATEOS()) - { - v->now = v->savenow; - v->stop = v->savestop; - v->savenow = v->savestop = NULL; - } - /* skip white space etc. 
if appropriate (not in literal or []) */ if (v->cflags & REG_EXPANDED) switch (v->lexcon) @@ -420,32 +335,15 @@ next(struct vars *v) NOTE(REG_UNONPOSIX); if (ATEOS()) FAILW(REG_EESCAPE); - (DISCARD) lexescape(v); + if (!lexescape(v)) + return 0; switch (v->nexttype) { /* not all escapes okay here */ case PLAIN: + case CCLASSS: + case CCLASSC: return 1; break; - case CCLASS: - switch (v->nextvalue) - { - case 'd': - lexnest(v, brbackd, ENDOF(brbackd)); - break; - case 's': - lexnest(v, brbacks, ENDOF(brbacks)); - break; - case 'w': - lexnest(v, brbackw, ENDOF(brbackw)); - break; - default: - FAILW(REG_EESCAPE); - break; - } - /* lexnest done, back up and try again */ - v->nexttype = v->lasttype; - return next(v); - break; } /* not one of the acceptable escapes */ FAILW(REG_EESCAPE); @@ -691,49 +589,17 @@ next(struct vars *v) } RETV(PLAIN, *v->now++); } - (DISCARD) lexescape(v); - if (ISERR()) - FAILW(REG_EESCAPE); - if (v->nexttype == CCLASS) - { /* fudge at lexical level */ - switch (v->nextvalue) - { - case 'd': - lexnest(v, backd, ENDOF(backd)); - break; - case 'D': - lexnest(v, backD, ENDOF(backD)); - break; - case 's': - lexnest(v, backs, ENDOF(backs)); - break; - case 'S': - lexnest(v, backS, ENDOF(backS)); - break; - case 'w': - lexnest(v, backw, ENDOF(backw)); - break; - case 'W': - lexnest(v, backW, ENDOF(backW)); - break; - default: - assert(NOTREACHED); - FAILW(REG_ASSERT); - break; - } - /* lexnest done, back up and try again */ - v->nexttype = v->lasttype; - return next(v); - } - /* otherwise, lexescape has already done the work */ - return !ISERR(); + return lexescape(v); } /* * lexescape - parse an ARE backslash escape (backslash already eaten) - * Note slightly nonstandard use of the CCLASS type code. + * + * This is used for ARE backslashes both normally and inside bracket + * expressions. In the latter case, not all escape types are allowed, + * but the caller must reject unwanted ones after we return. 
*/ -static int /* not actually used, but convenient for RETV */ +static int lexescape(struct vars *v) { chr c; @@ -775,11 +641,11 @@ lexescape(struct vars *v) break; case CHR('d'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'd'); + RETV(CCLASSS, CC_DIGIT); break; case CHR('D'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'D'); + RETV(CCLASSC, CC_DIGIT); break; case CHR('e'): NOTE(REG_UUNPORT); @@ -802,11 +668,11 @@ lexescape(struct vars *v) break; case CHR('s'): NOTE(REG_ULOCALE); - RETV(CCLASS, 's'); + RETV(CCLASSS, CC_SPACE); break; case CHR('S'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'S'); + RETV(CCLASSC, CC_SPACE); break; case CHR('t'): RETV(PLAIN, CHR('\t')); @@ -828,11 +694,11 @@ lexescape(struct vars *v) break; case CHR('w'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'w'); + RETV(CCLASSS, CC_WORD); break; case CHR('W'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'W'); + RETV(CCLASSC, CC_WORD); break; case CHR('x'): NOTE(REG_UUNPORT); -- cgit v1.2.3