diff options
Diffstat (limited to 'src/backend/regex')
-rw-r--r-- | src/backend/regex/COPYRIGHT | 56 | ||||
-rw-r--r-- | src/backend/regex/Makefile | 43 | ||||
-rw-r--r-- | src/backend/regex/WHATSNEW | 94 | ||||
-rw-r--r-- | src/backend/regex/engine.c | 1110 | ||||
-rw-r--r-- | src/backend/regex/re_format.7 | 269 | ||||
-rw-r--r-- | src/backend/regex/regcomp.c | 1863 | ||||
-rw-r--r-- | src/backend/regex/regerror.c | 185 | ||||
-rw-r--r-- | src/backend/regex/regex.3 | 538 | ||||
-rw-r--r-- | src/backend/regex/regexec.c | 194 | ||||
-rw-r--r-- | src/backend/regex/regfree.c | 77 | ||||
-rw-r--r-- | src/backend/regex/retest.c | 44 |
11 files changed, 0 insertions, 4473 deletions
diff --git a/src/backend/regex/COPYRIGHT b/src/backend/regex/COPYRIGHT deleted file mode 100644 index 574f6bcec6c..00000000000 --- a/src/backend/regex/COPYRIGHT +++ /dev/null @@ -1,56 +0,0 @@ -Copyright 1992, 1993, 1994 Henry Spencer. All rights reserved. -This software is not subject to any license of the American Telephone -and Telegraph Company or of the Regents of the University of California. - -Permission is granted to anyone to use this software for any purpose on -any computer system, and to alter it and redistribute it, subject -to the following restrictions: - -1. The author is not responsible for the consequences of use of this - software, no matter how awful, even if they arise from flaws in it. - -2. The origin of this software must not be misrepresented, either by - explicit claim or by omission. Since few users ever read sources, - credits must appear in the documentation. - -3. Altered versions must be plainly marked as such, and must not be - misrepresented as being the original software. Since few users - ever read sources, credits must appear in the documentation. - -4. This notice may not be removed or altered. - -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -/*- - * Copyright (c) 1994 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)COPYRIGHT 8.1 (Berkeley) 3/16/94 - */ diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile deleted file mode 100644 index 6dc606b16d2..00000000000 --- a/src/backend/regex/Makefile +++ /dev/null @@ -1,43 +0,0 @@ -#------------------------------------------------------------------------- -# -# Makefile-- -# Makefile for regex -# -# IDENTIFICATION -# $Header: /cvsroot/pgsql/src/backend/regex/Makefile,v 1.17 2001/10/04 04:16:16 ishii Exp $ -# -#------------------------------------------------------------------------- - -subdir = src/backend/regex -top_builddir = ../../.. -include $(top_builddir)/src/Makefile.global - -override CPPFLAGS += -DPOSIX_MISTAKE - -DEBUGOBJ = - -OBJS = regcomp.o regerror.o regexec.o regfree.o - -ifdef MULTIBYTE -DEBUGOBJ += ../utils/mb/SUBSYS.o -endif - -all: SUBSYS.o - -SUBSYS.o: $(OBJS) - $(LD) $(LDREL) $(LDOUT) SUBSYS.o $(OBJS) - -regexec.o: regexec.c engine.c - -retest: retest.o SUBSYS.o $(DEBUGOBJ) - $(CC) $(CFLAGS) $(LDFLAGS) $^ $(LIBS) -o $@ - -depend dep: - $(CC) -MM $(CFLAGS) *.c >depend - -clean: - rm -f SUBSYS.o $(OBJS) retest retest.o - -ifeq (depend,$(wildcard depend)) -include depend -endif diff --git a/src/backend/regex/WHATSNEW b/src/backend/regex/WHATSNEW deleted file mode 100644 index f4301d300dd..00000000000 --- a/src/backend/regex/WHATSNEW +++ /dev/null @@ -1,94 +0,0 @@ -# @(#)WHATSNEW 8.3 (Berkeley) 3/18/94 - -New in alpha3.4: The complex bug alluded to below has been fixed (in a -slightly kludgey temporary way that may hurt efficiency a bit; this is -another "get it out the door for 4.4" release). The tests at the end of -the tests file have accordingly been uncommented. The primary sign of -the bug was that something like a?b matching ab matched b rather than ab. -(The bug was essentially specific to this exact situation, else it would -have shown up earlier.) - -New in alpha3.3: The definition of word boundaries has been altered -slightly, to more closely match the usual programming notion that "_" -is an alphabetic. Stuff used for pre-ANSI systems is now in a subdir, -and the makefile no longer alludes to it in mysterious ways. The -makefile has generally been cleaned up some. Fixes have been made -(again!) so that the regression test will run without -DREDEBUG, at -the cost of weaker checking. A workaround for a bug in some folks' -<assert.h> has been added. And some more things have been added to -tests, including a couple right at the end which are commented out -because the code currently flunks them (complex bug; fix coming). -Plus the usual minor cleanup. - -New in alpha3.2: Assorted bits of cleanup and portability improvement -(the development base is now a BSDI system using GCC instead of an ancient -Sun system, and the newer compiler exposed some glitches). Fix for a -serious bug that affected REs using many [] (including REG_ICASE REs -because of the way they are implemented), *sometimes*, depending on -memory-allocation patterns. The header-file prototypes no longer name -the parameters, avoiding possible name conflicts. The possibility that -some clot has defined CHAR_MIN as (say) `-128' instead of `(-128)' is -now handled gracefully. "uchar" is no longer used as an internal type -name (too many people have the same idea). Still the same old lousy -performance, alas. - -New in alpha3.1: Basically nothing, this release is just a bookkeeping -convenience. Stay tuned. - -New in alpha3.0: Performance is no better, alas, but some fixes have been -made and some functionality has been added. (This is basically the "get -it out the door in time for 4.4" release.) One bug fix: regfree() didn't -free the main internal structure (how embarrassing). It is now possible -to put NULs in either the RE or the target string, using (resp.) a new -REG_PEND flag and the old REG_STARTEND flag. The REG_NOSPEC flag to -regcomp() makes all characters ordinary, so you can match a literal -string easily (this will become more useful when performance improves!). -There are now primitives to match beginnings and ends of words, although -the syntax is disgusting and so is the implementation. The REG_ATOI -debugging interface has changed a bit. And there has been considerable -internal cleanup of various kinds. - -New in alpha2.3: Split change list out of README, and moved flags notes -into Makefile. Macro-ized the name of regex(7) in regex(3), since it has -to change for 4.4BSD. Cleanup work in engine.c, and some new regression -tests to catch tricky cases thereof. - -New in alpha2.2: Out-of-date manpages updated. Regerror() acquires two -small extensions -- REG_ITOA and REG_ATOI -- which avoid debugging kludges -in my own test program and might be useful to others for similar purposes. -The regression test will now compile (and run) without REDEBUG. The -BRE \$ bug is fixed. Most uses of "uchar" are gone; it's all chars now. -Char/uchar parameters are now written int/unsigned, to avoid possible -portability problems with unpromoted parameters. Some unsigned casts have -been introduced to minimize portability problems with shifting into sign -bits. - -New in alpha2.1: Lots of little stuff, cleanup and fixes. The one big -thing is that regex.h is now generated, using mkh, rather than being -supplied in the distribution; due to circularities in dependencies, -you have to build regex.h explicitly by "make h". The two known bugs -have been fixed (and the regression test now checks for them), as has a -problem with assertions not being suppressed in the absence of REDEBUG. -No performance work yet. - -New in alpha2: Backslash-anything is an ordinary character, not an -error (except, of course, for the handful of backslashed metacharacters -in BREs), which should reduce script breakage. The regression test -checks *where* null strings are supposed to match, and has generally -been tightened up somewhat. Small bug fixes in parameter passing (not -harmful, but technically errors) and some other areas. Debugging -invoked by defining REDEBUG rather than not defining NDEBUG. - -New in alpha+3: full prototyping for internal routines, using a little -helper program, mkh, which extracts prototypes given in stylized comments. -More minor cleanup. Buglet fix: it's CHAR_BIT, not CHAR_BITS. Simple -pre-screening of input when a literal string is known to be part of the -RE; this does wonders for performance. - -New in alpha+2: minor bits of cleanup. Notably, the number "32" for the -word width isn't hardwired into regexec.c any more, the public header -file prototypes the functions if __STDC__ is defined, and some small typos -in the manpages have been fixed. - -New in alpha+1: improvements to the manual pages, and an important -extension, the REG_STARTEND option to regexec(). diff --git a/src/backend/regex/engine.c b/src/backend/regex/engine.c deleted file mode 100644 index 7e77054eecf..00000000000 --- a/src/backend/regex/engine.c +++ /dev/null @@ -1,1110 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)engine.c 8.5 (Berkeley) 3/20/94 - */ - -#include "postgres.h" - -/* - * The matching engine and friends. This file is #included by regexec.c - * after suitable #defines of a variety of macros used herein, so that - * different state representations can be used without duplicating masses - * of code. - */ - -#ifdef SNAMES -#define matcher smatcher -#define fast sfast -#define slow sslow -#define dissect sdissect -#define backref sbackref -#define step sstep -#define print sprint -#define at sat -#define match smat -#endif -#ifdef LNAMES -#define matcher lmatcher -#define fast lfast -#define slow lslow -#define dissect ldissect -#define backref lbackref -#define step lstep -#define print lprint -#define at lat -#define match lmat -#endif - -/* another structure passed up and down to avoid zillions of parameters */ -struct match -{ - struct re_guts *g; - int eflags; - regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ - pg_wchar *offp; /* offsets work from here */ - pg_wchar *beginp; /* start of string -- virtual NUL precedes */ - pg_wchar *endp; /* end of string -- virtual NUL here */ - pg_wchar *coldp; /* can be no match starting before here */ - pg_wchar **lastpos; /* [nplus+1] */ - STATEVARS; - states st; /* current states */ - states fresh; /* states for a fresh start */ - states tmp; /* temporary */ - states empty; /* empty set of states */ -}; - -static int matcher(struct re_guts * g, pg_wchar *string, size_t nmatch, - regmatch_t *pmatch, int eflags); -static pg_wchar *dissect(struct match * m, pg_wchar *start, pg_wchar *stop, - sopno startst, sopno stopst); -static pg_wchar *backref(struct match * m, pg_wchar *start, pg_wchar *stop, - sopno startst, sopno stopst, sopno lev); -static pg_wchar *fast(struct match * m, pg_wchar *start, pg_wchar *stop, - sopno startst, sopno stopst); -static pg_wchar *slow(struct match * m, pg_wchar *start, pg_wchar *stop, - sopno startst, sopno stopst); -static states step(struct re_guts * g, sopno start, - sopno stop, states bef, int ch, states aft); - -#define BOL (OUT+1) -#define EOL (BOL+1) -#define BOLEOL (BOL+2) -#define NOTHING (BOL+3) -#define BOW (BOL+4) -#define EOW (BOL+5) -#define CODEMAX (BOL+5) /* highest code used */ - -#ifdef MULTIBYTE -#define NONCHAR(c) ((c) > 16777216) /* 16777216 == 2^24 == 3 bytes */ -#define NNONCHAR (CODEMAX-16777216) -#else -#define NONCHAR(c) ((c) > CHAR_MAX) -#define NNONCHAR (CODEMAX-CHAR_MAX) -#endif - -#ifdef REDEBUG -static void print(struct match * m, pg_wchar *caption, states st, int ch, - FILE *d); -static void at(struct match * m, pg_wchar *title, pg_wchar *start, - pg_wchar *stop, sopno startst, sopno stopst); -static pg_wchar *pchar(int ch); -static int pg_isprint(int c); -#endif - -#ifdef REDEBUG -#define SP(t, s, c) print(m, t, s, c, stdout) -#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) -#define NOTE(str) \ -do { \ - if (m->eflags®_TRACE) \ - printf("=%s\n", (str)); \ -} while (0) - -#else -#define SP(t, s, c) /* nothing */ -#define AT(t, p1, p2, s1, s2) /* nothing */ -#define NOTE(s) /* nothing */ -#endif - -/* - * matcher - the actual matching engine - */ -static int /* 0 success, REG_NOMATCH failure */ -matcher(struct re_guts * g, pg_wchar *string, size_t nmatch, - regmatch_t *pmatch, int eflags) -{ - pg_wchar *endp; - int i; - struct match mv; - struct match *m = &mv; - pg_wchar *dp; - const sopno gf = g->firststate + 1; /* +1 for OEND */ - const sopno gl = g->laststate; - pg_wchar *start; - pg_wchar *stop; - - /* simplify the situation where possible */ - if (g->cflags & REG_NOSUB) - nmatch = 0; - if (eflags & REG_STARTEND) - { - start = string + pmatch[0].rm_so; - stop = string + pmatch[0].rm_eo; - } - else - { - start = string; -#ifdef MULTIBYTE - stop = start + pg_wchar_strlen(start); -#else - stop = start + strlen(start); -#endif - } - if (stop < start) - return REG_INVARG; - - /* prescreening; this does wonders for this rather slow code */ - if (g->must != NULL) - { - for (dp = start; dp < stop; dp++) - if (*dp == g->must[0] && stop - dp >= g->mlen && -#ifdef MULTIBYTE - memcmp(dp, g->must, (size_t) (g->mlen * sizeof(pg_wchar))) == 0 -#else - memcmp(dp, g->must, (size_t) g->mlen) == 0 -#endif - ) - break; - if (dp == stop) /* we didn't find g->must */ - return REG_NOMATCH; - } - - /* match struct setup */ - m->g = g; - m->eflags = eflags; - m->pmatch = NULL; - m->lastpos = NULL; - m->offp = string; - m->beginp = start; - m->endp = stop; - STATESETUP(m, 4); - SETUP(m->st); - SETUP(m->fresh); - SETUP(m->tmp); - SETUP(m->empty); - CLEAR(m->empty); - - /* this loop does only one repetition except for backrefs */ - for (;;) - { - endp = fast(m, start, stop, gf, gl); - if (endp == NULL) - { /* a miss */ - STATETEARDOWN(m); - return REG_NOMATCH; - } - if (nmatch == 0 && !g->backrefs) - break; /* no further info needed */ - - /* where? */ - assert(m->coldp != NULL); - for (;;) - { - NOTE("finding start"); - endp = slow(m, m->coldp, stop, gf, gl); - if (endp != NULL) - break; - assert(m->coldp < m->endp); - m->coldp++; - } - if (nmatch == 1 && !g->backrefs) - break; /* no further info needed */ - - /* oh my, he wants the subexpressions... */ - if (m->pmatch == NULL) - m->pmatch = (regmatch_t *) malloc((m->g->nsub + 1) * - sizeof(regmatch_t)); - if (m->pmatch == NULL) - { - STATETEARDOWN(m); - return REG_ESPACE; - } - for (i = 1; i <= m->g->nsub; i++) - m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; - if (!g->backrefs && !(m->eflags & REG_BACKR)) - { - NOTE("dissecting"); - dp = dissect(m, m->coldp, endp, gf, gl); - } - else - { - if (g->nplus > 0 && m->lastpos == NULL) - m->lastpos = (pg_wchar **) malloc((g->nplus + 1) * - sizeof(pg_wchar *)); - if (g->nplus > 0 && m->lastpos == NULL) - { - free(m->pmatch); - STATETEARDOWN(m); - return REG_ESPACE; - } - NOTE("backref dissect"); - dp = backref(m, m->coldp, endp, gf, gl, (sopno) 0); - } - if (dp != NULL) - break; - - /* uh-oh... we couldn't find a subexpression-level match */ - assert(g->backrefs); /* must be back references doing it */ - assert(g->nplus == 0 || m->lastpos != NULL); - for (;;) - { - if (dp != NULL || endp <= m->coldp) - break; /* defeat */ - NOTE("backoff"); - endp = slow(m, m->coldp, endp - 1, gf, gl); - if (endp == NULL) - break; /* defeat */ - /* try it on a shorter possibility */ -#ifndef NDEBUG - for (i = 1; i <= m->g->nsub; i++) - { - assert(m->pmatch[i].rm_so == -1); - assert(m->pmatch[i].rm_eo == -1); - } -#endif - NOTE("backoff dissect"); - dp = backref(m, m->coldp, endp, gf, gl, (sopno) 0); - } - assert(dp == NULL || dp == endp); - if (dp != NULL) /* found a shorter one */ - break; - - /* despite initial appearances, there is no match here */ - NOTE("false alarm"); - start = m->coldp + 1; /* recycle starting later */ - assert(start <= stop); - } - - /* fill in the details if requested */ - if (nmatch > 0) - { - pmatch[0].rm_so = m->coldp - m->offp; - pmatch[0].rm_eo = endp - m->offp; - } - if (nmatch > 1) - { - assert(m->pmatch != NULL); - for (i = 1; i < nmatch; i++) - if (i <= m->g->nsub) - pmatch[i] = m->pmatch[i]; - else - { - pmatch[i].rm_so = -1; - pmatch[i].rm_eo = -1; - } - } - - if (m->pmatch != NULL) - free((pg_wchar *) m->pmatch); - if (m->lastpos != NULL) - free((pg_wchar *) m->lastpos); - STATETEARDOWN(m); - return 0; -} - -/* - * dissect - figure out what matched what, no back references - */ -static pg_wchar * /* == stop (success) always */ -dissect(struct match * m, pg_wchar *start, pg_wchar *stop, - sopno startst, sopno stopst) -{ - int i; - sopno ss; /* start sop of current subRE */ - sopno es; /* end sop of current subRE */ - pg_wchar *sp; /* start of string matched by it */ - pg_wchar *stp; /* string matched by it cannot pass here */ - pg_wchar *rest; /* start of rest of string */ - pg_wchar *tail; /* string unmatched by rest of RE */ - sopno ssub; /* start sop of subsubRE */ - sopno esub; /* end sop of subsubRE */ - pg_wchar *ssp; /* start of string matched by subsubRE */ - pg_wchar *sep; /* end of string matched by subsubRE */ - pg_wchar *oldssp; /* previous ssp */ - pg_wchar *dp; - - AT("diss", start, stop, startst, stopst); - sp = start; - for (ss = startst; ss < stopst; ss = es) - { - /* identify end of subRE */ - es = ss; - switch (OP(m->g->strip[es])) - { - case OPLUS_: - case OQUEST_: - es += OPND(m->g->strip[es]); - break; - case OCH_: - while (OP(m->g->strip[es]) != O_CH) - es += OPND(m->g->strip[es]); - break; - } - es++; - - /* figure out what it matched */ - switch (OP(m->g->strip[ss])) - { - case OEND: - assert(nope); - break; - case OCHAR: - sp++; - break; - case OBOL: - case OEOL: - case OBOW: - case OEOW: - break; - case OANY: - case OANYOF: - sp++; - break; - case OBACK_: - case O_BACK: - assert(nope); - break; - /* cases where length of match is hard to find */ - case OQUEST_: - stp = stop; - for (;;) - { - /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); - assert(rest != NULL); /* it did match */ - /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); - if (tail == stop) - break; /* yes! */ - /* no -- try a shorter match for this one */ - stp = rest - 1; - assert(stp >= sp); /* it did work */ - } - ssub = ss + 1; - esub = es - 1; - /* did innards match? */ - if (slow(m, sp, rest, ssub, esub) != NULL) - { - dp = dissect(m, sp, rest, ssub, esub); - assert(dp == rest); - } - else -/* no */ - assert(sp == rest); - sp = rest; - break; - case OPLUS_: - stp = stop; - for (;;) - { - /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); - assert(rest != NULL); /* it did match */ - /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); - if (tail == stop) - break; /* yes! */ - /* no -- try a shorter match for this one */ - stp = rest - 1; - assert(stp >= sp); /* it did work */ - } - ssub = ss + 1; - esub = es - 1; - ssp = sp; - oldssp = ssp; - for (;;) - { /* find last match of innards */ - sep = slow(m, ssp, rest, ssub, esub); - if (sep == NULL || sep == ssp) - break; /* failed or matched null */ - oldssp = ssp; /* on to next try */ - ssp = sep; - } - if (sep == NULL) - { - /* last successful match */ - sep = ssp; - ssp = oldssp; - } - assert(sep == rest); /* must exhaust substring */ - assert(slow(m, ssp, sep, ssub, esub) == rest); - dp = dissect(m, ssp, sep, ssub, esub); - assert(dp == sep); - sp = rest; - break; - case OCH_: - stp = stop; - for (;;) - { - /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); - assert(rest != NULL); /* it did match */ - /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); - if (tail == stop) - break; /* yes! */ - /* no -- try a shorter match for this one */ - stp = rest - 1; - assert(stp >= sp); /* it did work */ - } - ssub = ss + 1; - esub = ss + OPND(m->g->strip[ss]) - 1; - assert(OP(m->g->strip[esub]) == OOR1); - for (;;) - { /* find first matching branch */ - if (slow(m, sp, rest, ssub, esub) == rest) - break; /* it matched all of it */ - /* that one missed, try next one */ - assert(OP(m->g->strip[esub]) == OOR1); - esub++; - assert(OP(m->g->strip[esub]) == OOR2); - ssub = esub + 1; - esub += OPND(m->g->strip[esub]); - if (OP(m->g->strip[esub]) == OOR2) - esub--; - else - assert(OP(m->g->strip[esub]) == O_CH); - } - dp = dissect(m, sp, rest, ssub, esub); - assert(dp == rest); - sp = rest; - break; - case O_PLUS: - case O_QUEST: - case OOR1: - case OOR2: - case O_CH: - assert(nope); - break; - case OLPAREN: - i = OPND(m->g->strip[ss]); - assert(0 < i && i <= m->g->nsub); - m->pmatch[i].rm_so = sp - m->offp; - break; - case ORPAREN: - i = OPND(m->g->strip[ss]); - assert(0 < i && i <= m->g->nsub); - m->pmatch[i].rm_eo = sp - m->offp; - break; - default: /* uh oh */ - assert(nope); - break; - } - } - - assert(sp == stop); - return sp; -} - -/* - * backref - figure out what matched what, figuring in back references - * - * lev is PLUS nesting level - */ -static pg_wchar * /* == stop (success) or NULL (failure) */ -backref(struct match * m, pg_wchar *start, pg_wchar *stop, - sopno startst, sopno stopst, sopno lev) -{ - int i; - sopno ss; /* start sop of current subRE */ - pg_wchar *sp; /* start of string matched by it */ - sopno ssub; /* start sop of subsubRE */ - sopno esub; /* end sop of subsubRE */ - pg_wchar *ssp; /* start of string matched by subsubRE */ - pg_wchar *dp; - size_t len; - int hard; - sop s; - regoff_t offsave; - cset *cs; - - AT("back", start, stop, startst, stopst); - sp = start; - - /* get as far as we can with easy stuff */ - hard = 0; - for (ss = startst; !hard && ss < stopst; ss++) - switch (OP(s = m->g->strip[ss])) - { - case OCHAR: - if (sp == stop || *sp++ != (pg_wchar) OPND(s)) - return NULL; - break; - case OANY: - if (sp == stop) - return NULL; - sp++; - break; - case OANYOF: - cs = &m->g->sets[OPND(s)]; - if (sp == stop || !CHIN(cs, *sp++)) - return NULL; - break; - case OBOL: - if ((sp == m->beginp && !(m->eflags & REG_NOTBOL)) || - (sp < m->endp && *(sp - 1) == '\n' && - (m->g->cflags & REG_NEWLINE))) - { /* yes */ - } - else - return NULL; - break; - case OEOL: - if ((sp == m->endp && !(m->eflags & REG_NOTEOL)) || - (sp < m->endp && *sp == '\n' && - (m->g->cflags & REG_NEWLINE))) - { /* yes */ - } - else - return NULL; - break; - case OBOW: - if (((sp == m->beginp && !(m->eflags & REG_NOTBOL)) || - (sp < m->endp && *(sp - 1) == '\n' && - (m->g->cflags & REG_NEWLINE)) || - (sp > m->beginp && - !ISWORD(*(sp - 1)))) && - (sp < m->endp && ISWORD(*sp))) - { /* yes */ - } - else - return NULL; - break; - case OEOW: - if (((sp == m->endp && !(m->eflags & REG_NOTEOL)) || - (sp < m->endp && *sp == '\n' && - (m->g->cflags & REG_NEWLINE)) || - (sp < m->endp && !ISWORD(*sp))) && - (sp > m->beginp && ISWORD(*(sp - 1)))) - { /* yes */ - } - else - return NULL; - break; - case O_QUEST: - break; - case OOR1: /* matches null but needs to skip */ - ss++; - s = m->g->strip[ss]; - do - { - assert(OP(s) == OOR2); - ss += OPND(s); - } while (OP(s = m->g->strip[ss]) != O_CH); - /* note that the ss++ gets us past the O_CH */ - break; - default: /* have to make a choice */ - hard = 1; - break; - } - if (!hard) - { /* that was it! */ - if (sp != stop) - return NULL; - return sp; - } - ss--; /* adjust for the for's final increment */ - - /* the hard stuff */ - AT("hard", sp, stop, ss, stopst); - s = m->g->strip[ss]; - switch (OP(s)) - { - case OBACK_: /* the vilest depths */ - i = OPND(s); - assert(0 < i && i <= m->g->nsub); - if (m->pmatch[i].rm_eo == -1) - return NULL; - assert(m->pmatch[i].rm_so != -1); - len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; - assert(stop - m->beginp >= len); - if (sp > stop - len) - return NULL; /* not enough left to match */ - ssp = m->offp + m->pmatch[i].rm_so; - if (memcmp(sp, ssp, len) != 0) - return NULL; - while (m->g->strip[ss] != SOP(O_BACK, i)) - ss++; - return backref(m, sp + len, stop, ss + 1, stopst, lev); - break; - case OQUEST_: /* to null or not */ - dp = backref(m, sp, stop, ss + 1, stopst, lev); - if (dp != NULL) - return dp; /* not */ - return backref(m, sp, stop, ss + OPND(s) + 1, stopst, lev); - break; - case OPLUS_: - assert(m->lastpos != NULL); - assert(lev + 1 <= m->g->nplus); - m->lastpos[lev + 1] = sp; - return backref(m, sp, stop, ss + 1, stopst, lev + 1); - break; - case O_PLUS: - if (sp == m->lastpos[lev]) /* last pass matched null */ - return backref(m, sp, stop, ss + 1, stopst, lev - 1); - /* try another pass */ - m->lastpos[lev] = sp; - dp = backref(m, sp, stop, ss - OPND(s) + 1, stopst, lev); - if (dp == NULL) - return backref(m, sp, stop, ss + 1, stopst, lev - 1); - else - return dp; - break; - case OCH_: /* find the right one, if any */ - ssub = ss + 1; - esub = ss + OPND(s) - 1; - assert(OP(m->g->strip[esub]) == OOR1); - for (;;) - { /* find first matching branch */ - dp = backref(m, sp, stop, ssub, esub, lev); - if (dp != NULL) - return dp; - /* that one missed, try next one */ - if (OP(m->g->strip[esub]) == O_CH) - return NULL; /* there is none */ - esub++; - assert(OP(m->g->strip[esub]) == OOR2); - ssub = esub + 1; - esub += OPND(m->g->strip[esub]); - if (OP(m->g->strip[esub]) == OOR2) - esub--; - else - assert(OP(m->g->strip[esub]) == O_CH); - } - break; - case OLPAREN: /* must undo assignment if rest fails */ - i = OPND(s); - assert(0 < i && i <= m->g->nsub); - offsave = m->pmatch[i].rm_so; - m->pmatch[i].rm_so = sp - m->offp; - dp = backref(m, sp, stop, ss + 1, stopst, lev); - if (dp != NULL) - return dp; - m->pmatch[i].rm_so = offsave; - return NULL; - break; - case ORPAREN: /* must undo assignment if rest fails */ - i = OPND(s); - assert(0 < i && i <= m->g->nsub); - offsave = m->pmatch[i].rm_eo; - m->pmatch[i].rm_eo = sp - m->offp; - dp = backref(m, sp, stop, ss + 1, stopst, lev); - if (dp != NULL) - return dp; - m->pmatch[i].rm_eo = offsave; - return NULL; - break; - default: /* uh oh */ - assert(nope); - break; - } - - /* "can't happen" */ - assert(nope); - /* NOTREACHED */ - return 0; -} - -/* - * fast - step through the string at top speed - */ -static pg_wchar * /* where tentative match ended, or NULL */ -fast(struct match * m, pg_wchar *start, pg_wchar *stop, - sopno startst, sopno stopst) -{ - states st = m->st; - states fresh = m->fresh; - states tmp = m->tmp; - pg_wchar *p = start; - int c = (start == m->beginp) ? OUT : *(start - 1); - int lastc; /* previous c */ - int flagch; - int i; - pg_wchar *coldp; /* last p after which no match was - * underway */ - - CLEAR(st); - SET1(st, startst); - st = step(m->g, startst, stopst, st, NOTHING, st); - ASSIGN(fresh, st); - SP("start", st, *p); - coldp = NULL; - for (;;) - { - /* next character */ - lastc = c; - c = (p == m->endp) ? OUT : *p; - if (EQ(st, fresh)) - coldp = p; - - /* is there an EOL and/or BOL between lastc and c? */ - flagch = '\0'; - i = 0; - if ((lastc == '\n' && m->g->cflags & REG_NEWLINE) || - (lastc == OUT && !(m->eflags & REG_NOTBOL))) - { - flagch = BOL; - i = m->g->nbol; - } - if ((c == '\n' && m->g->cflags & REG_NEWLINE) || - (c == OUT && !(m->eflags & REG_NOTEOL))) - { - flagch = (flagch == BOL) ? BOLEOL : EOL; - i += m->g->neol; - } - if (i != 0) - { - for (; i > 0; i--) - st = step(m->g, startst, stopst, st, flagch, st); - SP("boleol", st, c); - } - - /* how about a word boundary? */ - if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && - (c != OUT && ISWORD(c))) - flagch = BOW; - if ((lastc != OUT && ISWORD(lastc)) && - (flagch == EOL || (c != OUT && !ISWORD(c)))) - flagch = EOW; - if (flagch == BOW || flagch == EOW) - { - st = step(m->g, startst, stopst, st, flagch, st); - SP("boweow", st, c); - } - - /* are we done? */ - if (ISSET(st, stopst) || p == stop) - break; /* NOTE BREAK OUT */ - - /* no, we must deal with this character */ - ASSIGN(tmp, st); - ASSIGN(st, fresh); - assert(c != OUT); - st = step(m->g, startst, stopst, tmp, c, st); - SP("aft", st, c); - assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); - p++; - } - - assert(coldp != NULL); - m->coldp = coldp; - if (ISSET(st, stopst)) - return p + 1; - else - return NULL; -} - -/* - * slow - step through the string more deliberately - */ -static pg_wchar * /* where it ended */ -slow(struct match * m, pg_wchar *start, pg_wchar *stop, - sopno startst, sopno stopst) -{ - states st = m->st; - states empty = m->empty; - states tmp = m->tmp; - pg_wchar *p = start; - int c = (start == m->beginp) ? OUT : *(start - 1); - int lastc; /* previous c */ - int flagch; - int i; - pg_wchar *matchp; /* last p at which a match ended */ - - AT("slow", start, stop, startst, stopst); - CLEAR(st); - SET1(st, startst); - SP("sstart", st, *p); - st = step(m->g, startst, stopst, st, NOTHING, st); - matchp = NULL; - for (;;) - { - /* next character */ - lastc = c; - c = (p == m->endp) ? OUT : *p; - - /* is there an EOL and/or BOL between lastc and c? */ - flagch = '\0'; - i = 0; - if ((lastc == '\n' && m->g->cflags & REG_NEWLINE) || - (lastc == OUT && !(m->eflags & REG_NOTBOL))) - { - flagch = BOL; - i = m->g->nbol; - } - if ((c == '\n' && m->g->cflags & REG_NEWLINE) || - (c == OUT && !(m->eflags & REG_NOTEOL))) - { - flagch = (flagch == BOL) ? BOLEOL : EOL; - i += m->g->neol; - } - if (i != 0) - { - for (; i > 0; i--) - st = step(m->g, startst, stopst, st, flagch, st); - SP("sboleol", st, c); - } - - /* how about a word boundary? */ - if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && - (c != OUT && ISWORD(c))) - flagch = BOW; - if ((lastc != OUT && ISWORD(lastc)) && - (flagch == EOL || (c != OUT && !ISWORD(c)))) - flagch = EOW; - if (flagch == BOW || flagch == EOW) - { - st = step(m->g, startst, stopst, st, flagch, st); - SP("sboweow", st, c); - } - - /* are we done? */ - if (ISSET(st, stopst)) - matchp = p; - if (EQ(st, empty) || p == stop) - break; /* NOTE BREAK OUT */ - - /* no, we must deal with this character */ - ASSIGN(tmp, st); - ASSIGN(st, empty); - assert(c != OUT); - st = step(m->g, startst, stopst, tmp, c, st); - SP("saft", st, c); - assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); - p++; - } - - return matchp; -} - - -/* - * step - map set of states reachable before char to set reachable after - */ -static states -step(struct re_guts * g, - sopno start, /* start state within strip */ - sopno stop, /* state after stop state within strip */ - states bef, /* states reachable before */ - int ch, /* character or NONCHAR code */ - states aft) /* states already known reachable after */ -{ - cset *cs; - sop s; - sopno pc; - onestate here; /* note, macros know this name */ - sopno look; - int i; - - for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) - { - s = g->strip[pc]; - switch (OP(s)) - { - case OEND: - assert(pc == stop - 1); - break; - case OCHAR: - /* only characters can match */ - assert(!NONCHAR(ch) || ch != (pg_wchar) OPND(s)); - if (ch == (pg_wchar) OPND(s)) - FWD(aft, bef, 1); - break; - case OBOL: - if (ch == BOL || ch == BOLEOL) - FWD(aft, bef, 1); - break; - case OEOL: - if (ch == EOL || ch == BOLEOL) - FWD(aft, bef, 1); - break; - case OBOW: - if (ch == BOW) - FWD(aft, bef, 1); - break; - case OEOW: - if (ch == EOW) - FWD(aft, bef, 1); - break; - case OANY: - if (!NONCHAR(ch)) - FWD(aft, bef, 1); - break; - case OANYOF: - cs = &g->sets[OPND(s)]; - if (!NONCHAR(ch) && CHIN(cs, ch)) - FWD(aft, bef, 1); - break; - case OBACK_: /* ignored here */ - case O_BACK: - FWD(aft, aft, 1); - break; - case OPLUS_: /* forward, this is just an empty */ - FWD(aft, aft, 1); - break; - case O_PLUS: /* both forward and back */ - FWD(aft, aft, 1); - i = ISSETBACK(aft, OPND(s)); - BACK(aft, aft, OPND(s)); - if (!i && ISSETBACK(aft, OPND(s))) - { - /* oho, must reconsider loop body */ - pc -= OPND(s) + 1; - INIT(here, pc); - } - break; - case OQUEST_: /* two branches, both forward */ - FWD(aft, aft, 1); - FWD(aft, aft, OPND(s)); - break; - case O_QUEST: /* just an empty */ - FWD(aft, aft, 1); - break; - case OLPAREN: /* not significant here */ - case ORPAREN: - FWD(aft, aft, 1); - break; - case OCH_: /* mark the first two branches */ - FWD(aft, aft, 1); - assert(OP(g->strip[pc + OPND(s)]) == OOR2); - FWD(aft, aft, OPND(s)); - break; - case OOR1: /* done a branch, find the O_CH */ - if (ISSTATEIN(aft, here)) - { - for (look = 1; - OP(s = g->strip[pc + look]) != O_CH; - look += OPND(s)) - assert(OP(s) == OOR2); - FWD(aft, aft, look); - } - break; - case OOR2: /* propagate OCH_'s marking */ - FWD(aft, aft, 1); - if (OP(g->strip[pc + OPND(s)]) != O_CH) - { - assert(OP(g->strip[pc + OPND(s)]) == OOR2); - FWD(aft, aft, OPND(s)); - } - break; - case O_CH: /* just empty */ - FWD(aft, aft, 1); - break; - default: /* ooooops... */ - assert(nope); - break; - } - } - - return aft; -} - -#ifdef REDEBUG -/* - * print - print a set of states - */ -static void -print(struct match * m, pg_wchar *caption, states st, - int ch, FILE *d) -{ - struct re_guts *g = m->g; - int i; - int first = 1; - - if (!(m->eflags & REG_TRACE)) - return; - - fprintf(d, "%s", caption); - if (ch != '\0') - fprintf(d, " %s", pchar(ch)); - for (i = 0; i < g->nstates; i++) - if (ISSET(st, i)) - { - fprintf(d, "%s%d", (first) ? "\t" : ", ", i); - first = 0; - } - fprintf(d, "\n"); -} - -/* - * at - print current situation - */ -static void -at(struct match * m, pg_wchar *title, pg_wchar *start, pg_wchar *stop, - sopno startst, sopno stopst) -{ - if (!(m->eflags & REG_TRACE)) - return; - - printf("%s %s-", title, pchar(*start)); - printf("%s ", pchar(*stop)); - printf("%ld-%ld\n", (long) startst, (long) stopst); -} - -#ifndef PCHARDONE -#define PCHARDONE /* only do this once */ -/* - * pchar - make a character printable - * - * Is this identical to regchar() over in debug.c? Well, yes. But a - * duplicate here avoids having a debugging-capable regexec.o tied to - * a matching debug.o, and this is convenient. It all disappears in - * the non-debug compilation anyway, so it doesn't matter much. - */ -static pg_wchar * /* -> representation */ -pchar(int ch) -{ - static pg_wchar pbuf[10]; - - if (pg_isprint(ch) || ch == ' ') - sprintf(pbuf, "%c", ch); - else - sprintf(pbuf, "\\%o", ch); - return pbuf; -} - -static int -pg_isprint(int c) -{ -#ifdef MULTIBYTE - return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c)); -#else - return (isprint((unsigned char) c)); -#endif -} -#endif -#endif - -#undef matcher -#undef fast -#undef slow -#undef dissect -#undef backref -#undef step -#undef print -#undef at -#undef match diff --git a/src/backend/regex/re_format.7 b/src/backend/regex/re_format.7 deleted file mode 100644 index db2f6349c45..00000000000 --- a/src/backend/regex/re_format.7 +++ /dev/null @@ -1,269 +0,0 @@ -.\" Copyright (c) 1992, 1993, 1994 Henry Spencer. -.\" Copyright (c) 1992, 1993, 1994 -.\" The Regents of the University of California. All rights reserved. -.\" -.\" This code is derived from software contributed to Berkeley by -.\" Henry Spencer. -.\" -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -.\" 3. All advertising materials mentioning features or use of this software -.\" must display the following acknowledgement: -.\" This product includes software developed by the University of -.\" California, Berkeley and its contributors. -.\" 4. Neither the name of the University nor the names of its contributors -.\" may be used to endorse or promote products derived from this software -.\" without specific prior written permission. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. -.\" -.\" @(#)re_format.7 8.3 (Berkeley) 3/20/94 -.\" -.TH RE_FORMAT 7 "March 20, 1994" -.SH NAME -re_format \- POSIX 1003.2 regular expressions -.SH DESCRIPTION -Regular expressions (``RE''s), -as defined in POSIX 1003.2, come in two forms: -modern REs (roughly those of -.IR egrep ; -1003.2 calls these ``extended'' REs) -and obsolete REs (roughly those of -.IR ed ; -1003.2 ``basic'' REs). -Obsolete REs mostly exist for backward compatibility in some old programs; -they will be discussed at the end. -1003.2 leaves some aspects of RE syntax and semantics open; -`\(dg' marks decisions on these aspects that -may not be fully portable to other 1003.2 implementations. -.PP -A (modern) RE is one\(dg or more non-empty\(dg \fIbranches\fR, -separated by `|'. -It matches anything that matches one of the branches. -.PP -A branch is one\(dg or more \fIpieces\fR, concatenated. -It matches a match for the first, followed by a match for the second, etc. -.PP -A piece is an \fIatom\fR possibly followed -by a single\(dg `*', `+', `?', or \fIbound\fR. -An atom followed by `*' matches a sequence of 0 or more matches of the atom. -An atom followed by `+' matches a sequence of 1 or more matches of the atom. -An atom followed by `?' matches a sequence of 0 or 1 matches of the atom. -.PP -A \fIbound\fR is `{' followed by an unsigned decimal integer, -possibly followed by `,' -possibly followed by another unsigned decimal integer, -always followed by `}'. -The integers must lie between 0 and RE_DUP_MAX (255\(dg) inclusive, -and if there are two of them, the first may not exceed the second. -An atom followed by a bound containing one integer \fIi\fR -and no comma matches -a sequence of exactly \fIi\fR matches of the atom. -An atom followed by a bound -containing one integer \fIi\fR and a comma matches -a sequence of \fIi\fR or more matches of the atom. -An atom followed by a bound -containing two integers \fIi\fR and \fIj\fR matches -a sequence of \fIi\fR through \fIj\fR (inclusive) matches of the atom. -.PP -An atom is a regular expression enclosed in `()' (matching a match for the -regular expression), -an empty set of `()' (matching the null string)\(dg, -a \fIbracket expression\fR (see below), `.' -(matching any single character), `^' (matching the null string at the -beginning of a line), `$' (matching the null string at the -end of a line), a `\e' followed by one of the characters -`^.[$()|*+?{\e' -(matching that character taken as an ordinary character), -a `\e' followed by any other character\(dg -(matching that character taken as an ordinary character, -as if the `\e' had not been present\(dg), -or a single character with no other significance (matching that character). -A `{' followed by a character other than a digit is an ordinary -character, not the beginning of a bound\(dg. -It is illegal to end an RE with `\e'. -.PP -A \fIbracket expression\fR is a list of characters enclosed in `[]'. -It normally matches any single character from the list (but see below). -If the list begins with `^', -it matches any single character -(but see below) \fInot\fR from the rest of the list. -If two characters in the list are separated by `\-', this is shorthand -for the full \fIrange\fR of characters between those two (inclusive) in the -collating sequence, -e.g. `[0-9]' in ASCII matches any decimal digit. -It is illegal\(dg for two ranges to share an -endpoint, e.g. `a-c-e'. -Ranges are very collating-sequence-dependent, -and portable programs should avoid relying on them. -.PP -To include a literal `]' in the list, make it the first character -(following a possible `^'). -To include a literal `\-', make it the first or last character, -or the second endpoint of a range. -To use a literal `\-' as the first endpoint of a range, -enclose it in `[.' and `.]' to make it a collating element (see below). -With the exception of these and some combinations using `[' (see next -paragraphs), all other special characters, including `\e', lose their -special significance within a bracket expression. -.PP -Within a bracket expression, a collating element (a character, -a multi-character sequence that collates as if it were a single character, -or a collating-sequence name for either) -enclosed in `[.' and `.]' stands for the -sequence of characters of that collating element. -The sequence is a single element of the bracket expression's list. -A bracket expression containing a multi-character collating element -can thus match more than one character, -e.g. if the collating sequence includes a `ch' collating element, -then the RE `[[.ch.]]*c' matches the first five characters -of `chchcc'. -.PP -Within a bracket expression, a collating element enclosed in `[=' and -`=]' is an equivalence class, standing for the sequences of characters -of all collating elements equivalent to that one, including itself. -(If there are no other equivalent collating elements, -the treatment is as if the enclosing delimiters were `[.' and `.]'.) -For example, if o and \o'o^' are the members of an equivalence class, -then `[[=o=]]', `[[=\o'o^'=]]', and `[o\o'o^']' are all synonymous. -An equivalence class may not\(dg be an endpoint -of a range. -.PP -Within a bracket expression, the name of a \fIcharacter class\fR enclosed -in `[:' and `:]' stands for the list of all characters belonging to that -class. -Standard character class names are: -.PP -.RS -.nf -.ta 3c 6c 9c -alnum digit punct -alpha graph space -blank lower upper -cntrl print xdigit -.fi -.RE -.PP -These stand for the character classes defined in -.IR ctype (3). -A locale may provide others. -A character class may not be used as an endpoint of a range. -.PP -There are two special cases\(dg of bracket expressions: -the bracket expressions `[[:<:]]' and `[[:>:]]' match the null string at -the beginning and end of a word respectively. -A word is defined as a sequence of -word characters -which is neither preceded nor followed by -word characters. -A word character is an -.I alnum -character (as defined by -.IR ctype (3)) -or an underscore. -This is an extension, -compatible with but not specified by POSIX 1003.2, -and should be used with -caution in software intended to be portable to other systems. -.PP -In the event that an RE could match more than one substring of a given -string, -the RE matches the one starting earliest in the string. -If the RE could match more than one substring starting at that point, -it matches the longest. -Subexpressions also match the longest possible substrings, subject to -the constraint that the whole match be as long as possible, -with subexpressions starting earlier in the RE taking priority over -ones starting later. -Note that higher-level subexpressions thus take priority over -their lower-level component subexpressions. -.PP -Match lengths are measured in characters, not collating elements. -A null string is considered longer than no match at all. -For example, -`bb*' matches the three middle characters of `abbbc', -`(wee|week)(knights|nights)' matches all ten characters of `weeknights', -when `(.*).*' is matched against `abc' the parenthesized subexpression -matches all three characters, and -when `(a*)*' is matched against `bc' both the whole RE and the parenthesized -subexpression match the null string. -.PP -If case-independent matching is specified, -the effect is much as if all case distinctions had vanished from the -alphabet. -When an alphabetic that exists in multiple cases appears as an -ordinary character outside a bracket expression, it is effectively -transformed into a bracket expression containing both cases, -e.g. `x' becomes `[xX]'. -When it appears inside a bracket expression, all case counterparts -of it are added to the bracket expression, so that (e.g.) `[x]' -becomes `[xX]' and `[^x]' becomes `[^xX]'. -.PP -No particular limit is imposed on the length of REs\(dg. -Programs intended to be portable should not employ REs longer -than 256 bytes, -as an implementation can refuse to accept such REs and remain -POSIX-compliant. -.PP -Obsolete (``basic'') regular expressions differ in several respects. -`|', `+', and `?' are ordinary characters and there is no equivalent -for their functionality. -The delimiters for bounds are `\e{' and `\e}', -with `{' and `}' by themselves ordinary characters. -The parentheses for nested subexpressions are `\e(' and `\e)', -with `(' and `)' by themselves ordinary characters. -`^' is an ordinary character except at the beginning of the -RE or\(dg the beginning of a parenthesized subexpression, -`$' is an ordinary character except at the end of the -RE or\(dg the end of a parenthesized subexpression, -and `*' is an ordinary character if it appears at the beginning of the -RE or the beginning of a parenthesized subexpression -(after a possible leading `^'). -Finally, there is one new type of atom, a \fIback reference\fR: -`\e' followed by a non-zero decimal digit \fId\fR -matches the same sequence of characters -matched by the \fId\fRth parenthesized subexpression -(numbering subexpressions by the positions of their opening parentheses, -left to right), -so that (e.g.) `\e([bc]\e)\e1' matches `bb' or `cc' but not `bc'. -.SH SEE ALSO -regex(3) -.PP -POSIX 1003.2, section 2.8 (Regular Expression Notation). -.SH BUGS -Having two kinds of REs is a botch. -.PP -The current 1003.2 spec says that `)' is an ordinary character in -the absence of an unmatched `('; -this was an unintentional result of a wording error, -and change is likely. -Avoid relying on it. -.PP -Back references are a dreadful botch, -posing major problems for efficient implementations. -They are also somewhat vaguely defined -(does -`a\e(\e(b\e)*\e2\e)*d' match `abbbd'?). -Avoid using them. -.PP -1003.2's specification of case-independent matching is vague. -The ``one case implies all cases'' definition given above -is current consensus among implementors as to the right interpretation. -.PP -The syntax for word boundaries is incredibly ugly. diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c deleted file mode 100644 index d6f7b26fa1a..00000000000 --- a/src/backend/regex/regcomp.c +++ /dev/null @@ -1,1863 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 - */ - -#include "postgres.h" - -#include <sys/types.h> -#include <ctype.h> -#include <limits.h> -#include <assert.h> - -#include "regex/regex.h" -#include "regex/utils.h" -#include "regex/regex2.h" -#include "regex/cname.h" -#include <locale.h> - -struct cclass -{ - char *name; - char *chars; - char *multis; -}; -static struct cclass* cclasses = NULL; -static struct cclass* cclass_init(void); - -/* - * parse structure, passed up and down to avoid global variables and - * other clumsinesses - */ -struct parse -{ - pg_wchar *next; /* next character in RE */ - pg_wchar *end; /* end of string (-> NUL normally) */ - int error; /* has an error been seen? */ - sop *strip; /* malloced strip */ - sopno ssize; /* malloced strip size (allocated) */ - sopno slen; /* malloced strip length (used) */ - int ncsalloc; /* number of csets allocated */ - struct re_guts *g; -#define NPAREN 10 /* we need to remember () 1-9 for back - * refs */ - sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ - sopno pend[NPAREN]; /* -> ) ([0] unused) */ -}; - -static void p_ere(struct parse * p, int stop); -static void p_ere_exp(struct parse * p); -static void p_str(struct parse * p); -static void p_bre(struct parse * p, int end1, int end2); -static int p_simp_re(struct parse * p, int starordinary); -static int p_count(struct parse * p); -static void p_bracket(struct parse * p); -static void p_b_term(struct parse * p, cset *cs); -static void p_b_cclass(struct parse * p, cset *cs); -static void p_b_eclass(struct parse * p, cset *cs); -static pg_wchar p_b_symbol(struct parse * p); -static char p_b_coll_elem(struct parse * p, int endc); - -#ifdef MULTIBYTE -static unsigned char othercase(int ch); - -#else -static char othercase(int ch); -#endif -static void bothcases(struct parse * p, int ch); -static void ordinary(struct parse * p, int ch); -static void nonnewline(struct parse * p); -static void repeat(struct parse * p, sopno start, int from, int to); -static int seterr(struct parse * p, int e); -static cset *allocset(struct parse * p); -static void freeset(struct parse * p, cset *cs); -static int freezeset(struct parse * p, cset *cs); -static int firstch(struct parse * p, cset *cs); -static int nch(struct parse * p, cset *cs); -static void mcadd(struct parse * p, cset *cs, char *cp); -static void mcinvert(struct parse * p, cset *cs); -static void mccase(struct parse * p, cset *cs); -static int isinsets(struct re_guts * g, int c); -static int samesets(struct re_guts * g, int c1, int c2); -static void categorize(struct parse * p, struct re_guts * g); -static sopno dupl(struct parse * p, sopno start, sopno finish); -static void doemit(struct parse * p, sop op, size_t opnd); -static void doinsert(struct parse * p, sop op, size_t opnd, sopno pos); -static void dofwd(struct parse * p, sopno pos, sop value); -static void enlarge(struct parse * p, sopno size); -static void stripsnug(struct parse * p, struct re_guts * g); -static void findmust(struct parse * p, struct re_guts * g); -static sopno pluscount(struct parse * p, struct re_guts * g); -static int pg_isdigit(int c); -static int pg_isalpha(int c); -static int pg_isalnum(int c); -static int pg_isupper(int c); -static int pg_islower(int c); -static int pg_iscntrl(int c); -static int pg_isgraph(int c); -static int pg_isprint(int c); -static int pg_ispunct(int c); - -static pg_wchar nuls[10]; /* place to point scanner in event of - * error */ - -/* - * macros for use with parse structure - * BEWARE: these know that the parse structure is named `p' !!! - */ -#define PEEK() (*p->next) -#define PEEK2() (*(p->next+1)) -#define MORE() (p->next < p->end) -#define MORE2() (p->next+1 < p->end) -#define SEE(c) (MORE() && PEEK() == (c)) -#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) -#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) -#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) -#define NEXT() (p->next++) -#define NEXT2() (p->next += 2) -#define NEXTn(n) (p->next += (n)) -#define GETNEXT() (*p->next++) -#define SETERROR(e) seterr(p, (e)) -#define REQUIRE(co, e) if (!(co)) SETERROR(e) -#define MUSTSEE(c, e) REQUIRE(MORE() && PEEK() == (c), e) -#define MUSTEAT(c, e) REQUIRE(MORE() && GETNEXT() == (c), e) -#define MUSTNOTSEE(c, e) REQUIRE(!MORE() || PEEK() != (c), e) -#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) -#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) -#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) -#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) -#define HERE() (p->slen) -#define THERE() (p->slen - 1) -#define THERETHERE() (p->slen - 2) -#define DROP(n) (p->slen -= (n)) - -#ifndef NDEBUG -static int never = 0; /* for use in asserts; shuts lint up */ - -#else -#define never 0 /* some <assert.h>s have bugs too */ -#endif - -/* - * regcomp - interface for parser and compilation - * returns 0 success, otherwise REG_something - */ -int -pg_regcomp(regex_t *preg, const char *pattern, int cflags) -{ - struct parse pa; - struct re_guts *g; - struct parse *p = &pa; - int i; - size_t len; - -#ifdef MULTIBYTE - pg_wchar *wcp; -#endif - - if ( cclasses == NULL ) - cclasses = cclass_init(); - -#ifdef REDEBUG -#define GOODFLAGS(f) (f) -#else -#define GOODFLAGS(f) ((f)&~REG_DUMP) -#endif - - cflags = GOODFLAGS(cflags); - if ((cflags & REG_EXTENDED) && (cflags & REG_NOSPEC)) - return REG_INVARG; - - if (cflags & REG_PEND) - { -#ifdef MULTIBYTE - wcp = preg->patsave; - if (preg->re_endp < wcp) - return REG_INVARG; - len = preg->re_endp - wcp; -#else - if (preg->re_endp < pattern) - return REG_INVARG; - len = preg->re_endp - pattern; -#endif - } - else - { -#ifdef MULTIBYTE - wcp = (pg_wchar *) malloc((strlen(pattern) + 1) * sizeof(pg_wchar)); - if (wcp == NULL) - return REG_ESPACE; - preg->patsave = wcp; - (void) pg_mb2wchar((unsigned char *) pattern, wcp); - len = pg_wchar_strlen(wcp); -#else - len = strlen((char *) pattern); -#endif - } - - /* do the mallocs early so failure handling is easy */ - g = (struct re_guts *) malloc(sizeof(struct re_guts) + - (NC - 1) * sizeof(cat_t)); - if (g == NULL) - return REG_ESPACE; - p->ssize = len / (size_t) 2 *(size_t) 3 + (size_t) 1; /* ugh */ - - p->strip = (sop *) malloc(p->ssize * sizeof(sop)); - p->slen = 0; - if (p->strip == NULL) - { - free((char *) g); - return REG_ESPACE; - } - - /* set things up */ - p->g = g; -#ifdef MULTIBYTE - p->next = wcp; -#else - p->next = (pg_wchar *) pattern; /* convenience; we do not modify - * it */ -#endif - p->end = p->next + len; - p->error = 0; - p->ncsalloc = 0; - for (i = 0; i < NPAREN; i++) - { - p->pbegin[i] = 0; - p->pend[i] = 0; - } - g->csetsize = NC; - g->sets = NULL; - g->setbits = NULL; - g->ncsets = 0; - g->cflags = cflags; - g->iflags = 0; - g->nbol = 0; - g->neol = 0; - g->must = NULL; - g->mlen = 0; - g->nsub = 0; - g->ncategories = 1; /* category 0 is "everything else" */ - g->categories = &g->catspace[-(CHAR_MIN)]; - memset((char *) g->catspace, 0, NC * sizeof(cat_t)); - g->backrefs = 0; - - /* do it */ - EMIT(OEND, 0); - g->firststate = THERE(); - if (cflags & REG_EXTENDED) - p_ere(p, OUT); - else if (cflags & REG_NOSPEC) - p_str(p); - else - p_bre(p, OUT, OUT); - EMIT(OEND, 0); - g->laststate = THERE(); - - /* tidy up loose ends and fill things in */ - categorize(p, g); - stripsnug(p, g); - findmust(p, g); - g->nplus = pluscount(p, g); - g->magic = MAGIC2; - preg->re_nsub = g->nsub; - preg->re_g = g; - preg->re_magic = MAGIC1; -#ifndef REDEBUG - /* not debugging, so can't rely on the assert() in regexec() */ - if (g->iflags & BAD) - SETERROR(REG_ASSERT); -#endif - - /* win or lose, we're done */ - if (p->error != 0) /* lose */ - pg_regfree(preg); - return p->error; -} - -/* - * p_ere - ERE parser top level, concatenation and alternation - */ -static void -p_ere(struct parse * p, - int stop) /* character this ERE should end at */ -{ - char c; - sopno prevback = 0; - sopno prevfwd = 0; - sopno conc; - int first = 1; /* is this the first alternative? */ - - for (;;) - { - /* do a bunch of concatenated expressions */ - conc = HERE(); - while (MORE() && (c = PEEK()) != '|' && c != stop) - p_ere_exp(p); - REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ - - if (!EAT('|')) - break; /* NOTE BREAK OUT */ - - if (first) - { - INSERT(OCH_, conc); /* offset is wrong */ - prevfwd = conc; - prevback = conc; - first = 0; - } - ASTERN(OOR1, prevback); - prevback = THERE(); - AHEAD(prevfwd); /* fix previous offset */ - prevfwd = HERE(); - EMIT(OOR2, 0); /* offset is very wrong */ - } - - if (!first) - { /* tail-end fixups */ - AHEAD(prevfwd); - ASTERN(O_CH, prevback); - } - - assert(!MORE() || SEE(stop)); -} - -/* - * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op - */ -static void -p_ere_exp(struct parse * p) -{ - pg_wchar c; - sopno pos; - int count; - int count2; - sopno subno; - int wascaret = 0; - - assert(MORE()); /* caller should have ensured this */ - c = GETNEXT(); - - pos = HERE(); - switch (c) - { - case '(': - REQUIRE(MORE(), REG_EPAREN); - p->g->nsub++; - subno = p->g->nsub; - if (subno < NPAREN) - p->pbegin[subno] = HERE(); - EMIT(OLPAREN, subno); - if (!SEE(')')) - p_ere(p, ')'); - if (subno < NPAREN) - { - p->pend[subno] = HERE(); - assert(p->pend[subno] != 0); - } - EMIT(ORPAREN, subno); - MUSTEAT(')', REG_EPAREN); - break; -#ifndef POSIX_MISTAKE - case ')': /* happens only if no current unmatched ( */ - - /* - * You may ask, why the ifndef? Because I didn't notice this - * until slightly too late for 1003.2, and none of the other - * 1003.2 regular-expression reviewers noticed it at all. So - * an unmatched ) is legal POSIX, at least until we can get it - * fixed. - */ - SETERROR(REG_EPAREN); - break; -#endif - case '^': - EMIT(OBOL, 0); - p->g->iflags |= USEBOL; - p->g->nbol++; - wascaret = 1; - break; - case '$': - EMIT(OEOL, 0); - p->g->iflags |= USEEOL; - p->g->neol++; - break; - case '|': - SETERROR(REG_EMPTY); - break; - case '*': - case '+': - case '?': - SETERROR(REG_BADRPT); - break; - case '.': - if (p->g->cflags & REG_NEWLINE) - nonnewline(p); - else - EMIT(OANY, 0); - break; - case '[': - p_bracket(p); - break; - case '\\': - REQUIRE(MORE(), REG_EESCAPE); - c = GETNEXT(); - ordinary(p, c); - break; - case '{': /* okay as ordinary except if digit - * follows */ - REQUIRE(!MORE() || !pg_isdigit(PEEK()), REG_BADRPT); - /* FALLTHROUGH */ - default: - ordinary(p, c); - break; - } - - if (!MORE()) - return; - c = PEEK(); - /* we call { a repetition if followed by a digit */ - if (!(c == '*' || c == '+' || c == '?' || - (c == '{' && MORE2() && pg_isdigit(PEEK2())))) - return; /* no repetition, we're done */ - NEXT(); - - REQUIRE(!wascaret, REG_BADRPT); - switch (c) - { - case '*': /* implemented as +? */ - /* this case does not require the (y|) trick, noKLUDGE */ - INSERT(OPLUS_, pos); - ASTERN(O_PLUS, pos); - INSERT(OQUEST_, pos); - ASTERN(O_QUEST, pos); - break; - case '+': - INSERT(OPLUS_, pos); - ASTERN(O_PLUS, pos); - break; - case '?': - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ - INSERT(OCH_, pos); /* offset slightly wrong */ - ASTERN(OOR1, pos); /* this one's right */ - AHEAD(pos); /* fix the OCH_ */ - EMIT(OOR2, 0); /* offset very wrong... */ - AHEAD(THERE()); /* ...so fix it */ - ASTERN(O_CH, THERETHERE()); - break; - case '{': - count = p_count(p); - if (EAT(',')) - { - if (pg_isdigit(PEEK())) - { - count2 = p_count(p); - REQUIRE(count <= count2, REG_BADBR); - } - else -/* single number with comma */ - count2 = INFINITY; - } - else -/* just a single number */ - count2 = count; - repeat(p, pos, count, count2); - if (!EAT('}')) - { /* error heuristics */ - while (MORE() && PEEK() != '}') - NEXT(); - REQUIRE(MORE(), REG_EBRACE); - SETERROR(REG_BADBR); - } - break; - } - - if (!MORE()) - return; - c = PEEK(); - if (!(c == '*' || c == '+' || c == '?' || - (c == '{' && MORE2() && pg_isdigit(PEEK2())))) - return; - SETERROR(REG_BADRPT); -} - -/* - * p_str - string (no metacharacters) "parser" - */ -static void -p_str(struct parse * p) -{ - REQUIRE(MORE(), REG_EMPTY); - while (MORE()) - ordinary(p, GETNEXT()); -} - -/* - * p_bre - BRE parser top level, anchoring and concatenation - * - * Giving end1 as OUT essentially eliminates the end1/end2 check. - * - * This implementation is a bit of a kludge, in that a trailing $ is first - * taken as an ordinary character and then revised to be an anchor. The - * only undesirable side effect is that '$' gets included as a character - * category in such cases. This is fairly harmless; not worth fixing. - * The amount of lookahead needed to avoid this kludge is excessive. - */ -static void -p_bre(struct parse * p, - int end1, /* first terminating character */ - int end2) /* second terminating character */ -{ - sopno start = HERE(); - int first = 1; /* first subexpression? */ - int wasdollar = 0; - - if (EAT('^')) - { - EMIT(OBOL, 0); - p->g->iflags |= USEBOL; - p->g->nbol++; - } - while (MORE() && !SEETWO(end1, end2)) - { - wasdollar = p_simp_re(p, first); - first = 0; - } - if (wasdollar) - { /* oops, that was a trailing anchor */ - DROP(1); - EMIT(OEOL, 0); - p->g->iflags |= USEEOL; - p->g->neol++; - } - - REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ -} - -/* - * p_simp_re - parse a simple RE, an atom possibly followed by a repetition - */ -static int /* was the simple RE an unbackslashed $? */ -p_simp_re(struct parse * p, - int starordinary) /* is a leading * an ordinary character? */ -{ - int c; - int count; - int count2; - sopno pos; - int i; - sopno subno; - -#define BACKSL (1<<24) - - pos = HERE(); /* repetion op, if any, covers from here */ - - assert(MORE()); /* caller should have ensured this */ - c = GETNEXT(); - if (c == '\\') - { - REQUIRE(MORE(), REG_EESCAPE); -#ifdef MULTIBYTE - c = BACKSL | (pg_wchar) GETNEXT(); -#else - c = BACKSL | (unsigned char) GETNEXT(); -#endif - } - switch (c) - { - case '.': - if (p->g->cflags & REG_NEWLINE) - nonnewline(p); - else - EMIT(OANY, 0); - break; - case '[': - p_bracket(p); - break; - case BACKSL | '{': - SETERROR(REG_BADRPT); - break; - case BACKSL | '(': - p->g->nsub++; - subno = p->g->nsub; - if (subno < NPAREN) - p->pbegin[subno] = HERE(); - EMIT(OLPAREN, subno); - /* the MORE here is an error heuristic */ - if (MORE() && !SEETWO('\\', ')')) - p_bre(p, '\\', ')'); - if (subno < NPAREN) - { - p->pend[subno] = HERE(); - assert(p->pend[subno] != 0); - } - EMIT(ORPAREN, subno); - REQUIRE(EATTWO('\\', ')'), REG_EPAREN); - break; - case BACKSL | ')': /* should not get here -- must be user */ - case BACKSL | '}': - SETERROR(REG_EPAREN); - break; - case BACKSL | '1': - case BACKSL | '2': - case BACKSL | '3': - case BACKSL | '4': - case BACKSL | '5': - case BACKSL | '6': - case BACKSL | '7': - case BACKSL | '8': - case BACKSL | '9': - i = (c & ~BACKSL) - '0'; - assert(i < NPAREN); - if (p->pend[i] != 0) - { - assert(i <= p->g->nsub); - EMIT(OBACK_, i); - assert(p->pbegin[i] != 0); - assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); - assert(OP(p->strip[p->pend[i]]) == ORPAREN); - dupl(p, p->pbegin[i] + 1, p->pend[i]); - EMIT(O_BACK, i); - } - else - SETERROR(REG_ESUBREG); - p->g->backrefs = 1; - break; - case '*': - REQUIRE(starordinary, REG_BADRPT); - /* FALLTHROUGH */ - default: - ordinary(p, c & ~BACKSL); - break; - } - - if (EAT('*')) - { /* implemented as +? */ - /* this case does not require the (y|) trick, noKLUDGE */ - INSERT(OPLUS_, pos); - ASTERN(O_PLUS, pos); - INSERT(OQUEST_, pos); - ASTERN(O_QUEST, pos); - } - else if (EATTWO('\\', '{')) - { - count = p_count(p); - if (EAT(',')) - { - if (MORE() && pg_isdigit(PEEK())) - { - count2 = p_count(p); - REQUIRE(count <= count2, REG_BADBR); - } - else -/* single number with comma */ - count2 = INFINITY; - } - else -/* just a single number */ - count2 = count; - repeat(p, pos, count, count2); - if (!EATTWO('\\', '}')) - { /* error heuristics */ - while (MORE() && !SEETWO('\\', '}')) - NEXT(); - REQUIRE(MORE(), REG_EBRACE); - SETERROR(REG_BADBR); - } - } - else if (c == (unsigned char) '$') /* $ (but not \$) ends it */ - return 1; - - return 0; -} - -/* - * p_count - parse a repetition count - */ -static int /* the value */ -p_count(struct parse * p) -{ - int count = 0; - int ndigits = 0; - - while (MORE() && pg_isdigit(PEEK()) && count <= DUPMAX) - { - count = count * 10 + (GETNEXT() - '0'); - ndigits++; - } - - REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); - return count; -} - -/* - * p_bracket - parse a bracketed character list - * - * Note a significant property of this code: if the allocset() did SETERROR, - * no set operations are done. - */ -static void -p_bracket(struct parse * p) -{ - cset *cs = allocset(p); - int invert = 0; - -#ifdef MULTIBYTE - pg_wchar sp1[] = {'[', ':', '<', ':', ']', ']'}; - pg_wchar sp2[] = {'[', ':', '>', ':', ']', ']'}; -#endif - - /* Dept of Truly Sickening Special-Case Kludges */ -#ifdef MULTIBYTE - if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp1, 6) == 0) -#else - if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) -#endif - { - EMIT(OBOW, 0); - NEXTn(6); - return; - } -#ifdef MULTIBYTE - if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp2, 6) == 0) -#else - if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) -#endif - { - EMIT(OEOW, 0); - NEXTn(6); - return; - } - - if (EAT('^')) - invert++; /* make note to invert set at end */ - if (EAT(']')) - CHadd(cs, ']'); - else if (EAT('-')) - CHadd(cs, '-'); - while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) - p_b_term(p, cs); - if (EAT('-')) - CHadd(cs, '-'); - MUSTEAT(']', REG_EBRACK); - - if (p->error != 0) /* don't mess things up further */ - return; - - if (p->g->cflags & REG_ICASE) - { - int i; - int ci; - - for (i = p->g->csetsize - 1; i >= 0; i--) - if (CHIN(cs, i) && pg_isalpha(i)) - { - ci = othercase(i); - if (ci != i) - CHadd(cs, ci); - } - if (cs->multis != NULL) - mccase(p, cs); - } - if (invert) - { - int i; - - for (i = p->g->csetsize - 1; i >= 0; i--) - if (CHIN(cs, i)) - CHsub(cs, i); - else - CHadd(cs, i); - if (p->g->cflags & REG_NEWLINE) - CHsub(cs, '\n'); - if (cs->multis != NULL) - mcinvert(p, cs); - } - - assert(cs->multis == NULL); /* xxx */ - - if (nch(p, cs) == 1) - { /* optimize singleton sets */ - ordinary(p, firstch(p, cs)); - freeset(p, cs); - } - else - EMIT(OANYOF, freezeset(p, cs)); -} - -/* - * p_b_term - parse one term of a bracketed character list - */ -static void -p_b_term(struct parse * p, cset *cs) -{ - pg_wchar c; - pg_wchar start, - finish; - int i; - - /* classify what we've got */ - switch ((MORE()) ? PEEK() : '\0') - { - case '[': - c = (MORE2()) ? PEEK2() : '\0'; - break; - case '-': - SETERROR(REG_ERANGE); - return; /* NOTE RETURN */ - break; - default: - c = '\0'; - break; - } - - switch (c) - { - case ':': /* character class */ - NEXT2(); - REQUIRE(MORE(), REG_EBRACK); - c = PEEK(); - REQUIRE(c != '-' && c != ']', REG_ECTYPE); - p_b_cclass(p, cs); - REQUIRE(MORE(), REG_EBRACK); - REQUIRE(EATTWO(':', ']'), REG_ECTYPE); - break; - case '=': /* equivalence class */ - NEXT2(); - REQUIRE(MORE(), REG_EBRACK); - c = PEEK(); - REQUIRE(c != '-' && c != ']', REG_ECOLLATE); - p_b_eclass(p, cs); - REQUIRE(MORE(), REG_EBRACK); - REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); - break; - default: /* symbol, ordinary character, or range */ -/* xxx revision needed for multichar stuff */ - start = p_b_symbol(p); - if (SEE('-') && MORE2() && PEEK2() != ']') - { - /* range */ - NEXT(); - if (EAT('-')) - finish = '-'; - else - finish = p_b_symbol(p); - } - else - finish = start; -/* xxx what about signed chars here... */ - REQUIRE(start <= finish, REG_ERANGE); -#ifdef MULTIBYTE - if (CHlc(start) != CHlc(finish)) - SETERROR(REG_ERANGE); -#endif - for (i = start; i <= finish; i++) - CHadd(cs, i); - break; - } -} - -/* - * p_b_cclass - parse a character-class name and deal with it - */ -static void -p_b_cclass(struct parse * p, cset *cs) -{ - pg_wchar *sp = p->next; - struct cclass *cp; - size_t len; - char *u; - unsigned char c; - - while (MORE() && pg_isalpha(PEEK())) - NEXT(); - len = p->next - sp; - for (cp = cclasses; cp->name != NULL; cp++) -#ifdef MULTIBYTE - if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') -#else - if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') -#endif - break; - if (cp->name == NULL) - { - /* oops, didn't find it */ - SETERROR(REG_ECTYPE); - return; - } - - u = cp->chars; - while ((c = *u++) != '\0') - CHadd(cs, c); - for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) - MCadd(p, cs, u); -} - -/* - * p_b_eclass - parse an equivalence-class name and deal with it - * - * This implementation is incomplete. xxx - */ -static void -p_b_eclass(struct parse * p, cset *cs) -{ - char c; - - c = p_b_coll_elem(p, '='); - CHadd(cs, c); -} - -/* - * p_b_symbol - parse a character or [..]ed multicharacter collating symbol - */ -static pg_wchar /* value of symbol */ -p_b_symbol(struct parse * p) -{ - pg_wchar value; - - REQUIRE(MORE(), REG_EBRACK); - if (!EATTWO('[', '.')) - return GETNEXT(); - - /* collating symbol */ - value = p_b_coll_elem(p, '.'); - REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); - return value; -} - -/* - * p_b_coll_elem - parse a collating-element name and look it up - */ -static char /* value of collating element */ -p_b_coll_elem(struct parse * p, int endc) -{ - pg_wchar *sp = p->next; - struct cname *cp; - int len; - - while (MORE() && !SEETWO(endc, ']')) - NEXT(); - if (!MORE()) - { - SETERROR(REG_EBRACK); - return 0; - } - len = p->next - sp; - for (cp = cnames; cp->name != NULL; cp++) -#ifdef MULTIBYTE - if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') -#else - if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') -#endif - return cp->code; /* known name */ - if (len == 1) - return *sp; /* single character */ - SETERROR(REG_ECOLLATE); /* neither */ - return 0; -} - -/* - * othercase - return the case counterpart of an alphabetic - */ -#ifdef MULTIBYTE -static unsigned char /* if no counterpart, return ch */ -#else -static char /* if no counterpart, return ch */ -#endif -othercase(int ch) -{ - assert(pg_isalpha(ch)); - if (pg_isupper(ch)) -#ifdef MULTIBYTE - return (unsigned char) tolower((unsigned char) ch); -#else - return tolower((unsigned char) ch); -#endif - else if (pg_islower(ch)) -#ifdef MULTIBYTE - return (unsigned char) toupper((unsigned char) ch); -#else - return toupper((unsigned char) ch); -#endif - else -/* peculiar, but could happen */ -#ifdef MULTIBYTE - return (unsigned char) ch; -#else - return ch; -#endif -} - -/* - * bothcases - emit a dualcase version of a two-case character - * - * Boy, is this implementation ever a kludge... - */ -static void -bothcases(struct parse * p, int ch) -{ - pg_wchar *oldnext = p->next; - pg_wchar *oldend = p->end; - pg_wchar bracket[3]; - - assert(othercase(ch) != ch); /* p_bracket() would recurse */ - p->next = bracket; - p->end = bracket + 2; - bracket[0] = ch; - bracket[1] = ']'; - bracket[2] = '\0'; - p_bracket(p); - assert(p->next == bracket + 2); - p->next = oldnext; - p->end = oldend; -} - -/* - * ordinary - emit an ordinary character - */ -static void -ordinary(struct parse * p, int ch) -{ - cat_t *cap = p->g->categories; - - if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch) - bothcases(p, ch); - else - { -#ifdef MULTIBYTE - EMIT(OCHAR, (pg_wchar) ch); -#else - EMIT(OCHAR, (unsigned char) ch); -#endif - if (ch >= CHAR_MIN && ch <= CHAR_MAX && cap[ch] == 0) - cap[ch] = p->g->ncategories++; - } -} - -/* - * nonnewline - emit REG_NEWLINE version of OANY - * - * Boy, is this implementation ever a kludge... - */ -static void -nonnewline(struct parse * p) -{ - pg_wchar *oldnext = p->next; - pg_wchar *oldend = p->end; - pg_wchar bracket[4]; - - p->next = bracket; - p->end = bracket + 3; - bracket[0] = '^'; - bracket[1] = '\n'; - bracket[2] = ']'; - bracket[3] = '\0'; - p_bracket(p); - assert(p->next == bracket + 3); - p->next = oldnext; - p->end = oldend; -} - -/* - * repeat - generate code for a bounded repetition, recursively if needed - */ -static void -repeat(struct parse * p, - sopno start, /* operand from here to end of strip */ - int from, /* repeated from this number */ - int to) /* to this number of times (maybe - * INFINITY) */ -{ - sopno finish = HERE(); - -#define N 2 -#define INF 3 -#define REP(f, t) ((f)*8 + (t)) -#define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) - sopno copy; - - if (p->error != 0) /* head off possible runaway recursion */ - return; - - assert(from <= to); - - switch (REP(MAP(from), MAP(to))) - { - case REP(0, 0): /* must be user doing this */ - DROP(finish - start); /* drop the operand */ - break; - case REP(0, 1): /* as x{1,1}? */ - case REP(0, N): /* as x{1,n}? */ - case REP(0, INF): /* as x{1,}? */ - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ - INSERT(OCH_, start); /* offset is wrong... */ - repeat(p, start + 1, 1, to); - ASTERN(OOR1, start); - AHEAD(start); /* ... fix it */ - EMIT(OOR2, 0); - AHEAD(THERE()); - ASTERN(O_CH, THERETHERE()); - break; - case REP(1, 1): /* trivial case */ - /* done */ - break; - case REP(1, N): /* as x?x{1,n-1} */ - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ - INSERT(OCH_, start); - ASTERN(OOR1, start); - AHEAD(start); - EMIT(OOR2, 0); /* offset very wrong... */ - AHEAD(THERE()); /* ...so fix it */ - ASTERN(O_CH, THERETHERE()); - copy = dupl(p, start + 1, finish + 1); - assert(copy == finish + 4); - repeat(p, copy, 1, to - 1); - break; - case REP(1, INF): /* as x+ */ - INSERT(OPLUS_, start); - ASTERN(O_PLUS, start); - break; - case REP(N, N): /* as xx{m-1,n-1} */ - copy = dupl(p, start, finish); - repeat(p, copy, from - 1, to - 1); - break; - case REP(N, INF): /* as xx{n-1,INF} */ - copy = dupl(p, start, finish); - repeat(p, copy, from - 1, to); - break; - default: /* "can't happen" */ - SETERROR(REG_ASSERT); /* just in case */ - break; - } -} - -/* - * seterr - set an error condition - */ -static int /* useless but makes type checking happy */ -seterr(struct parse * p, int e) -{ - if (p->error == 0) /* keep earliest error condition */ - p->error = e; - p->next = nuls; /* try to bring things to a halt */ - p->end = nuls; - return 0; /* make the return value well-defined */ -} - -/* - * allocset - allocate a set of characters for [] - */ -static cset * -allocset(struct parse * p) -{ - int no = p->g->ncsets++; - size_t nc; - size_t nbytes; - cset *cs; - size_t css = (size_t) p->g->csetsize; - int i; - - if (no >= p->ncsalloc) - { /* need another column of space */ - p->ncsalloc += CHAR_BIT; - nc = p->ncsalloc; - assert(nc % CHAR_BIT == 0); - nbytes = nc / CHAR_BIT * css; - if (p->g->sets == NULL) - p->g->sets = (cset *) malloc(nc * sizeof(cset)); - else - p->g->sets = (cset *) realloc((char *) p->g->sets, - nc * sizeof(cset)); - if (p->g->setbits == NULL) - p->g->setbits = (uch *) malloc(nbytes); - else - { - p->g->setbits = (uch *) realloc((char *) p->g->setbits, - nbytes); - /* xxx this isn't right if setbits is now NULL */ - for (i = 0; i < no; i++) - p->g->sets[i].ptr = p->g->setbits + css * (i / CHAR_BIT); - } - if (p->g->sets != NULL && p->g->setbits != NULL) - memset((char *) p->g->setbits + (nbytes - css), - 0, css); - else - { - no = 0; - SETERROR(REG_ESPACE); - /* caller's responsibility not to do set ops */ - } - } - - assert(p->g->sets != NULL); /* xxx */ - cs = &p->g->sets[no]; - cs->ptr = p->g->setbits + css * ((no) / CHAR_BIT); - cs->mask = 1 << ((no) % CHAR_BIT); - cs->hash = 0; - cs->smultis = 0; - cs->multis = NULL; - - return cs; -} - -/* - * freeset - free a now-unused set - */ -static void -freeset(struct parse * p, cset *cs) -{ - int i; - cset *top = &p->g->sets[p->g->ncsets]; - size_t css = (size_t) p->g->csetsize; - - for (i = 0; i < css; i++) - CHsub(cs, i); - if (cs == top - 1) /* recover only the easy case */ - p->g->ncsets--; -} - -/* - * freezeset - final processing on a set of characters - * - * The main task here is merging identical sets. This is usually a waste - * of time (although the hash code minimizes the overhead), but can win - * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash - * is done using addition rather than xor -- all ASCII [aA] sets xor to - * the same value! - */ -static int /* set number */ -freezeset(struct parse * p, cset *cs) -{ - uch h = cs->hash; - int i; - cset *top = &p->g->sets[p->g->ncsets]; - cset *cs2; - size_t css = (size_t) p->g->csetsize; - - /* look for an earlier one which is the same */ - for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) - if (cs2->hash == h && cs2 != cs) - { - /* maybe */ - for (i = 0; i < css; i++) - if (!!CHIN(cs2, i) != !!CHIN(cs, i)) - break; /* no */ - if (i == css) - break; /* yes */ - } - - if (cs2 < top) - { /* found one */ - freeset(p, cs); - cs = cs2; - } - - return (int) (cs - p->g->sets); -} - -/* - * firstch - return first character in a set (which must have at least one) - */ -static int /* character; there is no "none" value */ -firstch(struct parse * p, cset *cs) -{ - int i; - size_t css = (size_t) p->g->csetsize; - - for (i = 0; i < css; i++) - if (CHIN(cs, i)) - return i; - assert(never); - return 0; /* arbitrary */ -} - -/* - * nch - number of characters in a set - */ -static int -nch(struct parse * p, cset *cs) -{ - int i; - size_t css = (size_t) p->g->csetsize; - int n = 0; - - for (i = 0; i < css; i++) - if (CHIN(cs, i)) - n++; - return n; -} - -/* - * mcadd - add a collating element to a cset - */ -static void -mcadd(struct parse * p, cset *cs, char *cp) -{ - size_t oldend = cs->smultis; - - cs->smultis += strlen(cp) + 1; - if (cs->multis == NULL) - cs->multis = malloc(cs->smultis); - else - cs->multis = realloc(cs->multis, cs->smultis); - if (cs->multis == NULL) - { - SETERROR(REG_ESPACE); - return; - } - - strcpy(cs->multis + oldend - 1, cp); - cs->multis[cs->smultis - 1] = '\0'; -} - -/* - * mcinvert - invert the list of collating elements in a cset - * - * This would have to know the set of possibilities. Implementation - * is deferred. - */ -static void -mcinvert(struct parse * p, cset *cs) -{ - assert(cs->multis == NULL); /* xxx */ -} - -/* - * mccase - add case counterparts of the list of collating elements in a cset - * - * This would have to know the set of possibilities. Implementation - * is deferred. - */ -static void -mccase(struct parse * p, cset *cs) -{ - assert(cs->multis == NULL); /* xxx */ -} - -/* - * isinsets - is this character in any sets? - */ -static int /* predicate */ -isinsets(struct re_guts * g, int c) -{ - uch *col; - int i; - int ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT; - unsigned uc = (unsigned char) c; - - for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) - if (col[uc] != 0) - return 1; - return 0; -} - -/* - * samesets - are these two characters in exactly the same sets? - */ -static int /* predicate */ -samesets(struct re_guts * g, int c1, int c2) -{ - uch *col; - int i; - int ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT; - unsigned uc1 = (unsigned char) c1; - unsigned uc2 = (unsigned char) c2; - - for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) - if (col[uc1] != col[uc2]) - return 0; - return 1; -} - -/* - * categorize - sort out character categories - */ -static void -categorize(struct parse * p, struct re_guts * g) -{ - cat_t *cats = g->categories; - int c; - int c2; - cat_t cat; - - /* avoid making error situations worse */ - if (p->error != 0) - return; - - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (cats[c] == 0 && isinsets(g, c)) - { - cat = g->ncategories++; - cats[c] = cat; - for (c2 = c + 1; c2 <= CHAR_MAX; c2++) - if (cats[c2] == 0 && samesets(g, c, c2)) - cats[c2] = cat; - } -} - -/* - * dupl - emit a duplicate of a bunch of sops - */ -static sopno /* start of duplicate */ -dupl(struct parse * p, - sopno start, /* from here */ - sopno finish) /* to this less one */ -{ - sopno ret = HERE(); - sopno len = finish - start; - - assert(finish >= start); - if (len == 0) - return ret; - enlarge(p, p->ssize + len); /* this many unexpected additions */ - assert(p->ssize >= p->slen + len); - memcpy((char *) (p->strip + p->slen), - (char *) (p->strip + start), (size_t) len * sizeof(sop)); - p->slen += len; - return ret; -} - -/* - * doemit - emit a strip operator - * - * It might seem better to implement this as a macro with a function as - * hard-case backup, but it's just too big and messy unless there are - * some changes to the data structures. Maybe later. - */ -static void -doemit(struct parse * p, sop op, size_t opnd) -{ - /* avoid making error situations worse */ - if (p->error != 0) - return; - - /* deal with oversize operands ("can't happen", more or less) */ - assert(opnd < 1 << OPSHIFT); - - /* deal with undersized strip */ - if (p->slen >= p->ssize) - enlarge(p, (p->ssize + 1) / 2 * 3); /* +50% */ - assert(p->slen < p->ssize); - - /* finally, it's all reduced to the easy case */ - p->strip[p->slen++] = SOP(op, opnd); -} - -/* - * doinsert - insert a sop into the strip - */ -static void -doinsert(struct parse * p, sop op, size_t opnd, sopno pos) -{ - sopno sn; - sop s; - int i; - - /* avoid making error situations worse */ - if (p->error != 0) - return; - - sn = HERE(); - EMIT(op, opnd); /* do checks, ensure space */ - assert(HERE() == sn + 1); - s = p->strip[sn]; - - /* adjust paren pointers */ - assert(pos > 0); - for (i = 1; i < NPAREN; i++) - { - if (p->pbegin[i] >= pos) - p->pbegin[i]++; - if (p->pend[i] >= pos) - p->pend[i]++; - } - - memmove((char *) &p->strip[pos + 1], (char *) &p->strip[pos], - (HERE() - pos - 1) * sizeof(sop)); - p->strip[pos] = s; -} - -/* - * dofwd - complete a forward reference - */ -static void -dofwd(struct parse * p, sopno pos, sop value) -{ - /* avoid making error situations worse */ - if (p->error != 0) - return; - - assert(value < 1 << OPSHIFT); - p->strip[pos] = OP(p->strip[pos]) | value; -} - -/* - * enlarge - enlarge the strip - */ -static void -enlarge(struct parse * p, sopno size) -{ - sop *sp; - - if (p->ssize >= size) - return; - - sp = (sop *) realloc(p->strip, size * sizeof(sop)); - if (sp == NULL) - { - SETERROR(REG_ESPACE); - return; - } - p->strip = sp; - p->ssize = size; -} - -/* - * stripsnug - compact the strip - */ -static void -stripsnug(struct parse * p, struct re_guts * g) -{ - g->nstates = p->slen; - g->strip = (sop *) realloc((char *) p->strip, p->slen * sizeof(sop)); - if (g->strip == NULL) - { - SETERROR(REG_ESPACE); - g->strip = p->strip; - } -} - -/* - * findmust - fill in must and mlen with longest mandatory literal string - * - * This algorithm could do fancy things like analyzing the operands of | - * for common subsequences. Someday. This code is simple and finds most - * of the interesting cases. - * - * Note that must and mlen got initialized during setup. - */ -static void -findmust(struct parse * p, struct re_guts * g) -{ - sop *scan; - sop *start = 0; - sop *newstart = 0; - sopno newlen; - sop s; - pg_wchar *cp; - sopno i; - - /* avoid making error situations worse */ - if (p->error != 0) - return; - - /* find the longest OCHAR sequence in strip */ - newlen = 0; - scan = g->strip + 1; - do - { - s = *scan++; - switch (OP(s)) - { - case OCHAR: /* sequence member */ - if (newlen == 0) /* new sequence */ - newstart = scan - 1; - newlen++; - break; - case OPLUS_: /* things that don't break one */ - case OLPAREN: - case ORPAREN: - break; - case OQUEST_: /* things that must be skipped */ - case OCH_: - scan--; - do - { - scan += OPND(s); - s = *scan; - /* assert() interferes w debug printouts */ - if (OP(s) != O_QUEST && OP(s) != O_CH && - OP(s) != OOR2) - { - g->iflags |= BAD; - return; - } - } while (OP(s) != O_QUEST && OP(s) != O_CH); - /* fallthrough */ - default: /* things that break a sequence */ - if (newlen > g->mlen) - { /* ends one */ - start = newstart; - g->mlen = newlen; - } - newlen = 0; - break; - } - } while (OP(s) != OEND); - - if (g->mlen == 0) /* there isn't one */ - return; - - /* turn it into a character string */ -#ifdef MULTIBYTE - g->must = (pg_wchar *) malloc((size_t) (g->mlen + 1) * sizeof(pg_wchar)); -#else - g->must = malloc((size_t) g->mlen + 1); -#endif - if (g->must == NULL) - { /* argh; just forget it */ - g->mlen = 0; - return; - } - cp = g->must; - scan = start; - for (i = g->mlen; i > 0; i--) - { - while (OP(s = *scan++) != OCHAR) - continue; - assert(cp < g->must + g->mlen); - *cp++ = (pg_wchar) OPND(s); - } - assert(cp == g->must + g->mlen); - *cp++ = '\0'; /* just on general principles */ -} - -/* - * pluscount - count + nesting - */ -static sopno /* nesting depth */ -pluscount(struct parse * p, struct re_guts * g) -{ - sop *scan; - sop s; - sopno plusnest = 0; - sopno maxnest = 0; - - if (p->error != 0) - return 0; /* there may not be an OEND */ - - scan = g->strip + 1; - do - { - s = *scan++; - switch (OP(s)) - { - case OPLUS_: - plusnest++; - break; - case O_PLUS: - if (plusnest > maxnest) - maxnest = plusnest; - plusnest--; - break; - } - } while (OP(s) != OEND); - if (plusnest != 0) - g->iflags |= BAD; - return maxnest; -} - -/* - * some ctype functions with non-ascii-char guard - */ -static int -pg_isdigit(int c) -{ -#ifdef MULTIBYTE - return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c)); -#else - return (isdigit((unsigned char) c)); -#endif -} - -static int -pg_isalpha(int c) -{ -#ifdef MULTIBYTE - return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c)); -#else - return (isalpha((unsigned char) c)); -#endif -} - -static int -pg_isalnum(int c) -{ -#ifdef MULTIBYTE - return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c)); -#else - return (isalnum((unsigned char) c)); -#endif -} - -static int -pg_isupper(int c) -{ -#ifdef MULTIBYTE - return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c)); -#else - return (isupper((unsigned char) c)); -#endif -} - -static int -pg_islower(int c) -{ -#ifdef MULTIBYTE - return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c)); -#else - return (islower((unsigned char) c)); -#endif -} - -static int -pg_iscntrl(int c) -{ -#ifdef MULTIBYTE - return (c >= 0 && c <= UCHAR_MAX && iscntrl((unsigned char) c)); -#else - return (iscntrl((unsigned char) c)); -#endif -} - -static int -pg_isgraph(int c) -{ -#ifdef MULTIBYTE - return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c)); -#else - return (isgraph((unsigned char) c)); -#endif -} - -static int -pg_isprint(int c) -{ -#ifdef MULTIBYTE - return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c)); -#else - return (isprint((unsigned char) c)); -#endif -} - -static int -pg_ispunct(int c) -{ -#ifdef MULTIBYTE - return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c)); -#else - return (ispunct((unsigned char) c)); -#endif -} - -static struct cclass * -cclass_init(void) -{ - static struct cclass cclasses_C[] = { - { "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", "" }, - { "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", "" }, - { "blank", " \t", "" }, - { "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177", "" }, - { "digit", "0123456789", "" }, - { "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", "" }, - { "lower", "abcdefghijklmnopqrstuvwxyz", "" }, - { "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", "" }, - { "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", "" }, - { "space", "\t\n\v\f\r ", "" }, - { "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "" }, - { "xdigit", "0123456789ABCDEFabcdef", "" }, - { NULL, NULL, "" } - }; - struct cclass *cp = NULL; - struct cclass *classes = NULL; - struct cclass_factory - { - char *name; - int (*func)(int); - char *chars; - } cclass_factories [] = - { - { "alnum", pg_isalnum, NULL }, - { "alpha", pg_isalpha, NULL }, - { "blank", NULL, " \t" }, - { "cntrl", pg_iscntrl, NULL }, - { "digit", NULL, "0123456789" }, - { "graph", pg_isgraph, NULL }, - { "lower", pg_islower, NULL }, - { "print", pg_isprint, NULL }, - { "punct", pg_ispunct, NULL }, - { "space", NULL, "\t\n\v\f\r " }, - { "upper", pg_isupper, NULL }, - { "xdigit", NULL, "0123456789ABCDEFabcdef" }, - { NULL, NULL, NULL } - }; - struct cclass_factory *cf = NULL; - - if ( strcmp( setlocale( LC_CTYPE, NULL ), "C" ) == 0 ) - return cclasses_C; - - classes = malloc(sizeof(struct cclass) * (sizeof(cclass_factories) / sizeof(struct cclass_factory))); - if (classes == NULL) - elog(ERROR,"cclass_init: out of memory"); - - cp = classes; - for(cf = cclass_factories; cf->name != NULL; cf++) - { - cp->name = strdup(cf->name); - if ( cf->chars ) - cp->chars = strdup(cf->chars); - else - { - int x = 0, y = 0; - cp->chars = malloc(sizeof(char) * 256); - if (cp->chars == NULL) - elog(ERROR,"cclass_init: out of memory"); - for (x = 0; x < 256; x++) - { - if((cf->func)(x)) - *(cp->chars + y++) = x; - } - *(cp->chars + y) = '\0'; - } - cp->multis = ""; - cp++; - } - cp->name = cp->chars = NULL; - cp->multis = ""; - - return classes; -} diff --git a/src/backend/regex/regerror.c b/src/backend/regex/regerror.c deleted file mode 100644 index fb12cba3048..00000000000 --- a/src/backend/regex/regerror.c +++ /dev/null @@ -1,185 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regerror.c 8.4 (Berkeley) 3/20/94 - */ - -#include "postgres.h" - -#include <sys/types.h> -#include <ctype.h> -#include <limits.h> -#include <assert.h> - -#include "regex/regex.h" -#include "regex/utils.h" -#include "regex/regex2.h" - -static char *regatoi(const regex_t *preg, char *localbuf); - -static struct rerr -{ - int code; - char *name; - char *explain; -} rerrs[] = - -{ - { - /* NOMATCH is not really an error condition, it just says no match */ - REG_NOMATCH, "REG_NOMATCH", "no pattern match found" - }, - { - REG_BADPAT, "REG_BADPAT", "invalid regex struct" - }, - { - REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element" - }, - { - REG_ECTYPE, "REG_ECTYPE", "invalid character class" - }, - { - REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)" - }, - { - REG_ESUBREG, "REG_ESUBREG", "invalid backreference number" - }, - { - REG_EBRACK, "REG_EBRACK", "brackets [ ] not balanced" - }, - { - REG_EPAREN, "REG_EPAREN", "parentheses ( ) not balanced" - }, - { - REG_EBRACE, "REG_EBRACE", "braces { } not balanced" - }, - { - REG_BADBR, "REG_BADBR", "invalid repetition count(s) in { }" - }, - { - REG_ERANGE, "REG_ERANGE", "invalid character range in [ ]" - }, - { - REG_ESPACE, "REG_ESPACE", "ran out of memory" - }, - { - REG_BADRPT, "REG_BADRPT", "?, *, or + operand invalid" - }, - { - REG_EMPTY, "REG_EMPTY", "empty expression or subexpression" - }, - { - REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug" - }, - { - REG_INVARG, "REG_INVARG", "invalid argument to regex routine" - }, - { - 0, "", "*** unknown regexp error code ***" - } -}; - -/* - * regerror - the interface to error numbers - */ -/* ARGSUSED */ -size_t -pg_regerror(int errcode, const regex_t *preg, - char *errbuf, size_t errbuf_size) -{ - struct rerr *r; - size_t len; - int target = errcode & ~REG_ITOA; - char *s; - char convbuf[50]; - - if (errcode == REG_ATOI) - s = regatoi(preg, convbuf); - else - { - for (r = rerrs; r->code != 0; r++) - if (r->code == target) - break; - - if (errcode & REG_ITOA) - { - if (r->code != 0) - strcpy(convbuf, r->name); - else - sprintf(convbuf, "REG_0x%x", target); - assert(strlen(convbuf) < sizeof(convbuf)); - s = convbuf; - } - else - s = r->explain; - } - - len = strlen(s) + 1; - if (errbuf_size > 0) - { - if (errbuf_size > len) - strcpy(errbuf, s); - else - { - strncpy(errbuf, s, errbuf_size - 1); - errbuf[errbuf_size - 1] = '\0'; - } - } - - return len; -} - -/* - * regatoi - internal routine to implement REG_ATOI - */ -static char * -regatoi(const regex_t *preg, char *localbuf) -{ - struct rerr *r; - - for (r = rerrs; r->code != 0; r++) -#ifdef MULTIBYTE - if (pg_char_and_wchar_strcmp(r->name, preg->re_endp) == 0) -#else - if (strcmp(r->name, preg->re_endp) == 0) -#endif - break; - if (r->code == 0) - return "0"; - - sprintf(localbuf, "%d", r->code); - return localbuf; -} diff --git a/src/backend/regex/regex.3 b/src/backend/regex/regex.3 deleted file mode 100644 index 01a6109278a..00000000000 --- a/src/backend/regex/regex.3 +++ /dev/null @@ -1,538 +0,0 @@ -.\" Copyright (c) 1992, 1993, 1994 Henry Spencer. -.\" Copyright (c) 1992, 1993, 1994 -.\" The Regents of the University of California. All rights reserved. -.\" -.\" This code is derived from software contributed to Berkeley by -.\" Henry Spencer. -.\" -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -.\" 3. All advertising materials mentioning features or use of this software -.\" must display the following acknowledgement: -.\" This product includes software developed by the University of -.\" California, Berkeley and its contributors. -.\" 4. Neither the name of the University nor the names of its contributors -.\" may be used to endorse or promote products derived from this software -.\" without specific prior written permission. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. -.\" -.\" @(#)regex.3 8.4 (Berkeley) 3/20/94 -.\" -.TH REGEX 3 "March 20, 1994" -.de ZR -.\" one other place knows this name: the SEE ALSO section -.IR re_format (7) \\$1 -.. -.SH NAME -regcomp, regexec, regerror, regfree \- regular-expression library -.SH SYNOPSIS -.ft B -.\".na -#include <sys/types.h> -.br -#include <regex.h> -.HP 10 -int regcomp(regex_t\ *preg, const\ char\ *pattern, int\ cflags); -.HP -int\ regexec(const\ regex_t\ *preg, const\ char\ *string, -size_t\ nmatch, regmatch_t\ pmatch[], int\ eflags); -.HP -size_t\ regerror(int\ errcode, const\ regex_t\ *preg, -char\ *errbuf, size_t\ errbuf_size); -.HP -void\ regfree(regex_t\ *preg); -.\".ad -.ft -.SH DESCRIPTION -These routines implement POSIX 1003.2 regular expressions (``RE''s); -see -.ZR . -.I Regcomp -compiles an RE written as a string into an internal form, -.I regexec -matches that internal form against a string and reports results, -.I regerror -transforms error codes from either into human-readable messages, -and -.I regfree -frees any dynamically-allocated storage used by the internal form -of an RE. -.PP -The header -.I <regex.h> -declares two structure types, -.I regex_t -and -.IR regmatch_t , -the former for compiled internal forms and the latter for match reporting. -It also declares the four functions, -a type -.IR regoff_t , -and a number of constants with names starting with ``REG_''. -.PP -.I Regcomp -compiles the regular expression contained in the -.I pattern -string, -subject to the flags in -.IR cflags , -and places the results in the -.I regex_t -structure pointed to by -.IR preg . -.I Cflags -is the bitwise OR of zero or more of the following flags: -.IP REG_EXTENDED \w'REG_EXTENDED'u+2n -Compile modern (``extended'') REs, -rather than the obsolete (``basic'') REs that -are the default. -.IP REG_BASIC -This is a synonym for 0, -provided as a counterpart to REG_EXTENDED to improve readability. -.IP REG_NOSPEC -Compile with recognition of all special characters turned off. -All characters are thus considered ordinary, -so the ``RE'' is a literal string. -This is an extension, -compatible with but not specified by POSIX 1003.2, -and should be used with -caution in software intended to be portable to other systems. -REG_EXTENDED and REG_NOSPEC may not be used -in the same call to -.IR regcomp . -.IP REG_ICASE -Compile for matching that ignores upper/lower case distinctions. -See -.ZR . -.IP REG_NOSUB -Compile for matching that need only report success or failure, -not what was matched. -.IP REG_NEWLINE -Compile for newline-sensitive matching. -By default, newline is a completely ordinary character with no special -meaning in either REs or strings. -With this flag, -`[^' bracket expressions and `.' never match newline, -a `^' anchor matches the null string after any newline in the string -in addition to its normal function, -and the `$' anchor matches the null string before any newline in the -string in addition to its normal function. -.IP REG_PEND -The regular expression ends, -not at the first NUL, -but just before the character pointed to by the -.I re_endp -member of the structure pointed to by -.IR preg . -The -.I re_endp -member is of type -.IR const\ char\ * . -This flag permits inclusion of NULs in the RE; -they are considered ordinary characters. -This is an extension, -compatible with but not specified by POSIX 1003.2, -and should be used with -caution in software intended to be portable to other systems. -.PP -When successful, -.I regcomp -returns 0 and fills in the structure pointed to by -.IR preg . -One member of that structure -(other than -.IR re_endp ) -is publicized: -.IR re_nsub , -of type -.IR size_t , -contains the number of parenthesized subexpressions within the RE -(except that the value of this member is undefined if the -REG_NOSUB flag was used). -If -.I regcomp -fails, it returns a non-zero error code; -see DIAGNOSTICS. -.PP -.I Regexec -matches the compiled RE pointed to by -.I preg -against the -.IR string , -subject to the flags in -.IR eflags , -and reports results using -.IR nmatch , -.IR pmatch , -and the returned value. -The RE must have been compiled by a previous invocation of -.IR regcomp . -The compiled form is not altered during execution of -.IR regexec , -so a single compiled RE can be used simultaneously by multiple threads. -.PP -By default, -the NUL-terminated string pointed to by -.I string -is considered to be the text of an entire line, minus any terminating -newline. -The -.I eflags -argument is the bitwise OR of zero or more of the following flags: -.IP REG_NOTBOL \w'REG_STARTEND'u+2n -The first character of -the string -is not the beginning of a line, so the `^' anchor should not match before it. -This does not affect the behavior of newlines under REG_NEWLINE. -.IP REG_NOTEOL -The NUL terminating -the string -does not end a line, so the `$' anchor should not match before it. -This does not affect the behavior of newlines under REG_NEWLINE. -.IP REG_STARTEND -The string is considered to start at -\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_so\fR -and to have a terminating NUL located at -\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_eo\fR -(there need not actually be a NUL at that location), -regardless of the value of -.IR nmatch . -See below for the definition of -.IR pmatch -and -.IR nmatch . -This is an extension, -compatible with but not specified by POSIX 1003.2, -and should be used with -caution in software intended to be portable to other systems. -Note that a non-zero \fIrm_so\fR does not imply REG_NOTBOL; -REG_STARTEND affects only the location of the string, -not how it is matched. -.PP -See -.ZR -for a discussion of what is matched in situations where an RE or a -portion thereof could match any of several substrings of -.IR string . -.PP -Normally, -.I regexec -returns 0 for success and the non-zero code REG_NOMATCH for failure. -Other non-zero error codes may be returned in exceptional situations; -see DIAGNOSTICS. -.PP -If REG_NOSUB was specified in the compilation of the RE, -or if -.I nmatch -is 0, -.I regexec -ignores the -.I pmatch -argument (but see below for the case where REG_STARTEND is specified). -Otherwise, -.I pmatch -points to an array of -.I nmatch -structures of type -.IR regmatch_t . -Such a structure has at least the members -.I rm_so -and -.IR rm_eo , -both of type -.I regoff_t -(a signed arithmetic type at least as large as an -.I off_t -and a -.IR ssize_t ), -containing respectively the offset of the first character of a substring -and the offset of the first character after the end of the substring. -Offsets are measured from the beginning of the -.I string -argument given to -.IR regexec . -An empty substring is denoted by equal offsets, -both indicating the character following the empty substring. -.PP -The 0th member of the -.I pmatch -array is filled in to indicate what substring of -.I string -was matched by the entire RE. -Remaining members report what substring was matched by parenthesized -subexpressions within the RE; -member -.I i -reports subexpression -.IR i , -with subexpressions counted (starting at 1) by the order of their opening -parentheses in the RE, left to right. -Unused entries in the array\(emcorresponding either to subexpressions that -did not participate in the match at all, or to subexpressions that do not -exist in the RE (that is, \fIi\fR\ > \fIpreg\fR\->\fIre_nsub\fR)\(emhave both -.I rm_so -and -.I rm_eo -set to \-1. -If a subexpression participated in the match several times, -the reported substring is the last one it matched. -(Note, as an example in particular, that when the RE `(b*)+' matches `bbb', -the parenthesized subexpression matches each of the three `b's and then -an infinite number of empty strings following the last `b', -so the reported substring is one of the empties.) -.PP -If REG_STARTEND is specified, -.I pmatch -must point to at least one -.I regmatch_t -(even if -.I nmatch -is 0 or REG_NOSUB was specified), -to hold the input offsets for REG_STARTEND. -Use for output is still entirely controlled by -.IR nmatch ; -if -.I nmatch -is 0 or REG_NOSUB was specified, -the value of -.IR pmatch [0] -will not be changed by a successful -.IR regexec . -.PP -.I Regerror -maps a non-zero -.I errcode -from either -.I regcomp -or -.I regexec -to a human-readable, printable message. -If -.I preg -is non-NULL, -the error code should have arisen from use of -the -.I regex_t -pointed to by -.IR preg , -and if the error code came from -.IR regcomp , -it should have been the result from the most recent -.I regcomp -using that -.IR regex_t . -.RI ( Regerror -may be able to supply a more detailed message using information -from the -.IR regex_t .) -.I Regerror -places the NUL-terminated message into the buffer pointed to by -.IR errbuf , -limiting the length (including the NUL) to at most -.I errbuf_size -bytes. -If the whole message won't fit, -as much of it as will fit before the terminating NUL is supplied. -In any case, -the returned value is the size of buffer needed to hold the whole -message (including terminating NUL). -If -.I errbuf_size -is 0, -.I errbuf -is ignored but the return value is still correct. -.PP -If the -.I errcode -given to -.I regerror -is first ORed with REG_ITOA, -the ``message'' that results is the printable name of the error code, -e.g. ``REG_NOMATCH'', -rather than an explanation thereof. -If -.I errcode -is REG_ATOI, -then -.I preg -shall be non-NULL and the -.I re_endp -member of the structure it points to -must point to the printable name of an error code; -in this case, the result in -.I errbuf -is the decimal digits of -the numeric value of the error code -(0 if the name is not recognized). -REG_ITOA and REG_ATOI are intended primarily as debugging facilities; -they are extensions, -compatible with but not specified by POSIX 1003.2, -and should be used with -caution in software intended to be portable to other systems. -Be warned also that they are considered experimental and changes are possible. -.PP -.I Regfree -frees any dynamically-allocated storage associated with the compiled RE -pointed to by -.IR preg . -The remaining -.I regex_t -is no longer a valid compiled RE -and the effect of supplying it to -.I regexec -or -.I regerror -is undefined. -.PP -None of these functions references global variables except for tables -of constants; -all are safe for use from multiple threads if the arguments are safe. -.SH IMPLEMENTATION CHOICES -There are a number of decisions that 1003.2 leaves up to the implementor, -either by explicitly saying ``undefined'' or by virtue of them being -forbidden by the RE grammar. -This implementation treats them as follows. -.PP -See -.ZR -for a discussion of the definition of case-independent matching. -.PP -There is no particular limit on the length of REs, -except insofar as memory is limited. -Memory usage is approximately linear in RE size, and largely insensitive -to RE complexity, except for bounded repetitions. -See BUGS for one short RE using them -that will run almost any system out of memory. -.PP -A backslashed character other than one specifically given a magic meaning -by 1003.2 (such magic meanings occur only in obsolete [``basic''] REs) -is taken as an ordinary character. -.PP -Any unmatched [ is a REG_EBRACK error. -.PP -Equivalence classes cannot begin or end bracket-expression ranges. -The endpoint of one range cannot begin another. -.PP -RE_DUP_MAX, the limit on repetition counts in bounded repetitions, is 255. -.PP -A repetition operator (?, *, +, or bounds) cannot follow another -repetition operator. -A repetition operator cannot begin an expression or subexpression -or follow `^' or `|'. -.PP -`|' cannot appear first or last in a (sub)expression or after another `|', -i.e. an operand of `|' cannot be an empty subexpression. -An empty parenthesized subexpression, `()', is legal and matches an -empty (sub)string. -An empty string is not a legal RE. -.PP -A `{' followed by a digit is considered the beginning of bounds for a -bounded repetition, which must then follow the syntax for bounds. -A `{' \fInot\fR followed by a digit is considered an ordinary character. -.PP -`^' and `$' beginning and ending subexpressions in obsolete (``basic'') -REs are anchors, not ordinary characters. -.SH SEE ALSO -grep(1), re_format(7) -.PP -POSIX 1003.2, sections 2.8 (Regular Expression Notation) -and -B.5 (C Binding for Regular Expression Matching). -.SH DIAGNOSTICS -Non-zero error codes from -.I regcomp -and -.I regexec -include the following: -.PP -.nf -.ta \w'REG_ECOLLATE'u+3n -REG_NOMATCH no pattern match found -REG_BADPAT invalid regex struct -REG_ECOLLATE invalid collating element -REG_ECTYPE invalid character class -REG_EESCAPE trailing backslash (\e) -REG_ESUBREG invalid backreference number -REG_EBRACK brackets [ ] not balanced -REG_EPAREN parentheses ( ) not balanced -REG_EBRACE braces { } not balanced -REG_BADBR invalid repetition count(s) in { } -REG_ERANGE invalid character range in [ ] -REG_ESPACE ran out of memory -REG_BADRPT ?, *, or + operand invalid -REG_EMPTY empty expression or subexpression -REG_ASSERT ``can't happen''\(emyou found a bug -REG_INVARG invalid argument, e.g. negative-length string -.fi -.SH HISTORY -Originally written by Henry Spencer. -Altered for inclusion in the 4.4BSD distribution. -.SH BUGS -This is an alpha release with known defects. -Please report problems. -.PP -There is one known functionality bug. -The implementation of internationalization is incomplete: -the locale is always assumed to be the default one of 1003.2, -and only the collating elements etc. of that locale are available. -.PP -The back-reference code is subtle and doubts linger about its correctness -in complex cases. -.PP -.I Regexec -performance is poor. -This will improve with later releases. -.I Nmatch -exceeding 0 is expensive; -.I nmatch -exceeding 1 is worse. -.I Regexec -is largely insensitive to RE complexity \fIexcept\fR that back -references are massively expensive. -RE length does matter; in particular, there is a strong speed bonus -for keeping RE length under about 30 characters, -with most special characters counting roughly double. -.PP -.I Regcomp -implements bounded repetitions by macro expansion, -which is costly in time and space if counts are large -or bounded repetitions are nested. -An RE like, say, -`((((a{1,100}){1,100}){1,100}){1,100}){1,100}' -will (eventually) run almost any existing machine out of swap space. -.PP -There are suspected problems with response to obscure error conditions. -Notably, -certain kinds of internal overflow, -produced only by truly enormous REs or by multiply nested bounded repetitions, -are probably not handled well. -.PP -Due to a mistake in 1003.2, things like `a)b' are legal REs because `)' is -a special character only in the presence of a previous unmatched `('. -This can't be fixed until the spec is fixed. -.PP -The standard's definition of back references is vague. -For example, does -`a\e(\e(b\e)*\e2\e)*d' match `abbbd'? -Until the standard is clarified, -behavior in such cases should not be relied on. -.PP -The implementation of word-boundary matching is a bit of a kludge, -and bugs may lurk in combinations of word-boundary matching and anchoring. diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c deleted file mode 100644 index 06459ef1dbc..00000000000 --- a/src/backend/regex/regexec.c +++ /dev/null @@ -1,194 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regexec.c 8.3 (Berkeley) 3/20/94 - */ - -#include "postgres.h" - -/* - * the outer shell of regexec() - * - * This file includes engine.c *twice*, after muchos fiddling with the - * macros that code uses. This lets the same code operate on two different - * representations for state sets. - */ -#include <sys/types.h> -#include <limits.h> -#include <ctype.h> -#include <assert.h> - -#include "regex/regex.h" -#include "regex/utils.h" -#include "regex/regex2.h" - -static int nope = 0; /* for use in asserts; shuts lint up */ - -/* macros for manipulating states, small version */ -#define states long -#define states1 states /* for later use in regexec() decision */ -#define CLEAR(v) ((v) = 0) -#define SET0(v, n) ((v) &= ~(1L << (n))) -#define SET1(v, n) ((v) |= (1L << (n))) -#define ISSET(v, n) ((v) & (1L << (n))) -#define ASSIGN(d, s) ((d) = (s)) -#define EQ(a, b) ((a) == (b)) -#define STATEVARS int dummy /* dummy version */ -#define STATESETUP(m, n) /* nothing */ -#define STATETEARDOWN(m) /* nothing */ -#define SETUP(v) ((v) = 0) -#define onestate long -#define INIT(o, n) ((o) = (1L << (n))) -#define INC(o) ((o) <<= 1) -#define ISSTATEIN(v, o) ((v) & (o)) -/* some abbreviations; note that some of these know variable names! */ -/* do "if I'm here, I can also be there" etc without branches */ -#define FWD(dst, src, n) ((dst) |= ((src) & (here)) << (n)) -#define BACK(dst, src, n) ((dst) |= ((src) & (here)) >> (n)) -#define ISSETBACK(v, n) ((v) & (here >> (n))) -/* function names */ -#define SNAMES /* engine.c looks after details */ - -#include "engine.c" - -/* now undo things */ -#undef states -#undef CLEAR -#undef SET0 -#undef SET1 -#undef ISSET -#undef ASSIGN -#undef EQ -#undef STATEVARS -#undef STATESETUP -#undef STATETEARDOWN -#undef SETUP -#undef onestate -#undef INIT -#undef INC -#undef ISSTATEIN -#undef FWD -#undef BACK -#undef ISSETBACK -#undef SNAMES - -/* macros for manipulating states, large version */ -#define states char * -#define CLEAR(v) memset(v, 0, m->g->nstates) -#define SET0(v, n) ((v)[n] = 0) -#define SET1(v, n) ((v)[n] = 1) -#define ISSET(v, n) ((v)[n]) -#define ASSIGN(d, s) memcpy(d, s, m->g->nstates) -#define EQ(a, b) (memcmp(a, b, m->g->nstates) == 0) -#define STATEVARS int vn; char *space -#define STATESETUP(m, nv) \ -do { \ - (m)->space = malloc((nv)*(m)->g->nstates); \ - if ((m)->space == NULL) \ - return(REG_ESPACE); \ - (m)->vn = 0; \ -} while (0) - -#define STATETEARDOWN(m) \ -do { \ - free((m)->space); \ -} while (0) - -#define SETUP(v) ((v) = &m->space[m->vn++ * m->g->nstates]) -#define onestate int -#define INIT(o, n) ((o) = (n)) -#define INC(o) ((o)++) -#define ISSTATEIN(v, o) ((v)[o]) -/* some abbreviations; note that some of these know variable names! */ -/* do "if I'm here, I can also be there" etc without branches */ -#define FWD(dst, src, n) ((dst)[here+(n)] |= (src)[here]) -#define BACK(dst, src, n) ((dst)[here-(n)] |= (src)[here]) -#define ISSETBACK(v, n) ((v)[here - (n)]) -/* function names */ -#define LNAMES /* flag */ - -#include "engine.c" - -/* - * regexec - interface for matching - * - * We put this here so we can exploit knowledge of the state representation - * when choosing which matcher to call. - */ -int /* 0 success, REG_NOMATCH failure */ -pg_regexec(const regex_t *preg, const char *string, size_t nmatch, - regmatch_t *pmatch, int eflags) -{ - struct re_guts *g = preg->re_g; - -#ifdef MULTIBYTE - pg_wchar *str; - int sts; -#endif - -#ifdef REDEBUG -#define GOODFLAGS(f) (f) -#else -#define GOODFLAGS(f) ((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND)) -#endif - - if (preg->re_magic != MAGIC1 || g->magic != MAGIC2) - return REG_BADPAT; - assert(!(g->iflags & BAD)); - if (g->iflags & BAD) /* backstop for no-debug case */ - return REG_BADPAT; - eflags = GOODFLAGS(eflags); - -#ifdef MULTIBYTE - str = (pg_wchar *) malloc((strlen(string) + 1) * sizeof(pg_wchar)); - if (!str) - return (REG_ESPACE); - (void) pg_mb2wchar((unsigned char *) string, str); - if (g->nstates <= CHAR_BIT * sizeof(states1) && !(eflags & REG_LARGE)) - sts = smatcher(g, str, nmatch, pmatch, eflags); - else - sts = lmatcher(g, str, nmatch, pmatch, eflags); - free((char *) str); - return (sts); - -#else - - if (g->nstates <= CHAR_BIT * sizeof(states1) && !(eflags & REG_LARGE)) - return smatcher(g, (pg_wchar *) string, nmatch, pmatch, eflags); - else - return lmatcher(g, (pg_wchar *) string, nmatch, pmatch, eflags); -#endif -} diff --git a/src/backend/regex/regfree.c b/src/backend/regex/regfree.c deleted file mode 100644 index 5672fcf240f..00000000000 --- a/src/backend/regex/regfree.c +++ /dev/null @@ -1,77 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regfree.c 8.3 (Berkeley) 3/20/94 - */ - -#include "postgres.h" - -#include <sys/types.h> - -#include "regex/regex.h" -#include "regex/utils.h" -#include "regex/regex2.h" - -/* - * regfree - free everything - */ -void -pg_regfree(regex_t *preg) -{ - struct re_guts *g; - - if (preg->re_magic != MAGIC1) /* oops */ - return; /* nice to complain, but hard */ - - g = preg->re_g; - if (g == NULL || g->magic != MAGIC2) /* oops again */ - return; - preg->re_magic = 0; /* mark it invalid */ - g->magic = 0; /* mark it invalid */ -#ifdef MULTIBYTE - if (preg->patsave != NULL) - free((char *) preg->patsave); -#endif - if (g->strip != NULL) - free((char *) g->strip); - if (g->sets != NULL) - free((char *) g->sets); - if (g->setbits != NULL) - free((char *) g->setbits); - if (g->must != NULL) - free(g->must); - free((char *) g); -} diff --git a/src/backend/regex/retest.c b/src/backend/regex/retest.c deleted file mode 100644 index ca5d6c5394a..00000000000 --- a/src/backend/regex/retest.c +++ /dev/null @@ -1,44 +0,0 @@ -/* - * a simple regexp debug program - * - * $Header: /cvsroot/pgsql/src/backend/regex/Attic/retest.c,v 1.5 2002/06/11 15:41:37 thomas Exp $ - */ - -#include "postgres.h" -#include "regex/regex.h" - -int -main() -{ - int sts; - regex_t re; - char buf[1024]; - char *p; - - printf("type in regexp string: "); - if (!fgets(buf, sizeof(buf), stdin)) - exit(0); - p = strchr(buf, '\n'); - if (p) - *p = '\0'; - - sts = pg_regcomp(&re, buf, 1); - printf("regcomp: parses \"%s\" and returns %d\n", buf, sts); - for (;;) - { - printf("type in target string: "); - if (!fgets(buf, sizeof(buf), stdin)) - exit(0); - p = strchr(buf, '\n'); - if (p) - *p = '\0'; - - sts = pg_regexec(&re, buf, 0, 0, 0); - printf("regexec: returns %d\n", sts); - } -} - -void -elog(int lev, const char *fmt,...) -{ -} |