summaryrefslogtreecommitdiff
path: root/src/backend/regex/regc_nfa.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2021-02-20 18:11:56 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2021-02-20 18:11:56 -0500
commit08c0d6ad65f7c161add82ae906efb90dbd7f653d (patch)
treecc6376d6fd084c6584b18d818c0816e3b7e190c9 /src/backend/regex/regc_nfa.c
parent17661188336c8cbb1783808912096932c57893a3 (diff)
Invent "rainbow" arcs within the regex engine.
Some regular expression constructs, most notably the "." match-anything metacharacter, produce a sheaf of parallel NFA arcs covering all possible colors (that is, character equivalence classes). We can make a noticeable improvement in the space and time needed to process large regexes by replacing such cases with a single arc bearing the special color code "RAINBOW". This requires only minor additional complication in places such as pull() and push(). Callers of pg_reg_getoutarcs() must now be prepared for the possibility of seeing a RAINBOW arc. For the one known user, contrib/pg_trgm, that's a net benefit since it cuts the number of arcs to be dealt with, and the handling isn't any different than for other colors that contain too many characters to be dealt with individually. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
Diffstat (limited to 'src/backend/regex/regc_nfa.c')
-rw-r--r--src/backend/regex/regc_nfa.c82
1 files changed, 74 insertions, 8 deletions
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
index 7ed675f88a4..ff98bfd694e 100644
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -271,6 +271,11 @@ destroystate(struct nfa *nfa,
*
* This function checks to make sure that no duplicate arcs are created.
* In general we never want duplicates.
+ *
+ * However: in principle, a RAINBOW arc is redundant with any plain arc
+ * (unless that arc is for a pseudocolor). But we don't try to recognize
+ * that redundancy, either here or in allied operations such as moveins().
+ * The pseudocolor consideration makes that more costly than it seems worth.
*/
static void
newarc(struct nfa *nfa,
@@ -1170,6 +1175,9 @@ copyouts(struct nfa *nfa,
/*
* cloneouts - copy out arcs of a state to another state pair, modifying type
+ *
+ * This is only used to convert PLAIN arcs to AHEAD/BEHIND arcs, which share
+ * the same interpretation of "co". It wouldn't be sensible with LACONs.
*/
static void
cloneouts(struct nfa *nfa,
@@ -1181,9 +1189,13 @@ cloneouts(struct nfa *nfa,
struct arc *a;
assert(old != from);
+ assert(type == AHEAD || type == BEHIND);
for (a = old->outs; a != NULL; a = a->outchain)
+ {
+ assert(a->type == PLAIN);
newarc(nfa, type, a->co, from, to);
+ }
}
/*
@@ -1597,7 +1609,7 @@ pull(struct nfa *nfa,
for (a = from->ins; a != NULL && !NISERR(); a = nexta)
{
nexta = a->inchain;
- switch (combine(con, a))
+ switch (combine(nfa, con, a))
{
case INCOMPATIBLE: /* destroy the arc */
freearc(nfa, a);
@@ -1624,6 +1636,10 @@ pull(struct nfa *nfa,
cparc(nfa, a, s, to);
freearc(nfa, a);
break;
+ case REPLACEARC: /* replace arc's color */
+ newarc(nfa, a->type, con->co, a->from, to);
+ freearc(nfa, a);
+ break;
default:
assert(NOTREACHED);
break;
@@ -1764,7 +1780,7 @@ push(struct nfa *nfa,
for (a = to->outs; a != NULL && !NISERR(); a = nexta)
{
nexta = a->outchain;
- switch (combine(con, a))
+ switch (combine(nfa, con, a))
{
case INCOMPATIBLE: /* destroy the arc */
freearc(nfa, a);
@@ -1791,6 +1807,10 @@ push(struct nfa *nfa,
cparc(nfa, a, from, s);
freearc(nfa, a);
break;
+ case REPLACEARC: /* replace arc's color */
+ newarc(nfa, a->type, con->co, from, a->to);
+ freearc(nfa, a);
+ break;
default:
assert(NOTREACHED);
break;
@@ -1810,9 +1830,11 @@ push(struct nfa *nfa,
* #def INCOMPATIBLE 1 // destroys arc
* #def SATISFIED 2 // constraint satisfied
* #def COMPATIBLE 3 // compatible but not satisfied yet
+ * #def REPLACEARC 4 // replace arc's color with constraint color
*/
static int
-combine(struct arc *con,
+combine(struct nfa *nfa,
+ struct arc *con,
struct arc *a)
{
#define CA(ct,at) (((ct)<<CHAR_BIT) | (at))
@@ -1827,14 +1849,46 @@ combine(struct arc *con,
case CA(BEHIND, PLAIN):
if (con->co == a->co)
return SATISFIED;
+ if (con->co == RAINBOW)
+ {
+ /* con is satisfied unless arc's color is a pseudocolor */
+ if (!(nfa->cm->cd[a->co].flags & PSEUDO))
+ return SATISFIED;
+ }
+ else if (a->co == RAINBOW)
+ {
+ /* con is incompatible if it's for a pseudocolor */
+ if (nfa->cm->cd[con->co].flags & PSEUDO)
+ return INCOMPATIBLE;
+ /* otherwise, constraint constrains arc to be only its color */
+ return REPLACEARC;
+ }
return INCOMPATIBLE;
break;
case CA('^', '^'): /* collision, similar constraints */
case CA('$', '$'):
- case CA(AHEAD, AHEAD):
+ if (con->co == a->co) /* true duplication */
+ return SATISFIED;
+ return INCOMPATIBLE;
+ break;
+ case CA(AHEAD, AHEAD): /* collision, similar constraints */
case CA(BEHIND, BEHIND):
if (con->co == a->co) /* true duplication */
return SATISFIED;
+ if (con->co == RAINBOW)
+ {
+ /* con is satisfied unless arc's color is a pseudocolor */
+ if (!(nfa->cm->cd[a->co].flags & PSEUDO))
+ return SATISFIED;
+ }
+ else if (a->co == RAINBOW)
+ {
+ /* con is incompatible if it's for a pseudocolor */
+ if (nfa->cm->cd[con->co].flags & PSEUDO)
+ return INCOMPATIBLE;
+ /* otherwise, constraint constrains arc to be only its color */
+ return REPLACEARC;
+ }
return INCOMPATIBLE;
break;
case CA('^', BEHIND): /* collision, dissimilar constraints */
@@ -2895,6 +2949,7 @@ compact(struct nfa *nfa,
break;
case LACON:
assert(s->no != cnfa->pre);
+ assert(a->co >= 0);
ca->co = (color) (cnfa->ncolors + a->co);
ca->to = a->to->no;
ca++;
@@ -3068,13 +3123,22 @@ dumparc(struct arc *a,
switch (a->type)
{
case PLAIN:
- fprintf(f, "[%ld]", (long) a->co);
+ if (a->co == RAINBOW)
+ fprintf(f, "[*]");
+ else
+ fprintf(f, "[%ld]", (long) a->co);
break;
case AHEAD:
- fprintf(f, ">%ld>", (long) a->co);
+ if (a->co == RAINBOW)
+ fprintf(f, ">*>");
+ else
+ fprintf(f, ">%ld>", (long) a->co);
break;
case BEHIND:
- fprintf(f, "<%ld<", (long) a->co);
+ if (a->co == RAINBOW)
+ fprintf(f, "<*<");
+ else
+ fprintf(f, "<%ld<", (long) a->co);
break;
case LACON:
fprintf(f, ":%ld:", (long) a->co);
@@ -3161,7 +3225,9 @@ dumpcstate(int st,
pos = 1;
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
{
- if (ca->co < cnfa->ncolors)
+ if (ca->co == RAINBOW)
+ fprintf(f, "\t[*]->%d", ca->to);
+ else if (ca->co < cnfa->ncolors)
fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
else
fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);