/*------------------------------------------------------------------------- * * tsquery_cleanup.c * Cleanup query from NOT values and/or stopword * Utility functions to correct work. * * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * * * IDENTIFICATION * src/backend/utils/adt/tsquery_cleanup.c * *------------------------------------------------------------------------- */ #include "postgres.h" #include "tsearch/ts_utils.h" #include "miscadmin.h" typedef struct NODE { struct NODE *left; struct NODE *right; QueryItem *valnode; } NODE; /* * To simplify walking on query tree and pushing down of phrase operator * we define some fake priority here: phrase operator has highest priority * of any other operators (and we believe here that OP_PHRASE is a highest * code of operations) and value node has ever highest priority. * Priority values of other operations don't matter until they are less than * phrase operator and value node. */ #define VALUE_PRIORITY (OP_COUNT + 1) #define NODE_PRIORITY(x) \ ( ((x)->valnode->qoperator.type == QI_OPR) ? \ (x)->valnode->qoperator.oper : VALUE_PRIORITY ) /* * make query tree from plain view of query */ static NODE * maketree(QueryItem *in) { NODE *node = (NODE *) palloc(sizeof(NODE)); /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); node->valnode = in; node->right = node->left = NULL; if (in->type == QI_OPR) { node->right = maketree(in + 1); if (in->qoperator.oper != OP_NOT) node->left = maketree(in + in->qoperator.left); } return node; } /* * Internal state for plaintree and plainnode */ typedef struct { QueryItem *ptr; int len; /* allocated size of ptr */ int cur; /* number of elements in ptr */ } PLAINTREE; static void plainnode(PLAINTREE *state, NODE *node) { /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); if (state->cur == state->len) { state->len *= 2; state->ptr = (QueryItem *) repalloc((void *) state->ptr, state->len * sizeof(QueryItem)); } memcpy((void *) &(state->ptr[state->cur]), (void *) node->valnode, sizeof(QueryItem)); if (node->valnode->type == QI_VAL) state->cur++; else if (node->valnode->qoperator.oper == OP_NOT) { state->ptr[state->cur].qoperator.left = 1; state->cur++; plainnode(state, node->right); } else { int cur = state->cur; state->cur++; plainnode(state, node->right); state->ptr[cur].qoperator.left = state->cur - cur; plainnode(state, node->left); } pfree(node); } /* * make plain view of tree from a NODE-tree representation */ static QueryItem * plaintree(NODE *root, int *len) { PLAINTREE pl; pl.cur = 0; pl.len = 16; if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR)) { pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem)); plainnode(&pl, root); } else pl.ptr = NULL; *len = pl.cur; return pl.ptr; } static void freetree(NODE *node) { /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); if (!node) return; if (node->left) freetree(node->left); if (node->right) freetree(node->right); pfree(node); } /* * clean tree for ! operator. * It's useful for debug, but in * other case, such view is used with search in index. * Operator ! always return TRUE */ static NODE * clean_NOT_intree(NODE *node) { /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); if (node->valnode->type == QI_VAL) return node; if (node->valnode->qoperator.oper == OP_NOT) { freetree(node); return NULL; } /* operator & or | */ if (node->valnode->qoperator.oper == OP_OR) { if ((node->left = clean_NOT_intree(node->left)) == NULL || (node->right = clean_NOT_intree(node->right)) == NULL) { freetree(node); return NULL; } } else { NODE *res = node; Assert(node->valnode->qoperator.oper == OP_AND || node->valnode->qoperator.oper == OP_PHRASE); node->left = clean_NOT_intree(node->left); node->right = clean_NOT_intree(node->right); if (node->left == NULL && node->right == NULL) { pfree(node); res = NULL; } else if (node->left == NULL) { res = node->right; pfree(node); } else if (node->right == NULL) { res = node->left; pfree(node); } return res; } return node; } QueryItem * clean_NOT(QueryItem *ptr, int *len) { NODE *root = maketree(ptr); return plaintree(clean_NOT_intree(root), len); } /* * Remove QI_VALSTOP (stopword) nodes from query tree. * * Returns NULL if the query degenerates to nothing. Input must not be NULL. * * When we remove a phrase operator due to removing one or both of its * arguments, we might need to adjust the distance of a parent phrase * operator. For example, 'a' is a stopword, so: * (b <-> a) <-> c should become b <2> c * b <-> (a <-> c) should become b <2> c * (b <-> (a <-> a)) <-> c should become b <3> c * b <-> ((a <-> a) <-> c) should become b <3> c * To handle that, we define two output parameters: * ladd: amount to add to a phrase distance to the left of this node * radd: amount to add to a phrase distance to the right of this node * We need two outputs because we could need to bubble up adjustments to two * different parent phrase operators. Consider * w <-> (((a <-> x) <2> (y <3> a)) <-> z) * After we've removed the two a's and are considering the <2> node (which is * now just x <2> y), we have an ladd distance of 1 that needs to propagate * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to * propagate to the rightmost <->, so that we'll end up with * w <2> ((x <2> y) <4> z) * Near the bottom of the tree, we may have subtrees consisting only of * stopwords. The distances of any phrase operators within such a subtree are * summed and propagated to both ladd and radd, since we don't know which side * of the lowest surviving phrase operator we are in. The rule is that any * subtree that degenerates to NULL must return equal values of ladd and radd, * and the parent node dealing with it should incorporate only one of those. * * Currently, we only implement this adjustment for adjacent phrase operators. * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which * isn't ideal, but there is no way to represent the really desired semantics * without some redesign of the tsquery structure. Certainly it would not be * any better to convert that to 'x <2> (y | z)'. Since this is such a weird * corner case, let it go for now. But we can fix it in cases where the * intervening non-phrase operator also gets removed, for example * '((x <-> a) | a) <-> y' will become 'x <2> y'. */ static NODE * clean_stopword_intree(NODE *node, int *ladd, int *radd) { /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); /* default output parameters indicate no change in parent distance */ *ladd = *radd = 0; if (node->valnode->type == QI_VAL) return node; else if (node->valnode->type == QI_VALSTOP) { pfree(node); return NULL; } Assert(node->valnode->type == QI_OPR); if (node->valnode->qoperator.oper == OP_NOT) { /* NOT doesn't change pattern width, so just report child distances */ node->right = clean_stopword_intree(node->right, ladd, radd); if (!node->right) { freetree(node); return NULL; } } else { NODE *res = node; bool isphrase; int ndistance, lladd, lradd, rladd, rradd; /* First, recurse */ node->left = clean_stopword_intree(node->left, &lladd, &lradd); node->right = clean_stopword_intree(node->right, &rladd, &rradd); /* Check if current node is OP_PHRASE, get its distance */ isphrase = (node->valnode->qoperator.oper == OP_PHRASE); ndistance = isphrase ? node->valnode->qoperator.distance : 0; if (node->left == NULL && node->right == NULL) { /* * When we collapse out a phrase node entirely, propagate its own * distance into both *ladd and *radd; it is the responsibility of * the parent node to count it only once. Also, for a phrase * node, distances coming from children are summed and propagated * up to parent (we assume lladd == lradd and rladd == rradd, else * rule was broken at a lower level). But if this isn't a phrase * node, take the larger of the two child distances; that * corresponds to what TS_execute will do in non-stopword cases. */ if (isphrase) *ladd = *radd = lladd + ndistance + rladd; else *ladd = *radd = Max(lladd, rladd); freetree(node); return NULL; } else if (node->left == NULL) { /* Removing this operator and left subnode */ /* lladd and lradd are equal/redundant, don't count both */ if (isphrase) { /* operator's own distance must propagate to left */ *ladd = lladd + ndistance + rladd; *radd = rradd; } else { /* at non-phrase op, just forget the left subnode entirely */ *ladd = rladd; *radd = rradd; } res = node->right; pfree(node); } else if (node->right == NULL) { /* Removing this operator and right subnode */ /* rladd and rradd are equal/redundant, don't count both */ if (isphrase) { /* operator's own distance must propagate to right */ *ladd = lladd; *radd = lradd + ndistance + rradd; } else { /* at non-phrase op, just forget the right subnode entirely */ *ladd = lladd; *radd = lradd; } res = node->left; pfree(node); } else if (isphrase) { /* Absorb appropriate corrections at this level */ node->valnode->qoperator.distance += lradd + rladd; /* Propagate up any unaccounted-for corrections */ *ladd = lladd; *radd = rradd; } else { /* We're keeping a non-phrase operator, so ladd/radd remain 0 */ } return res; } return node; } static NODE * copyNODE(NODE *node) { NODE *cnode = palloc(sizeof(NODE)); /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); cnode->valnode = palloc(sizeof(QueryItem)); *(cnode->valnode) = *(node->valnode); if (node->valnode->type == QI_OPR) { cnode->right = copyNODE(node->right); if (node->valnode->qoperator.oper != OP_NOT) cnode->left = copyNODE(node->left); } return cnode; } static NODE * makeNODE(int8 op, NODE *left, NODE *right) { NODE *node = palloc(sizeof(NODE)); /* zeroing allocation to prevent difference in unused bytes */ node->valnode = palloc0(sizeof(QueryItem)); node->valnode->qoperator.type = QI_OPR; node->valnode->qoperator.oper = op; node->left = left; node->right = right; return node; } /* * Move operation with high priority to the leaves. This guarantees * that the phrase operator will be near the bottom of the tree. * An idea behind is do not store position of lexemes during execution * of ordinary operations (AND, OR, NOT) because it could be expensive. * Actual transformation will be performed only on subtrees under the * <-> () operation since it's needed solely for the phrase operator. * * Rules: * a <-> (b | c) => (a <-> b) | (a <-> c) * (a | b) <-> c => (a <-> c) | (b <-> c) * a <-> !b => a & !(a <-> b) * !a <-> b => b & !(a <-> b) * * Warnings for readers: * a <-> b != b <-> a * * a (b c) != (a b) c since the phrase lengths are: * n 2n-1 */ static NODE * normalize_phrase_tree(NODE *node) { /* there should be no stop words at this point */ Assert(node->valnode->type != QI_VALSTOP); if (node->valnode->type == QI_VAL) return node; /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); Assert(node->valnode->type == QI_OPR); if (node->valnode->qoperator.oper == OP_NOT) { NODE *orignode = node; /* eliminate NOT sequence */ while (node->valnode->type == QI_OPR && node->valnode->qoperator.oper == node->right->valnode->qoperator.oper) { node = node->right->right; } if (orignode != node) /* current node isn't checked yet */ node = normalize_phrase_tree(node); else node->right = normalize_phrase_tree(node->right); } else if (node->valnode->qoperator.oper == OP_PHRASE) { int16 distance; NODE *X; node->left = normalize_phrase_tree(node->left); node->right = normalize_phrase_tree(node->right); /* * if subtree contains only nodes with higher "priority" then we are * done. See comment near NODE_PRIORITY() */ if (NODE_PRIORITY(node) <= NODE_PRIORITY(node->right) && NODE_PRIORITY(node) <= NODE_PRIORITY(node->left)) return node; /* * We can't swap left-right and works only with left child because of * a <-> b != b <-> a */ distance = node->valnode->qoperator.distance; if (node->right->valnode->type == QI_OPR) { switch (node->right->valnode->qoperator.oper) { case OP_AND: /* a <-> (b & c) => (a <-> b) & (a <-> c) */ node = makeNODE(OP_AND, makeNODE(OP_PHRASE, node->left, node->right->left), makeNODE(OP_PHRASE, copyNODE(node->left), node->right->right)); node->left->valnode->qoperator.distance = node->right->valnode->qoperator.distance = distance; break; case OP_OR: /* a <-> (b | c) => (a <-> b) | (a <-> c) */ node = makeNODE(OP_OR, makeNODE(OP_PHRASE, node->left, node->right->left), makeNODE(OP_PHRASE, copyNODE(node->left), node->right->right)); node->left->valnode->qoperator.distance = node->right->valnode->qoperator.distance = distance; break; case OP_NOT: /* a <-> !b => a & !(a <-> b) */ X = node->right; node->right = node->right->right; X->right = node; node = makeNODE(OP_AND, copyNODE(node->left), X); break; case OP_PHRASE: /* no-op */ break; default: elog(ERROR, "Wrong type of tsquery node: %d", node->right->valnode->qoperator.oper); } } if (node->left->valnode->type == QI_OPR && node->valnode->qoperator.oper == OP_PHRASE) { /* * if the node is still OP_PHRASE, check the left subtree, * otherwise the whole node will be transformed later. */ switch (node->left->valnode->qoperator.oper) { case OP_AND: /* (a & b) <-> c => (a <-> c) & (b <-> c) */ node = makeNODE(OP_AND, makeNODE(OP_PHRASE, node->left->left, node->right), makeNODE(OP_PHRASE, node->left->right, copyNODE(node->right))); node->left->valnode->qoperator.distance = node->right->valnode->qoperator.distance = distance; break; case OP_OR: /* (a | b) <-> c => (a <-> c) | (b <-> c) */ node = makeNODE(OP_OR, makeNODE(OP_PHRASE, node->left->left, node->right), makeNODE(OP_PHRASE, node->left->right, copyNODE(node->right))); node->left->valnode->qoperator.distance = node->right->valnode->qoperator.distance = distance; break; case OP_NOT: /* !a <-> b => b & !(a <-> b) */ X = node->left; node->left = node->left->right; X->right = node; node = makeNODE(OP_AND, X, copyNODE(node->right)); break; case OP_PHRASE: /* no-op */ break; default: elog(ERROR, "Wrong type of tsquery node: %d", node->left->valnode->qoperator.oper); } } /* continue transformation */ node = normalize_phrase_tree(node); } else /* AND or OR */ { node->left = normalize_phrase_tree(node->left); node->right = normalize_phrase_tree(node->right); } return node; } /* * Number of elements in query tree */ static int32 calcstrlen(NODE *node) { int32 size = 0; if (node->valnode->type == QI_VAL) { size = node->valnode->qoperand.length + 1; } else { Assert(node->valnode->type == QI_OPR); size = calcstrlen(node->right); if (node->valnode->qoperator.oper != OP_NOT) size += calcstrlen(node->left); } return size; } TSQuery cleanup_fakeval_and_phrase(TSQuery in) { int32 len, lenstr, commonlen, i; NODE *root; int ladd, radd; TSQuery out; QueryItem *items; char *operands; if (in->size == 0) return in; /* eliminate stop words */ root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd); if (root == NULL) { ereport(NOTICE, (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored"))); out = palloc(HDRSIZETQ); out->size = 0; SET_VARSIZE(out, HDRSIZETQ); return out; } /* push OP_PHRASE nodes down */ root = normalize_phrase_tree(root); /* * Build TSQuery from plain view */ lenstr = calcstrlen(root); items = plaintree(root, &len); commonlen = COMPUTESIZE(len, lenstr); out = palloc(commonlen); SET_VARSIZE(out, commonlen); out->size = len; memcpy(GETQUERY(out), items, len * sizeof(QueryItem)); items = GETQUERY(out); operands = GETOPERAND(out); for (i = 0; i < out->size; i++) { QueryOperand *op = (QueryOperand *) &items[i]; if (op->type != QI_VAL) continue; memcpy(operands, GETOPERAND(in) + op->distance, op->length); operands[op->length] = '\0'; op->distance = operands - GETOPERAND(out); operands += op->length + 1; } return out; }