summaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/tsquery_cleanup.c
diff options
context:
space:
mode:
authorTeodor Sigaev <teodor@sigaev.ru>2016-04-07 18:44:18 +0300
committerTeodor Sigaev <teodor@sigaev.ru>2016-04-07 18:44:18 +0300
commitbb140506df605fab58f48926ee1db1f80bdafb59 (patch)
tree581f9aeb71e3596000af3b4904e0c62a372d77b3 /src/backend/utils/adt/tsquery_cleanup.c
parent015e88942aa50f0d419ddac00e63bb06d6e62e86 (diff)
Phrase full text search.
Patch introduces new text search operator (<-> or <DISTANCE>) into tsquery. On-disk and binary in/out format of tsquery are backward compatible. It has two side effect: - change order for tsquery, so, users, who has a btree index over tsquery, should reindex it - less number of parenthesis in tsquery output, and tsquery becomes more readable Authors: Teodor Sigaev, Oleg Bartunov, Dmitry Ivanov Reviewers: Alexander Korotkov, Artur Zakirov
Diffstat (limited to 'src/backend/utils/adt/tsquery_cleanup.c')
-rw-r--r--src/backend/utils/adt/tsquery_cleanup.c362
1 files changed, 344 insertions, 18 deletions
diff --git a/src/backend/utils/adt/tsquery_cleanup.c b/src/backend/utils/adt/tsquery_cleanup.c
index 333789be3c5..126795504ad 100644
--- a/src/backend/utils/adt/tsquery_cleanup.c
+++ b/src/backend/utils/adt/tsquery_cleanup.c
@@ -25,6 +25,12 @@ typedef struct NODE
QueryItem *valnode;
} NODE;
+/* Non-operator nodes have fake (but highest) priority */
+#define NODE_PRIORITY(x) \
+ ( ((x)->valnode->qoperator.type == QI_OPR) ? \
+ QO_PRIORITY((x)->valnode) : \
+ TOP_PRIORITY )
+
/*
* make query tree from plain view of query
*/
@@ -160,7 +166,8 @@ clean_NOT_intree(NODE *node)
{
NODE *res = node;
- Assert(node->valnode->qoperator.oper == OP_AND);
+ Assert(node->valnode->qoperator.oper == OP_AND ||
+ node->valnode->qoperator.oper == OP_PHRASE);
node->left = clean_NOT_intree(node->left);
node->right = clean_NOT_intree(node->right);
@@ -212,18 +219,20 @@ clean_NOT(QueryItem *ptr, int *len)
#define V_STOP 3 /* the expression is a stop word */
/*
- * Clean query tree from values which is always in
- * text (stopword)
+ * Remove QI_VALSTOP (stopword nodes) from query tree.
*/
static NODE *
-clean_fakeval_intree(NODE *node, char *result)
+clean_fakeval_intree(NODE *node, char *result, int *adddistance)
{
- char lresult = V_UNKNOWN,
- rresult = V_UNKNOWN;
+ char lresult = V_UNKNOWN,
+ rresult = V_UNKNOWN;
/* since this function recurses, it could be driven to stack overflow. */
check_stack_depth();
+ if (adddistance)
+ *adddistance = 0;
+
if (node->valnode->type == QI_VAL)
return node;
else if (node->valnode->type == QI_VALSTOP)
@@ -237,7 +246,7 @@ clean_fakeval_intree(NODE *node, char *result)
if (node->valnode->qoperator.oper == OP_NOT)
{
- node->right = clean_fakeval_intree(node->right, &rresult);
+ node->right = clean_fakeval_intree(node->right, &rresult, NULL);
if (!node->right)
{
*result = V_STOP;
@@ -247,13 +256,30 @@ clean_fakeval_intree(NODE *node, char *result)
}
else
{
- NODE *res = node;
+ NODE *res = node;
+ int ndistance, ldistance = 0, rdistance = 0;
+
+ ndistance = (node->valnode->qoperator.oper == OP_PHRASE) ?
+ node->valnode->qoperator.distance :
+ 0;
- node->left = clean_fakeval_intree(node->left, &lresult);
- node->right = clean_fakeval_intree(node->right, &rresult);
+ node->left = clean_fakeval_intree(node->left,
+ &lresult,
+ ndistance ? &ldistance : NULL);
+
+ node->right = clean_fakeval_intree(node->right,
+ &rresult,
+ ndistance ? &rdistance : NULL);
+
+ /*
+ * ndistance, ldistance and rdistance are greater than zero
+ * if their corresponding nodes are OP_PHRASE
+ */
if (lresult == V_STOP && rresult == V_STOP)
{
+ if (adddistance && ndistance)
+ *adddistance = ldistance + ndistance + rdistance;
freetree(node);
*result = V_STOP;
return NULL;
@@ -261,33 +287,333 @@ clean_fakeval_intree(NODE *node, char *result)
else if (lresult == V_STOP)
{
res = node->right;
+ /*
+ * propagate distance from current node to the
+ * right upper subtree.
+ */
+ if (adddistance && ndistance)
+ *adddistance = rdistance;
pfree(node);
}
else if (rresult == V_STOP)
{
res = node->left;
+ /*
+ * propagate distance from current node to the upper tree.
+ */
+ if (adddistance && ndistance)
+ *adddistance = ndistance + ldistance;
pfree(node);
}
+ else if (ndistance)
+ {
+ node->valnode->qoperator.distance += ldistance;
+ if (adddistance)
+ *adddistance = 0;
+ }
+ else if (adddistance)
+ {
+ *adddistance = 0;
+ }
+
return res;
}
return node;
}
-QueryItem *
-clean_fakeval(QueryItem *ptr, int *len)
+static NODE *
+copyNODE(NODE *node)
{
- NODE *root = maketree(ptr);
+ NODE *cnode = palloc(sizeof(NODE));
+
+ /* since this function recurses, it could be driven to stack overflow. */
+ check_stack_depth();
+
+ cnode->valnode = palloc(sizeof(QueryItem));
+ *(cnode->valnode) = *(node->valnode);
+
+ if (node->valnode->type == QI_OPR)
+ {
+ cnode->right = copyNODE(node->right);
+ if (node->valnode->qoperator.oper != OP_NOT)
+ cnode->left = copyNODE(node->left);
+ }
+
+ return cnode;
+}
+
+static NODE *
+makeNODE(int8 op, NODE *left, NODE *right)
+{
+ NODE *node = palloc(sizeof(NODE));
+
+ node->valnode = palloc(sizeof(QueryItem));
+
+ node->valnode->qoperator.type = QI_OPR;
+ node->valnode->qoperator.oper = op;
+
+ node->left = left;
+ node->right = right;
+
+ return node;
+}
+
+/*
+ * Move operation with high priority to the leaves. This guarantees
+ * that the phrase operator will be near the bottom of the tree.
+ * An idea behind is do not store position of lexemes during execution
+ * of ordinary operations (AND, OR, NOT) because it could be expensive.
+ * Actual transformation will be performed only on subtrees under the
+ * <-> (<n>) operation since it's needed solely for the phrase operator.
+ *
+ * Rules:
+ * a <-> (b | c) => (a <-> b) | (a <-> c)
+ * (a | b) <-> c => (a <-> c) | (b <-> c)
+ * a <-> !b => a & !(a <-> b)
+ * !a <-> b => b & !(a <-> b)
+ *
+ * Warnings for readers:
+ * a <-> b != b <-> a
+ *
+ * a <n> (b <n> c) != (a <n> b) <n> c since the phrase lengths are:
+ * n 2n-1
+ */
+static NODE *
+normalize_phrase_tree(NODE *node)
+{
+ /* there should be no stop words at this point */
+ Assert(node->valnode->type != QI_VALSTOP);
+
+ if (node->valnode->type == QI_VAL)
+ return node;
+
+ /* since this function recurses, it could be driven to stack overflow. */
+ check_stack_depth();
+
+ Assert(node->valnode->type == QI_OPR);
+
+ if (node->valnode->qoperator.oper == OP_NOT)
+ {
+ /* eliminate NOT sequence */
+ while (node->valnode->type == QI_OPR &&
+ node->valnode->qoperator.oper == node->right->valnode->qoperator.oper)
+ {
+ node = node->right->right;
+ }
+
+ node->right = normalize_phrase_tree(node->right);
+ }
+ else if (node->valnode->qoperator.oper == OP_PHRASE)
+ {
+ int16 distance;
+ NODE *X;
+
+ node->left = normalize_phrase_tree(node->left);
+ node->right = normalize_phrase_tree(node->right);
+
+ if (NODE_PRIORITY(node) <= NODE_PRIORITY(node->right) &&
+ NODE_PRIORITY(node) <= NODE_PRIORITY(node->left))
+ return node;
+
+ /*
+ * We can't swap left-right and works only with left child
+ * because of a <-> b != b <-> a
+ */
+
+ distance = node->valnode->qoperator.distance;
+
+ if (node->right->valnode->type == QI_OPR)
+ {
+ switch (node->right->valnode->qoperator.oper)
+ {
+ case OP_AND:
+ /* a <-> (b & c) => (a <-> b) & (a <-> c) */
+ node = makeNODE(OP_AND,
+ makeNODE(OP_PHRASE,
+ node->left,
+ node->right->left),
+ makeNODE(OP_PHRASE,
+ copyNODE(node->left),
+ node->right->right));
+ node->left->valnode->qoperator.distance =
+ node->right->valnode->qoperator.distance = distance;
+ break;
+ case OP_OR:
+ /* a <-> (b | c) => (a <-> b) | (a <-> c) */
+ node = makeNODE(OP_OR,
+ makeNODE(OP_PHRASE,
+ node->left,
+ node->right->left),
+ makeNODE(OP_PHRASE,
+ copyNODE(node->left),
+ node->right->right));
+ node->left->valnode->qoperator.distance =
+ node->right->valnode->qoperator.distance = distance;
+ break;
+ case OP_NOT:
+ /* a <-> !b => a & !(a <-> b) */
+ X = node->right;
+ node->right = node->right->right;
+ X->right = node;
+ node = makeNODE(OP_AND,
+ copyNODE(node->left),
+ X);
+ break;
+ case OP_PHRASE:
+ /* no-op */
+ break;
+ default:
+ elog(ERROR,"Wrong type of tsquery node: %d",
+ node->right->valnode->qoperator.oper);
+ }
+ }
+
+ if (node->left->valnode->type == QI_OPR &&
+ node->valnode->qoperator.oper == OP_PHRASE)
+ {
+ /*
+ * if the node is still OP_PHRASE, check the left subtree,
+ * otherwise the whole node will be transformed later.
+ */
+ switch(node->left->valnode->qoperator.oper)
+ {
+ case OP_AND:
+ /* (a & b) <-> c => (a <-> c) & (b <-> c) */
+ node = makeNODE(OP_AND,
+ makeNODE(OP_PHRASE,
+ node->left->left,
+ node->right),
+ makeNODE(OP_PHRASE,
+ node->left->right,
+ copyNODE(node->right)));
+ node->left->valnode->qoperator.distance =
+ node->right->valnode->qoperator.distance = distance;
+ break;
+ case OP_OR:
+ /* (a | b) <-> c => (a <-> c) | (b <-> c) */
+ node = makeNODE(OP_OR,
+ makeNODE(OP_PHRASE,
+ node->left->left,
+ node->right),
+ makeNODE(OP_PHRASE,
+ node->left->right,
+ copyNODE(node->right)));
+ node->left->valnode->qoperator.distance =
+ node->right->valnode->qoperator.distance = distance;
+ break;
+ case OP_NOT:
+ /* !a <-> b => b & !(a <-> b) */
+ X = node->left;
+ node->left = node->left->right;
+ X->right = node;
+ node = makeNODE(OP_AND,
+ X,
+ copyNODE(node->right));
+ break;
+ case OP_PHRASE:
+ /* no-op */
+ break;
+ default:
+ elog(ERROR,"Wrong type of tsquery node: %d",
+ node->left->valnode->qoperator.oper);
+ }
+ }
+
+ /* continue transformation */
+ node = normalize_phrase_tree(node);
+ }
+ else /* AND or OR */
+ {
+ node->left = normalize_phrase_tree(node->left);
+ node->right = normalize_phrase_tree(node->right);
+ }
+
+ return node;
+}
+
+/*
+ * Number of elements in query tree
+ */
+static int32
+calcstrlen(NODE *node)
+{
+ int32 size = 0;
+
+ if (node->valnode->type == QI_VAL)
+ {
+ size = node->valnode->qoperand.length + 1;
+ }
+ else
+ {
+ Assert(node->valnode->type == QI_OPR);
+
+ size = calcstrlen(node->right);
+ if (node->valnode->qoperator.oper != OP_NOT)
+ size += calcstrlen(node->left);
+ }
+
+ return size;
+}
+
+TSQuery
+cleanup_fakeval_and_phrase(TSQuery in)
+{
+ int32 len,
+ lenstr,
+ commonlen,
+ i;
+ NODE *root;
char result = V_UNKNOWN;
- NODE *resroot;
+ TSQuery out;
+ QueryItem *items;
+ char *operands;
- resroot = clean_fakeval_intree(root, &result);
+ if (in->size == 0)
+ return in;
+
+ /* eliminate stop words */
+ root = clean_fakeval_intree(maketree(GETQUERY(in)), &result, NULL);
if (result != V_UNKNOWN)
{
ereport(NOTICE,
(errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
- *len = 0;
- return NULL;
+ out = palloc(HDRSIZETQ);
+ out->size = 0;
+ SET_VARSIZE(out, HDRSIZETQ);
+ return out;
+ }
+
+ /* push OP_PHRASE nodes down */
+ root = normalize_phrase_tree(root);
+
+ /*
+ * Build TSQuery from plain view
+ */
+
+ lenstr = calcstrlen(root);
+ items = plaintree(root, &len);
+ commonlen = COMPUTESIZE(len, lenstr);
+
+ out = palloc(commonlen);
+ SET_VARSIZE(out, commonlen);
+ out->size = len;
+
+ memcpy(GETQUERY(out), items, len * sizeof(QueryItem));
+
+ items = GETQUERY(out);
+ operands = GETOPERAND(out);
+ for (i = 0; i < out->size; i++)
+ {
+ QueryOperand *op = (QueryOperand *) &items[i];
+
+ if (op->type != QI_VAL)
+ continue;
+
+ memcpy(operands, GETOPERAND(in) + op->distance, op->length);
+ operands[op->length] = '\0';
+ op->distance = operands - GETOPERAND(out);
+ operands += op->length + 1;
}
- return plaintree(resroot, len);
+ return out;
}