/*-------------------------------------------------------------------------
 *
 * tsquery_cleanup.c
 *	 Cleanup query from NOT values and/or stopword
 *	 Utility functions to correct work.
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/tsquery_cleanup.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "tsearch/ts_utils.h"
#include "miscadmin.h"

typedef struct NODE
{
	struct NODE *left;
	struct NODE *right;
	QueryItem  *valnode;
} NODE;

/*
 * To simplify walking on query tree and pushing down of phrase operator
 * we define some fake priority here: phrase operator has highest priority
 * of any other operators (and we believe here that OP_PHRASE is a highest
 * code of operations) and value node has ever highest priority.
 * Priority values of other operations don't matter until they are less than
 * phrase operator and value node.
 */
#define VALUE_PRIORITY			(OP_COUNT + 1)
#define NODE_PRIORITY(x) \
	( ((x)->valnode->qoperator.type == QI_OPR) ? \
		(x)->valnode->qoperator.oper : VALUE_PRIORITY )

/*
 * make query tree from plain view of query
 */
static NODE *
maketree(QueryItem *in)
{
	NODE	   *node = (NODE *) palloc(sizeof(NODE));

	/* since this function recurses, it could be driven to stack overflow. */
	check_stack_depth();

	node->valnode = in;
	node->right = node->left = NULL;
	if (in->type == QI_OPR)
	{
		node->right = maketree(in + 1);
		if (in->qoperator.oper != OP_NOT)
			node->left = maketree(in + in->qoperator.left);
	}
	return node;
}

/*
 * Internal state for plaintree and plainnode
 */
typedef struct
{
	QueryItem  *ptr;
	int			len;			/* allocated size of ptr */
	int			cur;			/* number of elements in ptr */
} PLAINTREE;

static void
plainnode(PLAINTREE *state, NODE *node)
{
	/* since this function recurses, it could be driven to stack overflow. */
	check_stack_depth();

	if (state->cur == state->len)
	{
		state->len *= 2;
		state->ptr = (QueryItem *) repalloc((void *) state->ptr, state->len * sizeof(QueryItem));
	}
	memcpy((void *) &(state->ptr[state->cur]), (void *) node->valnode, sizeof(QueryItem));
	if (node->valnode->type == QI_VAL)
		state->cur++;
	else if (node->valnode->qoperator.oper == OP_NOT)
	{
		state->ptr[state->cur].qoperator.left = 1;
		state->cur++;
		plainnode(state, node->right);
	}
	else
	{
		int			cur = state->cur;

		state->cur++;
		plainnode(state, node->right);
		state->ptr[cur].qoperator.left = state->cur - cur;
		plainnode(state, node->left);
	}
	pfree(node);
}

/*
 * make plain view of tree from a NODE-tree representation
 */
static QueryItem *
plaintree(NODE *root, int *len)
{
	PLAINTREE	pl;

	pl.cur = 0;
	pl.len = 16;
	if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
	{
		pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
		plainnode(&pl, root);
	}
	else
		pl.ptr = NULL;
	*len = pl.cur;
	return pl.ptr;
}

static void
freetree(NODE *node)
{
	/* since this function recurses, it could be driven to stack overflow. */
	check_stack_depth();

	if (!node)
		return;
	if (node->left)
		freetree(node->left);
	if (node->right)
		freetree(node->right);
	pfree(node);
}

/*
 * clean tree for ! operator.
 * It's useful for debug, but in
 * other case, such view is used with search in index.
 * Operator ! always return TRUE
 */
static NODE *
clean_NOT_intree(NODE *node)
{
	/* since this function recurses, it could be driven to stack overflow. */
	check_stack_depth();

	if (node->valnode->type == QI_VAL)
		return node;

	if (node->valnode->qoperator.oper == OP_NOT)
	{
		freetree(node);
		return NULL;
	}

	/* operator & or | */
	if (node->valnode->qoperator.oper == OP_OR)
	{
		if ((node->left = clean_NOT_intree(node->left)) == NULL ||
			(node->right = clean_NOT_intree(node->right)) == NULL)
		{
			freetree(node);
			return NULL;
		}
	}
	else
	{
		NODE	   *res = node;

		Assert(node->valnode->qoperator.oper == OP_AND ||
			   node->valnode->qoperator.oper == OP_PHRASE);

		node->left = clean_NOT_intree(node->left);
		node->right = clean_NOT_intree(node->right);
		if (node->left == NULL && node->right == NULL)
		{
			pfree(node);
			res = NULL;
		}
		else if (node->left == NULL)
		{
			res = node->right;
			pfree(node);
		}
		else if (node->right == NULL)
		{
			res = node->left;
			pfree(node);
		}
		return res;
	}
	return node;
}

QueryItem *
clean_NOT(QueryItem *ptr, int *len)
{
	NODE	   *root = maketree(ptr);

	return plaintree(clean_NOT_intree(root), len);
}


/*
 * Remove QI_VALSTOP (stopword) nodes from query tree.
 *
 * Returns NULL if the query degenerates to nothing.  Input must not be NULL.
 *
 * When we remove a phrase operator due to removing one or both of its
 * arguments, we might need to adjust the distance of a parent phrase
 * operator.  For example, 'a' is a stopword, so:
 *		(b <-> a) <-> c  should become	b <2> c
 *		b <-> (a <-> c)  should become	b <2> c
 *		(b <-> (a <-> a)) <-> c  should become	b <3> c
 *		b <-> ((a <-> a) <-> c)  should become	b <3> c
 * To handle that, we define two output parameters:
 *		ladd: amount to add to a phrase distance to the left of this node
 *		radd: amount to add to a phrase distance to the right of this node
 * We need two outputs because we could need to bubble up adjustments to two
 * different parent phrase operators.  Consider
 *		w <-> (((a <-> x) <2> (y <3> a)) <-> z)
 * After we've removed the two a's and are considering the <2> node (which is
 * now just x <2> y), we have an ladd distance of 1 that needs to propagate
 * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
 * propagate to the rightmost <->, so that we'll end up with
 *		w <2> ((x <2> y) <4> z)
 * Near the bottom of the tree, we may have subtrees consisting only of
 * stopwords.  The distances of any phrase operators within such a subtree are
 * summed and propagated to both ladd and radd, since we don't know which side
 * of the lowest surviving phrase operator we are in.  The rule is that any
 * subtree that degenerates to NULL must return equal values of ladd and radd,
 * and the parent node dealing with it should incorporate only one of those.
 *
 * Currently, we only implement this adjustment for adjacent phrase operators.
 * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
 * isn't ideal, but there is no way to represent the really desired semantics
 * without some redesign of the tsquery structure.  Certainly it would not be
 * any better to convert that to 'x <2> (y | z)'.  Since this is such a weird
 * corner case, let it go for now.  But we can fix it in cases where the
 * intervening non-phrase operator also gets removed, for example
 * '((x <-> a) | a) <-> y' will become 'x <2> y'.
 */
static NODE *
clean_stopword_intree(NODE *node, int *ladd, int *radd)
{
	/* since this function recurses, it could be driven to stack overflow. */
	check_stack_depth();

	/* default output parameters indicate no change in parent distance */
	*ladd = *radd = 0;

	if (node->valnode->type == QI_VAL)
		return node;
	else if (node->valnode->type == QI_VALSTOP)
	{
		pfree(node);
		return NULL;
	}

	Assert(node->valnode->type == QI_OPR);

	if (node->valnode->qoperator.oper == OP_NOT)
	{
		/* NOT doesn't change pattern width, so just report child distances */
		node->right = clean_stopword_intree(node->right, ladd, radd);
		if (!node->right)
		{
			freetree(node);
			return NULL;
		}
	}
	else
	{
		NODE	   *res = node;
		bool		isphrase;
		int			ndistance,
					lladd,
					lradd,
					rladd,
					rradd;

		/* First, recurse */
		node->left = clean_stopword_intree(node->left, &lladd, &lradd);
		node->right = clean_stopword_intree(node->right, &rladd, &rradd);

		/* Check if current node is OP_PHRASE, get its distance */
		isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
		ndistance = isphrase ? node->valnode->qoperator.distance : 0;

		if (node->left == NULL && node->right == NULL)
		{
			/*
			 * When we collapse out a phrase node entirely, propagate its own
			 * distance into both *ladd and *radd; it is the responsibility of
			 * the parent node to count it only once.  Also, for a phrase
			 * node, distances coming from children are summed and propagated
			 * up to parent (we assume lladd == lradd and rladd == rradd, else
			 * rule was broken at a lower level).  But if this isn't a phrase
			 * node, take the larger of the two child distances; that
			 * corresponds to what TS_execute will do in non-stopword cases.
			 */
			if (isphrase)
				*ladd = *radd = lladd + ndistance + rladd;
			else
				*ladd = *radd = Max(lladd, rladd);
			freetree(node);
			return NULL;
		}
		else if (node->left == NULL)
		{
			/* Removing this operator and left subnode */
			/* lladd and lradd are equal/redundant, don't count both */
			if (isphrase)
			{
				/* operator's own distance must propagate to left */
				*ladd = lladd + ndistance + rladd;
				*radd = rradd;
			}
			else
			{
				/* at non-phrase op, just forget the left subnode entirely */
				*ladd = rladd;
				*radd = rradd;
			}
			res = node->right;
			pfree(node);
		}
		else if (node->right == NULL)
		{
			/* Removing this operator and right subnode */
			/* rladd and rradd are equal/redundant, don't count both */
			if (isphrase)
			{
				/* operator's own distance must propagate to right */
				*ladd = lladd;
				*radd = lradd + ndistance + rradd;
			}
			else
			{
				/* at non-phrase op, just forget the right subnode entirely */
				*ladd = lladd;
				*radd = lradd;
			}
			res = node->left;
			pfree(node);
		}
		else if (isphrase)
		{
			/* Absorb appropriate corrections at this level */
			node->valnode->qoperator.distance += lradd + rladd;
			/* Propagate up any unaccounted-for corrections */
			*ladd = lladd;
			*radd = rradd;
		}
		else
		{
			/* We're keeping a non-phrase operator, so ladd/radd remain 0 */
		}

		return res;
	}
	return node;
}

static NODE *
copyNODE(NODE *node)
{
	NODE	   *cnode = palloc(sizeof(NODE));

	/* since this function recurses, it could be driven to stack overflow. */
	check_stack_depth();

	cnode->valnode = palloc(sizeof(QueryItem));
	*(cnode->valnode) = *(node->valnode);

	if (node->valnode->type == QI_OPR)
	{
		cnode->right = copyNODE(node->right);
		if (node->valnode->qoperator.oper != OP_NOT)
			cnode->left = copyNODE(node->left);
	}

	return cnode;
}

static NODE *
makeNODE(int8 op, NODE *left, NODE *right)
{
	NODE	   *node = palloc(sizeof(NODE));

	/* zeroing allocation to prevent difference in unused bytes */
	node->valnode = palloc0(sizeof(QueryItem));

	node->valnode->qoperator.type = QI_OPR;
	node->valnode->qoperator.oper = op;

	node->left = left;
	node->right = right;

	return node;
}

/*
 * Move operation with high priority to the leaves. This guarantees
 * that the phrase operator will be near the bottom of the tree.
 * An idea behind is do not store position of lexemes during execution
 * of ordinary operations (AND, OR, NOT) because it could be expensive.
 * Actual transformation will be performed only on subtrees under the
 * <-> (<n>) operation since it's needed solely for the phrase operator.
 *
 * Rules:
 *	  a  <->  (b | c)	=>	(a <-> b)  |   (a <-> c)
 *	 (a | b)  <->	 c	   =>	(a <-> c)  |   (b <-> c)
 *	  a  <->	!b	   =>		a	  &  !(a <-> b)
 *	 !a  <->	 b	   =>		b	  &  !(a <-> b)
 *
 * Warnings for readers:
 *		  a <-> b	   !=	   b <-> a
 *
 *	  a <n> (b <n> c)	!=	 (a <n> b) <n> c since the phrase lengths are:
 *			 n					2n-1
 */
static NODE *
normalize_phrase_tree(NODE *node)
{
	/* there should be no stop words at this point */
	Assert(node->valnode->type != QI_VALSTOP);

	if (node->valnode->type == QI_VAL)
		return node;

	/* since this function recurses, it could be driven to stack overflow. */
	check_stack_depth();

	Assert(node->valnode->type == QI_OPR);

	if (node->valnode->qoperator.oper == OP_NOT)
	{
		NODE	   *orignode = node;

		/* eliminate NOT sequence */
		while (node->valnode->type == QI_OPR &&
		node->valnode->qoperator.oper == node->right->valnode->qoperator.oper)
		{
			node = node->right->right;
		}

		if (orignode != node)
			/* current node isn't checked yet */
			node = normalize_phrase_tree(node);
		else
			node->right = normalize_phrase_tree(node->right);
	}
	else if (node->valnode->qoperator.oper == OP_PHRASE)
	{
		int16		distance;
		NODE	   *X;

		node->left = normalize_phrase_tree(node->left);
		node->right = normalize_phrase_tree(node->right);

		/*
		 * if subtree contains only nodes with higher "priority" then we are
		 * done. See comment near NODE_PRIORITY()
		 */
		if (NODE_PRIORITY(node) <= NODE_PRIORITY(node->right) &&
			NODE_PRIORITY(node) <= NODE_PRIORITY(node->left))
			return node;

		/*
		 * We can't swap left-right and works only with left child because of
		 * a <-> b	!=	b <-> a
		 */

		distance = node->valnode->qoperator.distance;

		if (node->right->valnode->type == QI_OPR)
		{
			switch (node->right->valnode->qoperator.oper)
			{
				case OP_AND:
					/* a <-> (b & c)  =>  (a <-> b) & (a <-> c) */
					node = makeNODE(OP_AND,
									makeNODE(OP_PHRASE,
											 node->left,
											 node->right->left),
									makeNODE(OP_PHRASE,
											 copyNODE(node->left),
											 node->right->right));
					node->left->valnode->qoperator.distance =
						node->right->valnode->qoperator.distance = distance;
					break;
				case OP_OR:
					/* a <-> (b | c)  =>  (a <-> b) | (a <-> c) */
					node = makeNODE(OP_OR,
									makeNODE(OP_PHRASE,
											 node->left,
											 node->right->left),
									makeNODE(OP_PHRASE,
											 copyNODE(node->left),
											 node->right->right));
					node->left->valnode->qoperator.distance =
						node->right->valnode->qoperator.distance = distance;
					break;
				case OP_NOT:
					/* a <-> !b  =>  a & !(a <-> b) */
					X = node->right;
					node->right = node->right->right;
					X->right = node;
					node = makeNODE(OP_AND,
									copyNODE(node->left),
									X);
					break;
				case OP_PHRASE:
					/* no-op */
					break;
				default:
					elog(ERROR, "Wrong type of tsquery node: %d",
						 node->right->valnode->qoperator.oper);
			}
		}

		if (node->left->valnode->type == QI_OPR &&
			node->valnode->qoperator.oper == OP_PHRASE)
		{
			/*
			 * if the node is still OP_PHRASE, check the left subtree,
			 * otherwise the whole node will be transformed later.
			 */
			switch (node->left->valnode->qoperator.oper)
			{
				case OP_AND:
					/* (a & b) <-> c  =>  (a <-> c) & (b <-> c) */
					node = makeNODE(OP_AND,
									makeNODE(OP_PHRASE,
											 node->left->left,
											 node->right),
									makeNODE(OP_PHRASE,
											 node->left->right,
											 copyNODE(node->right)));
					node->left->valnode->qoperator.distance =
						node->right->valnode->qoperator.distance = distance;
					break;
				case OP_OR:
					/* (a | b) <-> c  =>  (a <-> c) | (b <-> c) */
					node = makeNODE(OP_OR,
									makeNODE(OP_PHRASE,
											 node->left->left,
											 node->right),
									makeNODE(OP_PHRASE,
											 node->left->right,
											 copyNODE(node->right)));
					node->left->valnode->qoperator.distance =
						node->right->valnode->qoperator.distance = distance;
					break;
				case OP_NOT:
					/* !a <-> b  =>  b & !(a <-> b) */
					X = node->left;
					node->left = node->left->right;
					X->right = node;
					node = makeNODE(OP_AND,
									X,
									copyNODE(node->right));
					break;
				case OP_PHRASE:
					/* no-op */
					break;
				default:
					elog(ERROR, "Wrong type of tsquery node: %d",
						 node->left->valnode->qoperator.oper);
			}
		}

		/* continue transformation */
		node = normalize_phrase_tree(node);
	}
	else	/* AND or OR */
	{
		node->left = normalize_phrase_tree(node->left);
		node->right = normalize_phrase_tree(node->right);
	}

	return node;
}

/*
 * Number of elements in query tree
 */
static int32
calcstrlen(NODE *node)
{
	int32		size = 0;

	if (node->valnode->type == QI_VAL)
	{
		size = node->valnode->qoperand.length + 1;
	}
	else
	{
		Assert(node->valnode->type == QI_OPR);

		size = calcstrlen(node->right);
		if (node->valnode->qoperator.oper != OP_NOT)
			size += calcstrlen(node->left);
	}

	return size;
}

TSQuery
cleanup_fakeval_and_phrase(TSQuery in)
{
	int32		len,
				lenstr,
				commonlen,
				i;
	NODE	   *root;
	int			ladd,
				radd;
	TSQuery		out;
	QueryItem  *items;
	char	   *operands;

	if (in->size == 0)
		return in;

	/* eliminate stop words */
	root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
	if (root == NULL)
	{
		ereport(NOTICE,
				(errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
		out = palloc(HDRSIZETQ);
		out->size = 0;
		SET_VARSIZE(out, HDRSIZETQ);
		return out;
	}

	/* push OP_PHRASE nodes down */
	root = normalize_phrase_tree(root);

	/*
	 * Build TSQuery from plain view
	 */

	lenstr = calcstrlen(root);
	items = plaintree(root, &len);
	commonlen = COMPUTESIZE(len, lenstr);

	out = palloc(commonlen);
	SET_VARSIZE(out, commonlen);
	out->size = len;

	memcpy(GETQUERY(out), items, len * sizeof(QueryItem));

	items = GETQUERY(out);
	operands = GETOPERAND(out);
	for (i = 0; i < out->size; i++)
	{
		QueryOperand *op = (QueryOperand *) &items[i];

		if (op->type != QI_VAL)
			continue;

		memcpy(operands, GETOPERAND(in) + op->distance, op->length);
		operands[op->length] = '\0';
		op->distance = operands - GETOPERAND(out);
		operands += op->length + 1;
	}

	return out;
}