Pathify RHS unique-ification for semijoin planning

There are two implementation techniques for semijoins: one uses the JOIN_SEMI jointype, where the executor emits at most one matching row per left-hand side (LHS) row; the other unique-ifies the right-hand side (RHS) and then performs a plain inner join. The latter technique currently has some drawbacks related to the unique-ification step. * Only the cheapest-total path of the RHS is considered during unique-ification. This may cause us to miss some optimization opportunities; for example, a path with a better sort order might be overlooked simply because it is not the cheapest in total cost. Such a path could help avoid a sort at a higher level, potentially resulting in a cheaper overall plan. * We currently rely on heuristics to choose between hash-based and sort-based unique-ification. A better approach would be to generate paths for both methods and allow add_path() to decide which one is preferable, consistent with how path selection is handled elsewhere in the planner. * In the sort-based implementation, we currently pay no attention to the pathkeys of the input subpath or the resulting output. This can result in redundant sort nodes being added to the final plan. This patch improves semijoin planning by creating a new RelOptInfo for the RHS rel to represent its unique-ified version. It then generates multiple paths that represent elimination of distinct rows from the RHS, considering both a hash-based implementation using the cheapest total path of the original RHS rel, and sort-based implementations that either exploit presorted input paths or explicitly sort the cheapest total path. All resulting paths compete in add_path(), and those deemed worthy of consideration are added to the new RelOptInfo. Finally, the unique-ified rel is joined with the other side of the semijoin using a plain inner join. As a side effect, most of the code related to the JOIN_UNIQUE_OUTER and JOIN_UNIQUE_INNER jointypes -- used to indicate that the LHS or RHS path should be made unique -- has been removed. Besides, the T_Unique path now has the same meaning for both semijoins and upper DISTINCT clauses: it represents adjacent-duplicate removal on presorted input. This patch unifies their handling by sharing the same data structures and functions. This patch also removes the UNIQUE_PATH_NOOP related code along the way, as it is dead code -- if the RHS rel is provably unique, the semijoin should have already been simplified to a plain inner join by analyzejoins.c. Author: Richard Guo <guofenglinux@gmail.com> Reviewed-by: Alexandra Wang <alexandra.wang.oss@gmail.com> Reviewed-by: wenhui qiu <qiuwenhuifx@gmail.com> Discussion: https://postgr.es/m/CAMbWs4-EBnaRvEs7frTLbsXiweSTUXifsteF-d3rvv01FKO86w@mail.gmail.com
author: Richard Guo <rguo@postgresql.org> 2025-08-19 09:35:40 +0900
committer: Richard Guo <rguo@postgresql.org> 2025-08-19 09:35:40 +0900
commit: 24225ad9aafc576295e210026d8ffa9f50d61145 (patch)
tree: 3634e26824f97b86244458a5fdae827034ac6bdf /src/backend/optimizer/plan/createplan.c
parent: 3c07944d048c82ae67884c1f09e81f53f769ac2a (diff)
1 files changed, 24 insertions, 277 deletions
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 9fd5c31edf2..6791cbeb416 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -95,8 +95,6 @@ static Material *create_material_plan(PlannerInfo *root, MaterialPath *best_path
 									  int flags);
 static Memoize *create_memoize_plan(PlannerInfo *root, MemoizePath *best_path,
 									int flags);
-static Plan *create_unique_plan(PlannerInfo *root, UniquePath *best_path,
-								int flags);
 static Gather *create_gather_plan(PlannerInfo *root, GatherPath *best_path);
 static Plan *create_projection_plan(PlannerInfo *root,
 									ProjectionPath *best_path,
@@ -106,8 +104,7 @@ static Sort *create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags)
 static IncrementalSort *create_incrementalsort_plan(PlannerInfo *root,
 													IncrementalSortPath *best_path, int flags);
 static Group *create_group_plan(PlannerInfo *root, GroupPath *best_path);
-static Unique *create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path,
-										int flags);
+static Unique *create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags);
 static Agg *create_agg_plan(PlannerInfo *root, AggPath *best_path);
 static Plan *create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path);
 static Result *create_minmaxagg_plan(PlannerInfo *root, MinMaxAggPath *best_path);
@@ -296,9 +293,9 @@ static WindowAgg *make_windowagg(List *tlist, WindowClause *wc,
 static Group *make_group(List *tlist, List *qual, int numGroupCols,
 						 AttrNumber *grpColIdx, Oid *grpOperators, Oid *grpCollations,
 						 Plan *lefttree);
-static Unique *make_unique_from_sortclauses(Plan *lefttree, List *distinctList);
 static Unique *make_unique_from_pathkeys(Plan *lefttree,
-										 List *pathkeys, int numCols);
+										 List *pathkeys, int numCols,
+										 Relids relids);
 static Gather *make_gather(List *qptlist, List *qpqual,
 						   int nworkers, int rescan_param, bool single_copy, Plan *subplan);
 static SetOp *make_setop(SetOpCmd cmd, SetOpStrategy strategy,
@@ -470,19 +467,9 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags)
 												flags);
 			break;
 		case T_Unique:
-			if (IsA(best_path, UpperUniquePath))
-			{
-				plan = (Plan *) create_upper_unique_plan(root,
-														 (UpperUniquePath *) best_path,
-														 flags);
-			}
-			else
-			{
-				Assert(IsA(best_path, UniquePath));
-				plan = create_unique_plan(root,
-										  (UniquePath *) best_path,
-										  flags);
-			}
+			plan = (Plan *) create_unique_plan(root,
+											   (UniquePath *) best_path,
+											   flags);
 			break;
 		case T_Gather:
 			plan = (Plan *) create_gather_plan(root,
@@ -1765,207 +1752,6 @@ create_memoize_plan(PlannerInfo *root, MemoizePath *best_path, int flags)
 }
 
 /*
- * create_unique_plan
- *	  Create a Unique plan for 'best_path' and (recursively) plans
- *	  for its subpaths.
- *
- *	  Returns a Plan node.
- */
-static Plan *
-create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags)
-{
-	Plan	   *plan;
-	Plan	   *subplan;
-	List	   *in_operators;
-	List	   *uniq_exprs;
-	List	   *newtlist;
-	int			nextresno;
-	bool		newitems;
-	int			numGroupCols;
-	AttrNumber *groupColIdx;
-	Oid		   *groupCollations;
-	int			groupColPos;
-	ListCell   *l;
-
-	/* Unique doesn't project, so tlist requirements pass through */
-	subplan = create_plan_recurse(root, best_path->subpath, flags);
-
-	/* Done if we don't need to do any actual unique-ifying */
-	if (best_path->umethod == UNIQUE_PATH_NOOP)
-		return subplan;
-
-	/*
-	 * As constructed, the subplan has a "flat" tlist containing just the Vars
-	 * needed here and at upper levels.  The values we are supposed to
-	 * unique-ify may be expressions in these variables.  We have to add any
-	 * such expressions to the subplan's tlist.
-	 *
-	 * The subplan may have a "physical" tlist if it is a simple scan plan. If
-	 * we're going to sort, this should be reduced to the regular tlist, so
-	 * that we don't sort more data than we need to.  For hashing, the tlist
-	 * should be left as-is if we don't need to add any expressions; but if we
-	 * do have to add expressions, then a projection step will be needed at
-	 * runtime anyway, so we may as well remove unneeded items. Therefore
-	 * newtlist starts from build_path_tlist() not just a copy of the
-	 * subplan's tlist; and we don't install it into the subplan unless we are
-	 * sorting or stuff has to be added.
-	 */
-	in_operators = best_path->in_operators;
-	uniq_exprs = best_path->uniq_exprs;
-
-	/* initialize modified subplan tlist as just the "required" vars */
-	newtlist = build_path_tlist(root, &best_path->path);
-	nextresno = list_length(newtlist) + 1;
-	newitems = false;
-
-	foreach(l, uniq_exprs)
-	{
-		Expr	   *uniqexpr = lfirst(l);
-		TargetEntry *tle;
-
-		tle = tlist_member(uniqexpr, newtlist);
-		if (!tle)
-		{
-			tle = makeTargetEntry((Expr *) uniqexpr,
-								  nextresno,
-								  NULL,
-								  false);
-			newtlist = lappend(newtlist, tle);
-			nextresno++;
-			newitems = true;
-		}
-	}
-
-	/* Use change_plan_targetlist in case we need to insert a Result node */
-	if (newitems || best_path->umethod == UNIQUE_PATH_SORT)
-		subplan = change_plan_targetlist(subplan, newtlist,
-										 best_path->path.parallel_safe);
-
-	/*
-	 * Build control information showing which subplan output columns are to
-	 * be examined by the grouping step.  Unfortunately we can't merge this
-	 * with the previous loop, since we didn't then know which version of the
-	 * subplan tlist we'd end up using.
-	 */
-	newtlist = subplan->targetlist;
-	numGroupCols = list_length(uniq_exprs);
-	groupColIdx = (AttrNumber *) palloc(numGroupCols * sizeof(AttrNumber));
-	groupCollations = (Oid *) palloc(numGroupCols * sizeof(Oid));
-
-	groupColPos = 0;
-	foreach(l, uniq_exprs)
-	{
-		Expr	   *uniqexpr = lfirst(l);
-		TargetEntry *tle;
-
-		tle = tlist_member(uniqexpr, newtlist);
-		if (!tle)				/* shouldn't happen */
-			elog(ERROR, "failed to find unique expression in subplan tlist");
-		groupColIdx[groupColPos] = tle->resno;
-		groupCollations[groupColPos] = exprCollation((Node *) tle->expr);
-		groupColPos++;
-	}
-
-	if (best_path->umethod == UNIQUE_PATH_HASH)
-	{
-		Oid		   *groupOperators;
-
-		/*
-		 * Get the hashable equality operators for the Agg node to use.
-		 * Normally these are the same as the IN clause operators, but if
-		 * those are cross-type operators then the equality operators are the
-		 * ones for the IN clause operators' RHS datatype.
-		 */
-		groupOperators = (Oid *) palloc(numGroupCols * sizeof(Oid));
-		groupColPos = 0;
-		foreach(l, in_operators)
-		{
-			Oid			in_oper = lfirst_oid(l);
-			Oid			eq_oper;
-
-			if (!get_compatible_hash_operators(in_oper, NULL, &eq_oper))
-				elog(ERROR, "could not find compatible hash operator for operator %u",
-					 in_oper);
-			groupOperators[groupColPos++] = eq_oper;
-		}
-
-		/*
-		 * Since the Agg node is going to project anyway, we can give it the
-		 * minimum output tlist, without any stuff we might have added to the
-		 * subplan tlist.
-		 */
-		plan = (Plan *) make_agg(build_path_tlist(root, &best_path->path),
-								 NIL,
-								 AGG_HASHED,
-								 AGGSPLIT_SIMPLE,
-								 numGroupCols,
-								 groupColIdx,
-								 groupOperators,
-								 groupCollations,
-								 NIL,
-								 NIL,
-								 best_path->path.rows,
-								 0,
-								 subplan);
-	}
-	else
-	{
-		List	   *sortList = NIL;
-		Sort	   *sort;
-
-		/* Create an ORDER BY list to sort the input compatibly */
-		groupColPos = 0;
-		foreach(l, in_operators)
-		{
-			Oid			in_oper = lfirst_oid(l);
-			Oid			sortop;
-			Oid			eqop;
-			TargetEntry *tle;
-			SortGroupClause *sortcl;
-
-			sortop = get_ordering_op_for_equality_op(in_oper, false);
-			if (!OidIsValid(sortop))	/* shouldn't happen */
-				elog(ERROR, "could not find ordering operator for equality operator %u",
-					 in_oper);
-
-			/*
-			 * The Unique node will need equality operators.  Normally these
-			 * are the same as the IN clause operators, but if those are
-			 * cross-type operators then the equality operators are the ones
-			 * for the IN clause operators' RHS datatype.
-			 */
-			eqop = get_equality_op_for_ordering_op(sortop, NULL);
-			if (!OidIsValid(eqop))	/* shouldn't happen */
-				elog(ERROR, "could not find equality operator for ordering operator %u",
-					 sortop);
-
-			tle = get_tle_by_resno(subplan->targetlist,
-								   groupColIdx[groupColPos]);
-			Assert(tle != NULL);
-
-			sortcl = makeNode(SortGroupClause);
-			sortcl->tleSortGroupRef = assignSortGroupRef(tle,
-														 subplan->targetlist);
-			sortcl->eqop = eqop;
-			sortcl->sortop = sortop;
-			sortcl->reverse_sort = false;
-			sortcl->nulls_first = false;
-			sortcl->hashable = false;	/* no need to make this accurate */
-			sortList = lappend(sortList, sortcl);
-			groupColPos++;
-		}
-		sort = make_sort_from_sortclauses(sortList, subplan);
-		label_sort_with_costsize(root, sort, -1.0);
-		plan = (Plan *) make_unique_from_sortclauses((Plan *) sort, sortList);
-	}
-
-	/* Copy cost data from Path to Plan */
-	copy_generic_path_info(plan, &best_path->path);
-
-	return plan;
-}
-
-/*
  * create_gather_plan
  *
  *	  Create a Gather plan for 'best_path' and (recursively) plans
@@ -2322,13 +2108,13 @@ create_group_plan(PlannerInfo *root, GroupPath *best_path)
 }
 
 /*
- * create_upper_unique_plan
+ * create_unique_plan
  *
  *	  Create a Unique plan for 'best_path' and (recursively) plans
  *	  for its subpaths.
  */
 static Unique *
-create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path, int flags)
+create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags)
 {
 	Unique	   *plan;
 	Plan	   *subplan;
@@ -2340,9 +2126,17 @@ create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path, int flag
 	subplan = create_plan_recurse(root, best_path->subpath,
 								  flags | CP_LABEL_TLIST);
 
+	/*
+	 * make_unique_from_pathkeys calls find_ec_member_matching_expr, which
+	 * will ignore any child EC members that don't belong to the given relids.
+	 * Thus, if this unique path is based on a child relation, we must pass
+	 * its relids.
+	 */
 	plan = make_unique_from_pathkeys(subplan,
 									 best_path->path.pathkeys,
-									 best_path->numkeys);
+									 best_path->numkeys,
+									 IS_OTHER_REL(best_path->path.parent) ?
+									 best_path->path.parent->relids : NULL);
 
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
 
@@ -6880,61 +6674,14 @@ make_group(List *tlist,
 }
 
 /*
- * distinctList is a list of SortGroupClauses, identifying the targetlist items
- * that should be considered by the Unique filter.  The input path must
- * already be sorted accordingly.
- */
-static Unique *
-make_unique_from_sortclauses(Plan *lefttree, List *distinctList)
-{
-	Unique	   *node = makeNode(Unique);
-	Plan	   *plan = &node->plan;
-	int			numCols = list_length(distinctList);
-	int			keyno = 0;
-	AttrNumber *uniqColIdx;
-	Oid		   *uniqOperators;
-	Oid		   *uniqCollations;
-	ListCell   *slitem;
-
-	plan->targetlist = lefttree->targetlist;
-	plan->qual = NIL;
-	plan->lefttree = lefttree;
-	plan->righttree = NULL;
-
-	/*
-	 * convert SortGroupClause list into arrays of attr indexes and equality
-	 * operators, as wanted by executor
-	 */
-	Assert(numCols > 0);
-	uniqColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols);
-	uniqOperators = (Oid *) palloc(sizeof(Oid) * numCols);
-	uniqCollations = (Oid *) palloc(sizeof(Oid) * numCols);
-
-	foreach(slitem, distinctList)
-	{
-		SortGroupClause *sortcl = (SortGroupClause *) lfirst(slitem);
-		TargetEntry *tle = get_sortgroupclause_tle(sortcl, plan->targetlist);
-
-		uniqColIdx[keyno] = tle->resno;
-		uniqOperators[keyno] = sortcl->eqop;
-		uniqCollations[keyno] = exprCollation((Node *) tle->expr);
-		Assert(OidIsValid(uniqOperators[keyno]));
-		keyno++;
-	}
-
-	node->numCols = numCols;
-	node->uniqColIdx = uniqColIdx;
-	node->uniqOperators = uniqOperators;
-	node->uniqCollations = uniqCollations;
-
-	return node;
-}
-
-/*
- * as above, but use pathkeys to identify the sort columns and semantics
+ * pathkeys is a list of PathKeys, identifying the sort columns and semantics.
+ * The input plan must already be sorted accordingly.
+ *
+ * relids identifies the child relation being unique-ified, if any.
  */
 static Unique *
-make_unique_from_pathkeys(Plan *lefttree, List *pathkeys, int numCols)
+make_unique_from_pathkeys(Plan *lefttree, List *pathkeys, int numCols,
+						  Relids relids)
 {
 	Unique	   *node = makeNode(Unique);
 	Plan	   *plan = &node->plan;
@@ -6997,7 +6744,7 @@ make_unique_from_pathkeys(Plan *lefttree, List *pathkeys, int numCols)
 			foreach(j, plan->targetlist)
 			{
 				tle = (TargetEntry *) lfirst(j);
-				em = find_ec_member_matching_expr(ec, tle->expr, NULL);
+				em = find_ec_member_matching_expr(ec, tle->expr, relids);
 				if (em)
 				{
 					/* found expr already in tlist */
author	Richard Guo <rguo@postgresql.org>	2025-08-19 09:35:40 +0900
committer	Richard Guo <rguo@postgresql.org>	2025-08-19 09:35:40 +0900
commit	24225ad9aafc576295e210026d8ffa9f50d61145 (patch)
tree	3634e26824f97b86244458a5fdae827034ac6bdf /src/backend/optimizer/plan/createplan.c
parent	3c07944d048c82ae67884c1f09e81f53f769ac2a (diff)