diff options
author | Richard Guo <rguo@postgresql.org> | 2025-08-19 09:35:40 +0900 |
---|---|---|
committer | Richard Guo <rguo@postgresql.org> | 2025-08-19 09:35:40 +0900 |
commit | 24225ad9aafc576295e210026d8ffa9f50d61145 (patch) | |
tree | 3634e26824f97b86244458a5fdae827034ac6bdf /src/backend/optimizer/plan/createplan.c | |
parent | 3c07944d048c82ae67884c1f09e81f53f769ac2a (diff) |
Pathify RHS unique-ification for semijoin planning
There are two implementation techniques for semijoins: one uses the
JOIN_SEMI jointype, where the executor emits at most one matching row
per left-hand side (LHS) row; the other unique-ifies the right-hand
side (RHS) and then performs a plain inner join.
The latter technique currently has some drawbacks related to the
unique-ification step.
* Only the cheapest-total path of the RHS is considered during
unique-ification. This may cause us to miss some optimization
opportunities; for example, a path with a better sort order might be
overlooked simply because it is not the cheapest in total cost. Such
a path could help avoid a sort at a higher level, potentially
resulting in a cheaper overall plan.
* We currently rely on heuristics to choose between hash-based and
sort-based unique-ification. A better approach would be to generate
paths for both methods and allow add_path() to decide which one is
preferable, consistent with how path selection is handled elsewhere in
the planner.
* In the sort-based implementation, we currently pay no attention to
the pathkeys of the input subpath or the resulting output. This can
result in redundant sort nodes being added to the final plan.
This patch improves semijoin planning by creating a new RelOptInfo for
the RHS rel to represent its unique-ified version. It then generates
multiple paths that represent elimination of distinct rows from the
RHS, considering both a hash-based implementation using the cheapest
total path of the original RHS rel, and sort-based implementations
that either exploit presorted input paths or explicitly sort the
cheapest total path. All resulting paths compete in add_path(), and
those deemed worthy of consideration are added to the new RelOptInfo.
Finally, the unique-ified rel is joined with the other side of the
semijoin using a plain inner join.
As a side effect, most of the code related to the JOIN_UNIQUE_OUTER
and JOIN_UNIQUE_INNER jointypes -- used to indicate that the LHS or
RHS path should be made unique -- has been removed. Besides, the
T_Unique path now has the same meaning for both semijoins and upper
DISTINCT clauses: it represents adjacent-duplicate removal on
presorted input. This patch unifies their handling by sharing the
same data structures and functions.
This patch also removes the UNIQUE_PATH_NOOP related code along the
way, as it is dead code -- if the RHS rel is provably unique, the
semijoin should have already been simplified to a plain inner join by
analyzejoins.c.
Author: Richard Guo <guofenglinux@gmail.com>
Reviewed-by: Alexandra Wang <alexandra.wang.oss@gmail.com>
Reviewed-by: wenhui qiu <qiuwenhuifx@gmail.com>
Discussion: https://postgr.es/m/CAMbWs4-EBnaRvEs7frTLbsXiweSTUXifsteF-d3rvv01FKO86w@mail.gmail.com
Diffstat (limited to 'src/backend/optimizer/plan/createplan.c')
-rw-r--r-- | src/backend/optimizer/plan/createplan.c | 301 |
1 files changed, 24 insertions, 277 deletions
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 9fd5c31edf2..6791cbeb416 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -95,8 +95,6 @@ static Material *create_material_plan(PlannerInfo *root, MaterialPath *best_path int flags); static Memoize *create_memoize_plan(PlannerInfo *root, MemoizePath *best_path, int flags); -static Plan *create_unique_plan(PlannerInfo *root, UniquePath *best_path, - int flags); static Gather *create_gather_plan(PlannerInfo *root, GatherPath *best_path); static Plan *create_projection_plan(PlannerInfo *root, ProjectionPath *best_path, @@ -106,8 +104,7 @@ static Sort *create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags) static IncrementalSort *create_incrementalsort_plan(PlannerInfo *root, IncrementalSortPath *best_path, int flags); static Group *create_group_plan(PlannerInfo *root, GroupPath *best_path); -static Unique *create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path, - int flags); +static Unique *create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags); static Agg *create_agg_plan(PlannerInfo *root, AggPath *best_path); static Plan *create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path); static Result *create_minmaxagg_plan(PlannerInfo *root, MinMaxAggPath *best_path); @@ -296,9 +293,9 @@ static WindowAgg *make_windowagg(List *tlist, WindowClause *wc, static Group *make_group(List *tlist, List *qual, int numGroupCols, AttrNumber *grpColIdx, Oid *grpOperators, Oid *grpCollations, Plan *lefttree); -static Unique *make_unique_from_sortclauses(Plan *lefttree, List *distinctList); static Unique *make_unique_from_pathkeys(Plan *lefttree, - List *pathkeys, int numCols); + List *pathkeys, int numCols, + Relids relids); static Gather *make_gather(List *qptlist, List *qpqual, int nworkers, int rescan_param, bool single_copy, Plan *subplan); static SetOp *make_setop(SetOpCmd cmd, SetOpStrategy strategy, @@ -470,19 +467,9 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags) flags); break; case T_Unique: - if (IsA(best_path, UpperUniquePath)) - { - plan = (Plan *) create_upper_unique_plan(root, - (UpperUniquePath *) best_path, - flags); - } - else - { - Assert(IsA(best_path, UniquePath)); - plan = create_unique_plan(root, - (UniquePath *) best_path, - flags); - } + plan = (Plan *) create_unique_plan(root, + (UniquePath *) best_path, + flags); break; case T_Gather: plan = (Plan *) create_gather_plan(root, @@ -1765,207 +1752,6 @@ create_memoize_plan(PlannerInfo *root, MemoizePath *best_path, int flags) } /* - * create_unique_plan - * Create a Unique plan for 'best_path' and (recursively) plans - * for its subpaths. - * - * Returns a Plan node. - */ -static Plan * -create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags) -{ - Plan *plan; - Plan *subplan; - List *in_operators; - List *uniq_exprs; - List *newtlist; - int nextresno; - bool newitems; - int numGroupCols; - AttrNumber *groupColIdx; - Oid *groupCollations; - int groupColPos; - ListCell *l; - - /* Unique doesn't project, so tlist requirements pass through */ - subplan = create_plan_recurse(root, best_path->subpath, flags); - - /* Done if we don't need to do any actual unique-ifying */ - if (best_path->umethod == UNIQUE_PATH_NOOP) - return subplan; - - /* - * As constructed, the subplan has a "flat" tlist containing just the Vars - * needed here and at upper levels. The values we are supposed to - * unique-ify may be expressions in these variables. We have to add any - * such expressions to the subplan's tlist. - * - * The subplan may have a "physical" tlist if it is a simple scan plan. If - * we're going to sort, this should be reduced to the regular tlist, so - * that we don't sort more data than we need to. For hashing, the tlist - * should be left as-is if we don't need to add any expressions; but if we - * do have to add expressions, then a projection step will be needed at - * runtime anyway, so we may as well remove unneeded items. Therefore - * newtlist starts from build_path_tlist() not just a copy of the - * subplan's tlist; and we don't install it into the subplan unless we are - * sorting or stuff has to be added. - */ - in_operators = best_path->in_operators; - uniq_exprs = best_path->uniq_exprs; - - /* initialize modified subplan tlist as just the "required" vars */ - newtlist = build_path_tlist(root, &best_path->path); - nextresno = list_length(newtlist) + 1; - newitems = false; - - foreach(l, uniq_exprs) - { - Expr *uniqexpr = lfirst(l); - TargetEntry *tle; - - tle = tlist_member(uniqexpr, newtlist); - if (!tle) - { - tle = makeTargetEntry((Expr *) uniqexpr, - nextresno, - NULL, - false); - newtlist = lappend(newtlist, tle); - nextresno++; - newitems = true; - } - } - - /* Use change_plan_targetlist in case we need to insert a Result node */ - if (newitems || best_path->umethod == UNIQUE_PATH_SORT) - subplan = change_plan_targetlist(subplan, newtlist, - best_path->path.parallel_safe); - - /* - * Build control information showing which subplan output columns are to - * be examined by the grouping step. Unfortunately we can't merge this - * with the previous loop, since we didn't then know which version of the - * subplan tlist we'd end up using. - */ - newtlist = subplan->targetlist; - numGroupCols = list_length(uniq_exprs); - groupColIdx = (AttrNumber *) palloc(numGroupCols * sizeof(AttrNumber)); - groupCollations = (Oid *) palloc(numGroupCols * sizeof(Oid)); - - groupColPos = 0; - foreach(l, uniq_exprs) - { - Expr *uniqexpr = lfirst(l); - TargetEntry *tle; - - tle = tlist_member(uniqexpr, newtlist); - if (!tle) /* shouldn't happen */ - elog(ERROR, "failed to find unique expression in subplan tlist"); - groupColIdx[groupColPos] = tle->resno; - groupCollations[groupColPos] = exprCollation((Node *) tle->expr); - groupColPos++; - } - - if (best_path->umethod == UNIQUE_PATH_HASH) - { - Oid *groupOperators; - - /* - * Get the hashable equality operators for the Agg node to use. - * Normally these are the same as the IN clause operators, but if - * those are cross-type operators then the equality operators are the - * ones for the IN clause operators' RHS datatype. - */ - groupOperators = (Oid *) palloc(numGroupCols * sizeof(Oid)); - groupColPos = 0; - foreach(l, in_operators) - { - Oid in_oper = lfirst_oid(l); - Oid eq_oper; - - if (!get_compatible_hash_operators(in_oper, NULL, &eq_oper)) - elog(ERROR, "could not find compatible hash operator for operator %u", - in_oper); - groupOperators[groupColPos++] = eq_oper; - } - - /* - * Since the Agg node is going to project anyway, we can give it the - * minimum output tlist, without any stuff we might have added to the - * subplan tlist. - */ - plan = (Plan *) make_agg(build_path_tlist(root, &best_path->path), - NIL, - AGG_HASHED, - AGGSPLIT_SIMPLE, - numGroupCols, - groupColIdx, - groupOperators, - groupCollations, - NIL, - NIL, - best_path->path.rows, - 0, - subplan); - } - else - { - List *sortList = NIL; - Sort *sort; - - /* Create an ORDER BY list to sort the input compatibly */ - groupColPos = 0; - foreach(l, in_operators) - { - Oid in_oper = lfirst_oid(l); - Oid sortop; - Oid eqop; - TargetEntry *tle; - SortGroupClause *sortcl; - - sortop = get_ordering_op_for_equality_op(in_oper, false); - if (!OidIsValid(sortop)) /* shouldn't happen */ - elog(ERROR, "could not find ordering operator for equality operator %u", - in_oper); - - /* - * The Unique node will need equality operators. Normally these - * are the same as the IN clause operators, but if those are - * cross-type operators then the equality operators are the ones - * for the IN clause operators' RHS datatype. - */ - eqop = get_equality_op_for_ordering_op(sortop, NULL); - if (!OidIsValid(eqop)) /* shouldn't happen */ - elog(ERROR, "could not find equality operator for ordering operator %u", - sortop); - - tle = get_tle_by_resno(subplan->targetlist, - groupColIdx[groupColPos]); - Assert(tle != NULL); - - sortcl = makeNode(SortGroupClause); - sortcl->tleSortGroupRef = assignSortGroupRef(tle, - subplan->targetlist); - sortcl->eqop = eqop; - sortcl->sortop = sortop; - sortcl->reverse_sort = false; - sortcl->nulls_first = false; - sortcl->hashable = false; /* no need to make this accurate */ - sortList = lappend(sortList, sortcl); - groupColPos++; - } - sort = make_sort_from_sortclauses(sortList, subplan); - label_sort_with_costsize(root, sort, -1.0); - plan = (Plan *) make_unique_from_sortclauses((Plan *) sort, sortList); - } - - /* Copy cost data from Path to Plan */ - copy_generic_path_info(plan, &best_path->path); - - return plan; -} - -/* * create_gather_plan * * Create a Gather plan for 'best_path' and (recursively) plans @@ -2322,13 +2108,13 @@ create_group_plan(PlannerInfo *root, GroupPath *best_path) } /* - * create_upper_unique_plan + * create_unique_plan * * Create a Unique plan for 'best_path' and (recursively) plans * for its subpaths. */ static Unique * -create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path, int flags) +create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags) { Unique *plan; Plan *subplan; @@ -2340,9 +2126,17 @@ create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path, int flag subplan = create_plan_recurse(root, best_path->subpath, flags | CP_LABEL_TLIST); + /* + * make_unique_from_pathkeys calls find_ec_member_matching_expr, which + * will ignore any child EC members that don't belong to the given relids. + * Thus, if this unique path is based on a child relation, we must pass + * its relids. + */ plan = make_unique_from_pathkeys(subplan, best_path->path.pathkeys, - best_path->numkeys); + best_path->numkeys, + IS_OTHER_REL(best_path->path.parent) ? + best_path->path.parent->relids : NULL); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -6880,61 +6674,14 @@ make_group(List *tlist, } /* - * distinctList is a list of SortGroupClauses, identifying the targetlist items - * that should be considered by the Unique filter. The input path must - * already be sorted accordingly. - */ -static Unique * -make_unique_from_sortclauses(Plan *lefttree, List *distinctList) -{ - Unique *node = makeNode(Unique); - Plan *plan = &node->plan; - int numCols = list_length(distinctList); - int keyno = 0; - AttrNumber *uniqColIdx; - Oid *uniqOperators; - Oid *uniqCollations; - ListCell *slitem; - - plan->targetlist = lefttree->targetlist; - plan->qual = NIL; - plan->lefttree = lefttree; - plan->righttree = NULL; - - /* - * convert SortGroupClause list into arrays of attr indexes and equality - * operators, as wanted by executor - */ - Assert(numCols > 0); - uniqColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols); - uniqOperators = (Oid *) palloc(sizeof(Oid) * numCols); - uniqCollations = (Oid *) palloc(sizeof(Oid) * numCols); - - foreach(slitem, distinctList) - { - SortGroupClause *sortcl = (SortGroupClause *) lfirst(slitem); - TargetEntry *tle = get_sortgroupclause_tle(sortcl, plan->targetlist); - - uniqColIdx[keyno] = tle->resno; - uniqOperators[keyno] = sortcl->eqop; - uniqCollations[keyno] = exprCollation((Node *) tle->expr); - Assert(OidIsValid(uniqOperators[keyno])); - keyno++; - } - - node->numCols = numCols; - node->uniqColIdx = uniqColIdx; - node->uniqOperators = uniqOperators; - node->uniqCollations = uniqCollations; - - return node; -} - -/* - * as above, but use pathkeys to identify the sort columns and semantics + * pathkeys is a list of PathKeys, identifying the sort columns and semantics. + * The input plan must already be sorted accordingly. + * + * relids identifies the child relation being unique-ified, if any. */ static Unique * -make_unique_from_pathkeys(Plan *lefttree, List *pathkeys, int numCols) +make_unique_from_pathkeys(Plan *lefttree, List *pathkeys, int numCols, + Relids relids) { Unique *node = makeNode(Unique); Plan *plan = &node->plan; @@ -6997,7 +6744,7 @@ make_unique_from_pathkeys(Plan *lefttree, List *pathkeys, int numCols) foreach(j, plan->targetlist) { tle = (TargetEntry *) lfirst(j); - em = find_ec_member_matching_expr(ec, tle->expr, NULL); + em = find_ec_member_matching_expr(ec, tle->expr, relids); if (em) { /* found expr already in tlist */ |