diff options
author | Alexander Korotkov <akorotkov@postgresql.org> | 2024-01-21 22:21:36 +0200 |
---|---|---|
committer | Alexander Korotkov <akorotkov@postgresql.org> | 2024-01-21 22:21:36 +0200 |
commit | 0452b461bc405e6d35d8a14c02813c15e28ae516 (patch) | |
tree | 87587d2a6e0bd44c705af98cf2f918c000940797 /src/backend/optimizer/path/pathkeys.c | |
parent | 7ab80ac1caf9f48064190802e1068ef89e2883c4 (diff) |
Explore alternative orderings of group-by pathkeys during optimization.
When evaluating a query with a multi-column GROUP BY clause, we can minimize
sort operations or avoid them if we synchronize the order of GROUP BY clauses
with the ORDER BY sort clause or sort order, which comes from the underlying
query tree. Grouping does not imply any ordering, so we can compare
the keys in arbitrary order, and a Hash Agg leverages this. But for Group Agg,
we simply compared keys in the order specified in the query. This commit
explores alternative ordering of the keys, trying to find a cheaper one.
The ordering of group keys may interact with other parts of the query, some of
which may not be known while planning the grouping. For example, there may be
an explicit ORDER BY clause or some other ordering-dependent operation higher up
in the query, and using the same ordering may allow using either incremental
sort or even eliminating the sort entirely.
The patch always keeps the ordering specified in the query, assuming the user
might have additional insights.
This introduces a new GUC enable_group_by_reordering so that the optimization
may be disabled if needed.
Discussion: https://postgr.es/m/7c79e6a5-8597-74e8-0671-1c39d124c9d6%40sigaev.ru
Author: Andrei Lepikhov, Teodor Sigaev
Reviewed-by: Tomas Vondra, Claudio Freire, Gavin Flower, Dmitry Dolgov
Reviewed-by: Robert Haas, Pavel Borisov, David Rowley, Zhihong Yu
Reviewed-by: Tom Lane, Alexander Korotkov, Richard Guo, Alena Rybakina
Diffstat (limited to 'src/backend/optimizer/path/pathkeys.c')
-rw-r--r-- | src/backend/optimizer/path/pathkeys.c | 252 |
1 files changed, 252 insertions, 0 deletions
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index ca94a31f71e..82ff31273b9 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -22,12 +22,15 @@ #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "nodes/plannodes.h" +#include "optimizer/cost.h" #include "optimizer/optimizer.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "partitioning/partbounds.h" #include "utils/lsyscache.h" +/* Consider reordering of GROUP BY keys? */ +bool enable_group_by_reordering = true; static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys); static bool matches_boolean_partition_clause(RestrictInfo *rinfo, @@ -351,6 +354,202 @@ pathkeys_contained_in(List *keys1, List *keys2) } /* + * group_keys_reorder_by_pathkeys + * Reorder GROUP BY keys to match the input pathkeys. + * + * Function returns new lists (pathkeys and clauses), original GROUP BY lists + * stay untouched. + * + * Returns the number of GROUP BY keys with a matching pathkey. + */ +static int +group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys, + List **group_clauses, + int num_groupby_pathkeys) +{ + List *new_group_pathkeys = NIL, + *new_group_clauses = NIL; + ListCell *lc; + int n; + + if (pathkeys == NIL || *group_pathkeys == NIL) + return 0; + + /* + * Walk the pathkeys (determining ordering of the input path) and see if + * there's a matching GROUP BY key. If we find one, we append it to the + * list, and do the same for the clauses. + * + * Once we find the first pathkey without a matching GROUP BY key, the + * rest of the pathkeys are useless and can't be used to evaluate the + * grouping, so we abort the loop and ignore the remaining pathkeys. + */ + foreach(lc, pathkeys) + { + PathKey *pathkey = (PathKey *) lfirst(lc); + SortGroupClause *sgc; + + /* + * Pathkeys are built in a way that allows simply comparing pointers. + * Give up if we can't find the matching pointer. Also give up if + * there is no sortclause reference for some reason. + */ + if (foreach_current_index(lc) >= num_groupby_pathkeys || + !list_member_ptr(*group_pathkeys, pathkey) || + pathkey->pk_eclass->ec_sortref == 0) + break; + + /* + * Since 1349d27 pathkey coming from underlying node can be in the + * root->group_pathkeys but not in the processed_groupClause. So, we + * should be careful here. + */ + sgc = get_sortgroupref_clause_noerr(pathkey->pk_eclass->ec_sortref, + *group_clauses); + if (!sgc) + /* The grouping clause does not cover this pathkey */ + break; + + /* + * Sort group clause should have an ordering operator as long as there + * is an associated pathkey. + */ + Assert(OidIsValid(sgc->sortop)); + + new_group_pathkeys = lappend(new_group_pathkeys, pathkey); + new_group_clauses = lappend(new_group_clauses, sgc); + } + + /* remember the number of pathkeys with a matching GROUP BY key */ + n = list_length(new_group_pathkeys); + + /* append the remaining group pathkeys (will be treated as not sorted) */ + *group_pathkeys = list_concat_unique_ptr(new_group_pathkeys, + *group_pathkeys); + *group_clauses = list_concat_unique_ptr(new_group_clauses, + *group_clauses); + + return n; +} + +/* + * pathkeys_are_duplicate + * Check if give pathkeys are already contained the list of + * PathKeyInfo's. + */ +static bool +pathkeys_are_duplicate(List *infos, List *pathkeys) +{ + ListCell *lc; + + foreach(lc, infos) + { + PathKeyInfo *info = lfirst_node(PathKeyInfo, lc); + + if (compare_pathkeys(pathkeys, info->pathkeys) == PATHKEYS_EQUAL) + return true; + } + return false; +} + +/* + * get_useful_group_keys_orderings + * Determine which orderings of GROUP BY keys are potentially interesting. + * + * Returns a list of PathKeyInfo items, each representing an interesting + * ordering of GROUP BY keys. Each item stores pathkeys and clauses in the + * matching order. + * + * The function considers (and keeps) multiple GROUP BY orderings: + * + * - the original ordering, as specified by the GROUP BY clause, + * - GROUP BY keys reordered to match 'path' ordering (as much as possible), + * - GROUP BY keys to match target ORDER BY clause (as much as possible). + */ +List * +get_useful_group_keys_orderings(PlannerInfo *root, Path *path) +{ + Query *parse = root->parse; + List *infos = NIL; + PathKeyInfo *info; + + List *pathkeys = root->group_pathkeys; + List *clauses = root->processed_groupClause; + + /* always return at least the original pathkeys/clauses */ + info = makeNode(PathKeyInfo); + info->pathkeys = pathkeys; + info->clauses = clauses; + infos = lappend(infos, info); + + /* + * Should we try generating alternative orderings of the group keys? If + * not, we produce only the order specified in the query, i.e. the + * optimization is effectively disabled. + */ + if (!enable_group_by_reordering) + return infos; + + /* + * Grouping sets have own and more complex logic to decide the ordering. + */ + if (parse->groupingSets) + return infos; + + /* + * If the path is sorted in some way, try reordering the group keys to + * match the path as much of the ordering as possible. Then thanks to + * incremental sort we would get this sort as cheap as possible. + */ + if (path->pathkeys && + !pathkeys_contained_in(path->pathkeys, root->group_pathkeys)) + { + int n; + + n = group_keys_reorder_by_pathkeys(path->pathkeys, &pathkeys, &clauses, + root->num_groupby_pathkeys); + + if (n > 0 && + (enable_incremental_sort || n == root->num_groupby_pathkeys) && + !pathkeys_are_duplicate(infos, pathkeys)) + { + info = makeNode(PathKeyInfo); + info->pathkeys = pathkeys; + info->clauses = clauses; + + infos = lappend(infos, info); + } + } + + /* + * Try reordering pathkeys to minimize the sort cost (this time consider + * the ORDER BY clause). + */ + if (root->sort_pathkeys && + !pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys)) + { + int n; + + n = group_keys_reorder_by_pathkeys(root->sort_pathkeys, &pathkeys, + &clauses, + root->num_groupby_pathkeys); + + if (n > 0 && + (enable_incremental_sort || n == list_length(root->sort_pathkeys)) && + !pathkeys_are_duplicate(infos, pathkeys)) + { + info = makeNode(PathKeyInfo); + info->pathkeys = pathkeys; + info->clauses = clauses; + + infos = lappend(infos, info); + } + } + + return infos; +} + +/* * pathkeys_count_contained_in * Same as pathkeys_contained_in, but also sets length of longest * common prefix of keys1 and keys2. @@ -1940,6 +2139,54 @@ pathkeys_useful_for_ordering(PlannerInfo *root, List *pathkeys) } /* + * pathkeys_useful_for_grouping + * Count the number of pathkeys that are useful for grouping (instead of + * explicit sort) + * + * Group pathkeys could be reordered to benefit from the ordering. The + * ordering may not be "complete" and may require incremental sort, but that's + * fine. So we simply count prefix pathkeys with a matching group key, and + * stop once we find the first pathkey without a match. + * + * So e.g. with pathkeys (a,b,c) and group keys (a,b,e) this determines (a,b) + * pathkeys are useful for grouping, and we might do incremental sort to get + * path ordered by (a,b,e). + * + * This logic is necessary to retain paths with ordering not matching grouping + * keys directly, without the reordering. + * + * Returns the length of pathkey prefix with matching group keys. + */ +static int +pathkeys_useful_for_grouping(PlannerInfo *root, List *pathkeys) +{ + ListCell *key; + int n = 0; + + /* no special ordering requested for grouping */ + if (root->group_pathkeys == NIL) + return 0; + + /* unordered path */ + if (pathkeys == NIL) + return 0; + + /* walk the pathkeys and search for matching group key */ + foreach(key, pathkeys) + { + PathKey *pathkey = (PathKey *) lfirst(key); + + /* no matching group key, we're done */ + if (!list_member_ptr(root->group_pathkeys, pathkey)) + break; + + n++; + } + + return n; +} + +/* * truncate_useless_pathkeys * Shorten the given pathkey list to just the useful pathkeys. */ @@ -1955,6 +2202,9 @@ truncate_useless_pathkeys(PlannerInfo *root, nuseful2 = pathkeys_useful_for_ordering(root, pathkeys); if (nuseful2 > nuseful) nuseful = nuseful2; + nuseful2 = pathkeys_useful_for_grouping(root, pathkeys); + if (nuseful2 > nuseful) + nuseful = nuseful2; /* * Note: not safe to modify input list destructively, but we can avoid @@ -1988,6 +2238,8 @@ has_useful_pathkeys(PlannerInfo *root, RelOptInfo *rel) { if (rel->joininfo != NIL || rel->has_eclass_joins) return true; /* might be able to use pathkeys for merging */ + if (root->group_pathkeys != NIL) + return true; /* might be able to use pathkeys for grouping */ if (root->query_pathkeys != NIL) return true; /* might be able to use them for ordering */ return false; /* definitely useless */ |