Explore alternative orderings of group-by pathkeys during optimization.

When evaluating a query with a multi-column GROUP BY clause, we can minimize sort operations or avoid them if we synchronize the order of GROUP BY clauses with the ORDER BY sort clause or sort order, which comes from the underlying query tree. Grouping does not imply any ordering, so we can compare the keys in arbitrary order, and a Hash Agg leverages this. But for Group Agg, we simply compared keys in the order specified in the query. This commit explores alternative ordering of the keys, trying to find a cheaper one. The ordering of group keys may interact with other parts of the query, some of which may not be known while planning the grouping. For example, there may be an explicit ORDER BY clause or some other ordering-dependent operation higher up in the query, and using the same ordering may allow using either incremental sort or even eliminating the sort entirely. The patch always keeps the ordering specified in the query, assuming the user might have additional insights. This introduces a new GUC enable_group_by_reordering so that the optimization may be disabled if needed. Discussion: https://postgr.es/m/7c79e6a5-8597-74e8-0671-1c39d124c9d6%40sigaev.ru Author: Andrei Lepikhov, Teodor Sigaev Reviewed-by: Tomas Vondra, Claudio Freire, Gavin Flower, Dmitry Dolgov Reviewed-by: Robert Haas, Pavel Borisov, David Rowley, Zhihong Yu Reviewed-by: Tom Lane, Alexander Korotkov, Richard Guo, Alena Rybakina
author: Alexander Korotkov <akorotkov@postgresql.org> 2024-01-21 22:21:36 +0200
committer: Alexander Korotkov <akorotkov@postgresql.org> 2024-01-21 22:21:36 +0200
commit: 0452b461bc405e6d35d8a14c02813c15e28ae516 (patch)
tree: 87587d2a6e0bd44c705af98cf2f918c000940797 /src/backend/optimizer/path/pathkeys.c
parent: 7ab80ac1caf9f48064190802e1068ef89e2883c4 (diff)
1 files changed, 252 insertions, 0 deletions
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index ca94a31f71e..82ff31273b9 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -22,12 +22,15 @@
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
 #include "nodes/plannodes.h"
+#include "optimizer/cost.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
 #include "partitioning/partbounds.h"
 #include "utils/lsyscache.h"
 
+/* Consider reordering of GROUP BY keys? */
+bool		enable_group_by_reordering = true;
 
 static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys);
 static bool matches_boolean_partition_clause(RestrictInfo *rinfo,
@@ -351,6 +354,202 @@ pathkeys_contained_in(List *keys1, List *keys2)
 }
 
 /*
+ * group_keys_reorder_by_pathkeys
+ *		Reorder GROUP BY keys to match the input pathkeys.
+ *
+ * Function returns new lists (pathkeys and clauses), original GROUP BY lists
+ * stay untouched.
+ *
+ * Returns the number of GROUP BY keys with a matching pathkey.
+ */
+static int
+group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
+							   List **group_clauses,
+							   int num_groupby_pathkeys)
+{
+	List	   *new_group_pathkeys = NIL,
+			   *new_group_clauses = NIL;
+	ListCell   *lc;
+	int			n;
+
+	if (pathkeys == NIL || *group_pathkeys == NIL)
+		return 0;
+
+	/*
+	 * Walk the pathkeys (determining ordering of the input path) and see if
+	 * there's a matching GROUP BY key. If we find one, we append it to the
+	 * list, and do the same for the clauses.
+	 *
+	 * Once we find the first pathkey without a matching GROUP BY key, the
+	 * rest of the pathkeys are useless and can't be used to evaluate the
+	 * grouping, so we abort the loop and ignore the remaining pathkeys.
+	 */
+	foreach(lc, pathkeys)
+	{
+		PathKey    *pathkey = (PathKey *) lfirst(lc);
+		SortGroupClause *sgc;
+
+		/*
+		 * Pathkeys are built in a way that allows simply comparing pointers.
+		 * Give up if we can't find the matching pointer.  Also give up if
+		 * there is no sortclause reference for some reason.
+		 */
+		if (foreach_current_index(lc) >= num_groupby_pathkeys ||
+			!list_member_ptr(*group_pathkeys, pathkey) ||
+			pathkey->pk_eclass->ec_sortref == 0)
+			break;
+
+		/*
+		 * Since 1349d27 pathkey coming from underlying node can be in the
+		 * root->group_pathkeys but not in the processed_groupClause. So, we
+		 * should be careful here.
+		 */
+		sgc = get_sortgroupref_clause_noerr(pathkey->pk_eclass->ec_sortref,
+											*group_clauses);
+		if (!sgc)
+			/* The grouping clause does not cover this pathkey */
+			break;
+
+		/*
+		 * Sort group clause should have an ordering operator as long as there
+		 * is an associated pathkey.
+		 */
+		Assert(OidIsValid(sgc->sortop));
+
+		new_group_pathkeys = lappend(new_group_pathkeys, pathkey);
+		new_group_clauses = lappend(new_group_clauses, sgc);
+	}
+
+	/* remember the number of pathkeys with a matching GROUP BY key */
+	n = list_length(new_group_pathkeys);
+
+	/* append the remaining group pathkeys (will be treated as not sorted) */
+	*group_pathkeys = list_concat_unique_ptr(new_group_pathkeys,
+											 *group_pathkeys);
+	*group_clauses = list_concat_unique_ptr(new_group_clauses,
+											*group_clauses);
+
+	return n;
+}
+
+/*
+ * pathkeys_are_duplicate
+ *		Check if give pathkeys are already contained the list of
+ *		PathKeyInfo's.
+ */
+static bool
+pathkeys_are_duplicate(List *infos, List *pathkeys)
+{
+	ListCell   *lc;
+
+	foreach(lc, infos)
+	{
+		PathKeyInfo *info = lfirst_node(PathKeyInfo, lc);
+
+		if (compare_pathkeys(pathkeys, info->pathkeys) == PATHKEYS_EQUAL)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * get_useful_group_keys_orderings
+ *		Determine which orderings of GROUP BY keys are potentially interesting.
+ *
+ * Returns a list of PathKeyInfo items, each representing an interesting
+ * ordering of GROUP BY keys.  Each item stores pathkeys and clauses in the
+ * matching order.
+ *
+ * The function considers (and keeps) multiple GROUP BY orderings:
+ *
+ * - the original ordering, as specified by the GROUP BY clause,
+ * - GROUP BY keys reordered to match 'path' ordering (as much as possible),
+ * - GROUP BY keys to match target ORDER BY clause (as much as possible).
+ */
+List *
+get_useful_group_keys_orderings(PlannerInfo *root, Path *path)
+{
+	Query	   *parse = root->parse;
+	List	   *infos = NIL;
+	PathKeyInfo *info;
+
+	List	   *pathkeys = root->group_pathkeys;
+	List	   *clauses = root->processed_groupClause;
+
+	/* always return at least the original pathkeys/clauses */
+	info = makeNode(PathKeyInfo);
+	info->pathkeys = pathkeys;
+	info->clauses = clauses;
+	infos = lappend(infos, info);
+
+	/*
+	 * Should we try generating alternative orderings of the group keys? If
+	 * not, we produce only the order specified in the query, i.e. the
+	 * optimization is effectively disabled.
+	 */
+	if (!enable_group_by_reordering)
+		return infos;
+
+	/*
+	 * Grouping sets have own and more complex logic to decide the ordering.
+	 */
+	if (parse->groupingSets)
+		return infos;
+
+	/*
+	 * If the path is sorted in some way, try reordering the group keys to
+	 * match the path as much of the ordering as possible.  Then thanks to
+	 * incremental sort we would get this sort as cheap as possible.
+	 */
+	if (path->pathkeys &&
+		!pathkeys_contained_in(path->pathkeys, root->group_pathkeys))
+	{
+		int			n;
+
+		n = group_keys_reorder_by_pathkeys(path->pathkeys, &pathkeys, &clauses,
+										   root->num_groupby_pathkeys);
+
+		if (n > 0 &&
+			(enable_incremental_sort || n == root->num_groupby_pathkeys) &&
+			!pathkeys_are_duplicate(infos, pathkeys))
+		{
+			info = makeNode(PathKeyInfo);
+			info->pathkeys = pathkeys;
+			info->clauses = clauses;
+
+			infos = lappend(infos, info);
+		}
+	}
+
+	/*
+	 * Try reordering pathkeys to minimize the sort cost (this time consider
+	 * the ORDER BY clause).
+	 */
+	if (root->sort_pathkeys &&
+		!pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys))
+	{
+		int			n;
+
+		n = group_keys_reorder_by_pathkeys(root->sort_pathkeys, &pathkeys,
+										   &clauses,
+										   root->num_groupby_pathkeys);
+
+		if (n > 0 &&
+			(enable_incremental_sort || n == list_length(root->sort_pathkeys)) &&
+			!pathkeys_are_duplicate(infos, pathkeys))
+		{
+			info = makeNode(PathKeyInfo);
+			info->pathkeys = pathkeys;
+			info->clauses = clauses;
+
+			infos = lappend(infos, info);
+		}
+	}
+
+	return infos;
+}
+
+/*
  * pathkeys_count_contained_in
  *    Same as pathkeys_contained_in, but also sets length of longest
  *    common prefix of keys1 and keys2.
@@ -1940,6 +2139,54 @@ pathkeys_useful_for_ordering(PlannerInfo *root, List *pathkeys)
 }
 
 /*
+ * pathkeys_useful_for_grouping
+ *		Count the number of pathkeys that are useful for grouping (instead of
+ *		explicit sort)
+ *
+ * Group pathkeys could be reordered to benefit from the ordering. The
+ * ordering may not be "complete" and may require incremental sort, but that's
+ * fine. So we simply count prefix pathkeys with a matching group key, and
+ * stop once we find the first pathkey without a match.
+ *
+ * So e.g. with pathkeys (a,b,c) and group keys (a,b,e) this determines (a,b)
+ * pathkeys are useful for grouping, and we might do incremental sort to get
+ * path ordered by (a,b,e).
+ *
+ * This logic is necessary to retain paths with ordering not matching grouping
+ * keys directly, without the reordering.
+ *
+ * Returns the length of pathkey prefix with matching group keys.
+ */
+static int
+pathkeys_useful_for_grouping(PlannerInfo *root, List *pathkeys)
+{
+	ListCell   *key;
+	int			n = 0;
+
+	/* no special ordering requested for grouping */
+	if (root->group_pathkeys == NIL)
+		return 0;
+
+	/* unordered path */
+	if (pathkeys == NIL)
+		return 0;
+
+	/* walk the pathkeys and search for matching group key */
+	foreach(key, pathkeys)
+	{
+		PathKey    *pathkey = (PathKey *) lfirst(key);
+
+		/* no matching group key, we're done */
+		if (!list_member_ptr(root->group_pathkeys, pathkey))
+			break;
+
+		n++;
+	}
+
+	return n;
+}
+
+/*
  * truncate_useless_pathkeys
  *		Shorten the given pathkey list to just the useful pathkeys.
  */
@@ -1955,6 +2202,9 @@ truncate_useless_pathkeys(PlannerInfo *root,
 	nuseful2 = pathkeys_useful_for_ordering(root, pathkeys);
 	if (nuseful2 > nuseful)
 		nuseful = nuseful2;
+	nuseful2 = pathkeys_useful_for_grouping(root, pathkeys);
+	if (nuseful2 > nuseful)
+		nuseful = nuseful2;
 
 	/*
 	 * Note: not safe to modify input list destructively, but we can avoid
@@ -1988,6 +2238,8 @@ has_useful_pathkeys(PlannerInfo *root, RelOptInfo *rel)
 {
 	if (rel->joininfo != NIL || rel->has_eclass_joins)
 		return true;			/* might be able to use pathkeys for merging */
+	if (root->group_pathkeys != NIL)
+		return true;			/* might be able to use pathkeys for grouping */
 	if (root->query_pathkeys != NIL)
 		return true;			/* might be able to use them for ordering */
 	return false;				/* definitely useless */
author	Alexander Korotkov <akorotkov@postgresql.org>	2024-01-21 22:21:36 +0200
committer	Alexander Korotkov <akorotkov@postgresql.org>	2024-01-21 22:21:36 +0200
commit	0452b461bc405e6d35d8a14c02813c15e28ae516 (patch)
tree	87587d2a6e0bd44c705af98cf2f918c000940797 /src/backend/optimizer/path/pathkeys.c
parent	7ab80ac1caf9f48064190802e1068ef89e2883c4 (diff)