5 files changed, 179 insertions, 43 deletions
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 65f17101591..a8c8edfac75 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -8284,8 +8284,8 @@ generate_setop_child_grouplist(SetOperationStmt *op, List *targetlist)
  *    the needs of the semijoin represented by sjinfo.  If it is not possible
  *    to identify how to make the data unique, NULL is returned.
  *
- * If used at all, this is likely to be called repeatedly on the same rel;
- * So we cache the result.
+ * If used at all, this is likely to be called repeatedly on the same rel,
+ * so we cache the result.
  */
 RelOptInfo *
 create_unique_paths(PlannerInfo *root, RelOptInfo *rel, SpecialJoinInfo *sjinfo)
@@ -8375,9 +8375,14 @@ create_unique_paths(PlannerInfo *root, RelOptInfo *rel, SpecialJoinInfo *sjinfo)
 		 * variables of the input rel's targetlist.  We have to add any such
 		 * expressions to the unique rel's targetlist.
 		 *
-		 * While in the loop, build the lists of SortGroupClause's that
-		 * represent the ordering for the sort-based implementation and the
-		 * grouping for the hash-based implementation.
+		 * To complicate matters, some of the values to be unique-ified may be
+		 * known redundant by the EquivalenceClass machinery (e.g., because
+		 * they have been equated to constants).  There is no need to compare
+		 * such values during unique-ification, and indeed we had better not
+		 * try because the Vars involved may not have propagated as high as
+		 * the semijoin's level.  We use make_pathkeys_for_sortclauses to
+		 * detect such cases, which is a tad inefficient but it doesn't seem
+		 * worth building specialized infrastructure for this.
 		 */
 		newtlist = make_tlist_from_pathtarget(rel->reltarget);
 		nextresno = list_length(newtlist) + 1;
@@ -8386,8 +8391,9 @@ create_unique_paths(PlannerInfo *root, RelOptInfo *rel, SpecialJoinInfo *sjinfo)
 		{
 			Expr	   *uniqexpr = lfirst(lc1);
 			Oid			in_oper = lfirst_oid(lc2);
-			Oid			sortop = InvalidOid;
+			Oid			sortop;
 			TargetEntry *tle;
+			bool		made_tle = false;
 
 			tle = tlist_member(uniqexpr, newtlist);
 			if (!tle)
@@ -8398,19 +8404,21 @@ create_unique_paths(PlannerInfo *root, RelOptInfo *rel, SpecialJoinInfo *sjinfo)
 									  false);
 				newtlist = lappend(newtlist, tle);
 				nextresno++;
+				made_tle = true;
 			}
 
-			if (sjinfo->semi_can_btree)
+			/*
+			 * Try to build an ORDER BY list to sort the input compatibly.  We
+			 * do this for each sortable clause even when the clauses are not
+			 * all sortable, so that we can detect clauses that are redundant
+			 * according to the pathkey machinery.
+			 */
+			sortop = get_ordering_op_for_equality_op(in_oper, false);
+			if (OidIsValid(sortop))
 			{
-				/* Create an ORDER BY list to sort the input compatibly */
 				Oid			eqop;
 				SortGroupClause *sortcl;
 
-				sortop = get_ordering_op_for_equality_op(in_oper, false);
-				if (!OidIsValid(sortop))	/* shouldn't happen */
-					elog(ERROR, "could not find ordering operator for equality operator %u",
-						 in_oper);
-
 				/*
 				 * The Unique node will need equality operators.  Normally
 				 * these are the same as the IN clause operators, but if those
@@ -8430,7 +8438,32 @@ create_unique_paths(PlannerInfo *root, RelOptInfo *rel, SpecialJoinInfo *sjinfo)
 				sortcl->nulls_first = false;
 				sortcl->hashable = false;	/* no need to make this accurate */
 				sortList = lappend(sortList, sortcl);
+
+				/*
+				 * At each step, convert the SortGroupClause list to pathkey
+				 * form.  If the just-added SortGroupClause is redundant, the
+				 * result will be shorter than the SortGroupClause list.
+				 */
+				sortPathkeys = make_pathkeys_for_sortclauses(root, sortList,
+															 newtlist);
+				if (list_length(sortPathkeys) != list_length(sortList))
+				{
+					/* Drop the redundant SortGroupClause */
+					sortList = list_delete_last(sortList);
+					Assert(list_length(sortPathkeys) == list_length(sortList));
+					/* Undo tlist addition, if we made one */
+					if (made_tle)
+					{
+						newtlist = list_delete_last(newtlist);
+						nextresno--;
+					}
+					/* We need not consider this clause for hashing, either */
+					continue;
+				}
 			}
+			else if (sjinfo->semi_can_btree)	/* shouldn't happen */
+				elog(ERROR, "could not find ordering operator for equality operator %u",
+					 in_oper);
 
 			if (sjinfo->semi_can_hash)
 			{
@@ -8460,8 +8493,27 @@ create_unique_paths(PlannerInfo *root, RelOptInfo *rel, SpecialJoinInfo *sjinfo)
 			}
 		}
 
+		/*
+		 * Done building the sortPathkeys and groupClause.  But the
+		 * sortPathkeys are bogus if not all the clauses were sortable.
+		 */
+		if (!sjinfo->semi_can_btree)
+			sortPathkeys = NIL;
+
+		/*
+		 * It can happen that all the RHS columns are equated to constants.
+		 * We'd have to do something special to unique-ify in that case, and
+		 * it's such an unlikely-in-the-real-world case that it's not worth
+		 * the effort.  So just punt if we found no columns to unique-ify.
+		 */
+		if (sortPathkeys == NIL && groupClause == NIL)
+		{
+			MemoryContextSwitchTo(oldcontext);
+			return NULL;
+		}
+
+		/* Convert the required targetlist back to PathTarget form */
 		unique_rel->reltarget = create_pathtarget(root, newtlist);
-		sortPathkeys = make_pathkeys_for_sortclauses(root, sortList, newtlist);
 	}
 
 	/* build unique paths based on input rel's pathlist */
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 7319945ffe3..c35288eecde 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -3398,26 +3398,6 @@ select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*)
  ba       |    0 |     1
 (2 rows)
 
--- Make sure that generation of HashAggregate for uniqification purposes
--- does not lead to array overflow due to unexpected duplicate hash keys
--- see CAFeeJoKKu0u+A_A9R9316djW-YW3-+Gtgvy3ju655qRHR3jtdA@mail.gmail.com
-set enable_memoize to off;
-explain (costs off)
-  select 1 from tenk1
-   where (hundred, thousand) in (select twothousand, twothousand from onek);
-                         QUERY PLAN                          
--------------------------------------------------------------
- Hash Join
-   Hash Cond: (tenk1.hundred = onek.twothousand)
-   ->  Seq Scan on tenk1
-         Filter: (hundred = thousand)
-   ->  Hash
-         ->  HashAggregate
-               Group Key: onek.twothousand, onek.twothousand
-               ->  Seq Scan on onek
-(8 rows)
-
-reset enable_memoize;
 --
 -- Hash Aggregation Spill tests
 --
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 98b05c94a11..b26b8c5bdbe 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -3222,6 +3222,24 @@ where b.unique2 is null;
          ->  Index Only Scan using tenk1_unique2 on tenk1 b
 (5 rows)
 
+-- check that we avoid de-duplicating columns redundantly
+set enable_memoize to off;
+explain (costs off)
+select 1 from tenk1
+where (hundred, thousand) in (select twothousand, twothousand from onek);
+                   QUERY PLAN                    
+-------------------------------------------------
+ Hash Join
+   Hash Cond: (tenk1.hundred = onek.twothousand)
+   ->  Seq Scan on tenk1
+         Filter: (hundred = thousand)
+   ->  Hash
+         ->  HashAggregate
+               Group Key: onek.twothousand
+               ->  Seq Scan on onek
+(8 rows)
+
+reset enable_memoize;
 --
 -- regression test for bogus RTE_GROUP entries
 --
@@ -6501,6 +6519,68 @@ where t1.a = s.c;
 (0 rows)
 
 rollback;
+-- check handling of semijoins after join removal: we must suppress
+-- unique-ification of known-constant values
+begin;
+create temp table t (a int unique, b int);
+insert into t values (1, 2);
+explain (verbose, costs off)
+select t1.a from t t1
+  left join t t2 on t1.a = t2.a
+       join t t3 on true
+where exists (select 1 from t t4
+                join t t5 on t4.b = t5.b
+                join t t6 on t5.b = t6.b
+              where t1.a = t4.a and t3.a = t5.a and t4.a = 1);
+                                     QUERY PLAN                                     
+------------------------------------------------------------------------------------
+ Nested Loop
+   Output: t1.a
+   Inner Unique: true
+   ->  Nested Loop
+         Output: t1.a, t5.a
+         ->  Index Only Scan using t_a_key on pg_temp.t t1
+               Output: t1.a
+               Index Cond: (t1.a = 1)
+         ->  HashAggregate
+               Output: t5.a
+               Group Key: t5.a
+               ->  Hash Join
+                     Output: t5.a
+                     Hash Cond: (t6.b = t4.b)
+                     ->  Seq Scan on pg_temp.t t6
+                           Output: t6.a, t6.b
+                     ->  Hash
+                           Output: t4.b, t5.b, t5.a
+                           ->  Hash Join
+                                 Output: t4.b, t5.b, t5.a
+                                 Inner Unique: true
+                                 Hash Cond: (t5.b = t4.b)
+                                 ->  Seq Scan on pg_temp.t t5
+                                       Output: t5.a, t5.b
+                                 ->  Hash
+                                       Output: t4.b, t4.a
+                                       ->  Index Scan using t_a_key on pg_temp.t t4
+                                             Output: t4.b, t4.a
+                                             Index Cond: (t4.a = 1)
+   ->  Index Only Scan using t_a_key on pg_temp.t t3
+         Output: t3.a
+         Index Cond: (t3.a = t5.a)
+(32 rows)
+
+select t1.a from t t1
+  left join t t2 on t1.a = t2.a
+       join t t3 on true
+where exists (select 1 from t t4
+                join t t5 on t4.b = t5.b
+                join t t6 on t5.b = t6.b
+              where t1.a = t4.a and t3.a = t5.a and t4.a = 1);
+ a 
+---
+ 1
+(1 row)
+
+rollback;
 -- test cases where we can remove a join, but not a PHV computed at it
 begin;
 create temp table t (a int unique, b int);
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index dde85d0dfb2..62540b1ffa4 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -1510,15 +1510,6 @@ select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*)
   from unnest(array['a','b']) u(v)
  group by v||'a' order by 1;
 
--- Make sure that generation of HashAggregate for uniqification purposes
--- does not lead to array overflow due to unexpected duplicate hash keys
--- see CAFeeJoKKu0u+A_A9R9316djW-YW3-+Gtgvy3ju655qRHR3jtdA@mail.gmail.com
-set enable_memoize to off;
-explain (costs off)
-  select 1 from tenk1
-   where (hundred, thousand) in (select twothousand, twothousand from onek);
-reset enable_memoize;
-
 --
 -- Hash Aggregation Spill tests
 --
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 5f0a475894d..bccd171afb6 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -839,6 +839,13 @@ explain (costs off)
 select a.* from tenk1 a left join tenk1 b on a.unique1 = b.unique2
 where b.unique2 is null;
 
+-- check that we avoid de-duplicating columns redundantly
+set enable_memoize to off;
+explain (costs off)
+select 1 from tenk1
+where (hundred, thousand) in (select twothousand, twothousand from onek);
+reset enable_memoize;
+
 --
 -- regression test for bogus RTE_GROUP entries
 --
@@ -2420,6 +2427,32 @@ where t1.a = s.c;
 
 rollback;
 
+-- check handling of semijoins after join removal: we must suppress
+-- unique-ification of known-constant values
+begin;
+
+create temp table t (a int unique, b int);
+insert into t values (1, 2);
+
+explain (verbose, costs off)
+select t1.a from t t1
+  left join t t2 on t1.a = t2.a
+       join t t3 on true
+where exists (select 1 from t t4
+                join t t5 on t4.b = t5.b
+                join t t6 on t5.b = t6.b
+              where t1.a = t4.a and t3.a = t5.a and t4.a = 1);
+
+select t1.a from t t1
+  left join t t2 on t1.a = t2.a
+       join t t3 on true
+where exists (select 1 from t t4
+                join t t5 on t4.b = t5.b
+                join t t6 on t5.b = t6.b
+              where t1.a = t4.a and t3.a = t5.a and t4.a = 1);
+
+rollback;
+
 -- test cases where we can remove a join, but not a PHV computed at it
 begin;