Diffstat (limited to 'src/backend/executor/execGrouping.c')
 src/backend/executor/execGrouping.c | 153
 1 file changed, 117 insertions(+), 36 deletions(-)
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 75087204f0c..8b64a625ca5 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -14,15 +14,18 @@
*/
#include "postgres.h"
+#include <math.h>
+
+#include "access/htup_details.h"
#include "access/parallel.h"
#include "common/hashfn.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "utils/lsyscache.h"
-static int TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2);
+static int TupleHashTableMatch(struct tuplehash_hash *tb, MinimalTuple tuple1, MinimalTuple tuple2);
static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
- const MinimalTuple tuple);
+ MinimalTuple tuple);
static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashtable,
TupleTableSlot *slot,
bool *isnew, uint32 hash);
@@ -143,10 +146,10 @@ execTuplesHashPrepare(int numCols,
* eqfuncoids: OIDs of equality comparison functions to use
* hashfunctions: FmgrInfos of datatype-specific hashing functions to use
* collations: collations to use in comparisons
- * nbuckets: initial estimate of hashtable size
+ * nelements: initial estimate of hashtable size
* additionalsize: size of data that may be stored along with the hash entry
- * metacxt: memory context for long-lived allocation, but not per-entry data
- * tablecxt: memory context in which to store table entries
+ * metacxt: memory context for long-lived data and the simplehash table
+ * tuplescxt: memory context in which to store the hashed tuples themselves
* tempcxt: short-lived context for evaluation of hash and comparison functions
* use_variable_hash_iv: if true, adjust hash IV per-parallel-worker
*
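For concreteness, here is a hedged sketch of a caller wiring the parameters just described together. Everything lowercase below (estate, parent, inputDesc, inputOps, numCols, keyColIdx, eqfuncoids, hashfunctions, collations, nelements, econtext) is assumed caller state for illustration and is not taken from this patch:

    /* Sketch only: all lowercase identifiers are assumed caller state. */
    MemoryContext metacxt = estate->es_query_cxt;   /* long-lived, supports pfree */
    MemoryContext tuplescxt = BumpContextCreate(metacxt,
                                                "hashed tuples",
                                                ALLOCSET_DEFAULT_SIZES);
    TupleHashTable hashtable;

    hashtable = BuildTupleHashTable(parent,         /* PlanState of owning node */
                                    inputDesc,
                                    inputOps,       /* slot ops of input tuples */
                                    numCols,
                                    keyColIdx,
                                    eqfuncoids,
                                    hashfunctions,
                                    collations,
                                    nelements,      /* double rowcount estimate */
                                    0,              /* no additional per-entry data */
                                    metacxt,
                                    tuplescxt,
                                    econtext->ecxt_per_tuple_memory,  /* reset per tuple */
                                    false);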
@@ -157,11 +160,25 @@ execTuplesHashPrepare(int numCols,
* Note that the keyColIdx, hashfunctions, and collations arrays must be
* allocated in storage that will live as long as the hashtable does.
*
+ * The metacxt and tuplescxt are separate because it's usually desirable for
+ * tuplescxt to be a BumpContext to avoid memory wastage, while metacxt must
+ * support pfree in case the simplehash table needs to be enlarged. (We could
+ * simplify the API of TupleHashTables by managing the tuplescxt internally.
+ * But that would be disadvantageous to nodeAgg.c and nodeSubplan.c, which use
+ * a single tuplescxt for multiple TupleHashTables that are reset together.)
+ *
* LookupTupleHashEntry, FindTupleHashEntry, and related functions may leak
* memory in the tempcxt. It is caller's responsibility to reset that context
* reasonably often, typically once per tuple. (We do it that way, rather
* than managing an extra context within the hashtable, because in many cases
* the caller can specify a tempcxt that it needs to reset per-tuple anyway.)
+ *
+ * We don't currently provide DestroyTupleHashTable functionality; the hash
+ * table will be cleaned up at destruction of the metacxt. (Some callers
+ * bother to delete the tuplescxt explicitly, though it'd be sufficient to
+ * ensure it's a child of the metacxt.) There's not much point in working
+ * harder than this so long as the expression-evaluation infrastructure
+ * behaves similarly.
*/
TupleHashTable
BuildTupleHashTable(PlanState *parent,
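Given the cleanup note above, a hedged sketch of the intended teardown, assuming tuplescxt was created as a child of metacxt:

    /*
     * Sketch, not from this patch: if tuplescxt is a child of metacxt,
     * one delete releases the simplehash table, the stored tuples, and
     * the TupleHashTable struct together; no destroy function is needed.
     */
    MemoryContextDelete(metacxt);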
@@ -172,28 +189,38 @@ BuildTupleHashTable(PlanState *parent,
const Oid *eqfuncoids,
FmgrInfo *hashfunctions,
Oid *collations,
- long nbuckets,
+ double nelements,
Size additionalsize,
MemoryContext metacxt,
- MemoryContext tablecxt,
+ MemoryContext tuplescxt,
MemoryContext tempcxt,
bool use_variable_hash_iv)
{
TupleHashTable hashtable;
- Size entrysize;
- Size hash_mem_limit;
+ uint32 nbuckets;
MemoryContext oldcontext;
- bool allow_jit;
uint32 hash_iv = 0;
- Assert(nbuckets > 0);
- additionalsize = MAXALIGN(additionalsize);
- entrysize = sizeof(TupleHashEntryData) + additionalsize;
+ /*
+ * tuplehash_create requires a uint32 element count, so we had better
+ * clamp the given nelements to fit in that. As long as we have to do
+ * that, we might as well protect against completely insane input like
+ * zero or NaN. But it is not our job here to enforce limits like staying
+ * within hash_mem: the caller should have done that, and we don't have
+ * enough info to second-guess.
+ */
+ if (isnan(nelements) || nelements <= 0)
+ nbuckets = 1;
+ else if (nelements >= PG_UINT32_MAX)
+ nbuckets = PG_UINT32_MAX;
+ else
+ nbuckets = (uint32) nelements;
- /* Limit initial table size request to not more than hash_mem */
- hash_mem_limit = get_hash_memory_limit() / entrysize;
- if (nbuckets > hash_mem_limit)
- nbuckets = hash_mem_limit;
+ /* tuplescxt must be separate, else ResetTupleHashTable breaks things */
+ Assert(metacxt != tuplescxt);
+
+ /* ensure additionalsize is maxalign'ed */
+ additionalsize = MAXALIGN(additionalsize);
oldcontext = MemoryContextSwitchTo(metacxt);
@@ -202,7 +229,7 @@ BuildTupleHashTable(PlanState *parent,
hashtable->numCols = numCols;
hashtable->keyColIdx = keyColIdx;
hashtable->tab_collations = collations;
- hashtable->tablecxt = tablecxt;
+ hashtable->tuplescxt = tuplescxt;
hashtable->tempcxt = tempcxt;
hashtable->additionalsize = additionalsize;
hashtable->tableslot = NULL; /* will be made on first lookup */
@@ -230,16 +257,6 @@ BuildTupleHashTable(PlanState *parent,
hashtable->tableslot = MakeSingleTupleTableSlot(CreateTupleDescCopy(inputDesc),
&TTSOpsMinimalTuple);
- /*
- * If the caller fails to make the metacxt different from the tablecxt,
- * allowing JIT would lead to the generated functions to a) live longer
- * than the query or b) be re-generated each time the table is being
- * reset. Therefore prevent JIT from being used in that case, by not
- * providing a parent node (which prevents accessing the JitContext in the
- * EState).
- */
- allow_jit = (metacxt != tablecxt);
-
/* build hash ExprState for all columns */
hashtable->tab_hash_expr = ExecBuildHash32FromAttrs(inputDesc,
inputOps,
@@ -247,7 +264,7 @@ BuildTupleHashTable(PlanState *parent,
collations,
numCols,
keyColIdx,
- allow_jit ? parent : NULL,
+ parent,
hash_iv);
/* build comparator for all columns */
@@ -256,7 +273,7 @@ BuildTupleHashTable(PlanState *parent,
&TTSOpsMinimalTuple,
numCols,
keyColIdx, eqfuncoids, collations,
- allow_jit ? parent : NULL);
+ parent);
/*
* While not pretty, it's ok to not shut down this context, but instead
@@ -273,13 +290,77 @@ BuildTupleHashTable(PlanState *parent,
/*
* Reset contents of the hashtable to be empty, preserving all the non-content
- * state. Note that the tablecxt passed to BuildTupleHashTable() should
- * also be reset, otherwise there will be leaks.
+ * state.
+ *
+ * Note: in usages where several TupleHashTables share a tuplescxt, all must
+ * be reset together, as the first one's reset call will destroy all their
+ * data. The additional reset calls for the rest will redundantly reset the
+ * tuplescxt. But because of mcxt.c's isReset flag, that's cheap enough that
+ * we need not avoid it.
*/
void
ResetTupleHashTable(TupleHashTable hashtable)
{
tuplehash_reset(hashtable->hashtab);
+ MemoryContextReset(hashtable->tuplescxt);
+}
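To illustrate the shared-tuplescxt note above, a hedged sketch in the style of callers such as nodeSubplan.c; the "node" fields are assumed for illustration:

    /*
     * Sketch only: two TupleHashTables sharing one tuplescxt must be
     * reset together.  The first call empties the shared context; the
     * second call's MemoryContextReset is nearly free because mcxt.c's
     * isReset flag shows there is nothing left to do.
     */
    ResetTupleHashTable(node->hashtable);
    ResetTupleHashTable(node->hashnulls);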
+
+/*
+ * Estimate the amount of space needed for a TupleHashTable with nentries
+ * entries, if the tuples have average data width tupleWidth and the caller
+ * requires additionalsize extra space per entry.
+ *
+ * Return SIZE_MAX if it'd overflow size_t.
+ *
+ * nentries is "double" because this is meant for use by the planner,
+ * which typically works with double rowcount estimates. So we'd need to
+ * clamp to integer somewhere and that might as well be here. We do expect
+ * the value not to be NaN or negative, else the result will be garbage.
+ */
+Size
+EstimateTupleHashTableSpace(double nentries,
+ Size tupleWidth,
+ Size additionalsize)
+{
+ Size sh_space;
+ double tuples_space;
+
+ /* First estimate the space needed for the simplehash table */
+ sh_space = tuplehash_estimate_space(nentries);
+
+ /* Give up if that's already too big */
+ if (sh_space >= SIZE_MAX)
+ return sh_space;
+
+ /*
+ * Compute space needed for hashed tuples with additional data. nentries
+ * must be somewhat sane, so it should be safe to compute this product.
+ *
+ * We assume that the hashed tuples will be kept in a BumpContext so that
+ * there is no additional per-tuple overhead.
+ *
+ * (Note that this is only accurate if MEMORY_CONTEXT_CHECKING is off,
+ * else bump.c will add a MemoryChunk header to each tuple. However, it
+ * seems undesirable for debug builds to make different planning choices
+ * than production builds, so we assume the production behavior always.)
+ */
+ tuples_space = nentries * (MAXALIGN(SizeofMinimalTupleHeader) +
+ MAXALIGN(tupleWidth) +
+ MAXALIGN(additionalsize));
+
+ /*
+ * Check for size_t overflow. This coding is trickier than it may appear,
+ * because on 64-bit machines SIZE_MAX cannot be represented exactly as a
+ * double. We must cast it explicitly to suppress compiler warnings about
+ * an inexact conversion, and we must trust that any double value that
+ * compares strictly less than "(double) SIZE_MAX" will cast to a
+ * representable size_t value.
+ */
+ if (sh_space + tuples_space >= (double) SIZE_MAX)
+ return SIZE_MAX;
+
+ /* We don't bother estimating size of the miscellaneous overhead data */
+ return (Size) (sh_space + tuples_space);
}
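As a usage illustration, a hedged sketch of the planner-side check this function is meant for; the estimate variables and the fallback flag are assumed, not part of this patch:

    /*
     * Sketch, not from this patch: compare the estimated hash-table
     * footprint for "dNumGroups" groups of width "tupleWidth" (both
     * assumed to be available planner estimates) against hash_mem.
     */
    Size        est = EstimateTupleHashTableSpace(dNumGroups, tupleWidth, 0);

    if (est > get_hash_memory_limit())
        consider_hashed = false;    /* assumed flag: prefer a sort-based plan */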
/*
@@ -419,7 +500,7 @@ FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
*/
static uint32
TupleHashTableHash_internal(struct tuplehash_hash *tb,
- const MinimalTuple tuple)
+ MinimalTuple tuple)
{
TupleHashTable hashtable = (TupleHashTable) tb->private_data;
uint32 hashkey;
@@ -489,10 +570,10 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
/* created new entry */
*isnew = true;
- MemoryContextSwitchTo(hashtable->tablecxt);
+ MemoryContextSwitchTo(hashtable->tuplescxt);
/*
- * Copy the first tuple into the table context, and request
+ * Copy the first tuple into the tuples context, and request
* additionalsize extra bytes before the allocation.
*
* The caller can get a pointer to the additional data with
@@ -517,7 +598,7 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
* See whether two tuples (presumably of the same hash value) match
*/
static int
-TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2)
+TupleHashTableMatch(struct tuplehash_hash *tb, MinimalTuple tuple1, MinimalTuple tuple2)
{
TupleTableSlot *slot1;
TupleTableSlot *slot2;