diff options
Diffstat (limited to 'src/backend/utils/hash/dynahash.c')
-rw-r--r-- | src/backend/utils/hash/dynahash.c | 209 |
1 file changed, 204 insertions(+), 5 deletions(-)
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 23b48c0b634..d61a952aefc 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -26,7 +26,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.73 2006/10/04 00:30:02 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.73.2.1 2007/04/26 23:24:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -63,6 +63,7 @@ #include "postgres.h" +#include "access/xact.h" #include "storage/shmem.h" #include "storage/spin.h" #include "utils/dynahash.h" @@ -160,6 +161,9 @@ struct HTAB char *tabname; /* table name (for error messages) */ bool isshared; /* true if table is in shared memory */ + /* freezing a shared table isn't allowed, so we can keep state here */ + bool frozen; /* true = no more inserts allowed */ + /* We keep local copies of these fixed values to reduce contention */ Size keysize; /* hash key length in bytes */ long ssize; /* segment size --- must be power of 2 */ @@ -195,6 +199,9 @@ static void hdefault(HTAB *hashp); static int choose_nelem_alloc(Size entrysize); static bool init_htab(HTAB *hashp, long nelem); static void hash_corrupted(HTAB *hashp); +static void register_seq_scan(HTAB *hashp); +static void deregister_seq_scan(HTAB *hashp); +static bool has_seq_scans(HTAB *hashp); /* @@ -356,6 +363,8 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) errmsg("out of memory"))); } + hashp->frozen = false; + hdefault(hashp); hctl = hashp->hctl; @@ -898,6 +907,10 @@ hash_search_with_hash_value(HTAB *hashp, if (currBucket != NULL) return (void *) ELEMENTKEY(currBucket); + /* disallow inserts if frozen */ + if (hashp->frozen) + elog(ERROR, "cannot insert into a frozen hashtable"); + currBucket = get_hash_entry(hashp); if (currBucket == NULL) { @@ -925,10 +938,15 @@ hash_search_with_hash_value(HTAB *hashp, /* caller is expected to 
fill the data field on return */ - /* Check if it is time to split a bucket */ - /* Can't split if running in partitioned mode */ + /* + * Check if it is time to split a bucket. Can't split if running + * in partitioned mode, nor if table is the subject of any active + * hash_seq_search scans. Strange order of these tests is to try + * to check cheaper conditions first. + */ if (!IS_PARTITIONED(hctl) && - hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor) + hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor && + !has_seq_scans(hashp)) { /* * NOTE: failure to expand table is not a fatal error, it just @@ -1001,18 +1019,30 @@ hash_get_num_entries(HTAB *hashp) } /* - * hash_seq_init/_search + * hash_seq_init/_search/_term * Sequentially search through hash table and return * all the elements one by one, return NULL when no more. * + * hash_seq_term should be called if and only if the scan is abandoned before + * completion; if hash_seq_search returns NULL then it has already done the + * end-of-scan cleanup. + * * NOTE: caller may delete the returned element before continuing the scan. * However, deleting any other element while the scan is in progress is * UNDEFINED (it might be the one that curIndex is pointing at!). Also, * if elements are added to the table while the scan is in progress, it is * unspecified whether they will be visited by the scan or not. * + * NOTE: it is possible to use hash_seq_init/hash_seq_search without any + * worry about hash_seq_term cleanup, if the hashtable is first locked against + * further insertions by calling hash_freeze. This is used by nodeAgg.c, + * wherein it is inconvenient to track whether a scan is still open, and + * there's no possibility of further insertions after readout has begun. + * * NOTE: to use this with a partitioned hashtable, caller had better hold * at least shared lock on all partitions of the table throughout the scan! 
+ * We can cope with insertions or deletions by our own backend, but *not* + * with concurrent insertions or deletions by another. */ void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp) @@ -1020,6 +1050,8 @@ hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp) status->hashp = hashp; status->curBucket = 0; status->curEntry = NULL; + if (!hashp->frozen) + register_seq_scan(hashp); } void * @@ -1054,7 +1086,10 @@ hash_seq_search(HASH_SEQ_STATUS *status) max_bucket = hctl->max_bucket; if (curBucket > max_bucket) + { + hash_seq_term(status); return NULL; /* search is done */ + } /* * first find the right segment in the table directory. @@ -1076,6 +1111,7 @@ hash_seq_search(HASH_SEQ_STATUS *status) if (++curBucket > max_bucket) { status->curBucket = curBucket; + hash_seq_term(status); return NULL; /* search is done */ } if (++segment_ndx >= ssize) @@ -1094,6 +1130,36 @@ hash_seq_search(HASH_SEQ_STATUS *status) return (void *) ELEMENTKEY(curElem); } +void +hash_seq_term(HASH_SEQ_STATUS *status) +{ + if (!status->hashp->frozen) + deregister_seq_scan(status->hashp); +} + +/* + * hash_freeze + * Freeze a hashtable against future insertions (deletions are + * still allowed) + * + * The reason for doing this is that by preventing any more bucket splits, + * we no longer need to worry about registering hash_seq_search scans, + * and thus caller need not be careful about ensuring hash_seq_term gets + * called at the right times. + * + * Multiple calls to hash_freeze() are allowed, but you can't freeze a table + * with active scans (since hash_seq_term would then do the wrong thing). 
+ */ +void +hash_freeze(HTAB *hashp) +{ + if (hashp->isshared) + elog(ERROR, "cannot freeze shared hashtable"); + if (!hashp->frozen && has_seq_scans(hashp)) + elog(ERROR, "cannot freeze hashtable with active scans"); + hashp->frozen = true; +} + /********************************* UTILITIES ************************/ @@ -1324,3 +1390,136 @@ my_log2(long num) ; return i; } + + +/************************* SEQ SCAN TRACKING ************************/ + +/* + * We track active hash_seq_search scans here. The need for this mechanism + * comes from the fact that a scan will get confused if a bucket split occurs + * while it's in progress: it might visit entries twice, or even miss some + * entirely (if it's partway through the same bucket that splits). Hence + * we want to inhibit bucket splits if there are any active scans on the + * table being inserted into. This is a fairly rare case in current usage, + * so just postponing the split until the next insertion seems sufficient. + * + * Given present usages of the function, only a few scans are likely to be + * open concurrently; so a finite-size stack of open scans seems sufficient, + * and we don't worry that linear search is too slow. Note that we do + * allow multiple scans of the same hashtable to be open concurrently. + * + * This mechanism can support concurrent scan and insertion in a shared + * hashtable if it's the same backend doing both. It would fail otherwise, + * but locking reasons seem to preclude any such scenario anyway, so we don't + * worry. + * + * This arrangement is reasonably robust if a transient hashtable is deleted + * without notifying us. The absolute worst case is we might inhibit splits + * in another table created later at exactly the same address. We will give + * a warning at transaction end for reference leaks, so any bugs leading to + * lack of notification should be easy to catch. 
+ */ + +#define MAX_SEQ_SCANS 100 + +static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */ +static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */ +static int num_seq_scans = 0; + + +/* Register a table as having an active hash_seq_search scan */ +static void +register_seq_scan(HTAB *hashp) +{ + if (num_seq_scans >= MAX_SEQ_SCANS) + elog(ERROR, "too many active hash_seq_search scans"); + seq_scan_tables[num_seq_scans] = hashp; + seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel(); + num_seq_scans++; +} + +/* Deregister an active scan */ +static void +deregister_seq_scan(HTAB *hashp) +{ + int i; + + /* Search backward since it's most likely at the stack top */ + for (i = num_seq_scans - 1; i >= 0; i--) + { + if (seq_scan_tables[i] == hashp) + { + seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1]; + seq_scan_level[i] = seq_scan_level[num_seq_scans - 1]; + num_seq_scans--; + return; + } + } + elog(ERROR, "no hash_seq_search scan for hash table \"%s\"", + hashp->tabname); +} + +/* Check if a table has any active scan */ +static bool +has_seq_scans(HTAB *hashp) +{ + int i; + + for (i = 0; i < num_seq_scans; i++) + { + if (seq_scan_tables[i] == hashp) + return true; + } + return false; +} + +/* Clean up any open scans at end of transaction */ +void +AtEOXact_HashTables(bool isCommit) +{ + /* + * During abort cleanup, open scans are expected; just silently clean 'em + * out. An open scan at commit means someone forgot a hash_seq_term() + * call, so complain. + * + * Note: it's tempting to try to print the tabname here, but refrain for + * fear of touching deallocated memory. This isn't a user-facing message + * anyway, so it needn't be pretty. 
+ */ + if (isCommit) + { + int i; + + for (i = 0; i < num_seq_scans; i++) + { + elog(WARNING, "leaked hash_seq_search scan for hash table %p", + seq_scan_tables[i]); + } + } + num_seq_scans = 0; +} + +/* Clean up any open scans at end of subtransaction */ +void +AtEOSubXact_HashTables(bool isCommit, int nestDepth) +{ + int i; + + /* + * Search backward to make cleanup easy. Note we must check all entries, + * not only those at the end of the array, because deletion technique + * doesn't keep them in order. + */ + for (i = num_seq_scans - 1; i >= 0; i--) + { + if (seq_scan_level[i] >= nestDepth) + { + if (isCommit) + elog(WARNING, "leaked hash_seq_search scan for hash table %p", + seq_scan_tables[i]); + seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1]; + seq_scan_level[i] = seq_scan_level[num_seq_scans - 1]; + num_seq_scans--; + } + } +} |