Diffstat (limited to 'src/backend')
-rw-r--r--  src/backend/access/heap/heapam.c                   11
-rw-r--r--  src/backend/access/index/indexam.c                 10
-rw-r--r--  src/backend/access/transam/xlog.c                 234
-rw-r--r--  src/backend/catalog/storage.c                       2
-rw-r--r--  src/backend/commands/typecmds.c                    24
-rw-r--r--  src/backend/storage/ipc/shmem.c                     4
-rw-r--r--  src/backend/storage/lmgr/lock.c                     2
-rw-r--r--  src/backend/storage/lmgr/predicate.c                2
-rw-r--r--  src/backend/utils/activity/wait_event_names.txt     2
-rw-r--r--  src/backend/utils/adt/dbsize.c                      3
-rw-r--r--  src/backend/utils/cache/relfilenumbermap.c          8
-rw-r--r--  src/backend/utils/hash/dynahash.c                  92
-rw-r--r--  src/backend/utils/time/snapmgr.c                   19
13 files changed, 161 insertions(+), 252 deletions(-)
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 0dcd6ee817e..7491cc3cb93 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1143,6 +1143,17 @@ heap_beginscan(Relation relation, Snapshot snapshot,
if (!(snapshot && IsMVCCSnapshot(snapshot)))
scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
+ /* Check that a historic snapshot is not used for non-catalog tables */
+ if (snapshot &&
+ IsHistoricMVCCSnapshot(snapshot) &&
+ !RelationIsAccessibleInLogicalDecoding(relation))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+ errmsg("cannot query non-catalog table \"%s\" during logical decoding",
+ RelationGetRelationName(relation))));
+ }
+
/*
* For seqscan and sample scans in a serializable transaction, acquire a
* predicate lock on the entire relation. This is required not only to
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 1a4f36fe0a9..86d11f4ec79 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -263,6 +263,16 @@ index_beginscan(Relation heapRelation,
Assert(snapshot != InvalidSnapshot);
+ /* Check that a historic snapshot is not used for non-catalog tables */
+ if (IsHistoricMVCCSnapshot(snapshot) &&
+ !RelationIsAccessibleInLogicalDecoding(heapRelation))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+ errmsg("cannot query non-catalog table \"%s\" during logical decoding",
+ RelationGetRelationName(heapRelation))));
+ }
+
scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false);
/*
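The check added to heap_beginscan() above is duplicated verbatim in index_beginscan(); as a sketch, the two call sites could share one helper (hypothetical name, same calls as in the hunks above):

static void
EnsureHistoricSnapshotOnlyOnCatalogs(Relation relation, Snapshot snapshot)
{
	if (snapshot &&
		IsHistoricMVCCSnapshot(snapshot) &&
		!RelationIsAccessibleInLogicalDecoding(relation))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
				 errmsg("cannot query non-catalog table \"%s\" during logical decoding",
						RelationGetRelationName(relation))));
}
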
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index e8909406686..7ffb2179151 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -303,6 +303,11 @@ static bool doPageWrites;
* so it's a plain spinlock. The other locks are held longer (potentially
* over I/O operations), so we use LWLocks for them. These locks are:
*
+ * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
+ * It is only held while initializing and changing the mapping. If the
+ * contents of the buffer being replaced haven't been written yet, the mapping
+ * lock is released while the write is done, and reacquired afterwards.
+ *
* WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
* XLogFlush).
*
@@ -468,37 +473,21 @@ typedef struct XLogCtlData
pg_atomic_uint64 logFlushResult; /* last byte + 1 flushed */
/*
- * First initialized page in the cache (first byte position).
- */
- XLogRecPtr InitializedFrom;
-
- /*
- * Latest reserved for initialization page in the cache (last byte
- * position + 1).
+ * Latest initialized page in the cache (last byte position + 1).
*
- * To change the identity of a buffer, you need to advance
- * InitializeReserved first. To change the identity of a buffer that's
+ * To change the identity of a buffer (and InitializedUpTo), you need to
+ * hold WALBufMappingLock. To change the identity of a buffer that's
* still dirty, the old page needs to be written out first, and for that
* you need WALWriteLock, and you need to ensure that there are no
* in-progress insertions to the page by calling
* WaitXLogInsertionsToFinish().
*/
- pg_atomic_uint64 InitializeReserved;
-
- /*
- * Latest initialized page in the cache (last byte position + 1).
- *
- * InitializedUpTo is updated after the buffer initialization. After
- * update, waiters got notification using InitializedUpToCondVar.
- */
- pg_atomic_uint64 InitializedUpTo;
- ConditionVariable InitializedUpToCondVar;
+ XLogRecPtr InitializedUpTo;
/*
* These values do not change after startup, although the pointed-to pages
- * and xlblocks values certainly do. xlblocks values are changed
- * lock-free according to the check for the xlog write position and are
- * accompanied by changes of InitializeReserved and InitializedUpTo.
+ * and xlblocks values certainly do. xlblocks values are protected by
+ * WALBufMappingLock.
*/
char *pages; /* buffers for unwritten XLOG pages */
pg_atomic_uint64 *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
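With InitializedUpTo demoted to a plain XLogRecPtr, the division of labour is: buffer identity (and InitializedUpTo) changes only while WALBufMappingLock is held, while xlblocks[] stays atomic so readers can still verify a buffer's identity without the lock. A rough sketch of that lock-free check, patterned on GetXLogBuffer() (hypothetical helper name, illustration only):

static bool
WALBufferHoldsPage(int idx, XLogRecPtr ptr)
{
	/* end+1 position of the page that contains 'ptr' */
	XLogRecPtr	expectedEndPtr = ptr - (ptr % XLOG_BLCKSZ) + XLOG_BLCKSZ;

	/* xlblocks[] is atomic, so no WALBufMappingLock is needed here */
	return pg_atomic_read_u64(&XLogCtl->xlblocks[idx]) == expectedEndPtr;
}
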
@@ -821,9 +810,9 @@ XLogInsertRecord(XLogRecData *rdata,
* fullPageWrites from changing until the insertion is finished.
*
* Step 2 can usually be done completely in parallel. If the required WAL
- * page is not initialized yet, you have to go through AdvanceXLInsertBuffer,
- * which will ensure it is initialized. But the WAL writer tries to do that
- * ahead of insertions to avoid that from happening in the critical path.
+ * page is not initialized yet, you have to grab WALBufMappingLock to
+ * initialize it, but the WAL writer tries to do that ahead of insertions
+ * to avoid that from happening in the critical path.
*
*----------
*/
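For context, AdvanceXLInsertBuffer() is reached in two ways; the calls below are a sketch assuming the usual call sites (GetXLogBuffer() for inserters, XLogBackgroundFlush() for the WAL writer):

	/* WAL writer: pre-initialize whatever it can without forcing a flush */
	AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true);

	/*
	 * Inserter whose target page is not mapped yet: initialize up to 'ptr',
	 * writing out old buffer contents first if necessary.
	 */
	AdvanceXLInsertBuffer(ptr, tli, false);
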
@@ -2005,79 +1994,32 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr;
XLogRecPtr NewPageBeginPtr;
XLogPageHeader NewPage;
- XLogRecPtr ReservedPtr;
int npages pg_attribute_unused() = 0;
- /*
- * We must run the loop below inside the critical section as we expect
- * XLogCtl->InitializedUpTo to eventually keep up. The most of callers
- * already run inside the critical section. Except for WAL writer, which
- * passed 'opportunistic == true', and therefore we don't perform
- * operations that could error out.
- *
- * Start an explicit critical section anyway though.
- */
- Assert(CritSectionCount > 0 || opportunistic);
- START_CRIT_SECTION();
+ LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
- /*--
- * Loop till we get all the pages in WAL buffer before 'upto' reserved for
- * initialization. Multiple process can initialize different buffers with
- * this loop in parallel as following.
- *
- * 1. Reserve page for initialization using XLogCtl->InitializeReserved.
- * 2. Initialize the reserved page.
- * 3. Attempt to advance XLogCtl->InitializedUpTo,
+ /*
+ * Now that we have the lock, check if someone initialized the page
+ * already.
*/
- ReservedPtr = pg_atomic_read_u64(&XLogCtl->InitializeReserved);
- while (upto >= ReservedPtr || opportunistic)
+ while (upto >= XLogCtl->InitializedUpTo || opportunistic)
{
- Assert(ReservedPtr % XLOG_BLCKSZ == 0);
+ nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
/*
- * Get ending-offset of the buffer page we need to replace.
- *
- * We don't lookup into xlblocks, but rather calculate position we
- * must wait to be written. If it was written, xlblocks will have this
- * position (or uninitialized)
+ * Get ending-offset of the buffer page we need to replace (this may
+ * be zero if the buffer hasn't been used yet). Fall through if it's
+ * already written out.
*/
- if (ReservedPtr + XLOG_BLCKSZ > XLogCtl->InitializedFrom + XLOG_BLCKSZ * XLOGbuffers)
- OldPageRqstPtr = ReservedPtr + XLOG_BLCKSZ - (XLogRecPtr) XLOG_BLCKSZ * XLOGbuffers;
- else
- OldPageRqstPtr = InvalidXLogRecPtr;
-
- if (LogwrtResult.Write < OldPageRqstPtr && opportunistic)
+ OldPageRqstPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]);
+ if (LogwrtResult.Write < OldPageRqstPtr)
{
/*
- * If we just want to pre-initialize as much as we can without
- * flushing, give up now.
+ * Nope, got work to do. If we just want to pre-initialize as much
+ * as we can without flushing, give up now.
*/
- upto = ReservedPtr - 1;
- break;
- }
-
- /*
- * Attempt to reserve the page for initialization. Failure means that
- * this page got reserved by another process.
- */
- if (!pg_atomic_compare_exchange_u64(&XLogCtl->InitializeReserved,
- &ReservedPtr,
- ReservedPtr + XLOG_BLCKSZ))
- continue;
-
- /*
- * Wait till page gets correctly initialized up to OldPageRqstPtr.
- */
- nextidx = XLogRecPtrToBufIdx(ReservedPtr);
- while (pg_atomic_read_u64(&XLogCtl->InitializedUpTo) < OldPageRqstPtr)
- ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT);
- ConditionVariableCancelSleep();
- Assert(pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) == OldPageRqstPtr);
-
- /* Fall through if it's already written out. */
- if (LogwrtResult.Write < OldPageRqstPtr)
- {
- /* Nope, got work to do. */
+ if (opportunistic)
+ break;
/* Advance shared memory write request position */
SpinLockAcquire(&XLogCtl->info_lck);
@@ -2092,6 +2034,14 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
RefreshXLogWriteResult(LogwrtResult);
if (LogwrtResult.Write < OldPageRqstPtr)
{
+ /*
+ * Must acquire write lock. Release WALBufMappingLock first,
+ * to make sure that all insertions that we need to wait for
+ * can finish (up to this same position). Otherwise we risk
+ * deadlock.
+ */
+ LWLockRelease(WALBufMappingLock);
+
WaitXLogInsertionsToFinish(OldPageRqstPtr);
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
@@ -2119,6 +2069,9 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
*/
pgstat_report_fixed = true;
}
+ /* Re-acquire WALBufMappingLock and retry */
+ LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
+ continue;
}
}
@@ -2126,9 +2079,11 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
* Now the next buffer slot is free and we can set it up to be the
* next output page.
*/
- NewPageBeginPtr = ReservedPtr;
+ NewPageBeginPtr = XLogCtl->InitializedUpTo;
NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
+ Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
+
NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
/*
@@ -2192,100 +2147,12 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
*/
pg_write_barrier();
- /*-----
- * Update the value of XLogCtl->xlblocks[nextidx] and try to advance
- * XLogCtl->InitializedUpTo in a lock-less manner.
- *
- * First, let's provide a formal proof of the algorithm. Let it be 'n'
- * process with the following variables in shared memory:
- * f - an array of 'n' boolean flags,
- * v - atomic integer variable.
- *
- * Also, let
- * i - a number of a process,
- * j - local integer variable,
- * CAS(var, oldval, newval) - compare-and-swap atomic operation
- * returning true on success,
- * write_barrier()/read_barrier() - memory barriers.
- *
- * The pseudocode for each process is the following.
- *
- * j := i
- * f[i] := true
- * write_barrier()
- * while CAS(v, j, j + 1):
- * j := j + 1
- * read_barrier()
- * if not f[j]:
- * break
- *
- * Let's prove that v eventually reaches the value of n.
- * 1. Prove by contradiction. Assume v doesn't reach n and stucks
- * on k, where k < n.
- * 2. Process k attempts CAS(v, k, k + 1). 1). If, as we assumed, v
- * gets stuck at k, then this CAS operation must fail. Therefore,
- * v < k when process k attempts CAS(v, k, k + 1).
- * 3. If, as we assumed, v gets stuck at k, then the value k of v
- * must be achieved by some process m, where m < k. The process
- * m must observe f[k] == false. Otherwise, it will later attempt
- * CAS(v, k, k + 1) with success.
- * 4. Therefore, corresponding read_barrier() (while j == k) on
- * process m reached before write_barrier() of process k. But then
- * process k attempts CAS(v, k, k + 1) after process m successfully
- * incremented v to k, and that CAS operation must succeed.
- * That leads to a contradiction. So, there is no such k (k < n)
- * where v gets stuck. Q.E.D.
- *
- * To apply this proof to the code below, we assume
- * XLogCtl->InitializedUpTo will play the role of v with XLOG_BLCKSZ
- * granularity. We also assume setting XLogCtl->xlblocks[nextidx] to
- * NewPageEndPtr to play the role of setting f[i] to true. Also, note
- * that processes can't concurrently map different xlog locations to
- * the same nextidx because we previously requested that
- * XLogCtl->InitializedUpTo >= OldPageRqstPtr. So, a xlog buffer can
- * be taken for initialization only once the previous initialization
- * takes effect on XLogCtl->InitializedUpTo.
- */
-
pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
-
- pg_write_barrier();
-
- while (pg_atomic_compare_exchange_u64(&XLogCtl->InitializedUpTo, &NewPageBeginPtr, NewPageEndPtr))
- {
- NewPageBeginPtr = NewPageEndPtr;
- NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
- nextidx = XLogRecPtrToBufIdx(NewPageBeginPtr);
-
- pg_read_barrier();
-
- if (pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) != NewPageEndPtr)
- {
- /*
- * Page at nextidx wasn't initialized yet, so we can't move
- * InitializedUpto further. It will be moved by backend which
- * will initialize nextidx.
- */
- ConditionVariableBroadcast(&XLogCtl->InitializedUpToCondVar);
- break;
- }
- }
+ XLogCtl->InitializedUpTo = NewPageEndPtr;
npages++;
}
-
- END_CRIT_SECTION();
-
- /*
- * All the pages in WAL buffer before 'upto' were reserved for
- * initialization. However, some pages might be reserved by concurrent
- * processes. Wait till they finish initialization.
- */
- while (upto >= pg_atomic_read_u64(&XLogCtl->InitializedUpTo))
- ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT);
- ConditionVariableCancelSleep();
-
- pg_read_barrier();
+ LWLockRelease(WALBufMappingLock);
#ifdef WAL_DEBUG
if (XLOG_DEBUG && npages > 0)
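Taken together, the hunks above reduce AdvanceXLInsertBuffer() to a lock-then-check loop. A condensed sketch of the resulting control flow (write-request bookkeeping, page-header setup, and error paths elided):

	LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
	while (upto >= XLogCtl->InitializedUpTo || opportunistic)
	{
		nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);

		/* LSN up to which the buffer's old contents must already be on disk */
		OldPageRqstPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]);
		if (LogwrtResult.Write < OldPageRqstPtr)
		{
			if (opportunistic)
				break;			/* pre-initializing only: never flush */

			/* release the mapping lock before waiting/writing, else deadlock */
			LWLockRelease(WALBufMappingLock);
			WaitXLogInsertionsToFinish(OldPageRqstPtr);
			/* ... XLogWrite() the old page under WALWriteLock ... */
			LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
			continue;			/* re-check everything under the lock */
		}

		/* buffer slot is free: initialize it and publish the new mapping */
		NewPageBeginPtr = XLogCtl->InitializedUpTo;
		NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
		/* ... set up the page header at XLogCtl->pages[nextidx] ... */
		pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
		XLogCtl->InitializedUpTo = NewPageEndPtr;
	}
	LWLockRelease(WALBufMappingLock);
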
@@ -5178,10 +5045,6 @@ XLOGShmemInit(void)
pg_atomic_init_u64(&XLogCtl->logWriteResult, InvalidXLogRecPtr);
pg_atomic_init_u64(&XLogCtl->logFlushResult, InvalidXLogRecPtr);
pg_atomic_init_u64(&XLogCtl->unloggedLSN, InvalidXLogRecPtr);
-
- pg_atomic_init_u64(&XLogCtl->InitializeReserved, InvalidXLogRecPtr);
- pg_atomic_init_u64(&XLogCtl->InitializedUpTo, InvalidXLogRecPtr);
- ConditionVariableInit(&XLogCtl->InitializedUpToCondVar);
}
/*
@@ -6205,8 +6068,7 @@ StartupXLOG(void)
memset(page + len, 0, XLOG_BLCKSZ - len);
pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
- pg_atomic_write_u64(&XLogCtl->InitializedUpTo, endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
- XLogCtl->InitializedFrom = endOfRecoveryInfo->lastPageBeginPtr;
+ XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
}
else
{
@@ -6215,10 +6077,8 @@ StartupXLOG(void)
* let the first attempt to insert a log record to initialize the next
* buffer.
*/
- pg_atomic_write_u64(&XLogCtl->InitializedUpTo, EndOfLog);
- XLogCtl->InitializedFrom = EndOfLog;
+ XLogCtl->InitializedUpTo = EndOfLog;
}
- pg_atomic_write_u64(&XLogCtl->InitializeReserved, pg_atomic_read_u64(&XLogCtl->InitializedUpTo));
/*
* Update local and shared status. This is OK to do without any locks
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 227df90f89c..fb784acf4af 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -586,7 +586,7 @@ RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Size
EstimatePendingSyncsSpace(void)
{
- long entries;
+ int64 entries;
entries = pendingSyncHash ? hash_get_num_entries(pendingSyncHash) : 0;
return mul_size(1 + entries, sizeof(RelFileLocator));
diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c
index 26d985193ae..c6de04819f1 100644
--- a/src/backend/commands/typecmds.c
+++ b/src/backend/commands/typecmds.c
@@ -126,7 +126,7 @@ static Oid findTypeSubscriptingFunction(List *procname, Oid typeOid);
static Oid findRangeSubOpclass(List *opcname, Oid subtype);
static Oid findRangeCanonicalFunction(List *procname, Oid typeOid);
static Oid findRangeSubtypeDiffFunction(List *procname, Oid subtype);
-static void validateDomainCheckConstraint(Oid domainoid, const char *ccbin);
+static void validateDomainCheckConstraint(Oid domainoid, const char *ccbin, LOCKMODE lockmode);
static void validateDomainNotNullConstraint(Oid domainoid);
static List *get_rels_with_domain(Oid domainOid, LOCKMODE lockmode);
static void checkEnumOwner(HeapTuple tup);
@@ -2986,7 +2986,7 @@ AlterDomainAddConstraint(List *names, Node *newConstraint,
* to.
*/
if (!constr->skip_validation)
- validateDomainCheckConstraint(domainoid, ccbin);
+ validateDomainCheckConstraint(domainoid, ccbin, ShareLock);
/*
* We must send out an sinval message for the domain, to ensure that
@@ -3098,7 +3098,12 @@ AlterDomainValidateConstraint(List *names, const char *constrName)
val = SysCacheGetAttrNotNull(CONSTROID, tuple, Anum_pg_constraint_conbin);
conbin = TextDatumGetCString(val);
- validateDomainCheckConstraint(domainoid, conbin);
+ /*
+ * Locking related relations with ShareUpdateExclusiveLock is ok because
+ * not-yet-valid constraints are still enforced against concurrent inserts
+ * or updates.
+ */
+ validateDomainCheckConstraint(domainoid, conbin, ShareUpdateExclusiveLock);
/*
* Now update the catalog, while we have the door open.
@@ -3191,9 +3196,16 @@ validateDomainNotNullConstraint(Oid domainoid)
/*
* Verify that all columns currently using the domain satisfy the given check
* constraint expression.
+ *
+ * It is used both when validating existing constraints and when adding newly
+ * created check constraints to a domain.
+ *
+ * The lockmode is used for relations using the domain. It should be
+ * ShareLock when adding a new constraint to the domain, and it can be
+ * ShareUpdateExclusiveLock when validating an existing constraint.
*/
static void
-validateDomainCheckConstraint(Oid domainoid, const char *ccbin)
+validateDomainCheckConstraint(Oid domainoid, const char *ccbin, LOCKMODE lockmode)
{
Expr *expr = (Expr *) stringToNode(ccbin);
List *rels;
@@ -3210,9 +3222,7 @@ validateDomainCheckConstraint(Oid domainoid, const char *ccbin)
exprstate = ExecPrepareExpr(expr, estate);
/* Fetch relation list with attributes based on this domain */
- /* ShareLock is sufficient to prevent concurrent data changes */
-
- rels = get_rels_with_domain(domainoid, ShareLock);
+ rels = get_rels_with_domain(domainoid, lockmode);
foreach(rt, rels)
{
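The rule stated in the new comment boils down to: take the stronger lock only when the constraint did not exist before. A trivial sketch of that decision (hypothetical helper, not part of the patch):

static LOCKMODE
domain_validation_lockmode(bool constraint_is_new)
{
	/*
	 * A pre-existing NOT VALID constraint is already enforced against
	 * concurrent inserts and updates, so ShareUpdateExclusiveLock is enough;
	 * a brand-new constraint needs ShareLock to keep concurrent writers out.
	 */
	return constraint_is_new ? ShareLock : ShareUpdateExclusiveLock;
}
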
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index d12a3ca0684..a0770e86796 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -330,8 +330,8 @@ InitShmemIndex(void)
*/
HTAB *
ShmemInitHash(const char *name, /* table string name for shmem index */
- long init_size, /* initial table size */
- long max_size, /* max size of the table */
+ int64 init_size, /* initial table size */
+ int64 max_size, /* max size of the table */
HASHCTL *infoP, /* info about key and bucket size */
int hash_flags) /* info about infoP */
{
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index f8c88147160..233b85b623d 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -443,7 +443,7 @@ void
LockManagerShmemInit(void)
{
HASHCTL info;
- long init_table_size,
+ int64 init_table_size,
max_table_size;
bool found;
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index c07fb588355..c1d8511ad17 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -1145,7 +1145,7 @@ void
PredicateLockShmemInit(void)
{
HASHCTL info;
- long max_table_size;
+ int64 max_table_size;
Size requestSize;
bool found;
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 0be307d2ca0..5427da5bc1b 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -156,7 +156,6 @@ REPLICATION_SLOT_DROP "Waiting for a replication slot to become inactive so it c
RESTORE_COMMAND "Waiting for <xref linkend="guc-restore-command"/> to complete."
SAFE_SNAPSHOT "Waiting to obtain a valid snapshot for a <literal>READ ONLY DEFERRABLE</literal> transaction."
SYNC_REP "Waiting for confirmation from a remote server during synchronous replication."
-WAL_BUFFER_INIT "Waiting on WAL buffer to be initialized."
WAL_RECEIVER_EXIT "Waiting for the WAL receiver to exit."
WAL_RECEIVER_WAIT_START "Waiting for startup process to send initial data for streaming replication."
WAL_SUMMARY_READY "Waiting for a new WAL summary to be generated."
@@ -316,6 +315,7 @@ XidGen "Waiting to allocate a new transaction ID."
ProcArray "Waiting to access the shared per-process data structures (typically, to get a snapshot or report a session's transaction ID)."
SInvalRead "Waiting to retrieve messages from the shared catalog invalidation queue."
SInvalWrite "Waiting to add a message to the shared catalog invalidation queue."
+WALBufMapping "Waiting to replace a page in WAL buffers."
WALWrite "Waiting for WAL buffers to be written to disk."
ControlFile "Waiting to read or update the <filename>pg_control</filename> file or create a new WAL file."
MultiXactGen "Waiting to read or update shared multixact state."
diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c
index 25865b660ef..3a059f4ace0 100644
--- a/src/backend/utils/adt/dbsize.c
+++ b/src/backend/utils/adt/dbsize.c
@@ -938,6 +938,9 @@ pg_relation_filenode(PG_FUNCTION_ARGS)
*
* We don't fail but return NULL if we cannot find a mapping.
*
+ * Temporary relations are not detected; NULL is returned for them (see
+ * RelidByRelfilenumber() for the reasons).
+ *
* InvalidOid can be passed instead of the current database's default
* tablespace.
*/
diff --git a/src/backend/utils/cache/relfilenumbermap.c b/src/backend/utils/cache/relfilenumbermap.c
index 8a2f6f8c693..0b6f9cf3fa1 100644
--- a/src/backend/utils/cache/relfilenumbermap.c
+++ b/src/backend/utils/cache/relfilenumbermap.c
@@ -130,6 +130,11 @@ InitializeRelfilenumberMap(void)
* Map a relation's (tablespace, relfilenumber) to a relation's oid and cache
* the result.
*
+ * A temporary relation may share its relfilenumber with a permanent relation
+ * or with temporary relations created in other backends. Uniquely identifying
+ * a temporary relation would require the owning backend's proc number, which
+ * is not available here. Hence, this function ignores temporary relations.
+ *
* Returns InvalidOid if no relation matching the criteria could be found.
*/
Oid
@@ -208,6 +213,9 @@ RelidByRelfilenumber(Oid reltablespace, RelFileNumber relfilenumber)
{
Form_pg_class classform = (Form_pg_class) GETSTRUCT(ntp);
+ if (classform->relpersistence == RELPERSISTENCE_TEMP)
+ continue;
+
if (found)
elog(ERROR,
"unexpected duplicate for tablespace %u, relfilenumber %u",
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index a7094917c20..1aeee5be42a 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -154,7 +154,7 @@ typedef HASHBUCKET *HASHSEGMENT;
typedef struct
{
slock_t mutex; /* spinlock for this freelist */
- long nentries; /* number of entries in associated buckets */
+ int64 nentries; /* number of entries in associated buckets */
HASHELEMENT *freeList; /* chain of free elements */
} FreeListData;
@@ -182,8 +182,8 @@ struct HASHHDR
/* These fields can change, but not in a partitioned table */
/* Also, dsize can't change in a shared table, even if unpartitioned */
- long dsize; /* directory size */
- long nsegs; /* number of allocated segments (<= dsize) */
+ int64 dsize; /* directory size */
+ int64 nsegs; /* number of allocated segments (<= dsize) */
uint32 max_bucket; /* ID of maximum bucket in use */
uint32 high_mask; /* mask to modulo into entire table */
uint32 low_mask; /* mask to modulo into lower half of table */
@@ -191,9 +191,9 @@ struct HASHHDR
/* These fields are fixed at hashtable creation */
Size keysize; /* hash key length in bytes */
Size entrysize; /* total user element size in bytes */
- long num_partitions; /* # partitions (must be power of 2), or 0 */
- long max_dsize; /* 'dsize' limit if directory is fixed size */
- long ssize; /* segment size --- must be power of 2 */
+ int64 num_partitions; /* # partitions (must be power of 2), or 0 */
+ int64 max_dsize; /* 'dsize' limit if directory is fixed size */
+ int64 ssize; /* segment size --- must be power of 2 */
int sshift; /* segment shift = log2(ssize) */
int nelem_alloc; /* number of entries to allocate at once */
bool isfixed; /* if true, don't enlarge */
@@ -236,7 +236,7 @@ struct HTAB
/* We keep local copies of these fixed values to reduce contention */
Size keysize; /* hash key length in bytes */
- long ssize; /* segment size --- must be power of 2 */
+ int64 ssize; /* segment size --- must be power of 2 */
int sshift; /* segment shift = log2(ssize) */
/*
@@ -277,12 +277,12 @@ static bool expand_table(HTAB *hashp);
static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
static void hdefault(HTAB *hashp);
static int choose_nelem_alloc(Size entrysize);
-static bool init_htab(HTAB *hashp, long nelem);
+static bool init_htab(HTAB *hashp, int64 nelem);
pg_noreturn static void hash_corrupted(HTAB *hashp);
static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue,
HASHBUCKET **bucketptr);
-static long next_pow2_long(long num);
-static int next_pow2_int(long num);
+static int64 next_pow2_int64(int64 num);
+static int next_pow2_int(int64 num);
static void register_seq_scan(HTAB *hashp);
static void deregister_seq_scan(HTAB *hashp);
static bool has_seq_scans(HTAB *hashp);
@@ -355,7 +355,7 @@ string_compare(const char *key1, const char *key2, Size keysize)
* large nelem will penalize hash_seq_search speed without buying much.
*/
HTAB *
-hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
+hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
{
HTAB *hashp;
HASHHDR *hctl;
@@ -697,7 +697,7 @@ choose_nelem_alloc(Size entrysize)
* arrays
*/
static bool
-init_htab(HTAB *hashp, long nelem)
+init_htab(HTAB *hashp, int64 nelem)
{
HASHHDR *hctl = hashp->hctl;
HASHSEGMENT *segp;
@@ -780,10 +780,10 @@ init_htab(HTAB *hashp, long nelem)
* NB: assumes that all hash structure parameters have default values!
*/
Size
-hash_estimate_size(long num_entries, Size entrysize)
+hash_estimate_size(int64 num_entries, Size entrysize)
{
Size size;
- long nBuckets,
+ int64 nBuckets,
nSegments,
nDirEntries,
nElementAllocs,
@@ -791,9 +791,9 @@ hash_estimate_size(long num_entries, Size entrysize)
elementAllocCnt;
/* estimate number of buckets wanted */
- nBuckets = next_pow2_long(num_entries);
+ nBuckets = next_pow2_int64(num_entries);
/* # of segments needed for nBuckets */
- nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
+ nSegments = next_pow2_int64((nBuckets - 1) / DEF_SEGSIZE + 1);
/* directory entries */
nDirEntries = DEF_DIRSIZE;
while (nDirEntries < nSegments)
@@ -826,17 +826,17 @@ hash_estimate_size(long num_entries, Size entrysize)
*
* XXX this had better agree with the behavior of init_htab()...
*/
-long
-hash_select_dirsize(long num_entries)
+int64
+hash_select_dirsize(int64 num_entries)
{
- long nBuckets,
+ int64 nBuckets,
nSegments,
nDirEntries;
/* estimate number of buckets wanted */
- nBuckets = next_pow2_long(num_entries);
+ nBuckets = next_pow2_int64(num_entries);
/* # of segments needed for nBuckets */
- nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
+ nSegments = next_pow2_int64((nBuckets - 1) / DEF_SEGSIZE + 1);
/* directory entries */
nDirEntries = DEF_DIRSIZE;
while (nDirEntries < nSegments)
@@ -887,7 +887,7 @@ hash_stats(const char *caller, HTAB *hashp)
HASHHDR *hctl = hashp->hctl;
elog(DEBUG4,
- "hash_stats: Caller: %s Table Name: \"%s\" Accesses: " UINT64_FORMAT " Collisions: " UINT64_FORMAT " Expansions: " UINT64_FORMAT " Entries: %ld Key Size: %zu Max Bucket: %u Segment Count: %ld",
+ "hash_stats: Caller: %s Table Name: \"%s\" Accesses: " UINT64_FORMAT " Collisions: " UINT64_FORMAT " Expansions: " UINT64_FORMAT " Entries: " INT64_FORMAT " Key Size: %zu Max Bucket: %u Segment Count: " INT64_FORMAT,
caller != NULL ? caller : "(unknown)", hashp->tabname, hctl->accesses,
hctl->collisions, hctl->expansions, hash_get_num_entries(hashp),
hctl->keysize, hctl->max_bucket, hctl->nsegs);
@@ -993,7 +993,7 @@ hash_search_with_hash_value(HTAB *hashp,
* Can't split if running in partitioned mode, nor if frozen, nor if
* table is the subject of any active hash_seq_search scans.
*/
- if (hctl->freeList[0].nentries > (long) hctl->max_bucket &&
+ if (hctl->freeList[0].nentries > (int64) hctl->max_bucket &&
!IS_PARTITIONED(hctl) && !hashp->frozen &&
!has_seq_scans(hashp))
(void) expand_table(hashp);
@@ -1332,11 +1332,11 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
/*
* hash_get_num_entries -- get the number of entries in a hashtable
*/
-long
+int64
hash_get_num_entries(HTAB *hashp)
{
int i;
- long sum = hashp->hctl->freeList[0].nentries;
+ int64 sum = hashp->hctl->freeList[0].nentries;
/*
* We currently don't bother with acquiring the mutexes; it's only
@@ -1417,9 +1417,9 @@ hash_seq_search(HASH_SEQ_STATUS *status)
HTAB *hashp;
HASHHDR *hctl;
uint32 max_bucket;
- long ssize;
- long segment_num;
- long segment_ndx;
+ int64 ssize;
+ int64 segment_num;
+ int64 segment_ndx;
HASHSEGMENT segp;
uint32 curBucket;
HASHELEMENT *curElem;
@@ -1548,11 +1548,11 @@ expand_table(HTAB *hashp)
HASHHDR *hctl = hashp->hctl;
HASHSEGMENT old_seg,
new_seg;
- long old_bucket,
+ int64 old_bucket,
new_bucket;
- long new_segnum,
+ int64 new_segnum,
new_segndx;
- long old_segnum,
+ int64 old_segnum,
old_segndx;
HASHBUCKET *oldlink,
*newlink;
@@ -1620,7 +1620,7 @@ expand_table(HTAB *hashp)
currElement = nextElement)
{
nextElement = currElement->link;
- if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
+ if ((int64) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
{
*oldlink = currElement;
oldlink = &currElement->link;
@@ -1644,9 +1644,9 @@ dir_realloc(HTAB *hashp)
{
HASHSEGMENT *p;
HASHSEGMENT *old_p;
- long new_dsize;
- long old_dirsize;
- long new_dirsize;
+ int64 new_dsize;
+ int64 old_dirsize;
+ int64 new_dirsize;
if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
return false;
@@ -1780,8 +1780,8 @@ hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr)
{
HASHHDR *hctl = hashp->hctl;
HASHSEGMENT segp;
- long segment_num;
- long segment_ndx;
+ int64 segment_num;
+ int64 segment_ndx;
uint32 bucket;
bucket = calc_bucket(hctl, hashvalue);
@@ -1814,25 +1814,21 @@ hash_corrupted(HTAB *hashp)
/* calculate ceil(log base 2) of num */
int
-my_log2(long num)
+my_log2(int64 num)
{
/*
* guard against too-large input, which would be invalid for
* pg_ceil_log2_*()
*/
- if (num > LONG_MAX / 2)
- num = LONG_MAX / 2;
+ if (num > PG_INT64_MAX / 2)
+ num = PG_INT64_MAX / 2;
-#if SIZEOF_LONG < 8
- return pg_ceil_log2_32(num);
-#else
return pg_ceil_log2_64(num);
-#endif
}
-/* calculate first power of 2 >= num, bounded to what will fit in a long */
-static long
-next_pow2_long(long num)
+/* calculate first power of 2 >= num, bounded to what will fit in an int64 */
+static int64
+next_pow2_int64(int64 num)
{
/* my_log2's internal range check is sufficient */
return 1L << my_log2(num);
@@ -1840,7 +1836,7 @@ next_pow2_long(long num)
/* calculate first power of 2 >= num, bounded to what will fit in an int */
static int
-next_pow2_int(long num)
+next_pow2_int(int64 num)
{
if (num > INT_MAX / 2)
num = INT_MAX / 2;
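The clamp in my_log2() exists so that next_pow2_int64() can never shift past the top bit of a signed 64-bit value. A tiny illustration of the arithmetic (worst-case values in the comments):

static int64
clamped_pow2_demo(void)
{
	int64		n = PG_INT64_MAX;	/* absurdly large nelem request */
	int			lg = my_log2(n);	/* clamped to ceil(log2(PG_INT64_MAX / 2)) = 62 */

	return (int64) 1 << lg;			/* 2^62, still representable in int64 */
}
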
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index ea35f30f494..65561cc6bc3 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -271,12 +271,23 @@ Snapshot
GetTransactionSnapshot(void)
{
/*
- * This should not be called while doing logical decoding. Historic
- * snapshots are only usable for catalog access, not for general-purpose
- * queries.
+ * Return historic snapshot if doing logical decoding.
+ *
+ * Historic snapshots are only usable for catalog access, not for
+ * general-purpose queries. The caller is responsible for ensuring that
+ * the snapshot is used correctly! (PostgreSQL code never calls this
+ * during logical decoding, but extensions can do it.)
*/
if (HistoricSnapshotActive())
- elog(ERROR, "cannot take query snapshot during logical decoding");
+ {
+ /*
+ * We'll never need a non-historic transaction snapshot in this
+ * (sub-)transaction, so there's no need to be careful to set one up
+ * for later calls to GetTransactionSnapshot().
+ */
+ Assert(!FirstSnapshotSet);
+ return HistoricSnapshot;
+ }
/* First call in transaction? */
if (!FirstSnapshotSet)