| author | Robert Haas <rhaas@postgresql.org> | 2022-09-27 13:25:21 -0400 |
|---|---|---|
| committer | Robert Haas <rhaas@postgresql.org> | 2022-09-27 13:25:21 -0400 |
| commit | 05d4cbf9b6ba708858984b01ca0fc56d59d4ec7c (patch) | |
| tree | 645e3ac17f002ae33e086dbf871c330986452c35 /src/backend/access/transam/varsup.c | |
| parent | 2f47715cc8649f854b1df28dfc338af9801db217 (diff) | |
Increase width of RelFileNumbers from 32 bits to 56 bits.

RelFileNumbers are now assigned using a separate counter, instead of
being assigned from the OID counter. This counter never wraps around:
if all 2^56 possible RelFileNumbers are used, an internal error
occurs. As the cluster is limited to 2^64 total bytes of WAL, this
limitation should not cause a problem in practice.
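
As a rough, purely illustrative check of that claim (not part of the patch), the two limits above already bound how fast the counter can advance: even if the cluster spent its entire 2^64-byte WAL budget creating relations, each RelFileNumber could account for only 256 bytes of WAL, far less than a relation creation actually writes.

```c
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/*
	 * The WAL address space is 2^64 bytes and the counter is 56 bits wide,
	 * so consuming every RelFileNumber would leave at most 2^64 / 2^56 =
	 * 2^8 bytes of WAL per relation created -- the cluster runs out of WAL
	 * address space long before the counter can be exhausted.
	 */
	uint64_t	wal_bytes_per_relfilenumber = UINT64_C(1) << (64 - 56);

	printf("at most %llu WAL bytes per RelFileNumber\n",
		   (unsigned long long) wal_bytes_per_relfilenumber);
	return 0;
}
```
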
If the counter were 64 bits wide rather than 56 bits wide, we would
need to increase the width of the BufferTag, which might adversely
impact buffer lookup performance. Also, this lets us use bigint for
pg_class.relfilenode and other places where these values are exposed
at the SQL level without worrying about overflow.
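
A minimal sketch of why 56 bits is a convenient width (illustrative only; pack_rel_fork() is a hypothetical helper, not PostgreSQL's actual BufferTag layout): a 56-bit relation number leaves 8 spare bits in a 64-bit word, e.g. for a fork number, and its largest value is still non-negative when viewed as a signed 64-bit integer, i.e. as an SQL bigint.

```c
#include <assert.h>
#include <stdint.h>

/* largest value a 56-bit relation number can take */
#define MAX_56BIT	((UINT64_C(1) << 56) - 1)

/*
 * Hypothetical helper: pack a 56-bit relation number together with an
 * 8-bit fork number into a single 64-bit word, so a buffer tag need not
 * grow when the relation number widens from 32 to 56 bits.
 */
static inline uint64_t
pack_rel_fork(uint64_t relNumber, uint8_t forkNum)
{
	assert(relNumber <= MAX_56BIT);
	return (relNumber << 8) | forkNum;
}

int
main(void)
{
	uint64_t	packed = pack_rel_fork(MAX_56BIT, 3);

	/* both fields survive the round trip through one 64-bit word */
	assert((packed >> 8) == MAX_56BIT);
	assert((packed & 0xFF) == 3);

	/* the whole 56-bit range is non-negative as int64, i.e. fits a bigint */
	assert((int64_t) MAX_56BIT > 0);
	return 0;
}
```
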
This should remove the need to keep "tombstone" files around until
the next checkpoint when relations are removed. We do that to keep
RelFileNumbers from being recycled, but now that won't happen
anyway. However, this patch doesn't actually change anything in
this area; it just makes it possible for a future patch to do so.

Dilip Kumar, based on an idea from Andres Freund, who also reviewed
some earlier versions of the patch. Further review and some
wordsmithing by me. Also reviewed at various points by Ashutosh
Sharma, Vignesh C, Amul Sul, Álvaro Herrera, and Tom Lane.

Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Diffstat (limited to 'src/backend/access/transam/varsup.c')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/backend/access/transam/varsup.c | 209 |

1 file changed, 207 insertions, 2 deletions
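
Before reading the new functions in the diff below, it may help to see the allocation scheme in isolation. The following is a minimal, single-threaded model of the log-ahead/flush-lazily counter that GetNewRelFileNumber() implements; Counter, wal_log() and wal_flush() are hypothetical stand-ins for ShmemVariableCache, LogNextRelFileNumber() and XLogFlush(), the two constants mirror VAR_RELNUMBER_PER_XLOG and VAR_RELNUMBER_NEW_XLOG_THRESHOLD, and the model only demonstrates the invariant next <= flushed <= logged that the real code asserts while holding RelFileNumberGenLock.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PER_XLOG	512		/* plays the role of VAR_RELNUMBER_PER_XLOG */
#define THRESHOLD	256		/* plays the role of VAR_RELNUMBER_NEW_XLOG_THRESHOLD */

typedef struct Counter
{
	uint64_t	next;		/* next value to hand out */
	uint64_t	logged;		/* highest value covered by a WAL record */
	uint64_t	flushed;	/* highest value whose WAL record is known durable */
} Counter;

/* stand-in for LogNextRelFileNumber(): pretend to emit a WAL record */
static void
wal_log(uint64_t upto)
{
	printf("log: counter may reach %llu\n", (unsigned long long) upto);
}

/* stand-in for XLogFlush(): pretend to force that record to disk */
static void
wal_flush(void)
{
	printf("flush WAL\n");
}

static uint64_t
get_next(Counter *c)
{
	/* top up the logged range before it runs out, without flushing yet */
	if (c->logged - c->next <= THRESHOLD)
	{
		c->logged += PER_XLOG;
		wal_log(c->logged);
	}

	/* flush lazily: only when about to step past the flushed range */
	if (c->next >= c->flushed)
	{
		wal_flush();
		c->flushed = c->logged;
	}

	/* same invariant the patch asserts: next <= flushed <= logged */
	assert(c->next <= c->flushed && c->flushed <= c->logged);
	return c->next++;
}

int
main(void)
{
	Counter		c = {0, 0, 0};

	for (int i = 0; i < 2000; i++)
		(void) get_next(&c);

	printf("handed out 2000 values; next=%llu flushed=%llu logged=%llu\n",
		   (unsigned long long) c.next,
		   (unsigned long long) c.flushed,
		   (unsigned long long) c.logged);
	return 0;
}
```

The point of topping up the logged range while roughly VAR_RELNUMBER_NEW_XLOG_THRESHOLD values of the old batch remain is that, in a running server, some unrelated XLogFlush() call usually makes that record durable before the lazy flush branch is reached, so an explicit flush on the allocation path stays rare.
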
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 849a7ce9d6d..f99c697c2f5 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -13,12 +13,16 @@
 
 #include "postgres.h"
 
+#include <unistd.h>
+
 #include "access/clog.h"
 #include "access/commit_ts.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
 #include "access/xlogutils.h"
+#include "catalog/pg_class.h"
+#include "catalog/pg_tablespace.h"
 #include "commands/dbcommands.h"
 #include "miscadmin.h"
 #include "postmaster/autovacuum.h"
@@ -30,6 +34,15 @@
 /* Number of OIDs to prefetch (preallocate) per XLOG write */
 #define VAR_OID_PREFETCH		8192
 
+/* Number of RelFileNumbers to be logged per XLOG write */
+#define VAR_RELNUMBER_PER_XLOG	512
+
+/*
+ * Need to log more if remaining logged RelFileNumbers are less than the
+ * threshold. Valid range could be between 0 to VAR_RELNUMBER_PER_XLOG - 1.
+ */
+#define VAR_RELNUMBER_NEW_XLOG_THRESHOLD	256
+
 /* pointer to "variable cache" in shared memory (set up by shmem.c) */
 VariableCache ShmemVariableCache = NULL;
 
@@ -521,8 +534,7 @@
  * wide, counter wraparound will occur eventually, and therefore it is unwise
  * to assume they are unique unless precautions are taken to make them so.
  * Hence, this routine should generally not be used directly. The only direct
- * callers should be GetNewOidWithIndex() and GetNewRelFileNumber() in
- * catalog/catalog.c.
+ * caller should be GetNewOidWithIndex() in catalog/catalog.c.
  */
 Oid
 GetNewObjectId(void)
@@ -613,6 +625,199 @@
 }
 
 /*
+ * GetNewRelFileNumber
+ *
+ * Similar to GetNewObjectId but instead of new Oid it generates new
+ * relfilenumber.
+ */
+RelFileNumber
+GetNewRelFileNumber(Oid reltablespace, char relpersistence)
+{
+	RelFileNumber result;
+	RelFileNumber nextRelFileNumber,
+				loggedRelFileNumber,
+				flushedRelFileNumber;
+
+	StaticAssertStmt(VAR_RELNUMBER_NEW_XLOG_THRESHOLD < VAR_RELNUMBER_PER_XLOG,
+					 "VAR_RELNUMBER_NEW_XLOG_THRESHOLD must be smaller than VAR_RELNUMBER_PER_XLOG");
+
+	/* safety check, we should never get this far in a HS standby */
+	if (RecoveryInProgress())
+		elog(ERROR, "cannot assign RelFileNumber during recovery");
+
+	if (IsBinaryUpgrade)
+		elog(ERROR, "cannot assign RelFileNumber during binary upgrade");
+
+	LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE);
+
+	nextRelFileNumber = ShmemVariableCache->nextRelFileNumber;
+	loggedRelFileNumber = ShmemVariableCache->loggedRelFileNumber;
+	flushedRelFileNumber = ShmemVariableCache->flushedRelFileNumber;
+
+	Assert(nextRelFileNumber <= flushedRelFileNumber);
+	Assert(flushedRelFileNumber <= loggedRelFileNumber);
+
+	/* check for the wraparound for the relfilenumber counter */
+	if (unlikely(nextRelFileNumber > MAX_RELFILENUMBER))
+		elog(ERROR, "relfilenumber is too large");
+
+	/*
+	 * If the remaining logged relfilenumbers values are less than the
+	 * threshold value then log more. Ideally, we can wait until all
+	 * relfilenumbers have been consumed before logging more. Nevertheless, if
+	 * we do that, we must immediately flush the logged wal record because we
+	 * want to ensure that the nextRelFileNumber is always larger than any
+	 * relfilenumber already in use on disk. And, to maintain that invariant,
+	 * we must make sure that the record we log reaches the disk before any new
+	 * files are created with the newly logged range.
+	 *
+	 * So in order to avoid flushing the wal immediately, we always log before
+	 * consuming all the relfilenumber, and now we only have to flush the newly
+	 * logged relfilenumber wal before consuming the relfilenumber from this
+	 * new range. By the time we need to flush this wal, hopefully, those have
+	 * already been flushed with some other XLogFlush operation.
+	 */
+	if (loggedRelFileNumber - nextRelFileNumber <=
+		VAR_RELNUMBER_NEW_XLOG_THRESHOLD)
+	{
+		XLogRecPtr	recptr;
+
+		loggedRelFileNumber = loggedRelFileNumber + VAR_RELNUMBER_PER_XLOG;
+		recptr = LogNextRelFileNumber(loggedRelFileNumber);
+		ShmemVariableCache->loggedRelFileNumber = loggedRelFileNumber;
+
+		/* remember for the future flush */
+		ShmemVariableCache->loggedRelFileNumberRecPtr = recptr;
+	}
+
+	/*
+	 * If the nextRelFileNumber is already reached to the already flushed
+	 * relfilenumber then flush the WAL for previously logged relfilenumber.
+	 */
+	if (nextRelFileNumber >= flushedRelFileNumber)
+	{
+		XLogFlush(ShmemVariableCache->loggedRelFileNumberRecPtr);
+		ShmemVariableCache->flushedRelFileNumber = loggedRelFileNumber;
+	}
+
+	result = ShmemVariableCache->nextRelFileNumber;
+
+	/* we should never be using any relfilenumber outside the flushed range */
+	Assert(result <= ShmemVariableCache->flushedRelFileNumber);
+
+	(ShmemVariableCache->nextRelFileNumber)++;
+
+	LWLockRelease(RelFileNumberGenLock);
+
+	/*
+	 * Because the RelFileNumber counter only ever increases and never wraps
+	 * around, it should be impossible for the newly-allocated RelFileNumber to
+	 * already be in use. But, if Asserts are enabled, double check that
+	 * there's no main-fork relation file with the new RelFileNumber already on
+	 * disk.
+	 */
+#ifdef USE_ASSERT_CHECKING
+	{
+		RelFileLocatorBackend rlocator;
+		char	   *rpath;
+		BackendId	backend;
+
+		switch (relpersistence)
+		{
+			case RELPERSISTENCE_TEMP:
+				backend = BackendIdForTempRelations();
+				break;
+			case RELPERSISTENCE_UNLOGGED:
+			case RELPERSISTENCE_PERMANENT:
+				backend = InvalidBackendId;
+				break;
+			default:
+				elog(ERROR, "invalid relpersistence: %c", relpersistence);
+		}
+
+		/* this logic should match RelationInitPhysicalAddr */
+		rlocator.locator.spcOid =
+			reltablespace ? reltablespace : MyDatabaseTableSpace;
+		rlocator.locator.dbOid = (reltablespace == GLOBALTABLESPACE_OID) ?
+			InvalidOid : MyDatabaseId;
+		rlocator.locator.relNumber = result;
+
+		/*
+		 * The relpath will vary based on the backend ID, so we must
+		 * initialize that properly here to make sure that any collisions
+		 * based on filename are properly detected.
+		 */
+		rlocator.backend = backend;
+
+		/* check for existing file of same name. */
+		rpath = relpath(rlocator, MAIN_FORKNUM);
+		Assert(access(rpath, F_OK) != 0);
+	}
+#endif
+
+	return result;
+}
+
+/*
+ * SetNextRelFileNumber
+ *
+ * This may only be called during pg_upgrade; it advances the RelFileNumber
+ * counter to the specified value if the current value is smaller than the
+ * input value.
+ */
+void
+SetNextRelFileNumber(RelFileNumber relnumber)
+{
+	/* safety check, we should never get this far in a HS standby */
+	if (RecoveryInProgress())
+		elog(ERROR, "cannot set RelFileNumber during recovery");
+
+	if (!IsBinaryUpgrade)
+		elog(ERROR, "RelFileNumber can be set only during binary upgrade");
+
+	LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE);
+
+	/*
+	 * If previous assigned value of the nextRelFileNumber is already higher
+	 * than the current value then nothing to be done. This is possible
+	 * because during upgrade the objects are not created in relfilenumber
+	 * order.
+	 */
+	if (relnumber <= ShmemVariableCache->nextRelFileNumber)
+	{
+		LWLockRelease(RelFileNumberGenLock);
+		return;
+	}
+
+	/*
+	 * If the new relfilenumber to be set is greater than or equal to already
+	 * flushed relfilenumber then log more and flush immediately.
+	 *
+	 * (This is less efficient than GetNewRelFileNumber, which arranges to
+	 * log some new relfilenumbers before the old batch is exhausted in the
+	 * hope that a flush will happen in the background before any values are
+	 * needed from the new batch. However, since this is only used during
+	 * binary upgrade, it shouldn't really matter.)
+	 */
+	if (relnumber >= ShmemVariableCache->flushedRelFileNumber)
+	{
+		RelFileNumber newlogrelnum;
+
+		newlogrelnum = relnumber + VAR_RELNUMBER_PER_XLOG;
+		XLogFlush(LogNextRelFileNumber(newlogrelnum));
+
+		/* we have flushed whatever we have logged so no pending flush */
+		ShmemVariableCache->loggedRelFileNumber = newlogrelnum;
+		ShmemVariableCache->flushedRelFileNumber = newlogrelnum;
+		ShmemVariableCache->loggedRelFileNumberRecPtr = InvalidXLogRecPtr;
+	}
+
+	ShmemVariableCache->nextRelFileNumber = relnumber;
+
+	LWLockRelease(RelFileNumberGenLock);
+}
+
+/*
  * StopGeneratingPinnedObjectIds
  *
  * This is called once during initdb to force the OID counter up to