| author | David Rowley <drowley@postgresql.org> | 2025-11-27 14:05:04 +1300 |
|---|---|---|
| committer | David Rowley <drowley@postgresql.org> | 2025-11-27 14:05:04 +1300 |
| commit | 0ca3b16973a8bb1c185f56e65edcadc0d9d2c406 (patch) | |
| tree | ab3bcbea310c7cb3650ab5dbde4c5121736ce966 /src/backend/access/table | |
| parent | 42473b3b31238b15cc3c030b4416b2ee79508d8c (diff) | |
Add parallelism support for TID Range Scans
In v14, bb437f995 added support for scanning ranges of TIDs using a
dedicated executor node for the purpose. Here, we allow these scans to
be parallelized. The range of blocks to scan is divvied up in the same
way as for a Parallel Seq Scan: 'chunks' of blocks are allocated to each
worker, and the chunk size is gradually reduced down to 1 block per
worker as the scan nears its end. Doing that means workers finish at
roughly the same time, as the sketch below illustrates.
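As an illustration of that ramp-down, here is a minimal standalone C sketch. It is not the PostgreSQL source: the constants mirror PARALLEL_SEQSCAN_NCHUNKS, PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS and PARALLEL_SEQSCAN_MAX_CHUNK_SIZE from tableam.c, but the single-threaded harness and the table size are hypothetical.

```c
#include <stdio.h>
#include <stdint.h>

/* Constants mirroring tableam.c (values as of this commit) */
#define NCHUNKS         2048    /* aim for about this many chunks per scan */
#define RAMPDOWN_CHUNKS 64      /* start shrinking this close to the end */
#define MAX_CHUNK_SIZE  8192    /* cap the chunk size for very large tables */

static uint32_t next_power_of_2(uint32_t v)
{
    uint32_t p = 1;

    while (p < v)
        p <<= 1;
    return p;
}

int main(void)
{
    uint32_t nblocks = 100000;  /* hypothetical number of blocks to scan */
    uint32_t chunk = next_power_of_2(nblocks / NCHUNKS > 1 ?
                                     nblocks / NCHUNKS : 1);
    uint32_t last = 0;
    uint64_t allocated = 0;

    if (chunk > MAX_CHUNK_SIZE)
        chunk = MAX_CHUNK_SIZE;

    while (allocated < nblocks)
    {
        /*
         * Near the end of the scan, halve the chunk size down to 1 so
         * that all workers run out of blocks at about the same time.
         */
        if (chunk > 1 &&
            allocated > nblocks - (uint64_t) chunk * RAMPDOWN_CHUNKS)
            chunk >>= 1;

        if (chunk != last)
        {
            printf("from block %8llu: chunk size %u\n",
                   (unsigned long long) allocated, chunk);
            last = chunk;
        }
        allocated += chunk;
    }
    return 0;
}
```

In the real code each chunk is claimed atomically by whichever worker asks next; the sketch collapses that into a single loop to show only how the chunk size decays.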
Allowing TID Range Scans to be parallelized removes a dilemma from the
planner: whether to prefer a Parallel Seq Scan over a non-parallel TID
Range Scan for the reduced CPU costs of parallelism (disk costs are not
divided by the number of workers), at the price of additional I/O from
scanning blocks outside of the required TID range. It was possible for
the planner to choose a Parallel Seq Scan that read more blocks during
execution than the TID Range Scan would have. There are also, of
course, the traditional performance benefits of parallelism to be
gained, which likely don't need to be explained here.
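To put hypothetical numbers on the trade-off: with a 10,000-block table
and a TID range covering 500 of those blocks, a 4-worker Parallel Seq
Scan divides the CPU work but still reads all 10,000 blocks, whereas the
serial TID Range Scan reads only the 500 blocks in range. Whichever the
planner chose, something was wasted; a Parallel TID Range Scan reads
just the 500 blocks and divides the CPU cost too.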
Author: Cary Huang <cary.huang@highgo.ca>
Author: David Rowley <dgrowleyml@gmail.com>
Reviewed-by: Junwang Zhao <zhjwpku@gmail.com>
Reviewed-by: Rafia Sabih <rafia.pghackers@gmail.com>
Reviewed-by: Steven Niu <niushiji@gmail.com>
Discussion: https://postgr.es/m/18f2c002a24.11bc2ab825151706.3749144144619388582@highgo.ca
Diffstat (limited to 'src/backend/access/table')
| -rw-r--r-- | src/backend/access/table/tableam.c | 148 |
1 file changed, 109 insertions(+), 39 deletions(-)
```diff
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index 5e41404937e..1e099febdc8 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -188,6 +188,37 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
                                             pscan, flags);
 }
 
+TableScanDesc
+table_beginscan_parallel_tidrange(Relation relation,
+                                  ParallelTableScanDesc pscan)
+{
+    Snapshot    snapshot;
+    uint32      flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE;
+    TableScanDesc sscan;
+
+    Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator));
+
+    /* disable syncscan in parallel tid range scan. */
+    pscan->phs_syncscan = false;
+
+    if (!pscan->phs_snapshot_any)
+    {
+        /* Snapshot was serialized -- restore it */
+        snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
+        RegisterSnapshot(snapshot);
+        flags |= SO_TEMP_SNAPSHOT;
+    }
+    else
+    {
+        /* SnapshotAny passed by caller (not serialized) */
+        snapshot = SnapshotAny;
+    }
+
+    sscan = relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
+                                             pscan, flags);
+
+    return sscan;
+}
+
 /* ----------------------------------------------------------------------------
  * Index scan related functions.
@@ -398,6 +429,7 @@ table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
         bpscan->phs_nblocks > NBuffers / 4;
     SpinLockInit(&bpscan->phs_mutex);
     bpscan->phs_startblock = InvalidBlockNumber;
+    bpscan->phs_numblock = InvalidBlockNumber;
     pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
 
     return sizeof(ParallelBlockTableScanDescData);
@@ -416,14 +448,22 @@ table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
  *
  * Determine where the parallel seq scan should start.  This function may be
  * called many times, once by each parallel worker.  We must be careful only
- * to set the startblock once.
+ * to set the phs_startblock and phs_numblock fields once.
+ *
+ * Callers may optionally specify a non-InvalidBlockNumber value for
+ * 'startblock' to force the scan to start at the given page.  Likewise,
+ * 'numblocks' can be specified as a non-InvalidBlockNumber to limit the
+ * number of blocks to scan to that many blocks.
  */
 void
 table_block_parallelscan_startblock_init(Relation rel,
                                          ParallelBlockTableScanWorker pbscanwork,
-                                         ParallelBlockTableScanDesc pbscan)
+                                         ParallelBlockTableScanDesc pbscan,
+                                         BlockNumber startblock,
+                                         BlockNumber numblocks)
 {
     BlockNumber sync_startpage = InvalidBlockNumber;
+    BlockNumber scan_nblocks;
 
     /* Reset the state we use for controlling allocation size. */
     memset(pbscanwork, 0, sizeof(*pbscanwork));
@@ -431,42 +471,36 @@ table_block_parallelscan_startblock_init(Relation rel,
     StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
                      "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
 
-    /*
-     * We determine the chunk size based on the size of the relation. First we
-     * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks but we then
-     * take the next highest power of 2 number of the chunk size. This means
-     * we split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS
-     * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
-     */
-    pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
-                                                       PARALLEL_SEQSCAN_NCHUNKS, 1));
-
-    /*
-     * Ensure we don't go over the maximum chunk size with larger tables. This
-     * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
-     * tables. Too large a chunk size has been shown to be detrimental to
-     * synchronous scan performance.
-     */
-    pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
-                                      PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
-
 retry:
     /* Grab the spinlock. */
     SpinLockAcquire(&pbscan->phs_mutex);
 
     /*
-     * If the scan's startblock has not yet been initialized, we must do so
-     * now. If this is not a synchronized scan, we just start at block 0, but
-     * if it is a synchronized scan, we must get the starting position from
-     * the synchronized scan machinery. We can't hold the spinlock while
-     * doing that, though, so release the spinlock, get the information we
-     * need, and retry. If nobody else has initialized the scan in the
-     * meantime, we'll fill in the value we fetched on the second time
-     * through.
+     * When the caller specified a limit on the number of blocks to scan, set
+     * that in the ParallelBlockTableScanDesc, if it's not been done by
+     * another worker already.
+     */
+    if (numblocks != InvalidBlockNumber &&
+        pbscan->phs_numblock == InvalidBlockNumber)
+    {
+        pbscan->phs_numblock = numblocks;
+    }
+
+    /*
+     * If the scan's phs_startblock has not yet been initialized, we must do
+     * so now.  If a startblock was specified, start there, otherwise if this
+     * is not a synchronized scan, we just start at block 0, but if it is a
+     * synchronized scan, we must get the starting position from the
+     * synchronized scan machinery.  We can't hold the spinlock while doing
+     * that, though, so release the spinlock, get the information we need, and
+     * retry.  If nobody else has initialized the scan in the meantime, we'll
+     * fill in the value we fetched on the second time through.
      */
     if (pbscan->phs_startblock == InvalidBlockNumber)
     {
-        if (!pbscan->base.phs_syncscan)
+        if (startblock != InvalidBlockNumber)
+            pbscan->phs_startblock = startblock;
+        else if (!pbscan->base.phs_syncscan)
             pbscan->phs_startblock = 0;
         else if (sync_startpage != InvalidBlockNumber)
             pbscan->phs_startblock = sync_startpage;
@@ -478,6 +512,34 @@ retry:
         }
     }
     SpinLockRelease(&pbscan->phs_mutex);
+
+    /*
+     * Figure out how many blocks we're going to scan; either all of them, or
+     * just phs_numblock's worth, if a limit has been imposed.
+     */
+    if (pbscan->phs_numblock == InvalidBlockNumber)
+        scan_nblocks = pbscan->phs_nblocks;
+    else
+        scan_nblocks = pbscan->phs_numblock;
+
+    /*
+     * We determine the chunk size based on scan_nblocks.  First we split
+     * scan_nblocks into PARALLEL_SEQSCAN_NCHUNKS chunks then we calculate the
+     * next highest power of 2 number of the result.  This means we split the
+     * blocks we're scanning into somewhere between PARALLEL_SEQSCAN_NCHUNKS
+     * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
+     */
+    pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(scan_nblocks /
+                                                       PARALLEL_SEQSCAN_NCHUNKS, 1));
+
+    /*
+     * Ensure we don't go over the maximum chunk size with larger tables.  This
+     * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
+     * tables.  Too large a chunk size has been shown to be detrimental to
+     * sequential scan performance.
+     */
+    pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
+                                      PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
 }
 
 /*
@@ -493,6 +555,7 @@ table_block_parallelscan_nextpage(Relation rel,
                                   ParallelBlockTableScanWorker pbscanwork,
                                   ParallelBlockTableScanDesc pbscan)
 {
+    BlockNumber scan_nblocks;
     BlockNumber page;
     uint64      nallocated;
@@ -513,7 +576,7 @@ table_block_parallelscan_nextpage(Relation rel,
  *
  * Here we name these ranges of blocks "chunks".  The initial size of
  * these chunks is determined in table_block_parallelscan_startblock_init
- * based on the size of the relation.  Towards the end of the scan, we
+ * based on the number of blocks to scan.  Towards the end of the scan, we
  * start making reductions in the size of the chunks in order to attempt
  * to divide the remaining work over all the workers as evenly as
  * possible.
@@ -530,17 +593,23 @@ table_block_parallelscan_nextpage(Relation rel,
  * phs_nallocated counter will exceed rs_nblocks, because workers will
  * still increment the value, when they try to allocate the next block but
  * all blocks have been allocated already.  The counter must be 64 bits
- * wide because of that, to avoid wrapping around when rs_nblocks is close
- * to 2^32.
+ * wide because of that, to avoid wrapping around when scan_nblocks is
+ * close to 2^32.
  *
  * The actual block to return is calculated by adding the counter to the
- * starting block number, modulo nblocks.
+ * starting block number, modulo phs_nblocks.
  */
+    /* First, figure out how many blocks we're planning on scanning */
+    if (pbscan->phs_numblock == InvalidBlockNumber)
+        scan_nblocks = pbscan->phs_nblocks;
+    else
+        scan_nblocks = pbscan->phs_numblock;
+
     /*
-     * First check if we have any remaining blocks in a previous chunk for
-     * this worker.  We must consume all of the blocks from that before we
-     * allocate a new chunk to the worker.
+     * Now check if we have any remaining blocks in a previous chunk for this
+     * worker.  We must consume all of the blocks from that before we allocate
+     * a new chunk to the worker.
      */
     if (pbscanwork->phsw_chunk_remaining > 0)
     {
@@ -562,7 +631,7 @@ table_block_parallelscan_nextpage(Relation rel,
      * chunk size set to 1.
      */
     if (pbscanwork->phsw_chunk_size > 1 &&
-        pbscanwork->phsw_nallocated > pbscan->phs_nblocks -
+        pbscanwork->phsw_nallocated > scan_nblocks -
         (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
         pbscanwork->phsw_chunk_size >>= 1;
@@ -577,7 +646,8 @@ table_block_parallelscan_nextpage(Relation rel,
         pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
     }
 
-    if (nallocated >= pbscan->phs_nblocks)
+    /* Check if we've run out of blocks to scan */
+    if (nallocated >= scan_nblocks)
         page = InvalidBlockNumber;  /* all blocks have been allocated */
     else
         page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;
```
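To see what the allocation arithmetic at the end of the last hunk does, here is a small standalone sketch with assumed values; it is not PostgreSQL code, though the variable names are borrowed from ParallelBlockTableScanDesc. Allocation counts up to scan_nblocks, while the returned page number is offset by phs_startblock and wrapped modulo phs_nblocks:

```c
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t phs_nblocks = 1000;    /* hypothetical relation size in blocks */
    uint32_t phs_startblock = 995;  /* scan starts near the end of the heap */
    uint32_t scan_nblocks = 10;     /* limit taken from phs_numblock, if set */

    /* Hand out blocks one at a time; real workers claim whole chunks. */
    for (uint64_t nallocated = 0; nallocated < scan_nblocks; nallocated++)
    {
        uint32_t page = (uint32_t) ((nallocated + phs_startblock) % phs_nblocks);

        printf("%u ", page);        /* prints 995..999, then wraps to 0..4 */
    }
    putchar('\n');
    return 0;
}
```

For a parallel TID Range Scan itself no wrap-around should occur in practice, since syncscan is disabled and the caller passes the range's first block as startblock; the modulo exists for synchronized seq scans, which may start mid-heap. Callers wanting the old whole-relation behaviour presumably pass InvalidBlockNumber for both new parameters, per the comment added to table_block_parallelscan_startblock_init.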
