Add parallelism support for TID Range Scans

In v14, bb437f995 added support for scanning for ranges of TIDs using a dedicated executor node for the purpose. Here, we allow these scans to be parallelized. The range of blocks to scan is divvied up similarly to how a Parallel Seq Scans does that, where 'chunks' of blocks are allocated to each worker and the size of those chunks is slowly reduced down to 1 block per worker by the time we're nearing the end of the scan. Doing that means workers finish at roughly the same time. Allowing TID Range Scans to be parallelized removes the dilemma from the planner as to whether a Parallel Seq Scan will cost less than a non-parallel TID Range Scan due to the CPU concurrency of the Seq Scan (disk costs are not divided by the number of workers). It was possible the planner could choose the Parallel Seq Scan which would result in reading additional blocks during execution than the TID Scan would have. Allowing Parallel TID Range Scans removes the trade-off the planner makes when choosing between reduced CPU costs due to parallelism vs additional I/O from the Parallel Seq Scan due to it scanning blocks from outside of the required TID range. There is also, of course, the traditional parallelism performance benefits to be gained as well, which likely doesn't need to be explained here. Author: Cary Huang <cary.huang@highgo.ca> Author: David Rowley <dgrowleyml@gmail.com> Reviewed-by: Junwang Zhao <zhjwpku@gmail.com> Reviewed-by: Rafia Sabih <rafia.pghackers@gmail.com> Reviewed-by: Steven Niu <niushiji@gmail.com> Discussion: https://postgr.es/m/18f2c002a24.11bc2ab825151706.3749144144619388582@highgo.ca
author: David Rowley <drowley@postgresql.org> 2025-11-27 14:05:04 +1300
committer: David Rowley <drowley@postgresql.org> 2025-11-27 14:05:04 +1300
commit: 0ca3b16973a8bb1c185f56e65edcadc0d9d2c406 (patch)
tree: ab3bcbea310c7cb3650ab5dbde4c5121736ce966 /src/backend/optimizer
parent: 42473b3b31238b15cc3c030b4416b2ee79508d8c (diff)
3 files changed, 49 insertions, 16 deletions
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 8335cf5b5c5..5a7283bd2f5 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1340,8 +1340,9 @@ cost_tidrangescan(Path *path, PlannerInfo *root,
 {
 	Selectivity selectivity;
 	double		pages;
-	Cost		startup_cost = 0;
-	Cost		run_cost = 0;
+	Cost		startup_cost;
+	Cost		cpu_run_cost;
+	Cost		disk_run_cost;
 	QualCost	qpqual_cost;
 	Cost		cpu_per_tuple;
 	QualCost	tid_qual_cost;
@@ -1373,8 +1374,8 @@ cost_tidrangescan(Path *path, PlannerInfo *root,
 	 * page is just a normal sequential page read. NOTE: it's desirable for
 	 * TID Range Scans to cost more than the equivalent Sequential Scans,
 	 * because Seq Scans have some performance advantages such as scan
-	 * synchronization and parallelizability, and we'd prefer one of them to
-	 * be picked unless a TID Range Scan really is better.
+	 * synchronization, and we'd prefer one of them to be picked unless a TID
+	 * Range Scan really is better.
 	 */
 	ntuples = selectivity * baserel->tuples;
 	nseqpages = pages - 1.0;
@@ -1391,7 +1392,7 @@ cost_tidrangescan(Path *path, PlannerInfo *root,
 							  &spc_seq_page_cost);
 
 	/* disk costs; 1 random page and the remainder as seq pages */
-	run_cost += spc_random_page_cost + spc_seq_page_cost * nseqpages;
+	disk_run_cost = spc_random_page_cost + spc_seq_page_cost * nseqpages;
 
 	/* Add scanning CPU costs */
 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
@@ -1403,20 +1404,35 @@ cost_tidrangescan(Path *path, PlannerInfo *root,
 	 * can't be removed, this is a mistake and we're going to underestimate
 	 * the CPU cost a bit.)
 	 */
-	startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple;
+	startup_cost = qpqual_cost.startup + tid_qual_cost.per_tuple;
 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple -
 		tid_qual_cost.per_tuple;
-	run_cost += cpu_per_tuple * ntuples;
+	cpu_run_cost = cpu_per_tuple * ntuples;
 
 	/* tlist eval costs are paid per output row, not per tuple scanned */
 	startup_cost += path->pathtarget->cost.startup;
-	run_cost += path->pathtarget->cost.per_tuple * path->rows;
+	cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows;
+
+	/* Adjust costing for parallelism, if used. */
+	if (path->parallel_workers > 0)
+	{
+		double		parallel_divisor = get_parallel_divisor(path);
+
+		/* The CPU cost is divided among all the workers. */
+		cpu_run_cost /= parallel_divisor;
+
+		/*
+		 * In the case of a parallel plan, the row count needs to represent
+		 * the number of tuples processed per worker.
+		 */
+		path->rows = clamp_row_est(path->rows / parallel_divisor);
+	}
 
 	/* we should not generate this path type when enable_tidscan=false */
 	Assert(enable_tidscan);
 	path->disabled_nodes = 0;
 	path->startup_cost = startup_cost;
-	path->total_cost = startup_cost + run_cost;
+	path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
 }
 
 /*
diff --git a/src/backend/optimizer/path/tidpath.c b/src/backend/optimizer/path/tidpath.c
index 2bfb338b81c..3ddbc10bbdf 100644
--- a/src/backend/optimizer/path/tidpath.c
+++ b/src/backend/optimizer/path/tidpath.c
@@ -490,9 +490,8 @@ ec_member_matches_ctid(PlannerInfo *root, RelOptInfo *rel,
 
 /*
  * create_tidscan_paths
- *	  Create paths corresponding to direct TID scans of the given rel.
- *
- *	  Candidate paths are added to the rel's pathlist (using add_path).
+ *	  Create paths corresponding to direct TID scans of the given rel and add
+ *	  them to the corresponding path list via add_path or add_partial_path.
  */
 bool
 create_tidscan_paths(PlannerInfo *root, RelOptInfo *rel)
@@ -553,7 +552,24 @@ create_tidscan_paths(PlannerInfo *root, RelOptInfo *rel)
 
 		add_path(rel, (Path *) create_tidrangescan_path(root, rel,
 														tidrangequals,
-														required_outer));
+														required_outer,
+														0));
+
+		/* If appropriate, consider parallel tid range scan. */
+		if (rel->consider_parallel && required_outer == NULL)
+		{
+			int			parallel_workers;
+
+			parallel_workers = compute_parallel_worker(rel, rel->pages, -1,
+													   max_parallel_workers_per_gather);
+
+			if (parallel_workers > 0)
+				add_partial_path(rel, (Path *) create_tidrangescan_path(root,
+																		rel,
+																		tidrangequals,
+																		required_outer,
+																		parallel_workers));
+		}
 	}
 
 	/*
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index e4fd6950fad..fd4bd5f93f0 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1262,7 +1262,8 @@ create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, List *tidquals,
  */
 TidRangePath *
 create_tidrangescan_path(PlannerInfo *root, RelOptInfo *rel,
-						 List *tidrangequals, Relids required_outer)
+						 List *tidrangequals, Relids required_outer,
+						 int parallel_workers)
 {
 	TidRangePath *pathnode = makeNode(TidRangePath);
 
@@ -1271,9 +1272,9 @@ create_tidrangescan_path(PlannerInfo *root, RelOptInfo *rel,
 	pathnode->path.pathtarget = rel->reltarget;
 	pathnode->path.param_info = get_baserel_parampathinfo(root, rel,
 														  required_outer);
-	pathnode->path.parallel_aware = false;
+	pathnode->path.parallel_aware = (parallel_workers > 0);
 	pathnode->path.parallel_safe = rel->consider_parallel;
-	pathnode->path.parallel_workers = 0;
+	pathnode->path.parallel_workers = parallel_workers;
 	pathnode->path.pathkeys = NIL;	/* always unordered */
 
 	pathnode->tidrangequals = tidrangequals;
author	David Rowley <drowley@postgresql.org>	2025-11-27 14:05:04 +1300
committer	David Rowley <drowley@postgresql.org>	2025-11-27 14:05:04 +1300
commit	0ca3b16973a8bb1c185f56e65edcadc0d9d2c406 (patch)
tree	ab3bcbea310c7cb3650ab5dbde4c5121736ce966 /src/backend/optimizer
parent	42473b3b31238b15cc3c030b4416b2ee79508d8c (diff)