Teach tuplesort.c about "top N" sorting, in which only the first N tuples

need be returned. We keep a heap of the current best N tuples and sift-up new tuples into it as we scan the input. For M input tuples this means only about M*log(N) comparisons instead of M*log(M), not to mention a lot less workspace when N is small --- avoiding spill-to-disk for large M is actually the most attractive thing about it. Patch includes planner and executor support for invoking this facility in ORDER BY ... LIMIT queries. Greg Stark, with some editorialization by moi.
author: Tom Lane <tgl@sss.pgh.pa.us> 2007-05-04 01:13:45 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2007-05-04 01:13:45 +0000
commit: d26559dbf356736923b26704ce76ca820ff3a2b0 (patch)
tree: e899e3b4eb9f0d34f598816f69a9a60379987391 /src/backend/optimizer/path
parent: 0fef38da215cdc9b01b1b623c2e37d7414b91843 (diff)
1 files changed, 60 insertions, 17 deletions
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 9cd3a51d897..82dfc77f065 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -54,7 +54,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.181 2007/04/21 21:01:44 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.182 2007/05/04 01:13:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -922,6 +922,10 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
  *		disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem)))
  *		cpu = comparison_cost * t * log2(t)
  *
+ * If the sort is bounded (i.e., only the first k result tuples are needed)
+ * and k tuples can fit into work_mem, we use a heap method that keeps only
+ * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
+ *
  * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
  * accesses (XXX can't we refine that guess?)
  *
@@ -932,6 +936,7 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
  * 'input_cost' is the total cost for reading the input data
  * 'tuples' is the number of tuples in the relation
  * 'width' is the average tuple width in bytes
+ * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
  *
  * NOTE: some callers currently pass NIL for pathkeys because they
  * can't conveniently supply the sort keys.  Since this routine doesn't
@@ -942,11 +947,14 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
  */
 void
 cost_sort(Path *path, PlannerInfo *root,
-		  List *pathkeys, Cost input_cost, double tuples, int width)
+		  List *pathkeys, Cost input_cost, double tuples, int width,
+		  double limit_tuples)
 {
 	Cost		startup_cost = input_cost;
 	Cost		run_cost = 0;
-	double		nbytes = relation_byte_size(tuples, width);
+	double		input_bytes = relation_byte_size(tuples, width);
+	double		output_bytes;
+	double		output_tuples;
 	long		work_mem_bytes = work_mem * 1024L;
 
 	if (!enable_sort)
@@ -959,23 +967,39 @@ cost_sort(Path *path, PlannerInfo *root,
 	if (tuples < 2.0)
 		tuples = 2.0;
 
-	/*
-	 * CPU costs
-	 *
-	 * Assume about two operator evals per tuple comparison and N log2 N
-	 * comparisons
-	 */
-	startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+	/* Do we have a useful LIMIT? */
+	if (limit_tuples > 0 && limit_tuples < tuples)
+	{
+		output_tuples = limit_tuples;
+		output_bytes = relation_byte_size(output_tuples, width);
+	}
+	else
+	{
+		output_tuples = tuples;
+		output_bytes = input_bytes;
+	}
 
-	/* disk costs */
-	if (nbytes > work_mem_bytes)
+	if (output_bytes > work_mem_bytes)
 	{
-		double		npages = ceil(nbytes / BLCKSZ);
-		double		nruns = (nbytes / work_mem_bytes) * 0.5;
+		/*
+		 * We'll have to use a disk-based sort of all the tuples
+		 */
+		double		npages = ceil(input_bytes / BLCKSZ);
+		double		nruns = (input_bytes / work_mem_bytes) * 0.5;
 		double		mergeorder = tuplesort_merge_order(work_mem_bytes);
 		double		log_runs;
 		double		npageaccesses;
 
+		/*
+		 * CPU costs
+		 *
+		 * Assume about two operator evals per tuple comparison and N log2 N
+		 * comparisons
+		 */
+		startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+
+		/* Disk costs */
+
 		/* Compute logM(r) as log(r) / log(M) */
 		if (nruns > mergeorder)
 			log_runs = ceil(log(nruns) / log(mergeorder));
@@ -986,10 +1010,27 @@ cost_sort(Path *path, PlannerInfo *root,
 		startup_cost += npageaccesses *
 			(seq_page_cost * 0.75 + random_page_cost * 0.25);
 	}
+	else if (tuples > 2 * output_tuples || input_bytes > work_mem_bytes)
+	{
+		/*
+		 * We'll use a bounded heap-sort keeping just K tuples in memory,
+		 * for a total number of tuple comparisons of N log2 K; but the
+		 * constant factor is a bit higher than for quicksort.  Tweak it
+		 * so that the cost curve is continuous at the crossover point.
+		 */
+		startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(2.0 * output_tuples);
+	}
+	else
+	{
+		/* We'll use plain quicksort on all the input tuples */
+		startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+	}
 
 	/*
 	 * Also charge a small amount (arbitrarily set equal to operator cost) per
-	 * extracted tuple.
+	 * extracted tuple.  Note it's correct to use tuples not output_tuples
+	 * here --- the upper LIMIT will pro-rate the run cost so we'd be double
+	 * counting the LIMIT otherwise.
 	 */
 	run_cost += cpu_operator_cost * tuples;
 
@@ -1431,7 +1472,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
 				  outersortkeys,
 				  outer_path->total_cost,
 				  outer_path_rows,
-				  outer_path->parent->width);
+				  outer_path->parent->width,
+				  -1.0);
 		startup_cost += sort_path.startup_cost;
 		run_cost += (sort_path.total_cost - sort_path.startup_cost)
 			* outerscansel;
@@ -1450,7 +1492,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
 				  innersortkeys,
 				  inner_path->total_cost,
 				  inner_path_rows,
-				  inner_path->parent->width);
+				  inner_path->parent->width,
+				  -1.0);
 		startup_cost += sort_path.startup_cost;
 		run_cost += (sort_path.total_cost - sort_path.startup_cost)
 			* innerscansel * rescanratio;
author	Tom Lane <tgl@sss.pgh.pa.us>	2007-05-04 01:13:45 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2007-05-04 01:13:45 +0000
commit	d26559dbf356736923b26704ce76ca820ff3a2b0 (patch)
tree	e899e3b4eb9f0d34f598816f69a9a60379987391 /src/backend/optimizer/path
parent	0fef38da215cdc9b01b1b623c2e37d7414b91843 (diff)