summary refs log tree commit diff
path: root/src/backend/access/nbtree/nbtree.c
diff options
context:
space:
mode:
author: Peter Geoghegan <pg@bowt.ie> 2024-04-06 11:47:10 -0400
committer: Peter Geoghegan <pg@bowt.ie> 2024-04-06 11:47:10 -0400
commit: 5bf748b86bc6786a3fc57fc7ce296c37da6564b0 (patch)
tree: cdf5b28c807516e1b25c716beb77f7592a1c941b /src/backend/access/nbtree/nbtree.c
parent: ddd9e43a92417dd0c2b60822d6e75862c73b139a (diff)
Enhance nbtree ScalarArrayOp execution.
Commit 9e8da0f7 taught nbtree to handle ScalarArrayOpExpr quals natively. This works by pushing down the full context (the array keys) to the nbtree index AM, enabling it to execute multiple primitive index scans that the planner treats as one continuous index scan/index path. This earlier enhancement enabled nbtree ScalarArrayOp index-only scans. It also allowed scans with ScalarArrayOp quals to return ordered results (with some notable restrictions, described further down). Take this general approach a lot further: teach nbtree SAOP index scans to decide how to execute ScalarArrayOp scans (when and where to start the next primitive index scan) based on physical index characteristics. This can be far more efficient. All SAOP scans will now reliably avoid duplicative leaf page accesses (just like any other nbtree index scan). SAOP scans whose array keys are naturally clustered together now require far fewer index descents, since we'll reliably avoid starting a new primitive scan just to get to a later offset from the same leaf page. The scan's arrays now advance using binary searches for the array element that best matches the next tuple's attribute value. Required scan key arrays (i.e. arrays from scan keys that can terminate the scan) ratchet forward in lockstep with the index scan. Non-required arrays (i.e. arrays from scan keys that can only exclude non-matching tuples) "advance" without the process ever rolling over to a higher-order array. Naturally, only required SAOP scan keys trigger skipping over leaf pages (non-required arrays cannot safely end or start primitive index scans). Consequently, even index scans of a composite index with a high-order inequality scan key (which we'll mark required) and a low-order SAOP scan key (which we won't mark required) now avoid repeating leaf page accesses -- that benefit isn't limited to simpler equality-only cases. 
In general, all nbtree index scans now output tuples as if they were one continuous index scan -- even scans that mix a high-order inequality with lower-order SAOP equalities reliably output tuples in index order. This allows us to remove a couple of special cases that were applied when building index paths with SAOP clauses during planning. Bugfix commit 807a40c5 taught the planner to avoid generating unsafe path keys: path keys on a multicolumn index path, with a SAOP clause on any attribute beyond the first/most significant attribute. These cases are now all safe, so we go back to generating path keys without regard for the presence of SAOP clauses (just like with any other clause type). Affected queries can now exploit scan output order in all the usual ways (e.g., certain "ORDER BY ... LIMIT n" queries can now terminate early). Also undo changes from follow-up bugfix commit a4523c5a, which taught the planner to produce alternative index paths, with path keys, but without low-order SAOP index quals (filter quals were used instead). We'll no longer generate these alternative paths, since they can no longer offer any meaningful advantages over standard index qual paths. Affected queries thereby avoid all of the disadvantages that come from using filter quals within index scan nodes. They can avoid extra heap page accesses from using filter quals to exclude non-matching tuples (index quals will never have that problem). They can also skip over irrelevant sections of the index in more cases (though only when nbtree determines that starting another primitive scan actually makes sense). There is a theoretical risk that removing restrictions on SAOP index paths from the planner will break compatibility with amcanorder-based index AMs maintained as extensions. Such an index AM could have the same limitations around ordered SAOP scans as nbtree had up until now. 
Adding a pro forma incompatibility item about the issue to the Postgres 17 release notes seems like a good idea. Author: Peter Geoghegan <pg@bowt.ie> Author: Matthias van de Meent <boekewurm+postgres@gmail.com> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Matthias van de Meent <boekewurm+postgres@gmail.com> Reviewed-By: Tomas Vondra <tomas.vondra@enterprisedb.com> Discussion: https://postgr.es/m/CAH2-Wz=ksvN_sjcnD1+Bt-WtifRA5ok48aDYnq3pkKhxgMQpcw@mail.gmail.com
Diffstat (limited to 'src/backend/access/nbtree/nbtree.c')
-rw-r--r--  src/backend/access/nbtree/nbtree.c  202
1 file changed, 118 insertions(+), 84 deletions(-)
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 41df1027d2d..686a3206f72 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -40,6 +40,9 @@
/*
* BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
*
+ * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
+ * scan to advance it via another call to _bt_first.
+ *
* BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
* a new page; others must wait.
*
@@ -47,11 +50,11 @@
* to a new page; some process can start doing that.
*
* BTPARALLEL_DONE indicates that the scan is complete (including error exit).
- * We reach this state once for every distinct combination of array keys.
*/
typedef enum
{
BTPARALLEL_NOT_INITIALIZED,
+ BTPARALLEL_NEED_PRIMSCAN,
BTPARALLEL_ADVANCING,
BTPARALLEL_IDLE,
BTPARALLEL_DONE,
@@ -67,10 +70,14 @@ typedef struct BTParallelScanDescData
BTPS_State btps_pageStatus; /* indicates whether next page is
* available for scan. see above for
* possible states of parallel scan. */
- int btps_arrayKeyCount; /* count indicating number of array scan
- * keys processed by parallel scan */
- slock_t btps_mutex; /* protects above variables */
+ slock_t btps_mutex; /* protects above variables, btps_arrElems */
ConditionVariable btps_cv; /* used to synchronize parallel scan */
+
+ /*
+ * btps_arrElems is used when scans need to schedule another primitive
+ * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
+ */
+ int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
} BTParallelScanDescData;
typedef struct BTParallelScanDescData *BTParallelScanDesc;
@@ -204,21 +211,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
/* btree indexes are never lossy */
scan->xs_recheck = false;
- /*
- * If we have any array keys, initialize them during first call for a
- * scan. We can't do this in btrescan because we don't know the scan
- * direction at that time.
- */
- if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
- {
- /* punt if we have any unsatisfiable array keys */
- if (so->numArrayKeys < 0)
- return false;
-
- _bt_start_array_keys(scan, dir);
- }
-
- /* This loop handles advancing to the next array elements, if any */
+ /* Each loop iteration performs another primitive index scan */
do
{
/*
@@ -260,8 +253,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
/* If we have a tuple, return it ... */
if (res)
break;
- /* ... otherwise see if we have more array keys to deal with */
- } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
+ /* ... otherwise see if we need another primitive index scan */
+ } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
return res;
}
@@ -276,19 +269,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
int64 ntids = 0;
ItemPointer heapTid;
- /*
- * If we have any array keys, initialize them.
- */
- if (so->numArrayKeys)
- {
- /* punt if we have any unsatisfiable array keys */
- if (so->numArrayKeys < 0)
- return ntids;
-
- _bt_start_array_keys(scan, ForwardScanDirection);
- }
-
- /* This loop handles advancing to the next array elements, if any */
+ /* Each loop iteration performs another primitive index scan */
do
{
/* Fetch the first page & tuple */
@@ -318,8 +299,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
ntids++;
}
}
- /* Now see if we have more array keys to deal with */
- } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
+ /* Now see if we need another primitive index scan */
+ } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
return ntids;
}
@@ -348,10 +329,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
else
so->keyData = NULL;
- so->arrayKeyData = NULL; /* assume no array keys for now */
- so->arraysStarted = false;
- so->numArrayKeys = 0;
+ so->needPrimScan = false;
+ so->scanBehind = false;
so->arrayKeys = NULL;
+ so->orderProcs = NULL;
so->arrayContext = NULL;
so->killedItems = NULL; /* until needed */
@@ -391,7 +372,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
}
so->markItemIndex = -1;
- so->arrayKeyCount = 0;
+ so->needPrimScan = false;
+ so->scanBehind = false;
BTScanPosUnpinIfPinned(so->markPos);
BTScanPosInvalidate(so->markPos);
@@ -425,9 +407,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
scankey,
scan->numberOfKeys * sizeof(ScanKeyData));
so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
-
- /* If any keys are SK_SEARCHARRAY type, set up array-key info */
- _bt_preprocess_array_keys(scan);
+ so->numArrayKeys = 0; /* ditto */
}
/*
@@ -455,7 +435,7 @@ btendscan(IndexScanDesc scan)
/* Release storage */
if (so->keyData != NULL)
pfree(so->keyData);
- /* so->arrayKeyData and so->arrayKeys are in arrayContext */
+ /* so->arrayKeys and so->orderProcs are in arrayContext */
if (so->arrayContext != NULL)
MemoryContextDelete(so->arrayContext);
if (so->killedItems != NULL)
@@ -490,10 +470,6 @@ btmarkpos(IndexScanDesc scan)
BTScanPosInvalidate(so->markPos);
so->markItemIndex = -1;
}
-
- /* Also record the current positions of any array keys */
- if (so->numArrayKeys)
- _bt_mark_array_keys(scan);
}
/*
@@ -504,10 +480,6 @@ btrestrpos(IndexScanDesc scan)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- /* Restore the marked positions of any array keys */
- if (so->numArrayKeys)
- _bt_restore_array_keys(scan);
-
if (so->markItemIndex >= 0)
{
/*
@@ -546,6 +518,12 @@ btrestrpos(IndexScanDesc scan)
if (so->currTuples)
memcpy(so->currTuples, so->markTuples,
so->markPos.nextTupleOffset);
+ /* Reset the scan's array keys (see _bt_steppage for why) */
+ if (so->numArrayKeys)
+ {
+ _bt_start_array_keys(scan, so->currPos.dir);
+ so->needPrimScan = false;
+ }
}
else
BTScanPosInvalidate(so->currPos);
@@ -556,9 +534,10 @@ btrestrpos(IndexScanDesc scan)
* btestimateparallelscan -- estimate storage for BTParallelScanDescData
*/
Size
-btestimateparallelscan(void)
+btestimateparallelscan(int nkeys, int norderbys)
{
- return sizeof(BTParallelScanDescData);
+ /* Pessimistically assume all input scankeys will be output with arrays */
+ return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
}
/*
@@ -572,7 +551,6 @@ btinitparallelscan(void *target)
SpinLockInit(&bt_target->btps_mutex);
bt_target->btps_scanPage = InvalidBlockNumber;
bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
- bt_target->btps_arrayKeyCount = 0;
ConditionVariableInit(&bt_target->btps_cv);
}
@@ -598,7 +576,6 @@ btparallelrescan(IndexScanDesc scan)
SpinLockAcquire(&btscan->btps_mutex);
btscan->btps_scanPage = InvalidBlockNumber;
btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
- btscan->btps_arrayKeyCount = 0;
SpinLockRelease(&btscan->btps_mutex);
}
@@ -608,23 +585,26 @@ btparallelrescan(IndexScanDesc scan)
* or _bt_parallel_done().
*
* The return value is true if we successfully seized the scan and false
- * if we did not. The latter case occurs if no pages remain for the current
- * set of scankeys.
+ * if we did not. The latter case occurs if no pages remain.
*
* If the return value is true, *pageno returns the next or current page
* of the scan (depending on the scan direction). An invalid block number
- * means the scan hasn't yet started, and P_NONE means we've reached the end.
+ * means the scan hasn't yet started, or that caller needs to start the next
+ * primitive index scan (if it's the latter case we'll set so.needPrimScan).
* The first time a participating process reaches the last page, it will return
* true and set *pageno to P_NONE; after that, further attempts to seize the
* scan will return false.
*
* Callers should ignore the value of pageno if the return value is false.
+ *
+ * Callers that are in a position to start a new primitive index scan must
+ * pass first=true (all other callers pass first=false). We just return false
+ * for first=false callers that require another primitive index scan.
*/
bool
-_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- BTPS_State pageStatus;
bool exit_loop = false;
bool status = true;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
@@ -632,28 +612,69 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
*pageno = P_NONE;
+ if (first)
+ {
+ /*
+ * Initialize array related state when called from _bt_first, assuming
+ * that this will either be the first primitive index scan for the
+ * scan, or a previous explicitly scheduled primitive scan.
+ *
+ * Note: so->needPrimScan is only set when a scheduled primitive index
+ * scan is set to be performed in caller's worker process. It should
+ * not be set here by us for the first primitive scan, nor should we
+ * ever set it for a parallel scan that has no array keys.
+ */
+ so->needPrimScan = false;
+ so->scanBehind = false;
+ }
+ else
+ {
+ /*
+ * Don't attempt to seize the scan when backend requires another
+ * primitive index scan unless we're in a position to start it now
+ */
+ if (so->needPrimScan)
+ return false;
+ }
+
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);
while (1)
{
SpinLockAcquire(&btscan->btps_mutex);
- pageStatus = btscan->btps_pageStatus;
- if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
+ if (btscan->btps_pageStatus == BTPARALLEL_DONE)
{
- /* Parallel scan has already advanced to a new set of scankeys. */
+ /* We're done with this parallel index scan */
status = false;
}
- else if (pageStatus == BTPARALLEL_DONE)
+ else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
{
+ Assert(so->numArrayKeys);
+
/*
- * We're done with this set of scankeys. This may be the end, or
- * there could be more sets to try.
+ * If we can start another primitive scan right away, do so.
+ * Otherwise just wait.
*/
- status = false;
+ if (first)
+ {
+ btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
+ for (int i = 0; i < so->numArrayKeys; i++)
+ {
+ BTArrayKeyInfo *array = &so->arrayKeys[i];
+ ScanKey skey = &so->keyData[array->scan_key];
+
+ array->cur_elem = btscan->btps_arrElems[i];
+ skey->sk_argument = array->elem_values[array->cur_elem];
+ }
+ so->needPrimScan = true;
+ so->scanBehind = false;
+ *pageno = InvalidBlockNumber;
+ exit_loop = true;
+ }
}
- else if (pageStatus != BTPARALLEL_ADVANCING)
+ else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
{
/*
* We have successfully seized control of the scan for the purpose
@@ -677,6 +698,12 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
* _bt_parallel_release() -- Complete the process of advancing the scan to a
* new page. We now have the new value btps_scanPage; some other backend
* can now begin advancing the scan.
+ *
+ * Callers whose scan uses array keys must save their scan_page argument so
+ * that it can be passed to _bt_parallel_primscan_schedule, should caller
+ * determine that another primitive index scan is required. If that happens,
+ * scan_page won't be scanned by any backend (unless the next primitive index
+ * scan lands on scan_page).
*/
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
@@ -704,7 +731,6 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
void
_bt_parallel_done(IndexScanDesc scan)
{
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
bool status_changed = false;
@@ -717,13 +743,11 @@ _bt_parallel_done(IndexScanDesc scan)
parallel_scan->ps_offset);
/*
- * Mark the parallel scan as done for this combination of scan keys,
- * unless some other process already did so. See also
- * _bt_advance_array_keys.
+ * Mark the parallel scan as done, unless some other process did so
+ * already
*/
SpinLockAcquire(&btscan->btps_mutex);
- if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
- btscan->btps_pageStatus != BTPARALLEL_DONE)
+ if (btscan->btps_pageStatus != BTPARALLEL_DONE)
{
btscan->btps_pageStatus = BTPARALLEL_DONE;
status_changed = true;
@@ -736,29 +760,39 @@ _bt_parallel_done(IndexScanDesc scan)
}
/*
- * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
- * keys.
+ * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
*
- * Updates the count of array keys processed for both local and parallel
- * scans.
+ * Caller passes the block number most recently passed to _bt_parallel_release
+ * by its backend. Caller successfully schedules the next primitive index scan
+ * if the shared parallel state hasn't been seized since caller's backend last
+ * advanced the scan.
*/
void
-_bt_parallel_advance_array_keys(IndexScanDesc scan)
+_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
+ Assert(so->numArrayKeys);
+
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);
- so->arrayKeyCount++;
SpinLockAcquire(&btscan->btps_mutex);
- if (btscan->btps_pageStatus == BTPARALLEL_DONE)
+ if (btscan->btps_scanPage == prev_scan_page &&
+ btscan->btps_pageStatus == BTPARALLEL_IDLE)
{
btscan->btps_scanPage = InvalidBlockNumber;
- btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
- btscan->btps_arrayKeyCount++;
+ btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
+
+ /* Serialize scan's current array keys */
+ for (int i = 0; i < so->numArrayKeys; i++)
+ {
+ BTArrayKeyInfo *array = &so->arrayKeys[i];
+
+ btscan->btps_arrElems[i] = array->cur_elem;
+ }
}
SpinLockRelease(&btscan->btps_mutex);
}