diff options
Diffstat (limited to 'src/backend/access/nbtree/nbtpage.c')
-rw-r--r-- | src/backend/access/nbtree/nbtpage.c | 196 |
1 files changed, 189 insertions, 7 deletions
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index fc744cf9fdb..530d924bff8 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -32,7 +32,9 @@ #include "storage/indexfsm.h" #include "storage/lmgr.h" #include "storage/predicate.h" +#include "storage/procarray.h" #include "utils/memdebug.h" +#include "utils/memutils.h" #include "utils/snapmgr.h" static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); @@ -57,6 +59,8 @@ static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child, OffsetNumber *poffset, BlockNumber *topparent, BlockNumber *topparentrightsib); +static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target, + FullTransactionId safexid); /* * _bt_initmetapage() -- Fill a page buffer with a correct metapage image @@ -209,13 +213,9 @@ _bt_vacuum_needs_cleanup(Relation rel) * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the * total size of the index. We can reasonably expect (though are not * guaranteed) to be able to recycle this many pages if we decide to do a - * btvacuumscan call during the ongoing btvacuumcleanup. - * - * Our approach won't reliably avoid "wasted" cleanup-only btvacuumscan - * calls. That is, we can end up scanning the entire index without ever - * placing even 1 of the prev_num_delpages pages in the free space map, at - * least in certain narrow cases (see nbtree/README section on recycling - * deleted pages for details). This rarely comes up in practice. + * btvacuumscan call during the ongoing btvacuumcleanup. For further + * details see the nbtree/README section on placing deleted pages in the + * FSM. */ if (prev_num_delpages > 0 && prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20) @@ -2729,6 +2729,14 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, if (target <= scanblkno) stats->pages_deleted++; + /* + * Remember information about the target page (now a newly deleted page) + * in dedicated vstate space for later. The page will be considered as a + * candidate to place in the FSM at the end of the current btvacuumscan() + * call. + */ + _bt_pendingfsm_add(vstate, target, safexid); + return true; } @@ -2873,3 +2881,177 @@ _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack, subtreeparent, poffset, topparent, topparentrightsib); } + +/* + * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize + * optimization. + * + * Called at the start of a btvacuumscan(). Caller's cleanuponly argument + * indicates if ongoing VACUUM has not (and will not) call btbulkdelete(). + * + * We expect to allocate memory inside VACUUM's top-level memory context here. + * The working buffer is subject to a limit based on work_mem. Our strategy + * when the array can no longer grow within the bounds of that limit is to + * stop saving additional newly deleted pages, while proceeding as usual with + * the pages that we can fit. + */ +void +_bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly) +{ + int64 maxbufsize; + + /* + * Don't bother with optimization in cleanup-only case -- we don't expect + * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan() + * can only take place because this optimization didn't work out during + * the last VACUUM. + */ + if (cleanuponly) + return; + + /* + * Cap maximum size of array so that we always respect work_mem. Avoid + * int overflow here. + */ + vstate->bufsize = 256; + maxbufsize = (work_mem * 1024L) / sizeof(BTPendingFSM); + maxbufsize = Min(maxbufsize, INT_MAX); + maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM)); + /* Stay sane with small work_mem */ + maxbufsize = Max(maxbufsize, vstate->bufsize); + vstate->maxbufsize = maxbufsize; + + /* Allocate buffer, indicate that there are currently 0 pending pages */ + vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize); + vstate->npendingpages = 0; +} + +/* + * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during + * the ongoing VACUUM operation) into the free space map -- though only when + * it is actually safe to do so by now. + * + * Called at the end of a btvacuumscan(), just before free space map vacuuming + * takes place. + * + * Frees memory allocated by _bt_pendingfsm_init(), if any. + */ +void +_bt_pendingfsm_finalize(Relation rel, BTVacState *vstate) +{ + IndexBulkDeleteResult *stats = vstate->stats; + + Assert(stats->pages_newly_deleted >= vstate->npendingpages); + + if (vstate->npendingpages == 0) + { + /* Just free memory when nothing to do */ + if (vstate->pendingpages) + pfree(vstate->pendingpages); + + return; + } + +#ifdef DEBUG_BTREE_PENDING_FSM + + /* + * Debugging aid: Sleep for 5 seconds to greatly increase the chances of + * placing pending pages in the FSM. Note that the optimization will + * never be effective without some other backend concurrently consuming an + * XID. + */ + pg_usleep(5000000L); +#endif + + /* + * Recompute VACUUM XID boundaries. + * + * We don't actually care about the oldest non-removable XID. Computing + * the oldest such XID has a useful side-effect that we rely on: it + * forcibly updates the XID horizon state for this backend. This step is + * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize + * that it is now safe to recycle newly deleted pages without this step. + */ + GetOldestNonRemovableTransactionId(NULL); + + for (int i = 0; i < vstate->npendingpages; i++) + { + BlockNumber target = vstate->pendingpages[i].target; + FullTransactionId safexid = vstate->pendingpages[i].safexid; + + /* + * Do the equivalent of checking BTPageIsRecyclable(), but without + * accessing the page again a second time. + * + * Give up on finding the first non-recyclable page -- all later pages + * must be non-recyclable too, since _bt_pendingfsm_add() adds pages + * to the array in safexid order. + */ + if (!GlobalVisCheckRemovableFullXid(NULL, safexid)) + break; + + RecordFreeIndexPage(rel, target); + stats->pages_free++; + } + + pfree(vstate->pendingpages); +} + +/* + * Maintain array of pages that were deleted during current btvacuumscan() + * call, for use in _bt_pendingfsm_finalize() + */ +static void +_bt_pendingfsm_add(BTVacState *vstate, + BlockNumber target, + FullTransactionId safexid) +{ + Assert(vstate->npendingpages <= vstate->bufsize); + Assert(vstate->bufsize <= vstate->maxbufsize); + +#ifdef USE_ASSERT_CHECKING + + /* + * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the + * array will always be in safexid order (since that is the order that we + * save them in here) + */ + if (vstate->npendingpages > 0) + { + FullTransactionId lastsafexid = + vstate->pendingpages[vstate->npendingpages - 1].safexid; + + Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid)); + } +#endif + + /* + * If temp buffer reaches maxbufsize/work_mem capacity then we discard + * information about this page. + * + * Note that this also covers the case where we opted to not use the + * optimization in _bt_pendingfsm_init(). + */ + if (vstate->npendingpages == vstate->maxbufsize) + return; + + /* Consider enlarging buffer */ + if (vstate->npendingpages == vstate->bufsize) + { + int newbufsize = vstate->bufsize * 2; + + /* Respect work_mem */ + if (newbufsize > vstate->maxbufsize) + newbufsize = vstate->maxbufsize; + + vstate->bufsize = newbufsize; + vstate->pendingpages = + repalloc(vstate->pendingpages, + sizeof(BTPendingFSM) * vstate->bufsize); + } + + /* Save metadata for newly deleted page */ + vstate->pendingpages[vstate->npendingpages].target = target; + vstate->pendingpages[vstate->npendingpages].safexid = safexid; + vstate->npendingpages++; +} |