summaryrefslogtreecommitdiff
path: root/src/backend/access/nbtree/nbtpage.c
diff options
context:
space:
mode:
authorPeter Geoghegan <pg@bowt.ie>2021-03-21 15:25:39 -0700
committerPeter Geoghegan <pg@bowt.ie>2021-03-21 15:25:39 -0700
commit9dd963ae2534e9614f0abeccaafbd39f1b93ff8a (patch)
tree0f3d448f2e5ad78b14eca30a2d90273676fbeaf0 /src/backend/access/nbtree/nbtpage.c
parent4d399a6fbeb720b34d33441330910b7d853f703d (diff)
Recycle nbtree pages deleted during same VACUUM.
Maintain a simple array of metadata about pages that were deleted during nbtree VACUUM's current btvacuumscan() call. Use this metadata at the end of btvacuumscan() to attempt to place newly deleted pages in the FSM without further delay. It might not yet be safe to place any of the pages in the FSM by then (they may not be deemed recyclable), but we have little to lose and plenty to gain by trying. In practice there is a very good chance that this will work out when vacuuming larger indexes, where scanning the index naturally takes quite a while. This commit doesn't change the page recycling invariants; it merely improves the efficiency of page recycling within the confines of the existing design. Recycle safety is a part of nbtree's implementation of what Lanin & Shasha call "the drain technique". The design happens to use transaction IDs (they're stored in deleted pages), but that in itself doesn't align the cutoff for recycle safety to any of the XID-based cutoffs used by VACUUM (e.g., OldestXmin). All that matters is whether or not _other_ backends might be able to observe various inconsistencies in the tree structure (that they cannot just detect and recover from by moving right). Recycle safety is purely a question of maintaining the consistency (or the apparent consistency) of a physical data structure. Note that running a simple serial test case involving a large range DELETE followed by a VACUUM VERBOSE will probably show that any newly deleted nbtree pages are not yet reusable/recyclable. This is expected in the absence of even one concurrent XID assignment. It is an old implementation restriction. In practice it's unlikely to be the thing that makes recycling remain unsafe, at least with larger indexes, where recycling newly deleted pages during the same VACUUM actually matters. An important high-level goal of this commit (as well as related recent commits e5d8a999 and 9f3665fb) is to make expensive deferred cleanup operations in index AMs rare in general. If index vacuuming frequently depends on the next VACUUM operation finishing off work that the current operation started, then the general behavior of index vacuuming is hard to predict. This is relevant to ongoing work that adds a vacuumlazy.c mechanism to skip index vacuuming in certain cases. Anything that makes the real world behavior of index vacuuming simpler and more linear will also make top-down modeling in vacuumlazy.c more robust. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzk76_P=67iUscb1UN44-gyZL-KgpsXbSxq_bdcMa7Q+wQ@mail.gmail.com
Diffstat (limited to 'src/backend/access/nbtree/nbtpage.c')
-rw-r--r--src/backend/access/nbtree/nbtpage.c196
1 files changed, 189 insertions, 7 deletions
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index fc744cf9fdb..530d924bff8 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -32,7 +32,9 @@
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
+#include "storage/procarray.h"
#include "utils/memdebug.h"
+#include "utils/memutils.h"
#include "utils/snapmgr.h"
static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
@@ -57,6 +59,8 @@ static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
OffsetNumber *poffset,
BlockNumber *topparent,
BlockNumber *topparentrightsib);
+static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,
+ FullTransactionId safexid);
/*
* _bt_initmetapage() -- Fill a page buffer with a correct metapage image
@@ -209,13 +213,9 @@ _bt_vacuum_needs_cleanup(Relation rel)
* Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
* total size of the index. We can reasonably expect (though are not
* guaranteed) to be able to recycle this many pages if we decide to do a
- * btvacuumscan call during the ongoing btvacuumcleanup.
- *
- * Our approach won't reliably avoid "wasted" cleanup-only btvacuumscan
- * calls. That is, we can end up scanning the entire index without ever
- * placing even 1 of the prev_num_delpages pages in the free space map, at
- * least in certain narrow cases (see nbtree/README section on recycling
- * deleted pages for details). This rarely comes up in practice.
+ * btvacuumscan call during the ongoing btvacuumcleanup. For further
+ * details see the nbtree/README section on placing deleted pages in the
+ * FSM.
*/
if (prev_num_delpages > 0 &&
prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
@@ -2729,6 +2729,14 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
if (target <= scanblkno)
stats->pages_deleted++;
+ /*
+ * Remember information about the target page (now a newly deleted page)
+ * in dedicated vstate space for later. The page will be considered as a
+ * candidate to place in the FSM at the end of the current btvacuumscan()
+ * call.
+ */
+ _bt_pendingfsm_add(vstate, target, safexid);
+
return true;
}
@@ -2873,3 +2881,177 @@ _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack,
subtreeparent, poffset,
topparent, topparentrightsib);
}
+
+/*
+ * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize
+ * optimization.
+ *
+ * Called at the start of a btvacuumscan(). Caller's cleanuponly argument
+ * indicates if ongoing VACUUM has not (and will not) call btbulkdelete().
+ *
+ * We expect to allocate memory inside VACUUM's top-level memory context here.
+ * The working buffer is subject to a limit based on work_mem. Our strategy
+ * when the array can no longer grow within the bounds of that limit is to
+ * stop saving additional newly deleted pages, while proceeding as usual with
+ * the pages that we can fit.
+ */
+void
+_bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
+{
+ int64 maxbufsize;
+
+ /*
+ * Don't bother with optimization in cleanup-only case -- we don't expect
+ * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
+ * can only take place because this optimization didn't work out during
+ * the last VACUUM.
+ */
+ if (cleanuponly)
+ return;
+
+ /*
+ * Cap maximum size of array so that we always respect work_mem. Avoid
+ * int overflow here.
+ */
+ vstate->bufsize = 256;
+ maxbufsize = (work_mem * 1024L) / sizeof(BTPendingFSM);
+ maxbufsize = Min(maxbufsize, INT_MAX);
+ maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
+ /* Stay sane with small work_mem */
+ maxbufsize = Max(maxbufsize, vstate->bufsize);
+ vstate->maxbufsize = maxbufsize;
+
+ /* Allocate buffer, indicate that there are currently 0 pending pages */
+ vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
+ vstate->npendingpages = 0;
+}
+
+/*
+ * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during
+ * the ongoing VACUUM operation) into the free space map -- though only when
+ * it is actually safe to do so by now.
+ *
+ * Called at the end of a btvacuumscan(), just before free space map vacuuming
+ * takes place.
+ *
+ * Frees memory allocated by _bt_pendingfsm_init(), if any.
+ */
+void
+_bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
+{
+ IndexBulkDeleteResult *stats = vstate->stats;
+
+ Assert(stats->pages_newly_deleted >= vstate->npendingpages);
+
+ if (vstate->npendingpages == 0)
+ {
+ /* Just free memory when nothing to do */
+ if (vstate->pendingpages)
+ pfree(vstate->pendingpages);
+
+ return;
+ }
+
+#ifdef DEBUG_BTREE_PENDING_FSM
+
+ /*
+ * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
+ * placing pending pages in the FSM. Note that the optimization will
+ * never be effective without some other backend concurrently consuming an
+ * XID.
+ */
+ pg_usleep(5000000L);
+#endif
+
+ /*
+ * Recompute VACUUM XID boundaries.
+ *
+ * We don't actually care about the oldest non-removable XID. Computing
+ * the oldest such XID has a useful side-effect that we rely on: it
+ * forcibly updates the XID horizon state for this backend. This step is
+ * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
+ * that it is now safe to recycle newly deleted pages without this step.
+ */
+ GetOldestNonRemovableTransactionId(NULL);
+
+ for (int i = 0; i < vstate->npendingpages; i++)
+ {
+ BlockNumber target = vstate->pendingpages[i].target;
+ FullTransactionId safexid = vstate->pendingpages[i].safexid;
+
+ /*
+ * Do the equivalent of checking BTPageIsRecyclable(), but without
+ * accessing the page again a second time.
+ *
+ * Give up on finding the first non-recyclable page -- all later pages
+ * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
+ * to the array in safexid order.
+ */
+ if (!GlobalVisCheckRemovableFullXid(NULL, safexid))
+ break;
+
+ RecordFreeIndexPage(rel, target);
+ stats->pages_free++;
+ }
+
+ pfree(vstate->pendingpages);
+}
+
+/*
+ * Maintain array of pages that were deleted during current btvacuumscan()
+ * call, for use in _bt_pendingfsm_finalize()
+ */
+static void
+_bt_pendingfsm_add(BTVacState *vstate,
+ BlockNumber target,
+ FullTransactionId safexid)
+{
+ Assert(vstate->npendingpages <= vstate->bufsize);
+ Assert(vstate->bufsize <= vstate->maxbufsize);
+
+#ifdef USE_ASSERT_CHECKING
+
+ /*
+ * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
+ * array will always be in safexid order (since that is the order that we
+ * save them in here)
+ */
+ if (vstate->npendingpages > 0)
+ {
+ FullTransactionId lastsafexid =
+ vstate->pendingpages[vstate->npendingpages - 1].safexid;
+
+ Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
+ }
+#endif
+
+ /*
+ * If temp buffer reaches maxbufsize/work_mem capacity then we discard
+ * information about this page.
+ *
+ * Note that this also covers the case where we opted to not use the
+ * optimization in _bt_pendingfsm_init().
+ */
+ if (vstate->npendingpages == vstate->maxbufsize)
+ return;
+
+ /* Consider enlarging buffer */
+ if (vstate->npendingpages == vstate->bufsize)
+ {
+ int newbufsize = vstate->bufsize * 2;
+
+ /* Respect work_mem */
+ if (newbufsize > vstate->maxbufsize)
+ newbufsize = vstate->maxbufsize;
+
+ vstate->bufsize = newbufsize;
+ vstate->pendingpages =
+ repalloc(vstate->pendingpages,
+ sizeof(BTPendingFSM) * vstate->bufsize);
+ }
+
+ /* Save metadata for newly deleted page */
+ vstate->pendingpages[vstate->npendingpages].target = target;
+ vstate->pendingpages[vstate->npendingpages].safexid = safexid;
+ vstate->npendingpages++;
+}