diff options
author | Heikki Linnakangas <heikki.linnakangas@iki.fi> | 2024-04-03 19:32:28 +0300 |
---|---|---|
committer | Heikki Linnakangas <heikki.linnakangas@iki.fi> | 2024-04-03 19:32:28 +0300 |
commit | 6dbb490261a6170a3fc3e326c6983ad63e795047 (patch) | |
tree | a3c0a26893a20ed6d191d0aebcc5722c3c6984c4 /src/backend/access/heap/pruneheap.c | |
parent | 26d138f64474394cf1e573384e8f38efa637b674 (diff) |
Combine freezing and pruning steps in VACUUM
Execute both freezing and pruning of tuples in the same
heap_page_prune() function, now called heap_page_prune_and_freeze(),
and emit a single WAL record containing all changes. That reduces the
overall amount of WAL generated.
This moves the freezing logic from vacuumlazy.c to the
heap_page_prune_and_freeze() function. The main difference in the
coding is that in vacuumlazy.c, we looked at the tuples after the
pruning had already happened, but in heap_page_prune_and_freeze() we
operate on the tuples before pruning. The heap_prepare_freeze_tuple()
function is now invoked after we have determined that a tuple is not
going to be pruned away.
VACUUM no longer needs to loop through the items on the page after
pruning. heap_page_prune_and_freeze() does all the work. It now
returns the list of dead offsets, including existing LP_DEAD items, to
the caller. Similarly it's now responsible for tracking 'all_visible',
'all_frozen', and 'hastup' on the caller's behalf.
Author: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://www.postgresql.org/message-id/20240330055710.kqg6ii2cdojsxgje@liskov
Diffstat (limited to 'src/backend/access/heap/pruneheap.c')
-rw-r--r-- | src/backend/access/heap/pruneheap.c | 761 |
1 file changed, 663 insertions, 98 deletions
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 1b5bf990d21..d2eecaf7ebc 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -17,32 +17,54 @@ #include "access/heapam.h" #include "access/heapam_xlog.h" #include "access/htup_details.h" +#include "access/multixact.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xloginsert.h" +#include "commands/vacuum.h" +#include "executor/instrument.h" #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" #include "utils/rel.h" #include "utils/snapmgr.h" -/* Working data for heap_page_prune and subroutines */ +/* Working data for heap_page_prune_and_freeze() and subroutines */ typedef struct { + /*------------------------------------------------------- + * Arguments passed to heap_page_prune_and_freeze() + *------------------------------------------------------- + */ + /* tuple visibility test, initialized for the relation */ GlobalVisState *vistest; /* whether or not dead items can be set LP_UNUSED during pruning */ bool mark_unused_now; + /* whether to attempt freezing tuples */ + bool freeze; + struct VacuumCutoffs *cutoffs; - TransactionId new_prune_xid; /* new prune hint value for page */ - TransactionId snapshotConflictHorizon; /* latest xid removed */ + /*------------------------------------------------------- + * Fields describing what to do to the page + *------------------------------------------------------- + */ + TransactionId new_prune_xid; /* new prune hint value */ + TransactionId latest_xid_removed; int nredirected; /* numbers of entries in arrays below */ int ndead; int nunused; + int nfrozen; /* arrays that accumulate indexes of items to be changed */ OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; OffsetNumber nowdead[MaxHeapTuplesPerPage]; OffsetNumber nowunused[MaxHeapTuplesPerPage]; + HeapTupleFreeze frozen[MaxHeapTuplesPerPage]; + + 
/*------------------------------------------------------- + * Working state for HOT chain processing + *------------------------------------------------------- + */ /* * 'root_items' contains offsets of all LP_REDIRECT line pointers and @@ -63,24 +85,92 @@ typedef struct */ bool processed[MaxHeapTuplesPerPage + 1]; + /* + * Tuple visibility is only computed once for each tuple, for correctness + * and efficiency reasons; see comment in heap_page_prune_and_freeze() for + * details. This is of type int8[], instead of HTSV_Result[], so we can + * use -1 to indicate no visibility has been computed, e.g. for LP_DEAD + * items. + * + * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is + * 1. Otherwise every access would need to subtract 1. + */ + int8 htsv[MaxHeapTuplesPerPage + 1]; + + /* + * Freezing-related state. + */ + HeapPageFreeze pagefrz; + + /*------------------------------------------------------- + * Information about what was done + * + * These fields are not used by pruning itself for the most part, but are + * used to collect information about what was pruned and what state the + * page is in after pruning, for the benefit of the caller. They are + * copied to the caller's PruneFreezeResult at the end. + * ------------------------------------------------------- + */ + int ndeleted; /* Number of tuples deleted from the page */ + + /* Number of live and recently dead tuples, after pruning */ + int live_tuples; + int recently_dead_tuples; + + /* Whether or not the page makes rel truncation unsafe */ + bool hastup; + + /* + * LP_DEAD items on the page after pruning. Includes existing LP_DEAD + * items + */ + int lpdead_items; /* number of items in the array */ + OffsetNumber *deadoffsets; /* points directly to presult->deadoffsets */ + + /* + * all_visible and all_frozen indicate if the all-visible and all-frozen + * bits in the visibility map can be set for this page after pruning. 
+ * + * visibility_cutoff_xid is the newest xmin of live tuples on the page. + * The caller can use it as the conflict horizon, when setting the VM + * bits. It is only valid if we froze some tuples, and all_frozen is + * true. + * + * NOTE: all_visible and all_frozen don't include LP_DEAD items. That's + * convenient for heap_page_prune_and_freeze(), to use them to decide + * whether to freeze the page or not. The all_visible and all_frozen + * values returned to the caller are adjusted to include LP_DEAD items at + * the end. + * + * all_frozen should only be considered valid if all_visible is also set; + * we don't bother to clear the all_frozen flag every time we clear the + * all_visible flag. + */ + bool all_visible; + bool all_frozen; + TransactionId visibility_cutoff_xid; } PruneState; /* Local functions */ static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer); +static inline HTSV_Result htsv_get_valid_status(int status); static void heap_prune_chain(Page page, BlockNumber blockno, OffsetNumber maxoff, - OffsetNumber rootoffnum, int8 *htsv, PruneState *prstate); + OffsetNumber rootoffnum, PruneState *prstate); static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid); static void heap_prune_record_redirect(PruneState *prstate, - OffsetNumber offnum, OffsetNumber rdoffnum, bool was_normal); -static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum, bool was_normal); -static void heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal); + OffsetNumber offnum, OffsetNumber rdoffnum, + bool was_normal); +static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum, + bool was_normal); +static void heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum, + bool was_normal); static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal); static void 
heap_prune_record_unchanged_lp_unused(Page page, PruneState *prstate, OffsetNumber offnum); -static void heap_prune_record_unchanged_lp_normal(Page page, int8 *htsv, PruneState *prstate, OffsetNumber offnum); +static void heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumber offnum); static void heap_prune_record_unchanged_lp_dead(Page page, PruneState *prstate, OffsetNumber offnum); static void heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum); @@ -163,15 +253,15 @@ heap_page_prune_opt(Relation relation, Buffer buffer) if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) { OffsetNumber dummy_off_loc; - PruneResult presult; + PruneFreezeResult presult; /* * For now, pass mark_unused_now as false regardless of whether or * not the relation has indexes, since we cannot safely determine * that during on-access pruning with the current implementation. */ - heap_page_prune(relation, buffer, vistest, 0, - &presult, PRUNE_ON_ACCESS, &dummy_off_loc); + heap_page_prune_and_freeze(relation, buffer, vistest, 0, + NULL, &presult, PRUNE_ON_ACCESS, &dummy_off_loc, NULL, NULL); /* * Report the number of tuples reclaimed to pgstats. This is @@ -205,13 +295,24 @@ heap_page_prune_opt(Relation relation, Buffer buffer) /* - * Prune and repair fragmentation in the specified page. + * Prune and repair fragmentation and potentially freeze tuples on the + * specified page. * * Caller must have pin and buffer cleanup lock on the page. Note that we * don't update the FSM information for page on caller's behalf. Caller might * also need to account for a reduction in the length of the line pointer * array following array truncation by us. * + * If the HEAP_PRUNE_FREEZE option is set, we will also freeze tuples if it's + * required in order to advance relfrozenxid / relminmxid, or if it's + * considered advantageous for overall system performance to do so now. 
The + * 'cutoffs', 'presult', 'new_refrozen_xid' and 'new_relmin_mxid' arguments + * are required when freezing. When HEAP_PRUNE_FREEZE option is set, we also + * set presult->all_visible and presult->all_frozen on exit, to indicate if + * the VM bits can be set. They are always set to false when the + * HEAP_PRUNE_FREEZE option is not set, because at the moment only callers + * that also freeze need that information. + * * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD * (see heap_prune_satisfies_vacuum). * @@ -219,23 +320,40 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * MARK_UNUSED_NOW indicates that dead items can be set LP_UNUSED during * pruning. * - * presult contains output parameters needed by callers such as the number of - * tuples removed and the number of line pointers newly marked LP_DEAD. - * heap_page_prune() is responsible for initializing it. + * FREEZE indicates that we will also freeze tuples, and will return + * 'all_visible', 'all_frozen' flags to the caller. + * + * cutoffs contains the freeze cutoffs, established by VACUUM at the beginning + * of vacuuming the relation. Required if HEAP_PRUNE_FREEZE option is set. + * + * presult contains output parameters needed by callers, such as the number of + * tuples removed and the offsets of dead items on the page after pruning. + * heap_page_prune_and_freeze() is responsible for initializing it. Required + * by all callers. * * reason indicates why the pruning is performed. It is included in the WAL * record for debugging and analysis purposes, but otherwise has no effect. * * off_loc is the offset location required by the caller to use in error * callback. + * + * new_relfrozen_xid and new_relmin_xid must provided by the caller if the + * HEAP_PRUNE_FREEZE option is set. On entry, they contain the oldest XID and + * multi-XID seen on the relation so far. They will be updated with oldest + * values present on the page after pruning. 
After processing the whole + * relation, VACUUM can use these values as the new relfrozenxid/relminmxid + * for the relation. */ void -heap_page_prune(Relation relation, Buffer buffer, - GlobalVisState *vistest, - int options, - PruneResult *presult, - PruneReason reason, - OffsetNumber *off_loc) +heap_page_prune_and_freeze(Relation relation, Buffer buffer, + GlobalVisState *vistest, + int options, + struct VacuumCutoffs *cutoffs, + PruneFreezeResult *presult, + PruneReason reason, + OffsetNumber *off_loc, + TransactionId *new_relfrozen_xid, + MultiXactId *new_relmin_mxid) { Page page = BufferGetPage(buffer); BlockNumber blockno = BufferGetBlockNumber(buffer); @@ -243,6 +361,17 @@ heap_page_prune(Relation relation, Buffer buffer, maxoff; PruneState prstate; HeapTupleData tup; + bool do_freeze; + bool do_prune; + bool do_hint; + bool hint_bit_fpi; + int64 fpi_before = pgWalUsage.wal_fpi; + + /* Copy parameters to prstate */ + prstate.vistest = vistest; + prstate.mark_unused_now = (options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0; + prstate.freeze = (options & HEAP_PAGE_PRUNE_FREEZE) != 0; + prstate.cutoffs = cutoffs; /* * Our strategy is to scan the page and make lists of items to change, @@ -256,36 +385,98 @@ heap_page_prune(Relation relation, Buffer buffer, * initialize the rest of our working state. 
*/ prstate.new_prune_xid = InvalidTransactionId; - prstate.vistest = vistest; - prstate.mark_unused_now = (options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0; - prstate.snapshotConflictHorizon = InvalidTransactionId; - prstate.nredirected = prstate.ndead = prstate.nunused = 0; - prstate.ndeleted = 0; + prstate.latest_xid_removed = InvalidTransactionId; + prstate.nredirected = prstate.ndead = prstate.nunused = prstate.nfrozen = 0; prstate.nroot_items = 0; prstate.nheaponly_items = 0; + /* initialize page freezing working state */ + prstate.pagefrz.freeze_required = false; + if (prstate.freeze) + { + Assert(new_relfrozen_xid && new_relmin_mxid); + prstate.pagefrz.FreezePageRelfrozenXid = *new_relfrozen_xid; + prstate.pagefrz.NoFreezePageRelfrozenXid = *new_relfrozen_xid; + prstate.pagefrz.FreezePageRelminMxid = *new_relmin_mxid; + prstate.pagefrz.NoFreezePageRelminMxid = *new_relmin_mxid; + } + else + { + Assert(new_relfrozen_xid == NULL && new_relmin_mxid == NULL); + prstate.pagefrz.FreezePageRelminMxid = InvalidMultiXactId; + prstate.pagefrz.NoFreezePageRelminMxid = InvalidMultiXactId; + prstate.pagefrz.FreezePageRelfrozenXid = InvalidTransactionId; + prstate.pagefrz.NoFreezePageRelfrozenXid = InvalidTransactionId; + } + + prstate.ndeleted = 0; + prstate.live_tuples = 0; + prstate.recently_dead_tuples = 0; + prstate.hastup = false; + prstate.lpdead_items = 0; + prstate.deadoffsets = presult->deadoffsets; + /* - * presult->htsv is not initialized here because all ntuple spots in the - * array will be set either to a valid HTSV_Result value or -1. + * Caller may update the VM after we're done. We can keep track of + * whether the page will be all-visible and all-frozen after pruning and + * freezing to help the caller to do that. + * + * Currently, only VACUUM sets the VM bits. To save the effort, only do + * the bookkeeping if the caller needs it. 
Currently, that's tied to + * HEAP_PAGE_PRUNE_FREEZE, but it could be a separate flag if you wanted + * to update the VM bits without also freezing or freeze without also + * setting the VM bits. + * + * In addition to telling the caller whether it can set the VM bit, we + * also use 'all_visible' and 'all_frozen' for our own decision-making. If + * the whole page would become frozen, we consider opportunistically + * freezing tuples. We will not be able to freeze the whole page if there + * are tuples present that are not visible to everyone or if there are + * dead tuples which are not yet removable. However, dead tuples which + * will be removed by the end of vacuuming should not preclude us from + * opportunistically freezing. Because of that, we do not clear + * all_visible when we see LP_DEAD items. We fix that at the end of the + * function, when we return the value to the caller, so that the caller + * doesn't set the VM bit incorrectly. */ - presult->ndeleted = 0; - presult->nnewlpdead = 0; + if (prstate.freeze) + { + prstate.all_visible = true; + prstate.all_frozen = true; + } + else + { + /* + * Initializing to false allows skipping the work to update them in + * heap_prune_record_unchanged_lp_normal(). + */ + prstate.all_visible = false; + prstate.all_frozen = false; + } + + /* + * The visibility cutoff xid is the newest xmin of live tuples on the + * page. In the common case, this will be set as the conflict horizon the + * caller can use for updating the VM. If, at the end of freezing and + * pruning, the page is all-frozen, there is no possibility that any + * running transaction on the standby does not see tuples on the page as + * all-visible, so the conflict horizon remains InvalidTransactionId. 
+ */ + prstate.visibility_cutoff_xid = InvalidTransactionId; maxoff = PageGetMaxOffsetNumber(page); tup.t_tableOid = RelationGetRelid(relation); /* * Determine HTSV for all tuples, and queue them up for processing as HOT - * chain roots or as a heap-only items. + * chain roots or as heap-only items. * * Determining HTSV only once for each tuple is required for correctness, * to deal with cases where running HTSV twice could result in different * results. For example, RECENTLY_DEAD can turn to DEAD if another * checked item causes GlobalVisTestIsRemovableFullXid() to update the * horizon, or INSERT_IN_PROGRESS can change to DEAD if the inserting - * transaction aborts. VACUUM assumes that there are no normal DEAD - * tuples left on the page after pruning, so it needs to have the same - * understanding of what is DEAD and what is not. + * transaction aborts. * * It's also good for performance. Most commonly tuples within a page are * stored at decreasing offsets (while the items are stored at increasing @@ -310,7 +501,7 @@ heap_page_prune(Relation relation, Buffer buffer, *off_loc = offnum; prstate.processed[offnum] = false; - presult->htsv[offnum] = -1; + prstate.htsv[offnum] = -1; /* Nothing to do if slot doesn't contain a tuple */ if (!ItemIdIsUsed(itemid)) @@ -349,8 +540,8 @@ heap_page_prune(Relation relation, Buffer buffer, tup.t_len = ItemIdGetLength(itemid); ItemPointerSet(&tup.t_self, blockno, offnum); - presult->htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup, - buffer); + prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup, + buffer); if (!HeapTupleHeaderIsHeapOnly(htup)) prstate.root_items[prstate.nroot_items++] = offnum; @@ -359,6 +550,12 @@ heap_page_prune(Relation relation, Buffer buffer, } /* + * If checksums are enabled, heap_prune_satisfies_vacuum() may have caused + * an FPI to be emitted. + */ + hint_bit_fpi = fpi_before != pgWalUsage.wal_fpi; + + /* * Process HOT chains. 
* * We added the items to the array starting from 'maxoff', so by @@ -381,8 +578,7 @@ heap_page_prune(Relation relation, Buffer buffer, *off_loc = offnum; /* Process this item or chain of items */ - heap_prune_chain(page, blockno, maxoff, - offnum, presult->htsv, &prstate); + heap_prune_chain(page, blockno, maxoff, offnum, &prstate); } /* @@ -412,7 +608,7 @@ heap_page_prune(Relation relation, Buffer buffer, * return true for an XMIN_INVALID tuple, so this code will work even * when there were sequential updates within the aborted transaction.) */ - if (presult->htsv[offnum] == HEAPTUPLE_DEAD) + if (prstate.htsv[offnum] == HEAPTUPLE_DEAD) { ItemId itemid = PageGetItemId(page, offnum); HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid); @@ -420,7 +616,7 @@ heap_page_prune(Relation relation, Buffer buffer, if (likely(!HeapTupleHeaderIsHotUpdated(htup))) { HeapTupleHeaderAdvanceConflictHorizon(htup, - &prstate.snapshotConflictHorizon); + &prstate.latest_xid_removed); heap_prune_record_unused(&prstate, offnum, true); } else @@ -438,7 +634,7 @@ heap_page_prune(Relation relation, Buffer buffer, } } else - heap_prune_record_unchanged_lp_normal(page, presult->htsv, &prstate, offnum); + heap_prune_record_unchanged_lp_normal(page, &prstate, offnum); } /* We should now have processed every tuple exactly once */ @@ -456,21 +652,107 @@ heap_page_prune(Relation relation, Buffer buffer, /* Clear the offset information once we have processed the given page. */ *off_loc = InvalidOffsetNumber; - /* Any error while applying the changes is critical */ - START_CRIT_SECTION(); + do_prune = prstate.nredirected > 0 || + prstate.ndead > 0 || + prstate.nunused > 0; - /* Have we found any prunable items? */ - if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0) + /* + * Even if we don't prune anything, if we found a new value for the + * pd_prune_xid field or the page was marked full, we will update the hint + * bit. 
+ */ + do_hint = ((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || + PageIsFull(page); + + /* + * Decide if we want to go ahead with freezing according to the freeze + * plans we prepared, or not. + */ + do_freeze = false; + if (prstate.freeze) + { + if (prstate.pagefrz.freeze_required) + { + /* + * heap_prepare_freeze_tuple indicated that at least one XID/MXID + * from before FreezeLimit/MultiXactCutoff is present. Must + * freeze to advance relfrozenxid/relminmxid. + */ + do_freeze = true; + } + else + { + /* + * Opportunistically freeze the page if we are generating an FPI + * anyway and if doing so means that we can set the page + * all-frozen afterwards (might not happen until VACUUM's final + * heap pass). + * + * XXX: Previously, we knew if pruning emitted an FPI by checking + * pgWalUsage.wal_fpi before and after pruning. Once the freeze + * and prune records were combined, this heuristic couldn't be + * used anymore. The opportunistic freeze heuristic must be + * improved; however, for now, try to approximate the old logic. + */ + if (prstate.all_visible && prstate.all_frozen && prstate.nfrozen > 0) + { + /* + * Freezing would make the page all-frozen. Have already + * emitted an FPI or will do so anyway? + */ + if (RelationNeedsWAL(relation)) + { + if (hint_bit_fpi) + do_freeze = true; + else if (do_prune) + { + if (XLogCheckBufferNeedsBackup(buffer)) + do_freeze = true; + } + else if (do_hint) + { + if (XLogHintBitIsNeeded() && XLogCheckBufferNeedsBackup(buffer)) + do_freeze = true; + } + } + } + } + } + + if (do_freeze) { /* - * Apply the planned item changes, then repair page fragmentation, and - * update the page's hint bit about whether it has free line pointers. + * Validate the tuples we will be freezing before entering the + * critical section. 
*/ - heap_page_prune_execute(buffer, false, - prstate.redirected, prstate.nredirected, - prstate.nowdead, prstate.ndead, - prstate.nowunused, prstate.nunused); + heap_pre_freeze_checks(buffer, prstate.frozen, prstate.nfrozen); + } + else if (prstate.nfrozen > 0) + { + /* + * The page contained some tuples that were not already frozen, and we + * chose not to freeze them now. The page won't be all-frozen then. + */ + Assert(!prstate.pagefrz.freeze_required); + prstate.all_frozen = false; + prstate.nfrozen = 0; /* avoid miscounts in instrumentation */ + } + else + { + /* + * We have no freeze plans to execute. The page might already be + * all-frozen (perhaps only following pruning), though. Such pages + * can be marked all-frozen in the VM by our caller, even though none + * of its tuples were newly frozen here. + */ + } + + /* Any error while applying the changes is critical */ + START_CRIT_SECTION(); + + if (do_hint) + { /* * Update the page's pd_prune_xid field to either zero, or the lowest * XID of any soon-prunable tuple. @@ -484,6 +766,29 @@ heap_page_prune(Relation relation, Buffer buffer, */ PageClearFull(page); + /* + * If that's all we had to do to the page, this is a non-WAL-logged + * hint. If we are going to freeze or prune the page, we will mark + * the buffer dirty below. + */ + if (!do_freeze && !do_prune) + MarkBufferDirtyHint(buffer, true); + } + + if (do_prune || do_freeze) + { + /* Apply the planned item changes and repair page fragmentation. 
*/ + if (do_prune) + { + heap_page_prune_execute(buffer, false, + prstate.redirected, prstate.nredirected, + prstate.nowdead, prstate.ndead, + prstate.nowunused, prstate.nunused); + } + + if (do_freeze) + heap_freeze_prepared_tuples(buffer, prstate.frozen, prstate.nfrozen); + MarkBufferDirty(buffer); /* @@ -491,40 +796,115 @@ heap_page_prune(Relation relation, Buffer buffer, */ if (RelationNeedsWAL(relation)) { + /* + * The snapshotConflictHorizon for the whole record should be the + * most conservative of all the horizons calculated for any of the + * possible modifications. If this record will prune tuples, any + * transactions on the standby older than the youngest xmax of the + * most recently removed tuple this record will prune will + * conflict. If this record will freeze tuples, any transactions + * on the standby with xids older than the youngest tuple this + * record will freeze will conflict. + */ + TransactionId frz_conflict_horizon = InvalidTransactionId; + TransactionId conflict_xid; + + /* + * We can use the visibility_cutoff_xid as our cutoff for + * conflicts when the whole page is eligible to become all-frozen + * in the VM once we're done with it. Otherwise we generate a + * conservative cutoff by stepping back from OldestXmin. 
+ */ + if (do_freeze) + { + if (prstate.all_visible && prstate.all_frozen) + frz_conflict_horizon = prstate.visibility_cutoff_xid; + else + { + /* Avoids false conflicts when hot_standby_feedback in use */ + frz_conflict_horizon = prstate.cutoffs->OldestXmin; + TransactionIdRetreat(frz_conflict_horizon); + } + } + + if (TransactionIdFollows(frz_conflict_horizon, prstate.latest_xid_removed)) + conflict_xid = frz_conflict_horizon; + else + conflict_xid = prstate.latest_xid_removed; + log_heap_prune_and_freeze(relation, buffer, - prstate.snapshotConflictHorizon, + conflict_xid, true, reason, - NULL, 0, + prstate.frozen, prstate.nfrozen, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused); } } - else - { - /* - * If we didn't prune anything, but have found a new value for the - * pd_prune_xid field, update it and mark the buffer dirty. This is - * treated as a non-WAL-logged hint. - * - * Also clear the "page is full" flag if it is set, since there's no - * point in repeating the prune/defrag process until something else - * happens to the page. - */ - if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || - PageIsFull(page)) - { - ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; - PageClearFull(page); - MarkBufferDirtyHint(buffer, true); - } - } END_CRIT_SECTION(); /* Copy information back for caller */ - presult->nnewlpdead = prstate.ndead; presult->ndeleted = prstate.ndeleted; + presult->nnewlpdead = prstate.ndead; + presult->nfrozen = prstate.nfrozen; + presult->live_tuples = prstate.live_tuples; + presult->recently_dead_tuples = prstate.recently_dead_tuples; + + /* + * It was convenient to ignore LP_DEAD items in all_visible earlier on to + * make the choice of whether or not to freeze the page unaffected by the + * short-term presence of LP_DEAD items. These LP_DEAD items were + * effectively assumed to be LP_UNUSED items in the making. 
It doesn't + * matter which vacuum heap pass (initial pass or final pass) ends up + * setting the page all-frozen, as long as the ongoing VACUUM does it. + * + * Now that freezing has been finalized, unset all_visible if there are + * any LP_DEAD items on the page. It needs to reflect the present state + * of the page, as expected by our caller. + */ + if (prstate.all_visible && prstate.lpdead_items == 0) + { + presult->all_visible = prstate.all_visible; + presult->all_frozen = prstate.all_frozen; + } + else + { + presult->all_visible = false; + presult->all_frozen = false; + } + + presult->hastup = prstate.hastup; + + /* + * For callers planning to update the visibility map, the conflict horizon + * for that record must be the newest xmin on the page. However, if the + * page is completely frozen, there can be no conflict and the + * vm_conflict_horizon should remain InvalidTransactionId. This includes + * the case that we just froze all the tuples; the prune-freeze record + * included the conflict XID already so the caller doesn't need it. + */ + if (presult->all_frozen) + presult->vm_conflict_horizon = InvalidTransactionId; + else + presult->vm_conflict_horizon = prstate.visibility_cutoff_xid; + + presult->lpdead_items = prstate.lpdead_items; + /* the presult->deadoffsets array was already filled in */ + + if (prstate.freeze) + { + if (presult->nfrozen > 0) + { + *new_relfrozen_xid = prstate.pagefrz.FreezePageRelfrozenXid; + *new_relmin_mxid = prstate.pagefrz.FreezePageRelminMxid; + } + else + { + *new_relfrozen_xid = prstate.pagefrz.NoFreezePageRelfrozenXid; + *new_relmin_mxid = prstate.pagefrz.NoFreezePageRelminMxid; + } + } } @@ -550,9 +930,23 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) /* + * Pruning calculates tuple visibility once and saves the results in an array + * of int8. See PruneState.htsv for details. 
This helper function is meant + * to guard against examining visibility status array members which have not + * yet been computed. + */ +static inline HTSV_Result +htsv_get_valid_status(int status) +{ + Assert(status >= HEAPTUPLE_DEAD && + status <= HEAPTUPLE_DELETE_IN_PROGRESS); + return (HTSV_Result) status; +} + +/* * Prune specified line pointer or a HOT chain originating at line pointer. * - * Tuple visibility information is provided in htsv. + * Tuple visibility information is provided in prstate->htsv. * * If the item is an index-referenced tuple (i.e. not a heap-only tuple), * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT @@ -572,11 +966,17 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) * prstate showing the changes to be made. Items to be redirected are added * to the redirected[] array (two entries per redirection); items to be set to * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED - * state are added to nowunused[]. + * state are added to nowunused[]. We perform bookkeeping of live tuples, + * visibility etc. based on what the page will look like after the changes + * applied. All that bookkeeping is performed in the heap_prune_record_*() + * subroutines. The division of labor is that heap_prune_chain() decides the + * fate of each tuple, ie. whether it's going to be removed, redirected or + * left unchanged, and the heap_prune_record_*() subroutines update PruneState + * based on that outcome. 
*/ static void heap_prune_chain(Page page, BlockNumber blockno, OffsetNumber maxoff, - OffsetNumber rootoffnum, int8 *htsv, PruneState *prstate) + OffsetNumber rootoffnum, PruneState *prstate) { TransactionId priorXmax = InvalidTransactionId; ItemId rootlp; @@ -656,15 +1056,14 @@ heap_prune_chain(Page page, BlockNumber blockno, OffsetNumber maxoff, */ chainitems[nchain++] = offnum; - switch (htsv_get_valid_status(htsv[offnum])) + switch (htsv_get_valid_status(prstate->htsv[offnum])) { case HEAPTUPLE_DEAD: /* Remember the last DEAD tuple seen */ ndeadchain = nchain; HeapTupleHeaderAdvanceConflictHorizon(htup, - &prstate->snapshotConflictHorizon); - + &prstate->latest_xid_removed); /* Advance to next chain member */ break; @@ -720,10 +1119,11 @@ heap_prune_chain(Page page, BlockNumber blockno, OffsetNumber maxoff, { /* * We found a redirect item that doesn't point to a valid follow-on - * item. This can happen if the loop in heap_page_prune caused us to - * visit the dead successor of a redirect item before visiting the - * redirect item. We can clean up by setting the redirect item to - * LP_DEAD state or LP_UNUSED if the caller indicated. + * item. This can happen if the loop in heap_page_prune_and_freeze() + * caused us to visit the dead successor of a redirect item before + * visiting the redirect item. We can clean up by setting the + * redirect item to LP_DEAD state or LP_UNUSED if the caller + * indicated. 
*/ heap_prune_record_dead_or_unused(prstate, rootoffnum, false); return; @@ -745,7 +1145,7 @@ process_chain: i++; } for (; i < nchain; i++) - heap_prune_record_unchanged_lp_normal(page, htsv, prstate, chainitems[i]); + heap_prune_record_unchanged_lp_normal(page, prstate, chainitems[i]); } else if (ndeadchain == nchain) { @@ -771,7 +1171,7 @@ process_chain: /* the rest of tuples in the chain are normal, unchanged tuples */ for (int i = ndeadchain; i < nchain; i++) - heap_prune_record_unchanged_lp_normal(page, htsv, prstate, chainitems[i]); + heap_prune_record_unchanged_lp_normal(page, prstate, chainitems[i]); } } @@ -816,6 +1216,8 @@ heap_prune_record_redirect(PruneState *prstate, */ if (was_normal) prstate->ndeleted++; + + prstate->hastup = true; } /* Record line pointer to be marked dead */ @@ -831,6 +1233,14 @@ heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum, prstate->ndead++; /* + * Deliberately delay unsetting all_visible until later during pruning. + * Removable dead tuples shouldn't preclude freezing the page. + */ + + /* Record the dead offset for vacuum */ + prstate->deadoffsets[prstate->lpdead_items++] = offnum; + + /* * If the root entry had been a normal tuple, we are deleting it, so count * it in the result. But changing a redirect (even to DEAD state) doesn't * count. @@ -892,22 +1302,122 @@ heap_prune_record_unchanged_lp_unused(Page page, PruneState *prstate, OffsetNumb } /* - * Record LP_NORMAL line pointer that is left unchanged. + * Record line pointer that is left unchanged. We consider freezing it, and + * update bookkeeping of tuple counts and page visibility. 
*/ static void -heap_prune_record_unchanged_lp_normal(Page page, int8 *htsv, PruneState *prstate, OffsetNumber offnum) +heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumber offnum) { HeapTupleHeader htup; Assert(!prstate->processed[offnum]); prstate->processed[offnum] = true; - switch (htsv[offnum]) + prstate->hastup = true; /* the page is not empty */ + + /* + * The criteria for counting a tuple as live in this block need to match + * what analyze.c's acquire_sample_rows() does, otherwise VACUUM and + * ANALYZE may produce wildly different reltuples values, e.g. when there + * are many recently-dead tuples. + * + * The logic here is a bit simpler than acquire_sample_rows(), as VACUUM + * can't run inside a transaction block, which makes some cases impossible + * (e.g. in-progress insert from the same transaction). + * + * HEAPTUPLE_DEAD are handled by the other heap_prune_record_*() + * subroutines. They don't count dead items like acquire_sample_rows() + * does, because we assume that all dead items will become LP_UNUSED + * before VACUUM finishes. This difference is only superficial. VACUUM + * effectively agrees with ANALYZE about DEAD items, in the end. VACUUM + * won't remember LP_DEAD items, but only because they're not supposed to + * be left behind when it is done. (Cases where we bypass index vacuuming + * will violate this optimistic assumption, but the overall impact of that + * should be negligible.) + */ + htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, offnum)); + + switch (prstate->htsv[offnum]) { case HEAPTUPLE_LIVE: + + /* + * Count it as live. Not only is this natural, but it's also what + * acquire_sample_rows() does. + */ + prstate->live_tuples++; + + /* + * Is the tuple definitely visible to all transactions? + * + * NB: Like with per-tuple hint bits, we can't set the + * PD_ALL_VISIBLE flag if the inserter committed asynchronously. + * See SetHintBits for more info. 
Check that the tuple is hinted + * xmin-committed because of that. + */ + if (prstate->all_visible) + { + TransactionId xmin; + + if (!HeapTupleHeaderXminCommitted(htup)) + { + prstate->all_visible = false; + break; + } + + /* + * The inserter definitely committed. But is it old enough + * that everyone sees it as committed? A FrozenTransactionId + * is seen as committed to everyone. Otherwise, we check if + * there is a snapshot that considers this xid to still be + * running, and if so, we don't consider the page all-visible. + */ + xmin = HeapTupleHeaderGetXmin(htup); + + /* + * For now always use prstate->cutoffs for this test, because + * we only update 'all_visible' when freezing is requested. We + * could use GlobalVisTestIsRemovableXid instead, if a + * non-freezing caller wanted to set the VM bit. + */ + Assert(prstate->cutoffs); + if (!TransactionIdPrecedes(xmin, prstate->cutoffs->OldestXmin)) + { + prstate->all_visible = false; + break; + } + + /* Track newest xmin on page. */ + if (TransactionIdFollows(xmin, prstate->visibility_cutoff_xid) && + TransactionIdIsNormal(xmin)) + prstate->visibility_cutoff_xid = xmin; + } + break; + + case HEAPTUPLE_RECENTLY_DEAD: + prstate->recently_dead_tuples++; + prstate->all_visible = false; + + /* + * This tuple will soon become DEAD. Update the hint field so + * that the page is reconsidered for pruning in future. + */ + heap_prune_record_prunable(prstate, + HeapTupleHeaderGetUpdateXid(htup)); + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: /* + * We do not count these rows as live, because we expect the + * inserting transaction to update the counters at commit, and we + * assume that will happen only after we report our results. This + * assumption is a bit shaky, but it is what acquire_sample_rows() + * does, so be consistent. + */ + prstate->all_visible = false; + + /* * If we wanted to optimize for aborts, we might consider marking * the page prunable when we see INSERT_IN_PROGRESS. But we * don't. 
See related decisions about when to mark the page @@ -915,10 +1425,15 @@ heap_prune_record_unchanged_lp_normal(Page page, int8 *htsv, PruneState *prstate */ break; - case HEAPTUPLE_RECENTLY_DEAD: case HEAPTUPLE_DELETE_IN_PROGRESS: - htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, offnum)); + /* + * This is an expected case during concurrent vacuum. Count such + * rows as live. As above, we assume the deleting transaction + * will commit and update the counters after we report. + */ + prstate->live_tuples++; + prstate->all_visible = false; /* * This tuple may soon become DEAD. Update the hint field so that @@ -928,16 +1443,40 @@ heap_prune_record_unchanged_lp_normal(Page page, int8 *htsv, PruneState *prstate HeapTupleHeaderGetUpdateXid(htup)); break; - default: /* * DEAD tuples should've been passed to heap_prune_record_dead() * or heap_prune_record_unused() instead. */ - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result %d", htsv[offnum]); + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result %d", + prstate->htsv[offnum]); break; } + + /* Consider freezing any normal tuples which will not be removed */ + if (prstate->freeze) + { + bool totally_frozen; + + if ((heap_prepare_freeze_tuple(htup, + prstate->cutoffs, + &prstate->pagefrz, + &prstate->frozen[prstate->nfrozen], + &totally_frozen))) + { + /* Save prepared freeze plan for later */ + prstate->frozen[prstate->nfrozen++].offset = offnum; + } + + /* + * If any tuple isn't either totally frozen already or eligible to + * become totally frozen (according to its freeze plan), then the page + * definitely cannot be set all-frozen in the visibility map later on. + */ + if (!totally_frozen) + prstate->all_frozen = false; + } } @@ -949,6 +1488,24 @@ heap_prune_record_unchanged_lp_dead(Page page, PruneState *prstate, OffsetNumber { Assert(!prstate->processed[offnum]); prstate->processed[offnum] = true; + + /* + * Deliberately don't set hastup for LP_DEAD items. 
We make the soft + * assumption that any LP_DEAD items encountered here will become + * LP_UNUSED later on, before count_nondeletable_pages is reached. If we + * don't make this assumption then rel truncation will only happen every + * other VACUUM, at most. Besides, VACUUM must treat + * hastup/nonempty_pages as provisional no matter how LP_DEAD items are + * handled (handled here, or handled later on). + * + * Similarly, don't unset all_visible until later, at the end of + * heap_page_prune_and_freeze(). This will allow us to attempt to freeze + * the page after pruning. As long as we unset it before updating the + * visibility map, this will be correct. + */ + + /* Record the dead offset for vacuum */ + prstate->deadoffsets[prstate->lpdead_items++] = offnum; } /* @@ -957,12 +1514,20 @@ heap_prune_record_unchanged_lp_dead(Page page, PruneState *prstate, OffsetNumber static void heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum) { + /* + * A redirect line pointer doesn't count as a live tuple. + * + * If we leave a redirect line pointer in place, there will be another + * tuple on the page that it points to. We will do the bookkeeping for + * that separately. So we have nothing to do here, except remember that + * we processed this item. + */ Assert(!prstate->processed[offnum]); prstate->processed[offnum] = true; } /* - * Perform the actual page changes needed by heap_page_prune. + * Perform the actual page changes needed by heap_page_prune_and_freeze(). * * If 'lp_truncate_only' is set, we are merely marking LP_DEAD line pointers * as unused, not redirecting or removing anything else. The @@ -1093,10 +1658,10 @@ heap_page_prune_execute(Buffer buffer, bool lp_truncate_only, else { /* - * When heap_page_prune() was called, mark_unused_now may have - * been passed as true, which allows would-be LP_DEAD items to be - * made LP_UNUSED instead. This is only possible if the relation - * has no indexes. 
If there are any dead items, then + * When heap_page_prune_and_freeze() was called, mark_unused_now + * may have been passed as true, which allows would-be LP_DEAD + * items to be made LP_UNUSED instead. This is only possible if + * the relation has no indexes. If there are any dead items, then * mark_unused_now was not true and every item being marked * LP_UNUSED must refer to a heap-only tuple. */ |