-rw-r--r--  src/backend/access/nbtree/README     | 22
-rw-r--r--  src/backend/access/nbtree/nbtree.c   | 15
-rw-r--r--  src/backend/access/nbtree/nbtxlog.c  | 23
-rw-r--r--  src/include/access/nbtree.h          |  6
4 files changed, 58 insertions(+), 8 deletions(-)
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 4820f76e3bb..997d2721d56 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -486,6 +486,28 @@ normal running after recovery has completed. This is a key capability
because it allows running applications to continue while the standby
changes state into a normally running server.
+The interlocking needed to avoid returning incorrect results from
+non-MVCC scans is not required on standby nodes. That is because
+HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesSelf(),
+HeapTupleSatisfiesDirty() and HeapTupleSatisfiesVacuum() are only
+ever used during write transactions, which cannot exist on the standby.
+MVCC scans are already protected by definition, so HeapTupleSatisfiesMVCC()
+is not a problem. That leaves concern only for HeapTupleSatisfiesToast().
+HeapTupleSatisfiesToast() doesn't use MVCC semantics, but only
+because it doesn't need to: if the main heap row is visible, then its
+toast rows are too. So as long as we follow a toast pointer from a
+visible (live) tuple, we need not recheck MVCC on the toast rows it
+points to.
+There is one minor exception: the optimizer sometimes looks at the
+boundaries of value ranges using SnapshotDirty, which could return a
+newer value for planner statistics; in rare cases this would affect
+the query plan, but never correctness. The risk window is small,
+since the stats look only at the min and max values in the index: the
+scan retrieves a tid and then immediately uses it to look in the heap.
+It is unlikely that the tid could have been deleted, vacuumed, and
+re-inserted in the time taken to reach the heap via direct tid
+access, so we ignore that scan type as a problem.
+
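To make the reasoning above concrete, the following self-contained C sketch
(the enum and helper are hypothetical illustrations, not PostgreSQL source)
captures which snapshot types would need the interlock and why a standby
needs it for none of them:

    #include <stdbool.h>

    /* Hypothetical classification mirroring the cases discussed above. */
    typedef enum SnapshotKind
    {
        SNAPSHOT_MVCC,          /* ordinary query scans */
        SNAPSHOT_TOAST,         /* toast lookups from a visible main row */
        SNAPSHOT_SELF,          /* HeapTupleSatisfiesSelf() */
        SNAPSHOT_DIRTY,         /* HeapTupleSatisfiesDirty() */
        SNAPSHOT_VACUUM         /* HeapTupleSatisfiesVacuum() */
    } SnapshotKind;

    static bool
    needs_pin_scan_interlock(SnapshotKind kind, bool on_standby)
    {
        if (on_standby)
            return false;       /* the non-MVCC kinds imply a write
                                 * transaction, impossible on a standby */
        return kind != SNAPSHOT_MVCC && kind != SNAPSHOT_TOAST;
    }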
Other Things That Are Handy to Know
-----------------------------------
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 36dc6c278ea..469c2ed83fd 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -811,6 +811,10 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
}
/*
+	 * Check whether we need to issue one final WAL record for this index,
+	 * which may be necessary for correctness on a hot standby node when
+	 * non-MVCC index scans could take place.
+ *
* If the WAL is replayed in hot standby, the replay process needs to get
* cleanup locks on all index leaf pages, just as we've been doing here.
* However, we won't issue any WAL records about pages that have no items
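A hedged sketch of the check the new comment describes, as it might sit at
the end of btvacuumscan(); vstate.lastBlockLocked is assumed here as the
field tracking the last leaf page actually cleanup-locked, and the committed
code may differ in detail:

    /*
     * Sketch only: if the last XLOG_BTREE_VACUUM record stopped short of
     * the end of the index, emit one final record whose only purpose is
     * to carry lastBlockVacuumed forward for hot-standby replay.
     */
    if (XLogStandbyInfoActive() &&
        vstate.lastBlockVacuumed < vstate.lastBlockLocked)
    {
        Buffer  buf;

        buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
                                 RBM_NORMAL, info->strategy);
        LockBufferForCleanup(buf);
        /* zero deletable items: the record exists only for its range */
        _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
        _bt_relbuf(rel, buf);
    }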
@@ -1018,10 +1022,13 @@ restart:
if (ndeletable > 0)
{
/*
- * Notice that the issued XLOG_BTREE_VACUUM WAL record includes an
- * instruction to the replay code to get cleanup lock on all pages
- * between the previous lastBlockVacuumed and this page. This
- * ensures that WAL replay locks all leaf pages at some point.
+ * Notice that the issued XLOG_BTREE_VACUUM WAL record includes all
+ * the information the replay code needs to get a cleanup lock on
+ * every page between the previous lastBlockVacuumed and this page.
+ * This ensures that WAL replay locks all leaf pages at some point,
+ * which is important should non-MVCC scans be requested. This is
+ * currently unused on standby, but we record it anyway, so that the
+ * WAL contains the required information.
*
* Since we can visit leaf pages out-of-order when recursing,
* replay might end up locking such pages an extra time, but it
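For context, the code this comment annotates issues the record and then
advances the high-water mark, roughly as follows (a sketch built from the
field names above, not a verbatim quote of btvacuumpage()):

    /* Delete the tuples and emit XLOG_BTREE_VACUUM; the record covers
     * the block range (lastBlockVacuumed, current block]. */
    _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                        vstate->lastBlockVacuumed);

    /* Remember the highest block covered so far; recursion can visit
     * leaf pages out of order, hence the explicit maximum. */
    if (blkno > vstate->lastBlockVacuumed)
        vstate->lastBlockVacuumed = blkno;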
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 2debb870bd0..42d65fa385f 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -479,7 +479,24 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
Page page;
BTPageOpaque opaque;
+#ifdef UNUSED
/*
+ * This section of code is thought to be no longer needed, after analysis
+ * of the calling paths. It is retained to allow the code to be reinstated
+ * if a flaw is revealed in that thinking.
+ *
+ * If we are running non-MVCC scans using this index, we need to do
+ * some additional work to ensure correctness, known as a "pin scan"
+ * and described in more detail in the next paragraphs. We used to do
+ * that extra work in all cases; we now avoid it in most. If
+ * lastBlockVacuumed is set to InvalidBlockNumber then we skip the
+ * additional work required for the pin scan.
+ *
+ * Avoiding this extra work is important since it requires us to touch
+ * every page in the index, so is an O(N) operation. Worse, it is an
+ * operation performed in the foreground during redo, so it delays
+ * replication directly.
+ *
* If queries might be active then we need to ensure every leaf page is
* unpinned between the lastBlockVacuumed and the current block, if there
* are any. This prevents replay of the VACUUM from reaching the stage of
@@ -500,7 +517,7 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
* isn't yet consistent; so we need not fear reading still-corrupt blocks
* here during crash recovery.
*/
- if (HotStandbyActiveInReplay())
+ if (HotStandbyActiveInReplay() && BlockNumberIsValid(xlrec->lastBlockVacuumed))
{
BlockNumber blkno;
@@ -517,7 +534,8 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
* XXX we don't actually need to read the block, we just need to
* confirm it is unpinned. If we had a special call into the
* buffer manager we could optimise this so that if the block is
- * not in shared_buffers we confirm it as unpinned.
+ * not in shared_buffers we confirm it as unpinned. Optimizing
+ * this is now moot, since in most cases we avoid the scan.
*/
buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno,
RBM_NORMAL_NO_LOG);
@@ -528,6 +546,7 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
}
}
}
+#endif
/*
* If we have a full-page image, restore it (using a cleanup lock) and
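Assembled from the fragments above, the now-conditional pin scan in
btree_xlog_vacuum() has this overall shape (a reconstruction for reading
convenience; nbtxlog.c is authoritative):

    if (HotStandbyActiveInReplay() &&
        BlockNumberIsValid(xlrec->lastBlockVacuumed))
    {
        BlockNumber blkno;

        /* Cleanup-lock every block in (lastBlockVacuumed, block), which
         * waits out any scan still holding a pin on one of them. */
        for (blkno = xlrec->lastBlockVacuumed + 1;
             blkno < xlrec->block; blkno++)
        {
            Buffer  buffer;

            /* RBM_NORMAL_NO_LOG avoids WAL-logging reads during replay */
            buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM,
                                            blkno, RBM_NORMAL_NO_LOG);
            if (BufferIsValid(buffer))
            {
                LockBufferForCleanup(buffer);
                UnlockReleaseBuffer(buffer);
            }
        }
    }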
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index f2817590c41..09708c01c81 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -342,8 +342,10 @@ typedef struct xl_btree_reuse_page
* The WAL record can represent deletion of any number of index tuples on a
* single index page when executed by VACUUM.
*
- * The correctness requirement for applying these changes during recovery is
- * that we must do one of these two things for every block in the index:
+ * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber.
+ * For non-MVCC index scans there is an additional correctness requirement
+ * for applying these changes during recovery, which is that we must do one
+ * of these two things for every block in the index:
* * lock the block for cleanup and apply any required changes
* * EnsureBlockUnpinned()
* The purpose of this is to ensure that no index scans started before we
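For reference, the record this comment describes has roughly the following
layout (an abridged sketch; the definition in nbtree.h is authoritative):

    typedef struct xl_btree_vacuum
    {
        RelFileNode node;               /* the index relation */
        BlockNumber block;              /* page whose items are deleted */
        BlockNumber lastBlockVacuumed;  /* end of the previous record's
                                         * range, or InvalidBlockNumber
                                         * when no pin scan is needed */
        /* TARGET OFFSET NUMBERS FOLLOW */
    } xl_btree_vacuum;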