summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorNoah Misch <noah@leadboat.com>2024-10-25 06:51:03 -0700
committerNoah Misch <noah@leadboat.com>2024-10-25 06:51:08 -0700
commite30d0d8adf534453355024aae68b8b691c105b67 (patch)
treefd84b8cf379998a798e3cbc7820f641a7ae95f1d /src
parent4cf948cbeedffe597546af6c4a7a7e1d103c3923 (diff)
WAL-log inplace update before revealing it to other sessions.
A buffer lock won't stop a reader having already checked tuple visibility. If a vac_update_datfrozenid() and then a crash happened during inplace update of a relfrozenxid value, datfrozenxid could overtake relfrozenxid. That could lead to "could not access status of transaction" errors. Back-patch to v12 (all supported versions). In v14 and earlier, this also back-patches the assertion removal from commit 7fcf2faf9c7dd473208fd6d5565f88d7f733782b. Discussion: https://postgr.es/m/20240620012908.92.nmisch@google.com
Diffstat (limited to 'src')
-rw-r--r--src/backend/access/heap/README.tuplock4
-rw-r--r--src/backend/access/heap/heapam.c58
-rw-r--r--src/backend/access/transam/xloginsert.c2
3 files changed, 46 insertions, 18 deletions
diff --git a/src/backend/access/heap/README.tuplock b/src/backend/access/heap/README.tuplock
index 818cd7f9806..31c52ad28f9 100644
--- a/src/backend/access/heap/README.tuplock
+++ b/src/backend/access/heap/README.tuplock
@@ -203,6 +203,4 @@ Inplace updates create an exception to the rule that tuple data won't change
under a reader holding a pin. A reader of a heap_fetch() result tuple may
witness a torn read. Current inplace-updated fields are aligned and are no
wider than four bytes, and current readers don't need consistency across
-fields. Hence, they get by with just fetching each field once. XXX such a
-caller may also read a value that has not reached WAL; see
-systable_inplace_update_finish().
+fields. Hence, they get by with just fetching each field once.
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index c35e3a80441..c0ebfb8ec0d 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6135,6 +6135,8 @@ heap_inplace_update_and_unlock(Relation relation,
HeapTupleHeader htup = oldtup->t_data;
uint32 oldlen;
uint32 newlen;
+ char *dst;
+ char *src;
Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
oldlen = oldtup->t_len - htup->t_hoff;
@@ -6142,6 +6144,9 @@ heap_inplace_update_and_unlock(Relation relation,
if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
elog(ERROR, "wrong tuple length");
+ dst = (char *) htup + htup->t_hoff;
+ src = (char *) tuple->t_data + tuple->t_data->t_hoff;
+
/*
* Construct shared cache inval if necessary. Note that because we only
* pass the new version of the tuple, this mustn't be used for any
@@ -6160,15 +6165,15 @@ heap_inplace_update_and_unlock(Relation relation,
*/
PreInplace_Inval();
- /* NO EREPORT(ERROR) from here till changes are logged */
- START_CRIT_SECTION();
-
- memcpy((char *) htup + htup->t_hoff,
- (char *) tuple->t_data + tuple->t_data->t_hoff,
- newlen);
-
/*----------
- * XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
+ * NO EREPORT(ERROR) from here till changes are complete
+ *
+ * Our buffer lock won't stop a reader having already pinned and checked
+ * visibility for this tuple. Hence, we write WAL first, then mutate the
+ * buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
+ * checkpoint delay makes that acceptable. With the usual order of
+ * changes, a crash after memcpy() and before XLogInsert() could allow
+ * datfrozenxid to overtake relfrozenxid:
*
* ["D" is a VACUUM (ONLY_DATABASE_STATS)]
* ["R" is a VACUUM tbl]
@@ -6178,14 +6183,28 @@ heap_inplace_update_and_unlock(Relation relation,
* D: raise pg_database.datfrozenxid, XLogInsert(), finish
* [crash]
* [recovery restores datfrozenxid w/o relfrozenxid]
+ *
+ * Like in MarkBufferDirtyHint() subroutine XLogSaveBufferForHint(), copy
+ * the buffer to the stack before logging. Here, that facilitates a FPI
+ * of the post-mutation block before we accept other sessions seeing it.
*/
-
- MarkBufferDirty(buffer);
+ Assert(!MyPgXact->delayChkpt);
+ START_CRIT_SECTION();
+ MyPgXact->delayChkpt = true;
/* XLOG stuff */
if (RelationNeedsWAL(relation))
{
xl_heap_inplace xlrec;
+ PGAlignedBlock copied_buffer;
+ char *origdata = (char *) BufferGetBlock(buffer);
+ Page page = BufferGetPage(buffer);
+ uint16 lower = ((PageHeader) page)->pd_lower;
+ uint16 upper = ((PageHeader) page)->pd_upper;
+ uintptr_t dst_offset_in_block;
+ RelFileNode rnode;
+ ForkNumber forkno;
+ BlockNumber blkno;
XLogRecPtr recptr;
xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
@@ -6193,16 +6212,28 @@ heap_inplace_update_and_unlock(Relation relation,
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
- XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
- XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
+ /* register block matching what buffer will look like after changes */
+ memcpy(copied_buffer.data, origdata, lower);
+ memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
+ dst_offset_in_block = dst - origdata;
+ memcpy(copied_buffer.data + dst_offset_in_block, src, newlen);
+ BufferGetTag(buffer, &rnode, &forkno, &blkno);
+ Assert(forkno == MAIN_FORKNUM);
+ XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer.data,
+ REGBUF_STANDARD);
+ XLogRegisterBufData(0, src, newlen);
/* inplace updates aren't decoded atm, don't log the origin */
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
- PageSetLSN(BufferGetPage(buffer), recptr);
+ PageSetLSN(page, recptr);
}
+ memcpy(dst, src, newlen);
+
+ MarkBufferDirty(buffer);
+
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
/*
@@ -6215,6 +6246,7 @@ heap_inplace_update_and_unlock(Relation relation,
*/
AtInplace_Inval();
+ MyPgXact->delayChkpt = false;
END_CRIT_SECTION();
UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 78a95344f91..1993088a672 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -268,8 +268,6 @@ XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, ForkNumber forknum,
{
registered_buffer *regbuf;
- /* This is currently only used to WAL-log a full-page image of a page */
- Assert(flags & REGBUF_FORCE_IMAGE);
Assert(begininsert_called);
if (block_id >= max_registered_block_id)