diff options
author | Michael Paquier <michael@paquier.xyz> | 2025-09-26 08:41:06 +0900 |
---|---|---|
committer | Michael Paquier <michael@paquier.xyz> | 2025-09-26 08:41:06 +0900 |
commit | 85e0ff62b68224b3354e47fb71b78d309063d06c (patch) | |
tree | 42c39a8a6ca6cee3e50aaed49ae06ea784db6da0 /src/backend/access/nbtree/nbtinsert.c | |
parent | 3760d278dc4165d5eef7435fc62d8ebe13b8e793 (diff) |
Improve stability of btree page split on ERRORs
This improves the stability of VACUUM when processing btree indexes.
VACUUM could previously trigger an assertion failure in
_bt_lock_subtree_parent() if an error had earlier been thrown outside
the scope of _bt_split() while splitting a btree page. VACUUM would
consider the index to be in a corrupted state, as the right page would
not have been zeroed before the error was thrown (an allocation failure
is one such case).
In a non-assert build, VACUUM is able to succeed, reporting what it sees
as corruption while attempting to fix the index. This would manifest
as a LOG message such as:
LOG: failed to re-find parent key in index "idx" for deletion target page N
CONTEXT: while vacuuming index "idx" of relation "public.tab"
This commit improves the code to rely on two PGAlignedBlocks that are
used as a temporary space for the left and right pages. The main change
concerns the right page, whose contents are now copied into the
"temporary" PGAlignedBlock page while its original space is zeroed. Its
contents are moved from the PGAlignedBlock page back to the actual page once
we enter the critical section used for the split. This simplifies the
split logic, as it is no longer necessary to zero the right page before
throwing an error. Hence errors can now be safely thrown outside the
split code. For the left page, this saves one allocation, as
PageGetTempPage() was used previously.
The previous logic originates from commit 8fa30f906b, at a point where
PGAlignedBlock did not exist yet. This could be argued as something
that should be backpatched, but the lack of complaints indicates that it
may not be necessary.
Author: Konstantin Knizhnik <knizhnik@garret.ru>
Discussion: https://postgr.es/m/566dacaf-5751-47e4-abc6-73de17a5d42a@garret.ru
Diffstat (limited to 'src/backend/access/nbtree/nbtinsert.c')
-rw-r--r-- | src/backend/access/nbtree/nbtinsert.c | 49 |
1 files changed, 29 insertions, 20 deletions
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index be60781fc98..85d97a970ac 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -1473,6 +1473,8 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Page origpage; Page leftpage, rightpage; + PGAlignedBlock leftpage_buf, + rightpage_buf; BlockNumber origpagenumber, rightpagenumber; BTPageOpaque ropaque, @@ -1543,8 +1545,8 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, newitem, &newitemonleft); - /* Allocate temp buffer for leftpage */ - leftpage = PageGetTempPage(origpage); + /* Use temporary buffer for leftpage */ + leftpage = leftpage_buf.data; _bt_pageinit(leftpage, BufferGetPageSize(buf)); lopaque = BTPageGetOpaque(leftpage); @@ -1707,19 +1709,23 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, /* * Acquire a new right page to split into, now that left page has a new - * high key. From here on, it's not okay to throw an error without - * zeroing rightpage first. This coding rule ensures that we won't - * confuse future VACUUM operations, which might otherwise try to re-find - * a downlink to a leftover junk page as the page undergoes deletion. + * high key. * - * It would be reasonable to start the critical section just after the new - * rightpage buffer is acquired instead; that would allow us to avoid - * leftover junk pages without bothering to zero rightpage. We do it this - * way because it avoids an unnecessary PANIC when either origpage or its - * existing sibling page are corrupt. + * To not confuse future VACUUM operations, we zero the right page and + * work on an in-memory copy of it before writing WAL, then copy its + * contents back to the actual page once we start the critical section + * work. 
This simplifies the split work, so as there is no need to zero + * the right page before throwing an error. */ rbuf = _bt_allocbuf(rel, heaprel); - rightpage = BufferGetPage(rbuf); + rightpage = rightpage_buf.data; + + /* + * Copy the contents of the right page into its temporary location, and + * zero the original space. + */ + memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ); + memset(BufferGetPage(rbuf), 0, BLCKSZ); rightpagenumber = BufferGetBlockNumber(rbuf); /* rightpage was initialized by _bt_allocbuf */ ropaque = BTPageGetOpaque(rightpage); @@ -1768,7 +1774,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff, false, false) == InvalidOffsetNumber) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add high key to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); @@ -1816,7 +1821,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff, false)) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add new item to the left sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); @@ -1829,7 +1833,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, afterrightoff == minusinfoff)) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add new item to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); @@ -1843,7 +1846,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, { if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false)) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add old item to the left 
sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); @@ -1855,7 +1857,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff, afterrightoff == minusinfoff)) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add old item to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); @@ -1876,7 +1877,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, afterrightoff == minusinfoff)) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add new item to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); @@ -1896,7 +1896,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, sopaque = BTPageGetOpaque(spage); if (sopaque->btpo_prev != origpagenumber) { - memset(rightpage, 0, BufferGetPageSize(rbuf)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("right sibling's left-link doesn't match: " @@ -1939,9 +1938,19 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, * original. We need to do this before writing the WAL record, so that * XLogInsert can WAL log an image of the page if necessary. */ - PageRestoreTempPage(leftpage, origpage); + memcpy(origpage, leftpage, BLCKSZ); /* leftpage, lopaque must not be used below here */ + /* + * Move the contents of the right page from its temporary location to the + * destination buffer, before writing the WAL record. Unlike the left + * page, the right page and its opaque area are still needed to complete + * the update of the page, so reinitialize them. 
+ */ + rightpage = BufferGetPage(rbuf); + memcpy(rightpage, rightpage_buf.data, BLCKSZ); + ropaque = BTPageGetOpaque(rightpage); + MarkBufferDirty(buf); MarkBufferDirty(rbuf); |