From 40dae7ec537c5619fc93ad602c62f37be786d161 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Mar 2014 20:12:58 +0200 Subject: Make the handling of interrupted B-tree page splits more robust. Splitting a page consists of two separate steps: splitting the child page, and inserting the downlink for the new right page to the parent. Previously, we handled the case that you crash in between those steps with a cleanup routine after the WAL recovery had finished, which finished the incomplete split. However, that doesn't help if the page split is interrupted but the database doesn't crash, so that you don't perform WAL recovery. That could happen for example if you run out of disk space. Remove the end-of-recovery cleanup step. Instead, when a page is split, the left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink is inserted to the parent, the flag is cleared again. If an insertion sees a page with the flag set, it knows that the split was interrupted for some reason, and inserts the missing downlink before proceeding. I used the same approach to fix GIN and GiST split algorithms earlier. This was the last WAL cleanup routine, so we could get rid of that whole machinery now, but I'll leave that for a separate patch. Reviewed by Peter Geoghegan. --- src/backend/access/nbtree/nbtxlog.c | 323 +++++++++++++++--------------------- 1 file changed, 136 insertions(+), 187 deletions(-) (limited to 'src/backend/access/nbtree/nbtxlog.c') diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index bc376fd4094..665e60b7bd2 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -20,61 +20,6 @@ #include "storage/procarray.h" #include "miscadmin.h" -/* - * We must keep track of expected insertions due to page splits, and apply - * them manually if they are not seen in the WAL log during replay. This - * makes it safe for page insertion to be a multiple-WAL-action process. - * - * The data structure is a simple linked list --- this should be good enough, - * since we don't expect a page split to remain incomplete for long. In any - * case we need to respect the order of operations. - */ -typedef struct bt_incomplete_split -{ - RelFileNode node; /* the index */ - bool is_root; /* we split the root */ - BlockNumber leftblk; /* left half of split */ - BlockNumber rightblk; /* right half of split */ -} bt_incomplete_split; - -static List *incomplete_splits; - - -static void -log_incomplete_split(RelFileNode node, BlockNumber leftblk, - BlockNumber rightblk, bool is_root) -{ - bt_incomplete_split *action = palloc(sizeof(bt_incomplete_split)); - - action->node = node; - action->is_root = is_root; - action->leftblk = leftblk; - action->rightblk = rightblk; - incomplete_splits = lappend(incomplete_splits, action); -} - -static void -forget_matching_split(RelFileNode node, BlockNumber downlink, bool is_root) -{ - ListCell *l; - - foreach(l, incomplete_splits) - { - bt_incomplete_split *action = (bt_incomplete_split *) lfirst(l); - - if (RelFileNodeEquals(node, action->node) && - downlink == action->rightblk) - { - if (is_root != action->is_root) - elog(LOG, "forget_matching_split: fishy is_root data (expected %d, got %d)", - action->is_root, is_root); - incomplete_splits = list_delete_ptr(incomplete_splits, action); - pfree(action); - break; /* need not look further */ - } - } -} - /* * _bt_restore_page -- re-enter all the index tuples on a page * @@ -149,23 +94,60 @@ _bt_restore_meta(RelFileNode rnode, XLogRecPtr lsn, UnlockReleaseBuffer(metabuf); } +/* + * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page + * + * This is a common subroutine of the redo functions of all the WAL record + * types that can insert a downlink: insert, split, and newroot. + */ +static void +_bt_clear_incomplete_split(XLogRecPtr lsn, XLogRecord *record, + RelFileNode rnode, BlockNumber cblock) +{ + Buffer buf; + + buf = XLogReadBuffer(rnode, cblock, false); + if (BufferIsValid(buf)) + { + Page page = (Page) BufferGetPage(buf); + + if (lsn > PageGetLSN(page)) + { + BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); + Assert((pageop->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0); + pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + } + UnlockReleaseBuffer(buf); + } +} + static void btree_xlog_insert(bool isleaf, bool ismeta, XLogRecPtr lsn, XLogRecord *record) { xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record); Buffer buffer; + Buffer cbuffer = InvalidBuffer; Page page; char *datapos; int datalen; xl_btree_metadata md; - BlockNumber downlink = 0; + BlockNumber cblkno = 0; + int main_blk_index; datapos = (char *) xlrec + SizeOfBtreeInsert; datalen = record->xl_len - SizeOfBtreeInsert; - if (!isleaf) + /* + * if this insert finishes a split at lower level, extract the block + * number of the (left) child. + */ + if (!isleaf && (record->xl_info & XLR_BKP_BLOCK(0)) == 0) { - memcpy(&downlink, datapos, sizeof(BlockNumber)); + memcpy(&cblkno, datapos, sizeof(BlockNumber)); + Assert(cblkno != 0); datapos += sizeof(BlockNumber); datalen -= sizeof(BlockNumber); } @@ -176,8 +158,19 @@ btree_xlog_insert(bool isleaf, bool ismeta, datalen -= sizeof(xl_btree_metadata); } - if (record->xl_info & XLR_BKP_BLOCK(0)) - (void) RestoreBackupBlock(lsn, record, 0, false, false); + if (!isleaf) + { + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else + _bt_clear_incomplete_split(lsn, record, xlrec->target.node, cblkno); + main_blk_index = 1; + } + else + main_blk_index = 0; + + if (record->xl_info & XLR_BKP_BLOCK(main_blk_index)) + (void) RestoreBackupBlock(lsn, record, main_blk_index, false, false); else { buffer = XLogReadBuffer(xlrec->target.node, @@ -187,11 +180,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, { page = (Page) BufferGetPage(buffer); - if (lsn <= PageGetLSN(page)) - { - UnlockReleaseBuffer(buffer); - } - else + if (lsn > PageGetLSN(page)) { if (PageAddItem(page, (Item) datapos, datalen, ItemPointerGetOffsetNumber(&(xlrec->target.tid)), @@ -200,11 +189,14 @@ btree_xlog_insert(bool isleaf, bool ismeta, PageSetLSN(page, lsn); MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); } + UnlockReleaseBuffer(buffer); } } + if (BufferIsValid(cbuffer)) + UnlockReleaseBuffer(cbuffer); + /* * Note: in normal operation, we'd update the metapage while still holding * lock on the page we inserted into. But during replay it's not @@ -216,10 +208,6 @@ btree_xlog_insert(bool isleaf, bool ismeta, _bt_restore_meta(xlrec->target.node, lsn, md.root, md.level, md.fastroot, md.fastlevel); - - /* Forget any split this insertion completes */ - if (!isleaf) - forget_matching_split(xlrec->target.node, downlink, false); } static void @@ -227,6 +215,8 @@ btree_xlog_split(bool onleft, bool isroot, XLogRecPtr lsn, XLogRecord *record) { xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); + bool isleaf = (xlrec->level == 0); + Buffer lbuf; Buffer rbuf; Page rpage; BTPageOpaque ropaque; @@ -237,42 +227,18 @@ btree_xlog_split(bool onleft, bool isroot, Size newitemsz = 0; Item left_hikey = NULL; Size left_hikeysz = 0; + BlockNumber cblkno = InvalidBlockNumber; datapos = (char *) xlrec + SizeOfBtreeSplit; datalen = record->xl_len - SizeOfBtreeSplit; - /* Forget any split this insertion completes */ - if (xlrec->level > 0) - { - /* we assume SizeOfBtreeSplit is at least 16-bit aligned */ - BlockNumber downlink = BlockIdGetBlockNumber((BlockId) datapos); - - datapos += sizeof(BlockIdData); - datalen -= sizeof(BlockIdData); - - forget_matching_split(xlrec->node, downlink, false); - - /* Extract left hikey and its size (still assuming 16-bit alignment) */ - if (!(record->xl_info & XLR_BKP_BLOCK(0))) - { - /* We assume 16-bit alignment is enough for IndexTupleSize */ - left_hikey = (Item) datapos; - left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); - - datapos += left_hikeysz; - datalen -= left_hikeysz; - } - } - - /* Extract newitem and newitemoff, if present */ + /* Extract newitemoff and newitem, if present */ if (onleft) { - /* Extract the offset (still assuming 16-bit alignment) */ memcpy(&newitemoff, datapos, sizeof(OffsetNumber)); datapos += sizeof(OffsetNumber); datalen -= sizeof(OffsetNumber); } - if (onleft && !(record->xl_info & XLR_BKP_BLOCK(0))) { /* @@ -286,6 +252,37 @@ btree_xlog_split(bool onleft, bool isroot, datalen -= newitemsz; } + /* Extract left hikey and its size (still assuming 16-bit alignment) */ + if (!isleaf && !(record->xl_info & XLR_BKP_BLOCK(0))) + { + left_hikey = (Item) datapos; + left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); + datapos += left_hikeysz; + datalen -= left_hikeysz; + } + /* + * If this insertion finishes an incomplete split, get the block number + * of the child. + */ + if (!isleaf && !(record->xl_info & XLR_BKP_BLOCK(1))) + { + memcpy(&cblkno, datapos, sizeof(BlockNumber)); + datapos += sizeof(BlockNumber); + datalen -= sizeof(BlockNumber); + } + + /* + * Clear the incomplete split flag on the left sibling of the child page + * this is a downlink for. + */ + if (!isleaf) + { + if (record->xl_info & XLR_BKP_BLOCK(2)) + (void) RestoreBackupBlock(lsn, record, 2, false, false); + else + _bt_clear_incomplete_split(lsn, record, xlrec->node, cblkno); + } + /* Reconstruct right (new) sibling page from scratch */ rbuf = XLogReadBuffer(xlrec->node, xlrec->rightsib, true); Assert(BufferIsValid(rbuf)); @@ -297,7 +294,7 @@ btree_xlog_split(bool onleft, bool isroot, ropaque->btpo_prev = xlrec->leftsib; ropaque->btpo_next = xlrec->rnext; ropaque->btpo.level = xlrec->level; - ropaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0; + ropaque->btpo_flags = isleaf ? BTP_LEAF : 0; ropaque->btpo_cycleid = 0; _bt_restore_page(rpage, datapos, datalen); @@ -306,7 +303,7 @@ btree_xlog_split(bool onleft, bool isroot, * On leaf level, the high key of the left page is equal to the first key * on the right page. */ - if (xlrec->level == 0) + if (isleaf) { ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque)); @@ -321,10 +318,10 @@ btree_xlog_split(bool onleft, bool isroot, /* Now reconstruct left (original) sibling page */ if (record->xl_info & XLR_BKP_BLOCK(0)) - (void) RestoreBackupBlock(lsn, record, 0, false, false); + lbuf = RestoreBackupBlock(lsn, record, 0, false, true); else { - Buffer lbuf = XLogReadBuffer(xlrec->node, xlrec->leftsib, false); + lbuf = XLogReadBuffer(xlrec->node, xlrec->leftsib, false); if (BufferIsValid(lbuf)) { @@ -381,19 +378,21 @@ btree_xlog_split(bool onleft, bool isroot, elog(PANIC, "failed to add high key to left page after split"); /* Fix opaque fields */ - lopaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0; + lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT; + if (isleaf) + lopaque->btpo_flags |= BTP_LEAF; lopaque->btpo_next = xlrec->rightsib; lopaque->btpo_cycleid = 0; PageSetLSN(lpage, lsn); MarkBufferDirty(lbuf); } - - UnlockReleaseBuffer(lbuf); } } - /* We no longer need the right buffer */ + /* We no longer need the buffers */ + if (BufferIsValid(lbuf)) + UnlockReleaseBuffer(lbuf); UnlockReleaseBuffer(rbuf); /* @@ -404,32 +403,39 @@ btree_xlog_split(bool onleft, bool isroot, * replay, because no other index update can be in progress, and readers * will cope properly when following an obsolete left-link. */ - if (record->xl_info & XLR_BKP_BLOCK(1)) - (void) RestoreBackupBlock(lsn, record, 1, false, false); - else if (xlrec->rnext != P_NONE) + if (xlrec->rnext != P_NONE) { - Buffer buffer = XLogReadBuffer(xlrec->node, xlrec->rnext, false); + /* + * the backup block containing right sibling is 2 or 3, depending + * whether this was a leaf or internal page. + */ + int rnext_index = isleaf ? 2 : 3; - if (BufferIsValid(buffer)) + if (record->xl_info & XLR_BKP_BLOCK(rnext_index)) + (void) RestoreBackupBlock(lsn, record, rnext_index, false, false); + else { - Page page = (Page) BufferGetPage(buffer); + Buffer buffer; - if (lsn > PageGetLSN(page)) + buffer = XLogReadBuffer(xlrec->node, xlrec->rnext, false); + + if (BufferIsValid(buffer)) { - BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); + Page page = (Page) BufferGetPage(buffer); - pageop->btpo_prev = xlrec->rightsib; + if (lsn > PageGetLSN(page)) + { + BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); + pageop->btpo_prev = xlrec->rightsib; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); } - UnlockReleaseBuffer(buffer); } } - - /* The job ain't done till the parent link is inserted... */ - log_incomplete_split(xlrec->node, - xlrec->leftsib, xlrec->rightsib, isroot); } static void @@ -1003,10 +1009,6 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record) Buffer buffer; Page page; BTPageOpaque pageop; - BlockNumber downlink = 0; - - /* Backup blocks are not used in newroot records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); buffer = XLogReadBuffer(xlrec->node, xlrec->rootblk, true); Assert(BufferIsValid(buffer)); @@ -1025,14 +1027,21 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record) if (record->xl_len > SizeOfBtreeNewroot) { IndexTuple itup; + BlockNumber cblkno; _bt_restore_page(page, (char *) xlrec + SizeOfBtreeNewroot, record->xl_len - SizeOfBtreeNewroot); - /* extract downlink to the right-hand split page */ - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_FIRSTKEY)); - downlink = ItemPointerGetBlockNumber(&(itup->t_tid)); + /* extract block number of the left-hand split page */ + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_HIKEY)); + cblkno = ItemPointerGetBlockNumber(&(itup->t_tid)); Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); + + /* Clear the incomplete-split flag in left child */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else + _bt_clear_incomplete_split(lsn, record, xlrec->node, cblkno); } PageSetLSN(page, lsn); @@ -1042,10 +1051,6 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record) _bt_restore_meta(xlrec->node, lsn, xlrec->rootblk, xlrec->level, xlrec->rootblk, xlrec->level); - - /* Check to see if this satisfies any incomplete insertions */ - if (record->xl_len > SizeOfBtreeNewroot) - forget_matching_split(xlrec->node, downlink, true); } static void @@ -1125,59 +1130,3 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record) elog(PANIC, "btree_redo: unknown op code %u", info); } } - -void -btree_xlog_startup(void) -{ - incomplete_splits = NIL; -} - -void -btree_xlog_cleanup(void) -{ - ListCell *l; - - foreach(l, incomplete_splits) - { - /* finish an incomplete split */ - bt_incomplete_split *action = (bt_incomplete_split *) lfirst(l); - Buffer lbuf, - rbuf; - Page lpage, - rpage; - BTPageOpaque lpageop, - rpageop; - bool is_only; - Relation reln; - - lbuf = XLogReadBuffer(action->node, action->leftblk, false); - /* failure is impossible because we wrote this page earlier */ - if (!BufferIsValid(lbuf)) - elog(PANIC, "btree_xlog_cleanup: left block unfound"); - lpage = (Page) BufferGetPage(lbuf); - lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage); - rbuf = XLogReadBuffer(action->node, action->rightblk, false); - /* failure is impossible because we wrote this page earlier */ - if (!BufferIsValid(rbuf)) - elog(PANIC, "btree_xlog_cleanup: right block unfound"); - rpage = (Page) BufferGetPage(rbuf); - rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); - - /* if the pages are all of their level, it's a only-page split */ - is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop); - - reln = CreateFakeRelcacheEntry(action->node); - _bt_insert_parent(reln, lbuf, rbuf, NULL, - action->is_root, is_only); - FreeFakeRelcacheEntry(reln); - } - incomplete_splits = NIL; -} - -bool -btree_safe_restartpoint(void) -{ - if (incomplete_splits) - return false; - return true; -} -- cgit v1.2.3