diff options
Diffstat (limited to 'src/backend/access/nbtree/nbtinsert.c')
-rw-r--r-- | src/backend/access/nbtree/nbtinsert.c | 286 |
1 files changed, 223 insertions, 63 deletions
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 5f7953f5b38..3cd70ad4cfe 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -57,15 +57,18 @@ static void _bt_findinsertloc(Relation rel, int keysz, ScanKey scankey, IndexTuple newtup, + BTStack stack, Relation heapRel); -static void _bt_insertonpg(Relation rel, Buffer buf, +static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, bool split_only_page); -static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, - OffsetNumber newitemoff, Size newitemsz, +static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf, + OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool newitemonleft); +static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, + BTStack stack, bool is_root, bool is_only); static OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber newitemoff, Size newitemsz, @@ -129,7 +132,8 @@ top: * move right in the tree. See Lehman and Yao for an excruciatingly * precise description. */ - buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE); + buf = _bt_moveright(rel, buf, natts, itup_scankey, false, + true, stack, BT_WRITE); /* * If we're not allowing duplicates, make sure the key isn't already in @@ -182,8 +186,9 @@ top: */ CheckForSerializableConflictIn(rel, NULL, buf); /* do the insertion */ - _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel); - _bt_insertonpg(rel, buf, stack, itup, offset, false); + _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, + stack, heapRel); + _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false); } else { @@ -507,6 +512,7 @@ _bt_findinsertloc(Relation rel, int keysz, ScanKey scankey, IndexTuple newtup, + BTStack stack, Relation heapRel) { Buffer buf = *bufptr; @@ -568,6 +574,7 @@ _bt_findinsertloc(Relation rel, while (PageGetFreeSpace(page) < itemsz) { Buffer rbuf; + BlockNumber rblkno; /* * before considering moving right, see if we can obtain enough space @@ -605,18 +612,33 @@ _bt_findinsertloc(Relation rel, */ rbuf = InvalidBuffer; + rblkno = lpageop->btpo_next; for (;;) { - BlockNumber rblkno = lpageop->btpo_next; - rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); page = BufferGetPage(rbuf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * If this page was incompletely split, finish the split now. + * We do this while holding a lock on the left sibling, which + * is not good because finishing the split could be a fairly + * lengthy operation. But this should happen very seldom. + */ + if (P_INCOMPLETE_SPLIT(lpageop)) + { + _bt_finish_split(rel, rbuf, stack); + rbuf = InvalidBuffer; + continue; + } + if (!P_IGNORE(lpageop)) break; if (P_RIGHTMOST(lpageop)) elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); + + rblkno = lpageop->btpo_next; } _bt_relbuf(rel, buf); buf = rbuf; @@ -658,10 +680,14 @@ _bt_findinsertloc(Relation rel, * child page on the parent. * + updates the metapage if a true root or fast root is split. * - * On entry, we must have the right buffer in which to do the + * On entry, we must have the correct buffer in which to do the * insertion, and the buffer must be pinned and write-locked. On return, * we will have dropped both the pin and the lock on the buffer. * + * When inserting to a non-leaf page, 'cbuf' is the left-sibling of the + * page we're inserting the downlink for. This function will clear the + * INCOMPLETE_SPLIT flag on it, and release the buffer. + * * The locking interactions in this code are critical. You should * grok Lehman and Yao's paper before making any changes. In addition, * you need to understand how we disambiguate duplicate keys in this @@ -675,6 +701,7 @@ _bt_findinsertloc(Relation rel, static void _bt_insertonpg(Relation rel, Buffer buf, + Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, @@ -688,6 +715,14 @@ _bt_insertonpg(Relation rel, page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + /* child buffer must be given iff inserting on an internal page */ + Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf)); + + /* The caller should've finished any incomplete splits already. */ + if (P_INCOMPLETE_SPLIT(lpageop)) + elog(ERROR, "cannot insert to incompletely split page %u", + BufferGetBlockNumber(buf)); + itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ @@ -712,7 +747,7 @@ _bt_insertonpg(Relation rel, &newitemonleft); /* split the buffer into left and right halves */ - rbuf = _bt_split(rel, buf, firstright, + rbuf = _bt_split(rel, buf, cbuf, firstright, newitemoff, itemsz, itup, newitemonleft); PredicateLockPageSplit(rel, BufferGetBlockNumber(buf), @@ -786,11 +821,22 @@ _bt_insertonpg(Relation rel, MarkBufferDirty(metabuf); } + /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */ + if (BufferIsValid(cbuf)) + { + Page cpage = BufferGetPage(cbuf); + BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage); + + Assert(P_INCOMPLETE_SPLIT(cpageop)); + cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(cbuf); + } + /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_btree_insert xlrec; - BlockNumber xldownlink; + BlockNumber xlleftchild; xl_btree_metadata xlmeta; uint8 xlinfo; XLogRecPtr recptr; @@ -810,12 +856,15 @@ _bt_insertonpg(Relation rel, xlinfo = XLOG_BTREE_INSERT_LEAF; else { - xldownlink = ItemPointerGetBlockNumber(&(itup->t_tid)); - Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); - - nextrdata->data = (char *) &xldownlink; + /* + * Include the block number of the left child, whose + * INCOMPLETE_SPLIT flag was cleared. + */ + xlleftchild = BufferGetBlockNumber(cbuf); + nextrdata->data = (char *) &xlleftchild; nextrdata->len = sizeof(BlockNumber); - nextrdata->buffer = InvalidBuffer; + nextrdata->buffer = cbuf; + nextrdata->buffer_std = true; nextrdata->next = nextrdata + 1; nextrdata++; @@ -870,7 +919,8 @@ _bt_insertonpg(Relation rel, /* release buffers */ if (BufferIsValid(metabuf)) _bt_relbuf(rel, metabuf); - + if (BufferIsValid(cbuf)) + _bt_relbuf(rel, cbuf); _bt_relbuf(rel, buf); } } @@ -883,11 +933,15 @@ _bt_insertonpg(Relation rel, * new right page. newitemoff etc. tell us about the new item that * must be inserted along with the data from the old page. * + * When splitting a non-leaf page, 'cbuf' is the left-sibling of the + * page we're inserting the downlink for. This function will clear the + * INCOMPLETE_SPLIT flag on it, and release the buffer. + * * Returns the new right sibling of buf, pinned and write-locked. * The pin and lock on buf are maintained. */ static Buffer -_bt_split(Relation rel, Buffer buf, OffsetNumber firstright, +_bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool newitemonleft) { @@ -911,6 +965,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, OffsetNumber maxoff; OffsetNumber i; bool isroot; + bool isleaf; /* Acquire a new page to split into */ rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); @@ -949,12 +1004,15 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); isroot = P_ISROOT(oopaque); + isleaf = P_ISLEAF(oopaque); /* if we're splitting this page, it won't be the root when we're done */ /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */ lopaque->btpo_flags = oopaque->btpo_flags; lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); ropaque->btpo_flags = lopaque->btpo_flags; + /* set flag in left page indicating that the right page has no downlink */ + lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; lopaque->btpo_prev = oopaque->btpo_prev; lopaque->btpo_next = rightpagenumber; ropaque->btpo_prev = origpagenumber; @@ -1178,6 +1236,19 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, MarkBufferDirty(sbuf); } + /* + * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes + * a split. + */ + if (!isleaf) + { + Page cpage = BufferGetPage(cbuf); + BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage); + + cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(cbuf); + } + /* XLOG stuff */ if (RelationNeedsWAL(rel)) { @@ -1186,6 +1257,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, XLogRecPtr recptr; XLogRecData rdata[7]; XLogRecData *lastrdata; + BlockNumber cblkno; xlrec.node = rel->rd_node; xlrec.leftsib = origpagenumber; @@ -1200,34 +1272,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, lastrdata = &rdata[0]; - if (ropaque->btpo.level > 0) - { - /* Log downlink on non-leaf pages */ - lastrdata->next = lastrdata + 1; - lastrdata++; - - lastrdata->data = (char *) &newitem->t_tid.ip_blkid; - lastrdata->len = sizeof(BlockIdData); - lastrdata->buffer = InvalidBuffer; - - /* - * We must also log the left page's high key, because the right - * page's leftmost key is suppressed on non-leaf levels. Show it - * as belonging to the left page buffer, so that it is not stored - * if XLogInsert decides it needs a full-page image of the left - * page. - */ - lastrdata->next = lastrdata + 1; - lastrdata++; - - itemid = PageGetItemId(origpage, P_HIKEY); - item = (IndexTuple) PageGetItem(origpage, itemid); - lastrdata->data = (char *) item; - lastrdata->len = MAXALIGN(IndexTupleSize(item)); - lastrdata->buffer = buf; /* backup block 1 */ - lastrdata->buffer_std = true; - } - /* * Log the new item and its offset, if it was inserted on the left * page. (If it was put on the right page, we don't need to explicitly @@ -1254,17 +1298,40 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, lastrdata->buffer = buf; /* backup block 1 */ lastrdata->buffer_std = true; } - else if (ropaque->btpo.level == 0) + + /* Log left page */ + if (!isleaf) { + lastrdata->next = lastrdata + 1; + lastrdata++; + /* - * Although we don't need to WAL-log the new item, we still need - * XLogInsert to consider storing a full-page image of the left - * page, so make an empty entry referencing that buffer. This also - * ensures that the left page is always backup block 1. + * We must also log the left page's high key, because the right + * page's leftmost key is suppressed on non-leaf levels. Show it + * as belonging to the left page buffer, so that it is not stored + * if XLogInsert decides it needs a full-page image of the left + * page. */ + itemid = PageGetItemId(origpage, P_HIKEY); + item = (IndexTuple) PageGetItem(origpage, itemid); + lastrdata->data = (char *) item; + lastrdata->len = MAXALIGN(IndexTupleSize(item)); + lastrdata->buffer = buf; /* backup block 1 */ + lastrdata->buffer_std = true; + } + + if (isleaf && !newitemonleft) + { lastrdata->next = lastrdata + 1; lastrdata++; + /* + * Although we don't need to WAL-log anything on the left page, + * we still need XLogInsert to consider storing a full-page image + * of the left page, so make an empty entry referencing that + * buffer. This also ensures that the left page is always backup + * block 1. + */ lastrdata->data = NULL; lastrdata->len = 0; lastrdata->buffer = buf; /* backup block 1 */ @@ -1272,6 +1339,22 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, } /* + * Log block number of left child, whose INCOMPLETE_SPLIT flag this + * insertion clears. + */ + if (!isleaf) + { + lastrdata->next = lastrdata + 1; + lastrdata++; + + cblkno = BufferGetBlockNumber(cbuf); + lastrdata->data = (char *) &cblkno; + lastrdata->len = sizeof(BlockNumber); + lastrdata->buffer = cbuf; /* backup block 2 */ + lastrdata->buffer_std = true; + } + + /* * Log the contents of the right page in the format understood by * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer, * because we're going to recreate the whole page anyway, so it should @@ -1300,7 +1383,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, lastrdata->data = NULL; lastrdata->len = 0; - lastrdata->buffer = sbuf; /* backup block 2 */ + lastrdata->buffer = sbuf; /* bkp block 2 (leaf) or 3 (non-leaf) */ lastrdata->buffer_std = true; } @@ -1327,6 +1410,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, if (!P_RIGHTMOST(ropaque)) _bt_relbuf(rel, sbuf); + /* release the child */ + if (!isleaf) + _bt_relbuf(rel, cbuf); + /* split's done */ return rbuf; } @@ -1597,10 +1684,8 @@ _bt_checksplitloc(FindSplitData *state, * have to be efficient (concurrent ROOT split, WAL recovery) * is_root - we split the true root * is_only - we split a page alone on its level (might have been fast root) - * - * This is exported so it can be called by nbtxlog.c. */ -void +static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, @@ -1619,8 +1704,7 @@ _bt_insert_parent(Relation rel, * * If we have to search for the parent level, we do so by re-descending * from the root. This is not super-efficient, but it's rare enough not - * to matter. (This path is also taken when called from WAL recovery --- - * we have no stack in that case.) + * to matter. */ if (is_root) { @@ -1679,12 +1763,13 @@ _bt_insert_parent(Relation rel, * 05/27/97 */ ItemPointerSet(&(stack->bts_btentry.t_tid), bknum, P_HIKEY); - pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); - /* Now we can unlock the children */ + /* + * Now we can unlock the right child. The left child will be unlocked + * by _bt_insertonpg(). + */ _bt_relbuf(rel, rbuf); - _bt_relbuf(rel, buf); /* Check for error only after writing children */ if (pbuf == InvalidBuffer) @@ -1692,7 +1777,7 @@ _bt_insert_parent(Relation rel, RelationGetRelationName(rel), bknum, rbknum); /* Recursively update the parent */ - _bt_insertonpg(rel, pbuf, stack->bts_parent, + _bt_insertonpg(rel, pbuf, buf, stack->bts_parent, new_item, stack->bts_offset + 1, is_only); @@ -1702,6 +1787,62 @@ _bt_insert_parent(Relation rel, } /* + * _bt_finish_split() -- Finish an incomplete split + * + * A crash or other failure can leave a split incomplete. The insertion + * routines won't allow to insert on a page that is incompletely split. + * Before inserting on such a page, call _bt_finish_split(). + * + * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked + * and unpinned. + */ +void +_bt_finish_split(Relation rel, Buffer lbuf, BTStack stack) +{ + Page lpage = BufferGetPage(lbuf); + BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage); + Buffer rbuf; + Page rpage; + BTPageOpaque rpageop; + bool was_root; + bool was_only; + + Assert(P_INCOMPLETE_SPLIT(lpageop)); + + /* Lock right sibling, the one missing the downlink */ + rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE); + rpage = BufferGetPage(rbuf); + rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* Could this be a root split? */ + if (!stack) + { + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + + /* acquire lock on the metapage */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + was_root = (metad->btm_root == BufferGetBlockNumber(lbuf)); + + _bt_relbuf(rel, metabuf); + } + else + was_root = false; + + /* Was this the only page on the level before split? */ + was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop)); + + elog(DEBUG1, "finishing incomplete split of %u/%u", + BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf)); + + _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only); +} + +/* * _bt_getstackbuf() -- Walk back up the tree one step, and find the item * we last looked at in the parent. * @@ -1733,6 +1874,12 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access) page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (access == BT_WRITE && P_INCOMPLETE_SPLIT(opaque)) + { + _bt_finish_split(rel, buf, stack->bts_parent); + continue; + } + if (!P_IGNORE(opaque)) { OffsetNumber offnum, @@ -1837,6 +1984,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) rbkno; BlockNumber rootblknum; BTPageOpaque rootopaque; + BTPageOpaque lopaque; ItemId itemid; IndexTuple item; Size itemsz; @@ -1848,6 +1996,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) lbkno = BufferGetBlockNumber(lbuf); rbkno = BufferGetBlockNumber(rbuf); lpage = BufferGetPage(lbuf); + lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); /* get a new root page */ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); @@ -1921,6 +2070,11 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); pfree(new_item); + /* Clear the incomplete-split flag in the left child */ + Assert(P_INCOMPLETE_SPLIT(lopaque)); + lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(lbuf); + MarkBufferDirty(rootbuf); MarkBufferDirty(metabuf); @@ -1929,7 +2083,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) { xl_btree_newroot xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; + XLogRecData rdata[3]; xlrec.node = rel->rd_node; xlrec.rootblk = rootblknum; @@ -1948,7 +2102,13 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) rdata[1].len = ((PageHeader) rootpage)->pd_special - ((PageHeader) rootpage)->pd_upper; rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; + rdata[1].next = &(rdata[2]); + + /* Make a full-page image of the left child if needed */ + rdata[2].data = NULL; + rdata[2].len = 0; + rdata[2].buffer = lbuf; + rdata[2].next = NULL; recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, rdata); |