diff options
Diffstat (limited to 'src/backend/access/nbtree/nbtpage.c')
| -rw-r--r-- | src/backend/access/nbtree/nbtpage.c | 91 |
1 files changed, 87 insertions, 4 deletions
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 14e422a1d36..87ac5f4aafb 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -992,6 +992,7 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, Buffer pbuf; Page page; BTPageOpaque opaque; + BlockNumber leftsib; /* Locate the parent's downlink (updating the stack entry if needed) */ ItemPointerSet(&(stack->bts_btentry.t_tid), child, P_HIKEY); @@ -1020,7 +1021,8 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, * We have to check the parent itself, and then recurse to test * the conditions at the parent's parent. */ - if (P_RIGHTMOST(opaque) || P_ISROOT(opaque)) + if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || + P_INCOMPLETE_SPLIT(opaque)) { _bt_relbuf(rel, pbuf); return false; @@ -1028,8 +1030,41 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, *target = parent; *rightsib = opaque->btpo_next; + leftsib = opaque->btpo_prev; _bt_relbuf(rel, pbuf); + + /* + * Like in _bt_pagedel, check that the left sibling is not marked + * with INCOMPLETE_SPLIT flag. That would mean that there is no + * downlink to the page to be deleted, and the page deletion + * algorithm isn't prepared to handle that. + */ + if (leftsib != P_NONE) + { + Buffer lbuf; + Page lpage; + BTPageOpaque lopaque; + + lbuf = _bt_getbuf(rel, leftsib, BT_READ); + lpage = BufferGetPage(lbuf); + lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); + /* + * If the left sibling was concurrently split, so that its + * next-pointer doesn't point to the current page anymore, + * the split that created the current page must be completed. + * (We don't allow splitting an incompletely split page again + * until the previous split has been completed) + */ + if (lopaque->btpo_next == parent && + P_INCOMPLETE_SPLIT(lopaque)) + { + _bt_relbuf(rel, lbuf); + return false; + } + _bt_relbuf(rel, lbuf); + } + return _bt_lock_branch_parent(rel, parent, stack->bts_parent, topparent, topoff, target, rightsib); } @@ -1081,6 +1116,10 @@ _bt_pagedel(Relation rel, Buffer buf) * "stack" is a search stack leading (approximately) to the target page. * It is initially NULL, but when iterating, we keep it to avoid * duplicated search effort. + * + * Also, when "stack" is not NULL, we have already checked that the + * current page is not the right half of an incomplete split, i.e. the + * left sibling does not have its INCOMPLETE_SPLIT flag set. */ BTStack stack = NULL; @@ -1117,11 +1156,25 @@ _bt_pagedel(Relation rel, Buffer buf) } /* - * We can never delete rightmost pages nor root pages. While at it, - * check that page is not already deleted and is empty. + * We can never delete rightmost pages nor root pages. While at + * it, check that page is not already deleted and is empty. + * + * To keep the algorithm simple, we also never delete an incompletely + * split page (they should be rare enough that this doesn't make any + * meaningful difference to disk usage): + * + * The INCOMPLETE_SPLIT flag on the page tells us if the page is the + * left half of an incomplete split, but ensuring that it's not the + * right half is more complicated. For that, we have to check that + * the left sibling doesn't have its INCOMPLETE_SPLIT flag set. On + * the first iteration, we temporarily release the lock on the + * current page, and check the left sibling and also construct a + * search stack to. On subsequent iterations, we know we stepped right + * from a page that passed these tests, so it's OK. */ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) || - P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page)) + P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) || + P_INCOMPLETE_SPLIT(opaque)) { /* Should never fail to delete a half-dead page */ Assert(!P_ISHALFDEAD(opaque)); @@ -1142,6 +1195,9 @@ _bt_pagedel(Relation rel, Buffer buf) * use the standard search mechanism to search for the page's high * key; this will give us a link to either the current parent or * someplace to its left (if there are multiple equal high keys). + * + * Also check if this is the right-half of an incomplete split + * (see comment above). */ if (!stack) { @@ -1149,16 +1205,43 @@ _bt_pagedel(Relation rel, Buffer buf) ItemId itemid; IndexTuple targetkey; Buffer lbuf; + BlockNumber leftsib; itemid = PageGetItemId(page, P_HIKEY); targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid)); + leftsib = opaque->btpo_prev; + /* * To avoid deadlocks, we'd better drop the leaf page lock * before going further. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); + /* + * Fetch the left sibling, to check that it's not marked + * with INCOMPLETE_SPLIT flag. That would mean that the + * page to-be-deleted doesn't have a downlink, and the page + * deletion algorithm isn't prepared to handle that. + */ + if (!P_LEFTMOST(opaque)) + { + BTPageOpaque lopaque; + Page lpage; + + lbuf = _bt_getbuf(rel, leftsib, BT_READ); + lpage = BufferGetPage(lbuf); + lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); + if (lopaque->btpo_next == BufferGetBlockNumber(buf) && + P_INCOMPLETE_SPLIT(lopaque)) + { + ReleaseBuffer(buf); + _bt_relbuf(rel, lbuf); + return ndeleted; + } + _bt_relbuf(rel, lbuf); + } + /* we need an insertion scan key for the search, so build one */ itup_scankey = _bt_mkscankey(rel, targetkey); /* find the leftmost leaf page containing this key */ |
