author     Heikki Linnakangas <heikki.linnakangas@iki.fi>   2014-03-18 20:12:58 +0200
committer  Heikki Linnakangas <heikki.linnakangas@iki.fi>   2014-03-18 20:50:44 +0200
commit     40dae7ec537c5619fc93ad602c62f37be786d161 (patch)
tree       933e82bd349f08206cf2f97841ccee2a80e06c91 /src/backend/access/nbtree/nbtpage.c
parent     b6ec7c92ac7ab6223b3c45dc554efffd1953758f (diff)
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page, and inserting the downlink for the new right page to the parent. Previously, we handled the case that you crash in between those steps with a cleanup routine after the WAL recovery had finished, which finished the incomplete split. However, that doesn't help if the page split is interrupted but the database doesn't crash, so that you don't perform WAL recovery. That could happen for example if you run out of disk space.

Remove the end-of-recovery cleanup step. Instead, when a page is split, the left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink is inserted to the parent, the flag is cleared again. If an insertion sees a page with the flag set, it knows that the split was interrupted for some reason, and inserts the missing downlink before proceeding.

I used the same approach to fix GIN and GiST split algorithms earlier. This was the last WAL cleanup routine, so we could get rid of that whole machinery now, but I'll leave that for a separate patch.

Reviewed by Peter Geoghegan.
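The mechanism described above can be condensed into a small sketch. This is not the patch's actual code: the type and function names below (page_sketch, split_page_sketch, insert_downlink_sketch, insert_tuple_sketch) are invented purely to illustrate the flag lifecycle in isolation, i.e. set on the left half when the page is split, cleared when the downlink reaches the parent, and completed on demand by the next insertion that notices it.

/*
 * Illustrative sketch only -- not PostgreSQL code.  All names here are
 * hypothetical; they exist to show the INCOMPLETE_SPLIT lifecycle.
 */
#include <stdbool.h>

typedef struct page_sketch
{
	bool		incomplete_split;	/* stands in for the INCOMPLETE_SPLIT flag */
} page_sketch;

/* Step 1: splitting a page marks the left half as incompletely split. */
void
split_page_sketch(page_sketch *left)
{
	/* ... move the upper half of the tuples to a new right page ... */
	left->incomplete_split = true;
}

/* Step 2: inserting the downlink into the parent clears the flag again. */
void
insert_downlink_sketch(page_sketch *left)
{
	/* ... add the downlink for the new right page to the parent ... */
	left->incomplete_split = false;
}

/*
 * If a later insertion finds the flag still set, the split was interrupted
 * (a crash, or e.g. running out of disk space), so it finishes the split by
 * inserting the missing downlink before doing its own work.
 */
void
insert_tuple_sketch(page_sketch *target)
{
	if (target->incomplete_split)
		insert_downlink_sketch(target);
	/* ... proceed with the insertion itself ... */
}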
Diffstat (limited to 'src/backend/access/nbtree/nbtpage.c')
-rw-r--r--   src/backend/access/nbtree/nbtpage.c   91
1 file changed, 87 insertions, 4 deletions
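Both hunks in the diff below add essentially the same guard: before deleting a page (or locking its branch parent), check that the left sibling does not still carry the INCOMPLETE_SPLIT flag while pointing at the target page, since that would mean the target has no downlink yet. As a rough consolidation of that pattern, here is a hypothetical backend-internal helper; it is not part of this patch and the function name is invented, but the calls it makes (_bt_getbuf, BufferGetPage, PageGetSpecialPointer, P_INCOMPLETE_SPLIT, _bt_relbuf) are the same ones the new code uses.

/*
 * Hypothetical helper, not part of this patch: returns true if 'target'
 * appears to be the right half of an incomplete split, i.e. its left
 * sibling still has INCOMPLETE_SPLIT set while pointing at 'target'.
 */
#include "postgres.h"

#include "access/nbtree.h"

static bool
_bt_leftsib_splitflag_sketch(Relation rel, BlockNumber leftsib,
							 BlockNumber target)
{
	Buffer		lbuf;
	Page		lpage;
	BTPageOpaque lopaque;
	bool		result;

	/* the leftmost page on its level cannot be a right half */
	if (leftsib == P_NONE)
		return false;

	lbuf = _bt_getbuf(rel, leftsib, BT_READ);
	lpage = BufferGetPage(lbuf);
	lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);

	/*
	 * If the left sibling was concurrently split so that its next-pointer
	 * no longer points at 'target', the split that created 'target' must
	 * already have been completed, because an incompletely split page is
	 * never split again.
	 */
	result = (lopaque->btpo_next == target && P_INCOMPLETE_SPLIT(lopaque));

	_bt_relbuf(rel, lbuf);
	return result;
}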
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 14e422a1d36..87ac5f4aafb 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -992,6 +992,7 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
Buffer pbuf;
Page page;
BTPageOpaque opaque;
+ BlockNumber leftsib;
/* Locate the parent's downlink (updating the stack entry if needed) */
ItemPointerSet(&(stack->bts_btentry.t_tid), child, P_HIKEY);
@@ -1020,7 +1021,8 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
* We have to check the parent itself, and then recurse to test
* the conditions at the parent's parent.
*/
- if (P_RIGHTMOST(opaque) || P_ISROOT(opaque))
+ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
+ P_INCOMPLETE_SPLIT(opaque))
{
_bt_relbuf(rel, pbuf);
return false;
@@ -1028,8 +1030,41 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
*target = parent;
*rightsib = opaque->btpo_next;
+ leftsib = opaque->btpo_prev;
_bt_relbuf(rel, pbuf);
+
+ /*
+ * Like in _bt_pagedel, check that the left sibling is not marked
+ * with INCOMPLETE_SPLIT flag. That would mean that there is no
+ * downlink to the page to be deleted, and the page deletion
+ * algorithm isn't prepared to handle that.
+ */
+ if (leftsib != P_NONE)
+ {
+ Buffer lbuf;
+ Page lpage;
+ BTPageOpaque lopaque;
+
+ lbuf = _bt_getbuf(rel, leftsib, BT_READ);
+ lpage = BufferGetPage(lbuf);
+ lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
+ /*
+ * If the left sibling was concurrently split, so that its
+ * next-pointer doesn't point to the current page anymore,
+ * the split that created the current page must be completed.
+ * (We don't allow splitting an incompletely split page again
+ * until the previous split has been completed)
+ */
+ if (lopaque->btpo_next == parent &&
+ P_INCOMPLETE_SPLIT(lopaque))
+ {
+ _bt_relbuf(rel, lbuf);
+ return false;
+ }
+ _bt_relbuf(rel, lbuf);
+ }
+
return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
topparent, topoff, target, rightsib);
}
@@ -1081,6 +1116,10 @@ _bt_pagedel(Relation rel, Buffer buf)
* "stack" is a search stack leading (approximately) to the target page.
* It is initially NULL, but when iterating, we keep it to avoid
* duplicated search effort.
+ *
+ * Also, when "stack" is not NULL, we have already checked that the
+ * current page is not the right half of an incomplete split, i.e. the
+ * left sibling does not have its INCOMPLETE_SPLIT flag set.
*/
BTStack stack = NULL;
@@ -1117,11 +1156,25 @@ _bt_pagedel(Relation rel, Buffer buf)
}
/*
- * We can never delete rightmost pages nor root pages. While at it,
- * check that page is not already deleted and is empty.
+ * We can never delete rightmost pages nor root pages. While at
+ * it, check that page is not already deleted and is empty.
+ *
+ * To keep the algorithm simple, we also never delete an incompletely
+ * split page (they should be rare enough that this doesn't make any
+ * meaningful difference to disk usage):
+ *
+ * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
+ * left half of an incomplete split, but ensuring that it's not the
+ * right half is more complicated. For that, we have to check that
+ * the left sibling doesn't have its INCOMPLETE_SPLIT flag set. On
+ * the first iteration, we temporarily release the lock on the
+ * current page, and check the left sibling and also construct a
+ * search stack to it. On subsequent iterations, we know we stepped right
+ * from a page that passed these tests, so it's OK.
*/
if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
- P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
+ P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
+ P_INCOMPLETE_SPLIT(opaque))
{
/* Should never fail to delete a half-dead page */
Assert(!P_ISHALFDEAD(opaque));
@@ -1142,6 +1195,9 @@ _bt_pagedel(Relation rel, Buffer buf)
* use the standard search mechanism to search for the page's high
* key; this will give us a link to either the current parent or
* someplace to its left (if there are multiple equal high keys).
+ *
+ * Also check if this is the right-half of an incomplete split
+ * (see comment above).
*/
if (!stack)
{
@@ -1149,16 +1205,43 @@ _bt_pagedel(Relation rel, Buffer buf)
ItemId itemid;
IndexTuple targetkey;
Buffer lbuf;
+ BlockNumber leftsib;
itemid = PageGetItemId(page, P_HIKEY);
targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
+ leftsib = opaque->btpo_prev;
+
/*
* To avoid deadlocks, we'd better drop the leaf page lock
* before going further.
*/
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ /*
+ * Fetch the left sibling, to check that it's not marked
+ * with INCOMPLETE_SPLIT flag. That would mean that the
+ * page to-be-deleted doesn't have a downlink, and the page
+ * deletion algorithm isn't prepared to handle that.
+ */
+ if (!P_LEFTMOST(opaque))
+ {
+ BTPageOpaque lopaque;
+ Page lpage;
+
+ lbuf = _bt_getbuf(rel, leftsib, BT_READ);
+ lpage = BufferGetPage(lbuf);
+ lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
+ if (lopaque->btpo_next == BufferGetBlockNumber(buf) &&
+ P_INCOMPLETE_SPLIT(lopaque))
+ {
+ ReleaseBuffer(buf);
+ _bt_relbuf(rel, lbuf);
+ return ndeleted;
+ }
+ _bt_relbuf(rel, lbuf);
+ }
+
/* we need an insertion scan key for the search, so build one */
itup_scankey = _bt_mkscankey(rel, targetkey);
/* find the leftmost leaf page containing this key */