summaryrefslogtreecommitdiff
path: root/src/backend/access/nbtree/nbtpage.c
diff options
context:
space:
mode:
authorPeter Geoghegan <pg@bowt.ie>2019-03-20 10:04:01 -0700
committerPeter Geoghegan <pg@bowt.ie>2019-03-20 10:04:01 -0700
commitdd299df8189bd00fbe54b72c64f43b6af2ffeccd (patch)
tree931ef720687d61cf5e75464fa0b1c1d75fb3f9d3 /src/backend/access/nbtree/nbtpage.c
parente5adcb789d80ba565ccacb1ed4341a7c29085238 (diff)
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute. Index searches can distinguish duplicates by heap TID, since heap TID is always guaranteed to be unique. This general approach has numerous benefits for performance, and is prerequisite to teaching VACUUM to perform "retail index tuple deletion". Naively adding a new attribute to every pivot tuple has unacceptable overhead (it bloats internal pages), so suffix truncation of pivot tuples is added. This will usually truncate away the "extra" heap TID attribute from pivot tuples during a leaf page split, and may also truncate away additional user attributes. This can increase fan-out, especially in a multi-column index. Truncation can only occur at the attribute granularity, which isn't particularly effective, but works well enough for now. A future patch may add support for truncating "within" text attributes by generating truncated key values using new opclass infrastructure. Only new indexes (BTREE_VERSION 4 indexes) will have insertions that treat heap TID as a tiebreaker attribute, or will have pivot tuples undergo suffix truncation during a leaf page split (on-disk compatibility with versions 2 and 3 is preserved). Upgrades to version 4 cannot be performed on-the-fly, unlike upgrades from version 2 to version 3. contrib/amcheck continues to work with version 2 and 3 indexes, while also enforcing stricter invariants when verifying version 4 indexes. These stricter invariants are the same invariants described by "3.1.12 Sequencing" from the Lehman and Yao paper. A later patch will enhance the logic used by nbtree to pick a split point. This patch is likely to negatively impact performance without smarter choices around the precise point to split leaf pages at. Making these two mostly-distinct sets of enhancements into distinct commits seems like it might clarify their design, even though neither commit is particularly useful on its own. The maximum allowed size of new tuples is reduced by an amount equal to the space required to store an extra MAXALIGN()'d TID in a new high key during leaf page splits. The user-facing definition of the "1/3 of a page" restriction is already imprecise, and so does not need to be revised. However, there should be a compatibility note in the v12 release notes. Author: Peter Geoghegan Reviewed-By: Heikki Linnakangas, Alexander Korotkov Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
Diffstat (limited to 'src/backend/access/nbtree/nbtpage.c')
-rw-r--r--src/backend/access/nbtree/nbtpage.c206
1 files changed, 141 insertions, 65 deletions
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 56041c3d383..37829d34321 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -33,7 +33,8 @@
#include "storage/predicate.h"
#include "utils/snapmgr.h"
-static void _bt_cachemetadata(Relation rel, BTMetaPageData *metad);
+static void _bt_cachemetadata(Relation rel, BTMetaPageData *input);
+static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
bool *rightsib_empty);
@@ -77,7 +78,9 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
}
/*
- * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to the new.
+ * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
+ * 3, the last version that can be updated without broadly affecting
+ * on-disk compatibility. (A REINDEX is required to upgrade to v4.)
*
* This routine does purely in-memory image upgrade. Caller is
* responsible for locking, WAL-logging etc.
@@ -93,11 +96,11 @@ _bt_upgrademetapage(Page page)
/* It must be really a meta page of upgradable version */
Assert(metaopaque->btpo_flags & BTP_META);
- Assert(metad->btm_version < BTREE_VERSION);
+ Assert(metad->btm_version < BTREE_NOVAC_VERSION);
Assert(metad->btm_version >= BTREE_MIN_VERSION);
/* Set version number and fill extra fields added into version 3 */
- metad->btm_version = BTREE_VERSION;
+ metad->btm_version = BTREE_NOVAC_VERSION;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
@@ -107,44 +110,80 @@ _bt_upgrademetapage(Page page)
}
/*
- * Cache metadata from meta page to rel->rd_amcache.
+ * Cache metadata from input meta page to rel->rd_amcache.
*/
static void
-_bt_cachemetadata(Relation rel, BTMetaPageData *metad)
+_bt_cachemetadata(Relation rel, BTMetaPageData *input)
{
+ BTMetaPageData *cached_metad;
+
/* We assume rel->rd_amcache was already freed by caller */
Assert(rel->rd_amcache == NULL);
rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
sizeof(BTMetaPageData));
- /*
- * Meta page should be of supported version (should be already checked by
- * caller).
- */
- Assert(metad->btm_version >= BTREE_MIN_VERSION &&
- metad->btm_version <= BTREE_VERSION);
+ /* Meta page should be of supported version */
+ Assert(input->btm_version >= BTREE_MIN_VERSION &&
+ input->btm_version <= BTREE_VERSION);
- if (metad->btm_version == BTREE_VERSION)
+ cached_metad = (BTMetaPageData *) rel->rd_amcache;
+ if (input->btm_version >= BTREE_NOVAC_VERSION)
{
- /* Last version of meta-data, no need to upgrade */
- memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+ /* Version with compatible meta-data, no need to upgrade */
+ memcpy(cached_metad, input, sizeof(BTMetaPageData));
}
else
{
- BTMetaPageData *cached_metad = (BTMetaPageData *) rel->rd_amcache;
-
/*
* Upgrade meta-data: copy available information from meta-page and
* fill new fields with default values.
+ *
+ * Note that we cannot upgrade to version 4+ without a REINDEX, since
+ * extensive on-disk changes are required.
*/
- memcpy(rel->rd_amcache, metad, offsetof(BTMetaPageData, btm_oldest_btpo_xact));
- cached_metad->btm_version = BTREE_VERSION;
+ memcpy(cached_metad, input, offsetof(BTMetaPageData, btm_oldest_btpo_xact));
+ cached_metad->btm_version = BTREE_NOVAC_VERSION;
cached_metad->btm_oldest_btpo_xact = InvalidTransactionId;
cached_metad->btm_last_cleanup_num_heap_tuples = -1.0;
}
}
/*
+ * Get metadata from share-locked buffer containing metapage, while performing
+ * standard sanity checks. Sanity checks here must match _bt_getroot().
+ */
+static BTMetaPageData *
+_bt_getmeta(Relation rel, Buffer metabuf)
+{
+ Page metapg;
+ BTPageOpaque metaopaque;
+ BTMetaPageData *metad;
+
+ metapg = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ metad = BTPageGetMeta(metapg);
+
+ /* sanity-check the metapage */
+ if (!P_ISMETA(metaopaque) ||
+ metad->btm_magic != BTREE_MAGIC)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" is not a btree",
+ RelationGetRelationName(rel))));
+
+ if (metad->btm_version < BTREE_MIN_VERSION ||
+ metad->btm_version > BTREE_VERSION)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("version mismatch in index \"%s\": file version %d, "
+ "current version %d, minimal supported version %d",
+ RelationGetRelationName(rel),
+ metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
+
+ return metad;
+}
+
+/*
* _bt_update_meta_cleanup_info() -- Update cleanup-related information in
* the metapage.
*
@@ -167,7 +206,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
metad = BTPageGetMeta(metapg);
/* outdated version of metapage always needs rewrite */
- if (metad->btm_version < BTREE_VERSION)
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
needsRewrite = true;
else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
@@ -186,7 +225,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
START_CRIT_SECTION();
/* upgrade meta-page if needed */
- if (metad->btm_version < BTREE_VERSION)
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
/* update cleanup-related information */
@@ -202,6 +241,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
XLogBeginInsert();
XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ md.version = metad->btm_version;
md.root = metad->btm_root;
md.level = metad->btm_level;
md.fastroot = metad->btm_fastroot;
@@ -376,7 +417,7 @@ _bt_getroot(Relation rel, int access)
START_CRIT_SECTION();
/* upgrade metapage if needed */
- if (metad->btm_version < BTREE_VERSION)
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_root = rootblkno;
@@ -400,6 +441,8 @@ _bt_getroot(Relation rel, int access)
XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ md.version = metad->btm_version;
md.root = rootblkno;
md.level = 0;
md.fastroot = rootblkno;
@@ -595,37 +638,12 @@ _bt_getrootheight(Relation rel)
{
BTMetaPageData *metad;
- /*
- * We can get what we need from the cached metapage data. If it's not
- * cached yet, load it. Sanity checks here must match _bt_getroot().
- */
if (rel->rd_amcache == NULL)
{
Buffer metabuf;
- Page metapg;
- BTPageOpaque metaopaque;
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
- metapg = BufferGetPage(metabuf);
- metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
- metad = BTPageGetMeta(metapg);
-
- /* sanity-check the metapage */
- if (!P_ISMETA(metaopaque) ||
- metad->btm_magic != BTREE_MAGIC)
- ereport(ERROR,
- (errcode(ERRCODE_INDEX_CORRUPTED),
- errmsg("index \"%s\" is not a btree",
- RelationGetRelationName(rel))));
-
- if (metad->btm_version < BTREE_MIN_VERSION ||
- metad->btm_version > BTREE_VERSION)
- ereport(ERROR,
- (errcode(ERRCODE_INDEX_CORRUPTED),
- errmsg("version mismatch in index \"%s\": file version %d, "
- "current version %d, minimal supported version %d",
- RelationGetRelationName(rel),
- metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
+ metad = _bt_getmeta(rel, metabuf);
/*
* If there's no root page yet, _bt_getroot() doesn't expect a cache
@@ -642,20 +660,71 @@ _bt_getrootheight(Relation rel)
* Cache the metapage data for next time
*/
_bt_cachemetadata(rel, metad);
-
+ /* We shouldn't have cached it if any of these fail */
+ Assert(metad->btm_magic == BTREE_MAGIC);
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ Assert(metad->btm_fastroot != P_NONE);
_bt_relbuf(rel, metabuf);
}
+ /* Get cached page */
metad = (BTMetaPageData *) rel->rd_amcache;
- /* We shouldn't have cached it if any of these fail */
- Assert(metad->btm_magic == BTREE_MAGIC);
- Assert(metad->btm_version == BTREE_VERSION);
- Assert(metad->btm_fastroot != P_NONE);
return metad->btm_fastlevel;
}
/*
+ * _bt_heapkeyspace() -- is heap TID being treated as a key?
+ *
+ * This is used to determine the rules that must be used to descend a
+ * btree. Version 4 indexes treat heap TID as a tiebreaker attribute.
+ * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
+ * performance when inserting a new BTScanInsert-wise duplicate tuple
+ * among many leaf pages already full of such duplicates.
+ */
+bool
+_bt_heapkeyspace(Relation rel)
+{
+ BTMetaPageData *metad;
+
+ if (rel->rd_amcache == NULL)
+ {
+ Buffer metabuf;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metad = _bt_getmeta(rel, metabuf);
+
+ /*
+ * If there's no root page yet, _bt_getroot() doesn't expect a cache
+ * to be made, so just stop here. (XXX perhaps _bt_getroot() should
+ * be changed to allow this case.)
+ */
+ if (metad->btm_root == P_NONE)
+ {
+ uint32 btm_version = metad->btm_version;
+
+ _bt_relbuf(rel, metabuf);
+ return btm_version > BTREE_NOVAC_VERSION;
+ }
+
+ /*
+ * Cache the metapage data for next time
+ */
+ _bt_cachemetadata(rel, metad);
+ /* We shouldn't have cached it if any of these fail */
+ Assert(metad->btm_magic == BTREE_MAGIC);
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ Assert(metad->btm_fastroot != P_NONE);
+ _bt_relbuf(rel, metabuf);
+ }
+
+ /* Get cached page */
+ metad = (BTMetaPageData *) rel->rd_amcache;
+
+ return metad->btm_version > BTREE_NOVAC_VERSION;
+}
+
+/*
* _bt_checkpage() -- Verify that a freshly-read page looks sane.
*/
void
@@ -1123,11 +1192,12 @@ _bt_is_page_halfdead(Relation rel, BlockNumber blk)
* right sibling.
*
* "child" is the leaf page we wish to delete, and "stack" is a search stack
- * leading to it (approximately). Note that we will update the stack
- * entry(s) to reflect current downlink positions --- this is essentially the
- * same as the corresponding step of splitting, and is not expected to affect
- * caller. The caller should initialize *target and *rightsib to the leaf
- * page and its right sibling.
+ * leading to it (it actually leads to the leftmost leaf page with a high key
+ * matching that of the page to be deleted in !heapkeyspace indexes). Note
+ * that we will update the stack entry(s) to reflect current downlink
+ * positions --- this is essentially the same as the corresponding step of
+ * splitting, and is not expected to affect caller. The caller should
+ * initialize *target and *rightsib to the leaf page and its right sibling.
*
* Note: it's OK to release page locks on any internal pages between the leaf
* and *topparent, because a safe deletion can't become unsafe due to
@@ -1149,8 +1219,10 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
BlockNumber leftsib;
/*
- * Locate the downlink of "child" in the parent (updating the stack entry
- * if needed)
+ * Locate the downlink of "child" in the parent, updating the stack entry
+ * if needed. This is how !heapkeyspace indexes deal with having
+ * non-unique high keys in leaf level pages. Even heapkeyspace indexes
+ * can have a stale stack due to insertions into the parent.
*/
stack->bts_btentry = child;
pbuf = _bt_getstackbuf(rel, stack);
@@ -1362,9 +1434,10 @@ _bt_pagedel(Relation rel, Buffer buf)
{
/*
* We need an approximate pointer to the page's parent page. We
- * use the standard search mechanism to search for the page's high
- * key; this will give us a link to either the current parent or
- * someplace to its left (if there are multiple equal high keys).
+ * use a variant of the standard search mechanism to search for
+ * the page's high key; this will give us a link to either the
+ * current parent or someplace to its left (if there are multiple
+ * equal high keys, which is possible with !heapkeyspace indexes).
*
* Also check if this is the right-half of an incomplete split
* (see comment above).
@@ -1422,7 +1495,8 @@ _bt_pagedel(Relation rel, Buffer buf)
/* we need an insertion scan key for the search, so build one */
itup_key = _bt_mkscankey(rel, targetkey);
- /* get stack to leaf page by searching index */
+ /* find the leftmost leaf page with matching pivot/high key */
+ itup_key->pivotsearch = true;
stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL);
/* don't need a lock or second pin on the page */
_bt_relbuf(rel, lbuf);
@@ -1969,7 +2043,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
if (BufferIsValid(metabuf))
{
/* upgrade metapage if needed */
- if (metad->btm_version < BTREE_VERSION)
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_fastroot = rightsib;
metad->btm_fastlevel = targetlevel;
@@ -2017,6 +2091,8 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
{
XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ xlmeta.version = metad->btm_version;
xlmeta.root = metad->btm_root;
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;