Diffstat (limited to 'src/include')
-rw-r--r--  src/include/access/nbtree.h         | 433
-rw-r--r--  src/include/access/nbtxlog.h        | 117
-rw-r--r--  src/include/access/rmgrlist.h       |   2
-rw-r--r--  src/include/access/xlog_internal.h  |   2
4 files changed, 472 insertions(+), 82 deletions(-)
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index e8d4d2b55b7..bfe49f46b08 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -108,6 +108,7 @@ typedef struct BTMetaPageData
* pages */
float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples
* during last cleanup */
+ bool btm_allequalimage; /* are all columns "equalimage"? */
} BTMetaPageData;
#define BTPageGetMeta(p) \
@@ -124,6 +125,14 @@ typedef struct BTMetaPageData
* need to be immediately re-indexed at pg_upgrade. In order to get the
* new heapkeyspace semantics, however, a REINDEX is needed.
*
+ * Deduplication is safe to use when the btm_allequalimage field is set to
+ * true. It's safe to read the btm_allequalimage field on version 3, but
+ * only version 4 indexes make use of deduplication. Even version 4
+ * indexes created on PostgreSQL v12 will need a REINDEX to make use of
+ * deduplication, though, since there is no other way to set
+ * btm_allequalimage to true (pg_upgrade hasn't been taught to set the
+ * metapage field).
+ *
* Btree version 2 is mostly the same as version 3. There are two new
* fields in the metapage that were introduced in version 3. A version 2
* metapage will be automatically upgraded to version 3 on the first
@@ -157,6 +166,21 @@ typedef struct BTMetaPageData
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
/*
+ * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs that
+ * may be stored on a btree leaf page.  It is used to size the per-page
+ * temporary buffers used by index scans.
+ *
+ * Note: we don't bother considering per-tuple overheads here to keep
+ * things simple (value is based on how many elements a single array of
+ * heap TIDs must have to fill the space between the page header and
+ * special area). The value is slightly higher (i.e. more conservative)
+ * than necessary as a result, which is considered acceptable.
+ */
+#define MaxTIDsPerBTreePage \
+ (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
+ sizeof(ItemPointerData))
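
For concreteness: with the default 8kB BLCKSZ and typical struct sizes, the macro works out to 1358 TIDs per leaf page. A minimal standalone sketch of the arithmetic (the three size constants are assumptions about a typical build; the real definition uses the actual headers):

    #include <stdio.h>

    int
    main(void)
    {
        const int blcksz = 8192;    /* default BLCKSZ (assumed) */
        const int pagehdr = 24;     /* SizeOfPageHeaderData (assumed) */
        const int special = 16;     /* sizeof(BTPageOpaqueData) (assumed) */
        const int tidsz = 6;        /* sizeof(ItemPointerData) */

        /* (8192 - 24 - 16) / 6 = 1358 heap TIDs per leaf page */
        printf("MaxTIDsPerBTreePage = %d\n",
               (blcksz - pagehdr - special) / tidsz);
        return 0;
    }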
+
+/*
* The leaf-page fillfactor defaults to 90% but is user-adjustable.
* For pages above the leaf level, we use a fixed 70% fillfactor.
* The fillfactor is applied during index build and when splitting
@@ -230,16 +254,15 @@ typedef struct BTMetaPageData
* tuples (non-pivot tuples). _bt_check_natts() enforces the rules
* described here.
*
- * Non-pivot tuple format:
+ * Non-pivot tuple format (plain/non-posting variant):
*
* t_tid | t_info | key values | INCLUDE columns, if any
*
* t_tid points to the heap TID, which is a tiebreaker key column as of
- * BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never
- * set for non-pivot tuples.
+ * BTREE_VERSION 4.
*
- * All other types of index tuples ("pivot" tuples) only have key columns,
- * since pivot tuples only exist to represent how the key space is
+ * Non-pivot tuples complement pivot tuples, which only have key columns.
+ * The sole purpose of pivot tuples is to represent how the key space is
* separated. In general, any B-Tree index that has more than one level
* (i.e. any index that does not just consist of a metapage and a single
* leaf root page) must have some number of pivot tuples, since pivot
@@ -264,7 +287,8 @@ typedef struct BTMetaPageData
* INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
* TID column sometimes stored in pivot tuples -- that's represented by
* the presence of BT_PIVOT_HEAP_TID_ATTR. The INDEX_ALT_TID_MASK bit in
- * t_info is always set on BTREE_VERSION 4 pivot tuples.
+ * t_info is always set on BTREE_VERSION 4 pivot tuples, since
+ * BTreeTupleIsPivot() must work reliably on heapkeyspace versions.
*
* In version 3 indexes, the INDEX_ALT_TID_MASK flag might not be set in
* pivot tuples. In that case, the number of key columns is implicitly
@@ -279,90 +303,256 @@ typedef struct BTMetaPageData
* The 12 least significant offset bits from t_tid are used to represent
* the number of columns in INDEX_ALT_TID_MASK tuples, leaving 4 status
* bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
- * future use. BT_N_KEYS_OFFSET_MASK should be large enough to store any
- * number of columns/attributes <= INDEX_MAX_KEYS.
+ * future use. BT_OFFSET_MASK should be large enough to store any number
+ * of columns/attributes <= INDEX_MAX_KEYS.
+ *
+ * Sometimes non-pivot tuples also use a representation that repurposes
+ * t_tid to store metadata rather than a TID. PostgreSQL v13 introduced a
+ * new non-pivot tuple format to support deduplication: posting list
+ * tuples. Deduplication merges together multiple equal non-pivot tuples
+ * into a logically equivalent, space-efficient representation. A posting
+ * list is an array of ItemPointerData elements. Non-pivot tuples are
+ * merged together to form posting list tuples lazily, at the point where
+ * we'd otherwise have to split a leaf page.
+ *
+ * Posting tuple format (alternative non-pivot tuple representation):
+ *
+ * t_tid | t_info | key values | posting list (TID array)
+ *
+ * Posting list tuples are recognized as such by having the
+ * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status
+ * bit set in t_tid. These flags redefine the content of the posting
+ * tuple's t_tid to store an offset to the posting list, as well as the
+ * total number of posting list array elements.
+ *
+ * The 12 least significant offset bits from t_tid are used to represent
+ * the number of posting items present in the tuple, leaving 4 status
+ * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
+ * future use. Like any non-pivot tuple, the number of columns stored is
+ * always implicitly the total number in the index (in practice there can
+ * never be non-key columns stored, since deduplication is not supported
+ * with INCLUDE indexes). BT_OFFSET_MASK should be large enough to store
+ * any number of posting list TIDs that might be present in a tuple (since
+ * tuple size is subject to the INDEX_SIZE_MASK limit).
*
* Note well: The macros that deal with the number of attributes in tuples
- * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple,
- * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot
- * tuple (or must have the same number of attributes as the index has
- * generally in the case of !heapkeyspace indexes). They will need to be
- * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK
- * for something else.
+ * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or
+ * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set
+ * must be a non-pivot tuple (or must have the same number of attributes as
+ * the index has generally in the case of !heapkeyspace indexes).
*/
#define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT
/* Item pointer offset bits */
#define BT_RESERVED_OFFSET_MASK 0xF000
-#define BT_N_KEYS_OFFSET_MASK 0x0FFF
+#define BT_OFFSET_MASK 0x0FFF
#define BT_PIVOT_HEAP_TID_ATTR 0x1000
+#define BT_IS_POSTING 0x2000
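
Putting the three definitions together, the 16-bit offset number in an INDEX_ALT_TID_MASK tuple's t_tid is laid out as follows (an at-a-glance restatement of the masks above):

    /*
     * t_tid offset number bits in INDEX_ALT_TID_MASK tuples:
     *
     *   0x0FFF   BT_OFFSET_MASK          natts (pivot) or nposting (posting)
     *   0x1000   BT_PIVOT_HEAP_TID_ATTR  pivot tuple stores trailing heap TID
     *   0x2000   BT_IS_POSTING           tuple is a posting list tuple
     *   0x4000, 0x8000                   currently unused
     */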
+
+/*
+ * Note: BTreeTupleIsPivot() can have false negatives (but not false
+ * positives) when used with !heapkeyspace indexes
+ */
+static inline bool
+BTreeTupleIsPivot(IndexTuple itup)
+{
+ if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+ return false;
+ /* absence of BT_IS_POSTING in offset number indicates pivot tuple */
+ if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0)
+ return false;
+
+ return true;
+}
+
+static inline bool
+BTreeTupleIsPosting(IndexTuple itup)
+{
+ if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+ return false;
+ /* presence of BT_IS_POSTING in offset number indicates posting tuple */
+ if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0)
+ return false;
+
+ return true;
+}
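
Taken together, the two predicates split tuples into three mutually exclusive cases. A hypothetical helper (not part of the patch, assuming nbtree.h is included) makes the classification explicit:

    /* How many heap TIDs does this leaf tuple carry? (hypothetical helper) */
    static int
    leaf_tuple_ntids(IndexTuple itup)
    {
        if (BTreeTupleIsPosting(itup))
            return BTreeTupleGetNPosting(itup);     /* merged duplicates */
        else if (BTreeTupleIsPivot(itup))
            return 0;       /* high key; any heap TID is a pivot attribute */
        else
            return 1;       /* plain non-pivot tuple: t_tid is the heap TID */
    }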
+
+static inline void
+BTreeTupleSetPosting(IndexTuple itup, int nhtids, int postingoffset)
+{
+ Assert(nhtids > 1 && (nhtids & BT_OFFSET_MASK) == nhtids);
+ Assert(postingoffset == MAXALIGN(postingoffset));
+ Assert(postingoffset < INDEX_SIZE_MASK);
+
+ itup->t_info |= INDEX_ALT_TID_MASK;
+ ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING));
+ ItemPointerSetBlockNumber(&itup->t_tid, postingoffset);
+}
+
+static inline uint16
+BTreeTupleGetNPosting(IndexTuple posting)
+{
+ OffsetNumber existing;
+
+ Assert(BTreeTupleIsPosting(posting));
+
+ existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid);
+ return (existing & BT_OFFSET_MASK);
+}
-/* Get/set downlink block number in pivot tuple */
-#define BTreeTupleGetDownLink(itup) \
- ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
-#define BTreeTupleSetDownLink(itup, blkno) \
- ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno))
+static inline uint32
+BTreeTupleGetPostingOffset(IndexTuple posting)
+{
+ Assert(BTreeTupleIsPosting(posting));
+
+ return ItemPointerGetBlockNumberNoCheck(&posting->t_tid);
+}
+
+static inline ItemPointer
+BTreeTupleGetPosting(IndexTuple posting)
+{
+ return (ItemPointer) ((char *) posting +
+ BTreeTupleGetPostingOffset(posting));
+}
+
+static inline ItemPointer
+BTreeTupleGetPostingN(IndexTuple posting, int n)
+{
+ return BTreeTupleGetPosting(posting) + n;
+}
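
For example, code that lands on a posting list tuple can visit every heap TID with the two accessors above (a sketch of the pattern the index scan code uses; process_tid is a hypothetical callback):

    uint16      nhtids = BTreeTupleGetNPosting(posting);

    for (int i = 0; i < nhtids; i++)
    {
        ItemPointer htid = BTreeTupleGetPostingN(posting, i);

        /* TIDs are kept in ascending order within the posting list */
        process_tid(htid);
    }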
/*
- * Get/set leaf page highkey's link. During the second phase of deletion, the
- * target leaf page's high key may point to an ancestor page (at all other
- * times, the leaf level high key's link is not used). See the nbtree README
- * for full details.
+ * Get/set downlink block number in pivot tuple.
+ *
+ * Note: Cannot assert that tuple is a pivot tuple. If we did so then
+ * !heapkeyspace indexes would exhibit false positive assertion failures.
*/
-#define BTreeTupleGetTopParent(itup) \
- ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
-#define BTreeTupleSetTopParent(itup, blkno) \
- do { \
- ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)); \
- BTreeTupleSetNAtts((itup), 0); \
- } while(0)
+static inline BlockNumber
+BTreeTupleGetDownLink(IndexTuple pivot)
+{
+ return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid);
+}
+
+static inline void
+BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
+{
+ ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
+}
/*
- * Get/set number of attributes within B-tree index tuple.
+ * Get number of attributes within tuple.
*
* Note that this does not include an implicit tiebreaker heap TID
* attribute, if any. Note also that the number of key attributes must be
* explicitly represented in all heapkeyspace pivot tuples.
+ *
+ * Note: This is defined as a macro rather than an inline function to
+ * avoid including rel.h.
*/
#define BTreeTupleGetNAtts(itup, rel) \
( \
- (itup)->t_info & INDEX_ALT_TID_MASK ? \
+ (BTreeTupleIsPivot(itup)) ? \
( \
- ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
+ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
) \
: \
IndexRelationGetNumberOfAttributes(rel) \
)
-#define BTreeTupleSetNAtts(itup, n) \
- do { \
- (itup)->t_info |= INDEX_ALT_TID_MASK; \
- ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
- } while(0)
/*
- * Get tiebreaker heap TID attribute, if any. Macro works with both pivot
- * and non-pivot tuples, despite differences in how heap TID is represented.
+ * Set number of attributes in tuple, making it into a pivot tuple
*/
-#define BTreeTupleGetHeapTID(itup) \
- ( \
- (itup)->t_info & INDEX_ALT_TID_MASK && \
- (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_PIVOT_HEAP_TID_ATTR) != 0 ? \
- ( \
- (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
- sizeof(ItemPointerData)) \
- ) \
- : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \
- )
+static inline void
+BTreeTupleSetNAtts(IndexTuple itup, int natts)
+{
+ Assert(natts <= INDEX_MAX_KEYS);
+
+ itup->t_info |= INDEX_ALT_TID_MASK;
+ /* BT_IS_POSTING bit may be unset -- tuple always becomes a pivot tuple */
+ ItemPointerSetOffsetNumber(&itup->t_tid, natts);
+ Assert(BTreeTupleIsPivot(itup));
+}
+
/*
- * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK
- * representation (currently limited to pivot tuples)
+ * Set the bit indicating heap TID attribute present in pivot tuple
*/
-#define BTreeTupleSetAltHeapTID(itup) \
- do { \
- Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
- ItemPointerSetOffsetNumber(&(itup)->t_tid, \
- ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_PIVOT_HEAP_TID_ATTR); \
- } while(0)
+static inline void
+BTreeTupleSetAltHeapTID(IndexTuple pivot)
+{
+ OffsetNumber existing;
+
+ Assert(BTreeTupleIsPivot(pivot));
+
+ existing = ItemPointerGetOffsetNumberNoCheck(&pivot->t_tid);
+ ItemPointerSetOffsetNumber(&pivot->t_tid,
+ existing | BT_PIVOT_HEAP_TID_ATTR);
+}
+
+/*
+ * Get/set leaf page's "top parent" link from its high key. Used during page
+ * deletion.
+ *
+ * Note: Cannot assert that tuple is a pivot tuple. If we did so then
+ * !heapkeyspace indexes would exhibit false positive assertion failures.
+ */
+static inline BlockNumber
+BTreeTupleGetTopParent(IndexTuple leafhikey)
+{
+ return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
+}
+
+static inline void
+BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
+{
+ ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno);
+ BTreeTupleSetNAtts(leafhikey, 0);
+}
+
+/*
+ * Get tiebreaker heap TID attribute, if any.
+ *
+ * This returns the first/lowest heap TID in the case of a posting list tuple.
+ */
+static inline ItemPointer
+BTreeTupleGetHeapTID(IndexTuple itup)
+{
+ if (BTreeTupleIsPivot(itup))
+ {
+ /* Pivot tuple heap TID representation? */
+ if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
+ BT_PIVOT_HEAP_TID_ATTR) != 0)
+ return (ItemPointer) ((char *) itup + IndexTupleSize(itup) -
+ sizeof(ItemPointerData));
+
+ /* Heap TID attribute was truncated */
+ return NULL;
+ }
+ else if (BTreeTupleIsPosting(itup))
+ return BTreeTupleGetPosting(itup);
+
+ return &itup->t_tid;
+}
+
+/*
+ * Get maximum heap TID attribute, which could be the only TID in the case of
+ * a non-pivot tuple that does not have a posting list.
+ *
+ * Works with non-pivot tuples only.
+ */
+static inline ItemPointer
+BTreeTupleGetMaxHeapTID(IndexTuple itup)
+{
+ Assert(!BTreeTupleIsPivot(itup));
+
+ if (BTreeTupleIsPosting(itup))
+ {
+ uint16 nposting = BTreeTupleGetNPosting(itup);
+
+ return BTreeTupleGetPostingN(itup, nposting - 1);
+ }
+
+ return &itup->t_tid;
+}
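
Between them, BTreeTupleGetHeapTID() and BTreeTupleGetMaxHeapTID() bound the heap TID range a non-pivot tuple covers, suggesting an invariant one could assert (a sketch; ItemPointerCompare is the existing itemptr.h comparator):

    /* lowest TID never exceeds highest, posting list or not */
    Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
                              BTreeTupleGetMaxHeapTID(itup)) <= 0);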
/*
* Operator strategy numbers for B-tree have been moved to access/stratnum.h,
@@ -439,6 +629,9 @@ typedef BTStackData *BTStack;
* indexes whose version is >= version 4. It's convenient to keep this close
* by, rather than accessing the metapage repeatedly.
*
+ * allequalimage is set to indicate that deduplication is safe for the index.
+ * This is also a property of the index relation rather than an indexscan.
+ *
* anynullkeys indicates if any of the keys had NULL value when scankey was
* built from index tuple (note that already-truncated tuple key attributes
* set NULL as a placeholder key value, which also affects value of
@@ -474,6 +667,7 @@ typedef BTStackData *BTStack;
typedef struct BTScanInsertData
{
bool heapkeyspace;
+ bool allequalimage;
bool anynullkeys;
bool nextkey;
bool pivotsearch;
@@ -512,11 +706,95 @@ typedef struct BTInsertStateData
bool bounds_valid;
OffsetNumber low;
OffsetNumber stricthigh;
+
+ /*
+ * If _bt_binsrch_insert found the location inside an existing posting list,
+ * save the position inside the list.  The -1 sentinel value indicates
+ * overlap with an existing posting list tuple that has its LP_DEAD bit set.
+ */
+ int postingoff;
} BTInsertStateData;
typedef BTInsertStateData *BTInsertState;
/*
+ * State used to represent an individual pending tuple during
+ * deduplication.
+ */
+typedef struct BTDedupInterval
+{
+ OffsetNumber baseoff;
+ uint16 nitems;
+} BTDedupInterval;
+
+/*
+ * BTDedupStateData is a working area used during deduplication.
+ *
+ * The status info fields track the state of a whole-page deduplication pass.
+ * State about the current pending posting list is also tracked.
+ *
+ * A pending posting list is comprised of a contiguous group of equal items
+ * from the page, starting from page offset number 'baseoff'. This is the
+ * offset number of the "base" tuple for new posting list. 'nitems' is the
+ * current total number of existing items from the page that will be merged to
+ * make a new posting list tuple, including the base tuple item. (Existing
+ * items may themselves be posting list tuples, or regular non-pivot tuples.)
+ *
+ * The total size of the existing tuples to be freed when the pending posting
+ * list is processed is tracked by 'phystupsize'.  This information allows
+ * deduplication to calculate the space saving for each new posting list
+ * tuple, and for the entire pass over the page as a whole.
+ */
+typedef struct BTDedupStateData
+{
+ /* Deduplication status info for entire pass over page */
+ bool deduplicate; /* Still deduplicating page? */
+ Size maxpostingsize; /* Limit on size of final tuple */
+
+ /* Metadata about base tuple of current pending posting list */
+ IndexTuple base; /* Used to form new posting list */
+ OffsetNumber baseoff; /* page offset of base */
+ Size basetupsize; /* base size without original posting list */
+
+ /* Other metadata about pending posting list */
+ ItemPointer htids; /* Heap TIDs in pending posting list */
+ int nhtids; /* Number of heap TIDs in htids array */
+ int nitems; /* Number of existing tuples/line pointers */
+ Size phystupsize; /* Includes line pointer overhead */
+
+ /*
+ * Array of tuples to go on new version of the page. Contains one entry
+ * for each group of consecutive items. Note that existing tuples that
+ * will not become posting list tuples do not appear in the array (they
+ * are implicitly unchanged by deduplication pass).
+ */
+ int nintervals; /* current size of intervals array */
+ BTDedupInterval intervals[MaxIndexTuplesPerPage];
+} BTDedupStateData;
+
+typedef BTDedupStateData *BTDedupState;
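
Together with the nbtdedup.c routines declared further down, this state drives a simple loop. A simplified sketch of the whole-page pass (modeled on _bt_dedup_one_page; WAL logging, space accounting, and unique-index handling are omitted, and minoff/maxoff/newpage/nkeyatts are assumed to be set up as in the real routine):

    for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
    {
        IndexTuple  itup = (IndexTuple) PageGetItem(page,
                                                    PageGetItemId(page, offnum));

        if (offnum == minoff)
            _bt_dedup_start_pending(state, itup, offnum);
        else if (state->deduplicate &&
                 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
                 _bt_dedup_save_htid(state, itup))
        {
            /* itup equals base tuple; its TIDs joined pending posting list */
        }
        else
        {
            /* unequal tuple, or maxpostingsize would be exceeded */
            _bt_dedup_finish_pending(newpage, state);
            _bt_dedup_start_pending(state, itup, offnum);
        }
    }
    _bt_dedup_finish_pending(newpage, state);   /* flush final pending list */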
+
+/*
+ * BTVacuumPostingData is state that represents how to VACUUM a posting list
+ * tuple when some (though not all) of its TIDs are to be deleted.
+ *
+ * Convention is that itup field is the original posting list tuple on input,
+ * and the palloc()'d final tuple used to overwrite the existing tuple on
+ * output.
+ */
+typedef struct BTVacuumPostingData
+{
+ /* Tuple that will be/was updated */
+ IndexTuple itup;
+ OffsetNumber updatedoffset;
+
+ /* State needed to describe final itup in WAL */
+ uint16 ndeletedtids;
+ uint16 deletetids[FLEXIBLE_ARRAY_MEMBER];
+} BTVacuumPostingData;
+
+typedef BTVacuumPostingData *BTVacuumPosting;
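
VACUUM builds one of these for each posting list tuple that is only partially dead. A simplified sketch of that decision (modeled on the patch's nbtree.c logic; callback is the usual IndexBulkDeleteCallback that reports dead heap TIDs):

    uint16      nhtids = BTreeTupleGetNPosting(posting);
    BTVacuumPosting vacposting = NULL;

    for (uint16 i = 0; i < nhtids; i++)
    {
        if (!callback(BTreeTupleGetPostingN(posting, i), cb_state))
            continue;           /* this TID survives */

        if (vacposting == NULL)
        {
            /* lazily allocate with room for the worst case */
            vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
                                nhtids * sizeof(uint16));
            vacposting->itup = posting;
            vacposting->updatedoffset = offnum;
            vacposting->ndeletedtids = 0;
        }
        vacposting->deletetids[vacposting->ndeletedtids++] = i;
    }

    /* every TID dead => just delete the tuple; otherwise update it in place */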
+
+/*
* BTScanOpaqueData is the btree-private state needed for an indexscan.
* This consists of preprocessed scan keys (see _bt_preprocess_keys() for
* details of the preprocessing), information about the current location
@@ -539,7 +817,9 @@ typedef BTInsertStateData *BTInsertState;
* If we are doing an index-only scan, we save the entire IndexTuple for each
* matched item, otherwise only its heap TID and offset. The IndexTuples go
* into a separate workspace array; each BTScanPosItem stores its tuple's
- * offset within that array.
+ * offset within that array. Posting list tuples store a "base" tuple once,
+ * allowing the same key to be returned for each TID in the posting list
+ * tuple.
*/
typedef struct BTScanPosItem /* what we remember about each match */
@@ -583,7 +863,7 @@ typedef struct BTScanPosData
int lastItem; /* last valid index in items[] */
int itemIndex; /* current index in items[] */
- BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
+ BTScanPosItem items[MaxTIDsPerBTreePage]; /* MUST BE LAST */
} BTScanPosData;
typedef BTScanPosData *BTScanPos;
@@ -691,6 +971,7 @@ typedef struct BTOptions
int fillfactor; /* page fill factor in percent (0..100) */
/* fraction of newly inserted tuples prior to trigger index cleanup */
float8 vacuum_cleanup_index_scale_factor;
+ bool deduplicate_items; /* Try to deduplicate items? */
} BTOptions;
#define BTGetFillFactor(relation) \
@@ -701,6 +982,11 @@ typedef struct BTOptions
BTREE_DEFAULT_FILLFACTOR)
#define BTGetTargetPageFreeSpace(relation) \
(BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
+#define BTGetDeduplicateItems(relation) \
+ (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
+ relation->rd_rel->relam == BTREE_AM_OID), \
+ ((relation)->rd_options ? \
+ ((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
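
For instance, the insert path can gate a deduplication pass on the storage parameter (a sketch; in the patch the decision also depends on the allequalimage property discussed above):

    if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
        _bt_dedup_one_page(rel, buf, heapRel, newitem, newitemsz,
                           checkingunique);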
/*
* Constant definition for progress reporting. Phase numbers must match
@@ -748,6 +1034,22 @@ extern void _bt_parallel_done(IndexScanDesc scan);
extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
/*
+ * prototypes for functions in nbtdedup.c
+ */
+extern void _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
+ IndexTuple newitem, Size newitemsz,
+ bool checkingunique);
+extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
+ OffsetNumber baseoff);
+extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup);
+extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state);
+extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids,
+ int nhtids);
+extern void _bt_update_posting(BTVacuumPosting vacposting);
+extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting,
+ int postingoff);
+
+/*
* prototypes for functions in nbtinsert.c
*/
extern bool _bt_doinsert(Relation rel, IndexTuple itup,
@@ -765,14 +1067,16 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page page,
/*
* prototypes for functions in nbtpage.c
*/
-extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
+extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
+ bool allequalimage);
extern void _bt_update_meta_cleanup_info(Relation rel,
TransactionId oldestBtpoXact, float8 numHeapTuples);
extern void _bt_upgrademetapage(Page page);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern int _bt_getrootheight(Relation rel);
-extern bool _bt_heapkeyspace(Relation rel);
+extern void _bt_metaversion(Relation rel, bool *heapkeyspace,
+ bool *allequalimage);
extern void _bt_checkpage(Relation rel, Buffer buf);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
@@ -781,7 +1085,8 @@ extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
- OffsetNumber *deletable, int ndeletable);
+ OffsetNumber *deletable, int ndeletable,
+ BTVacuumPosting *updatable, int nupdatable);
extern void _bt_delitems_delete(Relation rel, Buffer buf,
OffsetNumber *deletable, int ndeletable,
Relation heapRel);
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index 776a9bd7233..347976c532c 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -28,7 +28,8 @@
#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
-/* 0x50 and 0x60 are unused */
+#define XLOG_BTREE_INSERT_POST 0x50 /* add index tuple with posting split */
+#define XLOG_BTREE_DEDUP 0x60 /* deduplicate tuples for a page */
#define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */
#define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */
#define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */
@@ -53,21 +54,34 @@ typedef struct xl_btree_metadata
uint32 fastlevel;
TransactionId oldest_btpo_xact;
float8 last_cleanup_num_heap_tuples;
+ bool allequalimage;
} xl_btree_metadata;
/*
* This is what we need to know about simple (without split) insert.
*
- * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
- * Note that INSERT_META implies it's not a leaf page.
+ * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
+ * INSERT_POST. Note that INSERT_META and INSERT_UPPER imply it's not a
+ * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf
+ * page.
*
- * Backup Blk 0: original page (data contains the inserted tuple)
+ * Backup Blk 0: original page
* Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
* Backup Blk 2: xl_btree_metadata, if INSERT_META
+ *
+ * Note: The new tuple is actually the "original" new item in the posting
+ * list split insert case (i.e. the INSERT_POST case). A split offset for
+ * the posting list is logged before the original new item. Recovery needs
+ * both, since it must do an in-place update of the existing posting list
+ * that was split as an extra step. Also, recovery generates a "final"
+ * newitem. See _bt_swap_posting() for details on posting list splits.
*/
typedef struct xl_btree_insert
{
OffsetNumber offnum;
+
+ /* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */
+ /* NEW TUPLE ALWAYS FOLLOWS AT THE END */
} xl_btree_insert;
#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
@@ -92,8 +106,37 @@ typedef struct xl_btree_insert
* Backup Blk 0: original page / new left page
*
* The left page's data portion contains the new item, if it's the _L variant.
- * An IndexTuple representing the high key of the left page must follow with
- * either variant.
+ * _R variant split records generally do not have a newitem (_R variant leaf
+ * page split records that must deal with a posting list split will include an
+ * explicit newitem, though it is never used on the right page -- it is
+ * actually an orignewitem needed to update the existing posting list).  The
+ * new high key of the left/original page appears last of all (and must
+ * always be present).
+ *
+ * Page split records that need the REDO routine to deal with a posting list
+ * split directly will have an explicit newitem, which is actually an
+ * orignewitem (the newitem as it was before the posting list split, not
+ * after). A posting list split always has a newitem that comes immediately
+ * after the posting list being split (which would have overlapped with
+ * orignewitem prior to split). Usually REDO must deal with posting list
+ * splits with an _L variant page split record, and usually both the new
+ * posting list and the final newitem go on the left page (the new posting
+ * list will be inserted in place of the old one, and the final newitem
+ * will be inserted next to that). However, _R variant split records will
+ * include an orignewitem when the split point for the page happens to have a
+ * lastleft tuple that is also the posting list being split (leaving newitem
+ * as the page split's firstright tuple). The existence of this corner case
+ * does not change the basic fact about newitem/orignewitem for the REDO
+ * routine: it is always state used for the left page alone. (This is why the
+ * record's postingoff field isn't a reliable indicator of whether or not a
+ * posting list split occurred during the page split; a non-zero value merely
+ * indicates that the REDO routine must reconstruct a new posting list tuple
+ * that is needed for the left page.)
+ *
+ * This posting list split handling is equivalent to the xl_btree_insert REDO
+ * routine's INSERT_POST handling. While the details are more complicated
+ * here, the concept and goals are exactly the same. See _bt_swap_posting()
+ * for details on posting list splits.
*
* Backup Blk 1: new right page
*
@@ -111,15 +154,33 @@ typedef struct xl_btree_split
{
uint32 level; /* tree level of page being split */
OffsetNumber firstright; /* first item moved to right page */
- OffsetNumber newitemoff; /* new item's offset (useful for _L variant) */
+ OffsetNumber newitemoff; /* new item's offset */
+ uint16 postingoff; /* offset inside orig posting tuple */
} xl_btree_split;
-#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber))
+#define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16))
+
+/*
+ * When page is deduplicated, consecutive groups of tuples with equal keys are
+ * merged together into posting list tuples.
+ *
+ * The WAL record represents a deduplication pass for a leaf page. An array
+ * of BTDedupInterval structs follows.
+ */
+typedef struct xl_btree_dedup
+{
+ uint16 nintervals;
+
+ /* DEDUPLICATION INTERVALS FOLLOW */
+} xl_btree_dedup;
+
+#define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))
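
As an illustration, a leaf page where offsets 2-4 hold one group of equal tuples and offsets 6-7 hold another would be logged with nintervals = 2 and an interval array like this (hypothetical values):

    BTDedupInterval intervals[] = {
        {.baseoff = 2, .nitems = 3},    /* offsets 2, 3, 4 become one tuple */
        {.baseoff = 6, .nitems = 2},    /* offsets 6, 7 become another */
    };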
/*
* This is what we need to know about delete of individual leaf index tuples.
* The WAL record can represent deletion of any number of index tuples on a
- * single index page when *not* executed by VACUUM.
+ * single index page when *not* executed by VACUUM. Deletion of a subset of
+ * the TIDs within a posting list tuple is not supported.
*
* Backup Blk 0: index page
*/
@@ -150,21 +211,43 @@ typedef struct xl_btree_reuse_page
#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
/*
- * This is what we need to know about vacuum of individual leaf index tuples.
- * The WAL record can represent deletion of any number of index tuples on a
- * single index page when executed by VACUUM.
+ * This is what we need to know about which TIDs to remove from an individual
+ * posting list tuple during vacuuming. An array of these may appear at the
+ * end of xl_btree_vacuum records.
+ */
+typedef struct xl_btree_update
+{
+ uint16 ndeletedtids;
+
+ /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
+} xl_btree_update;
+
+#define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))
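
Both REDO and _bt_update_posting rebuild the final tuple by keeping every TID whose array index is not listed in deletetids. A simplified sketch using the _bt_form_posting helper declared in nbtree.h (deletetids is assumed sorted ascending):

    uint16      nhtids = BTreeTupleGetNPosting(origtuple);
    ItemPointerData htids[MaxTIDsPerBTreePage];
    int         nkeep = 0,
                d = 0;

    for (uint16 i = 0; i < nhtids; i++)
    {
        if (d < ndeletedtids && deletetids[d] == i)
        {
            d++;                /* deleted by VACUUM; skip */
            continue;
        }
        htids[nkeep++] = *BTreeTupleGetPostingN(origtuple, i);
    }

    /* _bt_form_posting also handles the nkeep == 1 (plain tuple) case */
    newtuple = _bt_form_posting(origtuple, htids, nkeep);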
+
+/*
+ * This is what we need to know about a VACUUM of a leaf page. The WAL record
+ * can represent deletion of any number of index tuples on a single index page
+ * when executed by VACUUM. It can also support "updates" of index tuples,
+ * which is how deletes of a subset of TIDs contained in an existing posting
+ * list tuple are implemented. (Updates are only used when there will be some
+ * remaining TIDs once VACUUM finishes; otherwise the posting list tuple can
+ * just be deleted.)
*
- * Note that the WAL record in any vacuum of an index must have at least one
- * item to delete.
+ * Updated posting list tuples are represented using xl_btree_update metadata.
+ * The REDO routine uses each xl_btree_update (plus its corresponding original
+ * index tuple from the target leaf page) to generate the final updated tuple.
*/
typedef struct xl_btree_vacuum
{
- uint32 ndeleted;
+ uint16 ndeleted;
+ uint16 nupdated;
/* DELETED TARGET OFFSET NUMBERS FOLLOW */
+ /* UPDATED TARGET OFFSET NUMBERS FOLLOW */
+ /* UPDATED TUPLES METADATA ARRAY FOLLOWS */
} xl_btree_vacuum;
-#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(uint32))
+#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))
/*
* This is what we need to know about marking an empty branch for deletion.
@@ -245,6 +328,8 @@ typedef struct xl_btree_newroot
extern void btree_redo(XLogReaderState *record);
extern void btree_desc(StringInfo buf, XLogReaderState *record);
extern const char *btree_identify(uint8 info);
+extern void btree_xlog_startup(void);
+extern void btree_xlog_cleanup(void);
extern void btree_mask(char *pagedata, BlockNumber blkno);
#endif /* NBTXLOG_H */
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index c88dccfb8dd..6c15df7e707 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL,
PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
-PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask)
+PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask)
PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 087918d41dd..27ded593abb 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -31,7 +31,7 @@
/*
* Each page of XLOG file has a header like this:
*/
-#define XLOG_PAGE_MAGIC 0xD104 /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD105 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{