Diffstat (limited to 'src/include')
-rw-r--r--  src/include/access/nbtree.h         | 433
-rw-r--r--  src/include/access/nbtxlog.h        | 117
-rw-r--r--  src/include/access/rmgrlist.h       |   2
-rw-r--r--  src/include/access/xlog_internal.h  |   2
4 files changed, 472 insertions(+), 82 deletions(-)
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index e8d4d2b55b7..bfe49f46b08 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -108,6 +108,7 @@ typedef struct BTMetaPageData
* pages */
float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples
* during last cleanup */
+ bool btm_allequalimage; /* are all columns "equalimage"? */
} BTMetaPageData;
#define BTPageGetMeta(p) \
@@ -124,6 +125,14 @@ typedef struct BTMetaPageData
* need to be immediately re-indexed at pg_upgrade. In order to get the
* new heapkeyspace semantics, however, a REINDEX is needed.
*
+ * Deduplication is safe to use when the btm_allequalimage field is set to
+ * true. It's safe to read the btm_allequalimage field on version 3, but
+ * only version 4 indexes make use of deduplication. Even version 4
+ * indexes created on PostgreSQL v12 will need a REINDEX to make use of
+ * deduplication, though, since there is no other way to set
+ * btm_allequalimage to true (pg_upgrade hasn't been taught to set the
+ * metapage field).
+ *
* Btree version 2 is mostly the same as version 3. There are two new
* fields in the metapage that were introduced in version 3. A version 2
* metapage will be automatically upgraded to version 3 on the first
@@ -157,6 +166,21 @@ typedef struct BTMetaPageData
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
/*
+ * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs that
+ * may be stored on a btree leaf page.  It is used to size the per-page
+ * temporary buffers used by index scans.
+ *
+ * Note: we don't bother considering per-tuple overheads here to keep
+ * things simple (value is based on how many elements a single array of
+ * heap TIDs must have to fill the space between the page header and
+ * special area). The value is slightly higher (i.e. more conservative)
+ * than necessary as a result, which is considered acceptable.
+ */
+#define MaxTIDsPerBTreePage \
+ (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
+ sizeof(ItemPointerData))
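
For concreteness: with the default 8kB BLCKSZ and typical struct sizes, the macro works out to 1358 TIDs per leaf page. A minimal standalone sketch of the arithmetic (the three size constants are assumptions about a typical build; the real definition uses the actual headers):

    #include <stdio.h>

    int
    main(void)
    {
        const int blcksz = 8192;    /* default BLCKSZ (assumed) */
        const int pagehdr = 24;     /* SizeOfPageHeaderData (assumed) */
        const int special = 16;     /* sizeof(BTPageOpaqueData) (assumed) */
        const int tidsz = 6;        /* sizeof(ItemPointerData) */

        /* (8192 - 24 - 16) / 6 = 1358 heap TIDs per leaf page */
        printf("MaxTIDsPerBTreePage = %d\n",
               (blcksz - pagehdr - special) / tidsz);
        return 0;
    }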
+
+/*
* The leaf-page fillfactor defaults to 90% but is user-adjustable.
* For pages above the leaf level, we use a fixed 70% fillfactor.
* The fillfactor is applied during index build and when splitting
@@ -230,16 +254,15 @@ typedef struct BTMetaPageData
* tuples (non-pivot tuples). _bt_check_natts() enforces the rules
* described here.
*
- * Non-pivot tuple format:
+ * Non-pivot tuple format (plain/non-posting variant):
*
* t_tid | t_info | key values | INCLUDE columns, if any
*
* t_tid points to the heap TID, which is a tiebreaker key column as of
- * BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never
- * set for non-pivot tuples.
+ * BTREE_VERSION 4.
*
- * All other types of index tuples ("pivot" tuples) only have key columns,
- * since pivot tuples only exist to represent how the key space is
+ * Non-pivot tuples complement pivot tuples, which only have key columns.
+ * The sole purpose of pivot tuples is to represent how the key space is
* separated. In general, any B-Tree index that has more than one level
* (i.e. any index that does not just consist of a metapage and a single
* leaf root page) must have some number of pivot tuples, since pivot
@@ -264,7 +287,8 @@ typedef struct BTMetaPageData
* INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
* TID column sometimes stored in pivot tuples -- that's represented by
* the presence of BT_PIVOT_HEAP_TID_ATTR. The INDEX_ALT_TID_MASK bit in
- * t_info is always set on BTREE_VERSION 4 pivot tuples.
+ * t_info is always set on BTREE_VERSION 4 pivot tuples, since
+ * BTreeTupleIsPivot() must work reliably on heapkeyspace versions.
*
* In version 3 indexes, the INDEX_ALT_TID_MASK flag might not be set in
* pivot tuples. In that case, the number of key columns is implicitly
@@ -279,90 +303,256 @@ typedef struct BTMetaPageData
* The 12 least significant offset bits from t_tid are used to represent
* the number of columns in INDEX_ALT_TID_MASK tuples, leaving 4 status
* bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
- * future use. BT_N_KEYS_OFFSET_MASK should be large enough to store any
- * number of columns/attributes <= INDEX_MAX_KEYS.
+ * future use. BT_OFFSET_MASK should be large enough to store any number
+ * of columns/attributes <= INDEX_MAX_KEYS.
+ *
+ * Sometimes non-pivot tuples also use a representation that repurposes
+ * t_tid to store metadata rather than a TID. PostgreSQL v13 introduced a
+ * new non-pivot tuple format to support deduplication: posting list
+ * tuples. Deduplication merges together multiple equal non-pivot tuples
+ * into a logically equivalent, space-efficient representation. A posting
+ * list is an array of ItemPointerData elements. Non-pivot tuples are
+ * merged together to form posting list tuples lazily, at the point where
+ * we'd otherwise have to split a leaf page.
+ *
+ * Posting tuple format (alternative non-pivot tuple representation):
+ *
+ * t_tid | t_info | key values | posting list (TID array)
+ *
+ * Posting list tuples are recognized as such by having the
+ * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status
+ * bit set in t_tid. These flags redefine the content of the posting
+ * tuple's t_tid to store an offset to the posting list, as well as the
+ * total number of posting list array elements.
+ *
+ * The 12 least significant offset bits from t_tid are used to represent
+ * the number of posting items present in the tuple, leaving 4 status
+ * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
+ * future use. Like any non-pivot tuple, the number of columns stored is
+ * always implicitly the total number in the index (in practice there can
+ * never be non-key columns stored, since deduplication is not supported
+ * with INCLUDE indexes). BT_OFFSET_MASK should be large enough to store
+ * any number of posting list TIDs that might be present in a tuple (since
+ * tuple size is subject to the INDEX_SIZE_MASK limit).
*
* Note well: The macros that deal with the number of attributes in tuples
- * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple,
- * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot
- * tuple (or must have the same number of attributes as the index has
- * generally in the case of !heapkeyspace indexes). They will need to be
- * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK
- * for something else.
+ * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or
+ * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set
+ * must be a non-pivot tuple (or must have the same number of attributes as
+ * the index has generally in the case of !heapkeyspace indexes).
*/
#define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT
/* Item pointer offset bits */
#define BT_RESERVED_OFFSET_MASK 0xF000
-#define BT_N_KEYS_OFFSET_MASK 0x0FFF
+#define BT_OFFSET_MASK 0x0FFF
#define BT_PIVOT_HEAP_TID_ATTR 0x1000
+#define BT_IS_POSTING 0x2000
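
Putting the three definitions together, the 16-bit offset number in an INDEX_ALT_TID_MASK tuple's t_tid is laid out as follows (an at-a-glance restatement of the masks above):

    /*
     * t_tid offset number bits in INDEX_ALT_TID_MASK tuples:
     *
     *   0x0FFF   BT_OFFSET_MASK          natts (pivot) or nposting (posting)
     *   0x1000   BT_PIVOT_HEAP_TID_ATTR  pivot tuple stores trailing heap TID
     *   0x2000   BT_IS_POSTING           tuple is a posting list tuple
     *   0x4000, 0x8000                   currently unused
     */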
+
+/*
+ * Note: BTreeTupleIsPivot() can have false negatives (but not false
+ * positives) when used with !heapkeyspace indexes
+ */
+static inline bool
+BTreeTupleIsPivot(IndexTuple itup)
+{
+ if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+ return false;
+ /* absence of BT_IS_POSTING in offset number indicates pivot tuple */
+ if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0)
+ return false;
+
+ return true;
+}
+
+static inline bool
+BTreeTupleIsPosting(IndexTuple itup)
+{
+ if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+ return false;
+ /* presence of BT_IS_POSTING in offset number indicates posting tuple */
+ if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0)
+ return false;
+
+ return true;
+}
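
Taken together, the two predicates split tuples into three mutually exclusive cases. A hypothetical helper (not part of the patch, assuming nbtree.h is included) makes the classification explicit:

    /* How many heap TIDs does this leaf tuple carry? (hypothetical helper) */
    static int
    leaf_tuple_ntids(IndexTuple itup)
    {
        if (BTreeTupleIsPosting(itup))
            return BTreeTupleGetNPosting(itup);     /* merged duplicates */
        else if (BTreeTupleIsPivot(itup))
            return 0;       /* high key; any heap TID is a pivot attribute */
        else
            return 1;       /* plain non-pivot tuple: t_tid is the heap TID */
    }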
+
+static inline void
+BTreeTupleSetPosting(IndexTuple itup, int nhtids, int postingoffset)
+{
+ Assert(nhtids > 1 && (nhtids & BT_OFFSET_MASK) == nhtids);
+ Assert(postingoffset == MAXALIGN(postingoffset));
+ Assert(postingoffset < INDEX_SIZE_MASK);
+
+ itup->t_info |= INDEX_ALT_TID_MASK;
+ ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING));
+ ItemPointerSetBlockNumber(&itup->t_tid, postingoffset);
+}
+
+static inline uint16
+BTreeTupleGetNPosting(IndexTuple posting)
+{
+ OffsetNumber existing;
+
+ Assert(BTreeTupleIsPosting(posting));
+
+ existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid);
+ return (existing & BT_OFFSET_MASK);
+}
-/* Get/set downlink block number in pivot tuple */
-#define BTreeTupleGetDownLink(itup) \
- ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
-#define BTreeTupleSetDownLink(itup, blkno) \
- ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno))
+static inline uint32
+BTreeTupleGetPostingOffset(IndexTuple posting)
+{
+ Assert(BTreeTupleIsPosting(posting));
+
+ return ItemPointerGetBlockNumberNoCheck(&posting->t_tid);
+}
+
+static inline ItemPointer
+BTreeTupleGetPosting(IndexTuple posting)
+{
+ return (ItemPointer) ((char *) posting +
+ BTreeTupleGetPostingOffset(posting));
+}
+
+static inline ItemPointer
+BTreeTupleGetPostingN(IndexTuple posting, int n)
+{
+ return BTreeTupleGetPosting(posting) + n;
+}
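
For example, code that lands on a posting list tuple can visit every heap TID with the two accessors above (a sketch of the pattern the index scan code uses; process_tid is a hypothetical callback):

    uint16      nhtids = BTreeTupleGetNPosting(posting);

    for (int i = 0; i < nhtids; i++)
    {
        ItemPointer htid = BTreeTupleGetPostingN(posting, i);

        /* TIDs are kept in ascending order within the posting list */
        process_tid(htid);
    }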
/*
- * Get/set leaf page highkey's link. During the second phase of deletion, the
- * target leaf page's high key may point to an ancestor page (at all other
- * times, the leaf level high key's link is not used). See the nbtree README
- * for full details.
+ * Get/set downlink block number in pivot tuple.
+ *
+ * Note: Cannot assert that tuple is a pivot tuple. If we did so then
+ * !heapkeyspace indexes would exhibit false positive assertion failures.
*/
-#define BTreeTupleGetTopParent(itup) \
- ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
-#define BTreeTupleSetTopParent(itup, blkno) \
- do { \
- ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)); \
- BTreeTupleSetNAtts((itup), 0); \
- } while(0)
+static inline BlockNumber
+BTreeTupleGetDownLink(IndexTuple pivot)
+{
+ return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid);
+}
+
+static inline void
+BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
+{
+ ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
+}
/*
- * Get/set number of attributes within B-tree index tuple.
+ * Get number of attributes within tuple.
*
* Note that this does not include an implicit tiebreaker heap TID
* attribute, if any. Note also that the number of key attributes must be
* explicitly represented in all heapkeyspace pivot tuples.
+ *
+ * Note: This is defined as a macro rather than an inline function to
+ * avoid including rel.h.
*/
#define BTreeTupleGetNAtts(itup, rel) \
( \
- (itup)->t_info & INDEX_ALT_TID_MASK ? \
+ (BTreeTupleIsPivot(itup)) ? \
( \
- ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
+ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
) \
: \
IndexRelationGetNumberOfAttributes(rel) \
)
-#define BTreeTupleSetNAtts(itup, n) \
- do { \
- (itup)->t_info |= INDEX_ALT_TID_MASK; \
- ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
- } while(0)
/*
- * Get tiebreaker heap TID attribute, if any. Macro works with both pivot
- * and non-pivot tuples, despite differences in how heap TID is represented.
+ * Set number of attributes in tuple, making it into a pivot tuple
*/
-#define BTreeTupleGetHeapTID(itup) \
- ( \
- (itup)->t_info & INDEX_ALT_TID_MASK && \
- (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_PIVOT_HEAP_TID_ATTR) != 0 ? \
- ( \
- (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
- sizeof(ItemPointerData)) \
- ) \
- : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \
- )
+static inline void
+BTreeTupleSetNAtts(IndexTuple itup, int natts)
+{
+ Assert(natts <= INDEX_MAX_KEYS);
+
+ itup->t_info |= INDEX_ALT_TID_MASK;
+ /* BT_IS_POSTING bit may be unset -- tuple always becomes a pivot tuple */
+ ItemPointerSetOffsetNumber(&itup->t_tid, natts);
+ Assert(BTreeTupleIsPivot(itup));
+}
+
/*
- * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK
- * representation (currently limited to pivot tuples)
+ * Set the bit indicating heap TID attribute present in pivot tuple
*/
-#define BTreeTupleSetAltHeapTID(itup) \
- do { \
- Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
- ItemPointerSetOffsetNumber(&(itup)->t_tid, \
- ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_PIVOT_HEAP_TID_ATTR); \
- } while(0)
+static inline void
+BTreeTupleSetAltHeapTID(IndexTuple pivot)
+{
+ OffsetNumber existing;
+
+ Assert(BTreeTupleIsPivot(pivot));
+
+ existing = ItemPointerGetOffsetNumberNoCheck(&pivot->t_tid);
+ ItemPointerSetOffsetNumber(&pivot->t_tid,
+ existing | BT_PIVOT_HEAP_TID_ATTR);
+}
+
+/*
+ * Get/set leaf page's "top parent" link from its high key. Used during page
+ * deletion.
+ *
+ * Note: Cannot assert that tuple is a pivot tuple. If we did so then
+ * !heapkeyspace indexes would exhibit false positive assertion failures.
+ */
+static inline BlockNumber
+BTreeTupleGetTopParent(IndexTuple leafhikey)
+{
+ return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
+}
+
+static inline void
+BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
+{
+ ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno);
+ BTreeTupleSetNAtts(leafhikey, 0);
+}
+
+/*
+ * Get tiebreaker heap TID attribute, if any.
+ *
+ * This returns the first/lowest heap TID in the case of a posting list tuple.
+ */
+static inline ItemPointer
+BTreeTupleGetHeapTID(IndexTuple itup)
+{
+ if (BTreeTupleIsPivot(itup))
+ {
+ /* Pivot tuple heap TID representation? */
+ if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
+ BT_PIVOT_HEAP_TID_ATTR) != 0)
+ return (ItemPointer) ((char *) itup + IndexTupleSize(itup) -
+ sizeof(ItemPointerData));
+
+ /* Heap TID attribute was truncated */
+ return NULL;
+ }
+ else if (BTreeTupleIsPosting(itup))
+ return BTreeTupleGetPosting(itup);
+
+ return &itup->t_tid;
+}
+
+/*
+ * Get maximum heap TID attribute, which could be the only TID in the case of
+ * a non-pivot tuple that does not have a posting list.
+ *
+ * Works with non-pivot tuples only.
+ */
+static inline ItemPointer
+BTreeTupleGetMaxHeapTID(IndexTuple itup)
+{
+ Assert(!BTreeTupleIsPivot(itup));
+
+ if (BTreeTupleIsPosting(itup))
+ {
+ uint16 nposting = BTreeTupleGetNPosting(itup);
+
+ return BTreeTupleGetPostingN(itup, nposting - 1);
+ }
+
+ return &itup->t_tid;
+}
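
Between them, BTreeTupleGetHeapTID() and BTreeTupleGetMaxHeapTID() bound the heap TID range a non-pivot tuple covers, suggesting an invariant one could assert (a sketch; ItemPointerCompare is the existing itemptr.h comparator):

    /* lowest TID never exceeds highest, posting list or not */
    Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
                              BTreeTupleGetMaxHeapTID(itup)) <= 0);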
/*
* Operator strategy numbers for B-tree have been moved to access/stratnum.h,
@@ -439,6 +629,9 @@ typedef BTStackData *BTStack;
* indexes whose version is >= version 4. It's convenient to keep this close
* by, rather than accessing the metapage repeatedly.
*
+ * allequalimage is set to indicate that deduplication is safe for the index.
+ * This is also a property of the index relation rather than an indexscan.
+ *
* anynullkeys indicates if any of the keys had NULL value when scankey was
* built from index tuple (note that already-truncated tuple key attributes
* set NULL as a placeholder key value, which also affects value of
@@ -474,6 +667,7 @@ typedef BTStackData *BTStack;
typedef struct BTScanInsertData
{
bool heapkeyspace;
+ bool allequalimage;
bool anynullkeys;
bool nextkey;
bool pivotsearch;
@@ -512,11 +706,95 @@ typedef struct BTInsertStateData
bool bounds_valid;
OffsetNumber low;
OffsetNumber stricthigh;
+
+ /*
+ * If _bt_binsrch_insert found the location inside an existing posting list,
+ * save the position inside the list.  The -1 sentinel value indicates
+ * overlap with an existing posting list tuple that has its LP_DEAD bit set.
+ */
+ int postingoff;
} BTInsertStateData;
typedef BTInsertStateData *BTInsertState;
/*
+ * State used to represent an individual pending tuple during
+ * deduplication.
+ */
+typedef struct BTDedupInterval
+{
+ OffsetNumber baseoff;
+ uint16 nitems;
+} BTDedupInterval;
+
+/*
+ * BTDedupStateData is a working area used during deduplication.
+ *
+ * The status info fields track the state of a whole-page deduplication pass.
+ * State about the current pending posting list is also tracked.
+ *
+ * A pending posting list is comprised of a contiguous group of equal items
+ * from the page, starting from page offset number 'baseoff'. This is the
+ * offset number of the "base" tuple for new posting list. 'nitems' is the
+ * current total number of existing items from the page that will be merged to
+ * make a new posting list tuple, including the base tuple item. (Existing
+ * items may themselves be posting list tuples, or regular non-pivot tuples.)
+ *
+ * The total size of the existing tuples to be freed when the pending posting
+ * list is processed is tracked by 'phystupsize'.  This information allows
+ * deduplication to calculate the space saving for each new posting list
+ * tuple, and for the entire pass over the page as a whole.
+ */
+typedef struct BTDedupStateData
+{
+ /* Deduplication status info for entire pass over page */
+ bool deduplicate; /* Still deduplicating page? */
+ Size maxpostingsize; /* Limit on size of final tuple */
+
+ /* Metadata about base tuple of current pending posting list */
+ IndexTuple base; /* Used to form new posting list */
+ OffsetNumber baseoff; /* page offset of base */
+ Size basetupsize; /* base size without original posting list */
+
+ /* Other metadata about pending posting list */
+ ItemPointer htids; /* Heap TIDs in pending posting list */
+ int nhtids; /* Number of heap TIDs in htids array */
+ int nitems; /* Number of existing tuples/line pointers */
+ Size phystupsize; /* Includes line pointer overhead */
+
+ /*
+ * Array of tuples to go on new version of the page. Contains one entry
+ * for each group of consecutive items. Note that existing tuples that
+ * will not become posting list tuples do not appear in the array (they
+ * are implicitly unchanged by deduplication pass).
+ */
+ int nintervals; /* current size of intervals array */
+ BTDedupInterval intervals[MaxIndexTuplesPerPage];
+} BTDedupStateData;
+
+typedef BTDedupStateData *BTDedupState;
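
Together with the nbtdedup.c routines declared further down, this state drives a simple loop. A simplified sketch of the whole-page pass (modeled on _bt_dedup_one_page; WAL logging, space accounting, and unique-index handling are omitted, and minoff/maxoff/newpage/nkeyatts are assumed to be set up as in the real routine):

    for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
    {
        IndexTuple  itup = (IndexTuple) PageGetItem(page,
                                                    PageGetItemId(page, offnum));

        if (offnum == minoff)
            _bt_dedup_start_pending(state, itup, offnum);
        else if (state->deduplicate &&
                 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
                 _bt_dedup_save_htid(state, itup))
        {
            /* itup equals base tuple; its TIDs joined pending posting list */
        }
        else
        {
            /* unequal tuple, or maxpostingsize would be exceeded */
            _bt_dedup_finish_pending(newpage, state);
            _bt_dedup_start_pending(state, itup, offnum);
        }
    }
    _bt_dedup_finish_pending(newpage, state);   /* flush final pending list */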
+
+/*
+ * BTVacuumPostingData is state that represents how to VACUUM a posting list
+ * tuple when some (though not all) of its TIDs are to be deleted.
+ *
+ * Convention is that itup field is the original posting list tuple on input,
+ * and the palloc()'d final tuple used to overwrite the existing tuple on
+ * output.
+ */
+typedef struct BTVacuumPostingData
+{
+ /* Tuple that will be/was updated */
+ IndexTuple itup;
+ OffsetNumber updatedoffset;
+
+ /* State needed to describe final itup in WAL */
+ uint16 ndeletedtids;
+ uint16 deletetids[FLEXIBLE_ARRAY_MEMBER];
+} BTVacuumPostingData;
+
+typedef BTVacuumPostingData *BTVacuumPosting;
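
VACUUM builds one of these for each posting list tuple that is only partially dead. A simplified sketch of that decision (modeled on the patch's nbtree.c logic; callback is the usual IndexBulkDeleteCallback that reports dead heap TIDs):

    uint16      nhtids = BTreeTupleGetNPosting(posting);
    BTVacuumPosting vacposting = NULL;

    for (uint16 i = 0; i < nhtids; i++)
    {
        if (!callback(BTreeTupleGetPostingN(posting, i), cb_state))
            continue;           /* this TID survives */

        if (vacposting == NULL)
        {
            /* lazily allocate with room for the worst case */
            vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
                                nhtids * sizeof(uint16));
            vacposting->itup = posting;
            vacposting->updatedoffset = offnum;
            vacposting->ndeletedtids = 0;
        }
        vacposting->deletetids[vacposting->ndeletedtids++] = i;
    }

    /* every TID dead => just delete the tuple; otherwise update it in place */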
+
+/*
* BTScanOpaqueData is the btree-private state needed for an indexscan.
* This consists of preprocessed scan keys (see _bt_preprocess_keys() for
* details of the preprocessing), information about the current location
@@ -539,7 +817,9 @@ typedef BTInsertStateData *BTInsertState;
* If we are doing an index-only scan, we save the entire IndexTuple for each
* matched item, otherwise only its heap TID and offset. The IndexTuples go
* into a separate workspace array; each BTScanPosItem stores its tuple's
- * offset within that array.
+ * offset within that array. Posting list tuples store a "base" tuple once,
+ * allowing the same key to be returned for each TID in the posting list
+ * tuple.
*/
typedef struct BTScanPosItem /* what we remember about each match */
@@ -583,7 +863,7 @@ typedef struct BTScanPosData
int lastItem; /* last valid index in items[] */
int itemIndex; /* current index in items[] */
- BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
+ BTScanPosItem items[MaxTIDsPerBTreePage]; /* MUST BE LAST */
} BTScanPosData;
typedef BTScanPosData *BTScanPos;
@@ -691,6 +971,7 @@ typedef struct BTOptions
int fillfactor; /* page fill factor in percent (0..100) */
/* fraction of newly inserted tuples prior to trigger index cleanup */
float8 vacuum_cleanup_index_scale_factor;
+ bool deduplicate_items; /* Try to deduplicate items? */
} BTOptions;
#define BTGetFillFactor(relation) \
@@ -701,6 +982,11 @@ typedef struct BTOptions
BTREE_DEFAULT_FILLFACTOR)
#define BTGetTargetPageFreeSpace(relation) \
(BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
+#define BTGetDeduplicateItems(relation) \
+ (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
+ relation->rd_rel->relam == BTREE_AM_OID), \
+ ((relation)->rd_options ? \
+ ((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
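
For instance, the insert path can gate a deduplication pass on the storage parameter (a sketch; in the patch the decision also depends on the allequalimage property discussed above):

    if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
        _bt_dedup_one_page(rel, buf, heapRel, newitem, newitemsz,
                           checkingunique);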
/*
* Constant definition for progress reporting. Phase numbers must match
@@ -748,6 +1034,22 @@ extern void _bt_parallel_done(IndexScanDesc scan);
extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
/*
+ * prototypes for functions in nbtdedup.c
+ */
+extern void _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
+ IndexTuple newitem, Size newitemsz,
+ bool checkingunique);
+extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
+ OffsetNumber baseoff);
+extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup);
+extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state);
+extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids,
+ int nhtids);
+extern void _bt_update_posting(BTVacuumPosting vacposting);
+extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting,
+ int postingoff);
+
+/*
* prototypes for functions in nbtinsert.c
*/
extern bool _bt_doinsert(Relation rel, IndexTuple itup,
@@ -765,14 +1067,16 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page page,
/*
* prototypes for functions in nbtpage.c
*/
-extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
+extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
+ bool allequalimage);
extern void _bt_update_meta_cleanup_info(Relation rel,
TransactionId oldestBtpoXact, float8 numHeapTuples);
extern void _bt_upgrademetapage(Page page);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern int _bt_getrootheight(Relation rel);
-extern bool _bt_heapkeyspace(Relation rel);
+extern void _bt_metaversion(Relation rel, bool *heapkeyspace,
+ bool *allequalimage);
extern void _bt_checkpage(Relation rel, Buffer buf);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
@@ -781,7 +1085,8 @@ extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
- OffsetNumber *deletable, int ndeletable);
+ OffsetNumber *deletable, int ndeletable,
+ BTVacuumPosting *updatable, int nupdatable);
extern void _bt_delitems_delete(Relation rel, Buffer buf,
OffsetNumber *deletable, int ndeletable,
Relation heapRel);
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index 776a9bd7233..347976c532c 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -28,7 +28,8 @@
#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
-/* 0x50 and 0x60 are unused */
+#define XLOG_BTREE_INSERT_POST 0x50 /* add index tuple with posting split */
+#define XLOG_BTREE_DEDUP 0x60 /* deduplicate tuples for a page */
#define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */
#define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */
#define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */
@@ -53,21 +54,34 @@ typedef struct xl_btree_metadata
uint32 fastlevel;
TransactionId oldest_btpo_xact;
float8 last_cleanup_num_heap_tuples;
+ bool allequalimage;
} xl_btree_metadata;
/*
* This is what we need to know about simple (without split) insert.
*
- * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
- * Note that INSERT_META implies it's not a leaf page.
+ * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
+ * INSERT_POST. Note that INSERT_META and INSERT_UPPER imply it's not a
+ * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf
+ * page.
*
- * Backup Blk 0: original page (data contains the inserted tuple)
+ * Backup Blk 0: original page
* Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
* Backup Blk 2: xl_btree_metadata, if INSERT_META
+ *
+ * Note: The new tuple is actually the "original" new item in the posting
+ * list split insert case (i.e. the INSERT_POST case). A split offset for
+ * the posting list is logged before the original new item. Recovery needs
+ * both, since it must do an in-place update of the existing posting list
+ * that was split as an extra step. Also, recovery generates a "final"
+ * newitem. See _bt_swap_posting() for details on posting list splits.
*/
typedef struct xl_btree_insert
{
OffsetNumber offnum;
+
+ /* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */
+ /* NEW TUPLE ALWAYS FOLLOWS AT THE END */
} xl_btree_insert;
#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
@@ -92,8 +106,37 @@ typedef struct xl_btree_insert
* Backup Blk 0: original page / new left page
*
* The left page's data portion contains the new item, if it's the _L variant.
- * An IndexTuple representing the high key of the left page must follow with
- * either variant.
+ * _R variant split records generally do not have a newitem (_R variant leaf
+ * page split records that must deal with a posting list split will include an
+ * explicit newitem, though it is never used on the right page -- it is
+ * actually an orignewitem needed to update the existing posting list).  The
+ * new high key of the left/original page appears last of all (and must
+ * always be present).
+ *
+ * Page split records that need the REDO routine to deal with a posting list
+ * split directly will have an explicit newitem, which is actually an
+ * orignewitem (the newitem as it was before the posting list split, not
+ * after). A posting list split always has a newitem that comes immediately
+ * after the posting list being split (which would have overlapped with
+ * orignewitem prior to split). Usually REDO must deal with posting list
+ * splits with an _L variant page split record, and usually both the new
+ * posting list and the final newitem go on the left page (the new posting
+ * list will be inserted in place of the old one, and the final newitem
+ * will be inserted next to that). However, _R variant split records will
+ * include an orignewitem when the split point for the page happens to have a
+ * lastleft tuple that is also the posting list being split (leaving newitem
+ * as the page split's firstright tuple). The existence of this corner case
+ * does not change the basic fact about newitem/orignewitem for the REDO
+ * routine: it is always state used for the left page alone. (This is why the
+ * record's postingoff field isn't a reliable indicator of whether or not a
+ * posting list split occurred during the page split; a non-zero value merely
+ * indicates that the REDO routine must reconstruct a new posting list tuple
+ * that is needed for the left page.)
+ *
+ * This posting list split handling is equivalent to the xl_btree_insert REDO
+ * routine's INSERT_POST handling. While the details are more complicated
+ * here, the concept and goals are exactly the same. See _bt_swap_posting()
+ * for details on posting list splits.
*
* Backup Blk 1: new right page
*
@@ -111,15 +154,33 @@ typedef struct xl_btree_split
{
uint32 level; /* tree level of page being split */
OffsetNumber firstright; /* first item moved to right page */
- OffsetNumber newitemoff; /* new item's offset (useful for _L variant) */
+ OffsetNumber newitemoff; /* new item's offset */
+ uint16 postingoff; /* offset inside orig posting tuple */
} xl_btree_split;
-#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber))
+#define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16))
+
+/*
+ * When page is deduplicated, consecutive groups of tuples with equal keys are
+ * merged together into posting list tuples.
+ *
+ * The WAL record represents a deduplication pass for a leaf page. An array
+ * of BTDedupInterval structs follows.
+ */
+typedef struct xl_btree_dedup
+{
+ uint16 nintervals;
+
+ /* DEDUPLICATION INTERVALS FOLLOW */
+} xl_btree_dedup;
+
+#define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))
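
As an illustration, a leaf page where offsets 2-4 hold one group of equal tuples and offsets 6-7 hold another would be logged with nintervals = 2 and an interval array like this (hypothetical values):

    BTDedupInterval intervals[] = {
        {.baseoff = 2, .nitems = 3},    /* offsets 2, 3, 4 become one tuple */
        {.baseoff = 6, .nitems = 2},    /* offsets 6, 7 become another */
    };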
/*
* This is what we need to know about delete of individual leaf index tuples.
* The WAL record can represent deletion of any number of index tuples on a
- * single index page when *not* executed by VACUUM.
+ * single index page when *not* executed by VACUUM. Deletion of a subset of
+ * the TIDs within a posting list tuple is not supported.
*
* Backup Blk 0: index page
*/
@@ -150,21 +211,43 @@ typedef struct xl_btree_reuse_page
#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
/*
- * This is what we need to know about vacuum of individual leaf index tuples.
- * The WAL record can represent deletion of any number of index tuples on a
- * single index page when executed by VACUUM.
+ * This is what we need to know about which TIDs to remove from an individual
+ * posting list tuple during vacuuming. An array of these may appear at the
+ * end of xl_btree_vacuum records.
+ */
+typedef struct xl_btree_update
+{
+ uint16 ndeletedtids;
+
+ /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
+} xl_btree_update;
+
+#define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))
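
Both REDO and _bt_update_posting rebuild the final tuple by keeping every TID whose array index is not listed in deletetids. A simplified sketch using the _bt_form_posting helper declared in nbtree.h (deletetids is assumed sorted ascending):

    uint16      nhtids = BTreeTupleGetNPosting(origtuple);
    ItemPointerData htids[MaxTIDsPerBTreePage];
    int         nkeep = 0,
                d = 0;

    for (uint16 i = 0; i < nhtids; i++)
    {
        if (d < ndeletedtids && deletetids[d] == i)
        {
            d++;                /* deleted by VACUUM; skip */
            continue;
        }
        htids[nkeep++] = *BTreeTupleGetPostingN(origtuple, i);
    }

    /* _bt_form_posting also handles the nkeep == 1 (plain tuple) case */
    newtuple = _bt_form_posting(origtuple, htids, nkeep);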
+
+/*
+ * This is what we need to know about a VACUUM of a leaf page. The WAL record
+ * can represent deletion of any number of index tuples on a single index page
+ * when executed by VACUUM. It can also support "updates" of index tuples,
+ * which is how deletes of a subset of TIDs contained in an existing posting
+ * list tuple are implemented. (Updates are only used when there will be some
+ * remaining TIDs once VACUUM finishes; otherwise the posting list tuple can
+ * just be deleted.)
*
- * Note that the WAL record in any vacuum of an index must have at least one
- * item to delete.
+ * Updated posting list tuples are represented using xl_btree_update metadata.
+ * The REDO routine uses each xl_btree_update (plus its corresponding original
+ * index tuple from the target leaf page) to generate the final updated tuple.
*/
typedef struct xl_btree_vacuum
{
- uint32 ndeleted;
+ uint16 ndeleted;
+ uint16 nupdated;
/* DELETED TARGET OFFSET NUMBERS FOLLOW */
+ /* UPDATED TARGET OFFSET NUMBERS FOLLOW */
+ /* UPDATED TUPLES METADATA ARRAY FOLLOWS */
} xl_btree_vacuum;
-#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(uint32))
+#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))
/*
* This is what we need to know about marking an empty branch for deletion.
@@ -245,6 +328,8 @@ typedef struct xl_btree_newroot
extern void btree_redo(XLogReaderState *record);
extern void btree_desc(StringInfo buf, XLogReaderState *record);
extern const char *btree_identify(uint8 info);
+extern void btree_xlog_startup(void);
+extern void btree_xlog_cleanup(void);
extern void btree_mask(char *pagedata, BlockNumber blkno);
#endif /* NBTXLOG_H */
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index c88dccfb8dd..6c15df7e707 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL,
PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
-PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask)
+PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask)
PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 087918d41dd..27ded593abb 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -31,7 +31,7 @@
/*
* Each page of XLOG file has a header like this:
*/
-#define XLOG_PAGE_MAGIC 0xD104 /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD105 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{