Diffstat (limited to 'src/include')

 -rw-r--r--   src/include/access/nbtree.h           433
 -rw-r--r--   src/include/access/nbtxlog.h          117
 -rw-r--r--   src/include/access/rmgrlist.h           2
 -rw-r--r--   src/include/access/xlog_internal.h      2

 4 files changed, 472 insertions, 82 deletions
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index e8d4d2b55b7..bfe49f46b08 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -108,6 +108,7 @@ typedef struct BTMetaPageData * pages */ float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples * during last cleanup */ + bool btm_allequalimage; /* are all columns "equalimage"? */ } BTMetaPageData; #define BTPageGetMeta(p) \ @@ -124,6 +125,14 @@ typedef struct BTMetaPageData * need to be immediately re-indexed at pg_upgrade. In order to get the * new heapkeyspace semantics, however, a REINDEX is needed. * + * Deduplication is safe to use when the btm_allequalimage field is set to + * true. It's safe to read the btm_allequalimage field on version 3, but + * only version 4 indexes make use of deduplication. Even version 4 + * indexes created on PostgreSQL v12 will need a REINDEX to make use of + * deduplication, though, since there is no other way to set + * btm_allequalimage to true (pg_upgrade hasn't been taught to set the + * metapage field). + * * Btree version 2 is mostly the same as version 3. There are two new * fields in the metapage that were introduced in version 3. A version 2 * metapage will be automatically upgraded to version 3 on the first @@ -157,6 +166,21 @@ typedef struct BTMetaPageData MAXALIGN(sizeof(BTPageOpaqueData))) / 3) /* + * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs tuples + * that may be stored on a btree leaf page. It is used to size the + * per-page temporary buffers used by index scans.) + * + * Note: we don't bother considering per-tuple overheads here to keep + * things simple (value is based on how many elements a single array of + * heap TIDs must have to fill the space between the page header and + * special area). The value is slightly higher (i.e. more conservative) + * than necessary as a result, which is considered acceptable. + */ +#define MaxTIDsPerBTreePage \ + (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \ + sizeof(ItemPointerData)) + +/* * The leaf-page fillfactor defaults to 90% but is user-adjustable. * For pages above the leaf level, we use a fixed 70% fillfactor. * The fillfactor is applied during index build and when splitting @@ -230,16 +254,15 @@ typedef struct BTMetaPageData * tuples (non-pivot tuples). _bt_check_natts() enforces the rules * described here. * - * Non-pivot tuple format: + * Non-pivot tuple format (plain/non-posting variant): * * t_tid | t_info | key values | INCLUDE columns, if any * * t_tid points to the heap TID, which is a tiebreaker key column as of - * BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never - * set for non-pivot tuples. + * BTREE_VERSION 4. * - * All other types of index tuples ("pivot" tuples) only have key columns, - * since pivot tuples only exist to represent how the key space is + * Non-pivot tuples complement pivot tuples, which only have key columns. + * The sole purpose of pivot tuples is to represent how the key space is * separated. In general, any B-Tree index that has more than one level * (i.e. any index that does not just consist of a metapage and a single * leaf root page) must have some number of pivot tuples, since pivot @@ -264,7 +287,8 @@ typedef struct BTMetaPageData * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap * TID column sometimes stored in pivot tuples -- that's represented by * the presence of BT_PIVOT_HEAP_TID_ATTR. 
The INDEX_ALT_TID_MASK bit in - * t_info is always set on BTREE_VERSION 4 pivot tuples. + * t_info is always set on BTREE_VERSION 4 pivot tuples, since + * BTreeTupleIsPivot() must work reliably on heapkeyspace versions. * * In version 3 indexes, the INDEX_ALT_TID_MASK flag might not be set in * pivot tuples. In that case, the number of key columns is implicitly @@ -279,90 +303,256 @@ typedef struct BTMetaPageData * The 12 least significant offset bits from t_tid are used to represent * the number of columns in INDEX_ALT_TID_MASK tuples, leaving 4 status * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for - * future use. BT_N_KEYS_OFFSET_MASK should be large enough to store any - * number of columns/attributes <= INDEX_MAX_KEYS. + * future use. BT_OFFSET_MASK should be large enough to store any number + * of columns/attributes <= INDEX_MAX_KEYS. + * + * Sometimes non-pivot tuples also use a representation that repurposes + * t_tid to store metadata rather than a TID. PostgreSQL v13 introduced a + * new non-pivot tuple format to support deduplication: posting list + * tuples. Deduplication merges together multiple equal non-pivot tuples + * into a logically equivalent, space efficient representation. A posting + * list is an array of ItemPointerData elements. Non-pivot tuples are + * merged together to form posting list tuples lazily, at the point where + * we'd otherwise have to split a leaf page. + * + * Posting tuple format (alternative non-pivot tuple representation): + * + * t_tid | t_info | key values | posting list (TID array) + * + * Posting list tuples are recognized as such by having the + * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status + * bit set in t_tid. These flags redefine the content of the posting + * tuple's t_tid to store an offset to the posting list, as well as the + * total number of posting list array elements. + * + * The 12 least significant offset bits from t_tid are used to represent + * the number of posting items present in the tuple, leaving 4 status + * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for + * future use. Like any non-pivot tuple, the number of columns stored is + * always implicitly the total number in the index (in practice there can + * never be non-key columns stored, since deduplication is not supported + * with INCLUDE indexes). BT_OFFSET_MASK should be large enough to store + * any number of posting list TIDs that might be present in a tuple (since + * tuple size is subject to the INDEX_SIZE_MASK limit). * * Note well: The macros that deal with the number of attributes in tuples - * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple, - * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot - * tuple (or must have the same number of attributes as the index has - * generally in the case of !heapkeyspace indexes). They will need to be - * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK - * for something else. + * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or + * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set + * must be a non-pivot tuple (or must have the same number of attributes as + * the index has generally in the case of !heapkeyspace indexes). 
*/ #define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT /* Item pointer offset bits */ #define BT_RESERVED_OFFSET_MASK 0xF000 -#define BT_N_KEYS_OFFSET_MASK 0x0FFF +#define BT_OFFSET_MASK 0x0FFF #define BT_PIVOT_HEAP_TID_ATTR 0x1000 +#define BT_IS_POSTING 0x2000 + +/* + * Note: BTreeTupleIsPivot() can have false negatives (but not false + * positives) when used with !heapkeyspace indexes + */ +static inline bool +BTreeTupleIsPivot(IndexTuple itup) +{ + if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + return false; + /* absence of BT_IS_POSTING in offset number indicates pivot tuple */ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0) + return false; + + return true; +} + +static inline bool +BTreeTupleIsPosting(IndexTuple itup) +{ + if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + return false; + /* presence of BT_IS_POSTING in offset number indicates posting tuple */ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0) + return false; + + return true; +} + +static inline void +BTreeTupleSetPosting(IndexTuple itup, int nhtids, int postingoffset) +{ + Assert(nhtids > 1 && (nhtids & BT_OFFSET_MASK) == nhtids); + Assert(postingoffset == MAXALIGN(postingoffset)); + Assert(postingoffset < INDEX_SIZE_MASK); + + itup->t_info |= INDEX_ALT_TID_MASK; + ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING)); + ItemPointerSetBlockNumber(&itup->t_tid, postingoffset); +} + +static inline uint16 +BTreeTupleGetNPosting(IndexTuple posting) +{ + OffsetNumber existing; + + Assert(BTreeTupleIsPosting(posting)); + + existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid); + return (existing & BT_OFFSET_MASK); +} -/* Get/set downlink block number in pivot tuple */ -#define BTreeTupleGetDownLink(itup) \ - ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) -#define BTreeTupleSetDownLink(itup, blkno) \ - ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)) +static inline uint32 +BTreeTupleGetPostingOffset(IndexTuple posting) +{ + Assert(BTreeTupleIsPosting(posting)); + + return ItemPointerGetBlockNumberNoCheck(&posting->t_tid); +} + +static inline ItemPointer +BTreeTupleGetPosting(IndexTuple posting) +{ + return (ItemPointer) ((char *) posting + + BTreeTupleGetPostingOffset(posting)); +} + +static inline ItemPointer +BTreeTupleGetPostingN(IndexTuple posting, int n) +{ + return BTreeTupleGetPosting(posting) + n; +} /* - * Get/set leaf page highkey's link. During the second phase of deletion, the - * target leaf page's high key may point to an ancestor page (at all other - * times, the leaf level high key's link is not used). See the nbtree README - * for full details. + * Get/set downlink block number in pivot tuple. + * + * Note: Cannot assert that tuple is a pivot tuple. If we did so then + * !heapkeyspace indexes would exhibit false positive assertion failures. */ -#define BTreeTupleGetTopParent(itup) \ - ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) -#define BTreeTupleSetTopParent(itup, blkno) \ - do { \ - ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)); \ - BTreeTupleSetNAtts((itup), 0); \ - } while(0) +static inline BlockNumber +BTreeTupleGetDownLink(IndexTuple pivot) +{ + return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid); +} + +static inline void +BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno) +{ + ItemPointerSetBlockNumber(&pivot->t_tid, blkno); +} /* - * Get/set number of attributes within B-tree index tuple. + * Get number of attributes within tuple. 
* * Note that this does not include an implicit tiebreaker heap TID * attribute, if any. Note also that the number of key attributes must be * explicitly represented in all heapkeyspace pivot tuples. + * + * Note: This is defined as a macro rather than an inline function to + * avoid including rel.h. */ #define BTreeTupleGetNAtts(itup, rel) \ ( \ - (itup)->t_info & INDEX_ALT_TID_MASK ? \ + (BTreeTupleIsPivot(itup)) ? \ ( \ - ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \ ) \ : \ IndexRelationGetNumberOfAttributes(rel) \ ) -#define BTreeTupleSetNAtts(itup, n) \ - do { \ - (itup)->t_info |= INDEX_ALT_TID_MASK; \ - ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \ - } while(0) /* - * Get tiebreaker heap TID attribute, if any. Macro works with both pivot - * and non-pivot tuples, despite differences in how heap TID is represented. + * Set number of attributes in tuple, making it into a pivot tuple */ -#define BTreeTupleGetHeapTID(itup) \ - ( \ - (itup)->t_info & INDEX_ALT_TID_MASK && \ - (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_PIVOT_HEAP_TID_ATTR) != 0 ? \ - ( \ - (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \ - sizeof(ItemPointerData)) \ - ) \ - : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \ - ) +static inline void +BTreeTupleSetNAtts(IndexTuple itup, int natts) +{ + Assert(natts <= INDEX_MAX_KEYS); + + itup->t_info |= INDEX_ALT_TID_MASK; + /* BT_IS_POSTING bit may be unset -- tuple always becomes a pivot tuple */ + ItemPointerSetOffsetNumber(&itup->t_tid, natts); + Assert(BTreeTupleIsPivot(itup)); +} + /* - * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK - * representation (currently limited to pivot tuples) + * Set the bit indicating heap TID attribute present in pivot tuple */ -#define BTreeTupleSetAltHeapTID(itup) \ - do { \ - Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ - ItemPointerSetOffsetNumber(&(itup)->t_tid, \ - ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_PIVOT_HEAP_TID_ATTR); \ - } while(0) +static inline void +BTreeTupleSetAltHeapTID(IndexTuple pivot) +{ + OffsetNumber existing; + + Assert(BTreeTupleIsPivot(pivot)); + + existing = ItemPointerGetOffsetNumberNoCheck(&pivot->t_tid); + ItemPointerSetOffsetNumber(&pivot->t_tid, + existing | BT_PIVOT_HEAP_TID_ATTR); +} + +/* + * Get/set leaf page's "top parent" link from its high key. Used during page + * deletion. + * + * Note: Cannot assert that tuple is a pivot tuple. If we did so then + * !heapkeyspace indexes would exhibit false positive assertion failures. + */ +static inline BlockNumber +BTreeTupleGetTopParent(IndexTuple leafhikey) +{ + return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid); +} + +static inline void +BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno) +{ + ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno); + BTreeTupleSetNAtts(leafhikey, 0); +} + +/* + * Get tiebreaker heap TID attribute, if any. + * + * This returns the first/lowest heap TID in the case of a posting list tuple. + */ +static inline ItemPointer +BTreeTupleGetHeapTID(IndexTuple itup) +{ + if (BTreeTupleIsPivot(itup)) + { + /* Pivot tuple heap TID representation? 
*/ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & + BT_PIVOT_HEAP_TID_ATTR) != 0) + return (ItemPointer) ((char *) itup + IndexTupleSize(itup) - + sizeof(ItemPointerData)); + + /* Heap TID attribute was truncated */ + return NULL; + } + else if (BTreeTupleIsPosting(itup)) + return BTreeTupleGetPosting(itup); + + return &itup->t_tid; +} + +/* + * Get maximum heap TID attribute, which could be the only TID in the case of + * a non-pivot tuple that does not have a posting list tuple. + * + * Works with non-pivot tuples only. + */ +static inline ItemPointer +BTreeTupleGetMaxHeapTID(IndexTuple itup) +{ + Assert(!BTreeTupleIsPivot(itup)); + + if (BTreeTupleIsPosting(itup)) + { + uint16 nposting = BTreeTupleGetNPosting(itup); + + return BTreeTupleGetPostingN(itup, nposting - 1); + } + + return &itup->t_tid; +} /* * Operator strategy numbers for B-tree have been moved to access/stratnum.h, @@ -439,6 +629,9 @@ typedef BTStackData *BTStack; * indexes whose version is >= version 4. It's convenient to keep this close * by, rather than accessing the metapage repeatedly. * + * allequalimage is set to indicate that deduplication is safe for the index. + * This is also a property of the index relation rather than an indexscan. + * * anynullkeys indicates if any of the keys had NULL value when scankey was * built from index tuple (note that already-truncated tuple key attributes * set NULL as a placeholder key value, which also affects value of @@ -474,6 +667,7 @@ typedef BTStackData *BTStack; typedef struct BTScanInsertData { bool heapkeyspace; + bool allequalimage; bool anynullkeys; bool nextkey; bool pivotsearch; @@ -512,11 +706,95 @@ typedef struct BTInsertStateData bool bounds_valid; OffsetNumber low; OffsetNumber stricthigh; + + /* + * if _bt_binsrch_insert found the location inside existing posting list, + * save the position inside the list. -1 sentinel value indicates overlap + * with an existing posting list tuple that has its LP_DEAD bit set. + */ + int postingoff; } BTInsertStateData; typedef BTInsertStateData *BTInsertState; /* + * State used to representing an individual pending tuple during + * deduplication. + */ +typedef struct BTDedupInterval +{ + OffsetNumber baseoff; + uint16 nitems; +} BTDedupInterval; + +/* + * BTDedupStateData is a working area used during deduplication. + * + * The status info fields track the state of a whole-page deduplication pass. + * State about the current pending posting list is also tracked. + * + * A pending posting list is comprised of a contiguous group of equal items + * from the page, starting from page offset number 'baseoff'. This is the + * offset number of the "base" tuple for new posting list. 'nitems' is the + * current total number of existing items from the page that will be merged to + * make a new posting list tuple, including the base tuple item. (Existing + * items may themselves be posting list tuples, or regular non-pivot tuples.) + * + * The total size of the existing tuples to be freed when pending posting list + * is processed gets tracked by 'phystupsize'. This information allows + * deduplication to calculate the space saving for each new posting list + * tuple, and for the entire pass over the page as a whole. + */ +typedef struct BTDedupStateData +{ + /* Deduplication status info for entire pass over page */ + bool deduplicate; /* Still deduplicating page? 
*/ + Size maxpostingsize; /* Limit on size of final tuple */ + + /* Metadata about base tuple of current pending posting list */ + IndexTuple base; /* Use to form new posting list */ + OffsetNumber baseoff; /* page offset of base */ + Size basetupsize; /* base size without original posting list */ + + /* Other metadata about pending posting list */ + ItemPointer htids; /* Heap TIDs in pending posting list */ + int nhtids; /* Number of heap TIDs in htids array */ + int nitems; /* Number of existing tuples/line pointers */ + Size phystupsize; /* Includes line pointer overhead */ + + /* + * Array of tuples to go on new version of the page. Contains one entry + * for each group of consecutive items. Note that existing tuples that + * will not become posting list tuples do not appear in the array (they + * are implicitly unchanged by deduplication pass). + */ + int nintervals; /* current size of intervals array */ + BTDedupInterval intervals[MaxIndexTuplesPerPage]; +} BTDedupStateData; + +typedef BTDedupStateData *BTDedupState; + +/* + * BTVacuumPostingData is state that represents how to VACUUM a posting list + * tuple when some (though not all) of its TIDs are to be deleted. + * + * Convention is that itup field is the original posting list tuple on input, + * and palloc()'d final tuple used to overwrite existing tuple on output. + */ +typedef struct BTVacuumPostingData +{ + /* Tuple that will be/was updated */ + IndexTuple itup; + OffsetNumber updatedoffset; + + /* State needed to describe final itup in WAL */ + uint16 ndeletedtids; + uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]; +} BTVacuumPostingData; + +typedef BTVacuumPostingData *BTVacuumPosting; + +/* * BTScanOpaqueData is the btree-private state needed for an indexscan. * This consists of preprocessed scan keys (see _bt_preprocess_keys() for * details of the preprocessing), information about the current location @@ -539,7 +817,9 @@ typedef BTInsertStateData *BTInsertState; * If we are doing an index-only scan, we save the entire IndexTuple for each * matched item, otherwise only its heap TID and offset. The IndexTuples go * into a separate workspace array; each BTScanPosItem stores its tuple's - * offset within that array. + * offset within that array. Posting list tuples store a "base" tuple once, + * allowing the same key to be returned for each TID in the posting list + * tuple. */ typedef struct BTScanPosItem /* what we remember about each match */ @@ -583,7 +863,7 @@ typedef struct BTScanPosData int lastItem; /* last valid index in items[] */ int itemIndex; /* current index in items[] */ - BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ + BTScanPosItem items[MaxTIDsPerBTreePage]; /* MUST BE LAST */ } BTScanPosData; typedef BTScanPosData *BTScanPos; @@ -691,6 +971,7 @@ typedef struct BTOptions int fillfactor; /* page fill factor in percent (0..100) */ /* fraction of newly inserted tuples prior to trigger index cleanup */ float8 vacuum_cleanup_index_scale_factor; + bool deduplicate_items; /* Try to deduplicate items? */ } BTOptions; #define BTGetFillFactor(relation) \ @@ -701,6 +982,11 @@ typedef struct BTOptions BTREE_DEFAULT_FILLFACTOR) #define BTGetTargetPageFreeSpace(relation) \ (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100) +#define BTGetDeduplicateItems(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == BTREE_AM_OID), \ + ((relation)->rd_options ? 
\ + ((BTOptions *) (relation)->rd_options)->deduplicate_items : true)) /* * Constant definition for progress reporting. Phase numbers must match @@ -748,6 +1034,22 @@ extern void _bt_parallel_done(IndexScanDesc scan); extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); /* + * prototypes for functions in nbtdedup.c + */ +extern void _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, + IndexTuple newitem, Size newitemsz, + bool checkingunique); +extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, + OffsetNumber baseoff); +extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup); +extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state); +extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, + int nhtids); +extern void _bt_update_posting(BTVacuumPosting vacposting); +extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, + int postingoff); + +/* * prototypes for functions in nbtinsert.c */ extern bool _bt_doinsert(Relation rel, IndexTuple itup, @@ -765,14 +1067,16 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page page, /* * prototypes for functions in nbtpage.c */ -extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level); +extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool allequalimage); extern void _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, float8 numHeapTuples); extern void _bt_upgrademetapage(Page page); extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); extern int _bt_getrootheight(Relation rel); -extern bool _bt_heapkeyspace(Relation rel); +extern void _bt_metaversion(Relation rel, bool *heapkeyspace, + bool *allequalimage); extern void _bt_checkpage(Relation rel, Buffer buf); extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, @@ -781,7 +1085,8 @@ extern void _bt_relbuf(Relation rel, Buffer buf); extern void _bt_pageinit(Page page, Size size); extern bool _bt_page_recyclable(Page page); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *deletable, int ndeletable); + OffsetNumber *deletable, int ndeletable, + BTVacuumPosting *updatable, int nupdatable); extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, Relation heapRel); diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 776a9bd7233..347976c532c 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -28,7 +28,8 @@ #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ -/* 0x50 and 0x60 are unused */ +#define XLOG_BTREE_INSERT_POST 0x50 /* add index tuple with posting split */ +#define XLOG_BTREE_DEDUP 0x60 /* deduplicate tuples for a page */ #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */ #define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */ #define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */ @@ -53,21 +54,34 @@ typedef struct xl_btree_metadata uint32 fastlevel; TransactionId oldest_btpo_xact; float8 last_cleanup_num_heap_tuples; + bool allequalimage; } xl_btree_metadata; /* * This is what we need to know about simple (without split) insert. 
* - * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META. - * Note that INSERT_META implies it's not a leaf page. + * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and + * INSERT_POST. Note that INSERT_META and INSERT_UPPER implies it's not a + * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf + * page. * - * Backup Blk 0: original page (data contains the inserted tuple) + * Backup Blk 0: original page * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META * Backup Blk 2: xl_btree_metadata, if INSERT_META + * + * Note: The new tuple is actually the "original" new item in the posting + * list split insert case (i.e. the INSERT_POST case). A split offset for + * the posting list is logged before the original new item. Recovery needs + * both, since it must do an in-place update of the existing posting list + * that was split as an extra step. Also, recovery generates a "final" + * newitem. See _bt_swap_posting() for details on posting list splits. */ typedef struct xl_btree_insert { OffsetNumber offnum; + + /* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */ + /* NEW TUPLE ALWAYS FOLLOWS AT THE END */ } xl_btree_insert; #define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) @@ -92,8 +106,37 @@ typedef struct xl_btree_insert * Backup Blk 0: original page / new left page * * The left page's data portion contains the new item, if it's the _L variant. - * An IndexTuple representing the high key of the left page must follow with - * either variant. + * _R variant split records generally do not have a newitem (_R variant leaf + * page split records that must deal with a posting list split will include an + * explicit newitem, though it is never used on the right page -- it is + * actually an orignewitem needed to update existing posting list). The new + * high key of the left/original page appears last of all (and must always be + * present). + * + * Page split records that need the REDO routine to deal with a posting list + * split directly will have an explicit newitem, which is actually an + * orignewitem (the newitem as it was before the posting list split, not + * after). A posting list split always has a newitem that comes immediately + * after the posting list being split (which would have overlapped with + * orignewitem prior to split). Usually REDO must deal with posting list + * splits with an _L variant page split record, and usually both the new + * posting list and the final newitem go on the left page (the existing + * posting list will be inserted instead of the old, and the final newitem + * will be inserted next to that). However, _R variant split records will + * include an orignewitem when the split point for the page happens to have a + * lastleft tuple that is also the posting list being split (leaving newitem + * as the page split's firstright tuple). The existence of this corner case + * does not change the basic fact about newitem/orignewitem for the REDO + * routine: it is always state used for the left page alone. (This is why the + * record's postingoff field isn't a reliable indicator of whether or not a + * posting list split occurred during the page split; a non-zero value merely + * indicates that the REDO routine must reconstruct a new posting list tuple + * that is needed for the left page.) + * + * This posting list split handling is equivalent to the xl_btree_insert REDO + * routine's INSERT_POST handling. 
While the details are more complicated + * here, the concept and goals are exactly the same. See _bt_swap_posting() + * for details on posting list splits. * * Backup Blk 1: new right page * @@ -111,15 +154,33 @@ typedef struct xl_btree_split { uint32 level; /* tree level of page being split */ OffsetNumber firstright; /* first item moved to right page */ - OffsetNumber newitemoff; /* new item's offset (useful for _L variant) */ + OffsetNumber newitemoff; /* new item's offset */ + uint16 postingoff; /* offset inside orig posting tuple */ } xl_btree_split; -#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber)) +#define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16)) + +/* + * When page is deduplicated, consecutive groups of tuples with equal keys are + * merged together into posting list tuples. + * + * The WAL record represents a deduplication pass for a leaf page. An array + * of BTDedupInterval structs follows. + */ +typedef struct xl_btree_dedup +{ + uint16 nintervals; + + /* DEDUPLICATION INTERVALS FOLLOW */ +} xl_btree_dedup; + +#define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16)) /* * This is what we need to know about delete of individual leaf index tuples. * The WAL record can represent deletion of any number of index tuples on a - * single index page when *not* executed by VACUUM. + * single index page when *not* executed by VACUUM. Deletion of a subset of + * the TIDs within a posting list tuple is not supported. * * Backup Blk 0: index page */ @@ -150,21 +211,43 @@ typedef struct xl_btree_reuse_page #define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page)) /* - * This is what we need to know about vacuum of individual leaf index tuples. - * The WAL record can represent deletion of any number of index tuples on a - * single index page when executed by VACUUM. + * This is what we need to know about which TIDs to remove from an individual + * posting list tuple during vacuuming. An array of these may appear at the + * end of xl_btree_vacuum records. + */ +typedef struct xl_btree_update +{ + uint16 ndeletedtids; + + /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */ +} xl_btree_update; + +#define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16)) + +/* + * This is what we need to know about a VACUUM of a leaf page. The WAL record + * can represent deletion of any number of index tuples on a single index page + * when executed by VACUUM. It can also support "updates" of index tuples, + * which is how deletes of a subset of TIDs contained in an existing posting + * list tuple are implemented. (Updates are only used when there will be some + * remaining TIDs once VACUUM finishes; otherwise the posting list tuple can + * just be deleted). * - * Note that the WAL record in any vacuum of an index must have at least one - * item to delete. + * Updated posting list tuples are represented using xl_btree_update metadata. + * The REDO routine uses each xl_btree_update (plus its corresponding original + * index tuple from the target leaf page) to generate the final updated tuple. 
*/ typedef struct xl_btree_vacuum { - uint32 ndeleted; + uint16 ndeleted; + uint16 nupdated; /* DELETED TARGET OFFSET NUMBERS FOLLOW */ + /* UPDATED TARGET OFFSET NUMBERS FOLLOW */ + /* UPDATED TUPLES METADATA ARRAY FOLLOWS */ } xl_btree_vacuum; -#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(uint32)) +#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16)) /* * This is what we need to know about marking an empty branch for deletion. @@ -245,6 +328,8 @@ typedef struct xl_btree_newroot extern void btree_redo(XLogReaderState *record); extern void btree_desc(StringInfo buf, XLogReaderState *record); extern const char *btree_identify(uint8 info); +extern void btree_xlog_startup(void); +extern void btree_xlog_cleanup(void); extern void btree_mask(char *pagedata, BlockNumber blkno); #endif /* NBTXLOG_H */ diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index c88dccfb8dd..6c15df7e707 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL) PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask) PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask) -PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask) +PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask) PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask) PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask) PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask) diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 087918d41dd..27ded593abb 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -31,7 +31,7 @@ /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD104 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD105 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { |
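
The inline accessors added to nbtree.h above (BTreeTupleIsPosting(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID()) are what the rest of the access method uses to decode the two non-pivot tuple variants. As a rough illustration of how they compose, here is a hypothetical sketch that is not part of the patch: the helper name is invented, and the caller is assumed to hold a pin and read lock on a leaf page of a version 4 index.

    #include "postgres.h"

    #include "access/nbtree.h"

    /*
     * Hypothetical helper: count the heap TIDs represented on a leaf page.
     * Plain non-pivot tuples contribute one TID via t_tid; posting list
     * tuples contribute BTreeTupleGetNPosting() TIDs from their TID array.
     */
    static int
    count_leaf_heap_tids(Page page)
    {
        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        OffsetNumber off;
        OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
        int         ntids = 0;

        Assert(P_ISLEAF(opaque));

        for (off = P_FIRSTDATAKEY(opaque); off <= maxoff;
             off = OffsetNumberNext(off))
        {
            ItemId      itemid = PageGetItemId(page, off);
            IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);

            if (BTreeTupleIsPosting(itup))
            {
                /* t_tid encodes a posting list offset and a TID count */
                ntids += BTreeTupleGetNPosting(itup);

                /* posting lists are kept in heap TID order */
                Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
                                          BTreeTupleGetMaxHeapTID(itup)) <= 0);
            }
            else
            {
                /* plain non-pivot tuple: t_tid is the heap TID itself */
                ntids++;
            }
        }

        return ntids;
    }

This is also why BTScanPosData now sizes its items[] array with MaxTIDsPerBTreePage rather than MaxIndexTuplesPerPage: a single leaf page can represent far more heap TIDs than it has tuples.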
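The nbtdedup.c prototypes and the BTDedupStateData comments above suggest the calling convention for a deduplication pass: start a pending posting list at a base tuple, try to fold each following equal tuple's TIDs into it, and flush it whenever an unequal tuple (or the size limit) is reached. The loop below is a simplified, hypothetical rendering of that convention, not the actual _bt_dedup_one_page() implementation; the helper name and signature are invented, and newitem/checkingunique handling, high key copying, space accounting, and WAL logging are all omitted.

    #include "postgres.h"

    #include "access/nbtree.h"
    #include "utils/rel.h"

    /*
     * Hypothetical sketch: copy a leaf page's tuples to an already
     * initialized scratch page "newpage", merging consecutive groups of
     * equal tuples into posting list tuples along the way.
     */
    static void
    dedup_page_sketch(Relation rel, Page page, Page newpage, Size maxpostingsize)
    {
        BTDedupState state = (BTDedupState) palloc(sizeof(BTDedupStateData));
        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        int         natts = IndexRelationGetNumberOfAttributes(rel);
        OffsetNumber off;

        state->deduplicate = true;
        state->maxpostingsize = maxpostingsize;
        state->base = NULL;
        state->baseoff = InvalidOffsetNumber;
        state->basetupsize = 0;
        /* scratch space for the pending posting list's heap TIDs */
        state->htids = palloc(state->maxpostingsize);
        state->nhtids = 0;
        state->nitems = 0;
        state->phystupsize = 0;
        state->nintervals = 0;

        for (off = P_FIRSTDATAKEY(opaque);
             off <= PageGetMaxOffsetNumber(page);
             off = OffsetNumberNext(off))
        {
            IndexTuple  itup = (IndexTuple) PageGetItem(page,
                                                        PageGetItemId(page, off));

            if (state->nitems == 0)
            {
                /* itup becomes the base tuple of a new pending posting list */
                _bt_dedup_start_pending(state, itup, off);
            }
            else if (_bt_keep_natts_fast(rel, state->base, itup) > natts &&
                     _bt_dedup_save_htid(state, itup))
            {
                /* itup is equal to the base tuple and its TIDs were saved */
            }
            else
            {
                /* write the pending posting list to newpage, then start over */
                _bt_dedup_finish_pending(newpage, state);
                _bt_dedup_start_pending(state, itup, off);
            }
        }

        /* flush the final pending posting list */
        _bt_dedup_finish_pending(newpage, state);

        pfree(state->htids);
        pfree(state);
    }

Each flushed group corresponds to one entry in the intervals[] array and, when the pass is WAL-logged, to one BTDedupInterval in the xl_btree_dedup record's payload.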