diff options
author | Heikki Linnakangas <heikki.linnakangas@iki.fi> | 2014-11-20 17:56:26 +0200 |
---|---|---|
committer | Heikki Linnakangas <heikki.linnakangas@iki.fi> | 2014-11-20 18:46:41 +0200 |
commit | 2c03216d831160bedd72d45f712601b6f7d03f1c (patch) | |
tree | ab6a03d031ffa605d848b0b7067add15e56e2207 /src/backend/access/nbtree/nbtinsert.c | |
parent | 8dc626defec23016dd5988208d8704b858b9d21d (diff) |
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
Diffstat (limited to 'src/backend/access/nbtree/nbtinsert.c')
-rw-r--r-- | src/backend/access/nbtree/nbtinsert.c | 207 |
1 files changed, 57 insertions, 150 deletions
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index bcaba7e5e84..2c4f9904e1a 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -837,37 +837,25 @@ _bt_insertonpg(Relation rel, if (RelationNeedsWAL(rel)) { xl_btree_insert xlrec; - BlockNumber xlleftchild; xl_btree_metadata xlmeta; uint8 xlinfo; XLogRecPtr recptr; - XLogRecData rdata[4]; - XLogRecData *nextrdata; IndexTupleData trunctuple; - xlrec.target.node = rel->rd_node; - ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off); + xlrec.offnum = itup_off; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBtreeInsert; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = nextrdata = &(rdata[1]); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); if (P_ISLEAF(lpageop)) xlinfo = XLOG_BTREE_INSERT_LEAF; else { /* - * Include the block number of the left child, whose - * INCOMPLETE_SPLIT flag was cleared. + * Register the left child whose INCOMPLETE_SPLIT flag was + * cleared. 
*/ - xlleftchild = BufferGetBlockNumber(cbuf); - nextrdata->data = (char *) &xlleftchild; - nextrdata->len = sizeof(BlockNumber); - nextrdata->buffer = cbuf; - nextrdata->buffer_std = true; - nextrdata->next = nextrdata + 1; - nextrdata++; + XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD); xlinfo = XLOG_BTREE_INSERT_UPPER; } @@ -879,33 +867,25 @@ _bt_insertonpg(Relation rel, xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; - nextrdata->data = (char *) &xlmeta; - nextrdata->len = sizeof(xl_btree_metadata); - nextrdata->buffer = InvalidBuffer; - nextrdata->next = nextrdata + 1; - nextrdata++; + XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT); + XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); xlinfo = XLOG_BTREE_INSERT_META; } /* Read comments in _bt_pgaddtup */ + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop)) { trunctuple = *itup; trunctuple.t_info = sizeof(IndexTupleData); - nextrdata->data = (char *) &trunctuple; - nextrdata->len = sizeof(IndexTupleData); + XLogRegisterBufData(0, (char *) &trunctuple, + sizeof(IndexTupleData)); } else - { - nextrdata->data = (char *) itup; - nextrdata->len = IndexTupleDSize(*itup); - } - nextrdata->buffer = buf; - nextrdata->buffer_std = true; - nextrdata->next = NULL; + XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup)); - recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); + recptr = XLogInsert(RM_BTREE_ID, xlinfo); if (BufferIsValid(metabuf)) { @@ -1260,56 +1240,37 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, xl_btree_split xlrec; uint8 xlinfo; XLogRecPtr recptr; - XLogRecData rdata[7]; - XLogRecData *lastrdata; - BlockNumber cblkno; - - xlrec.node = rel->rd_node; - xlrec.leftsib = origpagenumber; - xlrec.rightsib = rightpagenumber; - xlrec.rnext = ropaque->btpo_next; + xlrec.level = ropaque->btpo.level; xlrec.firstright = firstright; + xlrec.newitemoff = newitemoff; - 
rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBtreeSplit; - rdata[0].buffer = InvalidBuffer; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); - lastrdata = &rdata[0]; + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT); + /* Log the right sibling, because we've changed its prev-pointer. */ + if (!P_RIGHTMOST(ropaque)) + XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD); + if (BufferIsValid(cbuf)) + XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD); /* - * Log the new item and its offset, if it was inserted on the left - * page. (If it was put on the right page, we don't need to explicitly - * WAL log it because it's included with all the other items on the - * right page.) Show the new item as belonging to the left page - * buffer, so that it is not stored if XLogInsert decides it needs a - * full-page image of the left page. We store the offset anyway, - * though, to support archive compression of these records. + * Log the new item, if it was inserted on the left page. (If it was + * put on the right page, we don't need to explicitly WAL log it + * because it's included with all the other items on the right page.) + * Show the new item as belonging to the left page buffer, so that it + * is not stored if XLogInsert decides it needs a full-page image of + * the left page. We store the offset anyway, though, to support + * archive compression of these records. 
*/ if (newitemonleft) - { - lastrdata->next = lastrdata + 1; - lastrdata++; - - lastrdata->data = (char *) &newitemoff; - lastrdata->len = sizeof(OffsetNumber); - lastrdata->buffer = InvalidBuffer; - - lastrdata->next = lastrdata + 1; - lastrdata++; - - lastrdata->data = (char *) newitem; - lastrdata->len = MAXALIGN(newitemsz); - lastrdata->buffer = buf; /* backup block 0 */ - lastrdata->buffer_std = true; - } + XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); /* Log left page */ if (!isleaf) { - lastrdata->next = lastrdata + 1; - lastrdata++; - /* * We must also log the left page's high key, because the right * page's leftmost key is suppressed on non-leaf levels. Show it @@ -1319,43 +1280,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, */ itemid = PageGetItemId(origpage, P_HIKEY); item = (IndexTuple) PageGetItem(origpage, itemid); - lastrdata->data = (char *) item; - lastrdata->len = MAXALIGN(IndexTupleSize(item)); - lastrdata->buffer = buf; /* backup block 0 */ - lastrdata->buffer_std = true; - } - - if (isleaf && !newitemonleft) - { - lastrdata->next = lastrdata + 1; - lastrdata++; - - /* - * Although we don't need to WAL-log anything on the left page, we - * still need XLogInsert to consider storing a full-page image of - * the left page, so make an empty entry referencing that buffer. - * This also ensures that the left page is always backup block 0. - */ - lastrdata->data = NULL; - lastrdata->len = 0; - lastrdata->buffer = buf; /* backup block 0 */ - lastrdata->buffer_std = true; - } - - /* - * Log block number of left child, whose INCOMPLETE_SPLIT flag this - * insertion clears. 
- */ - if (!isleaf) - { - lastrdata->next = lastrdata + 1; - lastrdata++; - - cblkno = BufferGetBlockNumber(cbuf); - lastrdata->data = (char *) &cblkno; - lastrdata->len = sizeof(BlockNumber); - lastrdata->buffer = cbuf; /* backup block 1 */ - lastrdata->buffer_std = true; + XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item))); } /* @@ -1370,35 +1295,16 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, * and so the item pointers can be reconstructed. See comments for * _bt_restore_page(). */ - lastrdata->next = lastrdata + 1; - lastrdata++; - - lastrdata->data = (char *) rightpage + - ((PageHeader) rightpage)->pd_upper; - lastrdata->len = ((PageHeader) rightpage)->pd_special - - ((PageHeader) rightpage)->pd_upper; - lastrdata->buffer = InvalidBuffer; - - /* Log the right sibling, because we've changed its' prev-pointer. */ - if (!P_RIGHTMOST(ropaque)) - { - lastrdata->next = lastrdata + 1; - lastrdata++; - - lastrdata->data = NULL; - lastrdata->len = 0; - lastrdata->buffer = sbuf; /* bkp block 1 (leaf) or 2 (non-leaf) */ - lastrdata->buffer_std = true; - } - - lastrdata->next = NULL; + XLogRegisterBufData(1, + (char *) rightpage + ((PageHeader) rightpage)->pd_upper, + ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper); if (isroot) xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT; else xlinfo = newitemonleft ? 
XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; - recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); + recptr = XLogInsert(RM_BTREE_ID, xlinfo); PageSetLSN(origpage, recptr); PageSetLSN(rightpage, recptr); @@ -2090,34 +1996,35 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) { xl_btree_newroot xlrec; XLogRecPtr recptr; - XLogRecData rdata[3]; + xl_btree_metadata md; - xlrec.node = rel->rd_node; xlrec.rootblk = rootblknum; xlrec.level = metad->btm_level; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBtreeNewroot; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot); + + XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT); + XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); + XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT); + + md.root = rootblknum; + md.level = metad->btm_level; + md.fastroot = rootblknum; + md.fastlevel = metad->btm_level; + + XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); /* * Direct access to page is not good but faster - we should implement * some new func in page API. */ - rdata[1].data = (char *) rootpage + ((PageHeader) rootpage)->pd_upper; - rdata[1].len = ((PageHeader) rootpage)->pd_special - - ((PageHeader) rootpage)->pd_upper; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = &(rdata[2]); - - /* Make a full-page image of the left child if needed */ - rdata[2].data = NULL; - rdata[2].len = 0; - rdata[2].buffer = lbuf; - rdata[2].next = NULL; - - recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, rdata); + XLogRegisterBufData(0, + (char *) rootpage + ((PageHeader) rootpage)->pd_upper, + ((PageHeader) rootpage)->pd_special - + ((PageHeader) rootpage)->pd_upper); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT); PageSetLSN(lpage, recptr); PageSetLSN(rootpage, recptr); |