diff options
Diffstat (limited to 'src/backend/access/gin/gindatapage.c')
-rw-r--r-- | src/backend/access/gin/gindatapage.c | 373 |
1 files changed, 236 insertions, 137 deletions
diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index e3ab6cfd0ee..209020992dc 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -18,7 +18,6 @@ #include "access/heapam_xlog.h" #include "lib/ilist.h" #include "miscadmin.h" -#include "utils/memutils.h" #include "utils/rel.h" /* @@ -57,6 +56,13 @@ typedef struct int rsize; /* total size on right page */ bool oldformat; /* page is in pre-9.4 format on disk */ + + /* + * If we need WAL data representing the reconstructed leaf page, it's + * stored here by computeLeafRecompressWALData. + */ + char *walinfo; /* buffer start */ + int walinfolen; /* and length */ } disassembledLeaf; typedef struct @@ -98,20 +104,18 @@ static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems); static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, - XLogRecData **prdata, Page *newlpage, Page *newrpage); + Page *newlpage, Page *newrpage, XLogRecData *rdata); static disassembledLeaf *disassembleLeaf(Page page); static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining); static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems); -static XLogRecData *constructLeafRecompressWALData(Buffer buf, - disassembledLeaf *leaf); +static void computeLeafRecompressWALData(disassembledLeaf *leaf); static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf); -static void dataPlaceToPageLeafSplit(Buffer buf, - disassembledLeaf *leaf, +static void dataPlaceToPageLeafSplit(disassembledLeaf *leaf, ItemPointerData lbound, ItemPointerData rbound, - XLogRecData **prdata, Page lpage, Page rpage); + Page lpage, Page rpage, XLogRecData *rdata); /* * Read TIDs from leaf data page to single uncompressed array. The TIDs are @@ -424,12 +428,25 @@ GinPageDeletePostingItem(Page page, OffsetNumber offset) } /* - * Places keys to leaf data page and fills WAL record. + * Prepare to insert data on a leaf data page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. Also, + * if WAL logging is needed, fill one or more entries of rdata[] with + * whatever data must be appended to the WAL record. + * + * In neither case should the given page buffer be modified here. */ static GinPlaceToPageRC -dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, - void *insertdata, XLogRecData **prdata, - Page *newlpage, Page *newrpage) +dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, + void **ptp_workspace, + Page *newlpage, Page *newrpage, + XLogRecData *rdata) { GinBtreeDataLeafInsertData *items = insertdata; ItemPointer newItems = &items->items[items->curitem]; @@ -442,15 +459,11 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, bool append; int segsize; Size freespace; - MemoryContext tmpCxt; - MemoryContext oldCxt; disassembledLeaf *leaf; leafSegmentInfo *lastleftinfo; ItemPointerData maxOldItem; ItemPointerData remaining; - Assert(GinPageIsData(page)); - rbound = *GinDataPageGetRightBound(page); /* @@ -474,18 +487,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, maxitems = i; } - /* - * The following operations do quite a lot of small memory allocations, - * create a temporary memory context so that we don't need to keep track - * of them individually. - */ - tmpCxt = AllocSetContextCreate(CurrentMemoryContext, - "Gin split temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - oldCxt = MemoryContextSwitchTo(tmpCxt); - + /* Disassemble the data on the page */ leaf = disassembleLeaf(page); /* @@ -550,16 +552,13 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment); } - /* Add the new items to the segments */ + /* Add the new items to the segment list */ if (!addItemsToLeaf(leaf, newItems, maxitems)) { /* all items were duplicates, we have nothing to do */ items->curitem += maxitems; - MemoryContextSwitchTo(oldCxt); - MemoryContextDelete(tmpCxt); - - return UNMODIFIED; + return GPTP_NO_WORK; } /* @@ -592,21 +591,17 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, if (!needsplit) { /* - * Great, all the items fit on a single page. Construct a WAL record - * describing the changes we made, and write the segments back to the - * page. - * - * Once we start modifying the page, there's no turning back. The - * caller is responsible for calling END_CRIT_SECTION() after writing - * the WAL record. + * Great, all the items fit on a single page. If needed, prepare data + * for a WAL record describing the changes we'll make. */ - MemoryContextSwitchTo(oldCxt); if (RelationNeedsWAL(btree->index)) - *prdata = constructLeafRecompressWALData(buf, leaf); - else - *prdata = NULL; - START_CRIT_SECTION(); - dataPlaceToPageLeafRecompress(buf, leaf); + computeLeafRecompressWALData(leaf); + + /* + * We're ready to enter the critical section, but + * dataExecPlaceToPageLeaf will need access to the "leaf" data. + */ + *ptp_workspace = leaf; if (append) elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)", @@ -620,7 +615,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, else { /* - * Had to split. + * Have to split. * * leafRepackItems already divided the segments between the left and * the right page. It filled the left page as full as possible, and @@ -632,7 +627,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, * until they're balanced. * * As a further heuristic, when appending items to the end of the - * page, try make the left page 75% full, one the assumption that + * page, try to make the left page 75% full, on the assumption that * subsequent insertions will probably also go to the end. This packs * the index somewhat tighter when appending to a table, which is very * common. @@ -681,11 +676,14 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, &lastleftinfo->nitems); lbound = lastleftinfo->items[lastleftinfo->nitems - 1]; - *newlpage = MemoryContextAlloc(oldCxt, BLCKSZ); - *newrpage = MemoryContextAlloc(oldCxt, BLCKSZ); + /* + * Now allocate a couple of temporary page images, and fill them. + */ + *newlpage = palloc(BLCKSZ); + *newrpage = palloc(BLCKSZ); - dataPlaceToPageLeafSplit(buf, leaf, lbound, rbound, - prdata, *newlpage, *newrpage); + dataPlaceToPageLeafSplit(leaf, lbound, rbound, + *newlpage, *newrpage, rdata); Assert(GinPageRightMost(page) || ginCompareItemPointers(GinDataPageGetRightBound(*newlpage), @@ -701,12 +699,37 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, items->nitem - items->curitem - maxitems); } - MemoryContextSwitchTo(oldCxt); - MemoryContextDelete(tmpCxt); - items->curitem += maxitems; - return needsplit ? SPLIT : INSERTED; + return needsplit ? GPTP_SPLIT : GPTP_INSERT; +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section. It must modify the target + * buffer and store one or more XLogRecData records describing the changes + * in rdata[]. + */ +static void +dataExecPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, void *ptp_workspace, + XLogRecData *rdata) +{ + disassembledLeaf *leaf = (disassembledLeaf *) ptp_workspace; + + /* Apply changes to page */ + dataPlaceToPageLeafRecompress(buf, leaf); + + /* If needed, register WAL data built by computeLeafRecompressWALData */ + if (RelationNeedsWAL(btree->index)) + { + rdata[0].buffer = buf; + rdata[0].buffer_std = true; + rdata[0].data = leaf->walinfo; + rdata[0].len = leaf->walinfolen; + rdata[0].next = NULL; + } } /* @@ -791,7 +814,6 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) */ if (removedsomething) { - XLogRecData *payloadrdata = NULL; bool modified; /* @@ -818,8 +840,11 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) } if (RelationNeedsWAL(indexrel)) - payloadrdata = constructLeafRecompressWALData(buffer, leaf); + computeLeafRecompressWALData(leaf); + + /* Apply changes to page */ START_CRIT_SECTION(); + dataPlaceToPageLeafRecompress(buffer, leaf); MarkBufferDirty(buffer); @@ -827,18 +852,24 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) if (RelationNeedsWAL(indexrel)) { XLogRecPtr recptr; - XLogRecData rdata; + XLogRecData rdata[2]; ginxlogVacuumDataLeafPage xlrec; xlrec.node = indexrel->rd_node; xlrec.blkno = BufferGetBlockNumber(buffer); - rdata.buffer = InvalidBuffer; - rdata.data = (char *) &xlrec; - rdata.len = offsetof(ginxlogVacuumDataLeafPage, data); - rdata.next = payloadrdata; + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &xlrec; + rdata[0].len = offsetof(ginxlogVacuumDataLeafPage, data); + rdata[0].next = &rdata[1]; + + rdata[1].buffer = buffer; + rdata[1].buffer_std = true; + rdata[1].data = leaf->walinfo; + rdata[1].len = leaf->walinfolen; + rdata[1].next = NULL; - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE, &rdata); + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE, rdata); PageSetLSN(page, recptr); } @@ -848,15 +879,15 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) /* * Construct a ginxlogRecompressDataLeaf record representing the changes - * in *leaf. + * in *leaf. (Because this requires a palloc, we have to do it before + * we enter the critical section that actually updates the page.) */ -static XLogRecData * -constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) +static void +computeLeafRecompressWALData(disassembledLeaf *leaf) { int nmodified = 0; char *walbufbegin; char *walbufend; - XLogRecData *rdata; dlist_iter iter; int segno; ginxlogRecompressDataLeaf *recompress_xlog; @@ -871,12 +902,11 @@ constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) nmodified++; } - walbufbegin = palloc( - sizeof(ginxlogRecompressDataLeaf) + - BLCKSZ + /* max size needed to hold the segment - * data */ - nmodified * 2 + /* (segno + action) per action */ - sizeof(XLogRecData)); + walbufbegin = + palloc(sizeof(ginxlogRecompressDataLeaf) + + BLCKSZ + /* max size needed to hold the segment data */ + nmodified * 2 /* (segno + action) per action */ + ); walbufend = walbufbegin; recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend; @@ -944,22 +974,15 @@ constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) segno++; } - rdata = (XLogRecData *) MAXALIGN(walbufend); - rdata->buffer = buf; - rdata->buffer_std = TRUE; - rdata->data = walbufbegin; - rdata->len = walbufend - walbufbegin; - rdata->next = NULL; - - return rdata; + /* Pass back the constructed info via *leaf */ + leaf->walinfo = walbufbegin; + leaf->walinfolen = walbufend - walbufbegin; } /* * Assemble a disassembled posting tree leaf page back to a buffer. * - * *prdata is filled with WAL information about this operation. The caller - * is responsible for inserting to the WAL, along with any other information - * about the operation that triggered this recompression. + * This just updates the target buffer; WAL stuff is caller's responsibility. * * NOTE: The segment pointers must not point directly to the same buffer, * except for segments that have not been modified and whose preceding @@ -1018,13 +1041,14 @@ dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf) * segments to two pages instead of one. * * This is different from the non-split cases in that this does not modify - * the original page directly, but to temporary in-memory copies of the new - * left and right pages. + * the original page directly, but writes to temporary in-memory copies of + * the new left and right pages. Also, we prepare rdata[] entries for the + * data that must be appended to the WAL record. */ static void -dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, +dataPlaceToPageLeafSplit(disassembledLeaf *leaf, ItemPointerData lbound, ItemPointerData rbound, - XLogRecData **prdata, Page lpage, Page rpage) + Page lpage, Page rpage, XLogRecData *rdata) { char *ptr; int segsize; @@ -1034,9 +1058,8 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, dlist_node *firstright; leafSegmentInfo *seginfo; - /* these must be static so they can be returned to caller */ + /* this must be static so it can be returned to caller */ static ginxlogSplitDataLeaf split_xlog; - static XLogRecData rdata[3]; /* Initialize temporary pages to hold the new left and right pages */ GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); @@ -1113,43 +1136,63 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, rdata[2].data = (char *) GinDataLeafPageGetPostingList(rpage); rdata[2].len = rsize; rdata[2].next = NULL; - - *prdata = rdata; } /* - * Place a PostingItem to page, and fill a WAL record. + * Prepare to insert data on an internal data page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. * - * If the item doesn't fit, returns false without modifying the page. + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. Also, + * if WAL logging is needed, fill one or more entries of rdata[] with + * whatever data must be appended to the WAL record. * - * In addition to inserting the given item, the downlink of the existing item - * at 'off' is updated to point to 'updateblkno'. + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. */ static GinPlaceToPageRC -dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, - void *insertdata, BlockNumber updateblkno, - XLogRecData **prdata, Page *newlpage, Page *newrpage) +dataBeginPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage, + XLogRecData *rdata) { Page page = BufferGetPage(buf); - OffsetNumber off = stack->off; - PostingItem *pitem; - - /* these must be static so they can be returned to caller */ - static XLogRecData rdata; - static ginxlogInsertDataInternal data; - /* split if we have to */ + /* If it doesn't fit, deal with split case */ if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem)) { dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno, - prdata, newlpage, newrpage); - return SPLIT; + newlpage, newrpage, rdata); + return GPTP_SPLIT; } - *prdata = &rdata; - Assert(GinPageIsData(page)); + /* Else, we're ready to proceed with insertion */ + return GPTP_INSERT; +} - START_CRIT_SECTION(); +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section. It must modify the target + * buffer and store one or more XLogRecData records describing the changes + * in rdata[]. + */ +static void +dataExecPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void *ptp_workspace, + XLogRecData *rdata) +{ + Page page = BufferGetPage(buf); + OffsetNumber off = stack->off; + PostingItem *pitem; /* Update existing downlink to point to next page (on internal page) */ pitem = GinDataPageGetPostingItem(page, off); @@ -1159,50 +1202,106 @@ dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, pitem = (PostingItem *) insertdata; GinDataPageAddPostingItem(page, pitem, off); - data.offset = off; - data.newitem = *pitem; + if (RelationNeedsWAL(btree->index)) + { + /* + * This must be static, because it has to survive until XLogInsert, + * and we can't palloc here. Ugly, but the XLogInsert infrastructure + * isn't reentrant anyway. + */ + static ginxlogInsertDataInternal data; - rdata.buffer = buf; - rdata.buffer_std = TRUE; - rdata.data = (char *) &data; - rdata.len = sizeof(ginxlogInsertDataInternal); - rdata.next = NULL; + data.offset = off; + data.newitem = *pitem; - return INSERTED; + rdata[0].buffer = buf; + rdata[0].buffer_std = true; + rdata[0].data = (char *) &data; + rdata[0].len = sizeof(ginxlogInsertDataInternal); + rdata[0].next = NULL; + } } /* - * Places an item (or items) to a posting tree. Calls relevant function of - * internal of leaf page because they are handled very differently. + * Prepare to insert data on a posting-tree data page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. Also, + * if WAL logging is needed, fill one or more entries of rdata[] with + * whatever data must be appended to the WAL record. + * + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. + * + * Calls relevant function for internal or leaf page because they are handled + * very differently. */ static GinPlaceToPageRC -dataPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, - void *insertdata, BlockNumber updateblkno, - XLogRecData **prdata, - Page *newlpage, Page *newrpage) +dataBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage, + XLogRecData *rdata) { Page page = BufferGetPage(buf); Assert(GinPageIsData(page)); if (GinPageIsLeaf(page)) - return dataPlaceToPageLeaf(btree, buf, stack, insertdata, - prdata, newlpage, newrpage); + return dataBeginPlaceToPageLeaf(btree, buf, stack, insertdata, + ptp_workspace, + newlpage, newrpage, rdata); + else + return dataBeginPlaceToPageInternal(btree, buf, stack, + insertdata, updateblkno, + ptp_workspace, + newlpage, newrpage, rdata); +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section. It must modify the target + * buffer and store one or more XLogRecData records describing the changes + * in rdata[]. + * + * Calls relevant function for internal or leaf page because they are handled + * very differently. + */ +static void +dataExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void *ptp_workspace, + XLogRecData *rdata) +{ + Page page = BufferGetPage(buf); + + if (GinPageIsLeaf(page)) + dataExecPlaceToPageLeaf(btree, buf, stack, insertdata, + ptp_workspace, rdata); else - return dataPlaceToPageInternal(btree, buf, stack, - insertdata, updateblkno, - prdata, newlpage, newrpage); + dataExecPlaceToPageInternal(btree, buf, stack, insertdata, + updateblkno, ptp_workspace, rdata); } /* - * Split page and fill WAL record. Returns a new temp buffer filled with data - * that should go to the left page. The original buffer is left untouched. + * Split internal page and insert new data. + * + * Returns new temp pages to *newlpage and *newrpage. + * The original buffer is left untouched. */ static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, - XLogRecData **prdata, Page *newlpage, Page *newrpage) + Page *newlpage, Page *newrpage, XLogRecData *rdata) { Page oldpage = BufferGetPage(origbuf); OffsetNumber off = stack->off; @@ -1218,7 +1317,6 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf, /* these must be static so they can be returned to caller */ static ginxlogSplitDataInternal data; - static XLogRecData rdata[4]; static PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1]; lpage = PageGetTempPage(oldpage); @@ -1226,8 +1324,6 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf, GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize); GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize); - *prdata = rdata; - /* * First construct a new list of PostingItems, which includes all the old * items, and the new item. @@ -1277,6 +1373,7 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf, /* set up right bound for right page */ *GinDataPageGetRightBound(rpage) = oldbound; + /* Set up WAL data */ data.separator = separator; data.nitem = nitems; data.rightbound = oldbound; @@ -1291,6 +1388,7 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf, rdata[1].len = nitems * sizeof(PostingItem); rdata[1].next = NULL; + /* return temp pages to caller */ *newlpage = lpage; *newrpage = rpage; } @@ -1855,7 +1953,8 @@ ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno) btree->isMoveRight = dataIsMoveRight; btree->findItem = NULL; btree->findChildPtr = dataFindChildPtr; - btree->placeToPage = dataPlaceToPage; + btree->beginPlaceToPage = dataBeginPlaceToPage; + btree->execPlaceToPage = dataExecPlaceToPage; btree->fillRoot = ginDataFillRoot; btree->prepareDownlink = dataPrepareDownlink; |