author     Alvaro Herrera <alvherre@alvh.no-ip.org>   2014-11-07 16:38:14 -0300
committer  Alvaro Herrera <alvherre@alvh.no-ip.org>   2014-11-07 16:38:14 -0300
commit     7516f5259411c02ae89e49084452dc342aadb2ae (patch)
tree       e1c84a6dacca20c8070823b26cbaa30c24670b1f /src/backend/storage/page/bufpage.c
parent     1961b1c131e4211f3dc86ff2be971e430ab2a23d (diff)
BRIN: Block Range Indexes
BRIN is a new index access method intended to accelerate scans of very
large tables, without the maintenance overhead of btrees or other
traditional indexes. These indexes work by maintaining "summary" data
about block ranges. Bitmap index scans read each summary tuple and
compare it with the query quals; if the quals are consistent with the
values in the summary tuple, all pages in the range are returned in a
lossy TID bitmap, otherwise none are. Normal index scans are not
supported because these indexes do not store TIDs.
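A rough standalone sketch of that scan strategy follows (a simplified
model only; RangeSummary, range_consistent and PAGES_PER_RANGE are
illustrative names, not the actual BRIN structures):

#include <stdbool.h>
#include <stdio.h>

#define PAGES_PER_RANGE 4   /* summary granularity, analogous to pages_per_range */

typedef struct RangeSummary
{
    bool    has_summary;    /* false: range not summarized yet */
    int     min;            /* smallest indexed value seen in the range */
    int     max;            /* largest indexed value seen in the range */
} RangeSummary;

/* A range must be scanned unless the summary proves the qual cannot match. */
static bool
range_consistent(const RangeSummary *s, int qual_lo, int qual_hi)
{
    if (!s->has_summary)
        return true;        /* unsummarized ranges are always returned */
    return s->max >= qual_lo && s->min <= qual_hi;
}

int
main(void)
{
    /* three block ranges with toy min/max summaries */
    RangeSummary summaries[] = {
        {true, 1, 90}, {true, 200, 250}, {false, 0, 0}
    };
    int nranges = sizeof(summaries) / sizeof(summaries[0]);
    int qual_lo = 100, qual_hi = 220;   /* e.g. WHERE col BETWEEN 100 AND 220 */

    for (int r = 0; r < nranges; r++)
    {
        if (!range_consistent(&summaries[r], qual_lo, qual_hi))
            continue;       /* skip every page in this range */

        /* all pages of a consistent range enter the lossy TID bitmap */
        for (int page = r * PAGES_PER_RANGE; page < (r + 1) * PAGES_PER_RANGE; page++)
            printf("add page %d to lossy bitmap\n", page);
    }
    return 0;
}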
As new tuples are added to the index, the summary information is
updated if the block range into which the tuple falls is already
summarized; otherwise the range is left unsummarized, and a subsequent
pass of VACUUM or the brin_summarize_new_values() function will create
the summary information.
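The insert-time behaviour can be sketched in the same simplified model
(note_new_value is an invented name, not the real code path):

#include <stdbool.h>
#include <stdio.h>

typedef struct RangeSummary { bool has_summary; int min, max; } RangeSummary;

static void
note_new_value(RangeSummary *s, int value)
{
    if (!s->has_summary)
        return;                 /* left for VACUUM / brin_summarize_new_values() */
    if (value < s->min)
        s->min = value;         /* widen the lower bound */
    if (value > s->max)
        s->max = value;         /* widen the upper bound */
}

int
main(void)
{
    RangeSummary summarized = {true, 10, 20};
    RangeSummary unsummarized = {false, 0, 0};

    note_new_value(&summarized, 35);      /* widens the summary to [10, 35] */
    note_new_value(&unsummarized, 35);    /* no-op: range stays unsummarized */

    printf("summarized range now covers [%d, %d]\n", summarized.min, summarized.max);
    return 0;
}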
For data types with natural 1-D sort orders, the summary info consists
of the minimum and maximum values of each indexed column within each
page range. We call this type of operator class "Minmax", and we supply
one for most data types that have B-tree opclasses. Since the BRIN code
is generalized, other approaches are possible for things such as arrays,
geometric types, ranges, etc.; even for things such as enum types we
could do something different from minmax with better results. This
commit includes only minmax.
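A minimal sketch of what a summarization pass computes for a minmax
opclass, under the same simplified model (summarize_range is an
invented name):

#include <stdbool.h>
#include <stdio.h>

typedef struct RangeSummary { bool has_summary; int min, max; } RangeSummary;

/* Compute the minmax summary over the indexed values of one block range. */
static RangeSummary
summarize_range(const int *values, int nvalues)
{
    RangeSummary s = {false, 0, 0};

    for (int i = 0; i < nvalues; i++)
    {
        if (!s.has_summary)
        {
            s.has_summary = true;
            s.min = s.max = values[i];
        }
        else
        {
            if (values[i] < s.min)
                s.min = values[i];
            if (values[i] > s.max)
                s.max = values[i];
        }
    }
    return s;
}

int
main(void)
{
    int heap_values[] = {42, 7, 130, 65, 19};   /* indexed column values in one range */
    RangeSummary s = summarize_range(heap_values, 5);

    printf("minmax summary: [%d, %d]\n", s.min, s.max);
    return 0;
}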
Catalog version bumped due to new builtin catalog entries.
There's more that could be done here, but this is a good step forwards.
Loosely based on ideas from Simon Riggs; code mostly by Álvaro Herrera,
with contribution by Heikki Linnakangas.
Patch reviewed by: Amit Kapila, Heikki Linnakangas, Robert Haas.
Testing help from Jeff Janes, Erik Rijkers, Emanuel Calvo.
PS:
The research leading to these results has received funding from the
European Union's Seventh Framework Programme (FP7/2007-2013) under
grant agreement n° 318633.
Diffstat (limited to 'src/backend/storage/page/bufpage.c')
-rw-r--r--   src/backend/storage/page/bufpage.c   179
1 file changed, 178 insertions(+), 1 deletion(-)
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 6351a9bea47..2b858c82719 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -399,7 +399,8 @@ PageRestoreTempPage(Page tempPage, Page oldPage)
 }
 
 /*
- * sorting support for PageRepairFragmentation and PageIndexMultiDelete
+ * sorting support for PageRepairFragmentation, PageIndexMultiDelete,
+ * PageIndexDeleteNoCompact
  */
 typedef struct itemIdSortData
 {
@@ -896,6 +897,182 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
 	phdr->pd_upper = upper;
 }
 
+/*
+ * PageIndexDeleteNoCompact
+ *		Delete the given items for an index page, and defragment the resulting
+ *		free space, but do not compact the item pointers array.
+ *
+ * itemnos is the array of tuples to delete; nitems is its size.  maxIdxTuples
+ * is the maximum number of tuples that can exist in a page.
+ *
+ * Unused items at the end of the array are removed.
+ *
+ * This is used for index AMs that require that existing TIDs of live tuples
+ * remain unchanged.
+ */
+void
+PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos, int nitems)
+{
+	PageHeader	phdr = (PageHeader) page;
+	LocationIndex pd_lower = phdr->pd_lower;
+	LocationIndex pd_upper = phdr->pd_upper;
+	LocationIndex pd_special = phdr->pd_special;
+	int			nline;
+	bool		empty;
+	OffsetNumber offnum;
+	int			nextitm;
+
+	/*
+	 * As with PageRepairFragmentation, paranoia seems justified.
+	 */
+	if (pd_lower < SizeOfPageHeaderData ||
+		pd_lower > pd_upper ||
+		pd_upper > pd_special ||
+		pd_special > BLCKSZ ||
+		pd_special != MAXALIGN(pd_special))
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+						pd_lower, pd_upper, pd_special)));
+
+	/*
+	 * Scan the existing item pointer array and mark as unused those that are
+	 * in our kill-list; make sure any non-interesting ones are marked unused
+	 * as well.
+	 */
+	nline = PageGetMaxOffsetNumber(page);
+	empty = true;
+	nextitm = 0;
+	for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
+	{
+		ItemId		lp;
+		ItemLength	itemlen;
+		ItemOffset	offset;
+
+		lp = PageGetItemId(page, offnum);
+
+		itemlen = ItemIdGetLength(lp);
+		offset = ItemIdGetOffset(lp);
+
+		if (ItemIdIsUsed(lp))
+		{
+			if (offset < pd_upper ||
+				(offset + itemlen) > pd_special ||
+				offset != MAXALIGN(offset))
+				ereport(ERROR,
+						(errcode(ERRCODE_DATA_CORRUPTED),
+						 errmsg("corrupted item pointer: offset = %u, length = %u",
+								offset, (unsigned int) itemlen)));
+
+			if (nextitm < nitems && offnum == itemnos[nextitm])
+			{
+				/* this one is on our list to delete, so mark it unused */
+				ItemIdSetUnused(lp);
+				nextitm++;
+			}
+			else if (ItemIdHasStorage(lp))
+			{
+				/* This one's live -- must do the compaction dance */
+				empty = false;
+			}
+			else
+			{
+				/* get rid of this one too */
+				ItemIdSetUnused(lp);
+			}
+		}
+	}
+
+	/* this will catch invalid or out-of-order itemnos[] */
+	if (nextitm != nitems)
+		elog(ERROR, "incorrect index offsets supplied");
+
+	if (empty)
+	{
+		/* Page is completely empty, so just reset it quickly */
+		phdr->pd_lower = SizeOfPageHeaderData;
+		phdr->pd_upper = pd_special;
+	}
+	else
+	{
+		/* There are live items: need to compact the page the hard way */
+		itemIdSortData itemidbase[MaxOffsetNumber];
+		itemIdSort	itemidptr;
+		int			i;
+		Size		totallen;
+		Offset		upper;
+
+		/*
+		 * Scan the page taking note of each item that we need to preserve.
+		 * This includes both live items (those that contain data) and
+		 * interspersed unused ones.  It's critical to preserve these unused
+		 * items, because otherwise the offset numbers for later live items
+		 * would change, which is not acceptable.  Unused items might get used
+		 * again later; that is fine.
+		 */
+		itemidptr = itemidbase;
+		totallen = 0;
+		for (i = 0; i < nline; i++, itemidptr++)
+		{
+			ItemId		lp;
+
+			itemidptr->offsetindex = i;
+
+			lp = PageGetItemId(page, i + 1);
+			if (ItemIdHasStorage(lp))
+			{
+				itemidptr->itemoff = ItemIdGetOffset(lp);
+				itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp));
+				totallen += itemidptr->alignedlen;
+			}
+			else
+			{
+				itemidptr->itemoff = 0;
+				itemidptr->alignedlen = 0;
+			}
+		}
+		/* By here, there are exactly nline elements in itemidbase array */
+
+		if (totallen > (Size) (pd_special - pd_lower))
+			ereport(ERROR,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("corrupted item lengths: total %u, available space %u",
+							(unsigned int) totallen, pd_special - pd_lower)));
+
+		/* sort itemIdSortData array into decreasing itemoff order */
+		qsort((char *) itemidbase, nline, sizeof(itemIdSortData),
+			  itemoffcompare);
+
+		/*
+		 * Defragment the data areas of each tuple, being careful to preserve
+		 * each item's position in the linp array.
+		 */
+		upper = pd_special;
+		PageClearHasFreeLinePointers(page);
+		for (i = 0, itemidptr = itemidbase; i < nline; i++, itemidptr++)
+		{
+			ItemId		lp;
+
+			lp = PageGetItemId(page, itemidptr->offsetindex + 1);
+			if (itemidptr->alignedlen == 0)
+			{
+				PageSetHasFreeLinePointers(page);
+				ItemIdSetUnused(lp);
+				continue;
+			}
+			upper -= itemidptr->alignedlen;
+			memmove((char *) page + upper,
+					(char *) page + itemidptr->itemoff,
+					itemidptr->alignedlen);
+			lp->lp_off = upper;
+			/* lp_flags and lp_len remain the same as originally */
+		}
+
+		/* Set the new page limits */
+		phdr->pd_upper = upper;
+		phdr->pd_lower = SizeOfPageHeaderData + i * sizeof(ItemIdData);
+	}
+}
 
 /*
  * Set checksum for a page in shared buffers.
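The essential property of the new PageIndexDeleteNoCompact above, that
surviving items keep their offset numbers (so TIDs stored elsewhere, as
BRIN requires, stay valid) while the data area is reclaimed, can be
shown with a standalone toy model; ToyItemId and delete_no_compact are
invented names, not the bufpage.c API:

#include <stdbool.h>
#include <stdio.h>

typedef struct ToyItemId
{
    bool    used;       /* like lp_flags: is this slot live? */
    int     data_off;   /* like lp_off: where the tuple data lives */
} ToyItemId;

static void
delete_no_compact(ToyItemId *items, int nitems, int target)
{
    if (target < 0 || target >= nitems)
        return;
    /*
     * Mark the slot unused but keep it in place, so the slots after it
     * keep their offset numbers (and therefore their TIDs).
     */
    items[target].used = false;
    items[target].data_off = 0;
}

int
main(void)
{
    ToyItemId items[] = {{true, 8000}, {true, 7900}, {true, 7800}};

    delete_no_compact(items, 3, 1);     /* delete offset number 2 */

    /*
     * Offset number 3 still refers to the same tuple; a compacting delete
     * would have renumbered it to 2 and invalidated any stored TIDs.
     */
    for (int i = 0; i < 3; i++)
        printf("offset %d: %s (data at %d)\n",
               i + 1, items[i].used ? "used" : "unused", items[i].data_off);
    return 0;
}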