1 file changed, 116 insertions, 127 deletions
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
index d9da5e1ee37..4cb3266a944 100644
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashinsert.c,v 1.27 2003/08/04 02:39:57 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashinsert.c,v 1.27.2.1 2003/09/07 04:36:47 momjian Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -16,136 +16,124 @@
 #include "postgres.h"
 
 #include "access/hash.h"
+#include "storage/lmgr.h"
+
+
+static OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf,
+								   Size itemsize, HashItem hitem);
 
-static InsertIndexResult _hash_insertonpg(Relation rel, Buffer buf, int keysz, ScanKey scankey, HashItem hitem, Buffer metabuf);
-static OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, HashItem hitem);
 
 /*
  *	_hash_doinsert() -- Handle insertion of a single HashItem in the table.
  *
  *		This routine is called by the public interface routines, hashbuild
- *		and hashinsert.  By here, hashitem is filled in, and has a unique
- *		(xid, seqno) pair. The datum to be used as a "key" is in the
- *		hashitem.
+ *		and hashinsert.  By here, hashitem is completely filled in.
+ *		The datum to be used as a "key" is in the hashitem.
  */
 InsertIndexResult
 _hash_doinsert(Relation rel, HashItem hitem)
 {
 	Buffer		buf;
 	Buffer		metabuf;
-	BlockNumber blkno;
 	HashMetaPage metap;
 	IndexTuple	itup;
+	BlockNumber itup_blkno;
+	OffsetNumber itup_off;
 	InsertIndexResult res;
-	ScanKey		itup_scankey;
-	int			natts;
+	BlockNumber blkno;
 	Page		page;
-
-	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
-	_hash_checkpage((Page) metap, LH_META_PAGE);
-
-	/* we need a scan key to do our search, so build one */
-	itup = &(hitem->hash_itup);
-	if ((natts = rel->rd_rel->relnatts) != 1)
-		elog(ERROR, "Hash indexes support only one index key");
-	itup_scankey = _hash_mkscankey(rel, itup);
+	HashPageOpaque pageopaque;
+	Size		itemsz;
+	bool		do_expand;
+	uint32		hashkey;
+	Bucket		bucket;
+	Datum		datum;
+	bool		isnull;
 
 	/*
-	 * find the first page in the bucket chain containing this key and
-	 * place it in buf.  _hash_search obtains a read lock for us.
+	 * Compute the hash key for the item.  We do this first so as not to
+	 * need to hold any locks while running the hash function.
 	 */
-	_hash_search(rel, natts, itup_scankey, &buf, metap);
-	page = BufferGetPage(buf);
-	_hash_checkpage(page, LH_BUCKET_PAGE);
+	itup = &(hitem->hash_itup);
+	if (rel->rd_rel->relnatts != 1)
+		elog(ERROR, "hash indexes support only one index key");
+	datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull);
+	Assert(!isnull);
+	hashkey = _hash_datum2hashkey(rel, datum);
+
+	/* compute item size too */
+	itemsz = IndexTupleDSize(hitem->hash_itup)
+		+ (sizeof(HashItemData) - sizeof(IndexTupleData));
+
+	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but
+								 * we need to be consistent */
 
 	/*
-	 * trade in our read lock for a write lock so that we can do the
-	 * insertion.
+	 * Acquire shared split lock so we can compute the target bucket
+	 * safely (see README).
 	 */
-	blkno = BufferGetBlockNumber(buf);
-	_hash_relbuf(rel, buf, HASH_READ);
-	buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+	_hash_getlock(rel, 0, HASH_SHARE);
 
+	/* Read the metapage */
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+	metap = (HashMetaPage) BufferGetPage(metabuf);
+	_hash_checkpage(rel, (Page) metap, LH_META_PAGE);
 
 	/*
-	 * XXX btree comment (haven't decided what to do in hash): don't think
-	 * the bucket can be split while we're reading the metapage.
-	 *
-	 * If the page was split between the time that we surrendered our read
-	 * lock and acquired our write lock, then this page may no longer be
-	 * the right place for the key we want to insert.
+	 * Check whether the item can fit on a hash page at all. (Eventually,
+	 * we ought to try to apply TOAST methods if not.)  Note that at this
+	 * point, itemsz doesn't include the ItemId.
 	 */
+	if (itemsz > HashMaxItemSize((Page) metap))
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("index tuple size %lu exceeds hash maximum, %lu",
+						(unsigned long) itemsz,
+						(unsigned long) HashMaxItemSize((Page) metap))));
 
-	/* do the insertion */
-	res = _hash_insertonpg(rel, buf, natts, itup_scankey,
-						   hitem, metabuf);
+	/*
+	 * Compute the target bucket number, and convert to block number.
+	 */
+	bucket = _hash_hashkey2bucket(hashkey,
+								  metap->hashm_maxbucket,
+								  metap->hashm_highmask,
+								  metap->hashm_lowmask);
 
-	/* be tidy */
-	_hash_freeskey(itup_scankey);
+	blkno = BUCKET_TO_BLKNO(metap, bucket);
 
-	return res;
-}
+	/* release lock on metapage, but keep pin since we'll need it again */
+	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
 
-/*
- *	_hash_insertonpg() -- Insert a tuple on a particular page in the table.
- *
- *		This recursive procedure does the following things:
- *
- *			+  if necessary, splits the target page.
- *			+  inserts the tuple.
- *
- *		On entry, we must have the right buffer on which to do the
- *		insertion, and the buffer must be pinned and locked.  On return,
- *		we will have dropped both the pin and the write lock on the buffer.
- *
- */
-static InsertIndexResult
-_hash_insertonpg(Relation rel,
-				 Buffer buf,
-				 int keysz,
-				 ScanKey scankey,
-				 HashItem hitem,
-				 Buffer metabuf)
-{
-	InsertIndexResult res;
-	Page		page;
-	BlockNumber itup_blkno;
-	OffsetNumber itup_off;
-	Size		itemsz;
-	HashPageOpaque pageopaque;
-	bool		do_expand = false;
-	Buffer		ovflbuf;
-	HashMetaPage metap;
-	Bucket		bucket;
+	/*
+	 * Acquire share lock on target bucket; then we can release split lock.
+	 */
+	_hash_getlock(rel, blkno, HASH_SHARE);
 
-	metap = (HashMetaPage) BufferGetPage(metabuf);
-	_hash_checkpage((Page) metap, LH_META_PAGE);
+	_hash_droplock(rel, 0, HASH_SHARE);
 
+	/* Fetch the primary bucket page for the bucket */
+	buf = _hash_getbuf(rel, blkno, HASH_WRITE);
 	page = BufferGetPage(buf);
-	_hash_checkpage(page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+	_hash_checkpage(rel, page, LH_BUCKET_PAGE);
 	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
-	bucket = pageopaque->hasho_bucket;
-
-	itemsz = IndexTupleDSize(hitem->hash_itup)
-		+ (sizeof(HashItemData) - sizeof(IndexTupleData));
-	itemsz = MAXALIGN(itemsz);
+	Assert(pageopaque->hasho_bucket == bucket);
 
+	/* Do the insertion */
 	while (PageGetFreeSpace(page) < itemsz)
 	{
 		/*
 		 * no space on this page; check for an overflow page
 		 */
-		if (BlockNumberIsValid(pageopaque->hasho_nextblkno))
+		BlockNumber	nextblkno = pageopaque->hasho_nextblkno;
+
+		if (BlockNumberIsValid(nextblkno))
 		{
 			/*
 			 * ovfl page exists; go get it.  if it doesn't have room,
 			 * we'll find out next pass through the loop test above.
 			 */
-			ovflbuf = _hash_getbuf(rel, pageopaque->hasho_nextblkno,
-								   HASH_WRITE);
-			_hash_relbuf(rel, buf, HASH_WRITE);
-			buf = ovflbuf;
+			_hash_relbuf(rel, buf);
+			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE);
 			page = BufferGetPage(buf);
 		}
 		else
@@ -154,68 +142,72 @@ _hash_insertonpg(Relation rel,
 			 * we're at the end of the bucket chain and we haven't found a
 			 * page with enough room.  allocate a new overflow page.
 			 */
-			do_expand = true;
-			ovflbuf = _hash_addovflpage(rel, &metabuf, buf);
-			_hash_relbuf(rel, buf, HASH_WRITE);
-			buf = ovflbuf;
+
+			/* release our write lock without modifying buffer (hence HASH_READ) */
+			_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
+
+			/* chain to a new overflow page */
+			buf = _hash_addovflpage(rel, metabuf, buf);
 			page = BufferGetPage(buf);
 
-			if (PageGetFreeSpace(page) < itemsz)
-			{
-				/* it doesn't fit on an empty page -- give up */
-				elog(ERROR, "hash item too large");
-			}
+			/* should fit now, given the HashMaxItemSize test above */
+			Assert(PageGetFreeSpace(page) >= itemsz);
 		}
-		_hash_checkpage(page, LH_OVERFLOW_PAGE);
+		_hash_checkpage(rel, page, LH_OVERFLOW_PAGE);
 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
 		Assert(pageopaque->hasho_bucket == bucket);
 	}
 
-	itup_off = _hash_pgaddtup(rel, buf, keysz, scankey, itemsz, hitem);
+	/* found page with enough space, so add the item here */
+	itup_off = _hash_pgaddtup(rel, buf, itemsz, hitem);
 	itup_blkno = BufferGetBlockNumber(buf);
 
-	/* by here, the new tuple is inserted */
-	res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
+	/* write and release the modified page */
+	_hash_wrtbuf(rel, buf);
 
-	ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
+	/* We can drop the bucket lock now */
+	_hash_droplock(rel, blkno, HASH_SHARE);
 
-	if (res != NULL)
-	{
-		/*
-		 * Increment the number of keys in the table. We switch lock
-		 * access type just for a moment to allow greater accessibility to
-		 * the metapage.
-		 */
-		metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf,
-												  HASH_READ, HASH_WRITE);
-		metap->hashm_nkeys += 1;
-		metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf,
-												  HASH_WRITE, HASH_READ);
+	/*
+	 * Write-lock the metapage so we can increment the tuple count.
+	 * After incrementing it, check to see if it's time for a split.
+	 */
+	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
 
-	}
+	metap->hashm_ntuples += 1;
 
-	_hash_wrtbuf(rel, buf);
+	/* Make sure this stays in sync with _hash_expandtable() */
+	do_expand = metap->hashm_ntuples >
+		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);
 
-	if (do_expand ||
-		(metap->hashm_nkeys / (metap->hashm_maxbucket + 1))
-		> metap->hashm_ffactor)
+	/* Write out the metapage and drop lock, but keep pin */
+	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
+
+	/* Attempt to split if a split is needed */
+	if (do_expand)
 		_hash_expandtable(rel, metabuf);
-	_hash_relbuf(rel, metabuf, HASH_READ);
+
+	/* Finally drop our pin on the metapage */
+	_hash_dropbuf(rel, metabuf);
+
+	/* Create the return data structure */
+	res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
+
+	ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
+
 	return res;
 }
 
 /*
  *	_hash_pgaddtup() -- add a tuple to a particular page in the index.
  *
- *		This routine adds the tuple to the page as requested, and keeps the
- *		write lock and reference associated with the page's buffer.  It is
- *		an error to call pgaddtup() without a write lock and reference.
+ *		This routine adds the tuple to the page as requested; it does
+ *		not write out the page.  It is an error to call pgaddtup() without
+ *		a write lock and pin.
  */
 static OffsetNumber
 _hash_pgaddtup(Relation rel,
 			   Buffer buf,
-			   int keysz,
-			   ScanKey itup_scankey,
 			   Size itemsize,
 			   HashItem hitem)
 {
@@ -223,7 +215,7 @@ _hash_pgaddtup(Relation rel,
 	Page		page;
 
 	page = BufferGetPage(buf);
-	_hash_checkpage(page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+	_hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
 
 	itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
 	if (PageAddItem(page, (Item) hitem, itemsize, itup_off, LP_USED)
@@ -231,8 +223,5 @@ _hash_pgaddtup(Relation rel,
 		elog(ERROR, "failed to add index item to \"%s\"",
 			 RelationGetRelationName(rel));
 
-	/* write the buffer, but hold our lock */
-	_hash_wrtnorelbuf(buf);
-
 	return itup_off;
 }