author     Heikki Linnakangas <heikki.linnakangas@iki.fi>  2024-02-23 16:10:51 +0200
committer  Heikki Linnakangas <heikki.linnakangas@iki.fi>  2024-02-23 16:10:51 +0200
commit     8af256524893987a3e534c6578dd60edfb782a77
tree       76761046e9f1fd4e18abc502e208c76132b7b1c7 /src/backend/access/heap/rewriteheap.c
parent     e612384fc78d35c3d3a8b3d27cef5181dca8430b

Introduce a new smgr bulk loading facility.
The new facility makes it easier to optimize bulk loading, as the
logic for buffering, WAL-logging, and syncing the relation only needs
to be implemented once. It's also less error-prone: we have had a
number of bugs in how a relation is fsync'd, or not, at the end of a
bulk loading operation. By centralizing that logic in one place, we
only need to write it correctly once.
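
As a rough sketch of the caller-facing pattern, grounded in the
rewriteheap.c changes in the diff below (the wrapper function and
variable names here are illustrative, not part of the patch):

#include "postgres.h"

#include "storage/bufpage.h"
#include "storage/bulk_write.h"
#include "utils/rel.h"

/*
 * Illustrative only: append one freshly built page to a new relation's main
 * fork through the bulk writer.  Buffering, WAL-logging and syncing all
 * happen inside the facility.
 */
static void
example_bulk_load(Relation new_rel)
{
	BulkWriteState *bulkstate;
	BulkWriteBuffer buf;
	BlockNumber blkno = 0;

	bulkstate = smgr_bulk_start_rel(new_rel, MAIN_FORKNUM);

	/* Ask the bulk writer for a buffer and build a page in it... */
	buf = smgr_bulk_get_buf(bulkstate);
	PageInit((Page) buf, BLCKSZ, 0);
	/* ... add tuples to the page here ... */

	/* ... then hand it back; the writer takes over the buffer from here. */
	smgr_bulk_write(bulkstate, blkno, buf, true);	/* true = standard page layout */

	/* WAL-logs any pending pages and arranges for the relation to be synced */
	smgr_bulk_finish(bulkstate);
}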
The new facility is faster for small relations: instead of calling
smgrimmedsync(), we register the fsync to happen at the next checkpoint,
which avoids the fsync latency. That can make a big difference if you
are e.g. restoring a schema-only dump with lots of relations.
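
For reference, this is the per-relation call that goes away (it is
visible among the removed lines of end_heap_rewrite() in the diff
below); a minimal sketch of the pre-patch pattern:

#include "postgres.h"

#include "storage/smgr.h"
#include "utils/rel.h"

/*
 * Pre-patch pattern: the backend itself fsyncs the newly written relation
 * before returning, paying the fsync latency once per relation.  With the
 * bulk writer this call disappears; smgr_bulk_finish() instead arranges for
 * the sync to happen at the next checkpoint.
 */
static void
old_style_finish(Relation new_rel)
{
	if (RelationNeedsWAL(new_rel))
		smgrimmedsync(RelationGetSmgr(new_rel), MAIN_FORKNUM);
}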
It is also slightly more efficient with large relations, as the
WAL-logging is done multiple pages at a time, which avoids some
per-record WAL header overhead. The sorted GiST index build already did
this; its buffering now moves into the new facility.
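
As a sketch of why this helps: a batch of new pages can share a single
WAL record, so the per-record header is paid once per batch rather than
once per page. The flush helper below is illustrative, not the
facility's actual internals; log_newpages() is the existing multi-page
counterpart of log_newpage() that the sorted GiST build used.

#include "postgres.h"

#include "access/xloginsert.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/*
 * Illustrative flush of a batch of freshly built pages: one WAL record
 * covers the whole batch instead of one log_newpage() record per page.
 */
static void
flush_page_batch(Relation rel, int npages, BlockNumber *blknos, Page *pages)
{
	if (RelationNeedsWAL(rel))
		log_newpages(&rel->rd_locator, MAIN_FORKNUM, npages, blknos, pages, true);

	/* ... then write the pages out through smgr as before ... */
}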
The change to the pageinspect GiST test needs an explanation: before
this patch, the sorted GiST index build set the LSN on every page to the
special GistBuildLSN value rather than the LSN of the WAL record, even
though the pages were WAL-logged. There was no particular need for that;
it just happened naturally because we wrote out the pages before
WAL-logging them. Now we WAL-log the pages first, as in a B-tree build,
so the pages are stamped with the record's real LSN. When the build is
not WAL-logged, we still use GistBuildLSN. To make the test output
predictable, the test now uses an unlogged index.
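
To make the LSN behaviour concrete, here is a minimal sketch of the two
cases (this is not the actual gistbuild.c code; the helper function is
made up for illustration):

#include "postgres.h"

#include "access/gist.h"
#include "access/xloginsert.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/*
 * Sketch of the page-LSN stamping described above: a WAL-logged build ends
 * up with the WAL record's real LSN on each page, while a build that is not
 * WAL-logged (e.g. an unlogged index) keeps the fixed GistBuildLSN sentinel,
 * which is why the pageinspect test switches to an unlogged index.
 */
static void
stamp_build_page(Relation index, BlockNumber blkno, Page page)
{
	if (RelationNeedsWAL(index))
	{
		/* log_newpage() stamps the (initialized) page with the record's LSN */
		(void) log_newpage(&index->rd_locator, MAIN_FORKNUM, blkno, page, true);
	}
	else
		PageSetLSN(page, GistBuildLSN); /* fixed sentinel value */
}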
Reviewed-by: Andres Freund
Discussion: https://www.postgresql.org/message-id/30e8f366-58b3-b239-c521-422122dd5150%40iki.fi
Diffstat (limited to 'src/backend/access/heap/rewriteheap.c')
-rw-r--r--   src/backend/access/heap/rewriteheap.c | 72
1 file changed, 20 insertions(+), 52 deletions(-)
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 34107323ffe..a578b876174 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -87,8 +87,8 @@
  * is optimized for bulk inserting a lot of tuples, knowing that we have
  * exclusive access to the heap. raw_heap_insert builds new pages in
  * local storage. When a page is full, or at the end of the process,
- * we insert it to WAL as a single record and then write it to disk
- * directly through smgr. Note, however, that any data sent to the new
+ * we insert it to WAL as a single record and then write it to disk with
+ * the bulk smgr writer. Note, however, that any data sent to the new
  * heap's TOAST table will go through the normal bufmgr.
  *
  *
@@ -119,9 +119,9 @@
 #include "replication/logical.h"
 #include "replication/slot.h"
 #include "storage/bufmgr.h"
+#include "storage/bulk_write.h"
 #include "storage/fd.h"
 #include "storage/procarray.h"
-#include "storage/smgr.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
@@ -133,9 +133,9 @@ typedef struct RewriteStateData
 {
 	Relation	rs_old_rel;		/* source heap */
 	Relation	rs_new_rel;		/* destination heap */
-	Page		rs_buffer;		/* page currently being built */
+	BulkWriteState *rs_bulkstate;	/* writer for the destination */
+	BulkWriteBuffer rs_buffer;	/* page currently being built */
 	BlockNumber rs_blockno;		/* block where page will go */
-	bool		rs_buffer_valid;	/* T if any tuples in buffer */
 	bool		rs_logical_rewrite; /* do we need to do logical rewriting */
 	TransactionId rs_oldest_xmin;	/* oldest xmin used by caller to determine
 									 * tuple visibility */
@@ -255,14 +255,14 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 
 	state->rs_old_rel = old_heap;
 	state->rs_new_rel = new_heap;
-	state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+	state->rs_buffer = NULL;
 	/* new_heap needn't be empty, just locked */
 	state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
-	state->rs_buffer_valid = false;
 	state->rs_oldest_xmin = oldest_xmin;
 	state->rs_freeze_xid = freeze_xid;
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
+	state->rs_bulkstate = smgr_bulk_start_rel(new_heap, MAIN_FORKNUM);
 
 	/* Initialize hash tables used to track update chains */
 	hash_ctl.keysize = sizeof(TidHashKey);
@@ -314,30 +314,13 @@ end_heap_rewrite(RewriteState state)
 	}
 
 	/* Write the last page, if any */
-	if (state->rs_buffer_valid)
+	if (state->rs_buffer)
 	{
-		if (RelationNeedsWAL(state->rs_new_rel))
-			log_newpage(&state->rs_new_rel->rd_locator,
-						MAIN_FORKNUM,
-						state->rs_blockno,
-						state->rs_buffer,
-						true);
-
-		PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
-
-		smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
-				   state->rs_blockno, state->rs_buffer, true);
+		smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true);
+		state->rs_buffer = NULL;
 	}
 
-	/*
-	 * When we WAL-logged rel pages, we must nonetheless fsync them. The
-	 * reason is the same as in storage.c's RelationCopyStorage(): we're
-	 * writing data that's not in shared buffers, and so a CHECKPOINT
-	 * occurring during the rewriteheap operation won't have fsync'd data we
-	 * wrote before the checkpoint.
-	 */
-	if (RelationNeedsWAL(state->rs_new_rel))
-		smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM);
+	smgr_bulk_finish(state->rs_bulkstate);
 
 	logical_end_heap_rewrite(state);
 
@@ -611,7 +594,7 @@ rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple)
 static void
 raw_heap_insert(RewriteState state, HeapTuple tup)
 {
-	Page		page = state->rs_buffer;
+	Page		page;
 	Size		pageFreeSpace,
 				saveFreeSpace;
 	Size		len;
@@ -664,7 +647,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 												   HEAP_DEFAULT_FILLFACTOR);
 
 	/* Now we can check to see if there's enough free space already. */
-	if (state->rs_buffer_valid)
+	page = (Page) state->rs_buffer;
+	if (page)
 	{
 		pageFreeSpace = PageGetHeapFreeSpace(page);
 
@@ -675,35 +659,19 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			 * contains a tuple. Hence, unlike RelationGetBufferForTuple(),
 			 * enforce saveFreeSpace unconditionally.
 			 */
-
-			/* XLOG stuff */
-			if (RelationNeedsWAL(state->rs_new_rel))
-				log_newpage(&state->rs_new_rel->rd_locator,
-							MAIN_FORKNUM,
-							state->rs_blockno,
-							page,
-							true);
-
-			/*
-			 * Now write the page. We say skipFsync = true because there's no
-			 * need for smgr to schedule an fsync for this write; we'll do it
-			 * ourselves in end_heap_rewrite.
-			 */
-			PageSetChecksumInplace(page, state->rs_blockno);
-
-			smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
-					   state->rs_blockno, page, true);
-
+			smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true);
+			state->rs_buffer = NULL;
+			page = NULL;
 			state->rs_blockno++;
-			state->rs_buffer_valid = false;
 		}
 	}
 
-	if (!state->rs_buffer_valid)
+	if (!page)
 	{
 		/* Initialize a new empty page */
+		state->rs_buffer = smgr_bulk_get_buf(state->rs_bulkstate);
+		page = (Page) state->rs_buffer;
 		PageInit(page, BLCKSZ, 0);
-		state->rs_buffer_valid = true;
 	}
 
 	/* And now we can insert the tuple into the page */