author     Heikki Linnakangas <heikki.linnakangas@iki.fi>  2024-02-23 16:10:51 +0200
committer  Heikki Linnakangas <heikki.linnakangas@iki.fi>  2024-02-23 16:10:51 +0200
commit     8af256524893987a3e534c6578dd60edfb782a77
tree       76761046e9f1fd4e18abc502e208c76132b7b1c7 /src/backend/access/heap/rewriteheap.c
parent     e612384fc78d35c3d3a8b3d27cef5181dca8430b

Introduce a new smgr bulk loading facility.
The new facility makes it easier to optimize bulk loading, as the
logic for buffering, WAL-logging, and syncing the relation only needs
to be implemented once. It's also less error-prone: we have had a
number of bugs in how a relation is fsync'd, or not, at the end of a
bulk loading operation. By centralizing that logic in one place, we
only need to write it correctly once.
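
As a rough sketch of the caller-facing pattern, grounded in the
rewriteheap.c changes in the diff below (the wrapper function and
variable names here are illustrative, not part of the patch):

#include "postgres.h"

#include "storage/bufpage.h"
#include "storage/bulk_write.h"
#include "utils/rel.h"

/*
 * Illustrative only: append one freshly built page to a new relation's main
 * fork through the bulk writer.  Buffering, WAL-logging and syncing all
 * happen inside the facility.
 */
static void
example_bulk_load(Relation new_rel)
{
	BulkWriteState *bulkstate;
	BulkWriteBuffer buf;
	BlockNumber blkno = 0;

	bulkstate = smgr_bulk_start_rel(new_rel, MAIN_FORKNUM);

	/* Ask the bulk writer for a buffer and build a page in it... */
	buf = smgr_bulk_get_buf(bulkstate);
	PageInit((Page) buf, BLCKSZ, 0);
	/* ... add tuples to the page here ... */

	/* ... then hand it back; the writer takes over the buffer from here. */
	smgr_bulk_write(bulkstate, blkno, buf, true);	/* true = standard page layout */

	/* WAL-logs any pending pages and arranges for the relation to be synced */
	smgr_bulk_finish(bulkstate);
}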
The new facility is faster for small relations: instead of calling
smgrimmedsync(), we register the fsync to happen at the next checkpoint,
which avoids the fsync latency. That can make a big difference if you
are e.g. restoring a schema-only dump with lots of relations.
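
For reference, this is the per-relation call that goes away (it is
visible among the removed lines of end_heap_rewrite() in the diff
below); a minimal sketch of the pre-patch pattern:

#include "postgres.h"

#include "storage/smgr.h"
#include "utils/rel.h"

/*
 * Pre-patch pattern: the backend itself fsyncs the newly written relation
 * before returning, paying the fsync latency once per relation.  With the
 * bulk writer this call disappears; smgr_bulk_finish() instead arranges for
 * the sync to happen at the next checkpoint.
 */
static void
old_style_finish(Relation new_rel)
{
	if (RelationNeedsWAL(new_rel))
		smgrimmedsync(RelationGetSmgr(new_rel), MAIN_FORKNUM);
}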
It is also slightly more efficient with large relations, as the
WAL-logging is done multiple pages at a time, which avoids some
per-record WAL header overhead. The sorted GiST index build already did
this; its buffering now moves into the new facility.
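
As a sketch of why this helps: a batch of new pages can share a single
WAL record, so the per-record header is paid once per batch rather than
once per page. The flush helper below is illustrative, not the
facility's actual internals; log_newpages() is the existing multi-page
counterpart of log_newpage() that the sorted GiST build used.

#include "postgres.h"

#include "access/xloginsert.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/*
 * Illustrative flush of a batch of freshly built pages: one WAL record
 * covers the whole batch instead of one log_newpage() record per page.
 */
static void
flush_page_batch(Relation rel, int npages, BlockNumber *blknos, Page *pages)
{
	if (RelationNeedsWAL(rel))
		log_newpages(&rel->rd_locator, MAIN_FORKNUM, npages, blknos, pages, true);

	/* ... then write the pages out through smgr as before ... */
}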
The change to the pageinspect GiST test needs an explanation: before
this patch, the sorted GiST index build set the LSN on every page to the
special GistBuildLSN value rather than the LSN of the WAL record, even
though the pages were WAL-logged. There was no particular need for that;
it just happened naturally because we wrote out the pages before
WAL-logging them. Now we WAL-log the pages first, as in a B-tree build,
so the pages are stamped with the record's real LSN. When the build is
not WAL-logged, we still use GistBuildLSN. To make the test output
predictable, the test now uses an unlogged index.
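
To make the LSN behaviour concrete, here is a minimal sketch of the two
cases (this is not the actual gistbuild.c code; the helper function is
made up for illustration):

#include "postgres.h"

#include "access/gist.h"
#include "access/xloginsert.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/*
 * Sketch of the page-LSN stamping described above: a WAL-logged build ends
 * up with the WAL record's real LSN on each page, while a build that is not
 * WAL-logged (e.g. an unlogged index) keeps the fixed GistBuildLSN sentinel,
 * which is why the pageinspect test switches to an unlogged index.
 */
static void
stamp_build_page(Relation index, BlockNumber blkno, Page page)
{
	if (RelationNeedsWAL(index))
	{
		/* log_newpage() stamps the (initialized) page with the record's LSN */
		(void) log_newpage(&index->rd_locator, MAIN_FORKNUM, blkno, page, true);
	}
	else
		PageSetLSN(page, GistBuildLSN); /* fixed sentinel value */
}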
Reviewed-by: Andres Freund
Discussion: https://www.postgresql.org/message-id/30e8f366-58b3-b239-c521-422122dd5150%40iki.fi
Diffstat (limited to 'src/backend/access/heap/rewriteheap.c')
-rw-r--r--   src/backend/access/heap/rewriteheap.c | 72
1 file changed, 20 insertions(+), 52 deletions(-)
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 34107323ffe..a578b876174 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -87,8 +87,8 @@
  * is optimized for bulk inserting a lot of tuples, knowing that we have
  * exclusive access to the heap. raw_heap_insert builds new pages in
  * local storage. When a page is full, or at the end of the process,
- * we insert it to WAL as a single record and then write it to disk
- * directly through smgr. Note, however, that any data sent to the new
+ * we insert it to WAL as a single record and then write it to disk with
+ * the bulk smgr writer. Note, however, that any data sent to the new
  * heap's TOAST table will go through the normal bufmgr.
  *
  *
@@ -119,9 +119,9 @@
 #include "replication/logical.h"
 #include "replication/slot.h"
 #include "storage/bufmgr.h"
+#include "storage/bulk_write.h"
 #include "storage/fd.h"
 #include "storage/procarray.h"
-#include "storage/smgr.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
@@ -133,9 +133,9 @@ typedef struct RewriteStateData
 {
 	Relation	rs_old_rel;		/* source heap */
 	Relation	rs_new_rel;		/* destination heap */
-	Page		rs_buffer;		/* page currently being built */
+	BulkWriteState *rs_bulkstate;	/* writer for the destination */
+	BulkWriteBuffer rs_buffer;	/* page currently being built */
 	BlockNumber rs_blockno;		/* block where page will go */
-	bool		rs_buffer_valid;	/* T if any tuples in buffer */
 	bool		rs_logical_rewrite; /* do we need to do logical rewriting */
 	TransactionId rs_oldest_xmin;	/* oldest xmin used by caller to determine
 									 * tuple visibility */
@@ -255,14 +255,14 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 
 	state->rs_old_rel = old_heap;
 	state->rs_new_rel = new_heap;
-	state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+	state->rs_buffer = NULL;
 	/* new_heap needn't be empty, just locked */
 	state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
-	state->rs_buffer_valid = false;
 	state->rs_oldest_xmin = oldest_xmin;
 	state->rs_freeze_xid = freeze_xid;
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
+	state->rs_bulkstate = smgr_bulk_start_rel(new_heap, MAIN_FORKNUM);
 
 	/* Initialize hash tables used to track update chains */
 	hash_ctl.keysize = sizeof(TidHashKey);
@@ -314,30 +314,13 @@ end_heap_rewrite(RewriteState state)
 	}
 
 	/* Write the last page, if any */
-	if (state->rs_buffer_valid)
+	if (state->rs_buffer)
 	{
-		if (RelationNeedsWAL(state->rs_new_rel))
-			log_newpage(&state->rs_new_rel->rd_locator,
-						MAIN_FORKNUM,
-						state->rs_blockno,
-						state->rs_buffer,
-						true);
-
-		PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
-
-		smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
-				   state->rs_blockno, state->rs_buffer, true);
+		smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true);
+		state->rs_buffer = NULL;
 	}
 
-	/*
-	 * When we WAL-logged rel pages, we must nonetheless fsync them. The
-	 * reason is the same as in storage.c's RelationCopyStorage(): we're
-	 * writing data that's not in shared buffers, and so a CHECKPOINT
-	 * occurring during the rewriteheap operation won't have fsync'd data we
-	 * wrote before the checkpoint.
-	 */
-	if (RelationNeedsWAL(state->rs_new_rel))
-		smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM);
+	smgr_bulk_finish(state->rs_bulkstate);
 
 	logical_end_heap_rewrite(state);
 
@@ -611,7 +594,7 @@ rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple)
 static void
 raw_heap_insert(RewriteState state, HeapTuple tup)
 {
-	Page		page = state->rs_buffer;
+	Page		page;
 	Size		pageFreeSpace,
 				saveFreeSpace;
 	Size		len;
@@ -664,7 +647,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 												   HEAP_DEFAULT_FILLFACTOR);
 
 	/* Now we can check to see if there's enough free space already. */
-	if (state->rs_buffer_valid)
+	page = (Page) state->rs_buffer;
+	if (page)
 	{
 		pageFreeSpace = PageGetHeapFreeSpace(page);
 
@@ -675,35 +659,19 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			 * contains a tuple. Hence, unlike RelationGetBufferForTuple(),
 			 * enforce saveFreeSpace unconditionally.
 			 */
-
-			/* XLOG stuff */
-			if (RelationNeedsWAL(state->rs_new_rel))
-				log_newpage(&state->rs_new_rel->rd_locator,
-							MAIN_FORKNUM,
-							state->rs_blockno,
-							page,
-							true);
-
-			/*
-			 * Now write the page. We say skipFsync = true because there's no
-			 * need for smgr to schedule an fsync for this write; we'll do it
-			 * ourselves in end_heap_rewrite.
-			 */
-			PageSetChecksumInplace(page, state->rs_blockno);
-
-			smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
-					   state->rs_blockno, page, true);
-
+			smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true);
+			state->rs_buffer = NULL;
+			page = NULL;
 			state->rs_blockno++;
-			state->rs_buffer_valid = false;
 		}
 	}
 
-	if (!state->rs_buffer_valid)
+	if (!page)
 	{
 		/* Initialize a new empty page */
+		state->rs_buffer = smgr_bulk_get_buf(state->rs_bulkstate);
+		page = (Page) state->rs_buffer;
 		PageInit(page, BLCKSZ, 0);
-		state->rs_buffer_valid = true;
 	}
 
 	/* And now we can insert the tuple into the page */