author    Thomas Munro <tmunro@postgresql.org>  2022-04-07 19:28:40 +1200
committer Thomas Munro <tmunro@postgresql.org>  2022-04-07 19:42:14 +1200
commit    5dc0418fab281d017a61a5756240467af982bdfd (patch)
tree      cdcfda92621a9a7cd458999ede8f3974ef2a0bc1 /src/backend/access/transam/xlogutils.c
parent    9553b4115f1879f66935f42fff0b798ef91866d0 (diff)
Prefetch data referenced by the WAL, take II.
Introduce a new GUC recovery_prefetch.  When enabled, look ahead in the WAL and try to initiate asynchronous reading of referenced data blocks that are not yet cached in our buffer pool.  For now, this is done with posix_fadvise(), which has several caveats.  Since not all OSes have that system call, "try" is provided so that it can be enabled where available.  Better mechanisms for asynchronous I/O are possible in later work.  Set to "try" for now for test coverage.  Default setting to be finalized before release.

The GUC wal_decode_buffer_size limits the distance we can look ahead in bytes of decoded data.

The existing GUC maintenance_io_concurrency is used to limit the number of concurrent I/Os allowed, based on pessimistic heuristics used to infer that I/Os have begun and completed.  We'll also not look more than maintenance_io_concurrency * 4 block references ahead.

Reviewed-by: Julien Rouhaud <rjuju123@gmail.com>
Reviewed-by: Tomas Vondra <tomas.vondra@2ndquadrant.com>
Reviewed-by: Alvaro Herrera <alvherre@2ndquadrant.com> (earlier version)
Reviewed-by: Andres Freund <andres@anarazel.de> (earlier version)
Reviewed-by: Justin Pryzby <pryzby@telsasoft.com> (earlier version)
Tested-by: Tomas Vondra <tomas.vondra@2ndquadrant.com> (earlier version)
Tested-by: Jakub Wartak <Jakub.Wartak@tomtom.com> (earlier version)
Tested-by: Dmitry Dolgov <9erthalion6@gmail.com> (earlier version)
Tested-by: Sait Talha Nisanci <Sait.Nisanci@microsoft.com> (earlier version)
Discussion: https://postgr.es/m/CA%2BhUKGJ4VJN8ttxScUFM8dOKX0BrBiboo5uz1cq%3DAovOddfHpA%40mail.gmail.com
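The settings named above fit together as in the following minimal configuration sketch. The GUC names come from the commit message; the values shown are purely illustrative (the message notes the default for recovery_prefetch is still to be finalized):

    # Illustrative postgresql.conf settings for WAL prefetching during recovery
    recovery_prefetch = try              # off / on / try; "try" enables it where posix_fadvise() exists
    wal_decode_buffer_size = 512kB       # how far ahead to look, in bytes of decoded WAL
    maintenance_io_concurrency = 10      # caps concurrent prefetch I/Os; look-ahead is also limited
                                         # to maintenance_io_concurrency * 4 block references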
Diffstat (limited to 'src/backend/access/transam/xlogutils.c')
 -rw-r--r--  src/backend/access/transam/xlogutils.c | 27
 1 file changed, 23 insertions(+), 4 deletions(-)
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index a4dedc58b71..bb2d3ec991c 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -22,6 +22,7 @@
#include "access/timeline.h"
#include "access/xlogrecovery.h"
#include "access/xlog_internal.h"
+#include "access/xlogprefetcher.h"
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "pgstat.h"
@@ -355,11 +356,13 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blkno;
+ Buffer prefetch_buffer;
Page page;
bool zeromode;
bool willinit;
- if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
+ if (!XLogRecGetBlockTagExtended(record, block_id, &rnode, &forknum, &blkno,
+ &prefetch_buffer))
{
/* Caller specified a bogus block_id */
elog(PANIC, "failed to locate backup block with ID %d", block_id);
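This hunk switches the redo helper from XLogRecGetBlockTag() to the new XLogRecGetBlockTagExtended(), which additionally reports prefetch_buffer, a hint left behind by the WAL prefetcher when it already located the block in shared buffers. Elsewhere in the commit (outside this file) the narrower name can remain available as a thin wrapper; a sketch of that relationship, with the exact wrapper body treated as an assumption:

    /* Sketch: the old entry point as a wrapper over the extended one;
     * callers with no use for the prefetch hint pass NULL. */
    bool
    XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id,
                       RelFileNode *rnode, ForkNumber *forknum,
                       BlockNumber *blknum)
    {
        return XLogRecGetBlockTagExtended(record, block_id, rnode, forknum,
                                          blknum, NULL);
    }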
@@ -381,7 +384,8 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
{
Assert(XLogRecHasBlockImage(record, block_id));
*buf = XLogReadBufferExtended(rnode, forknum, blkno,
- get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
+ get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK,
+ prefetch_buffer);
page = BufferGetPage(*buf);
if (!RestoreBlockImage(record, block_id, page))
elog(ERROR, "failed to restore block image");
@@ -410,7 +414,7 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
}
else
{
- *buf = XLogReadBufferExtended(rnode, forknum, blkno, mode);
+ *buf = XLogReadBufferExtended(rnode, forknum, blkno, mode, prefetch_buffer);
if (BufferIsValid(*buf))
{
if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK)
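Both the full-page-image path and the ordinary read path now forward prefetch_buffer into XLogReadBufferExtended(), so individual redo routines need no changes: the hint travels inside the decoded record and is consumed here. A hypothetical redo routine, for illustration only (example_redo is made up; the helper calls are the usual redo idiom):

    /* Hypothetical redo routine: unaffected by this commit, because the
     * prefetch hint is picked up internally by XLogReadBufferForRedo(). */
    static void
    example_redo(XLogReaderState *record)
    {
        Buffer      buf;

        if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
        {
            Page        page = BufferGetPage(buf);

            /* ... apply the change carried by the WAL record ... */
            PageSetLSN(page, record->EndRecPtr);
            MarkBufferDirty(buf);
        }
        if (BufferIsValid(buf))
            UnlockReleaseBuffer(buf);
    }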
@@ -450,6 +454,10 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
* exist, and we don't check for all-zeroes. Thus, no log entry is made
* to imply that the page should be dropped or truncated later.
*
+ * Optionally, recent_buffer can be used to provide a hint about the location
+ * of the page in the buffer pool; it does not have to be correct, but avoids
+ * a buffer mapping table probe if it is.
+ *
* NB: A redo function should normally not call this directly. To get a page
* to modify, use XLogReadBufferForRedoExtended instead. It is important that
* all pages modified by a WAL record are registered in the WAL records, or
@@ -457,7 +465,8 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
*/
Buffer
XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
- BlockNumber blkno, ReadBufferMode mode)
+ BlockNumber blkno, ReadBufferMode mode,
+ Buffer recent_buffer)
{
BlockNumber lastblock;
Buffer buffer;
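Because XLogReadBufferExtended() gains a parameter, every caller must now supply a Buffer. Callers with nothing to offer pass InvalidBuffer, which fails the BufferIsValid() check below so the pre-existing lookup path runs unchanged; a sketch of such a call, with the surrounding variables assumed:

    /* Sketch: no prefetch hint available, so pass InvalidBuffer and let the
     * normal buffer-mapping lookup do the work. */
    Buffer      buf;

    buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL,
                                 InvalidBuffer);
    if (BufferIsValid(buf))
    {
        LockBuffer(buf, BUFFER_LOCK_SHARE);
        /* ... examine the page ... */
        UnlockReleaseBuffer(buf);
    }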
@@ -465,6 +474,15 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
Assert(blkno != P_NEW);
+ /* Do we have a clue where the buffer might be already? */
+ if (BufferIsValid(recent_buffer) &&
+ mode == RBM_NORMAL &&
+ ReadRecentBuffer(rnode, forknum, blkno, recent_buffer))
+ {
+ buffer = recent_buffer;
+ goto recent_buffer_fast_path;
+ }
+
/* Open the relation at smgr level */
smgr = smgropen(rnode, InvalidBackendId);
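The fast path above is the consumer of the hint; the producer is the new xlogprefetcher.c, which is not part of this file's diff. Roughly, when the prefetcher probes the buffer pool with PrefetchSharedBuffer() and the block turns out to be cached already, it records which buffer it was found in so that redo can try ReadRecentBuffer() first. A simplified sketch under those assumptions (the function and field names here are illustrative, not the actual prefetcher code):

    /* Simplified, assumed-name sketch of the producer side: probe the buffer
     * pool for a referenced block and remember where it was found, if cached. */
    static void
    prefetch_block(SMgrRelation smgr, DecodedBkpBlock *block)
    {
        PrefetchBufferResult result;

        result = PrefetchSharedBuffer(smgr, block->forknum, block->blkno);

        if (BufferIsValid(result.recent_buffer))
            block->prefetch_buffer = result.recent_buffer;  /* hint for redo */
        /* else if (result.initiated_io): one asynchronous read is now in flight */
    }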
@@ -523,6 +541,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
}
}
+recent_buffer_fast_path:
if (mode == RBM_NORMAL)
{
/* check that page has been initialized */