summaryrefslogtreecommitdiff
path: root/src/backend/postmaster/pgstat.c
diff options
context:
space:
mode:
author: Thomas Munro <tmunro@postgresql.org> 2021-04-08 23:03:43 +1200
committer: Thomas Munro <tmunro@postgresql.org> 2021-04-08 23:20:42 +1200
commit: 1d257577e08d3e598011d6850fd1025858de8c8c (patch)
tree: 1c2ac92489fe1e54cdc0837dec9b84d17434c4b7 /src/backend/postmaster/pgstat.c
parent: f003d9f8721b3249e4aec8a1946034579d40d42c (diff)
Optionally prefetch referenced data in recovery.
Introduce a new GUC recovery_prefetch, disabled by default. When enabled, look ahead in the WAL and try to initiate asynchronous reading of referenced data blocks that are not yet cached in our buffer pool. For now, this is done with posix_fadvise(), which has several caveats. Better mechanisms will follow in later work on the I/O subsystem.

The GUC maintenance_io_concurrency is used to limit the number of concurrent I/Os we allow ourselves to initiate, based on pessimistic heuristics used to infer that I/Os have begun and completed.

The GUC wal_decode_buffer_size is used to limit the maximum distance we are prepared to read ahead in the WAL to find uncached blocks.

Reviewed-by: Alvaro Herrera <alvherre@2ndquadrant.com> (parts)
Reviewed-by: Andres Freund <andres@anarazel.de> (parts)
Reviewed-by: Tomas Vondra <tomas.vondra@2ndquadrant.com> (parts)
Tested-by: Tomas Vondra <tomas.vondra@2ndquadrant.com>
Tested-by: Jakub Wartak <Jakub.Wartak@tomtom.com>
Tested-by: Dmitry Dolgov <9erthalion6@gmail.com>
Tested-by: Sait Talha Nisanci <Sait.Nisanci@microsoft.com>
Discussion: https://postgr.es/m/CA%2BhUKGJ4VJN8ttxScUFM8dOKX0BrBiboo5uz1cq%3DAovOddfHpA%40mail.gmail.com
Diffstat (limited to 'src/backend/postmaster/pgstat.c')
-rw-r--r-- src/backend/postmaster/pgstat.c 103
1 files changed, 102 insertions, 1 deletions
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 958183dd69d..f4467625f7f 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -38,6 +38,7 @@
#include "access/transam.h"
#include "access/twophase_rmgr.h"
#include "access/xact.h"
+#include "access/xlogprefetch.h"
#include "catalog/partition.h"
#include "catalog/pg_database.h"
#include "catalog/pg_proc.h"
@@ -278,6 +279,7 @@ static PgStat_WalStats walStats;
static PgStat_SLRUStats slruStats[SLRU_NUM_ELEMENTS];
static PgStat_ReplSlotStats *replSlotStats;
static int nReplSlotStats;
+static PgStat_RecoveryPrefetchStats recoveryPrefetchStats;
/*
* List of OIDs of databases we need to write out. If an entry is InvalidOid,
@@ -349,6 +351,7 @@ static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
static void pgstat_recv_wal(PgStat_MsgWal *msg, int len);
static void pgstat_recv_slru(PgStat_MsgSLRU *msg, int len);
+static void pgstat_recv_recoveryprefetch(PgStat_MsgRecoveryPrefetch *msg, int len);
static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
@@ -1424,11 +1427,20 @@ pgstat_reset_shared_counters(const char *target)
msg.m_resettarget = RESET_BGWRITER;
else if (strcmp(target, "wal") == 0)
msg.m_resettarget = RESET_WAL;
+ else if (strcmp(target, "prefetch_recovery") == 0)
+ {
+ /*
+ * We can't ask the stats collector to do this for us as it is not
+ * attached to shared memory.
+ */
+ XLogPrefetchRequestResetStats();
+ return;
+ }
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized reset target: \"%s\"", target),
- errhint("Target must be \"archiver\", \"bgwriter\" or \"wal\".")));
+ errhint("Target must be \"archiver\", \"bgwriter\", \"wal\" or \"prefetch_recovery\".")));
pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER);
pgstat_send(&msg, sizeof(msg));
@@ -2874,6 +2886,22 @@ pgstat_fetch_replslot(int *nslots_p)
}
/*
+ * ---------
+ * pgstat_fetch_recoveryprefetch() -
+ *
+ * Support function for restoring the counters managed by xlogprefetch.c.
+ * ---------
+ */
+PgStat_RecoveryPrefetchStats *
+pgstat_fetch_recoveryprefetch(void)
+{
+ backend_read_statsfile(); /* NOTE(review): presumably reloads this process's stats copies, including recoveryPrefetchStats -- confirm */
+
+ return &recoveryPrefetchStats; /* address of the file-level static, so the pointer stays valid after return; caller must not free it */
+}
+
+
+/*
* Shut down a single backend's statistics reporting at process exit.
*
* Flush any remaining statistics counts out to the collector.
@@ -3149,6 +3177,23 @@ pgstat_send_slru(void)
/* ----------
+ * pgstat_send_recoveryprefetch() -
+ *
+ * Send recovery prefetch statistics to the collector
+ * ----------
+ */
+void
+pgstat_send_recoveryprefetch(PgStat_RecoveryPrefetchStats *stats)
+{
+ PgStat_MsgRecoveryPrefetch msg;
+
+ pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RECOVERYPREFETCH); /* this type is what PgstatCollectorMain() dispatches to pgstat_recv_recoveryprefetch() */
+ msg.m_stats = *stats; /* struct assignment: the message carries a by-value copy of the caller's counters */
+ pgstat_send(&msg, sizeof(msg));
+}
+
+
+/* ----------
* PgstatCollectorMain() -
*
* Start up the statistics collector process. This is the body of the
@@ -3365,6 +3410,10 @@ PgstatCollectorMain(int argc, char *argv[])
pgstat_recv_slru(&msg.msg_slru, len);
break;
+ case PGSTAT_MTYPE_RECOVERYPREFETCH:
+ pgstat_recv_recoveryprefetch(&msg.msg_recoveryprefetch, len);
+ break;
+
case PGSTAT_MTYPE_FUNCSTAT:
pgstat_recv_funcstat(&msg.msg_funcstat, len);
break;
@@ -3659,6 +3708,13 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
(void) rc; /* we'll check for error with ferror */
/*
+ * Write recovery prefetch stats struct
+ */
+ rc = fwrite(&recoveryPrefetchStats, sizeof(recoveryPrefetchStats), 1,
+ fpout);
+ (void) rc; /* we'll check for error with ferror */
+
+ /*
* Walk through the database table.
*/
hash_seq_init(&hstat, pgStatDBHash);
@@ -3933,6 +3989,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
memset(&archiverStats, 0, sizeof(archiverStats));
memset(&walStats, 0, sizeof(walStats));
memset(&slruStats, 0, sizeof(slruStats));
+ memset(&recoveryPrefetchStats, 0, sizeof(recoveryPrefetchStats));
/*
* Set the current timestamp (will be kept only in case we can't load an
@@ -4039,6 +4096,18 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
}
/*
+ * Read recoveryPrefetchStats struct
+ */
+ if (fread(&recoveryPrefetchStats, 1, sizeof(recoveryPrefetchStats),
+ fpin) != sizeof(recoveryPrefetchStats))
+ {
+ ereport(pgStatRunningInCollector ? LOG : WARNING,
+ (errmsg("corrupted statistics file \"%s\"", statfile)));
+ memset(&recoveryPrefetchStats, 0, sizeof(recoveryPrefetchStats));
+ goto done;
+ }
+
+ /*
* We found an existing collector stats file. Read it and put all the
* hashtable entries into place.
*/
@@ -4356,6 +4425,7 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
PgStat_WalStats myWalStats;
PgStat_SLRUStats mySLRUStats[SLRU_NUM_ELEMENTS];
PgStat_ReplSlotStats myReplSlotStats;
+ PgStat_RecoveryPrefetchStats myRecoveryPrefetchStats;
FILE *fpin;
int32 format_id;
const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
@@ -4432,6 +4502,18 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
return false;
}
+ /*
+ * Read recovery prefetch stats struct
+ */
+ if (fread(&myRecoveryPrefetchStats, 1, sizeof(myRecoveryPrefetchStats),
+ fpin) != sizeof(myRecoveryPrefetchStats))
+ {
+ ereport(pgStatRunningInCollector ? LOG : WARNING,
+ (errmsg("corrupted statistics file \"%s\"", statfile)));
+ FreeFile(fpin);
+ return false;
+ }
+
/* By default, we're going to return the timestamp of the global file. */
*ts = myGlobalStats.stats_timestamp;
@@ -4615,6 +4697,13 @@ backend_read_statsfile(void)
if (ok && file_ts >= min_ts)
break;
+ /*
+ * If we're in crash recovery, the collector may not even be running,
+ * so work with what we have.
+ */
+ if (InRecovery)
+ break;
+
/* Not there or too old, so kick the collector and wait a bit */
if ((count % PGSTAT_INQ_LOOP_COUNT) == 0)
pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
@@ -5350,6 +5439,18 @@ pgstat_recv_slru(PgStat_MsgSLRU *msg, int len)
}
/* ----------
+ * pgstat_recv_recoveryprefetch() -
+ *
+ * Process a recovery prefetch message.
+ * ----------
+ */
+static void
+pgstat_recv_recoveryprefetch(PgStat_MsgRecoveryPrefetch *msg, int len)
+{
+ recoveryPrefetchStats = msg->m_stats; /* replaces the whole stored struct rather than adding to it; len is not used */
+}
+
+/* ----------
* pgstat_recv_recoveryconflict() -
*
* Process a RECOVERYCONFLICT message.