Fix stalled lag columns in pg_stat_replication when replay LSN stops advancing.

Previously, when the replay LSN reported in feedback messages from a standby stopped advancing, for example, due to a recovery conflict, the write_lag and flush_lag columns in pg_stat_replication would initially update but then stop progressing. This prevented users from correctly monitoring replication lag. The problem occurred because when any LSN stopped updating, the lag tracker's cyclic buffer became full (the write head reached the slowest read head). In that state, the lag tracker could no longer compute round-trip lag values correctly. This commit fixes the issue by handling the slowest read entry (the one causing the buffer to fill) as a separate overflow entry and freeing space so the write and other read heads can continue advancing in the buffer. As a result, write_lag and flush_lag now continue updating even if the reported replay LSN remains stalled. Backpatch to all supported versions. Author: Fujii Masao <masao.fujii@gmail.com> Reviewed-by: Chao Li <lic@highgo.com> Reviewed-by: Shinya Kato <shinya11.kato@gmail.com> Reviewed-by: Xuneng Zhou <xunengzhou@gmail.com> Discussion: https://postgr.es/m/CAHGQGwGdGQ=1-X-71Caee-LREBUXSzyohkoQJd4yZZCMt24C0g@mail.gmail.com Backpatch-through: 13
author: Fujii Masao <fujii@postgresql.org> 2025-10-22 11:27:15 +0900
committer: Fujii Masao <fujii@postgresql.org> 2025-10-22 11:27:15 +0900
commit: 883a95646a8e67a2d316f230712ed82b8ba58e28 (patch)
tree: c9a3c1a390122be0f369e8c61f37877381ffdb87
parent: 2519fa836235d371a6d7f5c2eb3a8f379120a988 (diff)
1 files changed, 33 insertions, 17 deletions
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 59822f22b8d..1ce21a2ad98 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -233,6 +233,7 @@ typedef struct
 	int			write_head;
 	int			read_heads[NUM_SYNC_REP_WAIT_MODE];
 	WalTimeSample last_read[NUM_SYNC_REP_WAIT_MODE];
+	WalTimeSample overflowed[NUM_SYNC_REP_WAIT_MODE];
 } LagTracker;
 
 static LagTracker *lag_tracker;
@@ -4207,7 +4208,6 @@ WalSndKeepaliveIfNecessary(void)
 static void
 LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time)
 {
-	bool		buffer_full;
 	int			new_write_head;
 	int			i;
 
@@ -4229,25 +4229,19 @@ LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time)
 	 * of space.
 	 */
 	new_write_head = (lag_tracker->write_head + 1) % LAG_TRACKER_BUFFER_SIZE;
-	buffer_full = false;
 	for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; ++i)
 	{
+		/*
+		 * If the buffer is full, move the slowest reader to a separate
+		 * overflow entry and free its space in the buffer so the write head
+		 * can advance.
+		 */
 		if (new_write_head == lag_tracker->read_heads[i])
-			buffer_full = true;
-	}
-
-	/*
-	 * If the buffer is full, for now we just rewind by one slot and overwrite
-	 * the last sample, as a simple (if somewhat uneven) way to lower the
-	 * sampling rate.  There may be better adaptive compaction algorithms.
-	 */
-	if (buffer_full)
-	{
-		new_write_head = lag_tracker->write_head;
-		if (lag_tracker->write_head > 0)
-			lag_tracker->write_head--;
-		else
-			lag_tracker->write_head = LAG_TRACKER_BUFFER_SIZE - 1;
+		{
+			lag_tracker->overflowed[i] =
+				lag_tracker->buffer[lag_tracker->read_heads[i]];
+			lag_tracker->read_heads[i] = -1;
+		}
 	}
 
 	/* Store a sample at the current write head position. */
@@ -4274,6 +4268,28 @@ LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now)
 {
 	TimestampTz time = 0;
 
+	/*
+	 * If 'lsn' has not passed the WAL position stored in the overflow entry,
+	 * return the elapsed time (in microseconds) since the saved local flush
+	 * time. If the flush time is in the future (due to clock drift), return
+	 * -1 to treat as no valid sample.
+	 *
+	 * Otherwise, switch back to using the buffer to control the read head and
+	 * compute the elapsed time.  The read head is then reset to point to the
+	 * oldest entry in the buffer.
+	 */
+	if (lag_tracker->read_heads[head] == -1)
+	{
+		if (lag_tracker->overflowed[head].lsn > lsn)
+			return (now >= lag_tracker->overflowed[head].time) ?
+				now - lag_tracker->overflowed[head].time : -1;
+
+		time = lag_tracker->overflowed[head].time;
+		lag_tracker->last_read[head] = lag_tracker->overflowed[head];
+		lag_tracker->read_heads[head] =
+			(lag_tracker->write_head + 1) % LAG_TRACKER_BUFFER_SIZE;
+	}
+
 	/* Read all unread samples up to this LSN or end of buffer. */
 	while (lag_tracker->read_heads[head] != lag_tracker->write_head &&
 		   lag_tracker->buffer[lag_tracker->read_heads[head]].lsn <= lsn)
author	Fujii Masao <fujii@postgresql.org>	2025-10-22 11:27:15 +0900
committer	Fujii Masao <fujii@postgresql.org>	2025-10-22 11:27:15 +0900
commit	883a95646a8e67a2d316f230712ed82b8ba58e28 (patch)
tree	c9a3c1a390122be0f369e8c61f37877381ffdb87
parent	2519fa836235d371a6d7f5c2eb3a8f379120a988 (diff)