src/backend/access/transam/xlogwait.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409

/*-------------------------------------------------------------------------
 *
 * xlogwait.c
 *	  Implements waiting for WAL operations to reach specific LSNs.
 *
 * Copyright (c) 2025, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/access/transam/xlogwait.c
 *
 * NOTES
 *		This file implements waiting for WAL operations to reach specific LSNs
 *		on both physical standby and primary servers. The core idea is simple:
 *		every process that wants to wait publishes the LSN it needs to the
 *		shared memory, and the appropriate process (startup on standby, or
 *		WAL writer/backend on primary) wakes it once that LSN has been reached.
 *
 *		The shared memory used by this module comprises a procInfos
 *		per-backend array with the information of the awaited LSN for each
 *		of the backend processes.  The elements of that array are organized
 *		into a pairing heap waitersHeap, which allows for very fast finding
 *		of the least awaited LSN.
 *
 *		In addition, the least-awaited LSN is cached as minWaitedLSN.  The
 *		waiter process publishes information about itself to the shared
 *		memory and waits on the latch until it is woken up by the appropriate
 *		process, standby is promoted, or the postmaster dies.  Then, it cleans
 *		information about itself in the shared memory.
 *
 *		On standby servers: After replaying a WAL record, the startup process
 *		first performs a fast path check minWaitedLSN > replayLSN.  If this
 *		check is negative, it checks waitersHeap and wakes up the backends
 *		whose awaited LSNs have been reached.
 *
 *		On primary servers: After flushing WAL, the WAL writer or backend
 *		process performs a similar check against the flush LSN and wakes up
 *		waiters whose target flush LSNs have been reached.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <float.h>
#include <math.h>

#include "access/xlog.h"
#include "access/xlogrecovery.h"
#include "access/xlogwait.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/latch.h"
#include "storage/proc.h"
#include "storage/shmem.h"
#include "utils/fmgrprotos.h"
#include "utils/pg_lsn.h"
#include "utils/snapmgr.h"


/*
 * Forward declaration of the heap comparator; it is installed on every
 * waiters heap by WaitLSNShmemInit().
 */
static int	waitlsn_cmp(const pairingheap_node *a, const pairingheap_node *b,
						void *arg);

/*
 * Pointer to the WaitLSNState structure.  It stays NULL until
 * WaitLSNShmemInit() assigns it.
 */
struct WaitLSNState *waitLSNState = NULL;

/*
 * Compute the number of bytes of shared memory that WaitLSNState occupies:
 * the fixed part up to procInfos, followed by
 * MaxBackends + NUM_AUXILIARY_PROCS entries of WaitLSNProcInfo.
 */
Size
WaitLSNShmemSize(void)
{
	Size		procInfosBytes;

	procInfosBytes = mul_size(MaxBackends + NUM_AUXILIARY_PROCS,
							  sizeof(WaitLSNProcInfo));

	return add_size(offsetof(WaitLSNState, procInfos), procInfosBytes);
}

/*
 * Look up (or create) the WaitLSNState structure in shared memory and make
 * waitLSNState point to it.  If the structure did not exist before, set up
 * the per-type cached minimums and heaps and zero the procInfos array.
 */
void
WaitLSNShmemInit(void)
{
	bool		alreadyExists;
	int			heapIdx;

	waitLSNState = (WaitLSNState *) ShmemInitStruct("WaitLSNState",
													WaitLSNShmemSize(),
													&alreadyExists);

	/* The structure was found already set up; leave its contents alone. */
	if (alreadyExists)
		return;

	/*
	 * One heap and one cached minimum per LSN type.  PG_UINT64_MAX is the
	 * value updateMinWaitedLSN() also stores when a heap is empty.
	 */
	for (heapIdx = 0; heapIdx < WAIT_LSN_TYPE_COUNT; heapIdx++)
	{
		pg_atomic_init_u64(&waitLSNState->minWaitedLSN[heapIdx],
						   PG_UINT64_MAX);
		pairingheap_initialize(&waitLSNState->waitersHeap[heapIdx],
							   waitlsn_cmp,
							   (void *) (uintptr_t) heapIdx);
	}

	/* Start with every per-process entry zeroed. */
	memset(&waitLSNState->procInfos, 0,
		   (MaxBackends + NUM_AUXILIARY_PROCS) * sizeof(WaitLSNProcInfo));
}

/*
 * Heap comparator for the LSN waiters heaps.
 *
 * 'arg' carries the index of the heap being ordered, which selects the
 * heapNode[] member the nodes are embedded in.  The result is deliberately
 * inverted with respect to the natural LSN order (1 when a's LSN is smaller,
 * -1 when it is larger), so that the waiter with the smallest LSN ends up at
 * the top of the heap.
 */
static int
waitlsn_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg)
{
	int			heapIdx = (uintptr_t) arg;
	const WaitLSNProcInfo *lhs;
	const WaitLSNProcInfo *rhs;

	lhs = pairingheap_const_container(WaitLSNProcInfo, heapNode[heapIdx], a);
	rhs = pairingheap_const_container(WaitLSNProcInfo, heapNode[heapIdx], b);

	if (lhs->waitLSN == rhs->waitLSN)
		return 0;

	return (lhs->waitLSN < rhs->waitLSN) ? 1 : -1;
}

/*
 * Recompute the cached minimum awaited LSN for the given LSN type.
 *
 * The cache receives the waitLSN of the heap's first entry, or PG_UINT64_MAX
 * when the heap has no entries.  All callers in this file invoke this while
 * holding WaitLSNLock.
 */
static void
updateMinWaitedLSN(WaitLSNType lsnType)
{
	int			heapIdx = (int) lsnType;
	XLogRecPtr	newMin;

	Assert(heapIdx >= 0 && heapIdx < (int) WAIT_LSN_TYPE_COUNT);

	if (pairingheap_is_empty(&waitLSNState->waitersHeap[heapIdx]))
		newMin = PG_UINT64_MAX;
	else
	{
		pairingheap_node *top = pairingheap_first(&waitLSNState->waitersHeap[heapIdx]);
		WaitLSNProcInfo *topInfo = pairingheap_container(WaitLSNProcInfo, heapNode[heapIdx], top);

		newMin = topInfo->waitLSN;
	}

	pg_atomic_write_u64(&waitLSNState->minWaitedLSN[heapIdx], newMin);
}

/*
 * Register the current process as a waiter for 'lsn' in the heap that
 * corresponds to 'lsnType', and refresh that heap's cached minimum.
 * The whole update is done under an exclusive WaitLSNLock.
 */
static void
addLSNWaiter(XLogRecPtr lsn, WaitLSNType lsnType)
{
	int			heapIdx = (int) lsnType;
	WaitLSNProcInfo *self;

	Assert(heapIdx >= 0 && heapIdx < (int) WAIT_LSN_TYPE_COUNT);

	self = &waitLSNState->procInfos[MyProcNumber];

	LWLockAcquire(WaitLSNLock, LW_EXCLUSIVE);

	/* Publish who we are and which LSN we are waiting for. */
	self->procno = MyProcNumber;
	self->waitLSN = lsn;

	/* We must not already be a member of this heap. */
	Assert(!self->inHeap[heapIdx]);

	pairingheap_add(&waitLSNState->waitersHeap[heapIdx],
					&self->heapNode[heapIdx]);
	self->inHeap[heapIdx] = true;

	/* The new entry may have become the smallest awaited LSN. */
	updateMinWaitedLSN(lsnType);

	LWLockRelease(WaitLSNLock);
}

/*
 * Take the current process out of the waiters heap for 'lsnType', if it is
 * still a member, and refresh that heap's cached minimum.  The inHeap flag
 * is re-examined under an exclusive WaitLSNLock, so calling this when the
 * entry has already been removed is a no-op.
 */
static void
deleteLSNWaiter(WaitLSNType lsnType)
{
	int			heapIdx = (int) lsnType;
	WaitLSNProcInfo *self;

	Assert(heapIdx >= 0 && heapIdx < (int) WAIT_LSN_TYPE_COUNT);

	self = &waitLSNState->procInfos[MyProcNumber];

	LWLockAcquire(WaitLSNLock, LW_EXCLUSIVE);

	if (self->inHeap[heapIdx])
	{
		pairingheap_remove(&waitLSNState->waitersHeap[heapIdx],
						   &self->heapNode[heapIdx]);
		self->inHeap[heapIdx] = false;

		/* The removed entry may have been the smallest awaited LSN. */
		updateMinWaitedLSN(lsnType);
	}

	LWLockRelease(WaitLSNLock);
}

/*
 * Size of a static array of procs to wakeup by WaitLSNWakeup() allocated
 * on the stack.  It should be enough to take single iteration for most cases.
 * When more waiters than this are ready at once, wakeupWaiters() processes
 * them in several chunks of at most this many entries.
 */
#define	WAKEUP_PROC_STATIC_ARRAY_SIZE (16)

/*
 * Remove waiters whose LSN has been reached from the heap and set their
 * latches.  If InvalidXLogRecPtr is given, remove all waiters from the heap
 * and set latches for all waiters.
 *
 * lsnType selects which waiters heap is processed; currentLSN is the LSN
 * that waiters' targets are compared against.
 *
 * This function first accumulates waiters to wake up into an array, then
 * wakes them up without holding a WaitLSNLock.  The array size is static and
 * equal to WAKEUP_PROC_STATIC_ARRAY_SIZE.  That should be more than enough
 * to wake up all the waiters at once in the vast majority of cases.  However,
 * if there are more waiters, this function will loop to process them in
 * multiple chunks.
 */
static void
wakeupWaiters(WaitLSNType lsnType, XLogRecPtr currentLSN)
{
	ProcNumber	wakeUpProcs[WAKEUP_PROC_STATIC_ARRAY_SIZE];
	int			numWakeUpProcs;
	int			i = (int) lsnType;

	Assert(i >= 0 && i < (int) WAIT_LSN_TYPE_COUNT);

	do
	{
		int			j;

		numWakeUpProcs = 0;
		LWLockAcquire(WaitLSNLock, LW_EXCLUSIVE);

		/*
		 * Iterate the waiters heap until we find LSN not yet reached. Record
		 * process numbers to wake up, but send wakeups after releasing lock.
		 */
		while (!pairingheap_is_empty(&waitLSNState->waitersHeap[i]))
		{
			pairingheap_node *node = pairingheap_first(&waitLSNState->waitersHeap[i]);
			WaitLSNProcInfo *procInfo;

			/* Get procInfo using appropriate heap node */
			procInfo = pairingheap_container(WaitLSNProcInfo, heapNode[i], node);

			/* An invalid currentLSN means "wake everybody": never stop early */
			if (!XLogRecPtrIsInvalid(currentLSN) && procInfo->waitLSN > currentLSN)
				break;

			Assert(numWakeUpProcs < WAKEUP_PROC_STATIC_ARRAY_SIZE);
			wakeUpProcs[numWakeUpProcs++] = procInfo->procno;
			(void) pairingheap_remove_first(&waitLSNState->waitersHeap[i]);

			/* Update appropriate flag */
			procInfo->inHeap[i] = false;

			/* Chunk is full; the outer loop will come back for the rest */
			if (numWakeUpProcs == WAKEUP_PROC_STATIC_ARRAY_SIZE)
				break;
		}

		updateMinWaitedLSN(lsnType);
		LWLockRelease(WaitLSNLock);

		/*
		 * Set latches for processes whose waited LSNs have been reached.
		 * Since SetLatch() is a time-consuming operation, we do this outside
		 * of WaitLSNLock. This is safe because procLatch is never freed, so
		 * at worst we may set a latch for the wrong process or for no process
		 * at all, which is harmless.
		 *
		 * Use a dedicated counter here: 'i' is the heap index and must keep
		 * its value for the next chunk processed by the outer loop.
		 */
		for (j = 0; j < numWakeUpProcs; j++)
			SetLatch(&GetPGProcByNumber(wakeUpProcs[j])->procLatch);

	} while (numWakeUpProcs == WAKEUP_PROC_STATIC_ARRAY_SIZE);
}

/*
 * Wake up processes waiting for LSN to reach currentLSN
 *
 * lsnType selects the set of waiters to examine.  Passing InvalidXLogRecPtr
 * as currentLSN requests waking up every waiter of that type, whatever LSN
 * it is waiting for.
 */
void
WaitLSNWakeup(WaitLSNType lsnType, XLogRecPtr currentLSN)
{
	int			i = (int) lsnType;

	Assert(i >= 0 && i < (int) WAIT_LSN_TYPE_COUNT);

	/*
	 * Fast path check: nothing to do when even the smallest awaited LSN is
	 * still ahead of currentLSN.  The shortcut must be skipped for
	 * InvalidXLogRecPtr, which means "wake all waiters"; comparing the
	 * cached minimum against it would suppress that request.
	 */
	if (!XLogRecPtrIsInvalid(currentLSN) &&
		pg_atomic_read_u64(&waitLSNState->minWaitedLSN[i]) > currentLSN)
		return;

	wakeupWaiters(lsnType, currentLSN);
}

/*
 * Clean up LSN waiters for exiting process
 */
void
WaitLSNCleanup(void)
{
	if (waitLSNState)
	{
		int			i;

		/*
		 * We do a fast-path check of the heap flags without the lock.  These
		 * flags are set to true only by the process itself.  So, it's only
		 * possible to get a false positive.  But that will be eliminated by a
		 * recheck inside deleteLSNWaiter().
		 */

		for (i = 0; i < (int) WAIT_LSN_TYPE_COUNT; i++)
		{
			if (waitLSNState->procInfos[MyProcNumber].inHeap[i])
				deleteLSNWaiter((WaitLSNType) i);
		}
	}
}

/*
 * Wait using MyLatch till the given LSN is reached, the replica gets
 * promoted, the timeout expires, or the postmaster dies.
 *
 * lsnType selects the position compared against targetLSN: the replay
 * position for WAIT_LSN_TYPE_REPLAY, the flush position otherwise.
 * timeout is in milliseconds; a value <= 0 means no time limit.
 *
 * Returns WAIT_LSN_RESULT_SUCCESS if target LSN was reached.
 * Returns WAIT_LSN_RESULT_TIMEOUT if the timeout expired before that.
 * Returns WAIT_LSN_RESULT_NOT_IN_RECOVERY, for replay waits only, if run
 * not in recovery, or replica got promoted before the target LSN reached.
 *
 * Reports FATAL if the postmaster dies while waiting.
 */
WaitLSNResult
WaitForLSN(WaitLSNType lsnType, XLogRecPtr targetLSN, int64 timeout)
{
	XLogRecPtr	currentLSN;
	TimestampTz endtime = 0;
	int			wake_events = WL_LATCH_SET | WL_POSTMASTER_DEATH;

	/* Shouldn't be called when shmem isn't initialized */
	Assert(waitLSNState);

	/*
	 * Should have a valid proc number.  The bound matches the number of
	 * procInfos entries allocated by WaitLSNShmemSize().
	 */
	Assert(MyProcNumber >= 0 &&
		   MyProcNumber < MaxBackends + NUM_AUXILIARY_PROCS);

	if (timeout > 0)
	{
		endtime = TimestampTzPlusMilliseconds(GetCurrentTimestamp(), timeout);
		wake_events |= WL_TIMEOUT;
	}

	/*
	 * Add our process to the waiters heap.  It might happen that target LSN
	 * gets reached before we do.  The check at the beginning of the loop
	 * below prevents the race condition.
	 */
	addLSNWaiter(targetLSN, lsnType);

	for (;;)
	{
		int			rc;
		long		delay_ms = -1;

		if (lsnType == WAIT_LSN_TYPE_REPLAY)
			currentLSN = GetXLogReplayRecPtr(NULL);
		else
			currentLSN = GetFlushRecPtr(NULL);

		/*
		 * Check that recovery is still in-progress.  This only concerns
		 * waits for replay: waits for flush take place on a primary, where
		 * recovery is not in progress, so applying the check to them would
		 * make every such wait fail immediately.
		 */
		if (lsnType == WAIT_LSN_TYPE_REPLAY && !RecoveryInProgress())
		{
			/*
			 * Recovery was ended, but check if target LSN was already
			 * reached.
			 */
			deleteLSNWaiter(lsnType);

			if (PromoteIsTriggered() && targetLSN <= currentLSN)
				return WAIT_LSN_RESULT_SUCCESS;
			return WAIT_LSN_RESULT_NOT_IN_RECOVERY;
		}
		else
		{
			/* Check if the waited LSN has been reached */
			if (targetLSN <= currentLSN)
				break;
		}

		if (timeout > 0)
		{
			/* Sleep no longer than the time remaining until the deadline */
			delay_ms = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), endtime);
			if (delay_ms <= 0)
				break;
		}

		CHECK_FOR_INTERRUPTS();

		rc = WaitLatch(MyLatch, wake_events, delay_ms,
					   (lsnType == WAIT_LSN_TYPE_REPLAY) ? WAIT_EVENT_WAIT_FOR_WAL_REPLAY : WAIT_EVENT_WAIT_FOR_WAL_FLUSH);

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (rc & WL_POSTMASTER_DEATH)
			ereport(FATAL,
					errcode(ERRCODE_ADMIN_SHUTDOWN),
					errmsg("terminating connection due to unexpected postmaster exit"),
					errcontext("while waiting for LSN"));

		if (rc & WL_LATCH_SET)
			ResetLatch(MyLatch);
	}

	/*
	 * Delete our process from the shared memory heap.  We might already be
	 * deleted by the process that woke us up.  The 'inHeap' flags prevent us
	 * from the double deletion.
	 */
	deleteLSNWaiter(lsnType);

	/*
	 * If we didn't reach the target LSN, we must be exited by timeout.
	 */
	if (targetLSN > currentLSN)
		return WAIT_LSN_RESULT_TIMEOUT;

	return WAIT_LSN_RESULT_SUCCESS;
}