From 8c58624df46222d4d09c5655d8350f3b037880c8 Mon Sep 17 00:00:00 2001
From: Amit Kapila <akapila@postgresql.org>
Date: Wed, 8 Feb 2023 07:58:25 +0530
Subject: Fix the logical replication timeout during large DDLs.

DDLs like REFRESH MATERIALIZED VIEW that generate lots of temporary
data due to rewrite rules may not be processed by output plugins (for
example pgoutput). So, we won't send keep-alive messages for a long time
while processing such commands, and that can cause the subscriber side to
time out. We previously fixed a similar case for large transactions in
commit f95d53eded, where the output plugin filters all or most of the
changes, but that fix did not handle DDLs.

We decided not to backpatch this because it adds a new callback to an
existing exposed structure; moreover, users can increase
wal_sender_timeout and wal_receiver_timeout to avoid this problem.

Author: Wang wei, Hou Zhijie
Reviewed-by: Peter Smith, Ashutosh Bapat, Shi yu, Amit Kapila
Discussion: https://postgr.es/m/OS3PR01MB6275478E5D29E4A563302D3D9E2B9@OS3PR01MB6275.jpnprd01.prod.outlook.com
Discussion: https://postgr.es/m/CAA5-nLARN7-3SLU_QUxfy510pmrYK6JJb=bk3hcgemAM_pAv+w@mail.gmail.com
---
 src/backend/replication/logical/reorderbuffer.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'src/backend/replication/logical/reorderbuffer.c')

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 0468d12936f..d5f90a5f5d2 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -2100,6 +2100,8 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
 	PG_TRY();
 	{
 		ReorderBufferChange *change;
+		int			changes_count = 0;	/* used to accumulate the number of
+										 * changes */
 
 		if (using_subtxn)
 			BeginInternalSubTransaction(streaming ? "stream" : "replay");
@@ -2440,6 +2442,24 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
 					elog(ERROR, "tuplecid value in changequeue");
 					break;
 			}
+
+			/*
+			 * It is possible that the data is not sent to downstream for a
+			 * long time either because the output plugin filtered it or there
+			 * is a DDL that generates a lot of data that is not processed by
+			 * the plugin. So, in such cases, the downstream can time out. To
+			 * avoid that we try to send a keepalive message if required.
+			 * Trying to send a keepalive message after every change has some
+			 * overhead, but testing showed there is no noticeable overhead if
+			 * we do it after every ~100 changes.
+			 */
+#define CHANGES_THRESHOLD 100
+
+			if (++changes_count >= CHANGES_THRESHOLD)
+			{
+				rb->update_progress_txn(rb, txn, change->lsn);
+				changes_count = 0;
+			}
 		}
 
 		/* speculative insertion record must be freed by now */
-- 
cgit v1.2.3