From 5173bfd0443e0c0f3fa37006727d516dc1ba4cee Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Sat, 25 Oct 2025 09:07:31 +0900 Subject: pg_rewind: Skip copy of WAL segments generated before point of divergence This commit makes the way WAL segments are handled from the source to the target server slightly smarter: the copy of the WAL segments is now skipped if these have been created before the point where source and target have diverged (the WAL segment where the point of divergence exists is still copied), because we know that such segments exist on both the target and source. Note that the on-disk sizes of a WAL segment on the source and target need to match for its copy to be skipped. Hence, only the segment containing the point of divergence and the segments generated after it are now copied. A segment existing on the source but not the target is copied. Previously, all the WAL segments were just copied in full. This change can make the rewind operation cheaper in some configurations, especially for setups where some WAL retention causes many segments to remain on the source server even after the promotion of a standby used as source to rewind a previous primary. A TAP test is added to track these new behaviors. The file map printed with --debug now includes all the information related to WAL segments, to be able to track if these are copied or skipped, and the test relies on the debug output generated. 
Author: John Hsu Author: Justin Kwan Reviewed-by: Robert Haas Reviewed-by: Alexander Korotkov Reviewed-by: Japin Li Reviewed-by: Michael Paquier Reviewed-by: Srinath Reddy Sadipiralla Discussion: https://postgr.es/m/181b4c6fa9c.b8b725681941212.7547232617810891479@viggy28.dev --- src/bin/pg_rewind/filemap.c | 55 ++++++++++++++-- src/bin/pg_rewind/filemap.h | 3 +- src/bin/pg_rewind/meson.build | 1 + src/bin/pg_rewind/pg_rewind.c | 9 ++- src/bin/pg_rewind/t/011_wal_copy.pl | 122 ++++++++++++++++++++++++++++++++++++ 5 files changed, 183 insertions(+), 7 deletions(-) create mode 100644 src/bin/pg_rewind/t/011_wal_copy.pl (limited to 'src') diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c index 00f5d60d620..467fd97ebcf 100644 --- a/src/bin/pg_rewind/filemap.c +++ b/src/bin/pg_rewind/filemap.c @@ -546,7 +546,9 @@ print_filemap(filemap_t *filemap) for (i = 0; i < filemap->nentries; i++) { entry = filemap->entries[i]; + if (entry->action != FILE_ACTION_NONE || + entry->content_type == FILE_CONTENT_TYPE_WAL || entry->target_pages_to_overwrite.bitmapsize > 0) { pg_log_debug("%s (%s)", entry->path, @@ -706,11 +708,45 @@ final_filemap_cmp(const void *a, const void *b) return strcmp(fa->path, fb->path); } +/* + * Decide what to do with a WAL segment file based on its position + * relative to the point of divergence. + * + * Caller is responsible for ensuring that the file exists on both + * source and target servers. + */ +static file_action_t +decide_wal_file_action(const char *fname, XLogSegNo last_common_segno, + size_t source_size, size_t target_size) +{ + TimeLineID file_tli; + XLogSegNo file_segno; + + /* Get current WAL segment number given current segment file name */ + XLogFromFileName(fname, &file_tli, &file_segno, WalSegSz); + + /* + * Avoid copying files before the last common segment. 
+ * + * These files exist on the source and the target servers, so they should + * be identical and located strictly before the segment that contains the + * LSN where target and source servers have diverged. + * + * While we are on it, double-check the size of each file and copy the + * file if they do not match, in case. + */ + if (file_segno < last_common_segno && + source_size == target_size) + return FILE_ACTION_NONE; + + return FILE_ACTION_COPY; +} + /* * Decide what action to perform to a file. */ static file_action_t -decide_file_action(file_entry_t *entry) +decide_file_action(file_entry_t *entry, XLogSegNo last_common_segno) { const char *path = entry->path; @@ -814,8 +850,17 @@ decide_file_action(file_entry_t *entry) case FILE_TYPE_REGULAR: if (entry->content_type == FILE_CONTENT_TYPE_WAL) { - /* It's a WAL file, copy it. */ - return FILE_ACTION_COPY; + /* Handle WAL segment file */ + const char *filename = last_dir_separator(entry->path); + + if (filename == NULL) + filename = entry->path; + else + filename++; /* Skip the separator */ + + return decide_wal_file_action(filename, last_common_segno, + entry->source_size, + entry->target_size); } else if (entry->content_type != FILE_CONTENT_TYPE_RELATION) { @@ -876,7 +921,7 @@ decide_file_action(file_entry_t *entry) * should be executed. 
*/ filemap_t * -decide_file_actions(void) +decide_file_actions(XLogSegNo last_common_segno) { int i; filehash_iterator it; @@ -886,7 +931,7 @@ decide_file_actions(void) filehash_start_iterate(filehash, &it); while ((entry = filehash_iterate(filehash, &it)) != NULL) { - entry->action = decide_file_action(entry); + entry->action = decide_file_action(entry, last_common_segno); } /* diff --git a/src/bin/pg_rewind/filemap.h b/src/bin/pg_rewind/filemap.h index fada420fc23..5145f0b4c46 100644 --- a/src/bin/pg_rewind/filemap.h +++ b/src/bin/pg_rewind/filemap.h @@ -11,6 +11,7 @@ #include "datapagemap.h" #include "storage/block.h" #include "storage/relfilelocator.h" +#include "access/xlogdefs.h" /* these enum values are sorted in the order we want actions to be processed */ typedef enum @@ -113,7 +114,7 @@ extern void process_target_wal_block_change(ForkNumber forknum, RelFileLocator rlocator, BlockNumber blkno); -extern filemap_t *decide_file_actions(void); +extern filemap_t *decide_file_actions(XLogSegNo last_common_segno); extern void calculate_totals(filemap_t *filemap); extern void print_filemap(filemap_t *filemap); diff --git a/src/bin/pg_rewind/meson.build b/src/bin/pg_rewind/meson.build index 36171600cca..97f001d94a5 100644 --- a/src/bin/pg_rewind/meson.build +++ b/src/bin/pg_rewind/meson.build @@ -44,6 +44,7 @@ tests += { 't/008_min_recovery_point.pl', 't/009_growing_files.pl', 't/010_keep_recycled_wals.pl', + 't/011_wal_copy.pl', ], }, } diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 0c68dd4235e..1b953692b17 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -147,6 +147,7 @@ main(int argc, char **argv) TimeLineID source_tli; TimeLineID target_tli; XLogRecPtr target_wal_endrec; + XLogSegNo last_common_segno; size_t size; char *buffer; bool no_ensure_shutdown = false; @@ -397,6 +398,12 @@ main(int argc, char **argv) LSN_FORMAT_ARGS(divergerec), targetHistory[lastcommontliIndex].tli); + /* + * Convert 
the divergence LSN to a segment number, that will be used + * to decide how WAL segments should be processed. + */ + XLByteToSeg(divergerec, last_common_segno, ControlFile_target.xlog_seg_size); + /* * Don't need the source history anymore. The target history is still * needed by the routines in parsexlog.c, when we read the target WAL. @@ -492,7 +499,7 @@ main(int argc, char **argv) * We have collected all information we need from both systems. Decide * what to do with each file. */ - filemap = decide_file_actions(); + filemap = decide_file_actions(last_common_segno); if (showprogress) calculate_totals(filemap); diff --git a/src/bin/pg_rewind/t/011_wal_copy.pl b/src/bin/pg_rewind/t/011_wal_copy.pl new file mode 100644 index 00000000000..89ef2590ed9 --- /dev/null +++ b/src/bin/pg_rewind/t/011_wal_copy.pl @@ -0,0 +1,122 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group +# +# Check how the copy of WAL segments is handled from the source to +# the target server. + +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Utils; +use Test::More; +use File::stat qw(stat); + +use FindBin; +use lib $FindBin::RealBin; +use RewindTest; + +RewindTest::setup_cluster(); +RewindTest::start_primary(); +RewindTest::create_standby(); + +# Advance WAL on primary +RewindTest::primary_psql("CREATE TABLE t(a int)"); +RewindTest::primary_psql("INSERT INTO t VALUES(0)"); + +# Segment that is not copied from the source to the target, being +# generated before the servers have diverged. +my $wal_seg_skipped = $node_primary->safe_psql('postgres', + 'SELECT pg_walfile_name(pg_current_wal_lsn())'); + +RewindTest::primary_psql("SELECT pg_switch_wal()"); + +# Follow-up segment, that will include corrupted contents, and will be +# copied from the source to the target even if generated before the point +# of divergence. 
+RewindTest::primary_psql("INSERT INTO t VALUES(0)"); +my $corrupt_wal_seg = $node_primary->safe_psql('postgres', + 'SELECT pg_walfile_name(pg_current_wal_lsn())'); +RewindTest::primary_psql("SELECT pg_switch_wal()"); + +RewindTest::primary_psql("CHECKPOINT"); +RewindTest::promote_standby; + +# New segment on a new timeline, expected to be copied. +my $new_timeline_wal_seg = $node_standby->safe_psql('postgres', + 'SELECT pg_walfile_name(pg_current_wal_lsn())'); + +# Corrupt a WAL segment on target that has been generated before the +# divergence point. We will check that it is copied from the source. +my $corrupt_wal_seg_in_target_path = + $node_primary->data_dir . '/pg_wal/' . $corrupt_wal_seg; +open my $fh, ">>", $corrupt_wal_seg_in_target_path + or die "could not open $corrupt_wal_seg_in_target_path"; + +print $fh 'a'; +close $fh; + +my $corrupt_wal_seg_stat_before_rewind = + stat($corrupt_wal_seg_in_target_path); +ok(defined($corrupt_wal_seg_stat_before_rewind), + "segment $corrupt_wal_seg exists in target before rewind"); + +# Verify that the WAL segment on the new timeline does not exist in target +# before the rewind. +my $new_timeline_wal_seg_path = + $node_primary->data_dir . '/pg_wal/' . $new_timeline_wal_seg; +my $new_timeline_wal_seg_stat = stat($new_timeline_wal_seg_path); +ok(!defined($new_timeline_wal_seg_stat), + "segment $new_timeline_wal_seg does not exist in target before rewind"); + +$node_standby->stop(); +$node_primary->stop(); + +# Cross-check how WAL segments are handled: +# - The "corrupted" segment generated before the point of divergence is +# copied. +# - The "clean" segment generated before the point of divergence is skipped. +# - The segment of the new timeline is copied. 
+command_checks_all( + [ + 'pg_rewind', '--debug', + '--source-pgdata' => $node_standby->data_dir, + '--target-pgdata' => $node_primary->data_dir, + '--no-sync', + ], + 0, + [qr//], + [ + qr/pg_wal\/$wal_seg_skipped \(NONE\)/, + qr/pg_wal\/$corrupt_wal_seg \(COPY\)/, + qr/pg_wal\/$new_timeline_wal_seg \(COPY\)/, + ], + 'run pg_rewind'); + +# Verify that the first WAL segment of the new timeline now exists in +# target. +$new_timeline_wal_seg_stat = stat($new_timeline_wal_seg_path); +ok(defined($new_timeline_wal_seg_stat), + "new timeline segment $new_timeline_wal_seg exists in target after rewind" +); + +# Validate that the WAL segment with the same file name as the +# corrupted WAL segment in target has been copied from source +# where it was still intact. +my $corrupt_wal_seg_in_source_path = + $node_standby->data_dir . '/pg_wal/' . $corrupt_wal_seg; +my $corrupt_wal_seg_source_stat = stat($corrupt_wal_seg_in_source_path); +ok(defined($corrupt_wal_seg_source_stat), + "corrupted $corrupt_wal_seg exists in source after rewind"); + +my $corrupt_wal_seg_stat_after_rewind = stat($corrupt_wal_seg_in_target_path); +ok(defined($corrupt_wal_seg_stat_after_rewind), + "corrupted $corrupt_wal_seg exists in target after rewind"); +isnt( + $corrupt_wal_seg_stat_before_rewind->size, + $corrupt_wal_seg_source_stat->size, + "different size of corrupted $corrupt_wal_seg in source vs target before rewind" +); +is( $corrupt_wal_seg_stat_after_rewind->size, + $corrupt_wal_seg_source_stat->size, + "same size of corrupted $corrupt_wal_seg in source and target after rewind" +); + +done_testing(); -- cgit v1.2.3