diff options
Diffstat (limited to 'src/bin/pg_combinebackup/reconstruct.c')
-rw-r--r-- | src/bin/pg_combinebackup/reconstruct.c | 687 |
1 files changed, 687 insertions, 0 deletions
diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c new file mode 100644 index 00000000000..6decdd89340 --- /dev/null +++ b/src/bin/pg_combinebackup/reconstruct.c @@ -0,0 +1,687 @@ +/*------------------------------------------------------------------------- + * + * reconstruct.c + * Reconstruct full file from incremental file and backup chain. + * + * Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/bin/pg_combinebackup/reconstruct.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include <unistd.h> + +#include "backup/basebackup_incremental.h" +#include "common/logging.h" +#include "common/file_perm.h" +#include "copy_file.h" +#include "lib/stringinfo.h" +#include "reconstruct.h" +#include "storage/block.h" + +/* + * An rfile stores the data that we need in order to be able to use some file + * on disk for reconstruction. For any given output file, we create one rfile + * per backup that we need to consult when we constructing that output file. + * + * If we find a full version of the file in the backup chain, then only + * filename and fd are initialized; the remaining fields are 0 or NULL. + * For an incremental file, header_length, num_blocks, relative_block_numbers, + * and truncation_block_length are also set. + * + * num_blocks_read and highest_offset_read always start out as 0. + */ +typedef struct rfile +{ + char *filename; + int fd; + size_t header_length; + unsigned num_blocks; + BlockNumber *relative_block_numbers; + unsigned truncation_block_length; + unsigned num_blocks_read; + off_t highest_offset_read; +} rfile; + +static void debug_reconstruction(int n_source, + rfile **sources, + bool dry_run); +static unsigned find_reconstructed_block_length(rfile *s); +static rfile *make_incremental_rfile(char *filename); +static rfile *make_rfile(char *filename, bool missing_ok); +static void write_reconstructed_file(char *input_filename, + char *output_filename, + unsigned block_length, + rfile **sourcemap, + off_t *offsetmap, + pg_checksum_context *checksum_ctx, + bool debug, + bool dry_run); +static void read_bytes(rfile *rf, void *buffer, unsigned length); + +/* + * Reconstruct a full file from an incremental file and a chain of prior + * backups. + * + * input_filename should be the path to the incremental file, and + * output_filename should be the path where the reconstructed file is to be + * written. + * + * relative_path should be the relative path to the directory containing this + * file. bare_file_name should be the name of the file within that directory, + * without "INCREMENTAL.". + * + * n_prior_backups is the number of prior backups, and prior_backup_dirs is + * an array of pathnames where those backups can be found. + */ +void +reconstruct_from_incremental_file(char *input_filename, + char *output_filename, + char *relative_path, + char *bare_file_name, + int n_prior_backups, + char **prior_backup_dirs, + manifest_data **manifests, + char *manifest_path, + pg_checksum_type checksum_type, + int *checksum_length, + uint8 **checksum_payload, + bool debug, + bool dry_run) +{ + rfile **source; + rfile *latest_source = NULL; + rfile **sourcemap; + off_t *offsetmap; + unsigned block_length; + unsigned i; + unsigned sidx = n_prior_backups; + bool full_copy_possible = true; + int copy_source_index = -1; + rfile *copy_source = NULL; + pg_checksum_context checksum_ctx; + + /* + * Every block must come either from the latest version of the file or + * from one of the prior backups. + */ + source = pg_malloc0(sizeof(rfile *) * (1 + n_prior_backups)); + + /* + * Use the information from the latest incremental file to figure out how + * long the reconstructed file should be. + */ + latest_source = make_incremental_rfile(input_filename); + source[n_prior_backups] = latest_source; + block_length = find_reconstructed_block_length(latest_source); + + /* + * For each block in the output file, we need to know from which file we + * need to obtain it and at what offset in that file it's stored. + * sourcemap gives us the first of these things, and offsetmap the latter. + */ + sourcemap = pg_malloc0(sizeof(rfile *) * block_length); + offsetmap = pg_malloc0(sizeof(off_t) * block_length); + + /* + * Every block that is present in the newest incremental file should be + * sourced from that file. If it precedes the truncation_block_length, + * it's a block that we would otherwise have had to find in an older + * backup and thus reduces the number of blocks remaining to be found by + * one; otherwise, it's an extra block that needs to be included in the + * output but would not have needed to be found in an older backup if it + * had not been present. + */ + for (i = 0; i < latest_source->num_blocks; ++i) + { + BlockNumber b = latest_source->relative_block_numbers[i]; + + Assert(b < block_length); + sourcemap[b] = latest_source; + offsetmap[b] = latest_source->header_length + (i * BLCKSZ); + + /* + * A full copy of a file from an earlier backup is only possible if no + * blocks are needed from any later incremental file. + */ + full_copy_possible = false; + } + + while (1) + { + char source_filename[MAXPGPATH]; + rfile *s; + + /* + * Move to the next backup in the chain. If there are no more, then + * we're done. + */ + if (sidx == 0) + break; + --sidx; + + /* + * Look for the full file in the previous backup. If not found, then + * look for an incremental file instead. + */ + snprintf(source_filename, MAXPGPATH, "%s/%s/%s", + prior_backup_dirs[sidx], relative_path, bare_file_name); + if ((s = make_rfile(source_filename, true)) == NULL) + { + snprintf(source_filename, MAXPGPATH, "%s/%s/INCREMENTAL.%s", + prior_backup_dirs[sidx], relative_path, bare_file_name); + s = make_incremental_rfile(source_filename); + } + source[sidx] = s; + + /* + * If s->header_length == 0, then this is a full file; otherwise, it's + * an incremental file. + */ + if (s->header_length == 0) + { + struct stat sb; + BlockNumber b; + BlockNumber blocklength; + + /* We need to know the length of the file. */ + if (fstat(s->fd, &sb) < 0) + pg_fatal("could not stat \"%s\": %m", s->filename); + + /* + * Since we found a full file, source all blocks from it that + * exist in the file. + * + * Note that there may be blocks that don't exist either in this + * file or in any incremental file but that precede + * truncation_block_length. These are, presumably, zero-filled + * blocks that result from the server extending the file but + * taking no action on those blocks that generated any WAL. + * + * Sadly, we have no way of validating that this is really what + * happened, and neither does the server. From it's perspective, + * an unmodified block that contains data looks exactly the same + * as a zero-filled block that never had any data: either way, + * it's not mentioned in any WAL summary and the server has no + * reason to read it. From our perspective, all we know is that + * nobody had a reason to back up the block. That certainly means + * that the block didn't exist at the time of the full backup, but + * the supposition that it was all zeroes at the time of every + * later backup is one that we can't validate. + */ + blocklength = sb.st_size / BLCKSZ; + for (b = 0; b < latest_source->truncation_block_length; ++b) + { + if (sourcemap[b] == NULL && b < blocklength) + { + sourcemap[b] = s; + offsetmap[b] = b * BLCKSZ; + } + } + + /* + * If a full copy looks possible, check whether the resulting file + * should be exactly as long as the source file is. If so, a full + * copy is acceptable, otherwise not. + */ + if (full_copy_possible) + { + uint64 expected_length; + + expected_length = + (uint64) latest_source->truncation_block_length; + expected_length *= BLCKSZ; + if (expected_length == sb.st_size) + { + copy_source = s; + copy_source_index = sidx; + } + } + + /* We don't need to consider any further sources. */ + break; + } + + /* + * Since we found another incremental file, source all blocks from it + * that we need but don't yet have. + */ + for (i = 0; i < s->num_blocks; ++i) + { + BlockNumber b = s->relative_block_numbers[i]; + + if (b < latest_source->truncation_block_length && + sourcemap[b] == NULL) + { + sourcemap[b] = s; + offsetmap[b] = s->header_length + (i * BLCKSZ); + + /* + * A full copy of a file from an earlier backup is only + * possible if no blocks are needed from any later incremental + * file. + */ + full_copy_possible = false; + } + } + } + + /* + * If a checksum of the required type already exists in the + * backup_manifest for the relevant input directory, we can save some work + * by reusing that checksum instead of computing a new one. + */ + if (copy_source_index >= 0 && manifests[copy_source_index] != NULL && + checksum_type != CHECKSUM_TYPE_NONE) + { + manifest_file *mfile; + + mfile = manifest_files_lookup(manifests[copy_source_index]->files, + manifest_path); + if (mfile == NULL) + { + char *path = psprintf("%s/backup_manifest", + prior_backup_dirs[copy_source_index]); + + /* + * The directory is out of sync with the backup_manifest, so emit + * a warning. + */ + /*- translator: the first %s is a backup manifest file, the second is a file absent therein */ + pg_log_warning("\"%s\" contains no entry for \"%s\"", + path, + manifest_path); + pfree(path); + } + else if (mfile->checksum_type == checksum_type) + { + *checksum_length = mfile->checksum_length; + *checksum_payload = pg_malloc(*checksum_length); + memcpy(*checksum_payload, mfile->checksum_payload, + *checksum_length); + checksum_type = CHECKSUM_TYPE_NONE; + } + } + + /* Prepare for checksum calculation, if required. */ + pg_checksum_init(&checksum_ctx, checksum_type); + + /* + * If the full file can be created by copying a file from an older backup + * in the chain without needing to overwrite any blocks or truncate the + * result, then forget about performing reconstruction and just copy that + * file in its entirety. + * + * Otherwise, reconstruct. + */ + if (copy_source != NULL) + copy_file(copy_source->filename, output_filename, + &checksum_ctx, dry_run); + else + { + write_reconstructed_file(input_filename, output_filename, + block_length, sourcemap, offsetmap, + &checksum_ctx, debug, dry_run); + debug_reconstruction(n_prior_backups + 1, source, dry_run); + } + + /* Save results of checksum calculation. */ + if (checksum_type != CHECKSUM_TYPE_NONE) + { + *checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH); + *checksum_length = pg_checksum_final(&checksum_ctx, + *checksum_payload); + } + + /* + * Close files and release memory. + */ + for (i = 0; i <= n_prior_backups; ++i) + { + rfile *s = source[i]; + + if (s == NULL) + continue; + if (close(s->fd) != 0) + pg_fatal("could not close \"%s\": %m", s->filename); + if (s->relative_block_numbers != NULL) + pfree(s->relative_block_numbers); + pg_free(s->filename); + } + pfree(sourcemap); + pfree(offsetmap); + pfree(source); +} + +/* + * Perform post-reconstruction logging and sanity checks. + */ +static void +debug_reconstruction(int n_source, rfile **sources, bool dry_run) +{ + unsigned i; + + for (i = 0; i < n_source; ++i) + { + rfile *s = sources[i]; + + /* Ignore source if not used. */ + if (s == NULL) + continue; + + /* If no data is needed from this file, we can ignore it. */ + if (s->num_blocks_read == 0) + continue; + + /* Debug logging. */ + if (dry_run) + pg_log_debug("would have read %u blocks from \"%s\"", + s->num_blocks_read, s->filename); + else + pg_log_debug("read %u blocks from \"%s\"", + s->num_blocks_read, s->filename); + + /* + * In dry-run mode, we don't actually try to read data from the file, + * but we do try to verify that the file is long enough that we could + * have read the data if we'd tried. + * + * If this fails, then it means that a non-dry-run attempt would fail, + * complaining of not being able to read the required bytes from the + * file. + */ + if (dry_run) + { + struct stat sb; + + if (fstat(s->fd, &sb) < 0) + pg_fatal("could not stat \"%s\": %m", s->filename); + if (sb.st_size < s->highest_offset_read) + pg_fatal("file \"%s\" is too short: expected %llu, found %llu", + s->filename, + (unsigned long long) s->highest_offset_read, + (unsigned long long) sb.st_size); + } + } +} + +/* + * When we perform reconstruction using an incremental file, the output file + * should be at least as long as the truncation_block_length. Any blocks + * present in the incremental file increase the output length as far as is + * necessary to include those blocks. + */ +static unsigned +find_reconstructed_block_length(rfile *s) +{ + unsigned block_length = s->truncation_block_length; + unsigned i; + + for (i = 0; i < s->num_blocks; ++i) + if (s->relative_block_numbers[i] >= block_length) + block_length = s->relative_block_numbers[i] + 1; + + return block_length; +} + +/* + * Initialize an incremental rfile, reading the header so that we know which + * blocks it contains. + */ +static rfile * +make_incremental_rfile(char *filename) +{ + rfile *rf; + unsigned magic; + + rf = make_rfile(filename, false); + + /* Read and validate magic number. */ + read_bytes(rf, &magic, sizeof(magic)); + if (magic != INCREMENTAL_MAGIC) + pg_fatal("file \"%s\" has bad incremental magic number (0x%x not 0x%x)", + filename, magic, INCREMENTAL_MAGIC); + + /* Read block count. */ + read_bytes(rf, &rf->num_blocks, sizeof(rf->num_blocks)); + if (rf->num_blocks > RELSEG_SIZE) + pg_fatal("file \"%s\" has block count %u in excess of segment size %u", + filename, rf->num_blocks, RELSEG_SIZE); + + /* Read truncation block length. */ + read_bytes(rf, &rf->truncation_block_length, + sizeof(rf->truncation_block_length)); + if (rf->truncation_block_length > RELSEG_SIZE) + pg_fatal("file \"%s\" has truncation block length %u in excess of segment size %u", + filename, rf->truncation_block_length, RELSEG_SIZE); + + /* Read block numbers if there are any. */ + if (rf->num_blocks > 0) + { + rf->relative_block_numbers = + pg_malloc0(sizeof(BlockNumber) * rf->num_blocks); + read_bytes(rf, rf->relative_block_numbers, + sizeof(BlockNumber) * rf->num_blocks); + } + + /* Remember length of header. */ + rf->header_length = sizeof(magic) + sizeof(rf->num_blocks) + + sizeof(rf->truncation_block_length) + + sizeof(BlockNumber) * rf->num_blocks; + + return rf; +} + +/* + * Allocate and perform basic initialization of an rfile. + */ +static rfile * +make_rfile(char *filename, bool missing_ok) +{ + rfile *rf; + + rf = pg_malloc0(sizeof(rfile)); + rf->filename = pstrdup(filename); + if ((rf->fd = open(filename, O_RDONLY | PG_BINARY, 0)) < 0) + { + if (missing_ok && errno == ENOENT) + { + pg_free(rf); + return NULL; + } + pg_fatal("could not open file \"%s\": %m", filename); + } + + return rf; +} + +/* + * Read the indicated number of bytes from an rfile into the buffer. + */ +static void +read_bytes(rfile *rf, void *buffer, unsigned length) +{ + unsigned rb = read(rf->fd, buffer, length); + + if (rb != length) + { + if (rb < 0) + pg_fatal("could not read file \"%s\": %m", rf->filename); + else + pg_fatal("could not read file \"%s\": read only %d of %d bytes", + rf->filename, (int) rb, length); + } +} + +/* + * Write out a reconstructed file. + */ +static void +write_reconstructed_file(char *input_filename, + char *output_filename, + unsigned block_length, + rfile **sourcemap, + off_t *offsetmap, + pg_checksum_context *checksum_ctx, + bool debug, + bool dry_run) +{ + int wfd = -1; + unsigned i; + unsigned zero_blocks = 0; + + /* Debugging output. */ + if (debug) + { + StringInfoData debug_buf; + unsigned start_of_range = 0; + unsigned current_block = 0; + + /* Basic information about the output file to be produced. */ + if (dry_run) + pg_log_debug("would reconstruct \"%s\" (%u blocks, checksum %s)", + output_filename, block_length, + pg_checksum_type_name(checksum_ctx->type)); + else + pg_log_debug("reconstructing \"%s\" (%u blocks, checksum %s)", + output_filename, block_length, + pg_checksum_type_name(checksum_ctx->type)); + + /* Print out the plan for reconstructing this file. */ + initStringInfo(&debug_buf); + while (current_block < block_length) + { + rfile *s = sourcemap[current_block]; + + /* Extend range, if possible. */ + if (current_block + 1 < block_length && + s == sourcemap[current_block + 1]) + { + ++current_block; + continue; + } + + /* Add details about this range. */ + if (s == NULL) + { + if (current_block == start_of_range) + appendStringInfo(&debug_buf, " %u:zero", current_block); + else + appendStringInfo(&debug_buf, " %u-%u:zero", + start_of_range, current_block); + } + else + { + if (current_block == start_of_range) + appendStringInfo(&debug_buf, " %u:%s@" UINT64_FORMAT, + current_block, + s == NULL ? "ZERO" : s->filename, + (uint64) offsetmap[current_block]); + else + appendStringInfo(&debug_buf, " %u-%u:%s@" UINT64_FORMAT, + start_of_range, current_block, + s == NULL ? "ZERO" : s->filename, + (uint64) offsetmap[current_block]); + } + + /* Begin new range. */ + start_of_range = ++current_block; + + /* If the output is very long or we are done, dump it now. */ + if (current_block == block_length || debug_buf.len > 1024) + { + pg_log_debug("reconstruction plan:%s", debug_buf.data); + resetStringInfo(&debug_buf); + } + } + + /* Free memory. */ + pfree(debug_buf.data); + } + + /* Open the output file, except in dry_run mode. */ + if (!dry_run && + (wfd = open(output_filename, + O_RDWR | PG_BINARY | O_CREAT | O_EXCL, + pg_file_create_mode)) < 0) + pg_fatal("could not open file \"%s\": %m", output_filename); + + /* Read and write the blocks as required. */ + for (i = 0; i < block_length; ++i) + { + uint8 buffer[BLCKSZ]; + rfile *s = sourcemap[i]; + unsigned wb; + + /* Update accounting information. */ + if (s == NULL) + ++zero_blocks; + else + { + s->num_blocks_read++; + s->highest_offset_read = Max(s->highest_offset_read, + offsetmap[i] + BLCKSZ); + } + + /* Skip the rest of this in dry-run mode. */ + if (dry_run) + continue; + + /* Read or zero-fill the block as appropriate. */ + if (s == NULL) + { + /* + * New block not mentioned in the WAL summary. Should have been an + * uninitialized block, so just zero-fill it. + */ + memset(buffer, 0, BLCKSZ); + } + else + { + unsigned rb; + + /* Read the block from the correct source, except if dry-run. */ + rb = pg_pread(s->fd, buffer, BLCKSZ, offsetmap[i]); + if (rb != BLCKSZ) + { + if (rb < 0) + pg_fatal("could not read file \"%s\": %m", s->filename); + else + pg_fatal("could not read file \"%s\": read only %d of %d bytes at offset %u", + s->filename, (int) rb, BLCKSZ, + (unsigned) offsetmap[i]); + } + } + + /* Write out the block. */ + if ((wb = write(wfd, buffer, BLCKSZ)) != BLCKSZ) + { + if (wb < 0) + pg_fatal("could not write file \"%s\": %m", output_filename); + else + pg_fatal("could not write file \"%s\": wrote only %d of %d bytes", + output_filename, (int) wb, BLCKSZ); + } + + /* Update the checksum computation. */ + if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0) + pg_fatal("could not update checksum of file \"%s\"", + output_filename); + } + + /* Debugging output. */ + if (zero_blocks > 0) + { + if (dry_run) + pg_log_debug("would have zero-filled %u blocks", zero_blocks); + else + pg_log_debug("zero-filled %u blocks", zero_blocks); + } + + /* Close the output file. */ + if (wfd >= 0 && close(wfd) != 0) + pg_fatal("could not close \"%s\": %m", output_filename); +} |