summary refs log tree commit diff
path: root/src/backend/storage/smgr/md.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/storage/smgr/md.c')
-rw-r--r-- src/backend/storage/smgr/md.c 52
1 files changed, 43 insertions, 9 deletions
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 301a24bd66c..a25d07a6f76 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -359,11 +359,10 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
* During replay, we would delete the file and then recreate it, which is fine
* if the contents of the file were repopulated by subsequent WAL entries.
* But if we didn't WAL-log insertions, but instead relied on fsyncing the
- * file after populating it (as for instance CLUSTER and CREATE INDEX do),
- * the contents of the file would be lost forever. By leaving the empty file
- * until after the next checkpoint, we prevent reassignment of the relfilenode
- * number until it's safe, because relfilenode assignment skips over any
- * existing file.
+ * file after populating it (as we do at wal_level=minimal), the contents of
+ * the file would be lost forever. By leaving the empty file until after the
+ * next checkpoint, we prevent reassignment of the relfilenode number until
+ * it's safe, because relfilenode assignment skips over any existing file.
*
* We do not need to go through this dance for temp relations, though, because
* we never make WAL entries for temp rels, and so a temp rel poses no threat
@@ -1019,12 +1018,19 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
* mdimmedsync() -- Immediately sync a relation to stable storage.
*
* Note that only writes already issued are synced; this routine knows
- * nothing of dirty buffers that may exist inside the buffer manager.
+ * nothing of dirty buffers that may exist inside the buffer manager. We
+ * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
+ * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
+ * some segment, then mdtruncate() renders that segment inactive. If we
+ * crash before the next checkpoint syncs the newly-inactive segment, that
+ * segment may survive recovery, reintroducing unwanted data into the table.
*/
void
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
{
MdfdVec *v;
+ BlockNumber segno = 0;
+ bool active = true;
/*
* NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -1034,14 +1040,42 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
v = mdopen(reln, forknum, EXTENSION_FAIL);
+ /*
+ * Temporarily open inactive segments, then close them after sync. There
+ * may be some inactive segments left open after an fsync() error, but that
+ * is harmless. We don't bother to clean them up, since doing so would risk
+ * further trouble. The next mdclose() will soon close them.
+ */
while (v != NULL)
{
- if (FileSync(v->mdfd_vfd) < 0)
+ File vfd = v->mdfd_vfd;
+
+ if (active)
+ v = v->mdfd_chain;
+ else
+ {
+ Assert(v->mdfd_chain == NULL);
+ pfree(v);
+ v = NULL;
+ }
+
+ if (FileSync(vfd) < 0)
ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
- FilePathName(v->mdfd_vfd))));
- v = v->mdfd_chain;
+ FilePathName(vfd))));
+
+ /* Close inactive segments immediately */
+ if (!active)
+ FileClose(vfd);
+
+ segno++;
+
+ if (v == NULL)
+ {
+ v = _mdfd_openseg(reln, forknum, segno, 0);
+ active = false;
+ }
}
}