summaryrefslogtreecommitdiff
path: root/src/backend/storage
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/storage')
-rw-r--r-- src/backend/storage/buffer/bufmgr.c 125
-rw-r--r-- src/backend/storage/lmgr/lock.c 12
-rw-r--r-- src/backend/storage/smgr/md.c 52
-rw-r--r-- src/backend/storage/smgr/smgr.c 35
4 files changed, 209 insertions, 15 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 0e4a8ac800a..8e051fc9b46 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -65,7 +65,7 @@
#define BUF_WRITTEN 0x01
#define BUF_REUSABLE 0x02
-#define DROP_RELS_BSEARCH_THRESHOLD 20
+#define RELS_BSEARCH_THRESHOLD 20
typedef struct PrivateRefCountEntry
{
@@ -104,6 +104,19 @@ typedef struct CkptTsStatus
int index;
} CkptTsStatus;
+/*
+ * Type for array used to sort SMgrRelations
+ *
+ * Each element pairs a relation's RelFileNode (the sort/search key) with the
+ * SMgrRelation it came from, so that a buffer matched by relfilenode can be
+ * mapped back to the relation to flush it through.
+ *
+ * FlushRelationsAllBuffers shares the same comparator function with
+ * DropRelFileNodesAllBuffers. Pointer to this struct and RelFileNode must be
+ * compatible: the array is sorted and searched with rnode_comparator, and
+ * the bsearch key is a bare RelFileNode, so the comparator has to be able to
+ * treat the address of an element as the address of its rnode.
+ */
+typedef struct SMgrSortArray
+{
+ RelFileNode rnode; /* This must be the first member */
+ SMgrRelation srel; /* relation handed to FlushBuffer for matching buffers */
+} SMgrSortArray;
+
/* GUC variables */
bool zero_damaged_pages = false;
int bgwriter_lru_maxpages = 100;
@@ -2977,7 +2990,7 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
* an exactly determined value, as it depends on many factors (CPU and RAM
* speeds, amount of shared buffers etc.).
*/
- use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
+ use_bsearch = n > RELS_BSEARCH_THRESHOLD;
/* sort the list of rnodes if necessary */
if (use_bsearch)
@@ -3228,6 +3241,104 @@ FlushRelationBuffers(Relation rel)
}
/* ---------------------------------------------------------------------
+ * FlushRelationsAllBuffers
+ *
+ * This function flushes out of the buffer pool all the pages of all
+ * forks of the specified smgr relations. It's equivalent to calling
+ * FlushRelationBuffers once per fork per relation. The relations are
+ * assumed not to use local buffers.
+ *
+ * smgrs is an array of nrels relations; nothing is done when nrels is 0.
+ * The buffer pool (NBuffers descriptors) is scanned once, and every
+ * buffer that belongs to one of the relations and is both valid and
+ * dirty is written out with FlushBuffer while it is pinned and its
+ * content lock is held in shared mode.
+ * --------------------------------------------------------------------
+ */
+void
+FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
+{
+ int i;
+ SMgrSortArray *srels;
+ bool use_bsearch;
+
+ if (nrels == 0)
+ return;
+
+ /*
+ * fill-in array for qsort: pair each relation's RelFileNode with its
+ * SMgrRelation so a matching buffer can be mapped back to the relation
+ */
+ srels = palloc(sizeof(SMgrSortArray) * nrels);
+
+ for (i = 0; i < nrels; i++)
+ {
+ Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
+
+ srels[i].rnode = smgrs[i]->smgr_rnode.node;
+ srels[i].srel = smgrs[i];
+ }
+
+ /*
+ * Save the bsearch overhead for low number of relations to sync. See
+ * DropRelFileNodesAllBuffers for details.
+ */
+ use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
+
+ /* sort the list of SMgrRelations if necessary (bsearch needs it sorted) */
+ if (use_bsearch)
+ pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
+
+ /* Make sure we can handle the pin inside the loop */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ SMgrSortArray *srelent = NULL;
+ BufferDesc *bufHdr = GetBufferDescriptor(i);
+ uint32 buf_state;
+
+ /*
+ * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
+ * and saves some cycles.
+ *
+ * The buffer tag is read here without holding the buffer header lock;
+ * a match is rechecked below once the header lock has been taken. A
+ * linear scan of srels is used for a small number of relations,
+ * otherwise a bsearch of the sorted array.
+ */
+
+ if (!use_bsearch)
+ {
+ int j;
+
+ for (j = 0; j < nrels; j++)
+ {
+ if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
+ {
+ srelent = &srels[j];
+ break;
+ }
+ }
+
+ }
+ else
+ {
+ srelent = bsearch((const void *) &(bufHdr->tag.rnode),
+ srels, nrels, sizeof(SMgrSortArray),
+ rnode_comparator);
+ }
+
+ /* buffer doesn't belong to any of the given relfilenodes; skip it */
+ if (srelent == NULL)
+ continue;
+
+ ReservePrivateRefCountEntry();
+
+ buf_state = LockBufHdr(bufHdr); /* recheck the tag with the header locked */
+ if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
+ (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
+ {
+ PinBuffer_Locked(bufHdr); /* NOTE(review): presumably also drops the header lock, since only the else branch unlocks -- confirm */
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+ FlushBuffer(bufHdr, srelent->srel);
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+ UnpinBuffer(bufHdr, true);
+ }
+ else
+ UnlockBufHdr(bufHdr, buf_state); /* tag changed, or buffer not valid and dirty */
+ }
+
+ pfree(srels);
+}
+
+/* ---------------------------------------------------------------------
* FlushDatabaseBuffers
*
* This function writes all dirty pages of a database out to disk
@@ -3428,13 +3539,15 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
(pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
{
/*
- * If we're in recovery we cannot dirty a page because of a hint.
- * We can set the hint, just not dirty the page as a result so the
- * hint is lost when we evict the page or shutdown.
+ * If we must not write WAL, due to a relfilenode-specific
+ * condition or being in recovery, don't dirty the page. We can
+ * set the hint, just not dirty the page as a result so the hint
+ * is lost when we evict the page or shutdown.
*
* See src/backend/storage/page/README for longer discussion.
*/
- if (RecoveryInProgress())
+ if (RecoveryInProgress() ||
+ RelFileNodeSkippingWAL(bufHdr->tag.rnode))
return;
/*
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 98a2f366b4d..66ed9f44d41 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -563,6 +563,18 @@ DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2)
return false;
}
+#ifdef USE_ASSERT_CHECKING
+/*
+ * GetLockMethodLocalHash -- return the hash of local locks, for modules that
+ * evaluate assertions based on all locks held.
+ *
+ * The returned pointer is LockMethodLocalHash itself, not a copy. This
+ * function is only compiled when USE_ASSERT_CHECKING is defined, so any
+ * caller must be guarded by the same symbol.
+ */
+HTAB *
+GetLockMethodLocalHash(void)
+{
+ return LockMethodLocalHash;
+}
+#endif /* USE_ASSERT_CHECKING */
+
/*
* LockHasWaiters -- look up 'locktag' and check if releasing this
* lock would wake up other processes waiting for it.
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 301a24bd66c..a25d07a6f76 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -359,11 +359,10 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
* During replay, we would delete the file and then recreate it, which is fine
* if the contents of the file were repopulated by subsequent WAL entries.
* But if we didn't WAL-log insertions, but instead relied on fsyncing the
- * file after populating it (as for instance CLUSTER and CREATE INDEX do),
- * the contents of the file would be lost forever. By leaving the empty file
- * until after the next checkpoint, we prevent reassignment of the relfilenode
- * number until it's safe, because relfilenode assignment skips over any
- * existing file.
+ * file after populating it (as we do at wal_level=minimal), the contents of
+ * the file would be lost forever. By leaving the empty file until after the
+ * next checkpoint, we prevent reassignment of the relfilenode number until
+ * it's safe, because relfilenode assignment skips over any existing file.
*
* We do not need to go through this dance for temp relations, though, because
* we never make WAL entries for temp rels, and so a temp rel poses no threat
@@ -1019,12 +1018,19 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
* mdimmedsync() -- Immediately sync a relation to stable storage.
*
* Note that only writes already issued are synced; this routine knows
- * nothing of dirty buffers that may exist inside the buffer manager.
+ * nothing of dirty buffers that may exist inside the buffer manager. We
+ * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
+ * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
+ * some segment, then mdtruncate() renders that segment inactive. If we
+ * crash before the next checkpoint syncs the newly-inactive segment, that
+ * segment may survive recovery, reintroducing unwanted data into the table.
*/
void
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
{
MdfdVec *v;
+ BlockNumber segno = 0;
+ bool active = true;
/*
* NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -1034,14 +1040,42 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
v = mdopen(reln, forknum, EXTENSION_FAIL);
+ /*
+ * Temporarily open inactive segments, then close them after sync. Some
+ * inactive segments may be left open after an fsync() error, but that is
+ * harmless. We don't bother cleaning them up, since doing so would risk
+ * further trouble. The next mdclose() will soon close them.
+ */
while (v != NULL)
{
- if (FileSync(v->mdfd_vfd) < 0)
+ File vfd = v->mdfd_vfd;
+
+ if (active)
+ v = v->mdfd_chain;
+ else
+ {
+ Assert(v->mdfd_chain == NULL);
+ pfree(v);
+ v = NULL;
+ }
+
+ if (FileSync(vfd) < 0)
ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
- FilePathName(v->mdfd_vfd))));
- v = v->mdfd_chain;
+ FilePathName(vfd))));
+
+ /* Close inactive segments immediately */
+ if (!active)
+ FileClose(vfd);
+
+ segno++;
+
+ if (v == NULL)
+ {
+ v = _mdfd_openseg(reln, forknum, segno, 0);
+ active = false;
+ }
}
}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 97119f2063f..e5383722a65 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -409,6 +409,41 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
}
/*
+ * smgrdosyncall() -- Flush and immediately sync every fork of each relation
+ *
+ * The relations' buffers are first written out with one
+ * FlushRelationsAllBuffers() call; afterwards each relation's storage
+ * manager is asked to sync every fork it reports as existing.
+ *
+ * The outcome matches running FlushRelationBuffers() on each smgr relation
+ * and then smgrimmedsync() on all of its forks, but this batched form is
+ * significantly quicker and so should be preferred when possible.
+ *
+ * Does nothing when nrels is 0.
+ */
+void
+smgrdosyncall(SMgrRelation *rels, int nrels)
+{
+	int			relno;
+
+	if (nrels == 0)
+		return;
+
+	/* Write buffered pages first, so the sync below covers them. */
+	FlushRelationsAllBuffers(rels, nrels);
+
+	/* Now sync the physical file(s), one fork at a time. */
+	for (relno = 0; relno < nrels; relno++)
+	{
+		SMgrRelation reln = rels[relno];
+		int			which = reln->smgr_which;
+		ForkNumber	fork;
+
+		for (fork = 0; fork <= MAX_FORKNUM; fork++)
+		{
+			/* Skip forks the storage manager reports as nonexistent. */
+			if (!smgrsw[which].smgr_exists(reln, fork))
+				continue;
+
+			smgrsw[which].smgr_immedsync(reln, fork);
+		}
+	}
+}
+
+/*
* smgrdounlinkall() -- Immediately unlink all forks of all given relations
*
* All forks of all given relations are removed from the store. This