summary refs log tree commit diff
path: root/src/backend/storage/buffer/bufmgr.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
-rw-r--r-- src/backend/storage/buffer/bufmgr.c 256
1 files changed, 159 insertions, 97 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index fb09389e4eb..82fa49abc48 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,35 +8,23 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.221 2007/06/18 00:47:20 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.222 2007/06/28 00:02:38 tgl Exp $
*
*-------------------------------------------------------------------------
*/
/*
+ * Principal entry points:
+ *
* ReadBuffer() -- find or create a buffer holding the requested page,
* and pin it so that no one can destroy it while this process
* is using it.
*
- * ReadOrZeroBuffer() -- like ReadBuffer, but if the page is not already in
- * cache we don't read it, but just return a zeroed-out buffer. Useful
- * when the caller intends to fill the page from scratch, since this
- * saves I/O and avoids unnecessary failure if the page-on-disk has
- * corrupt page headers. Caution: do not use this to read a page that
- * is beyond the relation's current physical EOF; that is likely to
- * cause problems in md.c when the page is modified and written out.
- *
* ReleaseBuffer() -- unpin a buffer
*
* MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
* The disk write is delayed until buffer replacement or checkpoint.
*
- * BufferSync() -- flush all dirty buffers in the buffer pool.
- *
- * BgBufferSync() -- flush some dirty buffers in the buffer pool.
- *
- * InitBufferPool() -- Init the buffer module.
- *
- * See other files:
+ * See also these files:
* freelist.c -- chooses victim for buffer replacement
* buf_table.c -- manages the buffer lookup table
*/
@@ -64,16 +52,11 @@
#define LocalBufHdrGetBlock(bufHdr) \
LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
-/* interval for calling AbsorbFsyncRequests in BufferSync */
-#define WRITES_PER_ABSORB 1000
-
/* GUC variables */
bool zero_damaged_pages = false;
double bgwriter_lru_percent = 1.0;
-double bgwriter_all_percent = 0.333;
int bgwriter_lru_maxpages = 5;
-int bgwriter_all_maxpages = 5;
long NDirectFileRead; /* some I/O's are direct file access. bypass
@@ -95,6 +78,7 @@ static Buffer ReadBuffer_common(Relation reln, BlockNumber blockNum,
static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy);
static void PinBuffer_Locked(volatile BufferDesc *buf);
static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner);
+static void BufferSync(int flags);
static bool SyncOneBuffer(int buf_id, bool skip_pinned);
static void WaitIO(volatile BufferDesc *buf);
static bool StartBufferIO(volatile BufferDesc *buf, bool forInput);
@@ -143,8 +127,10 @@ ReadBufferWithStrategy(Relation reln, BlockNumber blockNum,
/*
* ReadOrZeroBuffer -- like ReadBuffer, but if the page isn't in buffer
* cache already, it's filled with zeros instead of reading it from
- * disk. The caller is expected to overwrite the whole buffer,
- * so that the current page contents are not interesting.
+ * disk. Useful when the caller intends to fill the page from scratch,
+ * since this saves I/O and avoids unnecessary failure if the
+ * page-on-disk has corrupt page headers.
+ *
* Caution: do not use this to read a page that is beyond the relation's
* current physical EOF; that is likely to cause problems in md.c when
* the page is modified and written out. P_NEW is OK, though.
@@ -644,7 +630,7 @@ BufferAlloc(Relation reln,
* at 1 so that the buffer can survive one clock-sweep pass.)
*/
buf->tag = newTag;
- buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
+ buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR);
buf->flags |= BM_TAG_VALID;
buf->usage_count = 1;
@@ -999,45 +985,114 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
* BufferSync -- Write out all dirty buffers in the pool.
*
* This is called at checkpoint time to write out all dirty shared buffers.
+ * The checkpoint request flags should be passed in; currently the only one
+ * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
*/
-void
-BufferSync(void)
+static void
+BufferSync(int flags)
{
int buf_id;
int num_to_scan;
- int absorb_counter;
+ int num_to_write;
+ int num_written;
+
+ /* Make sure we can handle the pin inside SyncOneBuffer */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
/*
- * Find out where to start the circular scan.
+ * Loop over all buffers, and mark the ones that need to be written with
+ * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_write), so that we
+ * can estimate how much work needs to be done.
+ *
+ * This allows us to write only those pages that were dirty when the
+ * checkpoint began, and not those that get dirtied while it proceeds.
+ * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
+ * later in this function, or by normal backends or the bgwriter cleaning
+ * scan, the flag is cleared. Any buffer dirtied after this point won't
+ * have the flag set.
+ *
+ * Note that if we fail to write some buffer, we may leave buffers with
+ * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer
+ * would certainly need to be written for the next checkpoint attempt,
+ * too.
*/
- buf_id = StrategySyncStart();
+ num_to_write = 0;
+ for (buf_id = 0; buf_id < NBuffers; buf_id++)
+ {
+ volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
- /* Make sure we can handle the pin inside SyncOneBuffer */
- ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+ /*
+ * Header spinlock is enough to examine BM_DIRTY, see comment in
+ * SyncOneBuffer.
+ */
+ LockBufHdr(bufHdr);
+
+ if (bufHdr->flags & BM_DIRTY)
+ {
+ bufHdr->flags |= BM_CHECKPOINT_NEEDED;
+ num_to_write++;
+ }
+
+ UnlockBufHdr(bufHdr);
+ }
+
+ if (num_to_write == 0)
+ return; /* nothing to do */
/*
- * Loop over all buffers.
+ * Loop over all buffers again, and write the ones (still) marked with
+ * BM_CHECKPOINT_NEEDED. In this loop, we start at the clock sweep
+ * point since we might as well dump soon-to-be-recycled buffers first.
*/
+ buf_id = StrategySyncStart();
num_to_scan = NBuffers;
- absorb_counter = WRITES_PER_ABSORB;
+ num_written = 0;
while (num_to_scan-- > 0)
{
- if (SyncOneBuffer(buf_id, false))
- {
- BgWriterStats.m_buf_written_checkpoints++;
+ volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
- /*
- * If in bgwriter, absorb pending fsync requests after each
- * WRITES_PER_ABSORB write operations, to prevent overflow of the
- * fsync request queue. If not in bgwriter process, this is a
- * no-op.
- */
- if (--absorb_counter <= 0)
+ /*
+ * We don't need to acquire the lock here, because we're only looking
+ * at a single bit. It's possible that someone else writes the buffer
+ * and clears the flag right after we check, but that doesn't matter
+ * since SyncOneBuffer will then do nothing. However, there is a
+ * further race condition: it's conceivable that between the time we
+ * examine the bit here and the time SyncOneBuffer acquires lock,
+ * someone else not only wrote the buffer but replaced it with another
+ * page and dirtied it. In that improbable case, SyncOneBuffer will
+ * write the buffer though we didn't need to. It doesn't seem
+ * worth guarding against this, though.
+ */
+ if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
+ {
+ if (SyncOneBuffer(buf_id, false))
{
- AbsorbFsyncRequests();
- absorb_counter = WRITES_PER_ABSORB;
+ BgWriterStats.m_buf_written_checkpoints++;
+ num_written++;
+
+ /*
+ * We know there are at most num_to_write buffers with
+ * BM_CHECKPOINT_NEEDED set; so we can stop scanning if
+ * num_written reaches num_to_write.
+ *
+ * Note that num_written doesn't include buffers written by
+ * other backends, or by the bgwriter cleaning scan. That
+ * means that the estimate of how much progress we've made is
+ * conservative, and also that this test will often fail to
+ * trigger. But it seems worth making anyway.
+ */
+ if (num_written >= num_to_write)
+ break;
+
+ /*
+ * Perform normal bgwriter duties and sleep to throttle
+ * our I/O rate.
+ */
+ CheckpointWriteDelay(flags,
+ (double) num_written / num_to_write);
}
}
+
if (++buf_id >= NBuffers)
buf_id = 0;
}
@@ -1051,8 +1106,7 @@ BufferSync(void)
void
BgBufferSync(void)
{
- static int buf_id1 = 0;
- int buf_id2;
+ int buf_id;
int num_to_scan;
int num_written;
@@ -1060,45 +1114,10 @@ BgBufferSync(void)
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
/*
- * To minimize work at checkpoint time, we want to try to keep all the
- * buffers clean; this motivates a scan that proceeds sequentially through
- * all buffers. But we are also charged with ensuring that buffers that
+ * The purpose of this sweep is to ensure that buffers that
* will be recycled soon are clean when needed; these buffers are the ones
- * just ahead of the StrategySyncStart point. We make a separate scan
- * through those.
- */
-
- /*
- * This loop runs over all buffers, including pinned ones. The starting
- * point advances through the buffer pool on successive calls.
+ * just ahead of the StrategySyncStart point.
*
- * Note that we advance the static counter *before* trying to write. This
- * ensures that, if we have a persistent write failure on a dirty buffer,
- * we'll still be able to make progress writing other buffers. (The
- * bgwriter will catch the error and just call us again later.)
- */
- if (bgwriter_all_percent > 0.0 && bgwriter_all_maxpages > 0)
- {
- num_to_scan = (int) ((NBuffers * bgwriter_all_percent + 99) / 100);
- num_written = 0;
-
- while (num_to_scan-- > 0)
- {
- if (++buf_id1 >= NBuffers)
- buf_id1 = 0;
- if (SyncOneBuffer(buf_id1, false))
- {
- if (++num_written >= bgwriter_all_maxpages)
- {
- BgWriterStats.m_maxwritten_all++;
- break;
- }
- }
- }
- BgWriterStats.m_buf_written_all += num_written;
- }
-
- /*
* This loop considers only unpinned buffers close to the clock sweep
* point.
*/
@@ -1107,22 +1126,22 @@ BgBufferSync(void)
num_to_scan = (int) ((NBuffers * bgwriter_lru_percent + 99) / 100);
num_written = 0;
- buf_id2 = StrategySyncStart();
+ buf_id = StrategySyncStart();
while (num_to_scan-- > 0)
{
- if (SyncOneBuffer(buf_id2, true))
+ if (SyncOneBuffer(buf_id, true))
{
if (++num_written >= bgwriter_lru_maxpages)
{
- BgWriterStats.m_maxwritten_lru++;
+ BgWriterStats.m_maxwritten_clean++;
break;
}
}
- if (++buf_id2 >= NBuffers)
- buf_id2 = 0;
+ if (++buf_id >= NBuffers)
+ buf_id = 0;
}
- BgWriterStats.m_buf_written_lru += num_written;
+ BgWriterStats.m_buf_written_clean += num_written;
}
}
@@ -1333,16 +1352,17 @@ PrintBufferLeakWarning(Buffer buffer)
}
/*
- * FlushBufferPool
+ * CheckPointBuffers
*
- * Flush all dirty blocks in buffer pool to disk at the checkpoint time.
- * Local relations do not participate in checkpoints, so they don't need to be
- * flushed.
+ * Flush all dirty blocks in buffer pool to disk at checkpoint time.
+ *
+ * Note: temporary relations do not participate in checkpoints, so they don't
+ * need to be flushed.
*/
void
-FlushBufferPool(void)
+CheckPointBuffers(int flags)
{
- BufferSync();
+ BufferSync(flags);
smgrsync();
}
@@ -1732,6 +1752,48 @@ FlushRelationBuffers(Relation rel)
}
}
+/* ---------------------------------------------------------------------
+ * FlushDatabaseBuffers
+ *
+ * This function writes all dirty pages of a database out to disk
+ * (or more accurately, out to kernel disk buffers), ensuring that the
+ * kernel has an up-to-date view of the database.
+ *
+ * Generally, the caller should be holding an appropriate lock to ensure
+ * no other backend is active in the target database; otherwise more
+ * pages could get dirtied.
+ *
+ * Note we don't worry about flushing any pages of temporary relations.
+ * It's assumed these wouldn't be interesting.
+ * --------------------------------------------------------------------
+ */
+void
+FlushDatabaseBuffers(Oid dbid)
+{
+ int i;
+ volatile BufferDesc *bufHdr;
+
+ /* Make sure we can handle the pin inside the loop */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ bufHdr = &BufferDescriptors[i];
+ LockBufHdr(bufHdr);
+ if (bufHdr->tag.rnode.dbNode == dbid &&
+ (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
+ {
+ PinBuffer_Locked(bufHdr);
+ LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+ FlushBuffer(bufHdr, NULL);
+ LWLockRelease(bufHdr->content_lock);
+ UnpinBuffer(bufHdr, true);
+ }
+ else
+ UnlockBufHdr(bufHdr);
+ }
+}
+
/*
* ReleaseBuffer -- release the pin on a buffer
*/
@@ -2131,7 +2193,7 @@ TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
Assert(buf->flags & BM_IO_IN_PROGRESS);
buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
- buf->flags &= ~BM_DIRTY;
+ buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
buf->flags |= set_flag_bits;
UnlockBufHdr(buf);