Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
-rw-r--r--  src/backend/storage/buffer/bufmgr.c  320
1 file changed, 280 insertions(+), 40 deletions(-)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 9c0ef67f6bb..6ba935a09af 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.224 2007/09/20 17:56:31 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.225 2007/09/25 20:03:37 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -52,11 +52,15 @@
#define LocalBufHdrGetBlock(bufHdr) \
LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
+/* Bits in SyncOneBuffer's return value */
+#define BUF_WRITTEN 0x01
+#define BUF_REUSABLE 0x02
+
/* GUC variables */
bool zero_damaged_pages = false;
-double bgwriter_lru_percent = 1.0;
-int bgwriter_lru_maxpages = 5;
+int bgwriter_lru_maxpages = 100;
+double bgwriter_lru_multiplier = 2.0;
long NDirectFileRead; /* some I/O's are direct file access. bypass
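The bgwriter_lru_percent GUC is gone: instead of scanning a fixed fraction of the pool on every round, the background writer now estimates the coming cycle's buffer allocations, scales that by the new bgwriter_lru_multiplier, and caps each round's writes at bgwriter_lru_maxpages (whose default rises from 5 to 100). A minimal stand-alone sketch of the arithmetic; the smoothed_alloc value is an illustrative assumption, not something read from a live server:

    #include <stdio.h>

    int
    main(void)
    {
        float   smoothed_alloc = 150.0f;    /* assumed moving average of allocations per cycle */
        double  lru_multiplier = 2.0;       /* default bgwriter_lru_multiplier */
        int     lru_maxpages = 100;         /* default bgwriter_lru_maxpages */
        int     upcoming_alloc_est;

        /* try to keep multiplier times the recent demand clean and ready */
        upcoming_alloc_est = (int) (smoothed_alloc * lru_multiplier);

        printf("aim for %d clean buffers, writing at most %d this round\n",
               upcoming_alloc_est, lru_maxpages);
        return 0;
    }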
@@ -79,7 +83,7 @@ static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy);
static void PinBuffer_Locked(volatile BufferDesc *buf);
static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner);
static void BufferSync(int flags);
-static bool SyncOneBuffer(int buf_id, bool skip_pinned);
+static int SyncOneBuffer(int buf_id, bool skip_recently_used);
static void WaitIO(volatile BufferDesc *buf);
static bool StartBufferIO(volatile BufferDesc *buf, bool forInput);
static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
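With SyncOneBuffer() now reporting two independent facts per buffer, its bool result becomes an int bitmask built from the BUF_WRITTEN and BUF_REUSABLE bits added above. A hedged sketch of decoding it; mock_sync_one_buffer is a hypothetical stand-in, since the real function is static to bufmgr.c:

    #include <stdio.h>

    #define BUF_WRITTEN     0x01
    #define BUF_REUSABLE    0x02

    /* stand-in: pretend odd buffers are pinned, and every fourth one was dirty */
    static int
    mock_sync_one_buffer(int buf_id)
    {
        if (buf_id % 2 != 0)
            return 0;
        return BUF_REUSABLE | ((buf_id % 4 == 0) ? BUF_WRITTEN : 0);
    }

    int
    main(void)
    {
        int     buf_id;

        for (buf_id = 0; buf_id < 5; buf_id++)
        {
            int     state = mock_sync_one_buffer(buf_id);

            printf("buffer %d: written=%d reusable=%d\n", buf_id,
                   (state & BUF_WRITTEN) != 0, (state & BUF_REUSABLE) != 0);
        }
        return 0;
    }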
@@ -1043,8 +1047,11 @@ BufferSync(int flags)
* Loop over all buffers again, and write the ones (still) marked with
* BM_CHECKPOINT_NEEDED. In this loop, we start at the clock sweep
* point since we might as well dump soon-to-be-recycled buffers first.
+ *
+ * Note that we don't read the buffer alloc count here --- that should
+ * be left untouched till the next BgBufferSync() call.
*/
- buf_id = StrategySyncStart();
+ buf_id = StrategySyncStart(NULL, NULL);
num_to_scan = NBuffers;
num_written = 0;
while (num_to_scan-- > 0)
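StrategySyncStart() now takes two out-parameters, for the clock-sweep pass count and the buffer-allocation counter; the checkpoint path above passes NULL for both so the allocation count is left for the next BgBufferSync() call to consume. A small sketch of that optional-out-parameter pattern against a mocked-up counter (not the real freelist.c internals):

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t complete_passes = 7;    /* pretend clock-sweep state */
    static uint32_t num_buf_alloc = 42;

    static int
    mock_sync_start(uint32_t *passes, uint32_t *alloc)
    {
        if (passes)
            *passes = complete_passes;
        if (alloc)
        {
            *alloc = num_buf_alloc;
            num_buf_alloc = 0;          /* reading the alloc count also resets it */
        }
        return 123;                     /* pretend clock-sweep position */
    }

    int
    main(void)
    {
        uint32_t    passes;
        uint32_t    alloc;

        mock_sync_start(NULL, NULL);        /* checkpoint-style call: counter untouched */
        mock_sync_start(&passes, &alloc);   /* bgwriter-style call */
        printf("passes=%u alloc=%u\n", passes, alloc);
        return 0;
    }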
@@ -1065,7 +1072,7 @@ BufferSync(int flags)
*/
if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
{
- if (SyncOneBuffer(buf_id, false))
+ if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
{
BgWriterStats.m_buf_written_checkpoints++;
num_written++;
@@ -1112,61 +1119,289 @@ BufferSync(int flags)
void
BgBufferSync(void)
{
- int buf_id;
+ /* info obtained from freelist.c */
+ int strategy_buf_id;
+ uint32 strategy_passes;
+ uint32 recent_alloc;
+
+ /*
+ * Information saved between calls so we can determine the strategy
+ * point's advance rate and avoid scanning already-cleaned buffers.
+ */
+ static bool saved_info_valid = false;
+ static int prev_strategy_buf_id;
+ static uint32 prev_strategy_passes;
+ static int next_to_clean;
+ static uint32 next_passes;
+
+ /* Moving averages of allocation rate and clean-buffer density */
+ static float smoothed_alloc = 0;
+ static float smoothed_density = 10.0;
+
+ /* Potentially these could be tunables, but for now, not */
+ float smoothing_samples = 16;
+ float scan_whole_pool_milliseconds = 120000.0;
+
+ /* Used to compute how far we scan ahead */
+ long strategy_delta;
+ int bufs_to_lap;
+ int bufs_ahead;
+ float scans_per_alloc;
+ int reusable_buffers_est;
+ int upcoming_alloc_est;
+ int min_scan_buffers;
+
+ /* Variables for the scanning loop proper */
int num_to_scan;
int num_written;
+ int reusable_buffers;
- /* Make sure we can handle the pin inside SyncOneBuffer */
- ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+ /*
+ * Find out where the freelist clock sweep currently is, and how
+ * many buffer allocations have happened since our last call.
+ */
+ strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
+
+ /* Report buffer alloc counts to pgstat */
+ BgWriterStats.m_buf_alloc += recent_alloc;
+
+ /*
+ * If we're not running the LRU scan, just stop after doing the
+ * stats stuff. We mark the saved state invalid so that we can recover
+ * sanely if LRU scan is turned back on later.
+ */
+ if (bgwriter_lru_maxpages <= 0)
+ {
+ saved_info_valid = false;
+ return;
+ }
+
+ /*
+ * Compute strategy_delta = how many buffers have been scanned by the
+ * clock sweep since last time. If first time through, assume none.
+ * Then see if we are still ahead of the clock sweep, and if so, how many
+ * buffers we could scan before we'd catch up with it and "lap" it.
+ * Note: the weird-looking coding of the xxx_passes comparisons is meant to
+ * avoid bogus behavior when the passes counts wrap around.
+ */
+ if (saved_info_valid)
+ {
+ int32 passes_delta = strategy_passes - prev_strategy_passes;
+
+ strategy_delta = strategy_buf_id - prev_strategy_buf_id;
+ strategy_delta += (long) passes_delta * NBuffers;
+ Assert(strategy_delta >= 0);
+
+ if ((int32) (next_passes - strategy_passes) > 0)
+ {
+ /* we're one pass ahead of the strategy point */
+ bufs_to_lap = strategy_buf_id - next_to_clean;
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
+ next_passes, next_to_clean,
+ strategy_passes, strategy_buf_id,
+ strategy_delta, bufs_to_lap);
+#endif
+ }
+ else if (next_passes == strategy_passes &&
+ next_to_clean >= strategy_buf_id)
+ {
+ /* on same pass, but ahead or at least not behind */
+ bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
+ next_passes, next_to_clean,
+ strategy_passes, strategy_buf_id,
+ strategy_delta, bufs_to_lap);
+#endif
+ }
+ else
+ {
+ /*
+ * We're behind, so skip forward to the strategy point
+ * and start cleaning from there.
+ */
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
+ next_passes, next_to_clean,
+ strategy_passes, strategy_buf_id,
+ strategy_delta);
+#endif
+ next_to_clean = strategy_buf_id;
+ next_passes = strategy_passes;
+ bufs_to_lap = NBuffers;
+ }
+ }
+ else
+ {
+ /*
+ * Initializing at startup or after LRU scanning had been off.
+ * Always start at the strategy point.
+ */
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
+ strategy_passes, strategy_buf_id);
+#endif
+ strategy_delta = 0;
+ next_to_clean = strategy_buf_id;
+ next_passes = strategy_passes;
+ bufs_to_lap = NBuffers;
+ }
+
+ /* Update saved info for next time */
+ prev_strategy_buf_id = strategy_buf_id;
+ prev_strategy_passes = strategy_passes;
+ saved_info_valid = true;
+
+ /*
+ * Compute how many buffers had to be scanned for each new allocation,
+ * ie, 1/density of reusable buffers, and track a moving average of that.
+ *
+ * If the strategy point didn't move, we don't update the density estimate.
+ */
+ if (strategy_delta > 0 && recent_alloc > 0)
+ {
+ scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
+ smoothed_density += (scans_per_alloc - smoothed_density) /
+ smoothing_samples;
+ }
+
+ /*
+ * Estimate how many reusable buffers there are between the current
+ * strategy point and where we've scanned ahead to, based on the
+ * smoothed density estimate.
+ */
+ bufs_ahead = NBuffers - bufs_to_lap;
+ reusable_buffers_est = (float) bufs_ahead / smoothed_density;
+
+ /*
+ * Track a moving average of recent buffer allocations. Here, rather
+ * than a true average we want a fast-attack, slow-decline behavior:
+ * we immediately follow any increase.
+ */
+ if (smoothed_alloc <= (float) recent_alloc)
+ smoothed_alloc = recent_alloc;
+ else
+ smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
+ smoothing_samples;
+
+ /* Scale the estimate by a GUC to allow more aggressive tuning. */
+ upcoming_alloc_est = smoothed_alloc * bgwriter_lru_multiplier;
/*
- * The purpose of this sweep is to ensure that buffers that
- * will be recycled soon are clean when needed; these buffers are the ones
- * just ahead of the StrategySyncStart point.
+ * Even in cases where there's been little or no buffer allocation
+ * activity, we want to make a small amount of progress through the buffer
+ * cache so that as many reusable buffers as possible are clean
+ * after an idle period.
*
- * This loop considers only unpinned buffers close to the clock sweep
- * point.
+ * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many
+ * times the BGW will be called during the scan_whole_pool time;
+ * slice the buffer pool into that many sections.
*/
- if (bgwriter_lru_percent > 0.0 && bgwriter_lru_maxpages > 0)
+ min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
+
+ if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
{
- num_to_scan = (int) ((NBuffers * bgwriter_lru_percent + 99) / 100);
- num_written = 0;
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
+ upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
+#endif
+ upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
+ }
+
+ /*
+ * Now write out dirty reusable buffers, working forward from the
+ * next_to_clean point, until we have lapped the strategy scan, or
+ * cleaned enough buffers to match our estimate of the next cycle's
+ * allocation requirements, or hit the bgwriter_lru_maxpages limit.
+ */
+
+ /* Make sure we can handle the pin inside SyncOneBuffer */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ num_to_scan = bufs_to_lap;
+ num_written = 0;
+ reusable_buffers = reusable_buffers_est;
- buf_id = StrategySyncStart();
+ /* Execute the LRU scan */
+ while (num_to_scan-- > 0 && reusable_buffers < upcoming_alloc_est)
+ {
+ int buffer_state = SyncOneBuffer(next_to_clean, true);
- while (num_to_scan-- > 0)
+ if (buffer_state & BUF_WRITTEN)
{
- if (SyncOneBuffer(buf_id, true))
+ reusable_buffers++;
+ if (++num_written >= bgwriter_lru_maxpages)
{
- if (++num_written >= bgwriter_lru_maxpages)
- {
- BgWriterStats.m_maxwritten_clean++;
- break;
- }
+ BgWriterStats.m_maxwritten_clean++;
+ break;
}
- if (++buf_id >= NBuffers)
- buf_id = 0;
}
- BgWriterStats.m_buf_written_clean += num_written;
+ else if (buffer_state & BUF_REUSABLE)
+ reusable_buffers++;
+
+ if (++next_to_clean >= NBuffers)
+ {
+ next_to_clean = 0;
+ next_passes++;
+ }
+ }
+
+ BgWriterStats.m_buf_written_clean += num_written;
+
+#ifdef BGW_DEBUG
+ elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
+ recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
+ smoothed_density, reusable_buffers_est, upcoming_alloc_est,
+ bufs_to_lap - num_to_scan - 1,
+ num_written,
+ reusable_buffers - reusable_buffers_est);
+#endif
+
+ /*
+ * Consider the above scan as being like a new allocation scan.
+ * Characterize its density and update the smoothed one based on it.
+ * This effectively halves the moving average period in cases where
+ * both the strategy and the background writer are doing some useful
+ * scanning, which is helpful because a long memory isn't as desirable
+ * for the density estimates.
+ */
+ strategy_delta = bufs_to_lap - num_to_scan - 1;
+ recent_alloc = reusable_buffers - reusable_buffers_est;
+ if (strategy_delta > 0 && recent_alloc > 0)
+ {
+ scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
+ smoothed_density += (scans_per_alloc - smoothed_density) /
+ smoothing_samples;
+
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
+ recent_alloc, strategy_delta, scans_per_alloc, smoothed_density);
+#endif
}
}
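The allocation average kept above is deliberately asymmetric: an increase in demand is adopted immediately, while decreases decay with a smoothing_samples time constant. A self-contained sketch of that fast-attack, slow-decline filter (the sample series is made up):

    #include <stdio.h>

    #define SMOOTHING_SAMPLES 16

    static float
    smooth(float smoothed, unsigned int recent_alloc)
    {
        /* fast attack: jump straight up to any higher sample */
        if (smoothed <= (float) recent_alloc)
            return (float) recent_alloc;
        /* slow decline: drift down toward lower samples */
        return smoothed + ((float) recent_alloc - smoothed) / SMOOTHING_SAMPLES;
    }

    int
    main(void)
    {
        unsigned int samples[] = {10, 10, 200, 50, 50, 50};
        float       smoothed = 0;
        int         i;

        for (i = 0; i < 6; i++)
        {
            smoothed = smooth(smoothed, samples[i]);
            printf("recent=%u smoothed=%.2f\n", samples[i], smoothed);
        }
        return 0;
    }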
/*
* SyncOneBuffer -- process a single buffer during syncing.
*
- * If skip_pinned is true, we don't write currently-pinned buffers, nor
+ * If skip_recently_used is true, we don't write currently-pinned buffers, nor
* buffers marked recently used, as these are not replacement candidates.
*
- * Returns true if buffer was written, else false. (This could be in error
- * if FlushBuffers finds the buffer clean after locking it, but we don't
- * care all that much.)
+ * Returns a bitmask containing the following flag bits:
+ * BUF_WRITTEN: we wrote the buffer.
+ * BUF_REUSABLE: buffer is available for replacement, ie, it has
+ * pin count 0 and usage count 0.
+ *
+ * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
+ * after locking it, but we don't care all that much.)
*
* Note: caller must have done ResourceOwnerEnlargeBuffers.
*/
-static bool
-SyncOneBuffer(int buf_id, bool skip_pinned)
+static int
+SyncOneBuffer(int buf_id, bool skip_recently_used)
{
volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
+ int result = 0;
/*
* Check whether buffer needs writing.
@@ -1178,16 +1413,21 @@ SyncOneBuffer(int buf_id, bool skip_pinned)
* upcoming changes and so we are not required to write such dirty buffer.
*/
LockBufHdr(bufHdr);
- if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
+
+ if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
+ result |= BUF_REUSABLE;
+ else if (skip_recently_used)
{
+ /* Caller told us not to write recently-used buffers */
UnlockBufHdr(bufHdr);
- return false;
+ return result;
}
- if (skip_pinned &&
- (bufHdr->refcount != 0 || bufHdr->usage_count != 0))
+
+ if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
{
+ /* It's clean, so nothing to do */
UnlockBufHdr(bufHdr);
- return false;
+ return result;
}
/*
@@ -1202,7 +1442,7 @@ SyncOneBuffer(int buf_id, bool skip_pinned)
LWLockRelease(bufHdr->content_lock);
UnpinBuffer(bufHdr, true);
- return true;
+ return result | BUF_WRITTEN;
}