Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 320
1 files changed, 280 insertions, 40 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 9c0ef67f6bb..6ba935a09af 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.224 2007/09/20 17:56:31 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.225 2007/09/25 20:03:37 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -52,11 +52,15 @@
 #define LocalBufHdrGetBlock(bufHdr) \
     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
+/* Bits in SyncOneBuffer's return value */
+#define BUF_WRITTEN     0x01
+#define BUF_REUSABLE    0x02
+
 /* GUC variables */
 bool        zero_damaged_pages = false;
-double      bgwriter_lru_percent = 1.0;
-int         bgwriter_lru_maxpages = 5;
+int         bgwriter_lru_maxpages = 100;
+double      bgwriter_lru_multiplier = 2.0;
 
 long        NDirectFileRead;    /* some I/O's are direct file access. bypass
@@ -79,7 +83,7 @@ static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy);
 static void PinBuffer_Locked(volatile BufferDesc *buf);
 static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner);
 static void BufferSync(int flags);
-static bool SyncOneBuffer(int buf_id, bool skip_pinned);
+static int  SyncOneBuffer(int buf_id, bool skip_recently_used);
 static void WaitIO(volatile BufferDesc *buf);
 static bool StartBufferIO(volatile BufferDesc *buf, bool forInput);
 static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
@@ -1043,8 +1047,11 @@ BufferSync(int flags)
      * Loop over all buffers again, and write the ones (still) marked with
      * BM_CHECKPOINT_NEEDED.  In this loop, we start at the clock sweep
      * point since we might as well dump soon-to-be-recycled buffers first.
+     *
+     * Note that we don't read the buffer alloc count here --- that should
+     * be left untouched till the next BgBufferSync() call.
      */
-    buf_id = StrategySyncStart();
+    buf_id = StrategySyncStart(NULL, NULL);
     num_to_scan = NBuffers;
     num_written = 0;
     while (num_to_scan-- > 0)
@@ -1065,7 +1072,7 @@ BufferSync(int flags)
          */
         if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
         {
-            if (SyncOneBuffer(buf_id, false))
+            if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
             {
                 BgWriterStats.m_buf_written_checkpoints++;
                 num_written++;
@@ -1112,61 +1119,289 @@
 void
 BgBufferSync(void)
 {
-    int         buf_id;
+    /* info obtained from freelist.c */
+    int         strategy_buf_id;
+    uint32      strategy_passes;
+    uint32      recent_alloc;
+
+    /*
+     * Information saved between calls so we can determine the strategy
+     * point's advance rate and avoid scanning already-cleaned buffers.
+     */
+    static bool saved_info_valid = false;
+    static int  prev_strategy_buf_id;
+    static uint32 prev_strategy_passes;
+    static int  next_to_clean;
+    static uint32 next_passes;
+
+    /* Moving averages of allocation rate and clean-buffer density */
+    static float smoothed_alloc = 0;
+    static float smoothed_density = 10.0;
+
+    /* Potentially these could be tunables, but for now, not */
+    float       smoothing_samples = 16;
+    float       scan_whole_pool_milliseconds = 120000.0;
+
+    /* Used to compute how far we scan ahead */
+    long        strategy_delta;
+    int         bufs_to_lap;
+    int         bufs_ahead;
+    float       scans_per_alloc;
+    int         reusable_buffers_est;
+    int         upcoming_alloc_est;
+    int         min_scan_buffers;
+
+    /* Variables for the scanning loop proper */
     int         num_to_scan;
     int         num_written;
+    int         reusable_buffers;
 
-    /* Make sure we can handle the pin inside SyncOneBuffer */
-    ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+    /*
+     * Find out where the freelist clock sweep currently is, and how
+     * many buffer allocations have happened since our last call.
+     */
+    strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
+
+    /* Report buffer alloc counts to pgstat */
+    BgWriterStats.m_buf_alloc += recent_alloc;
+
+    /*
+     * If we're not running the LRU scan, just stop after doing the
+     * stats stuff.  We mark the saved state invalid so that we can recover
+     * sanely if LRU scan is turned back on later.
+     */
+    if (bgwriter_lru_maxpages <= 0)
+    {
+        saved_info_valid = false;
+        return;
+    }
+
+    /*
+     * Compute strategy_delta = how many buffers have been scanned by the
+     * clock sweep since last time.  If first time through, assume none.
+     * Then see if we are still ahead of the clock sweep, and if so, how many
+     * buffers we could scan before we'd catch up with it and "lap" it.
+     * Note: weird-looking coding of xxx_passes comparisons are to avoid
+     * bogus behavior when the passes counts wrap around.
+     */
+    if (saved_info_valid)
+    {
+        int32       passes_delta = strategy_passes - prev_strategy_passes;
+
+        strategy_delta = strategy_buf_id - prev_strategy_buf_id;
+        strategy_delta += (long) passes_delta * NBuffers;
+        Assert(strategy_delta >= 0);
+
+        if ((int32) (next_passes - strategy_passes) > 0)
+        {
+            /* we're one pass ahead of the strategy point */
+            bufs_to_lap = strategy_buf_id - next_to_clean;
+#ifdef BGW_DEBUG
+            elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
+                 next_passes, next_to_clean,
+                 strategy_passes, strategy_buf_id,
+                 strategy_delta, bufs_to_lap);
+#endif
+        }
+        else if (next_passes == strategy_passes &&
+                 next_to_clean >= strategy_buf_id)
+        {
+            /* on same pass, but ahead or at least not behind */
+            bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
+#ifdef BGW_DEBUG
+            elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
+                 next_passes, next_to_clean,
+                 strategy_passes, strategy_buf_id,
+                 strategy_delta, bufs_to_lap);
+#endif
+        }
+        else
+        {
+            /*
+             * We're behind, so skip forward to the strategy point
+             * and start cleaning from there.
+             */
+#ifdef BGW_DEBUG
+            elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
+                 next_passes, next_to_clean,
+                 strategy_passes, strategy_buf_id,
+                 strategy_delta);
+#endif
+            next_to_clean = strategy_buf_id;
+            next_passes = strategy_passes;
+            bufs_to_lap = NBuffers;
+        }
+    }
+    else
+    {
+        /*
+         * Initializing at startup or after LRU scanning had been off.
+         * Always start at the strategy point.
+         */
+#ifdef BGW_DEBUG
+        elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
+             strategy_passes, strategy_buf_id);
+#endif
+        strategy_delta = 0;
+        next_to_clean = strategy_buf_id;
+        next_passes = strategy_passes;
+        bufs_to_lap = NBuffers;
+    }
+
+    /* Update saved info for next time */
+    prev_strategy_buf_id = strategy_buf_id;
+    prev_strategy_passes = strategy_passes;
+    saved_info_valid = true;
+
+    /*
+     * Compute how many buffers had to be scanned for each new allocation,
+     * ie, 1/density of reusable buffers, and track a moving average of that.
+     *
+     * If the strategy point didn't move, we don't update the density estimate
+     */
+    if (strategy_delta > 0 && recent_alloc > 0)
+    {
+        scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
+        smoothed_density += (scans_per_alloc - smoothed_density) /
+            smoothing_samples;
+    }
+
+    /*
+     * Estimate how many reusable buffers there are between the current
+     * strategy point and where we've scanned ahead to, based on the
+     * smoothed density estimate.
+     */
+    bufs_ahead = NBuffers - bufs_to_lap;
+    reusable_buffers_est = (float) bufs_ahead / smoothed_density;
+
+    /*
+     * Track a moving average of recent buffer allocations.  Here, rather
+     * than a true average we want a fast-attack, slow-decline behavior:
+     * we immediately follow any increase.
+     */
+    if (smoothed_alloc <= (float) recent_alloc)
+        smoothed_alloc = recent_alloc;
+    else
+        smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
+            smoothing_samples;
+
+    /* Scale the estimate by a GUC to allow more aggressive tuning. */
+    upcoming_alloc_est = smoothed_alloc * bgwriter_lru_multiplier;
 
     /*
-     * The purpose of this sweep is to ensure that buffers that
-     * will be recycled soon are clean when needed; these buffers are the ones
-     * just ahead of the StrategySyncStart point.
+     * Even in cases where there's been little or no buffer allocation
+     * activity, we want to make a small amount of progress through the buffer
+     * cache so that as many reusable buffers as possible are clean
+     * after an idle period.
      *
-     * This loop considers only unpinned buffers close to the clock sweep
-     * point.
+     * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many
+     * times the BGW will be called during the scan_whole_pool time;
+     * slice the buffer pool into that many sections.
      */
-    if (bgwriter_lru_percent > 0.0 && bgwriter_lru_maxpages > 0)
+    min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
+
+    if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
     {
-        num_to_scan = (int) ((NBuffers * bgwriter_lru_percent + 99) / 100);
-        num_written = 0;
+#ifdef BGW_DEBUG
+        elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
+             upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
+#endif
+        upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
+    }
+
+    /*
+     * Now write out dirty reusable buffers, working forward from the
+     * next_to_clean point, until we have lapped the strategy scan, or
+     * cleaned enough buffers to match our estimate of the next cycle's
+     * allocation requirements, or hit the bgwriter_lru_maxpages limit.
+     */
+
+    /* Make sure we can handle the pin inside SyncOneBuffer */
+    ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+    num_to_scan = bufs_to_lap;
+    num_written = 0;
+    reusable_buffers = reusable_buffers_est;
 
-        buf_id = StrategySyncStart();
+    /* Execute the LRU scan */
+    while (num_to_scan-- > 0 && reusable_buffers < upcoming_alloc_est)
+    {
+        int         buffer_state = SyncOneBuffer(next_to_clean, true);
 
-        while (num_to_scan-- > 0)
+        if (buffer_state & BUF_WRITTEN)
         {
-            if (SyncOneBuffer(buf_id, true))
+            reusable_buffers++;
+            if (++num_written >= bgwriter_lru_maxpages)
             {
-                if (++num_written >= bgwriter_lru_maxpages)
-                {
-                    BgWriterStats.m_maxwritten_clean++;
-                    break;
-                }
+                BgWriterStats.m_maxwritten_clean++;
+                break;
             }
-            if (++buf_id >= NBuffers)
-                buf_id = 0;
         }
-        BgWriterStats.m_buf_written_clean += num_written;
+        else if (buffer_state & BUF_REUSABLE)
+            reusable_buffers++;
+
+        if (++next_to_clean >= NBuffers)
+        {
+            next_to_clean = 0;
+            next_passes++;
+        }
+    }
+
+    BgWriterStats.m_buf_written_clean += num_written;
+
+#ifdef BGW_DEBUG
+    elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
+         recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
+         smoothed_density, reusable_buffers_est, upcoming_alloc_est,
+         bufs_to_lap - num_to_scan - 1,
+         num_written,
+         reusable_buffers - reusable_buffers_est);
+#endif
+
+    /*
+     * Consider the above scan as being like a new allocation scan.
+     * Characterize its density and update the smoothed one based on it.
+     * This effectively halves the moving average period in cases where
+     * both the strategy and the background writer are doing some useful
+     * scanning, which is helpful because a long memory isn't as desirable
+     * on the density estimates.
+     */
+    strategy_delta = bufs_to_lap - num_to_scan - 1;
+    recent_alloc = reusable_buffers - reusable_buffers_est;
+    if (strategy_delta > 0 && recent_alloc > 0)
+    {
+        scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
+        smoothed_density += (scans_per_alloc - smoothed_density) /
+            smoothing_samples;
+
+#ifdef BGW_DEBUG
+        elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
+             recent_alloc, strategy_delta, scans_per_alloc, smoothed_density);
+#endif
     }
 }
 
 /*
  * SyncOneBuffer -- process a single buffer during syncing.
  *
- * If skip_pinned is true, we don't write currently-pinned buffers, nor
+ * If skip_recently_used is true, we don't write currently-pinned buffers, nor
  * buffers marked recently used, as these are not replacement candidates.
  *
- * Returns true if buffer was written, else false.  (This could be in error
- * if FlushBuffers finds the buffer clean after locking it, but we don't
- * care all that much.)
+ * Returns a bitmask containing the following flag bits:
+ *  BUF_WRITTEN: we wrote the buffer.
+ *  BUF_REUSABLE: buffer is available for replacement, ie, it has
+ *      pin count 0 and usage count 0.
+ *
+ * (BUF_WRITTEN could be set in error if FlushBuffers finds the buffer clean
+ * after locking it, but we don't care all that much.)
  *
  * Note: caller must have done ResourceOwnerEnlargeBuffers.
  */
-static bool
-SyncOneBuffer(int buf_id, bool skip_pinned)
+static int
+SyncOneBuffer(int buf_id, bool skip_recently_used)
 {
     volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
+    int         result = 0;
 
     /*
     * Check whether buffer needs writing.
@@ -1178,16 +1413,21 @@ SyncOneBuffer(int buf_id, bool skip_pinned)
      * upcoming changes and so we are not required to write such dirty buffer.
      */
     LockBufHdr(bufHdr);
-    if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
+
+    if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
+        result |= BUF_REUSABLE;
+    else if (skip_recently_used)
     {
+        /* Caller told us not to write recently-used buffers */
         UnlockBufHdr(bufHdr);
-        return false;
+        return result;
     }
-    if (skip_pinned &&
-        (bufHdr->refcount != 0 || bufHdr->usage_count != 0))
+
+    if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
     {
+        /* It's clean, so nothing to do */
         UnlockBufHdr(bufHdr);
-        return false;
+        return result;
     }
 
     /*
@@ -1202,7 +1442,7 @@ SyncOneBuffer(int buf_id, bool skip_pinned)
 
     LWLockRelease(bufHdr->content_lock);
     UnpinBuffer(bufHdr, true);
 
-    return true;
+    return result | BUF_WRITTEN;
 }
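A note on the StrategySyncStart() signature change above: the function now also reports the clock-sweep pass count and the number of buffer allocations since the last call through out parameters, and callers that only need the sweep position (like BufferSync) pass NULL for both. Below is a minimal standalone sketch of that NULL-tolerant out-parameter pattern; strategy_sync_start and its values are stand-ins, not the real freelist.c implementation.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for freelist.c's StrategySyncStart(); all values are made up. */
static int
strategy_sync_start(uint32_t *complete_passes, uint32_t *num_buf_alloc)
{
    /* fill the extra counters only if the caller asked for them */
    if (complete_passes)
        *complete_passes = 42;      /* clock-sweep passes so far */
    if (num_buf_alloc)
        *num_buf_alloc = 128;       /* allocations since last call */
    return 1000;                    /* current clock-sweep position */
}

int
main(void)
{
    uint32_t    passes;
    uint32_t    allocs;

    /* BgBufferSync-style call: wants everything */
    int         pos = strategy_sync_start(&passes, &allocs);

    /* BufferSync-style call: only needs the sweep position */
    int         pos2 = strategy_sync_start(NULL, NULL);

    printf("pos=%d passes=%u allocs=%u pos2=%d\n", pos, passes, allocs, pos2);
    return 0;
}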
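The "weird-looking coding of xxx_passes comparisons" the patch comments on is worth unpacking: the pass counters are uint32 values that eventually wrap around, so BgBufferSync() compares them by casting their difference to int32 instead of using a plain greater-than. A standalone sketch of why that works; the helper below is illustrative, not code from the patch.

#include <stdint.h>
#include <stdio.h>

/*
 * Wraparound-safe "is a ahead of b?" for uint32 counters: the unsigned
 * subtraction wraps modulo 2^32, and reinterpreting the difference as
 * int32 recovers the signed distance, as long as the two counters are
 * less than 2^31 apart.
 */
static int
passes_ahead(uint32_t a, uint32_t b)
{
    return (int32_t) (a - b) > 0;
}

int
main(void)
{
    printf("%d\n", passes_ahead(5, 4));             /* 1: plainly ahead */
    printf("%d\n", passes_ahead(0, UINT32_MAX));    /* 1: ahead across the wrap */
    printf("%d\n", passes_ahead(4, 5));             /* 0: behind */
    /* a naive "0 > UINT32_MAX" comparison would wrongly say "behind" */
    return 0;
}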
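The allocation-rate tracking above is deliberately not a plain moving average: increases are adopted immediately, while declines decay with a period of smoothing_samples calls, so a sudden burst of allocations raises the cleaning target at once but a quiet spell only lowers it gradually. A minimal sketch of that filter as a hypothetical helper (the real code inlines this logic in BgBufferSync()):

#include <stdio.h>

/*
 * Fast-attack, slow-decline filter: follow any increase at once, decay
 * toward lower inputs with time constant "samples" calls. Mirrors the
 * smoothed_alloc update in BgBufferSync(); this helper is hypothetical.
 */
static float
smooth_fast_attack(float smoothed, float recent, float samples)
{
    if (smoothed <= recent)
        return recent;      /* fast attack: jump straight to the new peak */
    /* slow decline: exponential decay with period "samples" */
    return smoothed + (recent - smoothed) / samples;
}

int
main(void)
{
    float       s = 0;
    float       inputs[] = {300, 300, 0, 0, 0, 0};
    int         i;

    for (i = 0; i < 6; i++)
    {
        s = smooth_fast_attack(s, inputs[i], 16);
        printf("recent=%.0f smoothed=%.2f\n", inputs[i], s);
    }
    /* climbs to 300 immediately, then decays: ~281, ~264, ~247, ... */
    return 0;
}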
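To see how the density estimate, the smoothed allocation rate, and the minimum-progress floor combine into a scan target, here is a worked example with assumed numbers: NBuffers = 16384 (128MB of shared buffers at 8kB pages), BgWriterDelay = 200ms, the patch's default bgwriter_lru_multiplier = 2.0, and invented values for the smoothed quantities. Everything below is illustrative, not from the patch.

#include <stdio.h>

int
main(void)
{
    int         NBuffers = 16384;
    int         BgWriterDelay = 200;       /* ms between BgBufferSync() calls */
    float       scan_whole_pool_milliseconds = 120000.0;
    double      bgwriter_lru_multiplier = 2.0;

    float       smoothed_alloc = 300.0;    /* assumed allocations per round */
    float       smoothed_density = 10.0;   /* 1 reusable buffer per 10 scanned */
    int         bufs_ahead = 5000;         /* assumed lead over the clock sweep */

    int         reusable_buffers_est = (int) (bufs_ahead / smoothed_density);
    int         upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
    int         min_scan_buffers = (int) (NBuffers /
                        (scan_whole_pool_milliseconds / BgWriterDelay));

    /* prints: reusable_est=500 upcoming_est=600 min_scan=27 */
    printf("reusable_est=%d upcoming_est=%d min_scan=%d\n",
           reusable_buffers_est, upcoming_alloc_est, min_scan_buffers);
    return 0;
}

With these numbers the minimum-progress floor (27 + 500 = 527) is below upcoming_alloc_est (600), so no bump is needed: the LRU scan runs until it believes 600 reusable buffers sit ahead of the sweep, or it laps the strategy point, or it hits bgwriter_lru_maxpages.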
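Finally, note the ordering in the reworked SyncOneBuffer(): reusability (pin count 0, usage count 0) is judged before cleanliness, so a clean idle buffer still reports BUF_REUSABLE even though nothing is written, and a pinned dirty buffer is still written when skip_recently_used is false (the checkpoint path). A simplified sketch of that decision ladder, using a stand-in struct instead of the real BufferDesc and omitting the header locking:

#include <stdbool.h>
#include <stdio.h>

#define BUF_WRITTEN     0x01
#define BUF_REUSABLE    0x02

/* stand-in for the relevant BufferDesc fields; not the real struct */
struct fake_buf
{
    int         refcount;
    int         usage_count;
    bool        valid;
    bool        dirty;
};

static int
sync_one(struct fake_buf *buf, bool skip_recently_used)
{
    int         result = 0;

    if (buf->refcount == 0 && buf->usage_count == 0)
        result |= BUF_REUSABLE;     /* replacement candidate */
    else if (skip_recently_used)
        return result;              /* pinned or recently used: skip it */

    if (!buf->valid || !buf->dirty)
        return result;              /* clean: nothing to write */

    /* ... the real code would call FlushBuffer() here ... */
    return result | BUF_WRITTEN;
}

int
main(void)
{
    struct fake_buf clean_idle = {0, 0, true, false};
    struct fake_buf dirty_idle = {0, 0, true, true};
    struct fake_buf dirty_pinned = {1, 3, true, true};

    printf("%d\n", sync_one(&clean_idle, true));    /* 2: reusable only */
    printf("%d\n", sync_one(&dirty_idle, true));    /* 3: reusable + written */
    printf("%d\n", sync_one(&dirty_pinned, true));  /* 0: skipped by LRU scan */
    printf("%d\n", sync_one(&dirty_pinned, false)); /* 1: checkpoint writes anyway */
    return 0;
}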