summaryrefslogtreecommitdiff
path: root/src/include/storage/buf_internals.h
diff options
context:
space:
mode:
authorAndres Freund <andres@anarazel.de>2016-04-10 20:12:32 -0700
committerAndres Freund <andres@anarazel.de>2016-04-10 20:12:32 -0700
commit48354581a49c30f5757c203415aa8412d85b0f70 (patch)
treeca509a2c196f179e97993ac89979c361c4b5f431 /src/include/storage/buf_internals.h
parentcf223c3bf5ba16232147c66b5fef4037aafe747c (diff)
Allow Pin/UnpinBuffer to operate in a lockfree manner.
Pinning/Unpinning a buffer is a very frequent operation; especially in read-mostly cache resident workloads. Benchmarking shows that in various scenarios the spinlock protecting a buffer header's state becomes a significant bottleneck. The problem can be reproduced with pgbench -S on larger machines, but can be considerably worse for queries which touch the same buffers over and over at a high frequency (e.g. nested loops over a small inner table). To allow atomic operations to be used, cram BufferDesc's flags, usage_count, buf_hdr_lock, refcount into a single 32bit atomic variable; that allows to manipulate them together using 32bit compare-and-swap operations. This requires reducing MAX_BACKENDS to 2^18-1 (which could be lifted by using a 64bit field, but it's not a realistic configuration atm). As not all operations can easily implemented in a lockfree manner, implement the previous buf_hdr_lock via a flag bit in the atomic variable. That way we can continue to lock the header in places where it's needed, but can get away without acquiring it in the more frequent hot-paths. There's some additional operations which can be done without the lock, but aren't in this patch; but the most important places are covered. As bufmgr.c now essentially re-implements spinlocks, abstract the delay logic from s_lock.c into something more generic. It now has already two users, and more are coming up; there's a follupw patch for lwlock.c at least. This patch is based on a proof-of-concept written by me, which Alexander Korotkov made into a fully working patch; the committed version is again revised by me. Benchmarking and testing has, amongst others, been provided by Dilip Kumar, Alexander Korotkov, Robert Haas. On a large x86 system improvements for readonly pgbench, with a high client count, of a factor of 8 have been observed. Author: Alexander Korotkov and Andres Freund Discussion: 2400449.GjM57CE0Yg@dinodell
Diffstat (limited to 'src/include/storage/buf_internals.h')
-rw-r--r--src/include/storage/buf_internals.h101
1 files changed, 69 insertions, 32 deletions
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index d04363b1665..f8f71255fdd 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -21,29 +21,51 @@
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
+#include "port/atomics.h"
#include "storage/spin.h"
#include "utils/relcache.h"
/*
+ * Buffer state is a single 32-bit variable where following data is combined.
+ *
+ * - 18 bits refcount
+ * - 4 bits usage count
+ * - 10 bits of flags
+ *
+ * Combining these values allows to perform some operations without locking
+ * the buffer header, by modifying them together with a CAS loop.
+ *
+ * The definition of buffer state components is below.
+ */
+#define BUF_REFCOUNT_ONE 1
+#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+#define BUF_USAGECOUNT_MASK 0x003C0000U
+#define BUF_USAGECOUNT_ONE (1U << 18)
+#define BUF_USAGECOUNT_SHIFT 18
+#define BUF_FLAG_MASK 0xFFC00000U
+
+/* Get refcount and usagecount from buffer state */
+#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+
+/*
* Flags for buffer descriptors
*
* Note: TAG_VALID essentially means that there is a buffer hashtable
* entry associated with the buffer's tag.
*/
-#define BM_DIRTY (1 << 0) /* data needs writing */
-#define BM_VALID (1 << 1) /* data is valid */
-#define BM_TAG_VALID (1 << 2) /* tag is assigned */
-#define BM_IO_IN_PROGRESS (1 << 3) /* read or write in progress */
-#define BM_IO_ERROR (1 << 4) /* previous I/O failed */
-#define BM_JUST_DIRTIED (1 << 5) /* dirtied since write started */
-#define BM_PIN_COUNT_WAITER (1 << 6) /* have waiter for sole pin */
-#define BM_CHECKPOINT_NEEDED (1 << 7) /* must write for checkpoint */
-#define BM_PERMANENT (1 << 8) /* permanent relation (not
+#define BM_LOCKED (1U << 22) /* buffer header is locked */
+#define BM_DIRTY (1U << 23) /* data needs writing */
+#define BM_VALID (1U << 24) /* data is valid */
+#define BM_TAG_VALID (1U << 25) /* tag is assigned */
+#define BM_IO_IN_PROGRESS (1U << 26) /* read or write in progress */
+#define BM_IO_ERROR (1U << 27) /* previous I/O failed */
+#define BM_JUST_DIRTIED (1U << 28) /* dirtied since write started */
+#define BM_PIN_COUNT_WAITER (1U << 29) /* have waiter for sole pin */
+#define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */
+#define BM_PERMANENT (1U << 31) /* permanent relation (not
* unlogged) */
-
-typedef bits16 BufFlags;
-
/*
* The maximum allowed value of usage_count represents a tradeoff between
* accuracy and speed of the clock-sweep buffer management algorithm. A
@@ -113,18 +135,29 @@ typedef struct buftag
/*
* BufferDesc -- shared descriptor/state data for a single shared buffer.
*
- * Note: buf_hdr_lock must be held to examine or change the tag, flags,
- * usage_count, refcount, or wait_backend_pid fields. buf_id field never
- * changes after initialization, so does not need locking. freeNext is
- * protected by the buffer_strategy_lock not buf_hdr_lock. The LWLock can
- * take care of itself. The buf_hdr_lock is *not* used to control access to
- * the data in the buffer!
+ * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
+ * the tag, state or wait_backend_pid fields. In general, buffer header lock
+ * is a spinlock which is combined with flags, refcount and usagecount into
+ * single atomic variable. This layout allow us to do some operations in a
+ * single atomic operation, without actually acquiring and releasing spinlock;
+ * for instance, increase or decrease refcount. buf_id field never changes
+ * after initialization, so does not need locking. freeNext is protected by
+ * the buffer_strategy_lock not buffer header lock. The LWLock can take care
+ * of itself. The buffer header lock is *not* used to control access to the
+ * data in the buffer!
+ *
+ * It's assumed that nobody changes the state field while buffer header lock
+ * is held. Thus buffer header lock holder can do complex updates of the
+ * state variable in single write, simultaneously with lock release (cleaning
+ * BM_LOCKED flag). On the other hand, updating of state without holding
+ * buffer header lock is restricted to CAS, which insure that BM_LOCKED flag
+ * is not set. Atomic increment/decrement, OR/AND etc. are not allowed.
*
* An exception is that if we have the buffer pinned, its tag can't change
- * underneath us, so we can examine the tag without locking the spinlock.
+ * underneath us, so we can examine the tag without locking the buffer header.
* Also, in places we do one-time reads of the flags without bothering to
- * lock the spinlock; this is generally for situations where we don't expect
- * the flag bit being tested to be changing.
+ * lock the buffer header; this is generally for situations where we don't
+ * expect the flag bit being tested to be changing.
*
* We can't physically remove items from a disk page if another backend has
* the buffer pinned. Hence, a backend may need to wait for all other pins
@@ -142,13 +175,12 @@ typedef struct buftag
typedef struct BufferDesc
{
BufferTag tag; /* ID of page contained in buffer */
- BufFlags flags; /* see bit definitions above */
- uint8 usage_count; /* usage counter for clock sweep code */
- slock_t buf_hdr_lock; /* protects a subset of fields, see above */
- unsigned refcount; /* # of backends holding pins on buffer */
- int wait_backend_pid; /* backend PID of pin-count waiter */
-
int buf_id; /* buffer's index number (from 0) */
+
+ /* state of the tag, containing flags, refcount and usagecount */
+ pg_atomic_uint32 state;
+
+ int wait_backend_pid; /* backend PID of pin-count waiter */
int freeNext; /* link in freelist chain */
LWLock content_lock; /* to lock access to buffer contents */
@@ -202,11 +234,15 @@ extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
#define FREENEXT_NOT_IN_LIST (-2)
/*
- * Macros for acquiring/releasing a shared buffer header's spinlock.
- * Do not apply these to local buffers!
+ * Functions for acquiring/releasing a shared buffer header's spinlock. Do
+ * not apply these to local buffers!
*/
-#define LockBufHdr(bufHdr) SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
-#define UnlockBufHdr(bufHdr) SpinLockRelease(&(bufHdr)->buf_hdr_lock)
+extern uint32 LockBufHdr(BufferDesc *desc);
+#define UnlockBufHdr(desc, s) \
+ do { \
+ pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
+ pg_write_barrier(); \
+ } while (0)
/*
@@ -267,7 +303,8 @@ extern void IssuePendingWritebacks(WritebackContext *context);
extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
/* freelist.c */
-extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
+extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
+ uint32 *buf_state);
extern void StrategyFreeBuffer(BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
BufferDesc *buf);