From 5c60169a01af712b0b1aa1f5db3fcb8776b22d9f Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Wed, 23 Jun 2004 18:49:33 -0700
Subject: [PATCH] rcu lock update: Add per-cpu batch counter

From: Manfred Spraul

Below is one of the patches from my rcu lock update.  Jack Steiner tested
the first one on a 512p and it resolved the rcu cache line thrashing.  All
were tested on OSDL with STP.

Step one for reducing cacheline thrashing within rcupdate.c:

The current code uses the rcu_cpu_mask bitmap both for keeping track of
the cpus that haven't gone through a quiescent state and for checking
whether a cpu should look for quiescent states.  The bitmap is frequently
changed and the check is done by polling - together this causes cache
line thrashing.

If it's cheaper to access a (mostly) read-only cacheline than a cacheline
that is frequently dirtied, then the thrashing can be reduced by splitting
the rcu_cpu_mask bitmap into two cachelines: the patch adds a generation
counter and moves it into a separate cacheline.  This makes it possible to
remove all accesses to rcu_cpu_mask (in the read-write cacheline) from
rcu_pending and at least 50% of the accesses from
rcu_check_quiescent_state: rcu_pending and all but one call per cpu to
rcu_check_quiescent_state access only the read-only cacheline.  Probably
not enough for 512p, but it's a start, for just 128 bytes more memory use,
without slowing down rcu grace periods.  Obviously the read-only cacheline
is not really read-only: it's written once per grace period to indicate
that a new grace period is running.

Tests on an 8-way Pentium III with reaim showed some improvement:

oprofile hits:

Reference: http://khack.osdl.org/stp/293075/
      Hits        %
     23741   0.0994   rcu_pending
     19057   0.0798   rcu_check_quiescent_state
      6530   0.0273   rcu_check_callbacks

Patched: http://khack.osdl.org/stp/293076/
      8291   0.0579   rcu_pending
      5475   0.0382   rcu_check_quiescent_state
      3604   0.0252   rcu_check_callbacks

The total runtime differs between the two runs, so the percentages must be
compared: around 50% faster.  I've uninlined rcu_pending for the test.
Tested with reaim and kernbench.

Description:

- Per-cpu quiescbatch and qs_pending fields introduced: quiescbatch
  contains the number of the last quiescent period that the cpu has seen
  and qs_pending is set if the cpu has not yet reported the quiescent
  state for the current period.  With these two fields a cpu can test
  whether it should report a quiescent state without having to look at
  the frequently written rcu_cpu_mask bitmap.

- curbatch split into two fields: rcu_ctrlblk.batch.completed and
  rcu_ctrlblk.batch.cur.  This makes it possible to figure out whether a
  grace period is running (completed != cur) without accessing the
  rcu_cpu_mask bitmap.

- rcu_ctrlblk.maxbatch removed and replaced with a true/false next_pending
  flag: next_pending=1 means that another grace period should be started
  immediately after the end of the current one.  Previously this was
  achieved with maxbatch: curbatch == maxbatch meant don't start,
  curbatch != maxbatch meant start.  A flag improves readability: the only
  possible values for maxbatch were curbatch and curbatch+1.

- rcu_ctrlblk split into two cachelines for better performance.

- Common code from rcu_offline_cpu and rcu_check_quiescent_state merged
  into cpu_quiet.

- rcu_offline_cpu: spin_lock_irq replaced with spin_lock_bh; there are no
  accesses from irq context (and there are accesses to the spinlock with
  interrupts enabled from tasklet context).
- rcu_restart_cpu introduced, s390 should call it after changing nohz:
  theoretically the global batch counter could wrap around and end up at
  RCU_quiescbatch(cpu).  Then the cpu would not look for a quiescent state
  and rcu would lock up.

Signed-off-by: Manfred Spraul
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/rcupdate.h | 48 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 58048abd7446..f9981251d542 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -65,11 +65,18 @@ struct rcu_head {
 
 /* Control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
-	spinlock_t	mutex;		/* Guard this struct              */
-	long		curbatch;	/* Current batch number.          */
-	long		maxbatch;	/* Max requested batch number.    */
-	cpumask_t	rcu_cpu_mask;	/* CPUs that need to switch in order    */
-					/* for current batch to proceed.        */
+	/* "const" members: only changed when starting/ending a grace period */
+	struct {
+		long	cur;		/* Current batch number.              */
+		long	completed;	/* Number of the last completed batch */
+	} batch ____cacheline_maxaligned_in_smp;
+	/* remaining members: bookkeeping of the progress of the grace period */
+	struct {
+		spinlock_t	mutex;		/* Guard this struct          */
+		int	next_pending;	/* Is the next batch already waiting? */
+		cpumask_t	rcu_cpu_mask;	/* CPUs that need to switch   */
+					/* in order for current batch to proceed. */
+	} state ____cacheline_maxaligned_in_smp;
 };
 
 /* Is batch a before batch b ? */
@@ -90,9 +97,14 @@ static inline int rcu_batch_after(long a, long b)
  * curlist - current batch for which quiescent cycle started if any
  */
 struct rcu_data {
+	/* 1) quiescent state handling : */
+	long		quiescbatch;	/* Batch # for grace period */
 	long		qsctr;		/* User-mode/idle loop etc. */
 	long		last_qsctr;	/* value of qsctr at beginning */
 					/* of rcu grace period */
+	int		qs_pending;	/* core waits for quiesc state */
+
+	/* 2) batch handling */
 	long		batch;		/* Batch # for current RCU batch */
 	struct list_head  nxtlist;
 	struct list_head  curlist;
@@ -101,24 +113,31 @@ struct rcu_data {
 DECLARE_PER_CPU(struct rcu_data, rcu_data);
 extern struct rcu_ctrlblk rcu_ctrlblk;
 
+#define RCU_quiescbatch(cpu)	(per_cpu(rcu_data, (cpu)).quiescbatch)
 #define RCU_qsctr(cpu)		(per_cpu(rcu_data, (cpu)).qsctr)
 #define RCU_last_qsctr(cpu)	(per_cpu(rcu_data, (cpu)).last_qsctr)
+#define RCU_qs_pending(cpu)	(per_cpu(rcu_data, (cpu)).qs_pending)
 #define RCU_batch(cpu)		(per_cpu(rcu_data, (cpu)).batch)
 #define RCU_nxtlist(cpu)	(per_cpu(rcu_data, (cpu)).nxtlist)
 #define RCU_curlist(cpu)	(per_cpu(rcu_data, (cpu)).curlist)
 
-#define RCU_QSCTR_INVALID	0
-
 static inline int rcu_pending(int cpu)
 {
-	if ((!list_empty(&RCU_curlist(cpu)) &&
-	     rcu_batch_before(RCU_batch(cpu), rcu_ctrlblk.curbatch)) ||
-	    (list_empty(&RCU_curlist(cpu)) &&
-	     !list_empty(&RCU_nxtlist(cpu))) ||
-	    cpu_isset(cpu, rcu_ctrlblk.rcu_cpu_mask))
+	/* This cpu has pending rcu entries and the grace period
+	 * for them has completed.
+	 */
+	if (!list_empty(&RCU_curlist(cpu)) &&
+		!rcu_batch_before(rcu_ctrlblk.batch.completed,RCU_batch(cpu)))
+		return 1;
+	/* This cpu has no pending entries, but there are new entries */
+	if (list_empty(&RCU_curlist(cpu)) &&
+			!list_empty(&RCU_nxtlist(cpu)))
+		return 1;
+	/* The rcu core waits for a quiescent state from the cpu */
+	if (RCU_quiescbatch(cpu) != rcu_ctrlblk.batch.cur || RCU_qs_pending(cpu))
 		return 1;
-	else
-		return 0;
+	/* nothing to do */
+	return 0;
 }
 
 #define rcu_read_lock()		preempt_disable()
@@ -126,6 +145,7 @@ static inline int rcu_pending(int cpu)
 
 extern void rcu_init(void);
 extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_restart_cpu(int cpu);
 
 /* Exported interfaces */
 extern void FASTCALL(call_rcu(struct rcu_head *head,
-- 
cgit v1.2.3
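
Editor's note: for readers who want to see the bookkeeping from the description
in isolation, here is a minimal standalone userspace sketch.  It is not part of
the patch and not kernel code.  The names quiescbatch, qs_pending, cur and
completed mirror the patch; the batch_before() helper assumes the usual
signed-subtraction idiom behind rcu_batch_before() (whose body lies outside the
hunks shown), and needs_quiescent_state(), cpu_check() and main() are purely
illustrative stand-ins, not the real rcu_check_quiescent_state()/cpu_quiet()
code.

/*
 * Standalone model of the two-counter grace-period bookkeeping described
 * above.  Compile with: cc -o rcu-model rcu-model.c
 */
#include <stdio.h>

/* Assumed wraparound-safe comparison idiom: "a before b" as signed difference. */
static int batch_before(long a, long b)
{
	return (a - b) < 0;
}

struct ctrl {
	long cur;		/* current batch number (read-mostly cacheline) */
	long completed;		/* number of the last completed batch           */
};

struct cpu_state {
	long quiescbatch;	/* last batch number this cpu has acknowledged  */
	int  qs_pending;	/* cpu still owes a quiescent state              */
};

/* Polled check: touches only the read-mostly counters, never a shared bitmap. */
static int needs_quiescent_state(const struct ctrl *ctl, const struct cpu_state *cs)
{
	return cs->quiescbatch != ctl->cur || cs->qs_pending;
}

/* A cpu first notices the new grace period, then reports its quiescent state. */
static void cpu_check(const struct ctrl *ctl, struct cpu_state *cs, int cpu)
{
	if (cs->quiescbatch != ctl->cur) {	/* new grace period started */
		cs->qs_pending = 1;
		cs->quiescbatch = ctl->cur;
		printf("cpu%d: saw start of batch %ld\n", cpu, ctl->cur);
	} else if (cs->qs_pending) {		/* report the quiescent state */
		cs->qs_pending = 0;
		printf("cpu%d: quiescent state for batch %ld\n", cpu, ctl->cur);
	}
}

int main(void)
{
	struct ctrl ctl = { .cur = 1, .completed = 0 };	/* grace period running */
	struct cpu_state cpu0 = { .quiescbatch = 0, .qs_pending = 0 };

	while (needs_quiescent_state(&ctl, &cpu0))
		cpu_check(&ctl, &cpu0, 0);

	ctl.completed = ctl.cur;	/* all cpus reported: batch completes */
	printf("batch %ld completed, grace period running: %s\n",
	       ctl.completed, ctl.cur != ctl.completed ? "yes" : "no");
	printf("batch_before(1, 2) = %d\n", batch_before(1, 2));
	return 0;
}

The point of the model is the shape of the fast path: the polled check reads
only the cur/completed counters and the cpu's own per-cpu fields, so the
frequently written rcu_cpu_mask bitmap only has to be touched once per cpu per
grace period, when the quiescent state is actually reported.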