summaryrefslogtreecommitdiff
path: root/drivers/char/random.c
diff options
context:
space:
mode:
authorAndrew Morton <akpm@osdl.org>2004-04-11 23:04:22 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2004-04-11 23:04:22 -0700
commit47b54fbff358a1d5ee4738cec8a53a08bead72e4 (patch)
treec169c876fcbd446edeea358e576dd627d7a4a1e8 /drivers/char/random.c
parentce334bb8f0f084112dcfe96214cacfa0afba7e10 (diff)
[PATCH] /dev/urandom scalability improvement
From: David Mosberger <davidm@napali.hpl.hp.com> Somebody recently pointed out a performance-anomaly to me where an unusual amount of time was being spent reading from /dev/urandom. The problem isn't really surprising as it happened only on >= 4-way machines and the random driver isn't terribly scalable the way it is written today. If scalability _really_ mattered, I suppose per-CPU data structures would be the way to go. However, I found that at least for 4-way machines, performance can be improved considerably with the attached patch. In particular, I saw the following performance on a 4-way ia64 machine: Test: 3 tasks running "dd if=/dev/urandom of=/dev/null bs=1024": throughput:
Diffstat (limited to 'drivers/char/random.c')
-rw-r--r--drivers/char/random.c51
1 files changed, 35 insertions, 16 deletions
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 117f195029a1..6941fdeb6a4b 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -490,12 +490,15 @@ static inline __u32 int_ln_12bits(__u32 word)
**********************************************************************/
struct entropy_store {
+ /* mostly-read data: */
+ struct poolinfo poolinfo;
+ __u32 *pool;
+
+ /* read-write data: */
+ spinlock_t lock ____cacheline_aligned_in_smp;
unsigned add_ptr;
int entropy_count;
int input_rotate;
- struct poolinfo poolinfo;
- __u32 *pool;
- spinlock_t lock;
};
/*
@@ -571,38 +574,54 @@ static void add_entropy_words(struct entropy_store *r, const __u32 *in,
static __u32 const twist_table[8] = {
0, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
- unsigned i;
- int new_rotate;
+ unsigned long i, add_ptr, tap1, tap2, tap3, tap4, tap5;
+ int new_rotate, input_rotate;
int wordmask = r->poolinfo.poolwords - 1;
- __u32 w;
+ __u32 w, next_w;
unsigned long flags;
+ /* Taps are constant, so we can load them without holding r->lock. */
+ tap1 = r->poolinfo.tap1;
+ tap2 = r->poolinfo.tap2;
+ tap3 = r->poolinfo.tap3;
+ tap4 = r->poolinfo.tap4;
+ tap5 = r->poolinfo.tap5;
+ next_w = *in++;
+
spin_lock_irqsave(&r->lock, flags);
+ prefetch_range(r->pool, wordmask);
+ input_rotate = r->input_rotate;
+ add_ptr = r->add_ptr;
while (nwords--) {
- w = rotate_left(r->input_rotate, *in++);
- i = r->add_ptr = (r->add_ptr - 1) & wordmask;
+ w = rotate_left(input_rotate, next_w);
+ if (nwords > 0)
+ next_w = *in++;
+ i = add_ptr = (add_ptr - 1) & wordmask;
/*
* Normally, we add 7 bits of rotation to the pool.
* At the beginning of the pool, add an extra 7 bits
* rotation, so that successive passes spread the
* input bits across the pool evenly.
*/
- new_rotate = r->input_rotate + 14;
+ new_rotate = input_rotate + 14;
if (i)
- new_rotate = r->input_rotate + 7;
- r->input_rotate = new_rotate & 31;
+ new_rotate = input_rotate + 7;
+ input_rotate = new_rotate & 31;
/* XOR in the various taps */
- w ^= r->pool[(i + r->poolinfo.tap1) & wordmask];
- w ^= r->pool[(i + r->poolinfo.tap2) & wordmask];
- w ^= r->pool[(i + r->poolinfo.tap3) & wordmask];
- w ^= r->pool[(i + r->poolinfo.tap4) & wordmask];
- w ^= r->pool[(i + r->poolinfo.tap5) & wordmask];
+ w ^= r->pool[(i + tap1) & wordmask];
+ w ^= r->pool[(i + tap2) & wordmask];
+ w ^= r->pool[(i + tap3) & wordmask];
+ w ^= r->pool[(i + tap4) & wordmask];
+ w ^= r->pool[(i + tap5) & wordmask];
w ^= r->pool[i];
r->pool[i] = (w >> 3) ^ twist_table[w & 7];
}
+ r->input_rotate = input_rotate;
+ r->add_ptr = add_ptr;
+
spin_unlock_irqrestore(&r->lock, flags);
}