From 6d27f67bf6ee2b9ad0c8814118264bc273d916a1 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Mon, 12 Apr 2004 00:15:51 -0700
Subject: [PATCH] per-backing dev unplugging

From: Jens Axboe, Chris Mason, me, others.

The global unplug list causes horrid spinlock contention on many-disk
many-CPU setups - throughput is worse than halved.

The other problem with the global unplugging is of course that it will
cause the unplugging of queues which are unrelated to the I/O upon which
the caller is about to wait.

So what we do to solve these problems is to remove the global unplug and
set up the infrastructure under which the VFS can tell the block layer to
unplug only those queues which are relevant to the page or buffer_head
which is about to be waited upon.

We do this via the very appropriate address_space->backing_dev_info
structure.

Most of the complexity is in devicemapper, MD and swapper_space, because
for these backing devices, multiple queues may need to be unplugged to
complete a page/buffer I/O.  In each case we ensure that data structures
are in place to permit us to identify all the lower-level queues which
contribute to the higher-level backing_dev_info.  Each contributing queue
is told to unplug in response to a higher-level unplug.

To simplify things in various places we also introduce the concept of a
"synchronous BIO": it is tagged with BIO_RW_SYNC.  The block layer will
perform an immediate unplug when it sees one of these go past.
---
 include/linux/raid/md.h   |  1 +
 include/linux/raid/md_k.h | 26 --------------------------
 2 files changed, 1 insertion(+), 26 deletions(-)

(limited to 'include/linux/raid')

diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 240dc450dcd3..9c06e776cfc2 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -76,6 +76,7 @@ extern void md_handle_safemode(mddev_t *mddev);
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors);
 extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev);
+extern void md_unplug_mddev(mddev_t *mddev);
 extern void md_print_devices (void);
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index bea64b0fb6c1..42c973c53d04 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -326,7 +326,6 @@ do {									\
 		if (condition)						\
 			break;						\
 		spin_unlock_irq(&lock);					\
-		blk_run_queues();					\
 		schedule();						\
 		spin_lock_irq(&lock);					\
 	}								\
@@ -341,30 +340,5 @@ do {									\
 	__wait_event_lock_irq(wq, condition, lock);			\
 } while (0)
 
-
-#define __wait_disk_event(wq, condition)				\
-do {									\
-	wait_queue_t __wait;						\
-	init_waitqueue_entry(&__wait, current);				\
-									\
-	add_wait_queue(&wq, &__wait);					\
-	for (;;) {							\
-		set_current_state(TASK_UNINTERRUPTIBLE);		\
-		if (condition)						\
-			break;						\
-		blk_run_queues();					\
-		schedule();						\
-	}								\
-	current->state = TASK_RUNNING;					\
-	remove_wait_queue(&wq, &__wait);				\
-} while (0)
-
-#define wait_disk_event(wq, condition)					\
-do {									\
-	if (condition)							\
-		break;							\
-	__wait_disk_event(wq, condition);				\
-} while (0)
-
 #endif
-- 
cgit v1.2.3


From 66db15b4577185624ae95ffe99a66305c8c63ef7 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Mon, 12 Apr 2004 00:16:17 -0700
Subject: [PATCH] unplugging: md update

From: Neil Brown

I've made a bunch of changes to the 'md' bits - largely moving the
unplugging into the individual personalities which know more about which
drives are actually in use.
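
Every personality ends up with the same shape, so for orientation here is
a minimal standalone sketch of the pattern (stand-in types and names - the
real code uses mddev_t, mdk_rdev_t and request_queue_t): the array's own
queue gets an unplug_fn that walks just the member devices backing this
array and forwards the unplug to each of their queues.

	/* Sketch only - stand-in types, not the driver code. */
	struct queue {
		void (*unplug_fn)(struct queue *q);
		void *queuedata;		/* -> owning array */
	};

	struct member {
		int faulty;
		struct queue *q;		/* queue of the underlying disk */
	};

	struct array {
		int nr_members;
		struct member *member;
		struct queue *q;		/* the array's own queue */
	};

	/* Kick only the queues that actually back this array. */
	static void array_unplug_slaves(struct array *a)
	{
		int i;

		for (i = 0; i < a->nr_members; i++) {
			struct queue *rq = a->member[i].q;

			if (!a->member[i].faulty && rq->unplug_fn)
				rq->unplug_fn(rq);
		}
	}

	/* Installed at array start-up: a->q->unplug_fn = array_unplug; */
	static void array_unplug(struct queue *q)
	{
		array_unplug_slaves(q->queuedata);
	}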
---
 drivers/md/linear.c       | 15 +++++++++++++
 drivers/md/md.c           | 35 +++++------------------
 drivers/md/multipath.c    | 23 +++++++++++++++++++
 drivers/md/raid0.c        | 17 ++++++++++++++
 drivers/md/raid1.c        | 56 ++++++++++++++++++++++++++++++++++++++---------
 drivers/md/raid5.c        | 36 ++++++++++++++++++++++++++----
 drivers/md/raid6main.c    | 36 +++++++++++++++++++++++++++---
 include/linux/raid/md_k.h |  7 +++---
 8 files changed, 175 insertions(+), 50 deletions(-)

(limited to 'include/linux/raid')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 1198e07e7abe..e0aa017a26b7 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -80,6 +80,20 @@ static int linear_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio
 	return maxsectors << 9;
 }
 
+static void linear_unplug(request_queue_t *q)
+{
+	mddev_t *mddev = q->queuedata;
+	linear_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	for (i=0; i < mddev->raid_disks; i++) {
+		request_queue_t *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
+		if (r_queue->unplug_fn)
+			r_queue->unplug_fn(r_queue);
+	}
+}
+
+
 static int linear_run (mddev_t *mddev)
 {
 	linear_conf_t *conf;
@@ -185,6 +199,7 @@ static int linear_run (mddev_t *mddev)
 		BUG();
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
+	mddev->queue->unplug_fn = linear_unplug;
 	return 0;
 
 out:
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 72d6a2da5827..b521ca509b1e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -160,30 +160,6 @@ static int md_fail_request (request_queue_t *q, struct bio *bio)
 	return 0;
 }
 
-void md_unplug_mddev(mddev_t *mddev)
-{
-	struct list_head *tmp;
-	mdk_rdev_t *rdev;
-
-	/*
-	 * this list iteration is done without any locking in md?!
-	 */
-	ITERATE_RDEV(mddev, rdev, tmp) {
-		request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
-
-		if (r_queue->unplug_fn)
-			r_queue->unplug_fn(r_queue);
-	}
-}
-EXPORT_SYMBOL(md_unplug_mddev);
-
-static void md_unplug_all(request_queue_t *q)
-{
-	mddev_t *mddev = q->queuedata;
-
-	md_unplug_mddev(mddev);
-}
-
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
@@ -1669,7 +1645,6 @@ static int do_md_run(mddev_t * mddev)
 	 */
 	mddev->queue->queuedata = mddev;
 	mddev->queue->make_request_fn = mddev->pers->make_request;
-	mddev->queue->unplug_fn = md_unplug_all;
 
 	mddev->changed = 1;
 	return 0;
@@ -2742,10 +2717,9 @@ int md_thread(void * arg)
 
 		clear_bit(THREAD_WAKEUP, &thread->flags);
 
 		run = thread->run;
-		if (run) {
+		if (run)
 			run(thread->mddev);
-			md_unplug_mddev(thread->mddev);
-		}
+
 		if (signal_pending(current))
 			flush_signals(current);
 	}
@@ -3313,8 +3287,6 @@ static void md_do_sync(mddev_t *mddev)
 		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
 			break;
 
-		md_unplug_mddev(mddev);
-
 	repeat:
 		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
 			/* step marks */
@@ -3347,6 +3319,7 @@ static void md_do_sync(mddev_t *mddev)
 		 * about not overloading the IO subsystem. (things like an
 		 * e2fsck being done on the RAID array should execute fast)
 		 */
+		mddev->queue->unplug_fn(mddev->queue);
 		cond_resched();
 
 		currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
@@ -3365,6 +3338,8 @@ static void md_do_sync(mddev_t *mddev)
 	 * this also signals 'finished resyncing' to md_stop
 	 */
 out:
+	mddev->queue->unplug_fn(mddev->queue);
+
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
 	/* tell personality that we are finished */
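
The md_do_sync() hunks above rely on the cascade this series sets up: one
call through the array's own unplug_fn now reaches exactly the member
disks of this array, where the old md_unplug_mddev()/blk_run_queues()
combination prodded every queue in the system.  Roughly, with stand-in
names (a sketch, not the driver code):

	struct rq;				/* stand-in for request_queue_t */
	extern void rq_unplug(struct rq *q);	/* invokes q->unplug_fn if set */
	extern void issue_resync_reads(struct rq *q, long window);

	/* One pass of a throttled resync loop. */
	static void resync_step(struct rq *array_queue, long window)
	{
		issue_resync_reads(array_queue, window);	/* may sit plugged */
		rq_unplug(array_queue);		/* push the window to the disks */
		/* ...then the speed checks and cond_resched() throttling */
	}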
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index bf9980a8b1fd..9114c7c269ed 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -155,6 +155,27 @@ static int multipath_read_balance (multipath_conf_t *conf)
 	return 0;
 }
 
+static void unplug_slaves(mddev_t *mddev)
+{
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->multipaths[i].rdev;
+		if (rdev && !rdev->faulty) {
+			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+
+			if (r_queue->unplug_fn)
+				r_queue->unplug_fn(r_queue);
+		}
+	}
+}
+static void multipath_unplug(request_queue_t *q)
+{
+	unplug_slaves(q->queuedata);
+}
+
+
 static int multipath_make_request (request_queue_t *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
@@ -419,6 +440,8 @@ static int multipath_run (mddev_t *mddev)
 	}
 	memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks);
 
+	mddev->queue->unplug_fn = multipath_unplug;
+
 	conf->working_disks = 0;
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		disk_idx = rdev->raid_disk;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 3cbf14021820..5f4b8bfefc91 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -25,6 +25,21 @@
 #define MD_DRIVER
 #define MD_PERSONALITY
 
+static void raid0_unplug(request_queue_t *q)
+{
+	mddev_t *mddev = q->queuedata;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+	int i;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		request_queue_t *r_queue = bdev_get_queue(devlist[i]->bdev);
+
+		if (r_queue->unplug_fn)
+			r_queue->unplug_fn(r_queue);
+	}
+}
+
 static int create_strip_zones (mddev_t *mddev)
 {
 	int i, c, j;
@@ -202,6 +217,8 @@ static int create_strip_zones (mddev_t *mddev)
 		conf->hash_spacing = sz;
 	}
 
+	mddev->queue->unplug_fn = raid0_unplug;
+
 	printk("raid0: done.\n");
 	return 0;
 abort:
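
raid1, next, adds a wrinkle: its bio pool allocators unplug the member
queues when an allocation comes back empty, because completing the writes
already in flight is precisely what returns r1_bio structures to the pool.
As a standalone sketch (stand-in names; in the real code the retry and
wait are handled by the mempool machinery):

	#include <stddef.h>

	struct pool;
	extern void *pool_try_alloc(struct pool *p);	/* NULL when empty */
	extern void kick_member_queues(void);		/* cf. unplug_slaves() */
	extern void wait_for_pool_refill(struct pool *p);

	/* Allocation that cannot succeed until pending I/O completes. */
	static void *pool_alloc_kicking(struct pool *p)
	{
		void *obj;

		while ((obj = pool_try_alloc(p)) == NULL) {
			kick_member_queues();	/* let in-flight I/O finish */
			wait_for_pool_refill(p);
		}
		return obj;
	}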
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6616cd46c50f..bcc81ef13a35 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -37,6 +37,9 @@ static mdk_personality_t raid1_personality;
 static spinlock_t retry_list_lock = SPIN_LOCK_UNLOCKED;
 static LIST_HEAD(retry_list_head);
 
+static void unplug_slaves(mddev_t *mddev);
+
+
 static void * r1bio_pool_alloc(int gfp_flags, void *data)
 {
 	mddev_t *mddev = data;
@@ -47,6 +50,8 @@ static void * r1bio_pool_alloc(int gfp_flags, void *data)
 			gfp_flags);
 	if (r1_bio)
 		memset(r1_bio, 0, sizeof(*r1_bio) + sizeof(struct bio*)*mddev->raid_disks);
+	else
+		unplug_slaves(mddev);
 
 	return r1_bio;
 }
@@ -71,8 +76,10 @@ static void * r1buf_pool_alloc(int gfp_flags, void *data)
 	int i, j;
 
 	r1_bio = r1bio_pool_alloc(gfp_flags, conf->mddev);
-	if (!r1_bio)
+	if (!r1_bio) {
+		unplug_slaves(conf->mddev);
 		return NULL;
+	}
 
 	/*
 	 * Allocate bios : 1 for reading, n-1 for writing
@@ -443,6 +450,29 @@ rb_out:
 	return new_disk;
 }
 
+static void unplug_slaves(mddev_t *mddev)
+{
+	conf_t *conf = mddev_to_conf(mddev);
+	int i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&conf->device_lock, flags);
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+		if (rdev && !rdev->faulty) {
+			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+
+			if (r_queue->unplug_fn)
+				r_queue->unplug_fn(r_queue);
+		}
+	}
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+}
+static void raid1_unplug(request_queue_t *q)
+{
+	unplug_slaves(q->queuedata);
+}
+
 /*
  * Throttle resync depth, so that we can both get proper overlapping of
  * requests, but are still able to handle normal requests quickly.
@@ -451,16 +481,18 @@
 
 static void device_barrier(conf_t *conf, sector_t sect)
 {
-	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), conf->resync_lock);
+	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
+			    conf->resync_lock, unplug_slaves(conf->mddev));
 
 	if (!conf->barrier++) {
-		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, conf->resync_lock);
+		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
+				    conf->resync_lock, unplug_slaves(conf->mddev));
 		if (conf->nr_pending)
 			BUG();
 	}
-	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, conf->resync_lock);
+	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
+			    conf->resync_lock, unplug_slaves(conf->mddev));
 	conf->next_resync = sect;
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -479,9 +511,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
-	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
+	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
 	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
 
@@ -646,9 +677,9 @@ static void print_conf(conf_t *conf)
 
 static void close_sync(conf_t *conf)
 {
-	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
+	wait_event_lock_irq(conf->wait_resume, !conf->barrier,
+			    conf->resync_lock, unplug_slaves(conf->mddev));
 	spin_unlock_irq(&conf->resync_lock);
 
 	if (conf->barrier)
 		BUG();
@@ -862,6 +893,7 @@ static void raid1d(mddev_t *mddev)
 	struct bio *bio;
 	unsigned long flags;
 	conf_t *conf = mddev_to_conf(mddev);
+	int unplug=0;
 	mdk_rdev_t *rdev;
 
 	md_check_recovery(mddev);
@@ -881,6 +913,7 @@ static void raid1d(mddev_t *mddev)
 		bio = r1_bio->master_bio;
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
+			unplug = 1;
 		} else {
 			if (map(mddev, &rdev) == -1) {
 				printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
@@ -896,12 +929,14 @@ static void raid1d(mddev_t *mddev)
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_sector = r1_bio->sector + rdev->data_offset;
 				bio->bi_rw = READ;
-
+				unplug = 1;
 				generic_make_request(bio);
 			}
 		}
 	}
 	spin_unlock_irqrestore(&retry_list_lock, flags);
+	if (unplug)
+		unplug_slaves(mddev);
 }
 
@@ -1104,6 +1139,7 @@ static int run(mddev_t *mddev)
 			mdname(mddev));
 		goto out_free_conf;
 	}
+	mddev->queue->unplug_fn = raid1_unplug;
 
 	ITERATE_RDEV(mddev, rdev, tmp) {
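
The raid1d() change above is the pattern the daemons now follow: note
whether the loop issued any I/O and do a single unplug of the member
queues on the way out, instead of md_thread() unplugging unconditionally
after every run.  The skeleton, with stand-in names (a sketch, not the
driver code):

	struct work;
	extern struct work *next_retry(void);	/* NULL when the list is empty */
	extern void resubmit(struct work *w);	/* queues I/O, possibly plugged */
	extern void kick_member_queues(void);	/* cf. unplug_slaves() */

	static void arrayd(void)
	{
		struct work *w;
		int unplug = 0;

		while ((w = next_retry()) != NULL) {
			resubmit(w);
			unplug = 1;		/* something was queued */
		}
		if (unplug)			/* one kick for the whole batch */
			kick_member_queues();
	}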
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5c9d3fd66913..05087b8ae056 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -231,6 +231,8 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
 	return NULL;
 }
 
+static void unplug_slaves(mddev_t *mddev);
+
 static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector,
 					     int pd_idx, int noblock)
 {
@@ -249,12 +251,13 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 				break;
 			if (!sh) {
 				conf->inactive_blocked = 1;
-				md_unplug_mddev(conf->mddev);
 				wait_event_lock_irq(conf->wait_for_stripe,
 						    !list_empty(&conf->inactive_list) &&
 						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
 						     || !conf->inactive_blocked),
-						    conf->device_lock);
+						    conf->device_lock,
+						    unplug_slaves(conf->mddev);
+					);
 				conf->inactive_blocked = 0;
 			} else
 				init_stripe(sh, sector, pd_idx);
@@ -1293,6 +1296,25 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
 		}
 	}
 }
+
+static void unplug_slaves(mddev_t *mddev)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->disks[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			if (bdev) {
+				request_queue_t *r_queue = bdev_get_queue(bdev);
+				if (r_queue && r_queue->unplug_fn)
+					r_queue->unplug_fn(r_queue);
+			}
+		}
+	}
+}
+
 static void raid5_unplug_device(request_queue_t *q)
 {
 	mddev_t *mddev = q->queuedata;
@@ -1306,6 +1328,8 @@ static void raid5_unplug_device(request_queue_t *q)
 		md_wakeup_thread(mddev->thread);
 
 	spin_unlock_irqrestore(&conf->device_lock, flags);
+
+	unplug_slaves(mddev);
 }
 
 static inline void raid5_plug_device(raid5_conf_t *conf)
@@ -1392,9 +1416,11 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks-1;
 
-	if (sector_nr >= mddev->size <<1)
-		/* just being told to finish up .. nothing to do */
+	if (sector_nr >= mddev->size <<1) {
+		/* just being told to finish up .. nothing much to do */
+		unplug_slaves(mddev);
 		return 0;
+	}
 
 	x = sector_nr;
 	chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1474,6 +1500,8 @@ static void raid5d (mddev_t *mddev)
 
 	spin_unlock_irq(&conf->device_lock);
 
+	unplug_slaves(mddev);
+
 	PRINTK("--- raid5d inactive\n");
 }
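
raid6main.c below mirrors the raid5.c changes.  The detail worth pausing
on is why unplug_slaves() is passed into the wait in get_active_stripe():
a free stripe only appears when member I/O completes, so the kick has to
happen inside the wait loop, after the lock is dropped, or the waiter
could sleep behind the very I/O it needs.  The shape of it, as a
standalone sketch (stand-in names, not the driver code):

	extern int resource_available(void);	/* e.g. a free stripe */
	extern void lock(void);
	extern void unlock(void);
	extern void kick_member_queues(void);	/* the "cmd" step */
	extern void sleep_until_woken(void);

	static void wait_for_resource(void)
	{
		lock();
		while (!resource_available()) {
			unlock();
			kick_member_queues();	/* make completion possible */
			sleep_until_woken();
			lock();
		}
		unlock();
	}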
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 131f4a1f34eb..99d08d67342f 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -250,6 +250,8 @@ static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector)
 	return NULL;
 }
 
+static void unplug_slaves(mddev_t *mddev);
+
 static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector,
 					     int pd_idx, int noblock)
 {
@@ -272,7 +274,9 @@ static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector
 						    !list_empty(&conf->inactive_list) &&
 						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
 						     || !conf->inactive_blocked),
-						    conf->device_lock);
+						    conf->device_lock,
+						    unplug_slaves(conf->mddev);
+					);
 				conf->inactive_blocked = 0;
 			} else
 				init_stripe(sh, sector, pd_idx);
@@ -1454,6 +1458,26 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf)
 		}
 	}
 }
+
+static void unplug_slaves(mddev_t *mddev)
+{
+	/* note: this is always called with device_lock held */
+	raid6_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->disks[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			if (bdev) {
+				request_queue_t *r_queue = bdev_get_queue(bdev);
+				if (r_queue && r_queue->unplug_fn)
+					r_queue->unplug_fn(r_queue);
+			}
+		}
+	}
+}
+
 static void raid6_unplug_device(request_queue_t *q)
 {
 	mddev_t *mddev = q->queuedata;
@@ -1467,6 +1491,8 @@ static void raid6_unplug_device(request_queue_t *q)
 		md_wakeup_thread(mddev->thread);
 
 	spin_unlock_irqrestore(&conf->device_lock, flags);
+
+	unplug_slaves(mddev);
 }
 
 static inline void raid6_plug_device(raid6_conf_t *conf)
@@ -1553,9 +1579,11 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks - 2;
 
-	if (sector_nr >= mddev->size <<1)
-		/* just being told to finish up .. nothing to do */
+	if (sector_nr >= mddev->size <<1) {
+		/* just being told to finish up .. nothing much to do */
+		unplug_slaves(mddev);
 		return 0;
+	}
 
 	x = sector_nr;
 	chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1635,6 +1663,8 @@ static void raid6d (mddev_t *mddev)
 
 	spin_unlock_irq(&conf->device_lock);
 
+	unplug_slaves(mddev);
+
 	PRINTK("--- raid6d inactive\n");
 }
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 42c973c53d04..0b6b5e6f34eb 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -315,7 +315,7 @@ typedef struct mdk_thread_s {
 
 #define THREAD_WAKEUP 0
 
-#define __wait_event_lock_irq(wq, condition, lock)			\
+#define __wait_event_lock_irq(wq, condition, lock, cmd)			\
 do {									\
 	wait_queue_t __wait;						\
 	init_waitqueue_entry(&__wait, current);				\
@@ -326,6 +326,7 @@ do {									\
 		if (condition)						\
 			break;						\
 		spin_unlock_irq(&lock);					\
+		cmd;							\
 		schedule();						\
 		spin_lock_irq(&lock);					\
 	}								\
@@ -333,11 +334,11 @@ do {									\
 	remove_wait_queue(&wq, &__wait);				\
 } while (0)
 
-#define wait_event_lock_irq(wq, condition, lock)			\
+#define wait_event_lock_irq(wq, condition, lock, cmd)			\
 do {									\
 	if (condition)							\
 		break;							\
-	__wait_event_lock_irq(wq, condition, lock);			\
+	__wait_event_lock_irq(wq, condition, lock, cmd);		\
 } while (0)
 
 #endif
-- 
cgit v1.2.3
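
For reference, the extended wait_event_lock_irq() boils down to "drop the
lock, run cmd, sleep, retake the lock" each time the condition is found
false.  A compilable userspace model of the same idea, with pthreads
standing in for the kernel's waitqueues and spinlocks (a sketch under
those assumptions, not the kernel macro):

	#include <pthread.h>

	/*
	 * Caller holds mtx, as the kernel callers hold the spinlock.
	 * `cmd` runs with the lock dropped before every sleep, so a
	 * waiter can kick the I/O that will eventually make `cond` true.
	 */
	#define wait_event_locked(cond, mtx, cv, cmd)			\
	do {								\
		while (!(cond)) {					\
			pthread_mutex_unlock(&(mtx));			\
			cmd;						\
			pthread_mutex_lock(&(mtx));			\
			if (!(cond))					\
				pthread_cond_wait(&(cv), &(mtx));	\
		}							\
	} while (0)

Callers with work to do pass it as cmd, as device_barrier() does with
unplug_slaves(conf->mddev); callers with nothing to run pass an empty
argument, as raid1's make_request() does above.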