From 6a435d69de04e96de8001edbd4a3da94eaec56b3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 12 Apr 2004 00:15:12 -0700 Subject: [PATCH] Add queue congestion callout From: Miquel van Smoorenburg The VM and VFS use the address_space_backing_dev_info to track the realtime status of the device which backs the mapping. The read_congested and write_congested fields are used to determine whether a read or write against that device may block. We use this infrastructure to a) allow pdflush to service many queues in parallel (by not getting stuck on any particular one) and b) to avoid undesirable and uncontrolled latencies in places such as page reclaim and c) To avoid blocking in readahead operations The current code only supports simple disk queues (and I have a patch here for NFS). Stacked queues (MD and DM) don't get this information right and problems were expected. Efficiency problems have now been noted and it's time to fix it. This patch lays down the infrastructure which permits the queue implementation to get control when someone at a higher level is querying the queue's congestion state. So DM (for example) can run around and examine all the queues which contribute to the higher-level queue. It also adds bdi_rw_congested() for code in xfs and ext2 that calls both bdi_read_congested() and bdi_write_congested() in a row, and it was "free" anyway. 
--- include/linux/backing-dev.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 94c93c9c5f66..e34916ddd1d7 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -20,10 +20,14 @@ enum bdi_state { BDI_unused, /* Available bits start here */ }; +typedef int (congested_fn)(void *, int); + struct backing_dev_info { unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ unsigned long state; /* Always use atomic bitops on this */ int memory_backed; /* Cannot clean pages with writepage */ + congested_fn *congested_fn; /* Function pointer if device is md/dm */ + void *congested_data; /* Pointer to aux data for congested func */ }; extern struct backing_dev_info default_backing_dev_info; @@ -32,14 +36,27 @@ int writeback_acquire(struct backing_dev_info *bdi); int writeback_in_progress(struct backing_dev_info *bdi); void writeback_release(struct backing_dev_info *bdi); +static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) +{ + if (bdi->congested_fn) + return bdi->congested_fn(bdi->congested_data, bdi_bits); + return (bdi->state & bdi_bits); +} + static inline int bdi_read_congested(struct backing_dev_info *bdi) { - return test_bit(BDI_read_congested, &bdi->state); + return bdi_congested(bdi, 1 << BDI_read_congested); } static inline int bdi_write_congested(struct backing_dev_info *bdi) { - return test_bit(BDI_write_congested, &bdi->state); + return bdi_congested(bdi, 1 << BDI_write_congested); +} + +static inline int bdi_rw_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, (1 << BDI_read_congested)| + (1 << BDI_write_congested)); } #endif /* _LINUX_BACKING_DEV_H */ -- cgit v1.2.3 From 6d27f67bf6ee2b9ad0c8814118264bc273d916a1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 12 Apr 2004 00:15:51 -0700 Subject: [PATCH] per-backing dev unplugging 
From: Jens Axboe , Chris Mason, me, others. The global unplug list causes horrid spinlock contention on many-disk many-CPU setups - throughput is worse than halved. The other problem with the global unplugging is of course that it will cause the unplugging of queues which are unrelated to the I/O upon which the caller is about to wait. So what we do to solve these problems is to remove the global unplug and set up the infrastructure under which the VFS can tell the block layer to unplug only those queues which are relevant to the page or buffer_head which is about to be waited upon. We do this via the very appropriate address_space->backing_dev_info structure. Most of the complexity is in devicemapper, MD and swapper_space, because for these backing devices, multiple queues may need to be unplugged to complete a page/buffer I/O. In each case we ensure that data structures are in place to permit us to identify all the lower-level queues which contribute to the higher-level backing_dev_info. Each contributing queue is told to unplug in response to a higher-level unplug. To simplify things in various places we also introduce the concept of a "synchronous BIO": it is tagged with BIO_RW_SYNC. The block layer will perform an immediate unplug when it sees one of these go past. 
--- drivers/block/ll_rw_blk.c | 96 +++++++++++++------------------------------- drivers/block/loop.c | 15 ++++++- drivers/block/rd.c | 1 + drivers/block/umem.c | 3 +- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-table.c | 16 ++++++++ drivers/md/dm.c | 23 +++++++++-- drivers/md/dm.h | 1 + drivers/md/md.c | 32 +++++++++++++-- drivers/md/raid1.c | 3 ++ drivers/md/raid5.c | 4 +- drivers/md/raid6main.c | 3 +- drivers/mtd/devices/blkmtd.c | 6 +-- fs/buffer.c | 12 ++++-- fs/direct-io.c | 4 +- fs/jfs/jfs_logmgr.c | 6 +-- fs/ntfs/compress.c | 3 +- fs/ufs/truncate.c | 3 +- fs/xfs/linux/xfs_buf.c | 24 ++++------- include/linux/backing-dev.h | 3 ++ include/linux/bio.h | 3 ++ include/linux/blkdev.h | 23 ++++++++--- include/linux/fs.h | 2 + include/linux/raid/md.h | 1 + include/linux/raid/md_k.h | 26 ------------ include/linux/swap.h | 3 ++ kernel/power/disk.c | 1 - kernel/power/pmdisk.c | 3 +- kernel/power/swsusp.c | 5 --- mm/filemap.c | 4 +- mm/mempool.c | 2 - mm/nommu.c | 5 +++ mm/readahead.c | 8 +++- mm/shmem.c | 1 + mm/swap_state.c | 1 + mm/swapfile.c | 65 +++++++++++++++++++++++++++++- 36 files changed, 254 insertions(+), 159 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index fc4b6c698fcf..209fdef4d986 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -42,12 +42,6 @@ static void blk_unplug_timeout(unsigned long data); */ static kmem_cache_t *request_cachep; -/* - * plug management - */ -static LIST_HEAD(blk_plug_list); -static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; - static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) @@ -251,8 +245,6 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) */ blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); - INIT_LIST_HEAD(&q->plug_list); - blk_queue_activity_fn(q, NULL, NULL); } @@ -1104,13 +1096,11 
@@ void blk_plug_device(request_queue_t *q) * don't plug a stopped queue, it must be paired with blk_start_queue() * which will restart the queueing */ - if (!blk_queue_plugged(q) - && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) { - spin_lock(&blk_plug_lock); - list_add_tail(&q->plug_list, &blk_plug_list); + if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) + return; + + if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); - spin_unlock(&blk_plug_lock); - } } EXPORT_SYMBOL(blk_plug_device); @@ -1122,15 +1112,12 @@ EXPORT_SYMBOL(blk_plug_device); int blk_remove_plug(request_queue_t *q) { WARN_ON(!irqs_disabled()); - if (blk_queue_plugged(q)) { - spin_lock(&blk_plug_lock); - list_del_init(&q->plug_list); - del_timer(&q->unplug_timer); - spin_unlock(&blk_plug_lock); - return 1; - } - return 0; + if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) + return 0; + + del_timer(&q->unplug_timer); + return 1; } EXPORT_SYMBOL(blk_remove_plug); @@ -1161,24 +1148,32 @@ static inline void __generic_unplug_device(request_queue_t *q) * Linux uses plugging to build bigger requests queues before letting * the device have at them. If a queue is plugged, the I/O scheduler * is still adding and merging requests on the queue. Once the queue - * gets unplugged (either by manually calling this function, or by - * calling blk_run_queues()), the request_fn defined for the - * queue is invoked and transfers started. + * gets unplugged, the request_fn defined for the queue is invoked and + * transfers started. 
**/ -void generic_unplug_device(void *data) +void generic_unplug_device(request_queue_t *q) { - request_queue_t *q = data; - spin_lock_irq(q->queue_lock); __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); } - EXPORT_SYMBOL(generic_unplug_device); +static void blk_backing_dev_unplug(struct backing_dev_info *bdi) +{ + request_queue_t *q = bdi->unplug_io_data; + + /* + * devices don't necessarily have an ->unplug_fn defined + */ + if (q->unplug_fn) + q->unplug_fn(q); +} + static void blk_unplug_work(void *data) { request_queue_t *q = data; + q->unplug_fn(q); } @@ -1255,42 +1250,6 @@ void blk_run_queue(struct request_queue *q) EXPORT_SYMBOL(blk_run_queue); -/** - * blk_run_queues - fire all plugged queues - * - * Description: - * Start I/O on all plugged queues known to the block layer. Queues that - * are currently stopped are ignored. This is equivalent to the older - * tq_disk task queue run. - **/ -#define blk_plug_entry(entry) list_entry((entry), request_queue_t, plug_list) -void blk_run_queues(void) -{ - LIST_HEAD(local_plug_list); - - spin_lock_irq(&blk_plug_lock); - - /* - * this will happen fairly often - */ - if (list_empty(&blk_plug_list)) - goto out; - - list_splice_init(&blk_plug_list, &local_plug_list); - - while (!list_empty(&local_plug_list)) { - request_queue_t *q = blk_plug_entry(local_plug_list.next); - - spin_unlock_irq(&blk_plug_lock); - q->unplug_fn(q); - spin_lock_irq(&blk_plug_lock); - } -out: - spin_unlock_irq(&blk_plug_lock); -} - -EXPORT_SYMBOL(blk_run_queues); - /** * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed * @q: the request queue to be released @@ -1390,6 +1349,10 @@ request_queue_t *blk_alloc_queue(int gfp_mask) memset(q, 0, sizeof(*q)); init_timer(&q->unplug_timer); atomic_set(&q->refcnt, 1); + + q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; + q->backing_dev_info.unplug_io_data = q; + return q; } @@ -2050,7 +2013,6 @@ long blk_congestion_wait(int rw, long timeout) 
DEFINE_WAIT(wait); wait_queue_head_t *wqh = &congestion_wqh[rw]; - blk_run_queues(); prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); finish_wait(wqh, &wait); @@ -2315,7 +2277,7 @@ out: if (blk_queue_plugged(q)) { int nr_queued = q->rq.count[READ] + q->rq.count[WRITE]; - if (nr_queued == q->unplug_thresh) + if (nr_queued == q->unplug_thresh || bio_sync(bio)) __generic_unplug_device(q); } spin_unlock_irq(q->queue_lock); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f29f72ee30d0..a43c545071cb 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -434,6 +434,17 @@ inactive: goto out; } +/* + * kick off io on the underlying address space + */ +static void loop_unplug(request_queue_t *q) +{ + struct loop_device *lo = q->queuedata; + + clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags); + blk_run_address_space(lo->lo_backing_file->f_mapping); +} + struct switch_request { struct file *file; struct completion wait; @@ -614,7 +625,6 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file, { struct file *file; struct inode *inode; - struct block_device *lo_device = NULL; struct address_space *mapping; unsigned lo_blocksize; int lo_flags = 0; @@ -671,7 +681,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file, set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); lo->lo_blocksize = lo_blocksize; - lo->lo_device = lo_device; + lo->lo_device = bdev; lo->lo_flags = lo_flags; lo->lo_backing_file = file; lo->transfer = NULL; @@ -688,6 +698,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file, */ blk_queue_make_request(lo->lo_queue, loop_make_request); lo->lo_queue->queuedata = lo; + lo->lo_queue->unplug_fn = loop_unplug; set_capacity(disks[lo->lo_number], size); bd_set_size(bdev, size << 9); diff --git a/drivers/block/rd.c b/drivers/block/rd.c index e626344c9b58..3dd9163a64e2 100644 --- a/drivers/block/rd.c +++ b/drivers/block/rd.c @@ -271,6 +271,7 @@ static int 
rd_ioctl(struct inode *inode, struct file *file, static struct backing_dev_info rd_backing_dev_info = { .ra_pages = 0, /* No readahead */ .memory_backed = 1, /* Does not contribute to dirty memory */ + .unplug_io_fn = default_unplug_io_fn, }; static int rd_open(struct inode *inode, struct file *filp) diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 31cd010f4d56..5a1e349b131d 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -368,9 +368,8 @@ static inline void reset_page(struct mm_page *page) page->biotail = & page->bio; } -static void mm_unplug_device(void *data) +static void mm_unplug_device(request_queue_t *q) { - request_queue_t *q = data; struct cardinfo *card = q->queuedata; unsigned long flags; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8e1798115e2f..a17b25380fce 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -668,7 +668,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, /* out of memory -> run queues */ if (remaining) - blk_run_queues(); + blk_congestion_wait(bio_data_dir(clone), HZ/100); } /* drop reference, clones could have returned before we reach this */ diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 4aa6c43ffd01..93dc0e6361c0 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -885,8 +885,24 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) return r; } +void dm_table_unplug_all(struct dm_table *t) +{ + struct list_head *d, *devices = dm_table_get_devices(t); + + for (d = devices->next; d != devices; d = d->next) { + struct dm_dev *dd = list_entry(d, struct dm_dev, list); + request_queue_t *q = bdev_get_queue(dd->bdev); + + if (q->unplug_fn) + q->unplug_fn(q); + } +} + EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); EXPORT_SYMBOL(dm_table_event); EXPORT_SYMBOL(dm_table_get_mode); +EXPORT_SYMBOL(dm_table_put); +EXPORT_SYMBOL(dm_table_get); +EXPORT_SYMBOL(dm_table_unplug_all); diff --git 
a/drivers/md/dm.c b/drivers/md/dm.c index 6dc34c8b4604..542f9cd0acc0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -575,6 +575,17 @@ static int dm_request(request_queue_t *q, struct bio *bio) return 0; } +static void dm_unplug_all(request_queue_t *q) +{ + struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_table(md); + + if (map) { + dm_table_unplug_all(map); + dm_table_put(map); + } +} + static int dm_any_congested(void *congested_data, int bdi_bits) { int r; @@ -672,6 +683,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent) md->queue->backing_dev_info.congested_fn = dm_any_congested; md->queue->backing_dev_info.congested_data = md; blk_queue_make_request(md->queue, dm_request); + md->queue->unplug_fn = dm_unplug_all; md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, mempool_free_slab, _io_cache); @@ -896,11 +908,17 @@ int dm_suspend(struct mapped_device *md) add_wait_queue(&md->wait, &wait); up_write(&md->lock); + /* unplug */ + map = dm_get_table(md); + if (map) { + dm_table_unplug_all(map); + dm_table_put(map); + } + /* * Then we wait for the already mapped ios to * complete. */ - blk_run_queues(); while (1) { set_current_state(TASK_INTERRUPTIBLE); @@ -945,10 +963,9 @@ int dm_resume(struct mapped_device *md) def = bio_list_get(&md->deferred); __flush_deferred_io(md, def); up_write(&md->lock); + dm_table_unplug_all(map); dm_table_put(map); - blk_run_queues(); - return 0; } diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 780185db38d0..34bf0e7cceb2 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -116,6 +116,7 @@ int dm_table_get_mode(struct dm_table *t); void dm_table_suspend_targets(struct dm_table *t); void dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); +void dm_table_unplug_all(struct dm_table *t); /*----------------------------------------------------------------- * A registry of target types. 
diff --git a/drivers/md/md.c b/drivers/md/md.c index aa6fef11aa4e..72d6a2da5827 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -160,6 +160,30 @@ static int md_fail_request (request_queue_t *q, struct bio *bio) return 0; } +void md_unplug_mddev(mddev_t *mddev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + /* + * this list iteration is done without any locking in md?! + */ + ITERATE_RDEV(mddev, rdev, tmp) { + request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + + if (r_queue->unplug_fn) + r_queue->unplug_fn(r_queue); + } +} +EXPORT_SYMBOL(md_unplug_mddev); + +static void md_unplug_all(request_queue_t *q) +{ + mddev_t *mddev = q->queuedata; + + md_unplug_mddev(mddev); +} + static inline mddev_t *mddev_get(mddev_t *mddev) { atomic_inc(&mddev->active); @@ -335,6 +359,8 @@ static int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct bio_vec vec; struct completion event; + rw |= (1 << BIO_RW_SYNC); + bio_init(&bio); bio.bi_io_vec = &vec; vec.bv_page = page; @@ -349,7 +375,6 @@ static int sync_page_io(struct block_device *bdev, sector_t sector, int size, bio.bi_private = &event; bio.bi_end_io = bi_complete; submit_bio(rw, &bio); - blk_run_queues(); wait_for_completion(&event); return test_bit(BIO_UPTODATE, &bio.bi_flags); @@ -1644,6 +1669,7 @@ static int do_md_run(mddev_t * mddev) */ mddev->queue->queuedata = mddev; mddev->queue->make_request_fn = mddev->pers->make_request; + mddev->queue->unplug_fn = md_unplug_all; mddev->changed = 1; return 0; @@ -2718,7 +2744,7 @@ int md_thread(void * arg) run = thread->run; if (run) { run(thread->mddev); - blk_run_queues(); + md_unplug_mddev(thread->mddev); } if (signal_pending(current)) flush_signals(current); @@ -3287,7 +3313,7 @@ static void md_do_sync(mddev_t *mddev) test_bit(MD_RECOVERY_ERR, &mddev->recovery)) break; - blk_run_queues(); + md_unplug_mddev(mddev); repeat: if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 
f308d5fe946f..6616cd46c50f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -451,6 +451,7 @@ rb_out: static void device_barrier(conf_t *conf, sector_t sect) { + md_unplug_mddev(conf->mddev); spin_lock_irq(&conf->resync_lock); wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), conf->resync_lock); @@ -478,6 +479,7 @@ static int make_request(request_queue_t *q, struct bio * bio) * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. */ + md_unplug_mddev(conf->mddev); spin_lock_irq(&conf->resync_lock); wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock); conf->nr_pending++; @@ -644,6 +646,7 @@ static void print_conf(conf_t *conf) static void close_sync(conf_t *conf) { + md_unplug_mddev(conf->mddev); spin_lock_irq(&conf->resync_lock); wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock); spin_unlock_irq(&conf->resync_lock); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b5cc6c4ba6ba..5c9d3fd66913 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -249,6 +249,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector break; if (!sh) { conf->inactive_blocked = 1; + md_unplug_mddev(conf->mddev); wait_event_lock_irq(conf->wait_for_stripe, !list_empty(&conf->inactive_list) && (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4) @@ -1292,9 +1293,8 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf) } } } -static void raid5_unplug_device(void *data) +static void raid5_unplug_device(request_queue_t *q) { - request_queue_t *q = data; mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); unsigned long flags; diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 747085a6dac0..131f4a1f34eb 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -1454,9 +1454,8 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf) } } } -static void 
raid6_unplug_device(void *data) +static void raid6_unplug_device(request_queue_t *q) { - request_queue_t *q = data; mddev_t *mddev = q->queuedata; raid6_conf_t *conf = mddev_to_conf(mddev); unsigned long flags; diff --git a/drivers/mtd/devices/blkmtd.c b/drivers/mtd/devices/blkmtd.c index b4b4178943a1..4bd5d3219458 100644 --- a/drivers/mtd/devices/blkmtd.c +++ b/drivers/mtd/devices/blkmtd.c @@ -147,8 +147,7 @@ static int blkmtd_readpage(struct blkmtd_dev *dev, struct page *page) bio->bi_private = &event; bio->bi_end_io = bi_read_complete; if(bio_add_page(bio, page, PAGE_SIZE, 0) == PAGE_SIZE) { - submit_bio(READ, bio); - blk_run_queues(); + submit_bio(READ_SYNC, bio); wait_for_completion(&event); err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO; bio_put(bio); @@ -179,8 +178,7 @@ static int blkmtd_write_out(struct bio *bio) init_completion(&event); bio->bi_private = &event; bio->bi_end_io = bi_write_complete; - submit_bio(WRITE, bio); - blk_run_queues(); + submit_bio(WRITE_SYNC, bio); wait_for_completion(&event); DEBUG(3, "submit_bio completed, bi_vcnt = %d\n", bio->bi_vcnt); err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 
0 : -EIO; diff --git a/fs/buffer.c b/fs/buffer.c index be9cc963a178..8ab66d0b7548 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -132,7 +132,11 @@ void __wait_on_buffer(struct buffer_head * bh) do { prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); if (buffer_locked(bh)) { - blk_run_queues(); + struct block_device *bd; + smp_mb(); + bd = bh->b_bdev; + if (bd) + blk_run_address_space(bd->bd_inode->i_mapping); io_schedule(); } } while (buffer_locked(bh)); @@ -492,7 +496,6 @@ static void free_more_memory(void) pg_data_t *pgdat; wakeup_bdflush(1024); - blk_run_queues(); yield(); for_each_pgdat(pgdat) { @@ -2927,7 +2930,10 @@ EXPORT_SYMBOL(try_to_free_buffers); int block_sync_page(struct page *page) { - blk_run_queues(); + struct address_space *mapping; + smp_mb(); + mapping = page->mapping; + blk_run_address_space(mapping); return 0; } diff --git a/fs/direct-io.c b/fs/direct-io.c index d022a233820f..79534d258f37 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -364,7 +364,7 @@ static struct bio *dio_await_one(struct dio *dio) if (dio->bio_list == NULL) { dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); - blk_run_queues(); + blk_run_address_space(dio->inode->i_mapping); io_schedule(); spin_lock_irqsave(&dio->bio_lock, flags); dio->waiter = NULL; @@ -1035,7 +1035,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, if (ret == 0) ret = dio->result; finished_one_bio(dio); /* This can free the dio */ - blk_run_queues(); + blk_run_address_space(inode->i_mapping); if (should_wait) { unsigned long flags; /* diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index b72fb4a40adc..b90aa961dd5a 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1975,8 +1975,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - submit_bio(READ, bio); - blk_run_queues(); + submit_bio(READ_SYNC, bio); wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); @@ -2120,9 +2119,8 @@ 
static void lbmStartIO(struct lbuf * bp) /* check if journaling to disk has been disabled */ if (!log->no_integrity) { - submit_bio(WRITE, bio); + submit_bio(WRITE_SYNC, bio); INCREMENT(lmStat.submitted); - blk_run_queues(); } else { bio->bi_size = 0; diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index a8618f107ead..68231e909496 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -23,6 +23,7 @@ #include #include +#include #include "ntfs.h" @@ -668,7 +669,7 @@ lock_retry_remap: "uptodate! Unplugging the disk queue " "and rescheduling."); get_bh(tbh); - blk_run_queues(); + blk_run_address_space(mapping); schedule(); put_bh(tbh); if (unlikely(!buffer_uptodate(tbh))) diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 04e50f696202..b22169e7ba76 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include "swab.h" @@ -456,7 +457,7 @@ void ufs_truncate (struct inode * inode) break; if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) ufs_sync_inode (inode); - blk_run_queues(); + blk_run_address_space(inode->i_mapping); yield(); } offset = inode->i_size & uspi->s_fshift; diff --git a/fs/xfs/linux/xfs_buf.c b/fs/xfs/linux/xfs_buf.c index c5f06aad5234..2d4cf586cf85 100644 --- a/fs/xfs/linux/xfs_buf.c +++ b/fs/xfs/linux/xfs_buf.c @@ -1013,7 +1013,7 @@ pagebuf_lock( { PB_TRACE(pb, "lock", 0); if (atomic_read(&pb->pb_io_remaining)) - blk_run_queues(); + blk_run_address_space(pb->pb_target->pbr_mapping); down(&pb->pb_sema); PB_SET_OWNER(pb); PB_TRACE(pb, "locked", 0); @@ -1109,7 +1109,7 @@ _pagebuf_wait_unpin( if (atomic_read(&pb->pb_pin_count) == 0) break; if (atomic_read(&pb->pb_io_remaining)) - blk_run_queues(); + blk_run_address_space(pb->pb_target->pbr_mapping); schedule(); } remove_wait_queue(&pb->pb_waiters, &wait); @@ -1407,7 +1407,7 @@ submit_io: if (pb->pb_flags & PBF_RUN_QUEUES) { pb->pb_flags &= ~PBF_RUN_QUEUES; if (atomic_read(&pb->pb_io_remaining) > 1) - blk_run_queues(); + 
blk_run_address_space(pb->pb_target->pbr_mapping); } } @@ -1471,7 +1471,7 @@ pagebuf_iowait( { PB_TRACE(pb, "iowait", 0); if (atomic_read(&pb->pb_io_remaining)) - blk_run_queues(); + blk_run_address_space(pb->pb_target->pbr_mapping); down(&pb->pb_iodonesema); PB_TRACE(pb, "iowaited", (long)pb->pb_error); return pb->pb_error; @@ -1617,7 +1617,6 @@ STATIC int pagebuf_daemon( void *data) { - int count; page_buf_t *pb; struct list_head *curr, *next, tmp; @@ -1640,7 +1639,6 @@ pagebuf_daemon( spin_lock(&pbd_delwrite_lock); - count = 0; list_for_each_safe(curr, next, &pbd_delwrite_queue) { pb = list_entry(curr, page_buf_t, pb_list); @@ -1657,7 +1655,6 @@ pagebuf_daemon( pb->pb_flags &= ~PBF_DELWRI; pb->pb_flags |= PBF_WRITE; list_move(&pb->pb_list, &tmp); - count++; } } @@ -1667,12 +1664,11 @@ pagebuf_daemon( list_del_init(&pb->pb_list); pagebuf_iostrategy(pb); + blk_run_address_space(pb->pb_target->pbr_mapping); } if (as_list_len > 0) purge_addresses(); - if (count) - blk_run_queues(); force_flush = 0; } while (pagebuf_daemon_active); @@ -1689,7 +1685,6 @@ pagebuf_delwri_flush( page_buf_t *pb; struct list_head *curr, *next, tmp; int pincount = 0; - int flush_cnt = 0; pagebuf_runall_queues(pagebuf_dataio_workqueue); pagebuf_runall_queues(pagebuf_logio_workqueue); @@ -1733,14 +1728,8 @@ pagebuf_delwri_flush( pagebuf_lock(pb); pagebuf_iostrategy(pb); - if (++flush_cnt > 32) { - blk_run_queues(); - flush_cnt = 0; - } } - blk_run_queues(); - while (!list_empty(&tmp)) { pb = list_entry(tmp.next, page_buf_t, pb_list); @@ -1751,6 +1740,9 @@ pagebuf_delwri_flush( pagebuf_rele(pb); } + if (flags & PBDF_WAIT) + blk_run_address_space(target->pbr_mapping); + if (pinptr) *pinptr = pincount; } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e34916ddd1d7..00371734995c 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -28,9 +28,12 @@ struct backing_dev_info { int memory_backed; /* Cannot clean pages with writepage */ congested_fn 
*congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ + void (*unplug_io_fn)(struct backing_dev_info *); + void *unplug_io_data; }; extern struct backing_dev_info default_backing_dev_info; +void default_unplug_io_fn(struct backing_dev_info *bdi); int writeback_acquire(struct backing_dev_info *bdi); int writeback_in_progress(struct backing_dev_info *bdi); diff --git a/include/linux/bio.h b/include/linux/bio.h index c421c46bfbb2..c4dd287dd1c8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -119,11 +119,13 @@ struct bio { * bit 1 -- rw-ahead when set * bit 2 -- barrier * bit 3 -- fail fast, don't want low level driver retries + * bit 4 -- synchronous I/O hint: the block layer will unplug immediately */ #define BIO_RW 0 #define BIO_RW_AHEAD 1 #define BIO_RW_BARRIER 2 #define BIO_RW_FAILFAST 3 +#define BIO_RW_SYNC 4 /* * various member access, note that bio_data should of course not be used @@ -138,6 +140,7 @@ struct bio { #define bio_cur_sectors(bio) (bio_iovec(bio)->bv_len >> 9) #define bio_data(bio) (page_address(bio_page((bio))) + bio_offset((bio))) #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) +#define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) /* * will die diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1a521e16b398..572f96e6940a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -243,7 +243,7 @@ typedef int (merge_requests_fn) (request_queue_t *, struct request *, typedef void (request_fn_proc) (request_queue_t *q); typedef int (make_request_fn) (request_queue_t *q, struct bio *bio); typedef int (prep_rq_fn) (request_queue_t *, struct request *); -typedef void (unplug_fn) (void *q); +typedef void (unplug_fn) (request_queue_t *); struct bio_vec; typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *); @@ -315,8 +315,6 @@ struct request_queue unsigned long bounce_pfn; int bounce_gfp; - struct 
list_head plug_list; - /* * various queue flags, see QUEUE_* below */ @@ -370,8 +368,9 @@ struct request_queue #define QUEUE_FLAG_WRITEFULL 4 /* read queue has been filled */ #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ +#define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ -#define blk_queue_plugged(q) !list_empty(&(q)->plug_list) +#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) @@ -515,7 +514,7 @@ extern int scsi_cmd_ioctl(struct gendisk *, unsigned int, unsigned long); extern void blk_start_queue(request_queue_t *q); extern void blk_stop_queue(request_queue_t *q); extern void __blk_stop_queue(request_queue_t *q); -extern void blk_run_queue(request_queue_t *q); +extern void blk_run_queue(request_queue_t *); extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *); extern struct request *blk_rq_map_user(request_queue_t *, int, void __user *, unsigned int); extern int blk_rq_unmap_user(struct request *, void __user *, unsigned int); @@ -526,6 +525,18 @@ static inline request_queue_t *bdev_get_queue(struct block_device *bdev) return bdev->bd_disk->queue; } +static inline void blk_run_backing_dev(struct backing_dev_info *bdi) +{ + if (bdi && bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi); +} + +static inline void blk_run_address_space(struct address_space *mapping) +{ + if (mapping) + blk_run_backing_dev(mapping->backing_dev_info); +} + /* * end_request() and friends. Must be called with the request queue spinlock * acquired. All functions called within end_request() _must_be_ atomic. 
@@ -572,7 +583,7 @@ extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bd extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); -extern void generic_unplug_device(void *); +extern void generic_unplug_device(request_queue_t *); extern long nr_blockdev_pages(void); int blk_get_queue(request_queue_t *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 39c893f8aa28..c7f0052b4abd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -83,6 +83,8 @@ extern int leases_enable, dir_notify_enable, lease_break_time; #define WRITE 1 #define READA 2 /* read-ahead - don't block if no resources */ #define SPECIAL 4 /* For non-blockdevice requests in request queue */ +#define READ_SYNC (READ | (1 << BIO_RW_SYNC)) +#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC)) #define SEL_IN 1 #define SEL_OUT 2 diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 240dc450dcd3..9c06e776cfc2 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -76,6 +76,7 @@ extern void md_handle_safemode(mddev_t *mddev); extern void md_done_sync(mddev_t *mddev, int blocks, int ok); extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors); extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev); +extern void md_unplug_mddev(mddev_t *mddev); extern void md_print_devices (void); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index bea64b0fb6c1..42c973c53d04 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -326,7 +326,6 @@ do { \ if (condition) \ break; \ spin_unlock_irq(&lock); \ - blk_run_queues(); \ schedule(); \ spin_lock_irq(&lock); \ } \ @@ -341,30 +340,5 @@ do { \ __wait_event_lock_irq(wq, condition, lock); \ } while (0) - -#define __wait_disk_event(wq, condition) \ -do { \ - wait_queue_t __wait; \ - init_waitqueue_entry(&__wait, current); \ - \ - add_wait_queue(&wq, &__wait); \ - for (;;) { \ - 
set_current_state(TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - blk_run_queues(); \ - schedule(); \ - } \ - current->state = TASK_RUNNING; \ - remove_wait_queue(&wq, &__wait); \ -} while (0) - -#define wait_disk_event(wq, condition) \ -do { \ - if (condition) \ - break; \ - __wait_disk_event(wq, condition); \ -} while (0) - #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index b000c56803b8..d189090cf63a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -197,6 +197,8 @@ extern int shmem_unuse(swp_entry_t entry, struct page *page); #define SWAP_AGAIN 1 #define SWAP_FAIL 2 +extern void swap_unplug_io_fn(struct backing_dev_info *); + #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ extern int swap_readpage(struct file *, struct page *); @@ -232,6 +234,7 @@ extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); extern struct swap_info_struct *get_swap_info_struct(unsigned); extern int can_share_swap_page(struct page *); extern int remove_exclusive_swap_page(struct page *); +struct backing_dev_info; extern struct swap_list_t swap_list; extern spinlock_t swaplock; diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 7e035a9b42d1..6abcf99b7ada 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -84,7 +84,6 @@ static void free_some_memory(void) while (shrink_all_memory(10000)) printk("."); printk("|\n"); - blk_run_queues(); } diff --git a/kernel/power/pmdisk.c b/kernel/power/pmdisk.c index d54147214bea..22855abbdd6e 100644 --- a/kernel/power/pmdisk.c +++ b/kernel/power/pmdisk.c @@ -859,7 +859,6 @@ static int end_io(struct bio * bio, unsigned int num, int err) static void wait_io(void) { - blk_run_queues(); while(atomic_read(&io_done)) io_schedule(); } @@ -898,7 +897,7 @@ static int submit(int rw, pgoff_t page_off, void * page) if (rw == WRITE) bio_set_pages_dirty(bio); start_io(); - submit_bio(rw,bio); + submit_bio(rw | (1 << BIO_RW_SYNC), bio); wait_io(); Done: bio_put(bio); diff --git 
a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 20134ab8e0b2..ae748a467af5 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -707,11 +707,6 @@ int software_suspend(void) free_some_memory(); - /* No need to invalidate any vfsmnt list -- - * they will be valid after resume, anyway. - */ - blk_run_queues(); - /* Save state of all device drivers, and stop them. */ if ((res = device_suspend(4))==0) /* If stopping device drivers worked, we proceed basically into diff --git a/mm/filemap.c b/mm/filemap.c index ec1952db8baf..dc2f0992d879 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -119,8 +119,10 @@ void remove_from_page_cache(struct page *page) static inline int sync_page(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping; + smp_mb(); + mapping = page->mapping; if (mapping && mapping->a_ops && mapping->a_ops->sync_page) return mapping->a_ops->sync_page(page); return 0; diff --git a/mm/mempool.c b/mm/mempool.c index 756e60ee18d6..da6ad1e12c97 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -234,8 +234,6 @@ repeat_alloc: if (!(gfp_mask & __GFP_WAIT)) return NULL; - blk_run_queues(); - prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); mb(); if (!pool->curr_nr) diff --git a/mm/nommu.c b/mm/nommu.c index c940756b49e5..1432dbab85eb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -571,3 +572,7 @@ unsigned long get_unmapped_area(struct file *file, unsigned long addr, void pte_chain_init(void) { } + +void swap_unplug_io_fn(struct backing_dev_info *unused_bdi) +{ /* no-op stub: the argument is ignored */ +} diff --git a/mm/readahead.c b/mm/readahead.c index 08a2d9f1051d..71bf2462d097 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -15,11 +15,16 @@ #include #include +void default_unplug_io_fn(struct backing_dev_info *bdi) +{ +} +EXPORT_SYMBOL(default_unplug_io_fn); + struct backing_dev_info default_backing_dev_info = { .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, 
.state = 0, + .unplug_io_fn = default_unplug_io_fn, }; - EXPORT_SYMBOL_GPL(default_backing_dev_info); /* @@ -32,7 +37,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) ra->ra_pages = mapping->backing_dev_info->ra_pages; ra->average = ra->ra_pages / 2; } - EXPORT_SYMBOL(file_ra_state_init); /* diff --git a/mm/shmem.c b/mm/shmem.c index 4116ea26daf1..345e04cb0f6c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -169,6 +169,7 @@ static struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info = { .ra_pages = 0, /* No readahead */ .memory_backed = 1, /* Does not contribute to dirty memory */ + .unplug_io_fn = default_unplug_io_fn, }; LIST_HEAD(shmem_inodes); diff --git a/mm/swap_state.c b/mm/swap_state.c index 22946f0d9ecf..97f80d20807c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -19,6 +19,7 @@ static struct backing_dev_info swap_backing_dev_info = { .ra_pages = 0, /* No readahead */ .memory_backed = 1, /* Does not contribute to dirty memory */ + .unplug_io_fn = swap_unplug_io_fn, }; extern struct address_space_operations swap_aops; diff --git a/mm/swapfile.c b/mm/swapfile.c index e5cebb1800b9..f885e6d17a49 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -44,8 +45,64 @@ struct swap_list_t swap_list = {-1, -1}; struct swap_info_struct swap_info[MAX_SWAPFILES]; +/* + * Array of backing blockdevs, for swap_unplug_io_fn(). We need this because the + * backing device's unplug_io_fn can sleep and we cannot hold swap_list_lock while calling + * the unplug_io_fn. And swap_list_lock cannot be turned into a semaphore. 
+ */ +static DECLARE_MUTEX(swap_bdevs_sem); +static struct block_device *swap_bdevs[MAX_SWAPFILES]; + #define SWAPFILE_CLUSTER 256 +/* + * Record bdev in the first free swap_bdevs[] slot. Caller holds swap_bdevs_sem + */ +static void install_swap_bdev(struct block_device *bdev) +{ + int i; + + for (i = 0; i < MAX_SWAPFILES; i++) { + if (swap_bdevs[i] == NULL) { + swap_bdevs[i] = bdev; + return; + } + } + BUG(); +} + +static void remove_swap_bdev(struct block_device *bdev) +{ + int i; + + for (i = 0; i < MAX_SWAPFILES; i++) { + if (swap_bdevs[i] == bdev) { + memmove(&swap_bdevs[i], &swap_bdevs[i + 1], + (MAX_SWAPFILES - i - 1) * sizeof(*swap_bdevs)); /* src and dst overlap, so memmove, not memcpy */ + swap_bdevs[MAX_SWAPFILES - 1] = NULL; /* last slot is free after the shift */ + return; + } + } + BUG(); +} + +void swap_unplug_io_fn(struct backing_dev_info *unused_bdi) +{ + int i; + + down(&swap_bdevs_sem); + for (i = 0; i < MAX_SWAPFILES; i++) { + struct block_device *bdev = swap_bdevs[i]; + struct backing_dev_info *bdi; + + if (bdev == NULL) + break; + bdi = bdev->bd_inode->i_mapping->backing_dev_info; + (*bdi->unplug_io_fn)(bdi); + } + up(&swap_bdevs_sem); +} + static inline int scan_swap_map(struct swap_info_struct *si) { unsigned long offset; @@ -1088,6 +1145,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) swap_list_unlock(); goto out_dput; } + down(&swap_bdevs_sem); swap_list_lock(); swap_device_lock(p); swap_file = p->swap_file; @@ -1099,6 +1157,8 @@ asmlinkage long sys_swapoff(const char __user * specialfile) destroy_swap_extents(p); swap_device_unlock(p); swap_list_unlock(); + remove_swap_bdev(p->bdev); + up(&swap_bdevs_sem); vfree(swap_map); if (S_ISBLK(mapping->host->i_mode)) { struct block_device *bdev = I_BDEV(mapping->host); @@ -1440,6 +1500,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) if (error) goto bad_swap; + down(&swap_bdevs_sem); swap_list_lock(); swap_device_lock(p); p->flags = SWP_ACTIVE; @@ -1465,6 +1526,8 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } swap_device_unlock(p); swap_list_unlock(); 
+ install_swap_bdev(p->bdev); + up(&swap_bdevs_sem); error = 0; goto out; bad_swap: @@ -1484,7 +1547,7 @@ bad_swap_2: destroy_swap_extents(p); if (swap_map) vfree(swap_map); - if (swap_file && !IS_ERR(swap_file)) + if (swap_file) filp_close(swap_file, NULL); out: if (page && !IS_ERR(page)) { -- cgit v1.2.3