From e692cb668fdd5a712c6ed2a2d6f2a36ee83997b4 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 1 Dec 2010 19:41:49 +0100 Subject: block: Deprecate QUEUE_FLAG_CLUSTER and use queue_limits instead When stacking devices, a request_queue is not always available. This forced us to have a no_cluster flag in the queue_limits that could be used as a carrier until the request_queue had been set up for a metadevice. There were several problems with that approach. First of all it was up to the stacking device to remember to set queue flag after stacking had completed. Also, the queue flag and the queue limits had to be kept in sync at all times. We got that wrong, which could lead to us issuing commands that went beyond the max scatterlist limit set by the driver. The proper fix is to avoid having two flags for tracking the same thing. We deprecate QUEUE_FLAG_CLUSTER and use the queue limit directly in the block layer merging functions. The queue_limit 'no_cluster' is turned into 'cluster' to avoid double negatives and to ease stacking. Clustering defaults to being enabled as before. The queue flag logic is removed from the stacking function, and explicitly setting the cluster flag is no longer necessary in DM and MD. Reported-by: Ed Lin Signed-off-by: Martin K. Petersen Acked-by: Mike Snitzer Cc: stable@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aae86fd10c4f..95aeeeb49e8b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -250,7 +250,7 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; - unsigned char no_cluster; + unsigned char cluster; signed char discard_zeroes_data; }; @@ -380,7 +380,6 @@ struct request_queue #endif }; -#define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ #define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ @@ -403,7 +402,6 @@ struct request_queue #define QUEUE_FLAG_SECDISCARD 19 /* supports SECDISCARD */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_CLUSTER) | \ (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_ADD_RANDOM)) @@ -510,6 +508,11 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define rq_data_dir(rq) ((rq)->cmd_flags & 1) +static inline unsigned int blk_queue_cluster(struct request_queue *q) +{ + return q->limits.cluster; +} + /* * We regard a request as sync, if either a read or a sync write */ -- cgit v1.2.3 From 72d4cd9f38b5ed96b75df4c622be25e1c2648dd3 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 17 Dec 2010 08:34:20 +0100 Subject: block: max hardware sectors limit wrapper Implement blk_limits_max_hw_sectors() and make blk_queue_max_hw_sectors() a wrapper around it. DM needs this to avoid setting queue_limits' max_hw_sectors and max_sectors directly. dm_set_device_limits() now leverages blk_limits_max_hw_sectors() logic to establish the appropriate max_hw_sectors minimum (PAGE_SIZE). Fixes issue where DM was incorrectly setting max_sectors rather than max_hw_sectors (which caused dm_merge_bvec()'s max_hw_sectors check to be ineffective). Signed-off-by: Mike Snitzer Cc: stable@kernel.org Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 26 ++++++++++++++++++++------ drivers/md/dm-table.c | 5 ++--- include/linux/blkdev.h | 1 + 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-settings.c b/block/blk-settings.c index e55f5fc4ca22..36c8c1f2af18 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -229,8 +229,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_queue_max_hw_sectors - set max sectors for a request for this queue - * @q: the request queue for the device + * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request + * @limits: the queue limits * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: @@ -244,7 +244,7 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); * per-device basis in /sys/block//queue/max_sectors_kb. * The soft limit can not exceed max_hw_sectors. **/ -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) +void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) { if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); @@ -252,9 +252,23 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto __func__, max_hw_sectors); } - q->limits.max_hw_sectors = max_hw_sectors; - q->limits.max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); + limits->max_hw_sectors = max_hw_sectors; + limits->max_sectors = min_t(unsigned int, max_hw_sectors, + BLK_DEF_MAX_SECTORS); +} +EXPORT_SYMBOL(blk_limits_max_hw_sectors); + +/** + * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * @q: the request queue for the device + * @max_hw_sectors: max hardware sectors in the usual 512b unit + * + * Description: + * See description for blk_limits_max_hw_sectors(). + **/ +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) +{ + blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e2da1912a2cb..4d705cea0f8c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -517,9 +517,8 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, */ if (q->merge_bvec_fn && !ti->type->merge) - limits->max_sectors = - min_not_zero(limits->max_sectors, - (unsigned int) (PAGE_SIZE >> 9)); + blk_limits_max_hw_sectors(limits, + (unsigned int) (PAGE_SIZE >> 9)); return 0; } EXPORT_SYMBOL_GPL(dm_set_device_limits); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 95aeeeb49e8b..36ab42c9bb99 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -808,6 +808,7 @@ extern struct request_queue *blk_init_allocated_queue(struct request_queue *, extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); +extern void blk_limits_max_hw_sectors(struct queue_limits *, unsigned int); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); -- cgit v1.2.3 From 09e099d4bafea3b15be003d548bdf94b4b6e0e17 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 5 Jan 2011 16:57:38 +0100 Subject: block: fix accounting bug on cross partition merges /proc/diskstats would display a strange output as follows. $ cat /proc/diskstats |grep sda 8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089 8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691 ~~~~~~~~~~ 8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390 8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92 8 4 sda4 4 0 8 0 0 0 0 0 0 0 0 8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137 Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE. The detailed root cause is as follows. Assuming that there are two partition, sda1 and sda2. 1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight is 0 and sda2's one is 1. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 2. A bio belongs to sda1 is issued and is merged into the request mentioned on step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed from sda2 region to sda1 region. However the two partition's hd_struct->in_flight are not changed. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 3. The request is finished and blk_account_io_done() is called. In this case, sda2's hd_struct->in_flight, not a sda1's one, is decremented. | hd_struct->in_flight --------------------------- sda1 | -1 sda2 | 1 --------------------------- The patch fixes the problem by caching the partition lookup inside the request structure, hence making sure that the increment and decrement will always happen on the same partition struct. This also speeds up IO with accounting enabled, since it cuts down on the number of lookups we have to do. Also add a refcount to struct hd_struct to keep the partition in memory as long as users exist. We use kref_test_and_get() to ensure we don't add a reference to a partition which is going away. Signed-off-by: Jerome Marchand Signed-off-by: Yasuaki Ishimatsu Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 26 +++++++++++++++++++++----- block/blk-merge.c | 3 ++- block/genhd.c | 1 + fs/partitions/check.c | 10 +++++++++- include/linux/blkdev.h | 1 + include/linux/genhd.h | 2 ++ 6 files changed, 36 insertions(+), 7 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-core.c b/block/blk-core.c index 3689319a5974..500c080a6a6b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) + if (!new_io) { + part = rq->part; part_stat_inc(cpu, part, merges[rw]); - else { + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + if (!kref_test_and_get(&part->ref)) { + /* + * The partition is already being removed, + * the request will be accounted on the disk only + * + * We take a reference on disk->part0 although that + * partition will never be deleted, so we can treat + * it as any other partition. + */ + part = &rq->rq_disk->part0; + kref_get(&part->ref); + } part_round_stats(cpu, part); part_inc_in_flight(part, rw); + rq->part = part; } part_stat_unlock(); @@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); + rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -1776,7 +1791,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); part_dec_in_flight(part, rw); + kref_put(&part->ref, __delete_partition); part_stat_unlock(); } } diff --git a/block/blk-merge.c b/block/blk-merge.c index 77b7c26df6b5..b06b83b89d89 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); + kref_put(&part->ref, __delete_partition); part_stat_unlock(); } } diff --git a/block/genhd.c b/block/genhd.c index 16ccc0d2d5d3..85c150598830 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) return NULL; } disk->part_tbl->part[0] = &disk->part0; + kref_init(&disk->part0.ref); disk->minors = minors; rand_initialize_disk(disk); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index bdf8d3cc95a4..48209f58522b 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -381,6 +381,13 @@ static void delete_partition_rcu_cb(struct rcu_head *head) put_device(part_to_dev(part)); } +void __delete_partition(struct kref *ref) +{ + struct hd_struct *part = container_of(ref, struct hd_struct, ref); + + call_rcu(&part->rcu_head, delete_partition_rcu_cb); +} + void delete_partition(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = disk->part_tbl; @@ -399,7 +406,7 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); - call_rcu(&part->rcu_head, delete_partition_rcu_cb); + kref_put(&part->ref, __delete_partition); } static ssize_t whole_disk_show(struct device *dev, @@ -498,6 +505,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); + kref_init(&p->ref); return p; out_free_info: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aae86fd10c4f..482a7fd48831 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -115,6 +115,7 @@ struct request { void *elevator_private3; struct gendisk *rq_disk; + struct hd_struct *part; unsigned long start_time; #ifdef CONFIG_BLK_CGROUP unsigned long long start_time_ns; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7a7b9c1644e4..2ba2792a3dd4 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -116,6 +116,7 @@ struct hd_struct { struct disk_stats dkstats; #endif struct rcu_head rcu_head; + struct kref ref; }; #define GENHD_FL_REMOVABLE 1 @@ -583,6 +584,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk, sector_t len, int flags, struct partition_meta_info *info); +extern void __delete_partition(struct kref *ref); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); -- cgit v1.2.3