From 46031f9a38a9773021f1872abc713d62467ac22e Mon Sep 17 00:00:00 2001 From: "Raz Ben-Jehuda(caro)" Date: Sun, 10 Dec 2006 02:20:47 -0800 Subject: [PATCH] md: allow reads that have bypassed the cache to be retried on failure If a bypass-the-cache read fails, we simply try again through the cache. If it fails again it will trigger normal recovery precedures. update 1: From: NeilBrown 1/ chunk_aligned_read and retry_aligned_read assume that data_disks == raid_disks - 1 which is not true for raid6. So when an aligned read request bypasses the cache, we can get the wrong data. 2/ The cloned bio is being used-after-free in raid5_align_endio (to test BIO_UPTODATE). 3/ We forgot to add rdev->data_offset when submitting a bio for aligned-read 4/ clone_bio calls blk_recount_segments and then we change bi_bdev, so we need to invalidate the segment counts. 5/ We don't de-reference the rdev when the read completes. This means we need to record the rdev to so it is still available in the end_io routine. Fortunately bi_next in the original bio is unused at this point so we can stuff it in there. 6/ We leak a cloned bio if the target rdev is not usable. From: NeilBrown update 2: 1/ When aligned requests fail (read error) they need to be retried via the normal method (stripe cache). As we cannot be sure that we can process a single read in one go (we may not be able to allocate all the stripes needed) we store a bio-being-retried and a list of bioes-that-still-need-to-be-retried. When find a bio that needs to be retried, we should add it to the list, not to single-bio... 2/ We were never incrementing 'scnt' when resubmitting failed aligned requests. [akpm@osdl.org: build fix] Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 03636d7918fe..d8286db60b96 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -227,7 +227,10 @@ struct raid5_private_data { struct list_head handle_list; /* stripes needing handling */ struct list_head delayed_list; /* stripes that have plugged requests */ struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ + struct bio *retry_read_aligned; /* currently retrying aligned bios */ + struct bio *retry_read_aligned_list; /* aligned bios retry list */ atomic_t preread_active_stripes; /* stripes with scheduled io */ + atomic_t active_aligned_reads; atomic_t reshape_stripes; /* stripes with pending writes for reshape */ /* unfortunately we need two cache names as we temporarily have -- cgit v1.2.3 From 2a2275d630b982e5f90206f9bc497f6695a3ec5d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 26 Jan 2007 00:57:11 -0800 Subject: [PATCH] md: fix potential memalloc deadlock in md If a GFP_KERNEL allocation is attempted in md while the mddev_lock is held, it is possible for a deadlock to eventuate. This happens if the array was marked 'clean', and the memalloc triggers a write-out to the md device. For the writeout to succeed, the array must be marked 'dirty', and that requires getting the mddev_lock. So, before attempting a GFP_KERNEL allocation while holding the lock, make sure the array is marked 'dirty' (unless it is currently read-only). Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 29 +++++++++++++++++++++++++++++ drivers/md/raid1.c | 2 ++ drivers/md/raid5.c | 3 +++ include/linux/raid/md.h | 2 +- 4 files changed, 35 insertions(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/drivers/md/md.c b/drivers/md/md.c index ec3d8e8a0bd3..e8807ea5377d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3564,6 +3564,8 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) char *ptr, *buf = NULL; int err = -ENOMEM; + md_allow_write(mddev); + file = kmalloc(sizeof(*file), GFP_KERNEL); if (!file) goto out; @@ -5032,6 +5034,33 @@ void md_write_end(mddev_t *mddev) } } +/* md_allow_write(mddev) + * Calling this ensures that the array is marked 'active' so that writes + * may proceed without blocking. It is important to call this before + * attempting a GFP_KERNEL allocation while holding the mddev lock. + * Must be called with mddev_lock held. + */ +void md_allow_write(mddev_t *mddev) +{ + if (!mddev->pers) + return; + if (mddev->ro) + return; + + spin_lock_irq(&mddev->write_lock); + if (mddev->in_sync) { + mddev->in_sync = 0; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + if (mddev->safemode_delay && + mddev->safemode == 0) + mddev->safemode = 1; + spin_unlock_irq(&mddev->write_lock); + md_update_sb(mddev, 0); + } else + spin_unlock_irq(&mddev->write_lock); +} +EXPORT_SYMBOL_GPL(md_allow_write); + static DECLARE_WAIT_QUEUE_HEAD(resync_wait); #define SYNC_MARKS 10 diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ab74d40cac98..97ee870b265d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2104,6 +2104,8 @@ static int raid1_reshape(mddev_t *mddev) return -EINVAL; } + md_allow_write(mddev); + raid_disks = mddev->raid_disks + mddev->delta_disks; if (raid_disks < conf->raid_disks) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index be008f034ada..8a30b297ac3a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -405,6 +405,8 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ + md_allow_write(conf->mddev); + /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), @@ -3250,6 +3252,7 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) else break; } + md_allow_write(mddev); while (new > conf->max_nr_stripes) { if (grow_one_stripe(conf)) conf->max_nr_stripes++; diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 866a1e2b0ce0..fbaeda79b2e9 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -94,7 +94,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct page *page, int rw); extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); - +extern void md_allow_write(mddev_t *mddev); #endif /* CONFIG_MD */ #endif -- cgit v1.2.3 From da6e1a32fb8d7539a27f699c8671f64d7fefd0cc Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Thu, 8 Feb 2007 14:20:37 -0800 Subject: [PATCH] md: avoid possible BUG_ON in md bitmap handling md/bitmap tracks how many active write requests are pending on blocks associated with each bit in the bitmap, so that it knows when it can clear the bit (when count hits zero). The counter has 14 bits of space, so if there are ever more than 16383, we cannot cope. Currently the code just calles BUG_ON as "all" drivers have request queue limits much smaller than this. However is seems that some don't. Apparently some multipath configurations can allow more than 16383 concurrent write requests. So, in this unlikely situation, instead of calling BUG_ON we now wait for the count to drop down a bit. This requires a new wait_queue_head, some waiting code, and a wakeup call. Tested by limiting the counter to 20 instead of 16383 (writes go a lot slower in that case...). Signed-off-by: Neil Brown Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/bitmap.c | 22 +++++++++++++++++++++- include/linux/raid/bitmap.h | 1 + 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 11108165e264..059704fbb753 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1160,6 +1160,22 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect return 0; } + if (unlikely((*bmc & COUNTER_MAX) == COUNTER_MAX)) { + DEFINE_WAIT(__wait); + /* note that it is safe to do the prepare_to_wait + * after the test as long as we do it before dropping + * the spinlock. + */ + prepare_to_wait(&bitmap->overflow_wait, &__wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&bitmap->lock); + bitmap->mddev->queue + ->unplug_fn(bitmap->mddev->queue); + schedule(); + finish_wait(&bitmap->overflow_wait, &__wait); + continue; + } + switch(*bmc) { case 0: bitmap_file_set_bit(bitmap, offset); @@ -1169,7 +1185,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect case 1: *bmc = 2; } - BUG_ON((*bmc & COUNTER_MAX) == COUNTER_MAX); + (*bmc)++; spin_unlock_irq(&bitmap->lock); @@ -1207,6 +1223,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto if (!success && ! (*bmc & NEEDED_MASK)) *bmc |= NEEDED_MASK; + if ((*bmc & COUNTER_MAX) == COUNTER_MAX) + wake_up(&bitmap->overflow_wait); + (*bmc)--; if (*bmc <= 2) { set_page_attr(bitmap, @@ -1431,6 +1450,7 @@ int bitmap_create(mddev_t *mddev) spin_lock_init(&bitmap->lock); atomic_set(&bitmap->pending_writes, 0); init_waitqueue_head(&bitmap->write_wait); + init_waitqueue_head(&bitmap->overflow_wait); bitmap->mddev = mddev; diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index ebd42a3710b4..6db9a4c15355 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -247,6 +247,7 @@ struct bitmap { atomic_t pending_writes; /* pending writes to the bitmap file */ wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; }; -- cgit v1.2.3