From 46031f9a38a9773021f1872abc713d62467ac22e Mon Sep 17 00:00:00 2001
From: "Raz Ben-Jehuda(caro)" <raziebe@gmail.com>
Date: Sun, 10 Dec 2006 02:20:47 -0800
Subject: [PATCH] md: allow reads that have bypassed the cache to be retried on
 failure

If a bypass-the-cache read fails, we simply try again through the cache.  If
it fails again it will trigger normal recovery precedures.

update 1:

From: NeilBrown <neilb@suse.de>

1/
  chunk_aligned_read and retry_aligned_read assume that
      data_disks == raid_disks - 1
  which is not true for raid6.
  So when an aligned read request bypasses the cache, we can get the wrong data.

2/ The cloned bio is being used-after-free in raid5_align_endio
   (to test BIO_UPTODATE).

3/ We forgot to add rdev->data_offset when submitting
   a bio for aligned-read

4/ clone_bio calls blk_recount_segments and then we change bi_bdev,
   so we need to invalidate the segment counts.

5/ We don't de-reference the rdev when the read completes.
   This means we need to record the rdev to so it is still
   available in the end_io routine.  Fortunately
   bi_next in the original bio is unused at this point so
   we can stuff it in there.

6/ We leak a cloned bio if the target rdev is not usable.

From: NeilBrown <neilb@suse.de>

update 2:

1/ When aligned requests fail (read error) they need to be retried
   via the normal method (stripe cache).  As we cannot be sure that
   we can process a single read in one go (we may not be able to
   allocate all the stripes needed) we store a bio-being-retried
   and a list of bioes-that-still-need-to-be-retried.
   When find a bio that needs to be retried, we should add it to
   the list, not to single-bio...

2/ We were never incrementing 'scnt' when resubmitting failed
   aligned requests.

[akpm@osdl.org: build fix]
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/raid/raid5.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux/raid')

diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 03636d7918fe..d8286db60b96 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -227,7 +227,10 @@ struct raid5_private_data {
 	struct list_head	handle_list; /* stripes needing handling */
 	struct list_head	delayed_list; /* stripes that have plugged requests */
 	struct list_head	bitmap_list; /* stripes delaying awaiting bitmap update */
+	struct bio		*retry_read_aligned; /* currently retrying aligned bios   */
+	struct bio		*retry_read_aligned_list; /* aligned bios retry list  */
 	atomic_t		preread_active_stripes; /* stripes with scheduled io */
+	atomic_t		active_aligned_reads;
 
 	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
 	/* unfortunately we need two cache names as we temporarily have
-- 
cgit v1.2.3


From 2a2275d630b982e5f90206f9bc497f6695a3ec5d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 26 Jan 2007 00:57:11 -0800
Subject: [PATCH] md: fix potential memalloc deadlock in md

If a GFP_KERNEL allocation is attempted in md while the mddev_lock is held,
it is possible for a deadlock to eventuate.

This happens if the array was marked 'clean', and the memalloc triggers a
write-out to the md device.

For the writeout to succeed, the array must be marked 'dirty', and that
requires getting the mddev_lock.

So, before attempting a GFP_KERNEL allocation while holding the lock, make
sure the array is marked 'dirty' (unless it is currently read-only).

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c         | 29 +++++++++++++++++++++++++++++
 drivers/md/raid1.c      |  2 ++
 drivers/md/raid5.c      |  3 +++
 include/linux/raid/md.h |  2 +-
 4 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include/linux/raid')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index ec3d8e8a0bd3..e8807ea5377d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3564,6 +3564,8 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
 	char *ptr, *buf = NULL;
 	int err = -ENOMEM;
 
+	md_allow_write(mddev);
+
 	file = kmalloc(sizeof(*file), GFP_KERNEL);
 	if (!file)
 		goto out;
@@ -5032,6 +5034,33 @@ void md_write_end(mddev_t *mddev)
 	}
 }
 
+/* md_allow_write(mddev)
+ * Calling this ensures that the array is marked 'active' so that writes
+ * may proceed without blocking.  It is important to call this before
+ * attempting a GFP_KERNEL allocation while holding the mddev lock.
+ * Must be called with mddev_lock held.
+ */
+void md_allow_write(mddev_t *mddev)
+{
+	if (!mddev->pers)
+		return;
+	if (mddev->ro)
+		return;
+
+	spin_lock_irq(&mddev->write_lock);
+	if (mddev->in_sync) {
+		mddev->in_sync = 0;
+		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+		if (mddev->safemode_delay &&
+		    mddev->safemode == 0)
+			mddev->safemode = 1;
+		spin_unlock_irq(&mddev->write_lock);
+		md_update_sb(mddev, 0);
+	} else
+		spin_unlock_irq(&mddev->write_lock);
+}
+EXPORT_SYMBOL_GPL(md_allow_write);
+
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 
 #define SYNC_MARKS	10
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ab74d40cac98..97ee870b265d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2104,6 +2104,8 @@ static int raid1_reshape(mddev_t *mddev)
 		return -EINVAL;
 	}
 
+	md_allow_write(mddev);
+
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
 
 	if (raid_disks < conf->raid_disks) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index be008f034ada..8a30b297ac3a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -405,6 +405,8 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	if (newsize <= conf->pool_size)
 		return 0; /* never bother to shrink */
 
+	md_allow_write(conf->mddev);
+
 	/* Step 1 */
 	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
 			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
@@ -3250,6 +3252,7 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 		else
 			break;
 	}
+	md_allow_write(mddev);
 	while (new > conf->max_nr_stripes) {
 		if (grow_one_stripe(conf))
 			conf->max_nr_stripes++;
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 866a1e2b0ce0..fbaeda79b2e9 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -94,7 +94,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 			struct page *page, int rw);
 extern void md_do_sync(mddev_t *mddev);
 extern void md_new_event(mddev_t *mddev);
-
+extern void md_allow_write(mddev_t *mddev);
 
 #endif /* CONFIG_MD */
 #endif 
-- 
cgit v1.2.3


From da6e1a32fb8d7539a27f699c8671f64d7fefd0cc Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Thu, 8 Feb 2007 14:20:37 -0800
Subject: [PATCH] md: avoid possible BUG_ON in md bitmap handling

md/bitmap tracks how many active write requests are pending on blocks
associated with each bit in the bitmap, so that it knows when it can clear
the bit (when count hits zero).

The counter has 14 bits of space, so if there are ever more than 16383, we
cannot cope.

Currently the code just calles BUG_ON as "all" drivers have request queue
limits much smaller than this.

However is seems that some don't.  Apparently some multipath configurations
can allow more than 16383 concurrent write requests.

So, in this unlikely situation, instead of calling BUG_ON we now wait
for the count to drop down a bit.  This requires a new wait_queue_head,
some waiting code, and a wakeup call.

Tested by limiting the counter to 20 instead of 16383 (writes go a lot slower
in that case...).

Signed-off-by: Neil Brown <neilb@suse.de>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/bitmap.c         | 22 +++++++++++++++++++++-
 include/linux/raid/bitmap.h |  1 +
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include/linux/raid')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 11108165e264..059704fbb753 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1160,6 +1160,22 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 			return 0;
 		}
 
+		if (unlikely((*bmc & COUNTER_MAX) == COUNTER_MAX)) {
+			DEFINE_WAIT(__wait);
+			/* note that it is safe to do the prepare_to_wait
+			 * after the test as long as we do it before dropping
+			 * the spinlock.
+			 */
+			prepare_to_wait(&bitmap->overflow_wait, &__wait,
+					TASK_UNINTERRUPTIBLE);
+			spin_unlock_irq(&bitmap->lock);
+			bitmap->mddev->queue
+				->unplug_fn(bitmap->mddev->queue);
+			schedule();
+			finish_wait(&bitmap->overflow_wait, &__wait);
+			continue;
+		}
+
 		switch(*bmc) {
 		case 0:
 			bitmap_file_set_bit(bitmap, offset);
@@ -1169,7 +1185,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 		case 1:
 			*bmc = 2;
 		}
-		BUG_ON((*bmc & COUNTER_MAX) == COUNTER_MAX);
+
 		(*bmc)++;
 
 		spin_unlock_irq(&bitmap->lock);
@@ -1207,6 +1223,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 		if (!success && ! (*bmc & NEEDED_MASK))
 			*bmc |= NEEDED_MASK;
 
+		if ((*bmc & COUNTER_MAX) == COUNTER_MAX)
+			wake_up(&bitmap->overflow_wait);
+
 		(*bmc)--;
 		if (*bmc <= 2) {
 			set_page_attr(bitmap,
@@ -1431,6 +1450,7 @@ int bitmap_create(mddev_t *mddev)
 	spin_lock_init(&bitmap->lock);
 	atomic_set(&bitmap->pending_writes, 0);
 	init_waitqueue_head(&bitmap->write_wait);
+	init_waitqueue_head(&bitmap->overflow_wait);
 
 	bitmap->mddev = mddev;
 
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index ebd42a3710b4..6db9a4c15355 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -247,6 +247,7 @@ struct bitmap {
 
 	atomic_t pending_writes; /* pending writes to the bitmap file */
 	wait_queue_head_t write_wait;
+	wait_queue_head_t overflow_wait;
 
 };
 
-- 
cgit v1.2.3