From 6b8b3e8a8b3e62b4209eaa36697e3c9df457e196 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 4 Aug 2005 12:53:35 -0700
Subject: [PATCH] md: make sure md bitmap updates are flushed when array is stopped.

The recent change to never ignore the bitmap revealed that the bitmap isn't
being flushed properly when an array is stopped.  We call bitmap_daemon_work
three times as there is a three-stage pipeline for flushing updates to the
bitmap file.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/raid/bitmap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/raid')

diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 6213e976eade..4bf1659f8aa8 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -248,6 +248,7 @@ struct bitmap {

 /* these are used only by md/bitmap */
 int  bitmap_create(mddev_t *mddev);
+void bitmap_flush(mddev_t *mddev);
 void bitmap_destroy(mddev_t *mddev);

 int  bitmap_active(struct bitmap *bitmap);
--
cgit v1.2.3


From 36fa30636fb84b209210299684e1be66d9e58217 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Fri, 9 Sep 2005 16:23:45 -0700
Subject: [PATCH] md: allow hot-add and hot-remove of md intent logging bitmaps

Both file-bitmaps and superblock bitmaps are supported.  If you add a bitmap
file on the array device, you lose.

This introduces a 'default_bitmap_offset' field in mddev, as the ioctl used
for adding a superblock bitmap doesn't have room for giving an offset.
Later, this value will be settable via sysfs.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/md.c           | 104 ++++++++++++++++++++++++++++++++++++++--------
 drivers/md/raid1.c        |  30 +++++++++++++
 include/linux/raid/md_k.h |  10 +++++
 3 files changed, 127 insertions(+), 17 deletions(-)

(limited to 'include/linux/raid')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 63c566165189..ae654466dc23 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -623,6 +623,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->size = sb->size;
 		mddev->events = md_event(sb);
 		mddev->bitmap_offset = 0;
+		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;

 		if (sb->state & (1<<MD_SB_CLEAN))
 			mddev->recovery_cp = MaxSector;
@@ -648,7 +649,7 @@
 				printk(KERN_WARNING "md: bitmaps only support for raid1\n");
 				return -EINVAL;
 			}
-			mddev->bitmap_offset = (MD_SB_BYTES >> 9);
+			mddev->bitmap_offset = mddev->default_bitmap_offset;
 		}

 	} else if (mddev->pers == NULL) {
@@ -939,6 +940,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->size = le64_to_cpu(sb->size)/2;
 		mddev->events = le64_to_cpu(sb->events);
 		mddev->bitmap_offset = 0;
+		mddev->default_bitmap_offset = 0;
+		if (mddev->minor_version == 0)
+			mddev->default_bitmap_offset = -(64*1024)/512;

 		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
 		memcpy(mddev->uuid, sb->set_uuid, 16);
@@ -2073,6 +2077,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
 	info.state         = 0;
 	if (mddev->in_sync)
 		info.state = (1<<MD_SB_CLEAN);
+	if (mddev->bitmap && mddev->bitmap_offset)
+		info.state = (1<<MD_SB_BITMAP_PRESENT);
@@ ... @@ static int set_bitmap_file(mddev_t *mddev, int fd)
-	if (mddev->pers || mddev->bitmap_file)
-		return -EBUSY;
+	if (mddev->pers) {
+		if (!mddev->pers->quiesce)
+			return -EBUSY;
+		if (mddev->recovery || mddev->sync_thread)
+			return -EBUSY;
+		/* we should be able to change the bitmap..
*/ + } - mddev->bitmap_file = fget(fd); - if (mddev->bitmap_file == NULL) { - printk(KERN_ERR "%s: error: failed to get bitmap file\n", - mdname(mddev)); - return -EBADF; - } + if (fd >= 0) { + if (mddev->bitmap) + return -EEXIST; /* cannot add when bitmap is present */ + mddev->bitmap_file = fget(fd); - err = deny_bitmap_write_access(mddev->bitmap_file); - if (err) { - printk(KERN_ERR "%s: error: bitmap file is already in use\n", - mdname(mddev)); - fput(mddev->bitmap_file); - mddev->bitmap_file = NULL; - } else + if (mddev->bitmap_file == NULL) { + printk(KERN_ERR "%s: error: failed to get bitmap file\n", + mdname(mddev)); + return -EBADF; + } + + err = deny_bitmap_write_access(mddev->bitmap_file); + if (err) { + printk(KERN_ERR "%s: error: bitmap file is already in use\n", + mdname(mddev)); + fput(mddev->bitmap_file); + mddev->bitmap_file = NULL; + return err; + } mddev->bitmap_offset = 0; /* file overrides offset */ + } else if (mddev->bitmap == NULL) + return -ENOENT; /* cannot remove what isn't there */ + err = 0; + if (mddev->pers) { + mddev->pers->quiesce(mddev, 1); + if (fd >= 0) + err = bitmap_create(mddev); + if (fd < 0 || err) + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + } else if (fd < 0) { + if (mddev->bitmap_file) + fput(mddev->bitmap_file); + mddev->bitmap_file = NULL; + } + return err; } @@ -2528,6 +2560,11 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) { int rv = 0; int cnt = 0; + int state = 0; + + /* calculate expected state,ignoring low bits */ + if (mddev->bitmap && mddev->bitmap_offset) + state |= (1 << MD_SB_BITMAP_PRESENT); if (mddev->major_version != info->major_version || mddev->minor_version != info->minor_version || @@ -2536,12 +2573,16 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->level != info->level || /* mddev->layout != info->layout || */ !mddev->persistent != info->not_persistent|| - mddev->chunk_size != info->chunk_size ) + mddev->chunk_size != info->chunk_size || + /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ + ((state^info->state) & 0xfffffe00) + ) return -EINVAL; /* Check there is only one change */ if (mddev->size != info->size) cnt++; if (mddev->raid_disks != info->raid_disks) cnt++; if (mddev->layout != info->layout) cnt++; + if ((state ^ info->state) & (1< 1) return -EINVAL; @@ -2620,6 +2661,35 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) } } } + if ((state ^ info->state) & (1<pers->quiesce == NULL) + return -EINVAL; + if (mddev->recovery || mddev->sync_thread) + return -EBUSY; + if (info->state & (1<bitmap) + return -EEXIST; + if (mddev->default_bitmap_offset == 0) + return -EINVAL; + mddev->bitmap_offset = mddev->default_bitmap_offset; + mddev->pers->quiesce(mddev, 1); + rv = bitmap_create(mddev); + if (rv) + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + } else { + /* remove the bitmap */ + if (!mddev->bitmap) + return -ENOENT; + if (mddev->bitmap->file) + return -EINVAL; + mddev->pers->quiesce(mddev, 1); + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + mddev->bitmap_offset = 0; + } + } md_update_sb(mddev); return rv; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ace41c571aeb..ba643e4bfac9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1565,6 +1565,35 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) return 0; } +void raid1_quiesce(mddev_t *mddev, int state) +{ + conf_t *conf = mddev_to_conf(mddev); + + switch(state) { + case 0: + 
spin_lock_irq(&conf->resync_lock);
+		conf->barrier--;
+		spin_unlock_irq(&conf->resync_lock);
+		wake_up(&conf->wait_resume);
+		wake_up(&conf->wait_idle);
+		break;
+	case 1:
+		spin_lock_irq(&conf->resync_lock);
+		conf->barrier++;
+		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
+				    conf->resync_lock, raid1_unplug(mddev->queue));
+		spin_unlock_irq(&conf->resync_lock);
+		break;
+	}
+	if (mddev->thread) {
+		if (mddev->bitmap)
+			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+		else
+			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+		md_wakeup_thread(mddev->thread);
+	}
+}
+

 static mdk_personality_t raid1_personality =
 {
@@ -1581,6 +1610,7 @@ static mdk_personality_t raid1_personality =
 	.sync_request	= sync_request,
 	.resize		= raid1_resize,
 	.reshape	= raid1_reshape,
+	.quiesce	= raid1_quiesce,
 };

 static int __init raid_init(void)
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 8c14ba565a45..817062bf7352 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -278,6 +278,10 @@ struct mddev_s
 					 * start of bitmap. May be
 					 * negative, but not '0'
 					 */
+	long			default_bitmap_offset; /* this is the offset to use when
+							* hot-adding a bitmap.  It should
+							* eventually be settable by sysfs.
+							*/

 	struct list_head	all_mddevs;
 };
@@ -314,6 +318,12 @@ struct mdk_personality_s
 	int (*resize) (mddev_t *mddev, sector_t sectors);
 	int (*reshape) (mddev_t *mddev, int raid_disks);
 	int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
+	/* quiesce moves between quiescence states
+	 * 0 - fully active
+	 * 1 - no new requests allowed
+	 * others - reserved
+	 */
+	void (*quiesce) (mddev_t *mddev, int state);
 };
--
cgit v1.2.3


From 8ddf9efe6708f3674f0ddfeb6425fd27bea109a2 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Fri, 9 Sep 2005 16:23:45 -0700
Subject: [PATCH] md: support write-mostly device in raid1

This allows a device in a raid1 to be marked as "write mostly".  Read
requests will only be sent to it if there is no other option.
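To make the selection rule concrete, here is a minimal userspace sketch of
the idea (all names are invented for illustration; this is not the raid1
code): an in-sync disk without the write-mostly flag always wins, and a
flagged disk is remembered only as a last resort.

    #include <stdio.h>

    struct toy_disk {
        int present;        /* rdev != NULL */
        int in_sync;        /* full member of the array */
        int write_mostly;   /* avoid reading if at all possible */
    };

    /* Return the index of the disk a read should go to, or -1. */
    static int pick_read_disk(const struct toy_disk *d, int n)
    {
        int wonly = -1;     /* best write-mostly candidate so far */

        for (int i = 0; i < n; i++) {
            if (!d[i].present || !d[i].in_sync)
                continue;
            if (!d[i].write_mostly)
                return i;   /* first fully readable disk wins */
            wonly = i;      /* usable, but only in dire need */
        }
        return wonly;       /* may still be -1 if nothing is usable */
    }

    int main(void)
    {
        struct toy_disk disks[] = {
            { 1, 1, 1 },    /* in sync, but write-mostly */
            { 1, 0, 0 },    /* present, not in sync */
            { 1, 1, 0 },    /* the disk reads should hit */
        };
        printf("read goes to disk %d\n", pick_read_disk(disks, 3)); /* 2 */
        return 0;
    }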
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 18 +++++++++++ drivers/md/raid1.c | 76 ++++++++++++++++++++++++++++++++--------------- include/linux/raid/md_k.h | 3 ++ include/linux/raid/md_p.h | 11 +++++-- 4 files changed, 82 insertions(+), 26 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/md.c b/drivers/md/md.c index ae654466dc23..f1ac356e656d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -670,6 +670,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->level != LEVEL_MULTIPATH) { rdev->faulty = 0; + rdev->flags = 0; desc = sb->disks + rdev->desc_nr; if (desc->state & (1<in_sync = 1; rdev->raid_disk = desc->raid_disk; } + if (desc->state & (1<flags); } else /* MULTIPATH are always insync */ rdev->in_sync = 1; return 0; @@ -777,6 +780,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) spare++; working++; } + if (test_bit(WriteMostly, &rdev2->flags)) + d->state |= (1<raid_disk = role; break; } + rdev->flags = 0; + if (sb->devflags & WriteMostly1) + set_bit(WriteMostly, &rdev->flags); } else /* MULTIPATH are always insync */ rdev->in_sync = 1; @@ -2152,6 +2160,8 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) info.state |= (1<flags)) + info.state |= (1<saved_raid_disk = rdev->raid_disk; rdev->in_sync = 0; /* just to be sure */ + if (info->state & (1<flags); + rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) @@ -2277,6 +2290,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) else rdev->in_sync = 0; + if (info->state & (1<flags); + err = bind_rdev_to_array(rdev, mddev); if (err) { export_rdev(rdev); @@ -3329,6 +3345,8 @@ static int md_seq_show(struct seq_file *seq, void *v) char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); + if (test_bit(WriteMostly, &rdev->flags)) + seq_printf(seq, "(W)"); if (rdev->faulty) { seq_printf(seq, "(F)"); continue; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ba643e4bfac9..28839a8193f2 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -360,13 +360,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) { const unsigned long this_sector = r1_bio->sector; int new_disk = conf->last_used, disk = new_disk; + int wonly_disk = -1; const int sectors = r1_bio->sectors; sector_t new_distance, current_distance; - mdk_rdev_t *new_rdev, *rdev; + mdk_rdev_t *rdev; rcu_read_lock(); /* - * Check if it if we can balance. We can balance on the whole + * Check if we can balance. We can balance on the whole * device if no resync is going on, or below the resync window. * We take the first readable disk when above the resync window. 
*/ @@ -376,11 +377,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* Choose the first operation device, for consistancy */ new_disk = 0; - while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || - !new_rdev->in_sync) { - new_disk++; - if (new_disk == conf->raid_disks) { - new_disk = -1; + for (rdev = conf->mirrors[new_disk].rdev; + !rdev || !rdev->in_sync + || test_bit(WriteMostly, &rdev->flags); + rdev = conf->mirrors[++new_disk].rdev) { + + if (rdev && rdev->in_sync) + wonly_disk = new_disk; + + if (new_disk == conf->raid_disks - 1) { + new_disk = wonly_disk; break; } } @@ -389,16 +395,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* make sure the disk is operational */ - while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || - !new_rdev->in_sync) { + for (rdev = conf->mirrors[new_disk].rdev; + !rdev || !rdev->in_sync || + test_bit(WriteMostly, &rdev->flags); + rdev = conf->mirrors[new_disk].rdev) { + + if (rdev && rdev->in_sync) + wonly_disk = new_disk; + if (new_disk <= 0) new_disk = conf->raid_disks; new_disk--; if (new_disk == disk) { - new_disk = -1; - goto rb_out; + new_disk = wonly_disk; + break; } } + + if (new_disk < 0) + goto rb_out; + disk = new_disk; /* now disk == new_disk == starting point for search */ @@ -419,37 +435,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) disk = conf->raid_disks; disk--; - if ((rdev=conf->mirrors[disk].rdev) == NULL || - !rdev->in_sync) + rdev = conf->mirrors[disk].rdev; + + if (!rdev || + !rdev->in_sync || + test_bit(WriteMostly, &rdev->flags)) continue; if (!atomic_read(&rdev->nr_pending)) { new_disk = disk; - new_rdev = rdev; break; } new_distance = abs(this_sector - conf->mirrors[disk].head_position); if (new_distance < current_distance) { current_distance = new_distance; new_disk = disk; - new_rdev = rdev; } } while (disk != conf->last_used); -rb_out: + rb_out: if (new_disk >= 0) { - conf->next_seq_sect = this_sector + sectors; - conf->last_used = new_disk; - atomic_inc(&new_rdev->nr_pending); - if (!new_rdev->in_sync) { + rdev = conf->mirrors[new_disk].rdev; + if (!rdev) + goto retry; + atomic_inc(&rdev->nr_pending); + if (!rdev->in_sync) { /* cannot risk returning a device that failed * before we inc'ed nr_pending */ - atomic_dec(&new_rdev->nr_pending); + atomic_dec(&rdev->nr_pending); goto retry; } + conf->next_seq_sect = this_sector + sectors; + conf->last_used = new_disk; } rcu_read_unlock(); @@ -1109,6 +1129,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sector_t max_sector, nr_sectors; int disk; int i; + int wonly; int write_targets = 0; int sync_blocks; int still_degraded = 0; @@ -1164,14 +1185,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i */ disk = conf->last_used; /* make sure disk is operational */ - + wonly = disk; while (conf->mirrors[disk].rdev == NULL || - !conf->mirrors[disk].rdev->in_sync) { + !conf->mirrors[disk].rdev->in_sync || + test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags) + ) { + if (conf->mirrors[disk].rdev && + conf->mirrors[disk].rdev->in_sync) + wonly = disk; if (disk <= 0) disk = conf->raid_disks; disk--; - if (disk == conf->last_used) + if (disk == conf->last_used) { + disk = wonly; break; + } } conf->last_used = disk; atomic_inc(&conf->mirrors[disk].rdev->nr_pending); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 817062bf7352..7ef78e15ce04 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -181,6 +181,9 @@ struct mdk_rdev_s int 
faulty;			/* if faulty do not issue IO requests */
 	int in_sync;		/* device is a full member of the array */

+	unsigned long flags;	/* Should include faulty and in_sync here. */
+#define	WriteMostly	4	/* Avoid reading if at all possible */
+
 	int desc_nr;		/* descriptor index in the superblock */
 	int raid_disk;		/* role of device in array */
 	int saved_raid_disk;	/* role that device used to have in the
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index dc65cd435494..4f047f84fb1f 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -79,6 +79,11 @@
 #define MD_DISK_SYNC		2 /* disk is in sync with the raid set */
 #define MD_DISK_REMOVED		3 /* disk is in sync with the raid set */

+#define	MD_DISK_WRITEMOSTLY	9 /* disk is "write-mostly" is RAID1 config.
+				   * read requests will only be sent here in
+				   * dire need
+				   */
+
 typedef struct mdp_device_descriptor_s {
 	__u32 number;		/* 0 Device number in the entire set	      */
 	__u32 major;		/* 1 Device major number		      */
@@ -193,7 +198,7 @@ struct mdp_superblock_1 {

 	__u64	ctime;		/* lo 40 bits are seconds, top 24 are microseconds or 0*/
 	__u32	level;		/* -4 (multipath), -1 (linear), 0,1,4,5 */
-	__u32	layout;		/* only for raid5 currently */
+	__u32	layout;		/* only for raid5 and raid10 currently */
 	__u64	size;		/* used size of component devices, in 512byte sectors */

 	__u32	chunksize;	/* in 512byte sectors */
@@ -212,7 +217,9 @@ struct mdp_superblock_1 {
 	__u32	dev_number;	/* permanent identifier of this device - not role in raid */
 	__u32	cnt_corrected_read; /* number of read errors that were corrected by re-writing */
 	__u8	device_uuid[16]; /* user-space setable, ignored by kernel */
-	__u8	pad2[64-56];	/* set to 0 when writing */
+	__u8	devflags;	/* per-device flags.  Only one defined...*/
+#define	WriteMostly1	1	/* mask for writemostly flag in above */
+	__u8	pad2[64-57];	/* set to 0 when writing */

 	/* array state information - 64 bytes */
 	__u64	utime;		/* 40 bits second, 24 btes microseconds */
--
cgit v1.2.3


From 4b6d287f627b5fb6a49f78f9e81649ff98c62bb7 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Fri, 9 Sep 2005 16:23:47 -0700
Subject: [PATCH] md: add write-behind support for md/raid1

If a device is flagged 'WriteMostly' and the array has a bitmap, and the
bitmap superblock indicates that write_behind is allowed, then write_behind
is enabled for WriteMostly devices.

Write requests will be acknowledged as complete to the caller (via b_end_io)
when all non-WriteMostly devices have completed the write, but will not be
cleared from the bitmap until all devices complete.

This requires memory allocation to make a local copy of the data being
written.  If there is insufficient memory, then we fall back on normal write
semantics.
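The bookkeeping can be pictured with a small userspace sketch (invented
names, not the raid1 implementation): one counter tracks all outstanding
mirror writes, another tracks just the write-behind ones, and the caller is
acknowledged as soon as only write-behind writes remain in flight.

    #include <stdio.h>

    struct toy_request {
        int remaining;         /* writes still outstanding, any disk */
        int behind_remaining;  /* of those, writes to write-mostly disks */
        int returned;          /* caller already acknowledged? */
    };

    static void complete_write(struct toy_request *r, int was_behind)
    {
        if (was_behind)
            r->behind_remaining--;

        /* Everything still in flight is a write-behind write, so
         * the caller no longer has to wait. */
        if (!r->returned && r->behind_remaining >= r->remaining - 1) {
            r->returned = 1;
            printf("ack caller\n");
        }
        if (--r->remaining == 0)
            printf("all writes done: clear bitmap bits, free request\n");
    }

    int main(void)
    {
        struct toy_request r = { .remaining = 3, .behind_remaining = 1 };
        complete_write(&r, 0);  /* fast disk #1 */
        complete_write(&r, 0);  /* fast disk #2 -> ack caller */
        complete_write(&r, 1);  /* slow write-mostly disk -> free */
        return 0;
    }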
Signed-Off-By: Paul Clements Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/bitmap.c | 26 ++++++++-- drivers/md/raid1.c | 124 +++++++++++++++++++++++++++++++++++++++++--- include/linux/raid/bitmap.h | 15 ++++-- include/linux/raid/md_k.h | 3 ++ include/linux/raid/raid1.h | 13 +++++ 5 files changed, 165 insertions(+), 16 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 2925219f0881..2c84de2b4ad5 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -437,6 +437,7 @@ void bitmap_print_sb(struct bitmap *bitmap) printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); printk(KERN_DEBUG " sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); + printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); kunmap(bitmap->sb_page); } @@ -445,7 +446,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) { char *reason = NULL; bitmap_super_t *sb; - unsigned long chunksize, daemon_sleep; + unsigned long chunksize, daemon_sleep, write_behind; unsigned long bytes_read; unsigned long long events; int err = -EINVAL; @@ -474,6 +475,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep); + write_behind = le32_to_cpu(sb->write_behind); /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) @@ -485,7 +487,9 @@ static int bitmap_read_sb(struct bitmap *bitmap) else if ((1 << ffz(~chunksize)) != chunksize) reason = "bitmap chunksize not a power of 2"; else if (daemon_sleep < 1 || daemon_sleep > 15) - reason = "daemon sleep period out of range"; + reason = "daemon sleep period out of range (1-15s)"; + else if (write_behind > COUNTER_MAX) + reason = "write-behind limit out of range (0 - 16383)"; if (reason) { printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", bmname(bitmap), reason); @@ -518,6 +522,7 @@ success: /* assign fields using values from superblock */ bitmap->chunksize = chunksize; bitmap->daemon_sleep = daemon_sleep; + bitmap->max_write_behind = write_behind; bitmap->flags |= sb->state; bitmap->events_cleared = le64_to_cpu(sb->events_cleared); if (sb->state & BITMAP_STALE) @@ -1282,9 +1287,16 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, } } -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors) +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) { if (!bitmap) return 0; + + if (behind) { + atomic_inc(&bitmap->behind_writes); + PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", + atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + } + while (sectors) { int blocks; bitmap_counter_t *bmc; @@ -1319,9 +1331,15 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect } void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, - int success) + int success, int behind) { if (!bitmap) return; + if (behind) { + atomic_dec(&bitmap->behind_writes); + PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", + atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + } + while (sectors) { int blocks; unsigned long flags; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 28839a8193f2..ba7f5f256161 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio) { struct bio *bio = 
r1_bio->master_bio; - bio_endio(bio, bio->bi_size, - test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + /* if nobody has done the final endio yet, do it now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", + (bio_data_dir(bio) == WRITE) ? "write" : "read", + (unsigned long long) bio->bi_sector, + (unsigned long long) bio->bi_sector + + (bio->bi_size >> 9) - 1); + + bio_endio(bio, bio->bi_size, + test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + } free_r1bio(r1_bio); } @@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); - int mirror; + int mirror, behind; conf_t *conf = mddev_to_conf(r1_bio->mddev); if (bio->bi_size) @@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int update_head_pos(mirror, r1_bio); + behind = test_bit(R1BIO_BehindIO, &r1_bio->state); + if (behind) { + if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + atomic_dec(&r1_bio->behind_remaining); + + /* In behind mode, we ACK the master bio once the I/O has safely + * reached all non-writemostly disks. Setting the Returned bit + * ensures that this gets done only once -- we don't ever want to + * return -EIO here, instead we'll wait */ + + if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && + test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* Maybe we can return now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + struct bio *mbio = r1_bio->master_bio; + PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", + (unsigned long long) mbio->bi_sector, + (unsigned long long) mbio->bi_sector + + (mbio->bi_size >> 9) - 1); + bio_endio(mbio, mbio->bi_size, 0); + } + } + } /* * * Let's see if all mirrored write operations have finished * already. 
*/ if (atomic_dec_and_test(&r1_bio->remaining)) { + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = bio->bi_vcnt; + while (i--) + __free_page(bio->bi_io_vec[i].bv_page); + } /* clear the bitmap if all writes complete successfully */ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state)); + !test_bit(R1BIO_Degraded, &r1_bio->state), + behind); md_write_end(r1_bio->mddev); raid_end_bio_io(r1_bio); } @@ -562,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect) spin_unlock_irq(&conf->resync_lock); } +/* duplicate the data pages for behind I/O */ +static struct page **alloc_behind_pages(struct bio *bio) +{ + int i; + struct bio_vec *bvec; + struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), + GFP_NOIO); + if (unlikely(!pages)) + goto do_sync_io; + + memset(pages, 0, bio->bi_vcnt * sizeof(struct page *)); + + bio_for_each_segment(bvec, bio, i) { + pages[i] = alloc_page(GFP_NOIO); + if (unlikely(!pages[i])) + goto do_sync_io; + memcpy(kmap(pages[i]) + bvec->bv_offset, + kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); + kunmap(pages[i]); + kunmap(bvec->bv_page); + } + + return pages; + +do_sync_io: + if (pages) + for (i = 0; i < bio->bi_vcnt && pages[i]; i++) + __free_page(pages[i]); + kfree(pages); + PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); + return NULL; +} + static int make_request(request_queue_t *q, struct bio * bio) { mddev_t *mddev = q->queuedata; @@ -574,6 +646,7 @@ static int make_request(request_queue_t *q, struct bio * bio) struct bitmap *bitmap = mddev->bitmap; unsigned long flags; struct bio_list bl; + struct page **behind_pages = NULL; if (unlikely(bio_barrier(bio))) { bio_endio(bio, bio->bi_size, -EOPNOTSUPP); @@ -613,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio) r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; - r1_bio->state = 0; - if (bio_data_dir(bio) == READ) { /* * read balancing logic: @@ -675,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio) } rcu_read_unlock(); + BUG_ON(targets == 0); /* we never fail the last device */ + if (targets < conf->raid_disks) { /* array is degraded, we will not clear the bitmap * on I/O completion (see raid1_end_write_request) */ set_bit(R1BIO_Degraded, &r1_bio->state); } + /* do behind I/O ? */ + if (bitmap && + atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && + (behind_pages = alloc_behind_pages(bio)) != NULL) + set_bit(R1BIO_BehindIO, &r1_bio->state); + atomic_set(&r1_bio->remaining, 0); + atomic_set(&r1_bio->behind_remaining, 0); bio_list_init(&bl); for (i = 0; i < disks; i++) { @@ -698,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio) mbio->bi_rw = WRITE; mbio->bi_private = r1_bio; + if (behind_pages) { + struct bio_vec *bvec; + int j; + + /* Yes, I really want the '__' version so that + * we clear any unused pointer in the io_vec, rather + * than leave them unchanged. 
This is important + * because when we come to free the pages, we won't + * know the originial bi_idx, so we just free + * them all + */ + __bio_for_each_segment(bvec, mbio, j, 0) + bvec->bv_page = behind_pages[j]; + if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) + atomic_inc(&r1_bio->behind_remaining); + } + atomic_inc(&r1_bio->remaining); bio_list_add(&bl, mbio); } + kfree(behind_pages); /* the behind pages are attached to the bios now */ - bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); + bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, + test_bit(R1BIO_BehindIO, &r1_bio->state)); spin_lock_irqsave(&conf->device_lock, flags); bio_list_merge(&conf->pending_bio_list, &bl); bio_list_init(&bl); @@ -1471,6 +1570,17 @@ out: static int stop(mddev_t *mddev) { conf_t *conf = mddev_to_conf(mddev); + struct bitmap *bitmap = mddev->bitmap; + int behind_wait = 0; + + /* wait for behind writes to complete */ + while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + behind_wait++; + printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); /* wait a second */ + /* need to kick something here to make sure I/O goes? */ + } md_unregister_thread(mddev->thread); mddev->thread = NULL; diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index 4bf1659f8aa8..9de99198caf1 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -7,7 +7,7 @@ #define BITMAP_H 1 #define BITMAP_MAJOR 3 -#define BITMAP_MINOR 38 +#define BITMAP_MINOR 39 /* * in-memory bitmap: @@ -147,8 +147,9 @@ typedef struct bitmap_super_s { __u32 state; /* 48 bitmap state information */ __u32 chunksize; /* 52 the bitmap chunk size in bytes */ __u32 daemon_sleep; /* 56 seconds between disk flushes */ + __u32 write_behind; /* 60 number of outstanding write-behind writes */ - __u8 pad[256 - 60]; /* set to zero */ + __u8 pad[256 - 64]; /* set to zero */ } bitmap_super_t; /* notes: @@ -226,6 +227,9 @@ struct bitmap { unsigned long flags; + unsigned long max_write_behind; /* write-behind mode */ + atomic_t behind_writes; + /* * the bitmap daemon - periodically wakes up and sweeps the bitmap * file, cleaning up bits and flushing out pages to disk as necessary @@ -260,9 +264,10 @@ int bitmap_setallbits(struct bitmap *bitmap); void bitmap_write_all(struct bitmap *bitmap); /* these are exported */ -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors); -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, - int success); +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); void bitmap_close_sync(struct bitmap *bitmap); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 7ef78e15ce04..2514e5fcda7f 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -275,6 +275,9 @@ struct mddev_s atomic_t writes_pending; request_queue_t *queue; /* for plugging ... 
*/

+	atomic_t		write_behind; /* outstanding async IO */
+	unsigned int		max_write_behind; /* 0 = sync */
+
 	struct bitmap		*bitmap; /* the bitmap for the device */
 	struct file		*bitmap_file; /* the bitmap file */
 	long			bitmap_offset; /* offset from superblock of
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index 9d93cf12e890..60e19b667548 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -80,6 +80,9 @@ struct r1bio_s {
 	atomic_t		remaining; /* 'have we finished' count,
 					    * used from IRQ handlers
 					    */
+	atomic_t		behind_remaining; /* number of write-behind ios remaining
+						   * in this BehindIO request
+						   */
 	sector_t		sector;
 	int			sectors;
 	unsigned long		state;
@@ -107,4 +110,14 @@ struct r1bio_s {
 #define	R1BIO_Uptodate	0
 #define	R1BIO_IsSync	1
 #define	R1BIO_Degraded	2
+#define	R1BIO_BehindIO	3
+/* For write-behind requests, we call bi_end_io when
+ * the last non-write-behind device completes, providing
+ * any write was successful.  Otherwise we call when
+ * any write-behind write succeeds, otherwise we call
+ * with failure when last write completes (and all failed).
+ * Record that bi_end_io was called with this flag...
+ */
+#define	R1BIO_Returned	4
+
 #endif
--
cgit v1.2.3


From 15945fee6f09bff1f86b1a735b5888dc59cf38e3 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Fri, 9 Sep 2005 16:23:47 -0700
Subject: [PATCH] md: support md/linear array with components greater than 2 terabytes.

linear currently uses division by the size of the smallest component device
to find which device a request goes to.  If that smallest device is larger
than 2 terabytes, then the division will not work on some systems.

So we introduce a pre-shift, and take care not to make the hash table too
large, much like the code in raid0.

Also get rid of conf->nr_zones, which is not needed.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/linear.c         | 95 +++++++++++++++++++++++++++++++--------------
 include/linux/raid/linear.h |  4 +-
 2 files changed, 68 insertions(+), 31 deletions(-)

(limited to 'include/linux/raid')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 4991ba543368..bb279fad2fd2 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -38,7 +38,8 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
 	/*
 	 * sector_div(a,b) returns the remainer and sets a to a/b
 	 */
-	(void)sector_div(block, conf->smallest->size);
+	block >>= conf->preshift;
+	(void)sector_div(block, conf->hash_spacing);
 	hash = conf->hash_table[block];

 	while ((sector>>1) >= (hash->size + hash->offset))
@@ -47,7 +48,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
 }

 /**
- *	linear_mergeable_bvec -- tell bio layer if a two requests can be merged
+ *	linear_mergeable_bvec -- tell bio layer if two requests can be merged
 *	@q: request queue
 *	@bio: the buffer head that's been built up so far
 *	@biovec: the request that could be merged to it.
@@ -116,7 +117,7 @@ static int linear_run (mddev_t *mddev)
 	dev_info_t **table;
 	mdk_rdev_t *rdev;
 	int i, nb_zone, cnt;
-	sector_t start;
+	sector_t min_spacing;
 	sector_t curr_offset;
 	struct list_head *tmp;

@@ -127,11 +128,6 @@ static int linear_run (mddev_t *mddev)
 	memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t));
 	mddev->private = conf;

-	/*
-	 * Find the smallest device.
- */ - - conf->smallest = NULL; cnt = 0; mddev->array_size = 0; @@ -159,8 +155,6 @@ static int linear_run (mddev_t *mddev) disk->size = rdev->size; mddev->array_size += rdev->size; - if (!conf->smallest || (disk->size < conf->smallest->size)) - conf->smallest = disk; cnt++; } if (cnt != mddev->raid_disks) { @@ -168,6 +162,36 @@ static int linear_run (mddev_t *mddev) goto out; } + min_spacing = mddev->array_size; + sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); + + /* min_spacing is the minimum spacing that will fit the hash + * table in one PAGE. This may be much smaller than needed. + * We find the smallest non-terminal set of consecutive devices + * that is larger than min_spacing as use the size of that as + * the actual spacing + */ + conf->hash_spacing = mddev->array_size; + for (i=0; i < cnt-1 ; i++) { + sector_t sz = 0; + int j; + for (j=i; idisks[j].size; + if (sz >= min_spacing && sz < conf->hash_spacing) + conf->hash_spacing = sz; + } + + /* hash_spacing may be too large for sector_div to work with, + * so we might need to pre-shift + */ + conf->preshift = 0; + if (sizeof(sector_t) > sizeof(u32)) { + sector_t space = conf->hash_spacing; + while (space > (sector_t)(~(u32)0)) { + space >>= 1; + conf->preshift++; + } + } /* * This code was restructured to work around a gcc-2.95.3 internal * compiler error. Alter it with care. @@ -177,39 +201,52 @@ static int linear_run (mddev_t *mddev) unsigned round; unsigned long base; - sz = mddev->array_size; - base = conf->smallest->size; + sz = mddev->array_size >> conf->preshift; + sz += 1; /* force round-up */ + base = conf->hash_spacing >> conf->preshift; round = sector_div(sz, base); - nb_zone = conf->nr_zones = sz + (round ? 1 : 0); + nb_zone = sz + (round ? 1 : 0); } - - conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone, + BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *)); + + conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone, GFP_KERNEL); if (!conf->hash_table) goto out; /* * Here we generate the linear hash table + * First calculate the device offsets. */ + conf->disks[0].offset = 0; + for (i=1; iraid_disks; i++) + conf->disks[i].offset = + conf->disks[i-1].offset + + conf->disks[i-1].size; + table = conf->hash_table; - start = 0; curr_offset = 0; - for (i = 0; i < cnt; i++) { - dev_info_t *disk = conf->disks + i; + i = 0; + for (curr_offset = 0; + curr_offset < mddev->array_size; + curr_offset += conf->hash_spacing) { - disk->offset = curr_offset; - curr_offset += disk->size; + while (i < mddev->raid_disks-1 && + curr_offset >= conf->disks[i+1].offset) + i++; - /* 'curr_offset' is the end of this disk - * 'start' is the start of table + *table ++ = conf->disks + i; + } + + if (conf->preshift) { + conf->hash_spacing >>= conf->preshift; + /* round hash_spacing up so that when we divide by it, + * we err on the side of "too-low", which is safest. 
*/ - while (start < curr_offset) { - *table++ = disk; - start += conf->smallest->size; - } + conf->hash_spacing++; } - if (table-conf->hash_table != nb_zone) - BUG(); + + BUG_ON(table - conf->hash_table > nb_zone); blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; @@ -299,7 +336,7 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev) sector_t s = 0; seq_printf(seq, " "); - for (j = 0; j < conf->nr_zones; j++) + for (j = 0; j < mddev->raid_disks; j++) { char b[BDEVNAME_SIZE]; s += conf->smallest_size; diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h index e04c4fe45b53..7eaf290e10e7 100644 --- a/include/linux/raid/linear.h +++ b/include/linux/raid/linear.h @@ -14,8 +14,8 @@ typedef struct dev_info dev_info_t; struct linear_private_data { dev_info_t **hash_table; - dev_info_t *smallest; - int nr_zones; + sector_t hash_spacing; + int preshift; /* shift before dividing by hash_spacing */ dev_info_t disks[0]; }; -- cgit v1.2.3 From 71c0805cb48462c99fbe0e5fcc6c12d7b9929c09 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:51 -0700 Subject: [PATCH] md: allow md to load a superblock with feature-bit '1' set As this is used to flag an internal bitmap. Also, introduce symbolic names for feature bits. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 6 +++--- include/linux/raid/md_p.h | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/md.c b/drivers/md/md.c index 866c704e008a..1be3f2de396b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -875,7 +875,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) sb->major_version != cpu_to_le32(1) || le32_to_cpu(sb->max_dev) > (4096-256)/2 || le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || - sb->feature_map != 0) + (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) return -EINVAL; if (calc_sb_1_csum(sb) != sb->sb_csum) { @@ -954,7 +954,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->max_disks = (4096-256)/2; - if ((le32_to_cpu(sb->feature_map) & 1) && + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && mddev->bitmap_file == NULL ) { if (mddev->level != 1) { printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); @@ -1029,7 +1029,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); - sb->feature_map = cpu_to_le32(1); + sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } max_dev = 0; diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 4f047f84fb1f..c100fa5d4bfa 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -238,5 +238,10 @@ struct mdp_superblock_1 { __u16 dev_roles[0]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ }; +/* feature_map bits */ +#define MD_FEATURE_BITMAP_OFFSET 1 + +#define MD_FEATURE_ALL 1 + #endif -- cgit v1.2.3 From 773f7834425e83144c95fbbc553ced3c2b74b828 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:53 -0700 Subject: [PATCH] md: remove old cruft from md_k.h header file These inlines haven't been used for ages, they should go. 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 64 ----------------------------------------------- 1 file changed, 64 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 2514e5fcda7f..8042f55dd323 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -85,70 +85,6 @@ typedef struct mdk_rdev_s mdk_rdev_t; #define MAX_CHUNK_SIZE (4096*1024) -/* - * default readahead - */ - -static inline int disk_faulty(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_FAULTY); -} - -static inline int disk_active(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_ACTIVE); -} - -static inline int disk_sync(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_SYNC); -} - -static inline int disk_spare(mdp_disk_t * d) -{ - return !disk_sync(d) && !disk_active(d) && !disk_faulty(d); -} - -static inline int disk_removed(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_REMOVED); -} - -static inline void mark_disk_faulty(mdp_disk_t * d) -{ - d->state |= (1 << MD_DISK_FAULTY); -} - -static inline void mark_disk_active(mdp_disk_t * d) -{ - d->state |= (1 << MD_DISK_ACTIVE); -} - -static inline void mark_disk_sync(mdp_disk_t * d) -{ - d->state |= (1 << MD_DISK_SYNC); -} - -static inline void mark_disk_spare(mdp_disk_t * d) -{ - d->state = 0; -} - -static inline void mark_disk_removed(mdp_disk_t * d) -{ - d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED); -} - -static inline void mark_disk_inactive(mdp_disk_t * d) -{ - d->state &= ~(1 << MD_DISK_ACTIVE); -} - -static inline void mark_disk_nonsync(mdp_disk_t * d) -{ - d->state &= ~(1 << MD_DISK_SYNC); -} - /* * MD's 'extended' device */ -- cgit v1.2.3 From 0002b2718dd04da67c21f8a7830de8d95a9b0345 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:53 -0700 Subject: [PATCH] md: limit size of sb read/written to appropriate amount version-1 superblocks are not (normally) 4K long, and can be of variable size. Writing the full 4K can cause corruption (but only in non-default configurations). With this patch the super-block-flavour can choose a size to read, and set a size to write based on what it finds. 
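The rounding itself is a one-line bit trick: with bmask equal to the device
block size minus one, (size | bmask) + 1 is the next multiple of the block
size.  A worked userspace example with made-up values:

    #include <stdio.h>

    int main(void)
    {
        int max_dev = 128;                    /* from the superblock */
        int sb_size = max_dev * 2 + 256;      /* 2 bytes per role + fixed part = 512 */
        int block_size = 4096;                /* device logical block size */
        int bmask = block_size - 1;

        if (sb_size & bmask)                  /* not already a multiple */
            sb_size = (sb_size | bmask) + 1;  /* round up: 512 -> 4096 */

        printf("write %d bytes\n", sb_size);
        return 0;
    }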
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 20 +++++++++++++++----- include/linux/raid/md_k.h | 1 + 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1be3f2de396b..be7873c61b3c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -393,7 +393,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, return ret; } -static int read_disk_sb(mdk_rdev_t * rdev) +static int read_disk_sb(mdk_rdev_t * rdev, int size) { char b[BDEVNAME_SIZE]; if (!rdev->sb_page) { @@ -404,7 +404,7 @@ static int read_disk_sb(mdk_rdev_t * rdev) return 0; - if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) + if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) goto fail; rdev->sb_loaded = 1; return 0; @@ -531,7 +531,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version sb_offset = calc_dev_sboffset(rdev->bdev); rdev->sb_offset = sb_offset; - ret = read_disk_sb(rdev); + ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; ret = -EINVAL; @@ -564,6 +564,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; + rdev->sb_size = MD_SB_BYTES; if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; @@ -837,6 +838,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) int ret; sector_t sb_offset; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + int bmask; /* * Calculate the position of the superblock. @@ -865,7 +867,10 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) } rdev->sb_offset = sb_offset; - ret = read_disk_sb(rdev); + /* superblock is rarely larger than 1K, but it can be larger, + * and it is safe to read 4k, so we do that + */ + ret = read_disk_sb(rdev, 4096); if (ret) return ret; @@ -891,6 +896,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); + rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; + bmask = block_size(rdev->bdev)-1; + if (rdev->sb_size & bmask) + rdev-> sb_size = (rdev->sb_size | bmask)+1; + if (refdev == 0) return 1; else { @@ -1375,7 +1385,7 @@ repeat: dprintk("%s ", bdevname(rdev->bdev,b)); if (!rdev->faulty) { md_super_write(mddev,rdev, - rdev->sb_offset<<1, MD_SB_BYTES, + rdev->sb_offset<<1, rdev->sb_size, rdev->sb_page); dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", bdevname(rdev->bdev,b), diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 8042f55dd323..ebce949b1443 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -102,6 +102,7 @@ struct mdk_rdev_s int sb_loaded; sector_t data_offset; /* start of data in array */ sector_t sb_offset; + int sb_size; /* bytes in the superblock */ int preferred_minor; /* autorun support */ /* A device can be in one of three states based on two flags: -- cgit v1.2.3 From 72626685dc66d455742a7f215a0535c551628b9e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:54 -0700 Subject: [PATCH] md: add write-intent-bitmap support to raid5 Most awkward part of this is delaying write requests until bitmap updates have been flushed. To achieve this, we have a sequence number (seq_flush) which is incremented each time the raid5 is unplugged. 
If the raid thread notices that this has changed, it flushes bitmap changes,
and assigns the value of seq_flush to seq_write.

When a write request arrives, it is given the number from seq_write, and that
write request may not complete until seq_flush is larger than the saved seq
number.

We have a new queue for storing stripes which are waiting for a bitmap flush
and an extra flag for stripes to record if the write was 'degraded' and so
should not clear the bit in the bitmap.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/md.c            |   3 +-
 drivers/md/raid5.c         | 133 +++++++++++++++++++++++++++++++++++++++++----
 include/linux/raid/raid5.h |  14 ++++-
 3 files changed, 137 insertions(+), 13 deletions(-)

(limited to 'include/linux/raid')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index be7873c61b3c..dbf540a7fccc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -645,7 +645,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)

 		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
 		    mddev->bitmap_file == NULL) {
-			if (mddev->level != 1) {
+			if (mddev->level != 1 && mddev->level != 5) {
 				/* FIXME use a better test */
 				printk(KERN_WARNING "md: bitmaps only support for raid1\n");
 				return -EINVAL;
@@ -3517,7 +3517,6 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
 */
 void md_write_start(mddev_t *mddev, struct bio *bi)
 {
-	DEFINE_WAIT(w);
 	if (bio_data_dir(bi) != WRITE)
 		return;

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ed859e08d600..4683ca24c046 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -24,6 +24,8 @@
 #include
 #include
+#include
+
 /*
 * Stripe cache
 */
@@ -79,8 +81,13 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 		if (test_bit(STRIPE_HANDLE, &sh->state)) {
 			if (test_bit(STRIPE_DELAYED, &sh->state))
 				list_add_tail(&sh->lru, &conf->delayed_list);
-			else
+			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+				 conf->seq_write == sh->bm_seq)
+				list_add_tail(&sh->lru, &conf->bitmap_list);
+			else {
+				clear_bit(STRIPE_BIT_DELAY, &sh->state);
 				list_add_tail(&sh->lru, &conf->handle_list);
+			}
 			md_wakeup_thread(conf->mddev->thread);
 		} else {
 			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -244,6 +251,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 	spin_lock_irq(&conf->device_lock);

 	do {
+		wait_event_lock_irq(conf->wait_for_stripe,
+				    conf->quiesce == 0,
+				    conf->device_lock, /* nothing */);
 		sh = __find_stripe(conf, sector);
 		if (!sh) {
 			if (!conf->inactive_blocked)
@@ -803,6 +813,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 {
 	struct bio **bip;
 	raid5_conf_t *conf = sh->raid_conf;
+	int firstwrite=0;

 	PRINTK("adding bh b#%llu to stripe s#%llu\n",
 		(unsigned long long)bi->bi_sector,
 		(unsigned long long)sh->sector);

 	spin_lock(&sh->lock);
 	spin_lock_irq(&conf->device_lock);
-	if (forwrite)
+	if (forwrite) {
 		bip = &sh->dev[dd_idx].towrite;
+		if (*bip == NULL && sh->dev[dd_idx].written == NULL)
+			firstwrite = 1;
+	} else
 		bip = &sh->dev[dd_idx].toread;
 	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
 		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
@@ -836,6 +849,13 @@
 		(unsigned long long)bi->bi_sector,
 		(unsigned long long)sh->sector, dd_idx);

+	if (conf->mddev->bitmap && firstwrite) {
+		sh->bm_seq = conf->seq_write;
+		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0); + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + if (forwrite) { /* check if page is covered */ sector_t sector = sh->dev[dd_idx].sector; @@ -958,12 +978,13 @@ static void handle_stripe(struct stripe_head *sh) * need to be failed */ if (failed > 1 && to_read+to_write+written) { - spin_lock_irq(&conf->device_lock); for (i=disks; i--; ) { + int bitmap_end = 0; + spin_lock_irq(&conf->device_lock); /* fail all writes first */ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; - if (bi) to_write--; + if (bi) { to_write--; bitmap_end = 1; } if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -981,6 +1002,7 @@ static void handle_stripe(struct stripe_head *sh) /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; + if (bi) bitmap_end = 1; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1009,8 +1031,11 @@ static void handle_stripe(struct stripe_head *sh) bi = nextbi; } } + spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); } - spin_unlock_irq(&conf->device_lock); } if (failed > 1 && syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); @@ -1038,6 +1063,7 @@ static void handle_stripe(struct stripe_head *sh) test_bit(R5_UPTODATE, &dev->flags) ) { /* We can return any write requests */ struct bio *wbi, *wbi2; + int bitmap_end = 0; PRINTK("Return write for disc %d\n", i); spin_lock_irq(&conf->device_lock); wbi = dev->written; @@ -1051,7 +1077,13 @@ static void handle_stripe(struct stripe_head *sh) } wbi = wbi2; } + if (dev->towrite == NULL) + bitmap_end = 1; spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, + !test_bit(STRIPE_DEGRADED, &sh->state), 0); } } } @@ -1175,7 +1207,8 @@ static void handle_stripe(struct stripe_head *sh) } } /* now if nothing is locked, and if we have enough data, we can start a write request */ - if (locked == 0 && (rcw == 0 ||rmw == 0)) { + if (locked == 0 && (rcw == 0 ||rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state)) { PRINTK("Computing parity...\n"); compute_parity(sh, rcw==0 ? 
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); /* now every locked buffer is ready to be written */ @@ -1231,6 +1264,7 @@ static void handle_stripe(struct stripe_head *sh) dev = &sh->dev[failed_num]; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); + clear_bit(STRIPE_DEGRADED, &sh->state); locked++; set_bit(STRIPE_INSYNC, &sh->state); set_bit(R5_Syncio, &dev->flags); @@ -1298,6 +1332,8 @@ static void handle_stripe(struct stripe_head *sh) bi->bi_next = NULL; generic_make_request(bi); } else { + if (rw == 1) + set_bit(STRIPE_DEGRADED, &sh->state); PRINTK("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); @@ -1322,6 +1358,20 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf) } } +static inline void activate_bit_delay(raid5_conf_t *conf) +{ + /* device_lock is held */ + struct list_head head; + list_add(&head, &conf->bitmap_list); + list_del_init(&conf->bitmap_list); + while (!list_empty(&head)) { + struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); + list_del_init(&sh->lru); + atomic_inc(&sh->count); + __release_stripe(conf, sh); + } +} + static void unplug_slaves(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -1354,8 +1404,10 @@ static void raid5_unplug_device(request_queue_t *q) spin_lock_irqsave(&conf->device_lock, flags); - if (blk_remove_plug(q)) + if (blk_remove_plug(q)) { + conf->seq_flush++; raid5_activate_delayed(conf); + } md_wakeup_thread(mddev->thread); spin_unlock_irqrestore(&conf->device_lock, flags); @@ -1493,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sector_t first_sector; int raid_disks = conf->raid_disks; int data_disks = raid_disks-1; + sector_t max_sector = mddev->size << 1; + int sync_blocks; - if (sector_nr >= mddev->size <<1) { + if (sector_nr >= max_sector) { /* just being told to finish up .. 
nothing much to do */ unplug_slaves(mddev); + + if (mddev->curr_resync < max_sector) /* aborted */ + bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); + else /* compelted sync */ + conf->fullsync = 0; + bitmap_close_sync(mddev->bitmap); + return 0; } /* if there is 1 or more failed drives and we are trying @@ -1508,6 +1570,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i *skipped = 1; return rv; } + if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { + /* we can skip this block, and probably more */ + sync_blocks /= STRIPE_SECTORS; + *skipped = 1; + return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ + } x = sector_nr; chunk_offset = sector_div(x, sectors_per_chunk); @@ -1525,6 +1594,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(1); } + bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); spin_lock(&sh->lock); set_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state); @@ -1558,6 +1628,13 @@ static void raid5d (mddev_t *mddev) while (1) { struct list_head *first; + if (conf->seq_flush - conf->seq_write > 0) { + int seq = conf->seq_flush; + bitmap_unplug(mddev->bitmap); + conf->seq_write = seq; + activate_bit_delay(conf); + } + if (list_empty(&conf->handle_list) && atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && !blk_queue_plugged(mddev->queue) && @@ -1591,7 +1668,7 @@ static void raid5d (mddev_t *mddev) PRINTK("--- raid5d inactive\n"); } -static int run (mddev_t *mddev) +static int run(mddev_t *mddev) { raid5_conf_t *conf; int raid_disk, memory; @@ -1621,6 +1698,7 @@ static int run (mddev_t *mddev) init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->delayed_list); + INIT_LIST_HEAD(&conf->bitmap_list); INIT_LIST_HEAD(&conf->inactive_list); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -1732,6 +1810,9 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + /* Ok, everything is just fine now */ + if (mddev->bitmap) + mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; @@ -1912,6 +1993,8 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) rdev->in_sync = 0; rdev->raid_disk = disk; found = 1; + if (rdev->saved_raid_disk != disk) + conf->fullsync = 1; p->rdev = rdev; break; } @@ -1941,6 +2024,35 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return 0; } +static void raid5_quiesce(mddev_t *mddev, int state) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + + switch(state) { + case 1: /* stop all writes */ + spin_lock_irq(&conf->device_lock); + conf->quiesce = 1; + wait_event_lock_irq(conf->wait_for_stripe, + atomic_read(&conf->active_stripes) == 0, + conf->device_lock, /* nothing */); + spin_unlock_irq(&conf->device_lock); + break; + + case 0: /* re-enable writes */ + spin_lock_irq(&conf->device_lock); + conf->quiesce = 0; + wake_up(&conf->wait_for_stripe); + spin_unlock_irq(&conf->device_lock); + break; + } + if (mddev->thread) { + if (mddev->bitmap) + mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + else + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + md_wakeup_thread(mddev->thread); + } +} static mdk_personality_t raid5_personality= { .name = "raid5", @@ -1955,6 
+2067,7 @@ static mdk_personality_t raid5_personality= .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, + .quiesce = raid5_quiesce, }; static int __init raid5_init (void) diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index d63ddcb4afad..176fc653c284 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -134,6 +134,7 @@ struct stripe_head { unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ spinlock_t lock; + int bm_seq; /* sequence number for bitmap flushes */ struct r5dev { struct bio req; struct bio_vec vec; @@ -165,12 +166,13 @@ struct stripe_head { /* * Stripe state */ -#define STRIPE_ERROR 1 #define STRIPE_HANDLE 2 #define STRIPE_SYNCING 3 #define STRIPE_INSYNC 4 #define STRIPE_PREREAD_ACTIVE 5 #define STRIPE_DELAYED 6 +#define STRIPE_DEGRADED 7 +#define STRIPE_BIT_DELAY 8 /* * Plugging: @@ -210,10 +212,20 @@ struct raid5_private_data { struct list_head handle_list; /* stripes needing handling */ struct list_head delayed_list; /* stripes that have plugged requests */ + struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ char cache_name[20]; kmem_cache_t *slab_cache; /* for allocating stripes */ + + int seq_flush, seq_write; + int quiesce; + + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + /* * Free stripes pool */ -- cgit v1.2.3
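The seq_flush/seq_write ordering in the patch above can be modelled in a few
lines of userspace C (invented names; in the kernel the pieces live in
raid5_unplug_device, raid5d and __release_stripe): a stripe stamped with the
current seq_write stays delayed until the daemon has flushed the bitmap and
let seq_write catch up to seq_flush.

    #include <stdio.h>

    static int seq_flush;   /* bumped on every unplug */
    static int seq_write;   /* catches up after a bitmap flush */

    struct toy_stripe { int bm_seq; };  /* stamped at write time */

    static void unplug(void)
    {
        seq_flush++;
    }

    static void raid_thread_step(void)
    {
        if (seq_flush != seq_write) {
            printf("flush bitmap to disk\n");
            seq_write = seq_flush;  /* bits for older stripes are safe */
        }
    }

    static int stripe_blocked(const struct toy_stripe *sh)
    {
        /* still waiting for the flush that covers its bitmap update */
        return sh->bm_seq == seq_write;
    }

    int main(void)
    {
        struct toy_stripe sh = { .bm_seq = seq_write };
        printf("blocked=%d\n", stripe_blocked(&sh));  /* 1: must wait */
        unplug();                                     /* queue unplugged */
        raid_thread_step();                           /* bitmap flushed */
        printf("blocked=%d\n", stripe_blocked(&sh));  /* 0: may complete */
        return 0;
    }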