From ca8895d9bb41e743271c42a4438a296de891b73b Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 26 Nov 2014 12:22:03 -0600 Subject: Return MD_SB_CLUSTERED if mddev is clustered Signed-off-by: Goldwyn Rodrigues --- include/uapi/linux/raid/md_p.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 49f4210d4394..643489d33e68 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -101,6 +101,7 @@ typedef struct mdp_device_descriptor_s { #define MD_SB_CLEAN 0 #define MD_SB_ERRORS 1 +#define MD_SB_CLUSTERED 5 /* MD is clustered */ #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ /* -- cgit v1.3 From 1aee41f637694d4bbf91c24195f2b63e3f6badd2 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 29 Oct 2014 18:51:31 -0500 Subject: Add new disk to clustered array Algorithm: 1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD) 2. Node 1 sends NEWDISK with uuid and slot number 3. Other nodes issue kobject_uevent_env with uuid and slot number (Steps 4,5 could be a udev rule) 4. In userspace, the node searches for the disk, perhaps using blkid -t SUB_UUID="" 5. Other nodes issue either of the following depending on whether the disk was found: ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and disc.number set to slot number) ioctl(CLUSTERED_DISK_NACK) 6. Other nodes drop lock on no-new-dev (CR) if device is found 7. Node 1 attempts EX lock on no-new-dev 8. If node 1 gets the lock, it sends METADATA_UPDATED after unmarking the disk as SpareLocal 9. If it does not get the no-new-dev lock, it fails the operation and sends METADATA_UPDATED 10. Other nodes understand if the device is added or not by reading the superblock again after receiving the METADATA_UPDATED message. 
Signed-off-by: Lidong Zhong Signed-off-by: Goldwyn Rodrigues --- drivers/md/md-cluster.c | 104 ++++++++++++++++++++++++++++++++++++++++- drivers/md/md-cluster.h | 4 ++ drivers/md/md.c | 52 +++++++++++++++++++-- drivers/md/md.h | 5 ++ drivers/md/raid1.c | 1 + include/uapi/linux/raid/md_p.h | 6 +++ include/uapi/linux/raid/md_u.h | 1 + 7 files changed, 169 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index d85a6ca4443e..03e521a9ca7d 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -12,11 +12,13 @@ #include #include #include +#include #include "md.h" #include "bitmap.h" #include "md-cluster.h" #define LVB_SIZE 64 +#define NEW_DEV_TIMEOUT 5000 struct dlm_lock_resource { dlm_lockspace_t *ls; @@ -56,19 +58,25 @@ struct md_cluster_info { struct dlm_lock_resource *ack_lockres; struct dlm_lock_resource *message_lockres; struct dlm_lock_resource *token_lockres; + struct dlm_lock_resource *no_new_dev_lockres; struct md_thread *recv_thread; + struct completion newdisk_completion; }; enum msg_type { METADATA_UPDATED = 0, RESYNCING, + NEWDISK, }; struct cluster_msg { int type; int slot; + /* TODO: Unionize this for smaller footprint */ sector_t low; sector_t high; + char uuid[16]; + int raid_slot; }; static void sync_ast(void *arg) @@ -358,13 +366,41 @@ static void process_suspend_info(struct md_cluster_info *cinfo, spin_unlock_irq(&cinfo->suspend_lock); } +static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) +{ + char disk_uuid[64]; + struct md_cluster_info *cinfo = mddev->cluster_info; + char event_name[] = "EVENT=ADD_DEVICE"; + char raid_slot[16]; + char *envp[] = {event_name, disk_uuid, raid_slot, NULL}; + int len; + + len = snprintf(disk_uuid, 64, "DEVICE_UUID="); + pretty_uuid(disk_uuid + len, cmsg->uuid); + snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot); + pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, 
raid_slot); + init_completion(&cinfo->newdisk_completion); + kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp); + wait_for_completion_timeout(&cinfo->newdisk_completion, + NEW_DEV_TIMEOUT); +} + + +static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + md_reload_sb(mddev); + dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); +} + static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) { switch (msg->type) { case METADATA_UPDATED: pr_info("%s: %d Received message: METADATA_UPDATE from %d\n", __func__, __LINE__, msg->slot); - md_reload_sb(mddev); + process_metadata_update(mddev, msg); break; case RESYNCING: pr_info("%s: %d Received message: RESYNCING from %d\n", @@ -372,6 +408,10 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) process_suspend_info(mddev->cluster_info, msg->slot, msg->low, msg->high); break; + case NEWDISK: + pr_info("%s: %d Received message: NEWDISK from %d\n", + __func__, __LINE__, msg->slot); + process_add_new_disk(mddev, msg); }; } @@ -593,10 +633,18 @@ static int join(struct mddev *mddev, int nodes) cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); if (!cinfo->ack_lockres) goto err; + cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0); + if (!cinfo->no_new_dev_lockres) + goto err; + /* get sync CR lock on ACK. */ if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR)) pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n", ret); + /* get sync CR lock on no-new-dev. 
*/ + if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR)) + pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret); + pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number); snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1); @@ -621,6 +669,7 @@ err: lockres_free(cinfo->message_lockres); lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); + lockres_free(cinfo->no_new_dev_lockres); lockres_free(cinfo->bitmap_lockres); lockres_free(cinfo->sb_lock); if (cinfo->lockspace) @@ -642,6 +691,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->message_lockres); lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); + lockres_free(cinfo->no_new_dev_lockres); lockres_free(cinfo->sb_lock); lockres_free(cinfo->bitmap_lockres); dlm_release_lockspace(cinfo->lockspace, 2); @@ -742,6 +792,55 @@ out: return ret; } +static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg; + int ret = 0; + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + char *uuid = sb->device_uuid; + + memset(&cmsg, 0, sizeof(cmsg)); + cmsg.type = cpu_to_le32(NEWDISK); + memcpy(cmsg.uuid, uuid, 16); + cmsg.raid_slot = rdev->desc_nr; + lock_comm(cinfo); + ret = __sendmsg(cinfo, &cmsg); + if (ret) + return ret; + cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE; + ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX); + cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE; + /* Some node does not "see" the device */ + if (ret == -EAGAIN) + ret = -ENOENT; + else + dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + return ret; +} + +static int add_new_disk_finish(struct mddev *mddev) +{ + struct cluster_msg cmsg; + struct md_cluster_info *cinfo = mddev->cluster_info; + int ret; + /* Write sb and inform others */ + md_update_sb(mddev, 1); + cmsg.type = METADATA_UPDATED; + ret = __sendmsg(cinfo, 
&cmsg); + unlock_comm(cinfo); + return ret; +} + +static void new_disk_ack(struct mddev *mddev, bool ack) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + if (ack) + dlm_unlock_sync(cinfo->no_new_dev_lockres); + complete(&cinfo->newdisk_completion); +} + static struct md_cluster_operations cluster_ops = { .join = join, .leave = leave, @@ -753,6 +852,9 @@ static struct md_cluster_operations cluster_ops = { .metadata_update_finish = metadata_update_finish, .metadata_update_cancel = metadata_update_cancel, .area_resyncing = area_resyncing, + .add_new_disk_start = add_new_disk_start, + .add_new_disk_finish = add_new_disk_finish, + .new_disk_ack = new_disk_ack, }; static int __init cluster_init(void) diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 03785402afaa..60d7e58964f5 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -6,6 +6,7 @@ #include "md.h" struct mddev; +struct md_rdev; struct md_cluster_operations { int (*join)(struct mddev *mddev, int nodes); @@ -18,6 +19,9 @@ struct md_cluster_operations { int (*metadata_update_finish)(struct mddev *mddev); int (*metadata_update_cancel)(struct mddev *mddev); int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi); + int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); + int (*add_new_disk_finish)(struct mddev *mddev); + void (*new_disk_ack)(struct mddev *mddev, bool ack); }; #endif /* _MD_CLUSTER_H */ diff --git a/drivers/md/md.c b/drivers/md/md.c index fe0484648de4..5703c2e89f3a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2210,7 +2210,7 @@ static void sync_sbs(struct mddev *mddev, int nospares) } } -static void md_update_sb(struct mddev *mddev, int force_change) +void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; int sync_req; @@ -2371,6 +2371,7 @@ repeat: wake_up(&rdev->blocked_wait); } } +EXPORT_SYMBOL(md_update_sb); /* words written to sysfs files may, or may not, be \n terminated. 
* We want to accept with case. For this we use cmd_match. @@ -3151,7 +3152,7 @@ static void analyze_sbs(struct mddev *mddev) kick_rdev_from_array(rdev); continue; } - if (rdev != freshest) + if (rdev != freshest) { if (super_types[mddev->major_version]. validate_super(mddev, rdev)) { printk(KERN_WARNING "md: kicking non-fresh %s" @@ -3160,6 +3161,15 @@ static void analyze_sbs(struct mddev *mddev) kick_rdev_from_array(rdev); continue; } + /* No device should have a Candidate flag + * when reading devices + */ + if (test_bit(Candidate, &rdev->flags)) { + pr_info("md: kicking Cluster Candidate %s from array!\n", + bdevname(rdev->bdev, b)); + kick_rdev_from_array(rdev); + } + } if (mddev->level == LEVEL_MULTIPATH) { rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; @@ -5655,7 +5665,6 @@ static int get_array_info(struct mddev *mddev, void __user *arg) info.state |= (1<major,info->minor); + if (mddev_is_clustered(mddev) && + !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { + pr_err("%s: Cannot add to clustered mddev. 
Try --cluster-add\n", + mdname(mddev)); + return -EINVAL; + } + if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) return -EOVERFLOW; @@ -5830,6 +5846,25 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) else clear_bit(WriteMostly, &rdev->flags); + /* + * check whether the device shows up in other nodes + */ + if (mddev_is_clustered(mddev)) { + if (info->state & (1 << MD_DISK_CANDIDATE)) { + /* Through --cluster-confirm */ + set_bit(Candidate, &rdev->flags); + md_cluster_ops->new_disk_ack(mddev, true); + } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { + /* --add initiated by this node */ + err = md_cluster_ops->add_new_disk_start(mddev, rdev); + if (err) { + md_cluster_ops->add_new_disk_finish(mddev); + export_rdev(rdev); + return err; + } + } + } + rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (!err && !mddev->pers->hot_remove_disk) { @@ -5855,6 +5890,9 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) if (!err) md_new_event(mddev); md_wakeup_thread(mddev->thread); + if (mddev_is_clustered(mddev) && + (info->state & (1 << MD_DISK_CLUSTER_ADD))) + md_cluster_ops->add_new_disk_finish(mddev); return err; } @@ -6456,6 +6494,7 @@ static inline bool md_ioctl_valid(unsigned int cmd) case SET_DISK_FAULTY: case STOP_ARRAY: case STOP_ARRAY_RO: + case CLUSTERED_DISK_NACK: return true; default: return false; @@ -6728,6 +6767,13 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto unlock; } + case CLUSTERED_DISK_NACK: + if (mddev_is_clustered(mddev)) + md_cluster_ops->new_disk_ack(mddev, false); + else + err = -EINVAL; + goto unlock; + case HOT_ADD_DISK: err = hot_add_disk(mddev, new_decode_dev(arg)); goto unlock; diff --git a/drivers/md/md.h b/drivers/md/md.h index bfebcfdf54e6..6dc0ce09f50c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -171,6 +171,10 @@ enum flag_bits { * a want_replacement device with same * raid_disk number. 
*/ + Candidate, /* For clustered environments only: + * This device is seen locally but not + * by the whole cluster + */ }; #define BB_LEN_MASK (0x00000000000001FFULL) @@ -666,6 +670,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); extern void md_reload_sb(struct mddev *mddev); +extern void md_update_sb(struct mddev *mddev, int force); static inline int mddev_check_plugged(struct mddev *mddev) { return !!blk_check_plugged(md_unplug, mddev, diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f70d74189d16..53ed5d48308f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1571,6 +1571,7 @@ static int raid1_spare_active(struct mddev *mddev) struct md_rdev *rdev = conf->mirrors[i].rdev; struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; if (repl + && !test_bit(Candidate, &repl->flags) && repl->recovery_offset == MaxSector && !test_bit(Faulty, &repl->flags) && !test_and_set_bit(In_sync, &repl->flags)) { diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 643489d33e68..2ae6131e69a5 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -78,6 +78,12 @@ #define MD_DISK_ACTIVE 1 /* disk is running or spare disk */ #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ #define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ +#define MD_DISK_CLUSTER_ADD 4 /* Initiate a disk add across the cluster + * For clustered enviroments only. + */ +#define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed + * For clustered enviroments only. + */ #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config. 
* read requests will only be sent here in diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h index 74e7c60c4716..1cb8aa6850b5 100644 --- a/include/uapi/linux/raid/md_u.h +++ b/include/uapi/linux/raid/md_u.h @@ -62,6 +62,7 @@ #define STOP_ARRAY _IO (MD_MAJOR, 0x32) #define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) #define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) +#define CLUSTERED_DISK_NACK _IO (MD_MAJOR, 0x35) /* 63 partitions with the alternate major number (mdp) */ #define MdpMinorShift 6 -- cgit v1.3 From fe5cbc6e06c7d8b3a86f6f5491d74766bb5c2827 Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Mon, 15 Dec 2014 12:57:04 +1100 Subject: md/raid6 algorithms: delta syndrome functions v3: s-o-b comment, explanation of performance and decision for the start/stop implementation Implementing rmw functionality for RAID6 requires optimized syndrome calculation. Up to now we can only generate a complete syndrome. The target P/Q pages are always overwritten. With this patch we provide a framework for inplace P/Q modification. In the first place simply fill those functions with NULL values. xor_syndrome() has two additional parameters: start & stop. These will indicate the first and last page that are changing during a rmw run. That makes it possible to avoid several unnecessary loops and speed up calculation. The caller needs to implement the following logic to make the functions work. 1) xor_syndrome(disks, start, stop, ...): "Remove" all data of source blocks inside P/Q between (and including) start and stop. 2) modify any block with start <= block <= stop 3) xor_syndrome(disks, start, stop, ...): "Reinsert" all data of source blocks into P/Q between (and including) start and stop. Pages between start and stop that won't be changed should be filled with a pointer to the kernel zero page. The reasons for not taking NULL pages are: 1) Algorithms cross the whole source data line by line. Thus avoid additional branches. 
2) Having a NULL page avoids calculating the XOR P parity but still needs calculation steps for the Q parity. Depending on the algorithm unrolling that might be only a difference of 2 instructions per loop. The benchmark numbers of the gen_syndrome() functions are displayed in the kernel log. Do the same for the xor_syndrome() functions. This will help to analyze performance problems and give a rough estimate of how well the algorithm works. The choice of the fastest algorithm will still depend on the gen_syndrome() performance. With the start/stop page implementation the speed can vary a lot in real life. E.g. a change of page 0 & page 15 on a stripe will be harder to compute than the case where page 0 & page 1 are XOR candidates. So as not to be too enthusiastic about the expected speeds we will run a worst-case test that simulates a change on the upper half of the stripe. So we do: 1) calculation of P/Q for the upper pages 2) continuation of Q for the lower (empty) pages Signed-off-by: Markus Stockhausen Signed-off-by: NeilBrown --- include/linux/raid/pq.h | 1 + lib/raid6/algos.c | 41 ++++++++++++++++++++++++++++++++++------- lib/raid6/altivec.uc | 1 + lib/raid6/avx2.c | 3 +++ lib/raid6/int.uc | 3 ++- lib/raid6/mmx.c | 2 ++ lib/raid6/neon.c | 1 + lib/raid6/sse1.c | 2 ++ lib/raid6/sse2.c | 3 +++ lib/raid6/tilegx.uc | 1 + 10 files changed, 50 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 73069cb6c54a..a7a06d1dcf9c 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -72,6 +72,7 @@ extern const char raid6_empty_zero_page[PAGE_SIZE]; /* Routine choices */ struct raid6_calls { void (*gen_syndrome)(int, size_t, void **); + void (*xor_syndrome)(int, int, int, size_t, void **); int (*valid)(void); /* Returns 1 if this routine set is usable */ const char *name; /* Name of this routine set */ int prefer; /* Has special performance attribute */ diff --git a/lib/raid6/algos.c 
b/lib/raid6/algos.c index dbef2314901e..975c6e0434bd 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -131,11 +131,12 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) static inline const struct raid6_calls *raid6_choose_gen( void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) { - unsigned long perf, bestperf, j0, j1; + unsigned long perf, bestgenperf, bestxorperf, j0, j1; + int start = (disks>>1)-1, stop = disks-3; /* work on the second half of the disks */ const struct raid6_calls *const *algo; const struct raid6_calls *best; - for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { + for (bestgenperf = 0, bestxorperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { if (!best || (*algo)->prefer >= best->prefer) { if ((*algo)->valid && !(*algo)->valid()) continue; @@ -153,19 +154,45 @@ static inline const struct raid6_calls *raid6_choose_gen( } preempt_enable(); - if (perf > bestperf) { - bestperf = perf; + if (perf > bestgenperf) { + bestgenperf = perf; best = *algo; } - pr_info("raid6: %-8s %5ld MB/s\n", (*algo)->name, + pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name, (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + + if (!(*algo)->xor_syndrome) + continue; + + perf = 0; + + preempt_disable(); + j0 = jiffies; + while ((j1 = jiffies) == j0) + cpu_relax(); + while (time_before(jiffies, + j1 + (1<xor_syndrome(disks, start, stop, + PAGE_SIZE, *dptrs); + perf++; + } + preempt_enable(); + + if (best == *algo) + bestxorperf = perf; + + pr_info("raid6: %-8s xor() %5ld MB/s\n", (*algo)->name, + (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1)); } } if (best) { - pr_info("raid6: using algorithm %s (%ld MB/s)\n", + pr_info("raid6: using algorithm %s gen() %ld MB/s\n", best->name, - (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + (bestgenperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + if (best->xor_syndrome) + pr_info("raid6: .... 
xor() %ld MB/s, rmw enabled\n", + (bestxorperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1)); raid6_call = *best; } else pr_err("raid6: Yikes! No algorithm found!\n"); diff --git a/lib/raid6/altivec.uc b/lib/raid6/altivec.uc index 7cc12b532e95..bec27fce7501 100644 --- a/lib/raid6/altivec.uc +++ b/lib/raid6/altivec.uc @@ -119,6 +119,7 @@ int raid6_have_altivec(void) const struct raid6_calls raid6_altivec$# = { raid6_altivec$#_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_altivec, "altivecx$#", 0 diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c index bc3b1dd436eb..76734004358d 100644 --- a/lib/raid6/avx2.c +++ b/lib/raid6/avx2.c @@ -89,6 +89,7 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_avx2x1 = { raid6_avx21_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_avx2, "avx2x1", 1 /* Has cache hints */ @@ -150,6 +151,7 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_avx2x2 = { raid6_avx22_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_avx2, "avx2x2", 1 /* Has cache hints */ @@ -242,6 +244,7 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_avx2x4 = { raid6_avx24_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_avx2, "avx2x4", 1 /* Has cache hints */ diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc index 5b50f8dfc5d2..5ca60bee1388 100644 --- a/lib/raid6/int.uc +++ b/lib/raid6/int.uc @@ -109,7 +109,8 @@ static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_intx$# = { raid6_int$#_gen_syndrome, - NULL, /* always valid */ + NULL, /* XOR not yet implemented */ + NULL, /* always valid */ "int" NSTRING "x$#", 0 }; diff --git a/lib/raid6/mmx.c b/lib/raid6/mmx.c index 590c71c9e200..b3b0e1fcd3af 100644 --- a/lib/raid6/mmx.c +++ b/lib/raid6/mmx.c @@ -76,6 +76,7 @@ static void 
raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_mmxx1 = { raid6_mmx1_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_mmx, "mmxx1", 0 @@ -134,6 +135,7 @@ static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_mmxx2 = { raid6_mmx2_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_mmx, "mmxx2", 0 diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c index 36ad4705df1a..d9ad6ee284f4 100644 --- a/lib/raid6/neon.c +++ b/lib/raid6/neon.c @@ -42,6 +42,7 @@ } \ struct raid6_calls const raid6_neonx ## _n = { \ raid6_neon ## _n ## _gen_syndrome, \ + NULL, /* XOR not yet implemented */ \ raid6_have_neon, \ "neonx" #_n, \ 0 \ diff --git a/lib/raid6/sse1.c b/lib/raid6/sse1.c index f76297139445..9025b8ca9aa3 100644 --- a/lib/raid6/sse1.c +++ b/lib/raid6/sse1.c @@ -92,6 +92,7 @@ static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_sse1x1 = { raid6_sse11_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_sse1_or_mmxext, "sse1x1", 1 /* Has cache hints */ @@ -154,6 +155,7 @@ static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_sse1x2 = { raid6_sse12_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_sse1_or_mmxext, "sse1x2", 1 /* Has cache hints */ diff --git a/lib/raid6/sse2.c b/lib/raid6/sse2.c index 85b82c85f28e..31acd59a0ef7 100644 --- a/lib/raid6/sse2.c +++ b/lib/raid6/sse2.c @@ -90,6 +90,7 @@ static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_sse2x1 = { raid6_sse21_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_sse2, "sse2x1", 1 /* Has cache hints */ @@ -152,6 +153,7 @@ static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_sse2x2 = { raid6_sse22_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_sse2, 
"sse2x2", 1 /* Has cache hints */ @@ -250,6 +252,7 @@ static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_sse2x4 = { raid6_sse24_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_sse2, "sse2x4", 1 /* Has cache hints */ diff --git a/lib/raid6/tilegx.uc b/lib/raid6/tilegx.uc index e7c29459cbcd..2dd291a11264 100644 --- a/lib/raid6/tilegx.uc +++ b/lib/raid6/tilegx.uc @@ -80,6 +80,7 @@ void raid6_tilegx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_tilegx$# = { raid6_tilegx$#_gen_syndrome, + NULL, /* XOR not yet implemented */ NULL, "tilegx$#", 0 -- cgit v1.3 From 584acdd49cd2472ca0f5a06adbe979db82d0b4af Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Mon, 15 Dec 2014 12:57:05 +1100 Subject: md/raid5: activate raid6 rmw feature Glue it altogehter. The raid6 rmw path should work the same as the already existing raid5 logic. So emulate the prexor handling/flags and split functions as needed. 1) Enable xor_syndrome() in the async layer. 2) Split ops_run_prexor() into RAID4/5 and RAID6 logic. Xor the syndrome at the start of a rmw run as we did it before for the single parity. 3) Take care of rmw run in ops_run_reconstruct6(). Again process only the changed pages to get syndrome back into sync. 4) Enhance set_syndrome_sources() to fill NULL pages if we are in a rmw run. The lower layers will calculate start & end pages from that and call the xor_syndrome() correspondingly. 5) Adapt the several places where we ignored Q handling up to now. Performance numbers for a single E5630 system with a mix of 10 7200k desktop/server disks. 
300 seconds random write with 8 threads onto a 3,2TB (10*400GB) RAID6 64K chunk without spare (group_thread_cnt=4) bsize rmw_level=1 rmw_level=0 rmw_level=1 rmw_level=0 skip_copy=1 skip_copy=1 skip_copy=0 skip_copy=0 4K 115 KB/s 141 KB/s 165 KB/s 140 KB/s 8K 225 KB/s 275 KB/s 324 KB/s 274 KB/s 16K 434 KB/s 536 KB/s 640 KB/s 534 KB/s 32K 751 KB/s 1,051 KB/s 1,234 KB/s 1,045 KB/s 64K 1,339 KB/s 1,958 KB/s 2,282 KB/s 1,962 KB/s 128K 2,673 KB/s 3,862 KB/s 4,113 KB/s 3,898 KB/s 256K 7,685 KB/s 7,539 KB/s 7,557 KB/s 7,638 KB/s 512K 19,556 KB/s 19,558 KB/s 19,652 KB/s 19,688 Kb/s Signed-off-by: Markus Stockhausen Signed-off-by: NeilBrown --- crypto/async_tx/async_pq.c | 19 +++++++-- drivers/md/raid5.c | 104 +++++++++++++++++++++++++++++++++------------ drivers/md/raid5.h | 19 ++++++++- include/linux/async_tx.h | 3 ++ 4 files changed, 115 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index d05327caf69d..5d355e0c2633 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c @@ -124,6 +124,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, { void **srcs; int i; + int start = -1, stop = disks - 3; if (submit->scribble) srcs = submit->scribble; @@ -134,10 +135,21 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, if (blocks[i] == NULL) { BUG_ON(i > disks - 3); /* P or Q can't be zero */ srcs[i] = (void*)raid6_empty_zero_page; - } else + } else { srcs[i] = page_address(blocks[i]) + offset; + if (i < disks - 2) { + stop = i; + if (start == -1) + start = i; + } + } } - raid6_call.gen_syndrome(disks, len, srcs); + if (submit->flags & ASYNC_TX_PQ_XOR_DST) { + BUG_ON(!raid6_call.xor_syndrome); + if (start >= 0) + raid6_call.xor_syndrome(disks, start, stop, len, srcs); + } else + raid6_call.gen_syndrome(disks, len, srcs); async_tx_sync_epilog(submit); } @@ -178,7 +190,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int 
disks, if (device) unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO); - if (unmap && + /* XORing P/Q is only implemented in software */ + if (unmap && !(submit->flags & ASYNC_TX_PQ_XOR_DST) && (src_cnt <= dma_maxpq(device, 0) || dma_maxpq(device, DMA_PREP_CONTINUE) > 0) && is_dma_pq_aligned(device, offset, 0, len)) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3ae097d50b51..c82ce1fd8723 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1317,7 +1317,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) * destination buffer is recorded in srcs[count] and the Q destination * is recorded in srcs[count+1]]. */ -static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) +static int set_syndrome_sources(struct page **srcs, + struct stripe_head *sh, + int srctype) { int disks = sh->disks; int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); @@ -1332,8 +1334,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) i = d0_idx; do { int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + struct r5dev *dev = &sh->dev[i]; - srcs[slot] = sh->dev[i].page; + if (i == sh->qd_idx || i == sh->pd_idx || + (srctype == SYNDROME_SRC_ALL) || + (srctype == SYNDROME_SRC_WANT_DRAIN && + test_bit(R5_Wantdrain, &dev->flags)) || + (srctype == SYNDROME_SRC_WRITTEN && + dev->written)) + srcs[slot] = sh->dev[i].page; i = raid6_next_disk(i, disks); } while (i != d0_idx); @@ -1373,7 +1382,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) atomic_inc(&sh->count); if (target == qd_idx) { - count = set_syndrome_sources(blocks, sh); + count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); blocks[count] = NULL; /* regenerating p is not necessary */ BUG_ON(blocks[count+1] != dest); /* q should already be set */ init_async_submit(&submit, ASYNC_TX_FENCE, NULL, @@ -1481,7 +1490,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) tx = 
async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); - count = set_syndrome_sources(blocks, sh); + count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); init_async_submit(&submit, ASYNC_TX_FENCE, tx, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); @@ -1515,8 +1524,8 @@ static void ops_complete_prexor(void *stripe_head_ref) } static struct dma_async_tx_descriptor * -ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, - struct dma_async_tx_descriptor *tx) +ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) { int disks = sh->disks; struct page **xor_srcs = to_addr_page(percpu, 0); @@ -1544,6 +1553,26 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, return tx; } +static struct dma_async_tx_descriptor * +ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + struct page **blocks = to_addr_page(percpu, 0); + int count; + struct async_submit_ctl submit; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, + ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); + tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + + return tx; +} + static struct dma_async_tx_descriptor * ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { @@ -1746,6 +1775,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, int count, i, j = 0; struct stripe_head *head_sh = sh; int last_stripe; + int synflags; + unsigned long txflags; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1765,14 +1796,23 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, again: blocks = to_addr_page(percpu, j); - count = set_syndrome_sources(blocks, sh); + + if 
(sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + synflags = SYNDROME_SRC_WRITTEN; + txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; + } else { + synflags = SYNDROME_SRC_ALL; + txflags = ASYNC_TX_ACK; + } + + count = set_syndrome_sources(blocks, sh, synflags); last_stripe = !head_sh->batch_head || list_first_entry(&sh->batch_list, struct stripe_head, batch_list) == head_sh; if (last_stripe) { atomic_inc(&head_sh->count); - init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, + init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, head_sh, to_addr_conv(sh, percpu, j)); } else init_async_submit(&submit, 0, tx, NULL, NULL, @@ -1843,7 +1883,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu (unsigned long long)sh->sector, checkp); BUG_ON(sh->batch_head); - count = set_syndrome_sources(srcs, sh); + count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); if (!checkp) srcs[count] = NULL; @@ -1884,8 +1924,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) async_tx_ack(tx); } - if (test_bit(STRIPE_OP_PREXOR, &ops_request)) - tx = ops_run_prexor(sh, percpu, tx); + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { + if (level < 6) + tx = ops_run_prexor5(sh, percpu, tx); + else + tx = ops_run_prexor6(sh, percpu, tx); + } if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { tx = ops_run_biodrain(sh, tx); @@ -2770,7 +2814,7 @@ static void schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) { - int i, pd_idx = sh->pd_idx, disks = sh->disks; + int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; struct r5conf *conf = sh->raid_conf; int level = conf->level; @@ -2806,13 +2850,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) atomic_inc(&conf->pending_full_writes); } else { - BUG_ON(level == 6); BUG_ON(!(test_bit(R5_UPTODATE, 
&sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); + BUG_ON(level == 6 && + (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || + test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (i == pd_idx) + if (i == pd_idx || i == qd_idx) continue; if (dev->towrite && @@ -3476,28 +3522,27 @@ static void handle_stripe_dirtying(struct r5conf *conf, int rmw = 0, rcw = 0, i; sector_t recovery_cp = conf->mddev->recovery_cp; - /* RAID6 requires 'rcw' in current implementation. - * Otherwise, check whether resync is now happening or should start. + /* Check whether resync is now happening or should start. * If yes, then the array is dirty (after unclean shutdown or * initial creation), so parity in some stripes might be inconsistent. * In this case, we need to always do reconstruct-write, to ensure * that in case of drive failure or read-error correction, we * generate correct data from the parity. */ - if (conf->max_degraded == 2 || + if (conf->rmw_level == PARITY_DISABLE_RMW || (recovery_cp < MaxSector && sh->sector >= recovery_cp && s->failed == 0)) { /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ rcw = 1; rmw = 2; - pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", - conf->max_degraded, (unsigned long long)recovery_cp, + pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", + conf->rmw_level, (unsigned long long)recovery_cp, (unsigned long long)sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx) && + if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { @@ -3507,7 +3552,8 @@ static void handle_stripe_dirtying(struct r5conf *conf, rmw += 2*disks; /* cannot read it */ 
} /* Would I have to read this buffer for reconstruct_write */ - if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && + if (!test_bit(R5_OVERWRITE, &dev->flags) && + i != sh->pd_idx && i != sh->qd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { @@ -3520,7 +3566,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, pr_debug("for sector %llu, rmw=%d rcw=%d\n", (unsigned long long)sh->sector, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); - if (rmw < rcw && rmw > 0) { + if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ if (conf->mddev->queue) blk_add_trace_msg(conf->mddev->queue, @@ -3528,7 +3574,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, (unsigned long long)sh->sector, rmw); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx) && + if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) && @@ -3547,7 +3593,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, } } } - if (rcw <= rmw && rcw > 0) { + if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) { /* want reconstruct write, but need to get some data */ int qread =0; rcw = 0; @@ -6344,10 +6390,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) } conf->level = mddev->new_level; - if (conf->level == 6) + if (conf->level == 6) { conf->max_degraded = 2; - else + if (raid6_call.xor_syndrome) + conf->rmw_level = PARITY_ENABLE_RMW; + else + conf->rmw_level = PARITY_DISABLE_RMW; + } else { conf->max_degraded = 1; + conf->rmw_level = PARITY_ENABLE_RMW; + } conf->algorithm = mddev->new_layout; conf->reshape_progress = mddev->reshape_position; if (conf->reshape_progress != MaxSector) { diff --git 
a/drivers/md/raid5.h b/drivers/md/raid5.h index ee65ed844d3f..57fef9ba36fa 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -355,6 +355,23 @@ enum { STRIPE_OP_RECONSTRUCT, STRIPE_OP_CHECK, }; + +/* + * RAID parity calculation preferences + */ +enum { + PARITY_DISABLE_RMW = 0, + PARITY_ENABLE_RMW, +}; + +/* + * Pages requested from set_syndrome_sources() + */ +enum { + SYNDROME_SRC_ALL, + SYNDROME_SRC_WANT_DRAIN, + SYNDROME_SRC_WRITTEN, +}; /* * Plugging: * @@ -411,7 +428,7 @@ struct r5conf { spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; struct mddev *mddev; int chunk_sectors; - int level, algorithm; + int level, algorithm, rmw_level; int max_degraded; int raid_disks; int max_nr_stripes; diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 179b38ffd351..388574ea38ed 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -60,12 +60,15 @@ struct dma_chan_ref { * dependency chain * @ASYNC_TX_FENCE: specify that the next operation in the dependency * chain uses this operation's result as an input + * @ASYNC_TX_PQ_XOR_DST: do not overwrite the syndrome but XOR it with the + * input data. Required for rmw case. */ enum async_tx_flags { ASYNC_TX_XOR_ZERO_DST = (1 << 0), ASYNC_TX_XOR_DROP_DST = (1 << 1), ASYNC_TX_ACK = (1 << 2), ASYNC_TX_FENCE = (1 << 3), + ASYNC_TX_PQ_XOR_DST = (1 << 4), }; /** -- cgit v1.3