From e511c4a3d2a1f64aafc1f5df37a2ffcf7ef91b55 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Fri, 13 May 2022 15:10:58 -0700 Subject: dax: introduce DAX_RECOVERY_WRITE dax access mode Up till now, dax_direct_access() is used implicitly for normal access, but for the purpose of recovery write, dax range with poison is requested. To make the interface clear, introduce enum dax_access_mode { DAX_ACCESS, DAX_RECOVERY_WRITE, } where DAX_ACCESS is used for normal dax access, and DAX_RECOVERY_WRITE is used for dax recovery write. Suggested-by: Dan Williams Signed-off-by: Jane Chu Reviewed-by: Christoph Hellwig Cc: Mike Snitzer Reviewed-by: Vivek Goyal Link: https://lore.kernel.org/r/165247982851.52965.11024212198889762949.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/dax.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux/dax.h') diff --git a/include/linux/dax.h b/include/linux/dax.h index 9fc5f99a0ae2..3f1339bce3c0 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,6 +14,11 @@ struct iomap_ops; struct iomap_iter; struct iomap; +enum dax_access_mode { + DAX_ACCESS, + DAX_RECOVERY_WRITE, +}; + struct dax_operations { /* * direct_access: translate a device-relative @@ -21,7 +26,7 @@ struct dax_operations { * number of pages available for DAX at that pfn. */ long (*direct_access)(struct dax_device *, pgoff_t, long, - void **, pfn_t *); + enum dax_access_mode, void **, pfn_t *); /* * Validate whether this device is usable as an fsdax backing * device. @@ -178,7 +183,7 @@ static inline void dax_read_unlock(int id) bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, - void **kaddr, pfn_t *pfn); + enum dax_access_mode mode, void **kaddr, pfn_t *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, -- cgit v1.2.3 From 047218ec904da19c45c4a70274fc3f818a1fcba1 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Fri, 22 Apr 2022 16:45:06 -0600 Subject: dax: add .recovery_write dax_operation Introduce dax_recovery_write() operation. The function is used to recover a dax range that contains poison. Typical use case is when a user process receives a SIGBUS with si_code BUS_MCEERR_AR indicating poison(s) in a dax range, in response, the user process issues a pwrite() to the page-aligned dax range, thus clears the poison and puts valid data in the range. Reviewed-by: Christoph Hellwig Signed-off-by: Jane Chu Link: https://lore.kernel.org/r/20220422224508.440670-6-jane.chu@oracle.com Signed-off-by: Dan Williams --- drivers/dax/super.c | 9 +++++++++ drivers/md/dm-linear.c | 10 ++++++++++ drivers/md/dm-log-writes.c | 10 ++++++++++ drivers/md/dm-stripe.c | 10 ++++++++++ drivers/md/dm.c | 20 ++++++++++++++++++++ drivers/nvdimm/pmem.c | 7 +++++++ fs/dax.c | 13 ++++++++++++- include/linux/dax.h | 13 +++++++++++++ include/linux/device-mapper.h | 9 +++++++++ 9 files changed, 100 insertions(+), 1 deletion(-) (limited to 'include/linux/dax.h') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 5405eb553430..50a08b2ec247 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -195,6 +195,15 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, } EXPORT_SYMBOL_GPL(dax_zero_page_range); +size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *iter) +{ + if (!dax_dev->ops->recovery_write) + return 0; + return dax_dev->ops->recovery_write(dax_dev, pgoff, addr, bytes, iter); +} +EXPORT_SYMBOL_GPL(dax_recovery_write); + #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 13e263299c9c..cdf48bc8c5b0 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -188,9 +188,18 @@ static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, return dax_zero_page_range(dax_dev, pgoff, nr_pages); } +static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); +} + #else #define linear_dax_direct_access NULL #define linear_dax_zero_page_range NULL +#define linear_dax_recovery_write NULL #endif static struct target_type linear_target = { @@ -208,6 +217,7 @@ static struct target_type linear_target = { .iterate_devices = linear_iterate_devices, .direct_access = linear_dax_direct_access, .dax_zero_page_range = linear_dax_zero_page_range, + .dax_recovery_write = linear_dax_recovery_write, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 06bdbed65eb1..22739dccdd17 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -905,9 +905,18 @@ static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT); } +static size_t log_writes_dax_recovery_write(struct dm_target *ti, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) +{ + struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); +} + #else #define log_writes_dax_direct_access NULL #define log_writes_dax_zero_page_range NULL +#define log_writes_dax_recovery_write NULL #endif static struct target_type log_writes_target = { @@ -925,6 +934,7 @@ static struct target_type log_writes_target = { .io_hints = log_writes_io_hints, .direct_access = log_writes_dax_direct_access, .dax_zero_page_range = log_writes_dax_zero_page_range, + .dax_recovery_write = log_writes_dax_recovery_write, }; static int __init dm_log_writes_init(void) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 77d72900e997..baa085cc67bd 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -331,9 +331,18 @@ static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, return dax_zero_page_range(dax_dev, pgoff, nr_pages); } +static size_t stripe_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); +} + #else #define stripe_dax_direct_access NULL #define stripe_dax_zero_page_range NULL +#define stripe_dax_recovery_write NULL #endif /* @@ -470,6 +479,7 @@ static struct target_type stripe_target = { .io_hints = stripe_io_hints, .direct_access = stripe_dax_direct_access, .dax_zero_page_range = stripe_dax_zero_page_range, + .dax_recovery_write = stripe_dax_recovery_write, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9c452641c3d5..3fe76ab20069 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1147,6 +1147,25 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, return ret; } +static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct mapped_device *md = dax_get_private(dax_dev); + sector_t sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + int srcu_idx; + long ret = 0; + + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + if (!ti || !ti->type->dax_recovery_write) + goto out; + + ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i); +out: + dm_put_live_table(md, srcu_idx); + return ret; +} + /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management @@ -3147,6 +3166,7 @@ static const struct block_device_operations dm_rq_blk_dops = { static const struct dax_operations dm_dax_ops = { .direct_access = dm_dax_direct_access, .zero_page_range = dm_dax_zero_page_range, + .recovery_write = dm_dax_recovery_write, }; /* diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 47f34c50f944..e5e288135af7 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -287,9 +287,16 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev, return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn); } +static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + return 0; +} + static const struct dax_operations pmem_dax_ops = { .direct_access = pmem_dax_direct_access, .zero_page_range = pmem_dax_zero_page_range, + .recovery_write = pmem_recovery_write, }; static ssize_t write_cache_show(struct device *dev, diff --git a/fs/dax.c b/fs/dax.c index ef3103107104..a1e4b45cbf55 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1240,6 +1240,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); ssize_t map_len; + bool recovery = false; void *kaddr; if (fatal_signal_pending(current)) { @@ -1249,6 +1250,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), DAX_ACCESS, &kaddr, NULL); + if (map_len == -EIO && iov_iter_rw(iter) == WRITE) { + map_len = dax_direct_access(dax_dev, pgoff, + PHYS_PFN(size), DAX_RECOVERY_WRITE, + &kaddr, NULL); + if (map_len > 0) + recovery = true; + } if (map_len < 0) { ret = map_len; break; @@ -1260,7 +1268,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (map_len > end - pos) map_len = end - pos; - if (iov_iter_rw(iter) == WRITE) + if (recovery) + xfer = dax_recovery_write(dax_dev, pgoff, kaddr, + map_len, iter); + else if (iov_iter_rw(iter) == WRITE) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else diff --git a/include/linux/dax.h b/include/linux/dax.h index 3f1339bce3c0..e7b81634c52a 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -35,6 +35,12 @@ struct dax_operations { sector_t, sector_t); /* zero_page_range: required operation. Zero page range */ int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); + /* + * recovery_write: recover a poisoned range by DAX device driver + * capable of clearing poison. + */ + size_t (*recovery_write)(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *iter); }; #if IS_ENABLED(CONFIG_DAX) @@ -45,6 +51,8 @@ void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); void set_dax_synchronous(struct dax_device *dax_dev); +size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i); /* * Check if given mapping is supported by the file / underlying device. */ @@ -92,6 +100,11 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, { return !(vma->vm_flags & VM_SYNC); } +static inline size_t dax_recovery_write(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) +{ + return 0; +} #endif void set_dax_nocache(struct dax_device *dax_dev); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index acdedda0d12b..47a01c7cffdf 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -152,6 +152,14 @@ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff, size_t nr_pages); +/* + * Returns: + * != 0 : number of bytes transferred + * 0 : recovery write failed + */ +typedef size_t (*dm_dax_recovery_write_fn)(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i); + void dm_error(const char *message); struct dm_dev { @@ -201,6 +209,7 @@ struct target_type { dm_io_hints_fn io_hints; dm_dax_direct_access_fn direct_access; dm_dax_zero_page_range_fn dax_zero_page_range; + dm_dax_recovery_write_fn dax_recovery_write; /* For internal device-mapper use. */ struct list_head list; -- cgit v1.2.3 From 8012b866085523758780850087102421dbcce522 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:25 +0800 Subject: dax: introduce holder for dax_device Patch series "v14 fsdax-rmap + v11 fsdax-reflink", v2. The patchset fsdax-rmap is aimed to support shared pages tracking for fsdax. It moves owner tracking from dax_assocaite_entry() to pmem device driver, by introducing an interface ->memory_failure() for struct pagemap. This interface is called by memory_failure() in mm, and implemented by pmem device. Then call holder operations to find the filesystem which the corrupted data located in, and call filesystem handler to track files or metadata associated with this page. Finally we are able to try to fix the corrupted data in filesystem and do other necessary processing, such as killing processes who are using the files affected. The call trace is like this: memory_failure() |* fsdax case |------------ |pgmap->ops->memory_failure() => pmem_pgmap_memory_failure() | dax_holder_notify_failure() => | dax_device->holder_ops->notify_failure() => | - xfs_dax_notify_failure() | |* xfs_dax_notify_failure() | |-------------------------- | | xfs_rmap_query_range() | | xfs_dax_failure_fn() | | * corrupted on metadata | | try to recover data, call xfs_force_shutdown() | | * corrupted on file data | | try to recover data, call mf_dax_kill_procs() |* normal case |------------- |mf_generic_kill_procs() The patchset fsdax-reflink attempts to add CoW support for fsdax, and takes XFS, which has both reflink and fsdax features, as an example. One of the key mechanisms needed to be implemented in fsdax is CoW. Copy the data from srcmap before we actually write data to the destination iomap. And we just copy range in which data won't be changed. Another mechanism is range comparison. In page cache case, readpage() is used to load data on disk to page cache in order to be able to compare data. In fsdax case, readpage() does not work. So, we need another compare data with direct access support. With the two mechanisms implemented in fsdax, we are able to make reflink and fsdax work together in XFS. This patch (of 14): To easily track filesystem from a pmem device, we introduce a holder for dax_device structure, and also its operation. This holder is used to remember who is using this dax_device: - When it is the backend of a filesystem, the holder will be the instance of this filesystem. - When this pmem device is one of the targets in a mapped device, the holder will be this mapped device. In this case, the mapped device has its own dax_device and it will follow the first rule. So that we can finally track to the filesystem we needed. The holder and holder_ops will be set when filesystem is being mounted, or an target device is being activated. Link: https://lkml.kernel.org/r/20220603053738.1218681-1-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/20220603053738.1218681-2-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Darrick J. Wong Cc: Dave Chinner Cc: Jane Chu Cc: Goldwyn Rodrigues Cc: Al Viro Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Miaohe Lin Cc: Dan Williams Cc: Goldwyn Rodrigues Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- drivers/dax/super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++- drivers/md/dm.c | 2 +- fs/erofs/super.c | 10 ++++---- fs/ext2/super.c | 7 +++--- fs/ext4/super.c | 9 +++---- fs/xfs/xfs_buf.c | 5 ++-- include/linux/dax.h | 33 +++++++++++++++++++------- 7 files changed, 110 insertions(+), 23 deletions(-) (limited to 'include/linux/dax.h') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 50a08b2ec247..9b5e2a5eb0ae 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -22,6 +22,8 @@ * @private: dax driver private data * @flags: state and boolean properties * @ops: operations for this device + * @holder_data: holder of a dax_device: could be filesystem or mapped device + * @holder_ops: operations for the inner holder */ struct dax_device { struct inode inode; @@ -29,6 +31,8 @@ struct dax_device { void *private; unsigned long flags; const struct dax_operations *ops; + void *holder_data; + const struct dax_holder_operations *holder_ops; }; static dev_t dax_devt; @@ -71,8 +75,11 @@ EXPORT_SYMBOL_GPL(dax_remove_host); * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax * @bdev: block device to find a dax_device for * @start_off: returns the byte offset into the dax_device that @bdev starts + * @holder: filesystem or mapped device inside the dax_device + * @ops: operations for the inner holder */ -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off) +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, + void *holder, const struct dax_holder_operations *ops) { struct dax_device *dax_dev; u64 part_size; @@ -92,11 +99,26 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off) dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk); if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode)) dax_dev = NULL; + else if (holder) { + if (!cmpxchg(&dax_dev->holder_data, NULL, holder)) + dax_dev->holder_ops = ops; + else + dax_dev = NULL; + } dax_read_unlock(id); return dax_dev; } EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); + +void fs_put_dax(struct dax_device *dax_dev, void *holder) +{ + if (dax_dev && holder && + cmpxchg(&dax_dev->holder_data, holder, NULL) == holder) + dax_dev->holder_ops = NULL; + put_dax(dax_dev); +} +EXPORT_SYMBOL_GPL(fs_put_dax); #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ enum dax_device_flags { @@ -204,6 +226,29 @@ size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, } EXPORT_SYMBOL_GPL(dax_recovery_write); +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, + u64 len, int mf_flags) +{ + int rc, id; + + id = dax_read_lock(); + if (!dax_alive(dax_dev)) { + rc = -ENXIO; + goto out; + } + + if (!dax_dev->holder_ops) { + rc = -EOPNOTSUPP; + goto out; + } + + rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); +out: + dax_read_unlock(id); + return rc; +} +EXPORT_SYMBOL_GPL(dax_holder_notify_failure); + #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) @@ -277,8 +322,15 @@ void kill_dax(struct dax_device *dax_dev) if (!dax_dev) return; + if (dax_dev->holder_data != NULL) + dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0); + clear_bit(DAXDEV_ALIVE, &dax_dev->flags); synchronize_srcu(&dax_srcu); + + /* clear holder data */ + dax_dev->holder_ops = NULL; + dax_dev->holder_data = NULL; } EXPORT_SYMBOL_GPL(kill_dax); @@ -420,6 +472,19 @@ void put_dax(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(put_dax); +/** + * dax_holder() - obtain the holder of a dax device + * @dax_dev: a dax_device instance + + * Return: the holder's data which represents the holder if registered, + * otherwize NULL. + */ +void *dax_holder(struct dax_device *dax_dev) +{ + return dax_dev->holder_data; +} +EXPORT_SYMBOL_GPL(dax_holder); + /** * inode_dax: convert a public inode into its dax_dev * @inode: An inode with i_cdev pointing to a dax_dev diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2b75f1ef7386..0177a4ce9a18 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -758,7 +758,7 @@ static int open_table_device(struct table_device *td, dev_t dev, } td->dm_dev.bdev = bdev; - td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off); + td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL); return 0; } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 95addc5c9d34..3173debeaa5a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -255,7 +255,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, if (IS_ERR(bdev)) return PTR_ERR(bdev); dif->bdev = bdev; - dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off); + dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off, + NULL, NULL); } dif->blocks = le32_to_cpu(dis->blocks); @@ -720,7 +721,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) } sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, - &sbi->dax_part_off); + &sbi->dax_part_off, + NULL, NULL); } err = erofs_read_superblock(sb); @@ -812,7 +814,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data) { struct erofs_device_info *dif = ptr; - fs_put_dax(dif->dax_dev); + fs_put_dax(dif->dax_dev, NULL); if (dif->bdev) blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); erofs_fscache_unregister_cookie(&dif->fscache); @@ -886,7 +888,7 @@ static void erofs_kill_sb(struct super_block *sb) return; erofs_free_dev_context(sbi->devs); - fs_put_dax(sbi->dax_dev); + fs_put_dax(sbi->dax_dev, NULL); erofs_fscache_unregister_cookie(&sbi->s_fscache); erofs_fscache_unregister_fs(sb); kfree(sbi->opt.fsid); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f6a19f6d9f6d..4638946251b9 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -171,7 +171,7 @@ static void ext2_put_super (struct super_block * sb) brelse (sbi->s_sbh); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } @@ -835,7 +835,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; - sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, + NULL, NULL); spin_lock_init(&sbi->s_lock); ret = -EINVAL; @@ -1204,7 +1205,7 @@ failed_mount_group_desc: failed_mount: brelse(bh); failed_sbi: - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); kfree(sbi); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 845f2f8aee5f..1f8bf507ba5a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1307,7 +1307,7 @@ static void ext4_put_super(struct super_block *sb) if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); @@ -4272,7 +4272,7 @@ static void ext4_free_sbi(struct ext4_sb_info *sbi) return; kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } @@ -4284,7 +4284,8 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) if (!sbi) return NULL; - sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, + NULL, NULL); sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); @@ -4296,7 +4297,7 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) sbi->s_sb = sb; return sbi; err_out: - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); return NULL; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4aa9c9cf5b6e..1ec2a7b6d44e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1911,7 +1911,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); - fs_put_dax(btp->bt_daxdev); + fs_put_dax(btp->bt_daxdev, NULL); kmem_free(btp); } @@ -1964,7 +1964,8 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off); + btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, NULL, + NULL); /* * Buffer IO error rate limiting. Limit it to no more than 10 messages diff --git a/include/linux/dax.h b/include/linux/dax.h index e7b81634c52a..cf85fc36da5f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -43,8 +43,21 @@ struct dax_operations { void *addr, size_t bytes, struct iov_iter *iter); }; +struct dax_holder_operations { + /* + * notify_failure - notify memory failure into inner holder device + * @dax_dev: the dax device which contains the holder + * @offset: offset on this dax device where memory failure occurs + * @len: length of this memory failure event + * @flags: action flags for memory failure handler + */ + int (*notify_failure)(struct dax_device *dax_dev, u64 offset, + u64 len, int mf_flags); +}; + #if IS_ENABLED(CONFIG_DAX) struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); +void *dax_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); @@ -66,6 +79,10 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, return dax_synchronous(dax_dev); } #else +static inline void *dax_holder(struct dax_device *dax_dev) +{ + return NULL; +} static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { @@ -114,12 +131,9 @@ struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, - u64 *start_off); -static inline void fs_put_dax(struct dax_device *dax_dev) -{ - put_dax(dax_dev); -} +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, + void *holder, const struct dax_holder_operations *ops); +void fs_put_dax(struct dax_device *dax_dev, void *holder); #else static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { @@ -129,11 +143,12 @@ static inline void dax_remove_host(struct gendisk *disk) { } static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, - u64 *start_off) + u64 *start_off, void *holder, + const struct dax_holder_operations *ops) { return NULL; } -static inline void fs_put_dax(struct dax_device *dax_dev) +static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) { } #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ @@ -203,6 +218,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages); +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len, + int mf_flags); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, -- cgit v1.2.3 From 2f437effc689ef913fbe5e31110580b4e7cf04be Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:28 +0800 Subject: fsdax: introduce dax_lock_mapping_entry() The current dax_lock_page() locks dax entry by obtaining mapping and index in page. To support 1-to-N RMAP in NVDIMM, we need a new function to lock a specific dax entry corresponding to this file's mapping,index. And output the page corresponding to the specific dax entry for caller use. Link: https://lkml.kernel.org/r/20220603053738.1218681-5-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- fs/dax.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/dax.h | 15 +++++++++++++ 2 files changed, 78 insertions(+) (limited to 'include/linux/dax.h') diff --git a/fs/dax.c b/fs/dax.c index 4155a6107fa1..65e44d78b3bb 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -455,6 +455,69 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie) dax_unlock_entry(&xas, (void *)cookie); } +/* + * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping + * @mapping: the file's mapping whose entry we want to lock + * @index: the offset within this file + * @page: output the dax page corresponding to this dax entry + * + * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry + * could not be locked. + */ +dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index, + struct page **page) +{ + XA_STATE(xas, NULL, 0); + void *entry; + + rcu_read_lock(); + for (;;) { + entry = NULL; + if (!dax_mapping(mapping)) + break; + + xas.xa = &mapping->i_pages; + xas_lock_irq(&xas); + xas_set(&xas, index); + entry = xas_load(&xas); + if (dax_is_locked(entry)) { + rcu_read_unlock(); + wait_entry_unlocked(&xas, entry); + rcu_read_lock(); + continue; + } + if (!entry || + dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + /* + * Because we are looking for entry from file's mapping + * and index, so the entry may not be inserted for now, + * or even a zero/empty entry. We don't think this is + * an error case. So, return a special value and do + * not output @page. + */ + entry = (void *)~0UL; + } else { + *page = pfn_to_page(dax_to_pfn(entry)); + dax_lock_entry(&xas, entry); + } + xas_unlock_irq(&xas); + break; + } + rcu_read_unlock(); + return (dax_entry_t)entry; +} + +void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index, + dax_entry_t cookie) +{ + XA_STATE(xas, &mapping->i_pages, index); + + if (cookie == ~0UL) + return; + + dax_unlock_entry(&xas, (void *)cookie); +} + /* * Find page cache entry at given index. If it is a DAX entry, return it * with the entry locked. If the page cache doesn't contain an entry at diff --git a/include/linux/dax.h b/include/linux/dax.h index cf85fc36da5f..7116681b48c0 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -161,6 +161,10 @@ struct page *dax_layout_busy_page(struct address_space *mapping); struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); dax_entry_t dax_lock_page(struct page *page); void dax_unlock_page(struct page *page, dax_entry_t cookie); +dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, + unsigned long index, struct page **page); +void dax_unlock_mapping_entry(struct address_space *mapping, + unsigned long index, dax_entry_t cookie); #else static inline struct page *dax_layout_busy_page(struct address_space *mapping) { @@ -188,6 +192,17 @@ static inline dax_entry_t dax_lock_page(struct page *page) static inline void dax_unlock_page(struct page *page, dax_entry_t cookie) { } + +static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, + unsigned long index, struct page **page) +{ + return 0; +} + +static inline void dax_unlock_mapping_entry(struct address_space *mapping, + unsigned long index, dax_entry_t cookie) +{ +} #endif int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, -- cgit v1.2.3 From 6f7db3894ae23eb5d40af4efb404aa0c072a68d2 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:36 +0800 Subject: fsdax: dedup file range to use a compare function With dax we cannot deal with readpage() etc. So, we create a dax comparison function which is similar with vfs_dedupe_file_range_compare(). And introduce dax_remap_file_range_prep() for filesystem use. Link: https://lkml.kernel.org/r/20220603053738.1218681-13-ruansy.fnst@fujitsu.com Signed-off-by: Goldwyn Rodrigues Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- fs/dax.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/remap_range.c | 31 ++++++++++++++++---- fs/xfs/xfs_reflink.c | 8 +++-- include/linux/dax.h | 8 +++++ include/linux/fs.h | 12 +++++--- 5 files changed, 130 insertions(+), 11 deletions(-) (limited to 'include/linux/dax.h') diff --git a/fs/dax.c b/fs/dax.c index 0aab32300531..e0f9c4a0a0c1 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1873,3 +1873,85 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, return dax_insert_pfn_mkwrite(vmf, pfn, order); } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); + +static loff_t dax_range_compare_iter(struct iomap_iter *it_src, + struct iomap_iter *it_dest, u64 len, bool *same) +{ + const struct iomap *smap = &it_src->iomap; + const struct iomap *dmap = &it_dest->iomap; + loff_t pos1 = it_src->pos, pos2 = it_dest->pos; + void *saddr, *daddr; + int id, ret; + + len = min(len, min(smap->length, dmap->length)); + + if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { + *same = true; + return len; + } + + if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { + *same = false; + return 0; + } + + id = dax_read_lock(); + ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE), + &saddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE), + &daddr, NULL); + if (ret < 0) + goto out_unlock; + + *same = !memcmp(saddr, daddr, len); + if (!*same) + len = 0; + dax_read_unlock(id); + return len; + +out_unlock: + dax_read_unlock(id); + return -EIO; +} + +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dst, loff_t dstoff, loff_t len, bool *same, + const struct iomap_ops *ops) +{ + struct iomap_iter src_iter = { + .inode = src, + .pos = srcoff, + .len = len, + .flags = IOMAP_DAX, + }; + struct iomap_iter dst_iter = { + .inode = dst, + .pos = dstoff, + .len = len, + .flags = IOMAP_DAX, + }; + int ret; + + while ((ret = iomap_iter(&src_iter, ops)) > 0) { + while ((ret = iomap_iter(&dst_iter, ops)) > 0) { + dst_iter.processed = dax_range_compare_iter(&src_iter, + &dst_iter, len, same); + } + if (ret <= 0) + src_iter.processed = ret; + } + return ret; +} + +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *ops) +{ + return __generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, ops); +} +EXPORT_SYMBOL_GPL(dax_remap_file_range_prep); diff --git a/fs/remap_range.c b/fs/remap_range.c index e112b5424cdb..231de627c1b9 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" #include @@ -271,9 +272,11 @@ out_error: * If there's an error, then the usual negative error code is returned. * Otherwise returns 0 with *len set to the request length. */ -int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) +int +__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *dax_read_ops) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -333,8 +336,18 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; - ret = vfs_dedupe_file_range_compare(file_in, pos_in, - file_out, pos_out, *len, &is_same); + if (*len == 0) + return 0; + + if (!IS_DAX(inode_in)) + ret = vfs_dedupe_file_range_compare(file_in, pos_in, + file_out, pos_out, *len, &is_same); + else if (dax_read_ops) + ret = dax_dedupe_file_range_compare(inode_in, pos_in, + inode_out, pos_out, *len, &is_same, + dax_read_ops); + else + return -EINVAL; if (ret) return ret; if (!is_same) @@ -352,6 +365,14 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, return ret; } + +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + return __generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, NULL); +} EXPORT_SYMBOL(generic_remap_file_range_prep); loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e7a7c00d93be..cbaf36d21020 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1367,8 +1367,12 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, - len, remap_flags); + if (!IS_DAX(inode_in)) + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags); + else + ret = dax_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, &xfs_read_iomap_ops); if (ret || *len == 0) goto out_unlock; diff --git a/include/linux/dax.h b/include/linux/dax.h index 7116681b48c0..ba985333e26b 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -246,6 +246,14 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same, + const struct iomap_ops *ops); +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *ops); static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..134e9d7ad5d6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -74,6 +74,7 @@ struct fsverity_operations; struct fs_context; struct fs_parameter_spec; struct fileattr; +struct iomap_ops; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -2070,10 +2071,13 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags); -extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *count, - unsigned int remap_flags); +int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *dax_read_ops); +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *count, unsigned int remap_flags); extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); -- cgit v1.2.3