From b3fdf9398a16f01dc013967a4ab25e99c3f4fc12 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Mon, 16 May 2022 11:21:46 -0700 Subject: x86/mce: relocate set{clear}_mce_nospec() functions Relocate the twin mce functions to arch/x86/mm/pat/set_memory.c file where they belong. While at it, fixup a function name in a comment. Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Signed-off-by: Jane Chu Acked-by: Borislav Petkov Cc: Stephen Rothwell [sfr: gate {set,clear}_mce_nospec() by CONFIG_X86_64] Link: https://lore.kernel.org/r/165272527328.90175.8336008202048685278.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/set_memory.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index f36be5166c19..683a6c3f7179 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -42,14 +42,14 @@ static inline bool can_set_direct_map(void) #endif #endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ -#ifndef set_mce_nospec +#ifdef CONFIG_X86_64 +int set_mce_nospec(unsigned long pfn, bool unmap); +int clear_mce_nospec(unsigned long pfn); +#else static inline int set_mce_nospec(unsigned long pfn, bool unmap) { return 0; } -#endif - -#ifndef clear_mce_nospec static inline int clear_mce_nospec(unsigned long pfn) { return 0; -- cgit v1.2.3 From 5898b43af954b83c4a4ee4ab85c4dbafa395822a Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Mon, 16 May 2022 11:38:10 -0700 Subject: mce: fix set_mce_nospec to always unmap the whole page The set_memory_uc() approach doesn't work well in all cases. As Dan pointed out when "The VMM unmapped the bad page from guest physical space and passed the machine check to the guest." "The guest gets virtual #MC on an access to that page. When the guest tries to do set_memory_uc() and instructs cpa_flush() to do clean caches that results in taking another fault / exception perhaps because the VMM unmapped the page from the guest." Since the driver has special knowledge to handle NP or UC, mark the poisoned page with NP and let driver handle it when it comes down to repair. Please refer to discussions here for more details. https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@mail.gmail.com/ Now since poisoned page is marked as not-present, in order to avoid writing to a not-present page and trigger kernel Oops, also fix pmem_do_write(). Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()") Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Signed-off-by: Jane Chu Acked-by: Tony Luck Link: https://lore.kernel.org/r/165272615484.103830.2563950688772226611.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- arch/x86/kernel/cpu/mce/core.c | 6 +++--- arch/x86/mm/pat/set_memory.c | 23 +++++++++++------------ drivers/nvdimm/pmem.c | 30 +++++++----------------------- include/linux/set_memory.h | 4 ++-- 4 files changed, 23 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 981496e6bc0e..fa67bb9d1afe 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -579,7 +579,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, pfn = mce->addr >> PAGE_SHIFT; if (!memory_failure(pfn, 0)) { - set_mce_nospec(pfn, whole_page(mce)); + set_mce_nospec(pfn); mce->kflags |= MCE_HANDLED_UC; } @@ -1316,7 +1316,7 @@ static void kill_me_maybe(struct callback_head *cb) ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); if (!ret) { - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); sync_core(); return; } @@ -1342,7 +1342,7 @@ static void kill_me_never(struct callback_head *cb) p->mce_count = 0; pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); } static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 0caf4b0edcbc..44f0d4260bd8 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1925,14 +1925,9 @@ int set_memory_wb(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_wb); -/* - * Prevent speculative access to the page by either unmapping - * it (if we do not require access to any part of the page) or - * marking it uncacheable (if we want to try to retrieve data - * from non-poisoned lines in the page). - */ +/* Prevent speculative access to a page by marking it not-present */ #ifdef CONFIG_X86_64 -int set_mce_nospec(unsigned long pfn, bool unmap) +int set_mce_nospec(unsigned long pfn) { unsigned long decoy_addr; int rc; @@ -1954,19 +1949,23 @@ int set_mce_nospec(unsigned long pfn, bool unmap) */ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); - if (unmap) - rc = set_memory_np(decoy_addr, 1); - else - rc = set_memory_uc(decoy_addr, 1); + rc = set_memory_np(decoy_addr, 1); if (rc) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); return rc; } +static int set_memory_present(unsigned long *addr, int numpages) +{ + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0); +} + /* Restore full speculative operation to the pfn. */ int clear_mce_nospec(unsigned long pfn) { - return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1); + unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); + + return set_memory_present(&addr, 1); } EXPORT_SYMBOL_GPL(clear_mce_nospec); #endif /* CONFIG_X86_64 */ diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 58d95242a836..4aa17132a557 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -158,36 +158,20 @@ static blk_status_t pmem_do_write(struct pmem_device *pmem, struct page *page, unsigned int page_off, sector_t sector, unsigned int len) { - blk_status_t rc = BLK_STS_OK; - bool bad_pmem = false; phys_addr_t pmem_off = sector * 512 + pmem->data_offset; void *pmem_addr = pmem->virt_addr + pmem_off; - if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) - bad_pmem = true; + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) { + blk_status_t rc = pmem_clear_poison(pmem, pmem_off, len); + + if (rc != BLK_STS_OK) + return rc; + } - /* - * Note that we write the data both before and after - * clearing poison. The write before clear poison - * handles situations where the latest written data is - * preserved and the clear poison operation simply marks - * the address range as valid without changing the data. - * In this case application software can assume that an - * interrupted write will either return the new good - * data or an error. - * - * However, if pmem_clear_poison() leaves the data in an - * indeterminate state we need to perform the write - * after clear poison. - */ flush_dcache_page(page); write_pmem(pmem_addr, page, page_off, len); - if (unlikely(bad_pmem)) { - rc = pmem_clear_poison(pmem, pmem_off, len); - write_pmem(pmem_addr, page, page_off, len); - } - return rc; + return BLK_STS_OK; } static void pmem_submit_bio(struct bio *bio) diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 683a6c3f7179..369769ce7399 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -43,10 +43,10 @@ static inline bool can_set_direct_map(void) #endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ #ifdef CONFIG_X86_64 -int set_mce_nospec(unsigned long pfn, bool unmap); +int set_mce_nospec(unsigned long pfn); int clear_mce_nospec(unsigned long pfn); #else -static inline int set_mce_nospec(unsigned long pfn, bool unmap) +static inline int set_mce_nospec(unsigned long pfn) { return 0; } -- cgit v1.2.3 From e511c4a3d2a1f64aafc1f5df37a2ffcf7ef91b55 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Fri, 13 May 2022 15:10:58 -0700 Subject: dax: introduce DAX_RECOVERY_WRITE dax access mode Up till now, dax_direct_access() is used implicitly for normal access, but for the purpose of recovery write, dax range with poison is requested. To make the interface clear, introduce enum dax_access_mode { DAX_ACCESS, DAX_RECOVERY_WRITE, } where DAX_ACCESS is used for normal dax access, and DAX_RECOVERY_WRITE is used for dax recovery write. Suggested-by: Dan Williams Signed-off-by: Jane Chu Reviewed-by: Christoph Hellwig Cc: Mike Snitzer Reviewed-by: Vivek Goyal Link: https://lore.kernel.org/r/165247982851.52965.11024212198889762949.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/dax/super.c | 5 +++-- drivers/md/dm-linear.c | 5 +++-- drivers/md/dm-log-writes.c | 5 +++-- drivers/md/dm-stripe.c | 5 +++-- drivers/md/dm-target.c | 4 +++- drivers/md/dm-writecache.c | 7 ++++--- drivers/md/dm.c | 5 +++-- drivers/nvdimm/pmem.c | 8 +++++--- drivers/nvdimm/pmem.h | 5 ++++- drivers/s390/block/dcssblk.c | 9 ++++++--- fs/dax.c | 9 +++++---- fs/fuse/dax.c | 4 ++-- fs/fuse/virtio_fs.c | 6 ++++-- include/linux/dax.h | 9 +++++++-- include/linux/device-mapper.h | 4 +++- tools/testing/nvdimm/pmem-dax.c | 4 +++- 16 files changed, 61 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 0211e6f7b47a..5405eb553430 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -117,6 +117,7 @@ enum dax_device_flags { * @dax_dev: a dax_device instance representing the logical memory range * @pgoff: offset in pages from the start of the device to translate * @nr_pages: number of consecutive pages caller can handle relative to @pfn + * @mode: indicator on normal access or recovery write * @kaddr: output parameter that returns a virtual address mapping of pfn * @pfn: output parameter that returns an absolute pfn translation of @pgoff * @@ -124,7 +125,7 @@ enum dax_device_flags { * pages accessible at the device relative @pgoff. */ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, - void **kaddr, pfn_t *pfn) + enum dax_access_mode mode, void **kaddr, pfn_t *pfn) { long avail; @@ -138,7 +139,7 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, return -EINVAL; avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, - kaddr, pfn); + mode, kaddr, pfn); if (!avail) return -ERANGE; return min(avail, nr_pages); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 76b486e4d2be..13e263299c9c 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -172,11 +172,12 @@ static struct dax_device *linear_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff) } static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); - return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); + return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); } static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index c9d036d6bb2e..06bdbed65eb1 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -889,11 +889,12 @@ static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti, } static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); - return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); + return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); } static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index c81d331d1afe..77d72900e997 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -315,11 +315,12 @@ static struct dax_device *stripe_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff) } static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff); - return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); + return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); } static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 64dd0b34fcf4..8cd5184e62f0 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -10,6 +10,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "target" @@ -142,7 +143,8 @@ static void io_err_release_clone_rq(struct request *clone, } static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { return -EIO; } diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 5630b470ba42..d74c5a7a0ab4 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -286,7 +286,8 @@ static int persistent_memory_claim(struct dm_writecache *wc) id = dax_read_lock(); - da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, &wc->memory_map, &pfn); + da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS, + &wc->memory_map, &pfn); if (da < 0) { wc->memory_map = NULL; r = da; @@ -308,8 +309,8 @@ static int persistent_memory_claim(struct dm_writecache *wc) i = 0; do { long daa; - daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, p - i, - NULL, &pfn); + daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, + p - i, DAX_ACCESS, NULL, &pfn); if (daa <= 0) { r = daa ? daa : -EINVAL; goto err3; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 82957bd460e8..9c452641c3d5 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1093,7 +1093,8 @@ static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, } static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { struct mapped_device *md = dax_get_private(dax_dev); sector_t sector = pgoff * PAGE_SECTORS; @@ -1111,7 +1112,7 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (len < 1) goto out; nr_pages = min(len, nr_pages); - ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); + ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn); out: dm_put_live_table(md, srcu_idx); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 4aa17132a557..47f34c50f944 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -239,7 +239,8 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; @@ -278,11 +279,12 @@ static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, } static long pmem_dax_direct_access(struct dax_device *dax_dev, - pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) + pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, + void **kaddr, pfn_t *pfn) { struct pmem_device *pmem = dax_get_private(dax_dev); - return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn); + return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn); } static const struct dax_operations pmem_dax_ops = { diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index 1f51a2361429..392b0b38acb9 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -8,6 +8,8 @@ #include #include +enum dax_access_mode; + /* this definition is in it's own header for tools/testing/nvdimm to consume */ struct pmem_device { /* One contiguous memory region per device */ @@ -28,7 +30,8 @@ struct pmem_device { }; long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn); + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn); #ifdef CONFIG_MEMORY_FAILURE static inline bool test_and_clear_pmem_poison(struct page *page) diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index d614843caf6c..8d0d0eaa3059 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -32,7 +32,8 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); static void dcssblk_submit_bio(struct bio *bio); static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn); + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn); static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; @@ -50,7 +51,8 @@ static int dcssblk_dax_zero_page_range(struct dax_device *dax_dev, long rc; void *kaddr; - rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL); + rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, + &kaddr, NULL); if (rc < 0) return rc; memset(kaddr, 0, nr_pages << PAGE_SHIFT); @@ -927,7 +929,8 @@ __dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff, static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { struct dcssblk_dev_info *dev_info = dax_get_private(dax_dev); diff --git a/fs/dax.c b/fs/dax.c index 67a08a32fccb..ef3103107104 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -721,7 +721,8 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter int id; id = dax_read_lock(); - rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL); + rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS, + &kaddr, NULL); if (rc < 0) { dax_read_unlock(id); return rc; @@ -1013,7 +1014,7 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, id = dax_read_lock(); length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), - NULL, pfnp); + DAX_ACCESS, NULL, pfnp); if (length < 0) { rc = length; goto out; @@ -1122,7 +1123,7 @@ static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff, void *kaddr; long ret; - ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); + ret = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); if (ret > 0) { memset(kaddr + offset, 0, size); dax_flush(dax_dev, kaddr + offset, size); @@ -1247,7 +1248,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, } map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), - &kaddr, NULL); + DAX_ACCESS, &kaddr, NULL); if (map_len < 0) { ret = map_len; break; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index d7d3a7f06862..10eb50cbf398 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1241,8 +1241,8 @@ static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker); id = dax_read_lock(); - nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), NULL, - NULL); + nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), + DAX_ACCESS, NULL, NULL); dax_read_unlock(id); if (nr_pages < 0) { pr_debug("dax_direct_access() returned %ld\n", nr_pages); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 86b7dbb6a0d4..8db53fa67359 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -752,7 +752,8 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, * offset. */ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, + void **kaddr, pfn_t *pfn) { struct virtio_fs *fs = dax_get_private(dax_dev); phys_addr_t offset = PFN_PHYS(pgoff); @@ -772,7 +773,8 @@ static int virtio_fs_zero_page_range(struct dax_device *dax_dev, long rc; void *kaddr; - rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL); + rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr, + NULL); if (rc < 0) return rc; memset(kaddr, 0, nr_pages << PAGE_SHIFT); diff --git a/include/linux/dax.h b/include/linux/dax.h index 9fc5f99a0ae2..3f1339bce3c0 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,6 +14,11 @@ struct iomap_ops; struct iomap_iter; struct iomap; +enum dax_access_mode { + DAX_ACCESS, + DAX_RECOVERY_WRITE, +}; + struct dax_operations { /* * direct_access: translate a device-relative @@ -21,7 +26,7 @@ struct dax_operations { * number of pages available for DAX at that pfn. */ long (*direct_access)(struct dax_device *, pgoff_t, long, - void **, pfn_t *); + enum dax_access_mode, void **, pfn_t *); /* * Validate whether this device is usable as an fsdax backing * device. @@ -178,7 +183,7 @@ static inline void dax_read_unlock(int id) bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, - void **kaddr, pfn_t *pfn); + enum dax_access_mode mode, void **kaddr, pfn_t *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index c2a3758c4aaa..acdedda0d12b 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -20,6 +20,7 @@ struct dm_table; struct dm_report_zones_args; struct mapped_device; struct bio_vec; +enum dax_access_mode; /* * Type of table, mapped_device's mempool and request_queue @@ -146,7 +147,8 @@ typedef int (*dm_busy_fn) (struct dm_target *ti); * >= 0 : the number of bytes accessible at the address */ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn); + long nr_pages, enum dax_access_mode node, void **kaddr, + pfn_t *pfn); typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff, size_t nr_pages); diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c index af19c85558e7..c1ec099a3b1d 100644 --- a/tools/testing/nvdimm/pmem-dax.c +++ b/tools/testing/nvdimm/pmem-dax.c @@ -4,11 +4,13 @@ */ #include "test/nfit_test.h" #include +#include #include #include long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; -- cgit v1.2.3 From 047218ec904da19c45c4a70274fc3f818a1fcba1 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Fri, 22 Apr 2022 16:45:06 -0600 Subject: dax: add .recovery_write dax_operation Introduce dax_recovery_write() operation. The function is used to recover a dax range that contains poison. Typical use case is when a user process receives a SIGBUS with si_code BUS_MCEERR_AR indicating poison(s) in a dax range, in response, the user process issues a pwrite() to the page-aligned dax range, thus clears the poison and puts valid data in the range. Reviewed-by: Christoph Hellwig Signed-off-by: Jane Chu Link: https://lore.kernel.org/r/20220422224508.440670-6-jane.chu@oracle.com Signed-off-by: Dan Williams --- drivers/dax/super.c | 9 +++++++++ drivers/md/dm-linear.c | 10 ++++++++++ drivers/md/dm-log-writes.c | 10 ++++++++++ drivers/md/dm-stripe.c | 10 ++++++++++ drivers/md/dm.c | 20 ++++++++++++++++++++ drivers/nvdimm/pmem.c | 7 +++++++ fs/dax.c | 13 ++++++++++++- include/linux/dax.h | 13 +++++++++++++ include/linux/device-mapper.h | 9 +++++++++ 9 files changed, 100 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 5405eb553430..50a08b2ec247 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -195,6 +195,15 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, } EXPORT_SYMBOL_GPL(dax_zero_page_range); +size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *iter) +{ + if (!dax_dev->ops->recovery_write) + return 0; + return dax_dev->ops->recovery_write(dax_dev, pgoff, addr, bytes, iter); +} +EXPORT_SYMBOL_GPL(dax_recovery_write); + #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 13e263299c9c..cdf48bc8c5b0 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -188,9 +188,18 @@ static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, return dax_zero_page_range(dax_dev, pgoff, nr_pages); } +static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); +} + #else #define linear_dax_direct_access NULL #define linear_dax_zero_page_range NULL +#define linear_dax_recovery_write NULL #endif static struct target_type linear_target = { @@ -208,6 +217,7 @@ static struct target_type linear_target = { .iterate_devices = linear_iterate_devices, .direct_access = linear_dax_direct_access, .dax_zero_page_range = linear_dax_zero_page_range, + .dax_recovery_write = linear_dax_recovery_write, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 06bdbed65eb1..22739dccdd17 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -905,9 +905,18 @@ static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT); } +static size_t log_writes_dax_recovery_write(struct dm_target *ti, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) +{ + struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); +} + #else #define log_writes_dax_direct_access NULL #define log_writes_dax_zero_page_range NULL +#define log_writes_dax_recovery_write NULL #endif static struct target_type log_writes_target = { @@ -925,6 +934,7 @@ static struct target_type log_writes_target = { .io_hints = log_writes_io_hints, .direct_access = log_writes_dax_direct_access, .dax_zero_page_range = log_writes_dax_zero_page_range, + .dax_recovery_write = log_writes_dax_recovery_write, }; static int __init dm_log_writes_init(void) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 77d72900e997..baa085cc67bd 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -331,9 +331,18 @@ static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, return dax_zero_page_range(dax_dev, pgoff, nr_pages); } +static size_t stripe_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); +} + #else #define stripe_dax_direct_access NULL #define stripe_dax_zero_page_range NULL +#define stripe_dax_recovery_write NULL #endif /* @@ -470,6 +479,7 @@ static struct target_type stripe_target = { .io_hints = stripe_io_hints, .direct_access = stripe_dax_direct_access, .dax_zero_page_range = stripe_dax_zero_page_range, + .dax_recovery_write = stripe_dax_recovery_write, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9c452641c3d5..3fe76ab20069 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1147,6 +1147,25 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, return ret; } +static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct mapped_device *md = dax_get_private(dax_dev); + sector_t sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + int srcu_idx; + long ret = 0; + + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + if (!ti || !ti->type->dax_recovery_write) + goto out; + + ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i); +out: + dm_put_live_table(md, srcu_idx); + return ret; +} + /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management @@ -3147,6 +3166,7 @@ static const struct block_device_operations dm_rq_blk_dops = { static const struct dax_operations dm_dax_ops = { .direct_access = dm_dax_direct_access, .zero_page_range = dm_dax_zero_page_range, + .recovery_write = dm_dax_recovery_write, }; /* diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 47f34c50f944..e5e288135af7 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -287,9 +287,16 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev, return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn); } +static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + return 0; +} + static const struct dax_operations pmem_dax_ops = { .direct_access = pmem_dax_direct_access, .zero_page_range = pmem_dax_zero_page_range, + .recovery_write = pmem_recovery_write, }; static ssize_t write_cache_show(struct device *dev, diff --git a/fs/dax.c b/fs/dax.c index ef3103107104..a1e4b45cbf55 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1240,6 +1240,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); ssize_t map_len; + bool recovery = false; void *kaddr; if (fatal_signal_pending(current)) { @@ -1249,6 +1250,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), DAX_ACCESS, &kaddr, NULL); + if (map_len == -EIO && iov_iter_rw(iter) == WRITE) { + map_len = dax_direct_access(dax_dev, pgoff, + PHYS_PFN(size), DAX_RECOVERY_WRITE, + &kaddr, NULL); + if (map_len > 0) + recovery = true; + } if (map_len < 0) { ret = map_len; break; @@ -1260,7 +1268,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (map_len > end - pos) map_len = end - pos; - if (iov_iter_rw(iter) == WRITE) + if (recovery) + xfer = dax_recovery_write(dax_dev, pgoff, kaddr, + map_len, iter); + else if (iov_iter_rw(iter) == WRITE) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else diff --git a/include/linux/dax.h b/include/linux/dax.h index 3f1339bce3c0..e7b81634c52a 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -35,6 +35,12 @@ struct dax_operations { sector_t, sector_t); /* zero_page_range: required operation. Zero page range */ int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); + /* + * recovery_write: recover a poisoned range by DAX device driver + * capable of clearing poison. + */ + size_t (*recovery_write)(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *iter); }; #if IS_ENABLED(CONFIG_DAX) @@ -45,6 +51,8 @@ void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); void set_dax_synchronous(struct dax_device *dax_dev); +size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i); /* * Check if given mapping is supported by the file / underlying device. */ @@ -92,6 +100,11 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, { return !(vma->vm_flags & VM_SYNC); } +static inline size_t dax_recovery_write(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) +{ + return 0; +} #endif void set_dax_nocache(struct dax_device *dax_dev); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index acdedda0d12b..47a01c7cffdf 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -152,6 +152,14 @@ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff, size_t nr_pages); +/* + * Returns: + * != 0 : number of bytes transferred + * 0 : recovery write failed + */ +typedef size_t (*dm_dax_recovery_write_fn)(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i); + void dm_error(const char *message); struct dm_dev { @@ -201,6 +209,7 @@ struct target_type { dm_io_hints_fn io_hints; dm_dax_direct_access_fn direct_access; dm_dax_zero_page_range_fn dax_zero_page_range; + dm_dax_recovery_write_fn dax_recovery_write; /* For internal device-mapper use. */ struct list_head list; -- cgit v1.2.3