From 602544773763da411ffa67567fa1d146f3a40231 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 12 Jan 2026 16:31:09 -0800 Subject: uapi: promote EFSCORRUPTED and EUCLEAN to errno.h Stop defining these privately and instead move them to the uapi errno.h so that they become canonical instead of copy pasta. Cc: linux-api@vger.kernel.org Signed-off-by: Darrick J. Wong Link: https://patch.msgid.link/176826402587.3490369.17659117524205214600.stgit@frogsfrogsfrogs Reviewed-by: Gao Xiang Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/jbd2.h | 3 --- include/uapi/asm-generic/errno.h | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f5eaf76198f3..a53a00d36228 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1815,7 +1815,4 @@ static inline int jbd2_handle_buffer_credits(handle_t *handle) #endif /* __KERNEL__ */ -#define EFSBADCRC EBADMSG /* Bad CRC detected */ -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ - #endif /* _LINUX_JBD2_H */ diff --git a/include/uapi/asm-generic/errno.h b/include/uapi/asm-generic/errno.h index cf9c51ac49f9..92e7ae493ee3 100644 --- a/include/uapi/asm-generic/errno.h +++ b/include/uapi/asm-generic/errno.h @@ -55,6 +55,7 @@ #define EMULTIHOP 72 /* Multihop attempted */ #define EDOTDOT 73 /* RFS specific error */ #define EBADMSG 74 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EOVERFLOW 75 /* Value too large for defined data type */ #define ENOTUNIQ 76 /* Name not unique on network */ #define EBADFD 77 /* File descriptor in bad state */ @@ -98,6 +99,7 @@ #define EINPROGRESS 115 /* Operation now in progress */ #define ESTALE 116 /* Stale file handle */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 
119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ -- cgit v1.2.3 From 21945e6cb5168395d7d6f9052cd16ec4eac13973 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 12 Jan 2026 16:31:25 -0800 Subject: fs: report filesystem and file I/O errors to fsnotify Create some wrapper code around struct super_block so that filesystems have a standard way to queue filesystem metadata and file I/O error reports to have them sent to fsnotify. If a filesystem wants to provide an error number, it must supply only negative error numbers. These are stored internally as negative numbers, but they are converted to positive error numbers before being passed to fanotify, per the fanotify(7) manpage. Implementations of super_operations::report_error are passed the raw internal event data. Note that we have to play some shenanigans with mempools and queue_work so that the error handling doesn't happen outside of process context, and the event handler functions (both ->report_error and fsnotify) can handle file I/O error messages without having to worry about whatever locks might be held. This asynchronicity requires that unmount wait for pending events to clear. Add a new callback to the superblock operations structure so that filesystem drivers can themselves respond to file I/O errors if they so desire. This will be used for an upcoming self-healing patchset for XFS. Suggested-by: Christoph Hellwig Signed-off-by: Darrick J. 
Wong Link: https://patch.msgid.link/176826402610.3490369.4378391061533403171.stgit@frogsfrogsfrogs Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/Makefile | 2 +- fs/fserror.c | 194 +++++++++++++++++++++++++++++++++++++++++ fs/super.c | 3 + include/linux/fs/super_types.h | 7 ++ include/linux/fserror.h | 75 ++++++++++++++++ 5 files changed, 280 insertions(+), 1 deletion(-) create mode 100644 fs/fserror.c create mode 100644 include/linux/fserror.h (limited to 'include') diff --git a/fs/Makefile b/fs/Makefile index a04274a3c854..f238cc5ea2e9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ - file_attr.o + file_attr.o fserror.o obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o diff --git a/fs/fserror.c b/fs/fserror.c new file mode 100644 index 000000000000..06ca86adab9b --- /dev/null +++ b/fs/fserror.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2025 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include <linux/fs.h> +#include <linux/fsnotify.h> +#include <linux/mempool.h> +#include <linux/fserror.h> + +#define FSERROR_DEFAULT_EVENT_POOL_SIZE (32) + +static struct mempool fserror_events_pool; + +void fserror_mount(struct super_block *sb) +{ + /* + * The pending error counter is biased by 1 so that we don't wake_var + * until we're actually trying to unmount. + */ + refcount_set(&sb->s_pending_errors, 1); +} + +void fserror_unmount(struct super_block *sb) +{ + /* + * If we don't drop the pending error count to zero, then wait for it + * to drop below 1, which means that the pending errors cleared and + * hopefully we didn't saturate with 1 billion+ concurrent events. 
+ */ + if (!refcount_dec_and_test(&sb->s_pending_errors)) + wait_var_event(&sb->s_pending_errors, + refcount_read(&sb->s_pending_errors) < 1); +} + +static inline void fserror_pending_dec(struct super_block *sb) +{ + if (refcount_dec_and_test(&sb->s_pending_errors)) + wake_up_var(&sb->s_pending_errors); +} + +static inline void fserror_free_event(struct fserror_event *event) +{ + fserror_pending_dec(event->sb); + mempool_free(event, &fserror_events_pool); +} + +static void fserror_worker(struct work_struct *work) +{ + struct fserror_event *event = + container_of(work, struct fserror_event, work); + struct super_block *sb = event->sb; + + if (sb->s_flags & SB_ACTIVE) { + struct fs_error_report report = { + /* send positive error number to userspace */ + .error = -event->error, + .inode = event->inode, + .sb = event->sb, + }; + + if (sb->s_op->report_error) + sb->s_op->report_error(event); + + fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, NULL, NULL, + NULL, 0); + } + + iput(event->inode); + fserror_free_event(event); +} + +static inline struct fserror_event *fserror_alloc_event(struct super_block *sb, + gfp_t gfp_flags) +{ + struct fserror_event *event = NULL; + + /* + * If pending_errors already reached zero or is no longer active, + * the superblock is being deactivated so there's no point in + * continuing. + * + * The order of the check of s_pending_errors and SB_ACTIVE are + * mandated by order of accesses in generic_shutdown_super and + * fserror_unmount. Barriers are implicitly provided by the refcount + * manipulations in this function and fserror_unmount. 
+ */ + if (!refcount_inc_not_zero(&sb->s_pending_errors)) + return NULL; + if (!(sb->s_flags & SB_ACTIVE)) + goto out_pending; + + event = mempool_alloc(&fserror_events_pool, gfp_flags); + if (!event) + goto out_pending; + + /* mempool_alloc doesn't support GFP_ZERO */ + memset(event, 0, sizeof(*event)); + event->sb = sb; + INIT_WORK(&event->work, fserror_worker); + + return event; + +out_pending: + fserror_pending_dec(sb); + return NULL; +} + +/** + * fserror_report - report a filesystem error of some kind + * + * @sb: superblock of the filesystem + * @inode: inode within that filesystem, if applicable + * @type: type of error encountered + * @pos: start of inode range affected, if applicable + * @len: length of inode range affected, if applicable + * @error: error number encountered, must be negative + * @gfp: memory allocation flags for conveying the event to a worker, + * since this function can be called from atomic contexts + * + * Report details of a filesystem error to the super_operations::report_error + * callback if present; and to fsnotify for distribution to userspace. @sb, + * @gfp, @type, and @error must all be specified. For file I/O errors, the + * @inode, @pos, and @len fields must also be specified. For file metadata + * errors, @inode must be specified. If @inode is not NULL, then @inode->i_sb + * must point to @sb. + * + * Reporting work is deferred to a workqueue to ensure that ->report_error is + * called from process context without any locks held. An active reference to + * the inode is maintained until event handling is complete, and unmount will + * wait for queued events to drain. 
+ */ +void fserror_report(struct super_block *sb, struct inode *inode, + enum fserror_type type, loff_t pos, u64 len, int error, + gfp_t gfp) +{ + struct fserror_event *event; + + /* sb and inode must be from the same filesystem */ + WARN_ON_ONCE(inode && inode->i_sb != sb); + + /* error number must be negative */ + WARN_ON_ONCE(error >= 0); + + event = fserror_alloc_event(sb, gfp); + if (!event) + goto lost; + + event->type = type; + event->pos = pos; + event->len = len; + event->error = error; + + /* + * Can't iput from non-sleeping context, so grabbing another reference + * to the inode must be the last thing before submitting the event. + */ + if (inode) { + event->inode = igrab(inode); + if (!event->inode) + goto lost_event; + } + + /* + * Use schedule_work here even if we're already in process context so + * that fsnotify and super_operations::report_error implementations are + * guaranteed to run in process context without any locks held. Since + * errors are supposed to be rare, the overhead shouldn't kill us any + * more than the failing device will. 
+ */ + schedule_work(&event->work); + return; + +lost_event: + fserror_free_event(event); +lost: + if (inode) + pr_err_ratelimited( + "%s: lost file I/O error report for ino %lu type %u pos 0x%llx len 0x%llx error %d", + sb->s_id, inode->i_ino, type, pos, len, error); + else + pr_err_ratelimited( + "%s: lost filesystem error report for type %u error %d", + sb->s_id, type, error); +} +EXPORT_SYMBOL_GPL(fserror_report); + +static int __init fserror_init(void) +{ + return mempool_init_kmalloc_pool(&fserror_events_pool, + FSERROR_DEFAULT_EVENT_POOL_SIZE, + sizeof(struct fserror_event)); +} +fs_initcall(fserror_init); diff --git a/fs/super.c b/fs/super.c index 3d85265d1400..b13c1fd6a6f4 100644 --- a/fs/super.c +++ b/fs/super.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include "internal.h" @@ -363,6 +364,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); + fserror_mount(s); s->s_count = 1; atomic_set(&s->s_active, 1); @@ -622,6 +624,7 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; + fserror_unmount(sb); cgroup_writeback_umount(sb); /* Evict all inodes with zero refcount. 
*/ diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 6bd3009e09b3..97a8552d8f2b 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -35,6 +35,7 @@ struct user_namespace; struct workqueue_struct; struct writeback_control; struct xattr_handler; +struct fserror_event; extern struct super_block *blockdev_superblock; @@ -124,6 +125,9 @@ struct super_operations { */ int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); void (*shutdown)(struct super_block *sb); + + /* Report a filesystem error */ + void (*report_error)(const struct fserror_event *event); }; struct super_block { @@ -268,6 +272,9 @@ struct super_block { spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ long s_min_writeback_pages; + + /* number of fserrors that are being sent to fsnotify/filesystems */ + refcount_t s_pending_errors; } __randomize_layout; /* diff --git a/include/linux/fserror.h b/include/linux/fserror.h new file mode 100644 index 000000000000..5e1ad78c346e --- /dev/null +++ b/include/linux/fserror.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2025 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#ifndef _LINUX_FSERROR_H__ +#define _LINUX_FSERROR_H__ + +void fserror_mount(struct super_block *sb); +void fserror_unmount(struct super_block *sb); + +enum fserror_type { + /* pagecache I/O failed */ + FSERR_BUFFERED_READ, + FSERR_BUFFERED_WRITE, + + /* direct I/O failed */ + FSERR_DIRECTIO_READ, + FSERR_DIRECTIO_WRITE, + + /* out of band media error reported */ + FSERR_DATA_LOST, + + /* filesystem metadata */ + FSERR_METADATA, +}; + +struct fserror_event { + struct work_struct work; + struct super_block *sb; + struct inode *inode; + loff_t pos; + u64 len; + enum fserror_type type; + + /* negative error number */ + int error; +}; + +void fserror_report(struct super_block *sb, struct inode *inode, + enum fserror_type type, loff_t pos, u64 len, int error, + gfp_t gfp); + +static inline void fserror_report_io(struct inode *inode, + enum fserror_type type, loff_t pos, + u64 len, int error, gfp_t gfp) +{ + fserror_report(inode->i_sb, inode, type, pos, len, error, gfp); +} + +static inline void fserror_report_data_lost(struct inode *inode, loff_t pos, + u64 len, gfp_t gfp) +{ + fserror_report(inode->i_sb, inode, FSERR_DATA_LOST, pos, len, -EIO, + gfp); +} + +static inline void fserror_report_file_metadata(struct inode *inode, int error, + gfp_t gfp) +{ + fserror_report(inode->i_sb, inode, FSERR_METADATA, 0, 0, error, gfp); +} + +static inline void fserror_report_metadata(struct super_block *sb, int error, + gfp_t gfp) +{ + fserror_report(sb, NULL, FSERR_METADATA, 0, 0, error, gfp); +} + +static inline void fserror_report_shutdown(struct super_block *sb, gfp_t gfp) +{ + fserror_report(sb, NULL, FSERR_METADATA, 0, 0, -ESHUTDOWN, gfp); +} + +#endif /* _LINUX_FSERROR_H__ */ -- cgit v1.2.3