summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-02-25 10:34:23 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2026-02-25 10:34:23 -0800
commit0e335a7745b0a3e0421d6b4fff718c0deeb130ee (patch)
tree778b3ab9c07aab8c67c8c032f60c5dab62b01113
parentbfbc0b5b32a8f28ce284add619bf226716a59bc0 (diff)
parent4a1ddb0f1c48c2b56f21d8b5200e2e29adf4c1df (diff)
Merge tag 'vfs-7.0-rc2.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs fixes from Christian Brauner: - Fix an uninitialized variable in file_getattr(). The flags_valid field wasn't initialized before calling vfs_fileattr_get(), triggering KMSAN uninit-value reports in fuse - Fix writeback wakeup and logging timeouts when DETECT_HUNG_TASK is not enabled. sysctl_hung_task_timeout_secs is 0 in that case causing spurious "waiting for writeback completion for more than 1 seconds" warnings - Fix a null-ptr-deref in do_statmount() when the mount is internal - Add missing kernel-doc description for the @private parameter in iomap_readahead() - Fix mount namespace creation to hold namespace_sem across the mount copy in create_new_namespace(). The previous drop-and-reacquire pattern was fragile and failed to clean up mount propagation links if the real rootfs was a shared or dependent mount - Fix /proc mount iteration where m->index wasn't updated when m->show() overflows, causing a restart to repeatedly show the same mount entry in a rapidly expanding mount table - Return EFSCORRUPTED instead of ENOSPC in minix_new_inode() when the inode number is out of range - Fix unshare(2) when CLONE_NEWNS is set and current->fs isn't shared. copy_mnt_ns() received the live fs_struct so if a subsequent namespace creation failed the rollback would leave pwd and root pointing to detached mounts. Always allocate a new fs_struct when CLONE_NEWNS is requested - fserror bug fixes: - Remove the unused fsnotify_sb_error() helper now that all callers have been converted to fserror_report_metadata - Fix a lockdep splat in fserror_report() where igrab() takes inode::i_lock which can be held in IRQ context. 
Replace igrab() with a direct i_count bump since filesystems should not report inodes that are about to be freed or not yet exposed - Handle error pointer in procfs for try_lookup_noperm() - Fix an integer overflow in ep_loop_check_proc() where recursive calls returning INT_MAX would overflow when +1 is added, breaking the recursion depth check - Fix a misleading break in pidfs * tag 'vfs-7.0-rc2.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: pidfs: avoid misleading break eventpoll: Fix integer overflow in ep_loop_check_proc() proc: Fix pointer error dereference fserror: fix lockdep complaint when igrabbing inode fsnotify: drop unused helper unshare: fix unshare_fs() handling minix: Correct errno in minix_new_inode namespace: fix proc mount iteration mount: hold namespace_sem across copy in create_new_namespace() iomap: Describe @private in iomap_readahead() statmount: Fix the null-ptr-deref in do_statmount() writeback: Fix wakeup and logging timeouts for !DETECT_HUNG_TASK fs: init flags_valid before calling vfs_fileattr_get
-rw-r--r--fs/eventpoll.c5
-rw-r--r--fs/file_attr.c2
-rw-r--r--fs/fs-writeback.c9
-rw-r--r--fs/iomap/buffered-io.c1
-rw-r--r--fs/iomap/ioend.c46
-rw-r--r--fs/minix/bitmap.c2
-rw-r--r--fs/namespace.c133
-rw-r--r--fs/pidfs.c10
-rw-r--r--fs/proc/base.c3
-rw-r--r--include/linux/fsnotify.h13
-rw-r--r--kernel/fork.c2
11 files changed, 139 insertions, 87 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a8c278c50083..5714e900567c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2061,7 +2061,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* @ep: the &struct eventpoll to be currently checked.
* @depth: Current depth of the path being checked.
*
- * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep.
+ * Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found
+ * a loop or went too deep.
*/
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
@@ -2080,7 +2081,7 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
struct eventpoll *ep_tovisit;
ep_tovisit = epi->ffd.file->private_data;
if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
- result = INT_MAX;
+ result = EP_MAX_NESTS+1;
else
result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
if (result > EP_MAX_NESTS)
diff --git a/fs/file_attr.c b/fs/file_attr.c
index 6d2a298a786d..da983e105d70 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -378,7 +378,7 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
struct path filepath __free(path_put) = {};
unsigned int lookup_flags = 0;
struct file_attr fattr;
- struct file_kattr fa;
+	struct file_kattr fa = {};
int error;
BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8f8069fb76ba..7c75ed7e8979 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -198,10 +198,11 @@ static void wb_queue_work(struct bdi_writeback *wb,
static bool wb_wait_for_completion_cb(struct wb_completion *done)
{
+ unsigned long timeout = sysctl_hung_task_timeout_secs;
unsigned long waited_secs = (jiffies - done->wait_start) / HZ;
done->progress_stamp = jiffies;
- if (waited_secs > sysctl_hung_task_timeout_secs)
+ if (timeout && (waited_secs > timeout))
pr_info("INFO: The task %s:%d has been waiting for writeback "
"completion for more than %lu seconds.",
current->comm, current->pid, waited_secs);
@@ -1954,6 +1955,7 @@ static long writeback_sb_inodes(struct super_block *sb,
.range_end = LLONG_MAX,
};
unsigned long start_time = jiffies;
+ unsigned long timeout = sysctl_hung_task_timeout_secs;
long write_chunk;
long total_wrote = 0; /* count both pages and inodes */
unsigned long dirtied_before = jiffies;
@@ -2040,9 +2042,8 @@ static long writeback_sb_inodes(struct super_block *sb,
__writeback_single_inode(inode, &wbc);
/* Report progress to inform the hung task detector of the progress. */
- if (work->done && work->done->progress_stamp &&
- (jiffies - work->done->progress_stamp) > HZ *
- sysctl_hung_task_timeout_secs / 2)
+ if (work->done && work->done->progress_stamp && timeout &&
+ (jiffies - work->done->progress_stamp) > HZ * timeout / 2)
wake_up_all(work->done->waitq);
wbc_detach_inode(&wbc);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index a0c46aadb97d..bc82083e420a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -624,6 +624,7 @@ static int iomap_readahead_iter(struct iomap_iter *iter,
* iomap_readahead - Attempt to read pages from a file.
* @ops: The operations vector for the filesystem.
* @ctx: The ctx used for issuing readahead.
+ * @private: The filesystem-specific information for issuing iomap_iter.
*
* This function is for filesystems to call to implement their readahead
* address_space operation.
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index e4d57cb969f1..4d1ef8a2cee9 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -69,11 +69,57 @@ static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
return folio_count;
}
+static DEFINE_SPINLOCK(failed_ioend_lock);
+static LIST_HEAD(failed_ioend_list);
+
+static void
+iomap_fail_ioends(
+ struct work_struct *work)
+{
+ struct iomap_ioend *ioend;
+ struct list_head tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&failed_ioend_lock, flags);
+ list_replace_init(&failed_ioend_list, &tmp);
+ spin_unlock_irqrestore(&failed_ioend_lock, flags);
+
+ while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
+ io_list))) {
+ list_del_init(&ioend->io_list);
+ iomap_finish_ioend_buffered(ioend);
+ cond_resched();
+ }
+}
+
+static DECLARE_WORK(failed_ioend_work, iomap_fail_ioends);
+
+static void iomap_fail_ioend_buffered(struct iomap_ioend *ioend)
+{
+ unsigned long flags;
+
+ /*
+ * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions
+ * in the fserror code. The caller no longer owns the ioend reference
+ * after the spinlock drops.
+ */
+ spin_lock_irqsave(&failed_ioend_lock, flags);
+ if (list_empty(&failed_ioend_list))
+ WARN_ON_ONCE(!schedule_work(&failed_ioend_work));
+ list_add_tail(&ioend->io_list, &failed_ioend_list);
+ spin_unlock_irqrestore(&failed_ioend_lock, flags);
+}
+
static void ioend_writeback_end_bio(struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
ioend->io_error = blk_status_to_errno(bio->bi_status);
+ if (ioend->io_error) {
+ iomap_fail_ioend_buffered(ioend);
+ return;
+ }
+
iomap_finish_ioend_buffered(ioend);
}
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 7da66ca184f4..abec438330a7 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -247,7 +247,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode)
j += i * bits_per_zone;
if (!j || j > sbi->s_ninodes) {
iput(inode);
- return ERR_PTR(-ENOSPC);
+ return ERR_PTR(-EFSCORRUPTED);
}
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = j;
diff --git a/fs/namespace.c b/fs/namespace.c
index ebe19ded293a..854f4fc66469 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1531,23 +1531,33 @@ static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_mounts *p = m->private;
+ struct mount *mnt;
down_read(&namespace_sem);
- return mnt_find_id_at(p->ns, *pos);
+ mnt = mnt_find_id_at(p->ns, *pos);
+ if (mnt)
+ *pos = mnt->mnt_id_unique;
+ return mnt;
}
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct mount *next = NULL, *mnt = v;
+ struct mount *mnt = v;
struct rb_node *node = rb_next(&mnt->mnt_node);
- ++*pos;
if (node) {
- next = node_to_mount(node);
+ struct mount *next = node_to_mount(node);
*pos = next->mnt_id_unique;
+ return next;
}
- return next;
+
+ /*
+ * No more mounts. Set pos past current mount's ID so that if
+ * iteration restarts, mnt_find_id_at() returns NULL.
+ */
+ *pos = mnt->mnt_id_unique + 1;
+ return NULL;
}
static void m_stop(struct seq_file *m, void *v)
@@ -2791,7 +2801,8 @@ static inline void unlock_mount(struct pinned_mountpoint *m)
}
static void lock_mount_exact(const struct path *path,
- struct pinned_mountpoint *mp);
+ struct pinned_mountpoint *mp, bool copy_mount,
+ unsigned int copy_flags);
#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
@@ -2799,7 +2810,10 @@ static void lock_mount_exact(const struct path *path,
#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false)
#define LOCK_MOUNT_EXACT(mp, path) \
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
- lock_mount_exact((path), &mp)
+ lock_mount_exact((path), &mp, false, 0)
+#define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \
+ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
+ lock_mount_exact((path), &mp, true, (copy_flags))
static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp)
{
@@ -3073,16 +3087,13 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
return file;
}
-DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
- if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
-
static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
{
- struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
- struct path to_path __free(path_put) = {};
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct user_namespace *user_ns = current_user_ns();
- struct mount *new_ns_root;
+ struct mnt_namespace *new_ns;
+ struct mount *new_ns_root, *old_ns_root;
+ struct path to_path;
struct mount *mnt;
unsigned int copy_flags = 0;
bool locked = false;
@@ -3094,71 +3105,63 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
if (IS_ERR(new_ns))
return ERR_CAST(new_ns);
- scoped_guard(namespace_excl) {
- new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
- if (IS_ERR(new_ns_root))
- return ERR_CAST(new_ns_root);
+ old_ns_root = ns->root;
+ to_path.mnt = &old_ns_root->mnt;
+ to_path.dentry = old_ns_root->mnt.mnt_root;
- /*
- * If the real rootfs had a locked mount on top of it somewhere
- * in the stack, lock the new mount tree as well so it can't be
- * exposed.
- */
- mnt = ns->root;
- while (mnt->overmount) {
- mnt = mnt->overmount;
- if (mnt->mnt.mnt_flags & MNT_LOCKED)
- locked = true;
- }
+ VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type);
+
+ LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags);
+ if (IS_ERR(mp.parent)) {
+ free_mnt_ns(new_ns);
+ return ERR_CAST(mp.parent);
}
+ new_ns_root = mp.parent;
/*
- * We dropped the namespace semaphore so we can actually lock
- * the copy for mounting. The copied mount isn't attached to any
- * mount namespace and it is thus excluded from any propagation.
- * So realistically we're isolated and the mount can't be
- * overmounted.
+ * If the real rootfs had a locked mount on top of it somewhere
+ * in the stack, lock the new mount tree as well so it can't be
+ * exposed.
*/
-
- /* Borrow the reference from clone_mnt(). */
- to_path.mnt = &new_ns_root->mnt;
- to_path.dentry = dget(new_ns_root->mnt.mnt_root);
-
- /* Now lock for actual mounting. */
- LOCK_MOUNT_EXACT(mp, &to_path);
- if (unlikely(IS_ERR(mp.parent)))
- return ERR_CAST(mp.parent);
+ mnt = old_ns_root;
+ while (mnt->overmount) {
+ mnt = mnt->overmount;
+ if (mnt->mnt.mnt_flags & MNT_LOCKED)
+ locked = true;
+ }
/*
- * We don't emulate unshare()ing a mount namespace. We stick to the
- * restrictions of creating detached bind-mounts. It has a lot
- * saner and simpler semantics.
+ * We don't emulate unshare()ing a mount namespace. We stick
+ * to the restrictions of creating detached bind-mounts. It
+ * has a lot saner and simpler semantics.
*/
mnt = __do_loopback(path, flags, copy_flags);
- if (IS_ERR(mnt))
- return ERR_CAST(mnt);
-
scoped_guard(mount_writer) {
+ if (IS_ERR(mnt)) {
+ emptied_ns = new_ns;
+ umount_tree(new_ns_root, 0);
+ return ERR_CAST(mnt);
+ }
+
if (locked)
mnt->mnt.mnt_flags |= MNT_LOCKED;
/*
- * Now mount the detached tree on top of the copy of the
- * real rootfs we created.
+ * now mount the detached tree on top of the copy
+ * of the real rootfs we created.
*/
attach_mnt(mnt, new_ns_root, mp.mp);
if (user_ns != ns->user_ns)
lock_mnt_tree(new_ns_root);
}
- /* Add all mounts to the new namespace. */
- for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) {
- mnt_add_to_ns(new_ns, p);
+ for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) {
+ mnt_add_to_ns(new_ns, mnt);
new_ns->nr_mounts++;
}
- new_ns->root = real_mount(no_free_ptr(to_path.mnt));
+ new_ns->root = new_ns_root;
ns_tree_add_raw(new_ns);
- return no_free_ptr(new_ns);
+ return new_ns;
}
static struct file *open_new_namespace(struct path *path, unsigned int flags)
@@ -3840,16 +3843,20 @@ static int do_new_mount(const struct path *path, const char *fstype,
}
static void lock_mount_exact(const struct path *path,
- struct pinned_mountpoint *mp)
+ struct pinned_mountpoint *mp, bool copy_mount,
+ unsigned int copy_flags)
{
struct dentry *dentry = path->dentry;
int err;
+	/* When cloning the mount, the path must be the root of that mount. */
+ VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path));
+
inode_lock(dentry->d_inode);
namespace_lock();
if (unlikely(cant_mount(dentry)))
err = -ENOENT;
- else if (path_overmounted(path))
+ else if (!copy_mount && path_overmounted(path))
err = -EBUSY;
else
err = get_mountpoint(dentry, mp);
@@ -3857,9 +3864,15 @@ static void lock_mount_exact(const struct path *path,
namespace_unlock();
inode_unlock(dentry->d_inode);
mp->parent = ERR_PTR(err);
- } else {
- mp->parent = real_mount(path->mnt);
+ return;
}
+
+ if (copy_mount)
+ mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags);
+ else
+ mp->parent = real_mount(path->mnt);
+ if (unlikely(IS_ERR(mp->parent)))
+ __unlock_mount(mp);
}
int finish_automount(struct vfsmount *__m, const struct path *path)
@@ -5678,6 +5691,8 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
s->mnt = mnt_file->f_path.mnt;
ns = real_mount(s->mnt)->mnt_ns;
+ if (IS_ERR(ns))
+ return PTR_ERR(ns);
if (!ns)
/*
* We can't set mount point and mnt_ns_id since we don't have a
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 318253344b5c..e3825ee246be 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -608,9 +608,8 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct user_namespace *user_ns;
user_ns = task_cred_xxx(task, user_ns);
- if (!ns_ref_get(user_ns))
- break;
- ns_common = to_ns_common(user_ns);
+ if (ns_ref_get(user_ns))
+ ns_common = to_ns_common(user_ns);
}
#endif
break;
@@ -620,9 +619,8 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct pid_namespace *pid_ns;
pid_ns = task_active_pid_ns(task);
- if (!ns_ref_get(pid_ns))
- break;
- ns_common = to_ns_common(pid_ns);
+ if (ns_ref_get(pid_ns))
+ ns_common = to_ns_common(pid_ns);
}
#endif
break;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4eec684baca9..4c863d17dfb4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2128,6 +2128,9 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
ino_t ino = 1;
child = try_lookup_noperm(&qname, dir);
+ if (IS_ERR(child))
+ goto end_instantiate;
+
if (!child) {
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
child = d_alloc_parallel(dir, &qname, &wq);
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 28a9cb13fbfa..079c18bcdbde 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -495,19 +495,6 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
fsnotify_dentry(dentry, mask);
}
-static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode,
- int error)
-{
- struct fs_error_report report = {
- .error = error,
- .inode = inode,
- .sb = sb,
- };
-
- return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR,
- NULL, NULL, NULL, 0);
-}
-
static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt)
{
fsnotify_mnt(FS_MNT_ATTACH, ns, mnt);
diff --git a/kernel/fork.c b/kernel/fork.c
index e832da9d15a4..65113a304518 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3085,7 +3085,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
return 0;
/* don't need lock here; in the worst case we'll do useless copy */
- if (fs->users == 1)
+ if (!(unshare_flags & CLONE_NEWNS) && fs->users == 1)
return 0;
*new_fsp = copy_fs_struct(fs);