From 3d1ea1c0aeaf7baaf0c0a3d073a49671dfd3771a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Oct 2024 16:21:14 -0400 Subject: bcachefs: kill retry_estale() in bch2_ioctl_subvolume_create() this was likely originally cribbed, and has been dead code, and Al is working on removing it from the tree. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 405cf08bda34..15725b4ce393 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -406,7 +406,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, sync_inodes_sb(c->vfs_sb); up_read(&c->vfs_sb->s_umount); } -retry: + if (arg.src_ptr) { error = user_path_at(arg.dirfd, (const char __user *)(unsigned long)arg.src_ptr, @@ -486,11 +486,6 @@ err3: err2: if (arg.src_ptr) path_put(&src_path); - - if (retry_estale(error, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } err1: return error; } -- cgit v1.2.3 From b6709dcd87ac85d56e7cd574a7b21f3a8727d942 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 2 Nov 2024 10:25:04 +0100 Subject: fs: binfmt: Fix a typo A 't' is missing in "binfm_misc". Add it. Signed-off-by: Christophe JAILLET Acked-by: Eric W. Biederman Link: https://lore.kernel.org/r/34b8c52b67934b293a67558a9a486aea7ba08951.1730539498.git.christophe.jaillet@wanadoo.fr Signed-off-by: Kees Cook --- fs/binfmt_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 31660d8cc2c6..df6a229b5e62 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -998,7 +998,7 @@ static int bm_fill_super(struct super_block *sb, struct fs_context *fc) /* * If it turns out that most user namespaces actually want to * register their own binary type handler and therefore all - * create their own separate binfm_misc mounts we should + * create their own separate binfmt_misc mounts we should * consider turning this into a kmem cache. */ misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL); -- cgit v1.2.3 From 4188fc31a95e6c7a03c5859831d48ce646b26e46 Mon Sep 17 00:00:00 2001 From: "nir@lichtman.org" Date: Sat, 2 Nov 2024 14:01:22 +0200 Subject: exec: move warning of null argv to be next to the relevant code Problem: The warning is currently printed where it is detected that the arg count is zero but the action is only taken place later in the flow even though the warning is written as if the action is taken place in the time of print This could be problematic since there could be a failure between the print and the code that takes action which would deem this warning misleading Solution: Move the warning print after the action of adding an empty string as the first argument is successful Signed-off-by: Nir Lichtman Link: https://lore.kernel.org/r/ZyYUgiPc8A8i_3FH@nirs-laptop. 
Signed-off-by: Kees Cook --- fs/exec.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 6c53920795c2..4057b8c3e233 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1907,9 +1907,6 @@ static int do_execveat_common(int fd, struct filename *filename, } retval = count(argv, MAX_ARG_STRINGS); - if (retval == 0) - pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", - current->comm, bprm->filename); if (retval < 0) goto out_free; bprm->argc = retval; @@ -1947,6 +1944,9 @@ static int do_execveat_common(int fd, struct filename *filename, if (retval < 0) goto out_free; bprm->argc = 1; + + pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", + current->comm, bprm->filename); } retval = bprm_execve(bprm); -- cgit v1.2.3 From fa1bdca98d74472dcdb79cb948b54f63b5886c04 Mon Sep 17 00:00:00 2001 From: Nir Lichtman Date: Sat, 16 Nov 2024 23:13:23 +0000 Subject: exec: remove legacy custom binfmt modules autoloading Problem: The search binary handler logic contains legacy code to handle automatically loading kernel modules of unsupported binary formats. This logic is a leftover from a.out-to-ELF transition. After removal of a.out support, this code has no use anymore. Solution: Clean up this code from the search binary handler, also remove the line initialising retval to -ENOENT and instead just return -ENOEXEC if the flow has reached the end of the func. Note: Anyone who might find future uses for this legacy code would be better off using binfmt_misc to trigger whatever module loading they might need - would be more flexible that way. Suggested-by: Alexander Viro Signed-off-by: Nir Lichtman Link: https://lore.kernel.org/r/20241116231323.GA225987@lichtman.org Signed-off-by: Kees Cook --- fs/exec.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 4057b8c3e233..e0435b31a811 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1723,13 +1723,11 @@ int remove_arg_zero(struct linux_binprm *bprm) } EXPORT_SYMBOL(remove_arg_zero); -#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) /* * cycle the list of binary formats handler, until one recognizes the image */ static int search_binary_handler(struct linux_binprm *bprm) { - bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; int retval; @@ -1741,8 +1739,6 @@ static int search_binary_handler(struct linux_binprm *bprm) if (retval) return retval; - retval = -ENOENT; - retry: read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!try_module_get(fmt->module)) @@ -1760,17 +1756,7 @@ static int search_binary_handler(struct linux_binprm *bprm) } read_unlock(&binfmt_lock); - if (need_retry) { - if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && - printable(bprm->buf[2]) && printable(bprm->buf[3])) - return retval; - if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) - return retval; - need_retry = false; - goto retry; - } - - return retval; + return -ENOEXEC; } /* binfmt handlers will call back into begin_new_exec() on success. */ -- cgit v1.2.3 From cc354fa7f0160a96896a3b8d7f298a036e3b8602 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:39 -0800 Subject: bcachefs: Explicitly select CRYPTO from BCACHEFS_FS Explicitly select CRYPTO from BCACHEFS_FS, so that this dependency of CRYPTO_SHA256, CRYPTO_CHACHA20, and CRYPTO_POLY1305 (which are also selected) is satisfied. 
Currently this dependency is satisfied indirectly via LIBCRC32C, but this is fragile and is planned to change (https://lore.kernel.org/r/20241021002935.325878-13-ebiggers@kernel.org). Acked-by: Kent Overstreet Link: https://lore.kernel.org/r/20241202010844.144356-15-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/bcachefs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index ab6c95b895b3..971ea505e7b0 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -15,6 +15,7 @@ config BCACHEFS_FS select ZLIB_INFLATE select ZSTD_COMPRESS select ZSTD_DECOMPRESS + select CRYPTO select CRYPTO_SHA256 select CRYPTO_CHACHA20 select CRYPTO_POLY1305 -- cgit v1.2.3 From f2b4fa19647e18a2e2aade7e3e4620567e7e594a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:41 -0800 Subject: ext4: switch to using the crc32c library Now that the crc32c() library function directly takes advantage of architecture-specific optimizations, it is unnecessary to go through the crypto API. Just use crc32c(). This is much simpler, and it improves performance due to eliminating the crypto API overhead. Reviewed-by: Ard Biesheuvel Reviewed-by: Darrick J. Wong Acked-by: Theodore Ts'o Link: https://lore.kernel.org/r/20241202010844.144356-17-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/ext4/Kconfig | 3 +-- fs/ext4/ext4.h | 25 +++---------------------- fs/ext4/super.c | 15 --------------- 3 files changed, 4 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index e20d59221fc0..c9ca41d91a6c 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -31,8 +31,7 @@ config EXT4_FS select BUFFER_HEAD select JBD2 select CRC16 - select CRYPTO - select CRYPTO_CRC32C + select CRC32 select FS_IOMAP select FS_ENCRYPTION_ALGS if FS_ENCRYPTION help diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 74f2071189b2..4e7de7eaa374 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include @@ -1662,9 +1662,6 @@ struct ext4_sb_info { /* record the last minlen when FITRIM is called. 
*/ unsigned long s_last_trim_minblks; - /* Reference to checksum algorithm driver via cryptoapi */ - struct crypto_shash *s_chksum_driver; - /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; @@ -2463,19 +2460,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, const void *address, unsigned int length) { - struct { - struct shash_desc shash; - char ctx[4]; - } desc; - - BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); - - desc.shash.tfm = sbi->s_chksum_driver; - *(u32 *)desc.ctx = crc; - - BUG_ON(crypto_shash_update(&desc.shash, address, length)); - - return *(u32 *)desc.ctx; + return crc32c(crc, address, length); } #ifdef __KERNEL__ @@ -3276,11 +3261,7 @@ extern int ext4_register_li_request(struct super_block *sb, static inline int ext4_has_metadata_csum(struct super_block *sb) { - WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && - !EXT4_SB(sb)->s_chksum_driver); - - return ext4_has_feature_metadata_csum(sb) && - (EXT4_SB(sb)->s_chksum_driver != NULL); + return ext4_has_feature_metadata_csum(sb); } static inline int ext4_has_group_desc_csum(struct super_block *sb) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 785809f33ff4..fdf4817a7dbc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1380,8 +1380,6 @@ static void ext4_put_super(struct super_block *sb) */ kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); - if (sbi->s_chksum_driver) - crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); @@ -4634,15 +4632,6 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, ext4_orphan_file_block_trigger); - /* Load the checksum driver */ - sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(sbi->s_chksum_driver)) { - int ret = PTR_ERR(sbi->s_chksum_driver); - ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); - sbi->s_chksum_driver = NULL; - return ret; - } - /* Check superblock checksum */ if (!ext4_superblock_csum_verify(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " @@ -5687,9 +5676,6 @@ failed_mount3: del_timer_sync(&sbi->s_err_report); ext4_group_desc_free(sbi); failed_mount: - if (sbi->s_chksum_driver) - crypto_free_shash(sbi->s_chksum_driver); - #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif @@ -7494,6 +7480,5 @@ static void __exit ext4_exit_fs(void) MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); -MODULE_SOFTDEP("pre: crc32c"); module_init(ext4_init_fs) module_exit(ext4_exit_fs) -- cgit v1.2.3 From dd348f054b24a3f57cbcdc2c8e7ebc22c62eb72f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:42 -0800 Subject: jbd2: switch to using the crc32c library Now that the crc32c() library function directly takes advantage of architecture-specific optimizations, it is unnecessary to go through the crypto API. Just use crc32c(). This is much simpler, and it improves performance due to eliminating the crypto API overhead. Reviewed-by: Ard Biesheuvel Reviewed-by: Darrick J. 
Wong Acked-by: Theodore Ts'o Link: https://lore.kernel.org/r/20241202010844.144356-18-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/jbd2/Kconfig | 2 -- fs/jbd2/journal.c | 30 +++--------------------------- include/linux/jbd2.h | 33 +++------------------------------ 3 files changed, 6 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig index 4ad2c67f93f1..9c19e1512101 100644 --- a/fs/jbd2/Kconfig +++ b/fs/jbd2/Kconfig @@ -2,8 +2,6 @@ config JBD2 tristate select CRC32 - select CRYPTO - select CRYPTO_CRC32C help This is a generic journaling layer for block devices that support both 32-bit and 64-bit block numbers. It is currently used by diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 7e49d912b091..d8084b31b361 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1369,20 +1369,12 @@ static int journal_check_superblock(journal_t *journal) return err; } - /* Load the checksum driver */ if (jbd2_journal_has_csum_v2or3_feature(journal)) { if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { printk(KERN_ERR "JBD2: Unknown checksum type\n"); return err; } - journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); - err = PTR_ERR(journal->j_chksum_driver); - journal->j_chksum_driver = NULL; - return err; - } /* Check superblock checksum */ if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { printk(KERN_ERR "JBD2: journal checksum error\n"); @@ -1608,8 +1600,6 @@ static journal_t *journal_init_common(struct block_device *bdev, err_cleanup: percpu_counter_destroy(&journal->j_checkpoint_jh_count); - if (journal->j_chksum_driver) - crypto_free_shash(journal->j_chksum_driver); kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); journal_fail_superblock(journal); @@ -2191,8 +2181,6 @@ int jbd2_journal_destroy(journal_t *journal) iput(journal->j_inode); if (journal->j_revoke) jbd2_journal_destroy_revoke(journal); - if (journal->j_chksum_driver) - crypto_free_shash(journal->j_chksum_driver); kfree(journal->j_fc_wbuf); kfree(journal->j_wbuf); kfree(journal); @@ -2337,27 +2325,15 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, } } - /* Load the checksum driver if necessary */ - if ((journal->j_chksum_driver == NULL) && - INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { - journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); - journal->j_chksum_driver = NULL; - return 0; - } - /* Precompute checksum seed for all metadata */ - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, - sizeof(sb->s_uuid)); - } - lock_buffer(journal->j_sb_buffer); - /* If enabling v3 checksums, update superblock */ + /* If enabling v3 checksums, update superblock and precompute seed */ if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); } /* If enabling v1 checksums, downgrade superblock */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 50f7ea8714bf..561025b4f3d9 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #endif #define journal_oom_retry 1 @@ -1241,13 +1241,6 @@ struct journal_s */ void *j_private; - /** - * 
@j_chksum_driver: - * - * Reference to checksum algorithm driver via cryptoapi. - */ - struct crypto_shash *j_chksum_driver; - /** * @j_csum_seed: * @@ -1750,10 +1743,7 @@ static inline bool jbd2_journal_has_csum_v2or3_feature(journal_t *j) static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) { - WARN_ON_ONCE(jbd2_journal_has_csum_v2or3_feature(journal) && - journal->j_chksum_driver == NULL); - - return journal->j_chksum_driver != NULL; + return jbd2_journal_has_csum_v2or3_feature(journal); } static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) @@ -1790,27 +1780,10 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal) #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 -/* JBD uses a CRC32 checksum */ -#define JBD_MAX_CHECKSUM_SIZE 4 - static inline u32 jbd2_chksum(journal_t *journal, u32 crc, const void *address, unsigned int length) { - DEFINE_RAW_FLEX(struct shash_desc, desc, __ctx, - DIV_ROUND_UP(JBD_MAX_CHECKSUM_SIZE, - sizeof(*((struct shash_desc *)0)->__ctx))); - int err; - - BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) > - JBD_MAX_CHECKSUM_SIZE); - - desc->tfm = journal->j_chksum_driver; - *(u32 *)desc->__ctx = crc; - - err = crypto_shash_update(desc, address, length); - BUG_ON(err); - - return *(u32 *)desc->__ctx; + return crc32c(crc, address, length); } /* Return most recent uncommitted transaction */ -- cgit v1.2.3 From 3ca4bec40ee211cd5d907c8e98b66c92f6a1b1e1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:43 -0800 Subject: f2fs: switch to using the crc32 library Now that the crc32() library function takes advantage of architecture-specific optimizations, it is unnecessary to go through the crypto API. Just use crc32(). This is much simpler, and it improves performance due to eliminating the crypto API overhead. 
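[Editor's sketch, not part of any patch in this log: the before/after shape of the checksum helpers these ext4, jbd2 and f2fs conversions touch. example_csum_shash() and example_csum() are hypothetical names; the "before" half mirrors the crypto_shash pattern the hunks remove.]

#include <linux/crc32.h>
#include <crypto/hash.h>

/* Before: every checksum bounces through a crypto_shash transform that the
 * filesystem had to allocate up front with crypto_alloc_shash("crc32", 0, 0). */
static u32 example_csum_shash(struct crypto_shash *tfm, u32 crc,
			      const void *buf, unsigned int len)
{
	struct {
		struct shash_desc shash;
		char ctx[4];
	} desc;

	desc.shash.tfm = tfm;
	*(u32 *)desc.ctx = crc;
	BUG_ON(crypto_shash_update(&desc.shash, buf, len));
	return *(u32 *)desc.ctx;
}

/* After: a direct library call; no transform allocation, descriptor setup or
 * crypto module dependency. */
static u32 example_csum(u32 crc, const void *buf, unsigned int len)
{
	return crc32(crc, buf, len);
}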
Reviewed-by: Ard Biesheuvel Acked-by: Chao Yu Link: https://lore.kernel.org/r/20241202010844.144356-19-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/f2fs/Kconfig | 3 +-- fs/f2fs/f2fs.h | 20 +------------------- fs/f2fs/super.c | 15 --------------- 3 files changed, 2 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 68a1e23e1557..5916a02fb46d 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -4,8 +4,7 @@ config F2FS_FS depends on BLOCK select BUFFER_HEAD select NLS - select CRYPTO - select CRYPTO_CRC32 + select CRC32 select F2FS_FS_XATTR if FS_ENCRYPTION select FS_ENCRYPTION_ALGS if FS_ENCRYPTION select FS_IOMAP diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6f2cbf4c5740..fd7e9cb6f893 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -1768,9 +1767,6 @@ struct f2fs_sb_info { u64 sectors_written_start; u64 kbytes_written; - /* Reference to checksum algorithm driver via cryptoapi */ - struct crypto_shash *s_chksum_driver; - /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_chksum_seed; @@ -1948,21 +1944,7 @@ static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi, static inline u32 __f2fs_crc32(struct f2fs_sb_info *sbi, u32 crc, const void *address, unsigned int length) { - struct { - struct shash_desc shash; - char ctx[4]; - } desc; - int err; - - BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); - - desc.shash.tfm = sbi->s_chksum_driver; - *(u32 *)desc.ctx = crc; - - err = crypto_shash_update(&desc.shash, address, length); - BUG_ON(err); - - return *(u32 *)desc.ctx; + return crc32(crc, address, length); } static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fc7d463dee15..19b67828ae32 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1694,8 +1694,6 @@ static void f2fs_put_super(struct super_block *sb) kvfree(sbi->ckpt); - if (sbi->s_chksum_driver) - crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); f2fs_destroy_page_array_cache(sbi); @@ -4466,15 +4464,6 @@ try_onemore: } mutex_init(&sbi->flush_lock); - /* Load the checksum driver */ - sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); - if (IS_ERR(sbi->s_chksum_driver)) { - f2fs_err(sbi, "Cannot load crc32 driver."); - err = PTR_ERR(sbi->s_chksum_driver); - sbi->s_chksum_driver = NULL; - goto free_sbi; - } - /* set a block size */ if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { f2fs_err(sbi, "unable to set blocksize"); @@ -4919,8 +4908,6 @@ free_options: free_sb_buf: kfree(raw_super); free_sbi: - if (sbi->s_chksum_driver) - crypto_free_shash(sbi->s_chksum_driver); kfree(sbi); sb->s_fs_info = NULL; @@ -5127,5 +5114,3 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); -MODULE_SOFTDEP("pre: crc32"); - -- cgit v1.2.3 From c9136fad4c08288be26aaff3e63d634545b32a85 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 8 Nov 2024 17:28:39 -0800 Subject: proc/kcore: mark proc entry as permanent drgn reads from /proc/kcore to debug the running kernel. For many drgn scripts, /proc/kcore is actually a bottleneck. use_pde() and unuse_pde() in prog_reg_read() show up hot in profiles. Since the entry for /proc/kcore can never be removed, this is useless overhead that can be trivially avoided by marking the entry as permanent. 
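[Editor's sketch of the mechanism, not taken from this patch: a proc entry that is created once and never removed can set PROC_ENTRY_PERMANENT in its proc_ops, which lets every open/read/close skip the use_pde()/unuse_pde() reference counting. All "example" identifiers below are hypothetical.]

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
	seq_puts(m, "example\n");
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, NULL);
}

static const struct proc_ops example_proc_ops = {
	.proc_flags	= PROC_ENTRY_PERMANENT,	/* entry is never removed */
	.proc_open	= example_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
};

static int __init example_proc_init(void)
{
	/* Registered once at boot; a permanent entry must never be proc_remove()d. */
	return proc_create("example", 0444, NULL, &example_proc_ops) ? 0 : -ENOMEM;
}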
In my benchmark, this reduces the time per read by about 20 nanoseconds, from 235 nanoseconds per read to 215. Link: https://github.com/osandov/drgn/issues/106 Signed-off-by: Omar Sandoval Link: https://lore.kernel.org/r/60873e6afcfda3f08d0456f19e4733612afcf134.1731115587.git.osandov@fb.com Signed-off-by: Christian Brauner --- fs/proc/kcore.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e376f48c4b8b..fba609a19ec4 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -663,6 +663,7 @@ static int release_kcore(struct inode *inode, struct file *file) } static const struct proc_ops kcore_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_read_iter = read_kcore_iter, .proc_open = open_kcore, .proc_release = release_kcore, -- cgit v1.2.3 From 680e029fd62f7d9d8373788635f52c3de358d18d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 8 Nov 2024 17:28:40 -0800 Subject: proc/kcore: don't walk list on every read We maintain a list of memory ranges for /proc/kcore, which usually has 10-20 entries. Currently, every single read from /proc/kcore walks the entire list in order to count the number of entries and compute some offsets. These values only change when the list of memory ranges changes, which is very rare (only when memory is hot(un)plugged). We can cache the values when the list is populated to avoid these redundant walks. In my benchmark, this reduces the time per read by another 20 nanoseconds on top of the previous change, from 215 nanoseconds per read to 195. Link: https://github.com/osandov/drgn/issues/106 Signed-off-by: Omar Sandoval Link: https://lore.kernel.org/r/8d945558b9c9efe74103a34b7780f1cd90d9ce7f.1731115587.git.osandov@fb.com Signed-off-by: Christian Brauner --- fs/proc/kcore.c | 70 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index fba609a19ec4..3070cf3e2f5c 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -65,6 +65,10 @@ static inline void kc_unxlate_dev_mem_ptr(phys_addr_t phys, void *virt) #endif static LIST_HEAD(kclist_head); +static int kcore_nphdr; +static size_t kcore_phdrs_len; +static size_t kcore_notes_len; +static size_t kcore_data_offset; static DECLARE_RWSEM(kclist_lock); static int kcore_need_update = 1; @@ -101,33 +105,32 @@ void __init kclist_add(struct kcore_list *new, void *addr, size_t size, list_add_tail(&new->list, &kclist_head); } -static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len, - size_t *data_offset) +static void update_kcore_size(void) { size_t try, size; struct kcore_list *m; - *nphdr = 1; /* PT_NOTE */ + kcore_nphdr = 1; /* PT_NOTE */ size = 0; list_for_each_entry(m, &kclist_head, list) { try = kc_vaddr_to_offset((size_t)m->addr + m->size); if (try > size) size = try; - *nphdr = *nphdr + 1; + kcore_nphdr++; } - *phdrs_len = *nphdr * sizeof(struct elf_phdr); - *notes_len = (4 * sizeof(struct elf_note) + - 3 * ALIGN(sizeof(CORE_STR), 4) + - VMCOREINFO_NOTE_NAME_BYTES + - ALIGN(sizeof(struct elf_prstatus), 4) + - ALIGN(sizeof(struct elf_prpsinfo), 4) + - ALIGN(arch_task_struct_size, 4) + - ALIGN(vmcoreinfo_size, 4)); - *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len + - *notes_len); - return *data_offset + size; + kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr); + kcore_notes_len = (4 * sizeof(struct elf_note) + + 3 * ALIGN(sizeof(CORE_STR), 4) + + VMCOREINFO_NOTE_NAME_BYTES + + ALIGN(sizeof(struct 
elf_prstatus), 4) + + ALIGN(sizeof(struct elf_prpsinfo), 4) + + ALIGN(arch_task_struct_size, 4) + + ALIGN(vmcoreinfo_size, 4)); + kcore_data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + kcore_phdrs_len + + kcore_notes_len); + proc_root_kcore->size = kcore_data_offset + size; } #ifdef CONFIG_HIGHMEM @@ -270,8 +273,6 @@ static int kcore_update_ram(void) { LIST_HEAD(list); LIST_HEAD(garbage); - int nphdr; - size_t phdrs_len, notes_len, data_offset; struct kcore_list *tmp, *pos; int ret = 0; @@ -293,8 +294,7 @@ static int kcore_update_ram(void) } list_splice_tail(&list, &kclist_head); - proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, ¬es_len, - &data_offset); + update_kcore_size(); out: up_write(&kclist_lock); @@ -326,12 +326,10 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; char *buf = file->private_data; loff_t *fpos = &iocb->ki_pos; - size_t phdrs_offset, notes_offset, data_offset; + size_t phdrs_offset, notes_offset; size_t page_offline_frozen = 1; - size_t phdrs_len, notes_len; struct kcore_list *m; size_t tsz; - int nphdr; unsigned long start; size_t buflen = iov_iter_count(iter); size_t orig_buflen = buflen; @@ -344,9 +342,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) */ page_offline_freeze(); - get_kcore_size(&nphdr, &phdrs_len, ¬es_len, &data_offset); phdrs_offset = sizeof(struct elfhdr); - notes_offset = phdrs_offset + phdrs_len; + notes_offset = phdrs_offset + kcore_phdrs_len; /* ELF file header. */ if (buflen && *fpos < sizeof(struct elfhdr)) { @@ -368,7 +365,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) .e_flags = ELF_CORE_EFLAGS, .e_ehsize = sizeof(struct elfhdr), .e_phentsize = sizeof(struct elf_phdr), - .e_phnum = nphdr, + .e_phnum = kcore_nphdr, }; tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos); @@ -382,10 +379,10 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) } /* ELF program headers. */ - if (buflen && *fpos < phdrs_offset + phdrs_len) { + if (buflen && *fpos < phdrs_offset + kcore_phdrs_len) { struct elf_phdr *phdrs, *phdr; - phdrs = kzalloc(phdrs_len, GFP_KERNEL); + phdrs = kzalloc(kcore_phdrs_len, GFP_KERNEL); if (!phdrs) { ret = -ENOMEM; goto out; @@ -393,13 +390,14 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) phdrs[0].p_type = PT_NOTE; phdrs[0].p_offset = notes_offset; - phdrs[0].p_filesz = notes_len; + phdrs[0].p_filesz = kcore_notes_len; phdr = &phdrs[1]; list_for_each_entry(m, &kclist_head, list) { phdr->p_type = PT_LOAD; phdr->p_flags = PF_R | PF_W | PF_X; - phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset; + phdr->p_offset = kc_vaddr_to_offset(m->addr) + + kcore_data_offset; phdr->p_vaddr = (size_t)m->addr; if (m->type == KCORE_RAM) phdr->p_paddr = __pa(m->addr); @@ -412,7 +410,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) phdr++; } - tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos); + tsz = min_t(size_t, buflen, + phdrs_offset + kcore_phdrs_len - *fpos); if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz, iter) != tsz) { kfree(phdrs); @@ -426,7 +425,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) } /* ELF note segment. 
*/ - if (buflen && *fpos < notes_offset + notes_len) { + if (buflen && *fpos < notes_offset + kcore_notes_len) { struct elf_prstatus prstatus = {}; struct elf_prpsinfo prpsinfo = { .pr_sname = 'R', @@ -438,7 +437,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) strscpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs)); - notes = kzalloc(notes_len, GFP_KERNEL); + notes = kzalloc(kcore_notes_len, GFP_KERNEL); if (!notes) { ret = -ENOMEM; goto out; @@ -459,9 +458,10 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) */ append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - min(vmcoreinfo_size, notes_len - i)); + min(vmcoreinfo_size, kcore_notes_len - i)); - tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos); + tsz = min_t(size_t, buflen, + notes_offset + kcore_notes_len - *fpos); if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) { kfree(notes); ret = -EFAULT; @@ -477,7 +477,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) * Check to see if our file offset matches with any of * the addresses in the elf_phdr on our list. */ - start = kc_offset_to_vaddr(*fpos - data_offset); + start = kc_offset_to_vaddr(*fpos - kcore_data_offset); if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) tsz = buflen; -- cgit v1.2.3 From 605291e2210130957e8a17a466f3f21c4fe0adef Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 8 Nov 2024 17:28:41 -0800 Subject: proc/kcore: use percpu_rw_semaphore for kclist_lock The list of memory ranges for /proc/kcore is protected by a rw_semaphore. We lock it for reading on every read from /proc/kcore. This is very heavy, especially since it is rarely locked for writing. Since we want to strongly favor read lock performance, convert it to a percpu_rw_semaphore. I also experimented with percpu_ref and SRCU, but this change was the simplest and the fastest. In my benchmark, this reduces the time per read by yet another 20 nanoseconds on top of the previous two changes, from 195 nanoseconds per read to 175. Link: https://github.com/osandov/drgn/issues/106 Signed-off-by: Omar Sandoval Link: https://lore.kernel.org/r/83a3b235b4bcc3b8aef7c533e0657f4d7d5d35ae.1731115587.git.osandov@fb.com Signed-off-by: Christian Brauner --- fs/proc/kcore.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 3070cf3e2f5c..1cb33771bf9f 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -69,7 +69,7 @@ static int kcore_nphdr; static size_t kcore_phdrs_len; static size_t kcore_notes_len; static size_t kcore_data_offset; -static DECLARE_RWSEM(kclist_lock); +DEFINE_STATIC_PERCPU_RWSEM(kclist_lock); static int kcore_need_update = 1; /* @@ -276,7 +276,7 @@ static int kcore_update_ram(void) struct kcore_list *tmp, *pos; int ret = 0; - down_write(&kclist_lock); + percpu_down_write(&kclist_lock); if (!xchg(&kcore_need_update, 0)) goto out; @@ -297,7 +297,7 @@ static int kcore_update_ram(void) update_kcore_size(); out: - up_write(&kclist_lock); + percpu_up_write(&kclist_lock); list_for_each_entry_safe(pos, tmp, &garbage, list) { list_del(&pos->list); kfree(pos); @@ -335,7 +335,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) size_t orig_buflen = buflen; int ret = 0; - down_read(&kclist_lock); + percpu_down_read(&kclist_lock); /* * Don't race against drivers that set PageOffline() and expect no * further page access. 
@@ -626,7 +626,7 @@ skip: out: page_offline_thaw(); - up_read(&kclist_lock); + percpu_up_read(&kclist_lock); if (ret) return ret; return orig_buflen - buflen; -- cgit v1.2.3 From a48bdf80ce6938f8c1de6a56fed7c4f6f46904e9 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 16 Nov 2024 07:41:28 +0100 Subject: fs: delay sysctl_nr_open check in expand_files() Suppose a thread sharing the table started a resize, while sysctl_nr_open got lowered to a value which prohibits it. This is still going to go through with and without the patch, which is fine. Further suppose another thread shows up to do a matching expansion while resize_in_progress == true. It is going to error out since it performs the sysctl_nr_open check *before* finding out if there is an expansion in progress. But the aformentioned thread is going to succeded, so the error is spurious (and it would not happen if the thread showed up a little bit later). Checking the sysctl *after* we know there are no pending updates sorts it out. While here annotate the thing as unlikely. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241116064128.280870-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index fb1011cf6b4a..019fb9acf91b 100644 --- a/fs/file.c +++ b/fs/file.c @@ -278,10 +278,6 @@ repeat: if (nr < fdt->max_fds) return 0; - /* Can we expand? */ - if (nr >= sysctl_nr_open) - return -EMFILE; - if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); wait_event(files->resize_wait, !files->resize_in_progress); @@ -289,6 +285,10 @@ repeat: goto repeat; } + /* Can we expand? */ + if (unlikely(nr >= sysctl_nr_open)) + return -EMFILE; + /* All good, so we try */ files->resize_in_progress = true; error = expand_fdtable(files, nr); -- cgit v1.2.3 From 0a670e151a71434765de69590944e18c08ee08cf Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:09:57 +0100 Subject: tree-wide: s/override_creds()/override_creds_light(get_new_cred())/g Convert all callers from override_creds() to override_creds_light(get_new_cred()) in preparation of making override_creds() not take a separate reference at all. 
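[Editor's sketch of this first, purely mechanical conversion step, using a hypothetical example_override() helper. Behaviour is unchanged: the reference that override_creds() used to take internally is now taken explicitly by the caller.]

#include <linux/cred.h>

static void example_override(const struct cred *new_cred)
{
	const struct cred *old;

	/* Before this patch: override_creds() takes its own reference on new_cred. */
	old = override_creds(new_cred);
	/* ... work with new_cred as the subjective credentials ... */
	revert_creds(old);

	/* After this patch: the caller takes the reference explicitly, preparing
	 * for override_creds() to stop taking one at all. */
	old = override_creds_light(get_new_cred(new_cred));
	/* ... work with new_cred as the subjective credentials ... */
	revert_creds(old);
}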
Link: https://lore.kernel.org/r/20241125-work-cred-v2-1-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- drivers/base/firmware_loader/main.c | 2 +- drivers/crypto/ccp/sev-dev.c | 2 +- drivers/target/target_core_configfs.c | 2 +- fs/aio.c | 2 +- fs/binfmt_misc.c | 2 +- fs/cachefiles/internal.h | 2 +- fs/coredump.c | 2 +- fs/nfs/localio.c | 4 ++-- fs/nfs/nfs4idmap.c | 2 +- fs/nfsd/auth.c | 2 +- fs/nfsd/nfs4recover.c | 2 +- fs/nfsd/nfsfh.c | 2 +- fs/open.c | 2 +- fs/overlayfs/copy_up.c | 2 +- fs/smb/client/cifs_spnego.c | 2 +- fs/smb/client/cifsacl.c | 4 ++-- fs/smb/server/smb_common.c | 2 +- include/linux/cred.h | 5 +++-- io_uring/io_uring.c | 2 +- io_uring/sqpoll.c | 2 +- kernel/acct.c | 2 +- kernel/cgroup/cgroup.c | 2 +- kernel/trace/trace_events_user.c | 2 +- net/dns_resolver/dns_query.c | 2 +- 24 files changed, 28 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index cb0912ea3e62..75eb6fd75927 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -911,7 +911,7 @@ _request_firmware(const struct firmware **firmware_p, const char *name, ret = -ENOMEM; goto out; } - old_cred = override_creds(kern_cred); + old_cred = override_creds_light(get_new_cred(kern_cred)); ret = fw_get_filesystem_firmware(device, fw->priv, "", NULL); diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index af018afd9cd7..2ad6e41af085 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -244,7 +244,7 @@ static struct file *open_file_as_root(const char *filename, int flags, umode_t m if (!cred) return ERR_PTR(-ENOMEM); cred->fsuid = GLOBAL_ROOT_UID; - old_cred = override_creds(cred); + old_cred = override_creds_light(get_new_cred(cred)); fp = file_open_root(&root, filename, flags, mode); path_put(&root); diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index c40217f44b1b..be98d16b2c57 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -3756,7 +3756,7 @@ static int __init target_core_init_configfs(void) ret = -ENOMEM; goto out; } - old_cred = override_creds(kern_cred); + old_cred = override_creds_light(get_new_cred(kern_cred)); target_init_dbroot(); revert_creds(old_cred); put_cred(kern_cred); diff --git a/fs/aio.c b/fs/aio.c index 50671640b588..a52fe2e999e7 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1639,7 +1639,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, static void aio_fsync_work(struct work_struct *work) { struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); - const struct cred *old_cred = override_creds(iocb->fsync.creds); + const struct cred *old_cred = override_creds_light(get_new_cred(iocb->fsync.creds)); iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); revert_creds(old_cred); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 6a3a16f91051..21eb501ae03d 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -829,7 +829,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, * didn't matter much as only a privileged process could open * the register file. 
*/ - old_cred = override_creds(file->f_cred); + old_cred = override_creds_light(get_new_cred(file->f_cred)); f = open_exec(e->interpreter); revert_creds(old_cred); if (IS_ERR(f)) { diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 7b99bd98de75..b156cc2e0e63 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -393,7 +393,7 @@ extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, const struct cred **_saved_cred) { - *_saved_cred = override_creds(cache->cache_cred); + *_saved_cred = override_creds_light(get_new_cred(cache->cache_cred)); } static inline void cachefiles_end_secure(struct cachefiles_cache *cache, diff --git a/fs/coredump.c b/fs/coredump.c index d48edb37bc35..b6aae41b80d2 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -576,7 +576,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) if (retval < 0) goto fail_creds; - old_cred = override_creds(cred); + old_cred = override_creds_light(get_new_cred(cred)); ispipe = format_corename(&cn, &cprm, &argv, &argc); diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 4b8618cf114c..98393c2efb75 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -374,7 +374,7 @@ static void nfs_local_call_read(struct work_struct *work) struct iov_iter iter; ssize_t status; - save_cred = override_creds(filp->f_cred); + save_cred = override_creds_light(get_new_cred(filp->f_cred)); nfs_local_iter_init(&iter, iocb, READ); @@ -545,7 +545,7 @@ static void nfs_local_call_write(struct work_struct *work) ssize_t status; current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; - save_cred = override_creds(filp->f_cred); + save_cred = override_creds_light(get_new_cred(filp->f_cred)); nfs_local_iter_init(&iter, iocb, WRITE); diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 25a7c771cfd8..b9442f70271d 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -311,7 +311,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, const struct user_key_payload *payload; ssize_t ret; - saved_cred = override_creds(id_resolver_cache); + saved_cred = override_creds_light(get_new_cred(id_resolver_cache)); rkey = nfs_idmap_request_key(name, namelen, type, idmap); revert_creds(saved_cred); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 93e33d1ee891..614a5ec4824b 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -79,7 +79,7 @@ int nfsd_setuser(struct svc_cred *cred, struct svc_export *exp) else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); - put_cred(override_creds(new)); + put_cred(override_creds_light(get_new_cred(new))); put_cred(new); return 0; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 4a765555bf84..886d03953d7a 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -81,7 +81,7 @@ nfs4_save_creds(const struct cred **original_creds) new->fsuid = GLOBAL_ROOT_UID; new->fsgid = GLOBAL_ROOT_GID; - *original_creds = override_creds(new); + *original_creds = override_creds_light(get_new_cred(new)); put_cred(new); return 0; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 6a831cb242df..4819364190d3 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -221,7 +221,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); - put_cred(override_creds(new)); + put_cred(override_creds_light(get_new_cred(new))); put_cred(new); } else { error = 
nfsd_setuser_and_check_port(rqstp, cred, exp); diff --git a/fs/open.c b/fs/open.c index e6911101fe71..2459cd061f47 100644 --- a/fs/open.c +++ b/fs/open.c @@ -448,7 +448,7 @@ static const struct cred *access_override_creds(void) */ override_cred->non_rcu = 1; - old_cred = override_creds(override_cred); + old_cred = override_creds_light(get_new_cred(override_cred)); /* override_cred() gets its own ref */ put_cred(override_cred); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 3601ddfeddc2..527b041213c8 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -741,7 +741,7 @@ static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc) return err; if (cc->new) - cc->old = override_creds(cc->new); + cc->old = override_creds_light(get_new_cred(cc->new)); return 0; } diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index 28f568b5fc27..721d8b1254b6 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -173,7 +173,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, } cifs_dbg(FYI, "key description = %s\n", description); - saved_cred = override_creds(spnego_cred); + saved_cred = override_creds_light(get_new_cred(spnego_cred)); spnego_key = request_key(&cifs_spnego_key_type, description, ""); revert_creds(saved_cred); diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index ba79aa2107cc..b1ea4ea3de4b 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -292,7 +292,7 @@ id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid) return -EINVAL; rc = 0; - saved_cred = override_creds(root_cred); + saved_cred = override_creds_light(get_new_cred(root_cred)); sidkey = request_key(&cifs_idmap_key_type, desc, ""); if (IS_ERR(sidkey)) { rc = -EINVAL; @@ -398,7 +398,7 @@ try_upcall_to_get_id: if (!sidstr) return -ENOMEM; - saved_cred = override_creds(root_cred); + saved_cred = override_creds_light(get_new_cred(root_cred)); sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); if (IS_ERR(sidkey)) { cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n", diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 4e6f169fcf83..5662f2651b8e 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -780,7 +780,7 @@ int __ksmbd_override_fsids(struct ksmbd_work *work, cred->cap_effective = cap_drop_fs_set(cred->cap_effective); WARN_ON(work->saved_cred); - work->saved_cred = override_creds(cred); + work->saved_cred = override_creds_light(get_new_cred(cred)); if (!work->saved_cred) { abort_creds(cred); return -EINVAL; diff --git a/include/linux/cred.h b/include/linux/cred.h index e4a3155fe409..b0bc1fea9ca0 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -211,9 +211,10 @@ static inline struct cred *get_new_cred_many(struct cred *cred, int nr) * Get a reference on the specified set of new credentials. The caller must * release the reference. 
*/ -static inline struct cred *get_new_cred(struct cred *cred) +static inline struct cred *get_new_cred(const struct cred *cred) { - return get_new_cred_many(cred, 1); + struct cred *nonconst_cred = (struct cred *) cred; + return get_new_cred_many(nonconst_cred, 1); } /** diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 06ff41484e29..0b25ddea943e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1728,7 +1728,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return -EBADF; if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) - creds = override_creds(req->creds); + creds = override_creds_light(get_new_cred(req->creds)); if (!def->audit_skip) audit_uring_entry(req->opcode); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 6df5e649c413..58a76d581895 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -174,7 +174,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) const struct cred *creds = NULL; if (ctx->sq_creds != current_cred()) - creds = override_creds(ctx->sq_creds); + creds = override_creds_light(get_new_cred(ctx->sq_creds)); mutex_lock(&ctx->uring_lock); if (!wq_list_empty(&ctx->iopoll_list)) diff --git a/kernel/acct.c b/kernel/acct.c index 179848ad33e9..8f18eb02dd41 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -501,7 +501,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) flim = rlimit(RLIMIT_FSIZE); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds(file->f_cred); + orig_cred = override_creds_light(get_new_cred(file->f_cred)); /* * First check to see if there is enough free_space to continue diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9061bd55436..97329b4fe502 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5216,7 +5216,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, * permissions using the credentials from file open to protect against * inherited fd attacks. 
*/ - saved_cred = override_creds(of->file->f_cred); + saved_cred = override_creds_light(get_new_cred(of->file->f_cred)); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, threadgroup, ctx->ns); diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 17bcad8f79de..4dd7c45d227e 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1469,7 +1469,7 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) */ cred->fsuid = GLOBAL_ROOT_UID; - old_cred = override_creds(cred); + old_cred = override_creds_light(get_new_cred(cred)); if (visible) ret = trace_add_event_call(&user->call); diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 82b084cc1cc6..a54f5f841cea 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -124,7 +124,7 @@ int dns_query(struct net *net, /* make the upcall, using special credentials to prevent the use of * add_key() to preinstall malicious redirections */ - saved_cred = override_creds(dns_resolver_cache); + saved_cred = override_creds_light(get_new_cred(dns_resolver_cache)); rkey = request_key_net(&key_type_dns_resolver, desc, net, options); revert_creds(saved_cred); kfree(desc); -- cgit v1.2.3 From f905e00904cc5899c89897b93bebfcf6656f608e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:09:59 +0100 Subject: tree-wide: s/revert_creds()/put_cred(revert_creds_light())/g Convert all calls to revert_creds() over to explicitly dropping reference counts in preparation for converting revert_creds() to revert_creds_light() semantics. Link: https://lore.kernel.org/r/20241125-work-cred-v2-3-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- drivers/base/firmware_loader/main.c | 2 +- drivers/crypto/ccp/sev-dev.c | 2 +- drivers/target/target_core_configfs.c | 2 +- fs/aio.c | 2 +- fs/binfmt_misc.c | 2 +- fs/cachefiles/internal.h | 2 +- fs/coredump.c | 2 +- fs/nfs/localio.c | 4 ++-- fs/nfs/nfs4idmap.c | 2 +- fs/nfsd/auth.c | 2 +- fs/nfsd/filecache.c | 2 +- fs/nfsd/nfs4recover.c | 2 +- fs/open.c | 2 +- fs/overlayfs/copy_up.c | 2 +- fs/smb/client/cifs_spnego.c | 2 +- fs/smb/client/cifsacl.c | 4 ++-- fs/smb/server/smb_common.c | 2 +- io_uring/io_uring.c | 2 +- io_uring/sqpoll.c | 2 +- kernel/acct.c | 2 +- kernel/cgroup/cgroup.c | 2 +- kernel/trace/trace_events_user.c | 2 +- net/dns_resolver/dns_query.c | 2 +- 23 files changed, 25 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 75eb6fd75927..b5677e173f91 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -943,7 +943,7 @@ _request_firmware(const struct firmware **firmware_p, const char *name, } else ret = assign_fw(fw, device); - revert_creds(old_cred); + put_cred(revert_creds_light(old_cred)); put_cred(kern_cred); out: diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 2ad6e41af085..9111a51d53e0 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -249,7 +249,7 @@ static struct file *open_file_as_root(const char *filename, int flags, umode_t m fp = file_open_root(&root, filename, flags, mode); path_put(&root); - revert_creds(old_cred); + put_cred(revert_creds_light(old_cred)); return fp; } diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index 
be98d16b2c57..564bc71d2d09 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -3758,7 +3758,7 @@ static int __init target_core_init_configfs(void) } old_cred = override_creds_light(get_new_cred(kern_cred)); target_init_dbroot(); - revert_creds(old_cred); + put_cred(revert_creds_light(old_cred)); put_cred(kern_cred); return 0; diff --git a/fs/aio.c b/fs/aio.c index a52fe2e999e7..6b987c48b671 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1642,7 +1642,7 @@ static void aio_fsync_work(struct work_struct *work) const struct cred *old_cred = override_creds_light(get_new_cred(iocb->fsync.creds)); iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); - revert_creds(old_cred); + put_cred(revert_creds_light(old_cred)); put_cred(iocb->fsync.creds); iocb_put(iocb); } diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 21eb501ae03d..586862ca8738 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -831,7 +831,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, */ old_cred = override_creds_light(get_new_cred(file->f_cred)); f = open_exec(e->interpreter); - revert_creds(old_cred); + put_cred(revert_creds_light(old_cred)); if (IS_ERR(f)) { pr_notice("register: failed to install interpreter file %s\n", e->interpreter); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index b156cc2e0e63..809305dd5317 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -399,7 +399,7 @@ static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, static inline void cachefiles_end_secure(struct cachefiles_cache *cache, const struct cred *saved_cred) { - revert_creds(saved_cred); + put_cred(revert_creds_light(saved_cred)); } /* diff --git a/fs/coredump.c b/fs/coredump.c index b6aae41b80d2..ff119aaa5c31 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -781,7 +781,7 @@ fail_unlock: kfree(argv); kfree(cn.corename); coredump_finish(core_dumped); - revert_creds(old_cred); + put_cred(revert_creds_light(old_cred)); fail_creds: put_cred(cred); fail: diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 98393c2efb75..deca5559a56b 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -384,7 +384,7 @@ static void nfs_local_call_read(struct work_struct *work) nfs_local_read_done(iocb, status); nfs_local_pgio_release(iocb); - revert_creds(save_cred); + put_cred(revert_creds_light(save_cred)); } static int @@ -558,7 +558,7 @@ static void nfs_local_call_write(struct work_struct *work) nfs_local_vfs_getattr(iocb); nfs_local_pgio_release(iocb); - revert_creds(save_cred); + put_cred(revert_creds_light(save_cred)); current->flags = old_flags; } diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index b9442f70271d..629979b20e98 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -313,7 +313,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, saved_cred = override_creds_light(get_new_cred(id_resolver_cache)); rkey = nfs_idmap_request_key(name, namelen, type, idmap); - revert_creds(saved_cred); + put_cred(revert_creds_light(saved_cred)); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 614a5ec4824b..dda14811d092 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -27,7 +27,7 @@ int nfsd_setuser(struct svc_cred *cred, struct svc_export *exp) int flags = nfsexp_flags(cred, exp); /* discard any old override before preparing the new set */ - revert_creds(get_cred(current_real_cred())); + 
put_cred(revert_creds_light(get_cred(current_real_cred()))); new = prepare_creds(); if (!new) return -ENOMEM; diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index a1cdba42c4fa..c05cd2ae8139 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1248,7 +1248,7 @@ nfsd_file_acquire_local(struct net *net, struct svc_cred *cred, beres = nfsd_file_do_acquire(NULL, net, cred, client, fhp, may_flags, NULL, pnf, true); - revert_creds(save_cred); + put_cred(revert_creds_light(save_cred)); return beres; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 886d03953d7a..bf166a23d8b4 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -89,7 +89,7 @@ nfs4_save_creds(const struct cred **original_creds) static void nfs4_reset_creds(const struct cred *original) { - revert_creds(original); + put_cred(revert_creds_light(original)); } static void diff --git a/fs/open.c b/fs/open.c index 2459cd061f47..23c414c10883 100644 --- a/fs/open.c +++ b/fs/open.c @@ -523,7 +523,7 @@ out_path_release: } out: if (old_cred) - revert_creds(old_cred); + put_cred(revert_creds_light(old_cred)); return res; } diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 527b041213c8..0f19bdbc78a4 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -749,7 +749,7 @@ static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc) static void ovl_revert_cu_creds(struct ovl_cu_creds *cc) { if (cc->new) { - revert_creds(cc->old); + put_cred(revert_creds_light(cc->old)); put_cred(cc->new); } } diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index 721d8b1254b6..f2353bccc9f5 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -175,7 +175,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, cifs_dbg(FYI, "key description = %s\n", description); saved_cred = override_creds_light(get_new_cred(spnego_cred)); spnego_key = request_key(&cifs_spnego_key_type, description, ""); - revert_creds(saved_cred); + put_cred(revert_creds_light(saved_cred)); #ifdef CONFIG_CIFS_DEBUG2 if (cifsFYI && !IS_ERR(spnego_key)) { diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index b1ea4ea3de4b..81d8d9802a56 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -327,7 +327,7 @@ id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid) out_key_put: key_put(sidkey); out_revert_creds: - revert_creds(saved_cred); + put_cred(revert_creds_light(saved_cred)); return rc; invalidate_key: @@ -438,7 +438,7 @@ try_upcall_to_get_id: out_key_put: key_put(sidkey); out_revert_creds: - revert_creds(saved_cred); + put_cred(revert_creds_light(saved_cred)); kfree(sidstr); /* diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 5662f2651b8e..1bd9b9c9db70 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -800,7 +800,7 @@ void ksmbd_revert_fsids(struct ksmbd_work *work) WARN_ON(!work->saved_cred); cred = current_cred(); - revert_creds(work->saved_cred); + put_cred(revert_creds_light(work->saved_cred)); put_cred(cred); work->saved_cred = NULL; } diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0b25ddea943e..66b5bd9d26ab 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1739,7 +1739,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) audit_uring_exit(!ret, ret); if (creds) - revert_creds(creds); + put_cred(revert_creds_light(creds)); if (ret == IOU_OK) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) diff --git a/io_uring/sqpoll.c 
b/io_uring/sqpoll.c index 58a76d581895..42ca6e07e0f7 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -192,7 +192,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) wake_up(&ctx->sqo_sq_wait); if (creds) - revert_creds(creds); + put_cred(revert_creds_light(creds)); } return ret; diff --git a/kernel/acct.c b/kernel/acct.c index 8f18eb02dd41..4e28aa9e1ef2 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -541,7 +541,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) } out: current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - revert_creds(orig_cred); + put_cred(revert_creds_light(orig_cred)); } /** diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 97329b4fe502..68b816955c9c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5220,7 +5220,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, threadgroup, ctx->ns); - revert_creds(saved_cred); + put_cred(revert_creds_light(saved_cred)); if (ret) goto out_finish; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 4dd7c45d227e..2fdadb2e8547 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1476,7 +1476,7 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) else ret = trace_remove_event_call(&user->call); - revert_creds(old_cred); + put_cred(revert_creds_light(old_cred)); put_cred(cred); return ret; diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index a54f5f841cea..297059b7e2a3 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -126,7 +126,7 @@ int dns_query(struct net *net, */ saved_cred = override_creds_light(get_new_cred(dns_resolver_cache)); rkey = request_key_net(&key_type_dns_resolver, desc, net, options); - revert_creds(saved_cred); + put_cred(revert_creds_light(saved_cred)); kfree(desc); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); -- cgit v1.2.3 From 6771e004b40962402d0e973fc7d2e0e61364fdfb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:01 +0100 Subject: tree-wide: s/override_creds_light()/override_creds()/g Rename all calls to override_creds_light() back to overrid_creds(). 
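[Editor's sketch, hypothetical example_override_as() helper: the call-site shape the converted sites settle on after this rename, with the get/put pair around the override/revert now fully explicit at the caller, as in the hunks below.]

#include <linux/cred.h>

static void example_override_as(const struct cred *cred)
{
	const struct cred *old;

	old = override_creds(get_new_cred(cred));	/* caller takes the reference */
	/* ... work with cred as the subjective credentials ... */
	put_cred(revert_creds_light(old));		/* and drops it again here */
}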
Link: https://lore.kernel.org/r/20241125-work-cred-v2-5-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- drivers/base/firmware_loader/main.c | 2 +- drivers/crypto/ccp/sev-dev.c | 2 +- drivers/target/target_core_configfs.c | 2 +- fs/aio.c | 2 +- fs/backing-file.c | 10 +++++----- fs/binfmt_misc.c | 2 +- fs/cachefiles/internal.h | 2 +- fs/coredump.c | 2 +- fs/nfs/localio.c | 4 ++-- fs/nfs/nfs4idmap.c | 2 +- fs/nfsd/auth.c | 2 +- fs/nfsd/nfs4recover.c | 2 +- fs/nfsd/nfsfh.c | 2 +- fs/open.c | 2 +- fs/overlayfs/copy_up.c | 2 +- fs/overlayfs/dir.c | 2 +- fs/overlayfs/util.c | 2 +- fs/smb/client/cifs_spnego.c | 2 +- fs/smb/client/cifsacl.c | 4 ++-- fs/smb/server/smb_common.c | 2 +- include/linux/cred.h | 2 +- io_uring/io_uring.c | 2 +- io_uring/sqpoll.c | 2 +- kernel/acct.c | 2 +- kernel/cgroup/cgroup.c | 2 +- kernel/trace/trace_events_user.c | 2 +- net/dns_resolver/dns_query.c | 2 +- 27 files changed, 33 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index b5677e173f91..294c75025dcb 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -911,7 +911,7 @@ _request_firmware(const struct firmware **firmware_p, const char *name, ret = -ENOMEM; goto out; } - old_cred = override_creds_light(get_new_cred(kern_cred)); + old_cred = override_creds(get_new_cred(kern_cred)); ret = fw_get_filesystem_firmware(device, fw->priv, "", NULL); diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 9111a51d53e0..ffae20fd52bc 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -244,7 +244,7 @@ static struct file *open_file_as_root(const char *filename, int flags, umode_t m if (!cred) return ERR_PTR(-ENOMEM); cred->fsuid = GLOBAL_ROOT_UID; - old_cred = override_creds_light(get_new_cred(cred)); + old_cred = override_creds(get_new_cred(cred)); fp = file_open_root(&root, filename, flags, mode); path_put(&root); diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index 564bc71d2d09..7788e1fe2633 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -3756,7 +3756,7 @@ static int __init target_core_init_configfs(void) ret = -ENOMEM; goto out; } - old_cred = override_creds_light(get_new_cred(kern_cred)); + old_cred = override_creds(get_new_cred(kern_cred)); target_init_dbroot(); put_cred(revert_creds_light(old_cred)); put_cred(kern_cred); diff --git a/fs/aio.c b/fs/aio.c index 6b987c48b671..7e0ec687f480 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1639,7 +1639,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, static void aio_fsync_work(struct work_struct *work) { struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); - const struct cred *old_cred = override_creds_light(get_new_cred(iocb->fsync.creds)); + const struct cred *old_cred = override_creds(get_new_cred(iocb->fsync.creds)); iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); put_cred(revert_creds_light(old_cred)); diff --git a/fs/backing-file.c b/fs/backing-file.c index cbdad8b68474..37c5a66e5dad 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -176,7 +176,7 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, !(file->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; - old_cred = override_creds_light(ctx->cred); + old_cred = override_creds(ctx->cred); if 
(is_sync_kiocb(iocb)) { rwf_t rwf = iocb_to_rw_flags(flags); @@ -233,7 +233,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, */ flags &= ~IOCB_DIO_CALLER_COMP; - old_cred = override_creds_light(ctx->cred); + old_cred = override_creds(ctx->cred); if (is_sync_kiocb(iocb)) { rwf_t rwf = iocb_to_rw_flags(flags); @@ -281,7 +281,7 @@ ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb, if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) return -EIO; - old_cred = override_creds_light(ctx->cred); + old_cred = override_creds(ctx->cred); ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags); revert_creds_light(old_cred); @@ -310,7 +310,7 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, if (ret) return ret; - old_cred = override_creds_light(ctx->cred); + old_cred = override_creds(ctx->cred); file_start_write(out); ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags); file_end_write(out); @@ -338,7 +338,7 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma, vma_set_file(vma, file); - old_cred = override_creds_light(ctx->cred); + old_cred = override_creds(ctx->cred); ret = call_mmap(vma->vm_file, vma); revert_creds_light(old_cred); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 586862ca8738..5756ec49f79e 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -829,7 +829,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, * didn't matter much as only a privileged process could open * the register file. */ - old_cred = override_creds_light(get_new_cred(file->f_cred)); + old_cred = override_creds(get_new_cred(file->f_cred)); f = open_exec(e->interpreter); put_cred(revert_creds_light(old_cred)); if (IS_ERR(f)) { diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 809305dd5317..05b1d4cfb55a 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -393,7 +393,7 @@ extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, const struct cred **_saved_cred) { - *_saved_cred = override_creds_light(get_new_cred(cache->cache_cred)); + *_saved_cred = override_creds(get_new_cred(cache->cache_cred)); } static inline void cachefiles_end_secure(struct cachefiles_cache *cache, diff --git a/fs/coredump.c b/fs/coredump.c index ff119aaa5c31..4eae37892da5 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -576,7 +576,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) if (retval < 0) goto fail_creds; - old_cred = override_creds_light(get_new_cred(cred)); + old_cred = override_creds(get_new_cred(cred)); ispipe = format_corename(&cn, &cprm, &argv, &argc); diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index deca5559a56b..682d951ed69a 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -374,7 +374,7 @@ static void nfs_local_call_read(struct work_struct *work) struct iov_iter iter; ssize_t status; - save_cred = override_creds_light(get_new_cred(filp->f_cred)); + save_cred = override_creds(get_new_cred(filp->f_cred)); nfs_local_iter_init(&iter, iocb, READ); @@ -545,7 +545,7 @@ static void nfs_local_call_write(struct work_struct *work) ssize_t status; current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; - save_cred = override_creds_light(get_new_cred(filp->f_cred)); + save_cred = override_creds(get_new_cred(filp->f_cred)); nfs_local_iter_init(&iter, iocb, WRITE); diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 629979b20e98..3cae4057f8ba 
100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -311,7 +311,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, const struct user_key_payload *payload; ssize_t ret; - saved_cred = override_creds_light(get_new_cred(id_resolver_cache)); + saved_cred = override_creds(get_new_cred(id_resolver_cache)); rkey = nfs_idmap_request_key(name, namelen, type, idmap); put_cred(revert_creds_light(saved_cred)); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index dda14811d092..dafea9183b4e 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -79,7 +79,7 @@ int nfsd_setuser(struct svc_cred *cred, struct svc_export *exp) else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); - put_cred(override_creds_light(get_new_cred(new))); + put_cred(override_creds(get_new_cred(new))); put_cred(new); return 0; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index bf166a23d8b4..0ab22b4e940f 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -81,7 +81,7 @@ nfs4_save_creds(const struct cred **original_creds) new->fsuid = GLOBAL_ROOT_UID; new->fsgid = GLOBAL_ROOT_GID; - *original_creds = override_creds_light(get_new_cred(new)); + *original_creds = override_creds(get_new_cred(new)); put_cred(new); return 0; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 4819364190d3..1cf52f3d5412 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -221,7 +221,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); - put_cred(override_creds_light(get_new_cred(new))); + put_cred(override_creds(get_new_cred(new))); put_cred(new); } else { error = nfsd_setuser_and_check_port(rqstp, cred, exp); diff --git a/fs/open.c b/fs/open.c index 23c414c10883..bd0a34653f0e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -448,7 +448,7 @@ static const struct cred *access_override_creds(void) */ override_cred->non_rcu = 1; - old_cred = override_creds_light(get_new_cred(override_cred)); + old_cred = override_creds(get_new_cred(override_cred)); /* override_cred() gets its own ref */ put_cred(override_cred); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 0f19bdbc78a4..7805667b2e05 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -741,7 +741,7 @@ static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc) return err; if (cc->new) - cc->old = override_creds_light(get_new_cred(cc->new)); + cc->old = override_creds(get_new_cred(cc->new)); return 0; } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 08e683917d12..151271f0586c 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -580,7 +580,7 @@ static const struct cred *ovl_setup_cred_for_create(struct dentry *dentry, * We must be called with creator creds already, otherwise we risk * leaking creds. 
*/ - old_cred = override_creds_light(override_cred); + old_cred = override_creds(override_cred); WARN_ON_ONCE(old_cred != ovl_creds(dentry->d_sb)); return override_cred; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 9aa7493b1e10..2513a79a10b0 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -65,7 +65,7 @@ const struct cred *ovl_override_creds(struct super_block *sb) { struct ovl_fs *ofs = OVL_FS(sb); - return override_creds_light(ofs->creator_cred); + return override_creds(ofs->creator_cred); } void ovl_revert_creds(const struct cred *old_cred) diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index f2353bccc9f5..f22dc0be357f 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -173,7 +173,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, } cifs_dbg(FYI, "key description = %s\n", description); - saved_cred = override_creds_light(get_new_cred(spnego_cred)); + saved_cred = override_creds(get_new_cred(spnego_cred)); spnego_key = request_key(&cifs_spnego_key_type, description, ""); put_cred(revert_creds_light(saved_cred)); diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index 81d8d9802a56..d65e094b97cb 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -292,7 +292,7 @@ id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid) return -EINVAL; rc = 0; - saved_cred = override_creds_light(get_new_cred(root_cred)); + saved_cred = override_creds(get_new_cred(root_cred)); sidkey = request_key(&cifs_idmap_key_type, desc, ""); if (IS_ERR(sidkey)) { rc = -EINVAL; @@ -398,7 +398,7 @@ try_upcall_to_get_id: if (!sidstr) return -ENOMEM; - saved_cred = override_creds_light(get_new_cred(root_cred)); + saved_cred = override_creds(get_new_cred(root_cred)); sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); if (IS_ERR(sidkey)) { cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n", diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 1bd9b9c9db70..4a4cb9e4d45a 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -780,7 +780,7 @@ int __ksmbd_override_fsids(struct ksmbd_work *work, cred->cap_effective = cap_drop_fs_set(cred->cap_effective); WARN_ON(work->saved_cred); - work->saved_cred = override_creds_light(get_new_cred(cred)); + work->saved_cred = override_creds(get_new_cred(cred)); if (!work->saved_cred) { abort_creds(cred); return -EINVAL; diff --git a/include/linux/cred.h b/include/linux/cred.h index 80dcc18ef6e4..a073e6163c4e 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -170,7 +170,7 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred) cred->cap_inheritable)); } -static inline const struct cred *override_creds_light(const struct cred *override_cred) +static inline const struct cred *override_creds(const struct cred *override_cred) { const struct cred *old = current->cred; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 66b5bd9d26ab..5b1cc024deea 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1728,7 +1728,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return -EBADF; if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) - creds = override_creds_light(get_new_cred(req->creds)); + creds = override_creds(get_new_cred(req->creds)); if (!def->audit_skip) audit_uring_entry(req->opcode); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 42ca6e07e0f7..0fd424442118 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -174,7 +174,7 @@ 
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) const struct cred *creds = NULL; if (ctx->sq_creds != current_cred()) - creds = override_creds_light(get_new_cred(ctx->sq_creds)); + creds = override_creds(get_new_cred(ctx->sq_creds)); mutex_lock(&ctx->uring_lock); if (!wq_list_empty(&ctx->iopoll_list)) diff --git a/kernel/acct.c b/kernel/acct.c index 4e28aa9e1ef2..a51a3b483fd9 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -501,7 +501,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) flim = rlimit(RLIMIT_FSIZE); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds_light(get_new_cred(file->f_cred)); + orig_cred = override_creds(get_new_cred(file->f_cred)); /* * First check to see if there is enough free_space to continue diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 68b816955c9c..2d618b577e52 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5216,7 +5216,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, * permissions using the credentials from file open to protect against * inherited fd attacks. */ - saved_cred = override_creds_light(get_new_cred(of->file->f_cred)); + saved_cred = override_creds(get_new_cred(of->file->f_cred)); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, threadgroup, ctx->ns); diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 2fdadb2e8547..857124d81f12 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1469,7 +1469,7 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) */ cred->fsuid = GLOBAL_ROOT_UID; - old_cred = override_creds_light(get_new_cred(cred)); + old_cred = override_creds(get_new_cred(cred)); if (visible) ret = trace_add_event_call(&user->call); diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 297059b7e2a3..f8749d688d66 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -124,7 +124,7 @@ int dns_query(struct net *net, /* make the upcall, using special credentials to prevent the use of * add_key() to preinstall malicious redirections */ - saved_cred = override_creds_light(get_new_cred(dns_resolver_cache)); + saved_cred = override_creds(get_new_cred(dns_resolver_cache)); rkey = request_key_net(&key_type_dns_resolver, desc, net, options); put_cred(revert_creds_light(saved_cred)); kfree(desc); -- cgit v1.2.3 From 51c0bcf0973a3836adfc46f30f876f412478e376 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:02 +0100 Subject: tree-wide: s/revert_creds_light()/revert_creds()/g Rename all calls to revert_creds_light() back to revert_creds(). 
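For illustration, a minimal sketch of the calling convention these two renames leave behind. It is not taken from any one call site in this series, and some_stable_cred is only a placeholder for credentials the caller already holds a stable reference to (for example file->f_cred); whether a given caller still needs its own get_cred()/put_cred() pair depends on the lifetime of the credentials it passes in, which the follow-up patches adjust per call site:

	const struct cred *old;

	/* override_creds() only swaps current->cred; it takes no reference */
	old = override_creds(some_stable_cred);

	/* ... run the operation with some_stable_cred as the subjective creds ... */

	/* revert_creds() swaps back and likewise takes or drops no reference */
	revert_creds(old);
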
Link: https://lore.kernel.org/r/20241125-work-cred-v2-6-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- drivers/base/firmware_loader/main.c | 2 +- drivers/crypto/ccp/sev-dev.c | 2 +- drivers/target/target_core_configfs.c | 2 +- fs/aio.c | 2 +- fs/backing-file.c | 10 +++++----- fs/binfmt_misc.c | 2 +- fs/cachefiles/internal.h | 2 +- fs/coredump.c | 2 +- fs/nfs/localio.c | 4 ++-- fs/nfs/nfs4idmap.c | 2 +- fs/nfsd/auth.c | 2 +- fs/nfsd/filecache.c | 2 +- fs/nfsd/nfs4recover.c | 2 +- fs/open.c | 2 +- fs/overlayfs/copy_up.c | 2 +- fs/overlayfs/dir.c | 2 +- fs/overlayfs/util.c | 2 +- fs/smb/client/cifs_spnego.c | 2 +- fs/smb/client/cifsacl.c | 4 ++-- fs/smb/server/smb_common.c | 2 +- include/linux/cred.h | 2 +- io_uring/io_uring.c | 2 +- io_uring/sqpoll.c | 2 +- kernel/acct.c | 2 +- kernel/cgroup/cgroup.c | 2 +- kernel/trace/trace_events_user.c | 2 +- net/dns_resolver/dns_query.c | 2 +- 27 files changed, 33 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 294c75025dcb..a97fa36ee4bd 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -943,7 +943,7 @@ _request_firmware(const struct firmware **firmware_p, const char *name, } else ret = assign_fw(fw, device); - put_cred(revert_creds_light(old_cred)); + put_cred(revert_creds(old_cred)); put_cred(kern_cred); out: diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index ffae20fd52bc..187c34b02442 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -249,7 +249,7 @@ static struct file *open_file_as_root(const char *filename, int flags, umode_t m fp = file_open_root(&root, filename, flags, mode); path_put(&root); - put_cred(revert_creds_light(old_cred)); + put_cred(revert_creds(old_cred)); return fp; } diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index 7788e1fe2633..ec7a55987193 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -3758,7 +3758,7 @@ static int __init target_core_init_configfs(void) } old_cred = override_creds(get_new_cred(kern_cred)); target_init_dbroot(); - put_cred(revert_creds_light(old_cred)); + put_cred(revert_creds(old_cred)); put_cred(kern_cred); return 0; diff --git a/fs/aio.c b/fs/aio.c index 7e0ec687f480..5e57dcaed7f1 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1642,7 +1642,7 @@ static void aio_fsync_work(struct work_struct *work) const struct cred *old_cred = override_creds(get_new_cred(iocb->fsync.creds)); iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); - put_cred(revert_creds_light(old_cred)); + put_cred(revert_creds(old_cred)); put_cred(iocb->fsync.creds); iocb_put(iocb); } diff --git a/fs/backing-file.c b/fs/backing-file.c index 37c5a66e5dad..763fbe9b72b2 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -197,7 +197,7 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, backing_aio_cleanup(aio, ret); } out: - revert_creds_light(old_cred); + revert_creds(old_cred); if (ctx->accessed) ctx->accessed(iocb->ki_filp); @@ -264,7 +264,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, backing_aio_cleanup(aio, ret); } out: - revert_creds_light(old_cred); + revert_creds(old_cred); return ret; } @@ -283,7 +283,7 @@ ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb, old_cred = override_creds(ctx->cred); ret = 
vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags); - revert_creds_light(old_cred); + revert_creds(old_cred); if (ctx->accessed) ctx->accessed(iocb->ki_filp); @@ -314,7 +314,7 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, file_start_write(out); ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags); file_end_write(out); - revert_creds_light(old_cred); + revert_creds(old_cred); if (ctx->end_write) ctx->end_write(iocb, ret); @@ -340,7 +340,7 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma, old_cred = override_creds(ctx->cred); ret = call_mmap(vma->vm_file, vma); - revert_creds_light(old_cred); + revert_creds(old_cred); if (ctx->accessed) ctx->accessed(user_file); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 5756ec49f79e..3270c2158552 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -831,7 +831,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, */ old_cred = override_creds(get_new_cred(file->f_cred)); f = open_exec(e->interpreter); - put_cred(revert_creds_light(old_cred)); + put_cred(revert_creds(old_cred)); if (IS_ERR(f)) { pr_notice("register: failed to install interpreter file %s\n", e->interpreter); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 05b1d4cfb55a..1cfeb3b38319 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -399,7 +399,7 @@ static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, static inline void cachefiles_end_secure(struct cachefiles_cache *cache, const struct cred *saved_cred) { - put_cred(revert_creds_light(saved_cred)); + put_cred(revert_creds(saved_cred)); } /* diff --git a/fs/coredump.c b/fs/coredump.c index 4eae37892da5..0d3a65cac546 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -781,7 +781,7 @@ fail_unlock: kfree(argv); kfree(cn.corename); coredump_finish(core_dumped); - put_cred(revert_creds_light(old_cred)); + put_cred(revert_creds(old_cred)); fail_creds: put_cred(cred); fail: diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 682d951ed69a..720a4a99bd8a 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -384,7 +384,7 @@ static void nfs_local_call_read(struct work_struct *work) nfs_local_read_done(iocb, status); nfs_local_pgio_release(iocb); - put_cred(revert_creds_light(save_cred)); + put_cred(revert_creds(save_cred)); } static int @@ -558,7 +558,7 @@ static void nfs_local_call_write(struct work_struct *work) nfs_local_vfs_getattr(iocb); nfs_local_pgio_release(iocb); - put_cred(revert_creds_light(save_cred)); + put_cred(revert_creds(save_cred)); current->flags = old_flags; } diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 3cae4057f8ba..25b6a8920a65 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -313,7 +313,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, saved_cred = override_creds(get_new_cred(id_resolver_cache)); rkey = nfs_idmap_request_key(name, namelen, type, idmap); - put_cred(revert_creds_light(saved_cred)); + put_cred(revert_creds(saved_cred)); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index dafea9183b4e..c399a5f030af 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -27,7 +27,7 @@ int nfsd_setuser(struct svc_cred *cred, struct svc_export *exp) int flags = nfsexp_flags(cred, exp); /* discard any old override before preparing the new set */ - put_cred(revert_creds_light(get_cred(current_real_cred()))); + put_cred(revert_creds(get_cred(current_real_cred()))); new = prepare_creds(); 
if (!new) return -ENOMEM; diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index c05cd2ae8139..dc5c9d8e8202 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1248,7 +1248,7 @@ nfsd_file_acquire_local(struct net *net, struct svc_cred *cred, beres = nfsd_file_do_acquire(NULL, net, cred, client, fhp, may_flags, NULL, pnf, true); - put_cred(revert_creds_light(save_cred)); + put_cred(revert_creds(save_cred)); return beres; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 0ab22b4e940f..f3837167b6a1 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -89,7 +89,7 @@ nfs4_save_creds(const struct cred **original_creds) static void nfs4_reset_creds(const struct cred *original) { - put_cred(revert_creds_light(original)); + put_cred(revert_creds(original)); } static void diff --git a/fs/open.c b/fs/open.c index bd0a34653f0e..0a5cd8e74fb9 100644 --- a/fs/open.c +++ b/fs/open.c @@ -523,7 +523,7 @@ out_path_release: } out: if (old_cred) - put_cred(revert_creds_light(old_cred)); + put_cred(revert_creds(old_cred)); return res; } diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 7805667b2e05..439bd9a5ceec 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -749,7 +749,7 @@ static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc) static void ovl_revert_cu_creds(struct ovl_cu_creds *cc) { if (cc->new) { - put_cred(revert_creds_light(cc->old)); + put_cred(revert_creds(cc->old)); put_cred(cc->new); } } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 151271f0586c..c9993ff66fc2 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -575,7 +575,7 @@ static const struct cred *ovl_setup_cred_for_create(struct dentry *dentry, } /* - * Caller is going to match this with revert_creds_light() and drop + * Caller is going to match this with revert_creds() and drop * referenec on the returned creds. * We must be called with creator creds already, otherwise we risk * leaking creds. 
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 2513a79a10b0..0819c739cc2f 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -70,7 +70,7 @@ const struct cred *ovl_override_creds(struct super_block *sb) void ovl_revert_creds(const struct cred *old_cred) { - revert_creds_light(old_cred); + revert_creds(old_cred); } /* diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index f22dc0be357f..6284d924fdb1 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -175,7 +175,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, cifs_dbg(FYI, "key description = %s\n", description); saved_cred = override_creds(get_new_cred(spnego_cred)); spnego_key = request_key(&cifs_spnego_key_type, description, ""); - put_cred(revert_creds_light(saved_cred)); + put_cred(revert_creds(saved_cred)); #ifdef CONFIG_CIFS_DEBUG2 if (cifsFYI && !IS_ERR(spnego_key)) { diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index d65e094b97cb..5718906369a9 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -327,7 +327,7 @@ id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid) out_key_put: key_put(sidkey); out_revert_creds: - put_cred(revert_creds_light(saved_cred)); + put_cred(revert_creds(saved_cred)); return rc; invalidate_key: @@ -438,7 +438,7 @@ try_upcall_to_get_id: out_key_put: key_put(sidkey); out_revert_creds: - put_cred(revert_creds_light(saved_cred)); + put_cred(revert_creds(saved_cred)); kfree(sidstr); /* diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 4a4cb9e4d45a..ec4106aa1945 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -800,7 +800,7 @@ void ksmbd_revert_fsids(struct ksmbd_work *work) WARN_ON(!work->saved_cred); cred = current_cred(); - put_cred(revert_creds_light(work->saved_cred)); + put_cred(revert_creds(work->saved_cred)); put_cred(cred); work->saved_cred = NULL; } diff --git a/include/linux/cred.h b/include/linux/cred.h index a073e6163c4e..a7df1c759ef0 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -178,7 +178,7 @@ static inline const struct cred *override_creds(const struct cred *override_cred return old; } -static inline const struct cred *revert_creds_light(const struct cred *revert_cred) +static inline const struct cred *revert_creds(const struct cred *revert_cred) { const struct cred *override_cred = current->cred; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5b1cc024deea..3e408c8442d4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1739,7 +1739,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) audit_uring_exit(!ret, ret); if (creds) - put_cred(revert_creds_light(creds)); + put_cred(revert_creds(creds)); if (ret == IOU_OK) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 0fd424442118..1ca963474336 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -192,7 +192,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) wake_up(&ctx->sqo_sq_wait); if (creds) - put_cred(revert_creds_light(creds)); + put_cred(revert_creds(creds)); } return ret; diff --git a/kernel/acct.c b/kernel/acct.c index a51a3b483fd9..ea8c94887b58 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -541,7 +541,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) } out: current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - put_cred(revert_creds_light(orig_cred)); + 
put_cred(revert_creds(orig_cred)); } /** diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 2d618b577e52..1a94e8b154be 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5220,7 +5220,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, threadgroup, ctx->ns); - put_cred(revert_creds_light(saved_cred)); + put_cred(revert_creds(saved_cred)); if (ret) goto out_finish; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 857124d81f12..c54ae15f425c 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1476,7 +1476,7 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) else ret = trace_remove_event_call(&user->call); - put_cred(revert_creds_light(old_cred)); + put_cred(revert_creds(old_cred)); put_cred(cred); return ret; diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index f8749d688d66..0b0789fe2194 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -126,7 +126,7 @@ int dns_query(struct net *net, */ saved_cred = override_creds(get_new_cred(dns_resolver_cache)); rkey = request_key_net(&key_type_dns_resolver, desc, net, options); - put_cred(revert_creds_light(saved_cred)); + put_cred(revert_creds(saved_cred)); kfree(desc); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); -- cgit v1.2.3 From b37eab47cf5479f974c029e23d84f9049d4df009 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:06 +0100 Subject: aio: avoid pointless cred reference count bump iocb->fsync.creds already holds a reference count that is stable while the operation is performed. Link: https://lore.kernel.org/r/20241125-work-cred-v2-10-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/aio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 5e57dcaed7f1..50671640b588 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1639,10 +1639,10 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, static void aio_fsync_work(struct work_struct *work) { struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); - const struct cred *old_cred = override_creds(get_new_cred(iocb->fsync.creds)); + const struct cred *old_cred = override_creds(iocb->fsync.creds); iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); - put_cred(revert_creds(old_cred)); + revert_creds(old_cred); put_cred(iocb->fsync.creds); iocb_put(iocb); } -- cgit v1.2.3 From caf6bf48f902e8e353f124325dd635d143331476 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:07 +0100 Subject: binfmt_misc: avoid pointless cred reference count bump file->f_cred already holds a reference count that is stable during the operation. Link: https://lore.kernel.org/r/20241125-work-cred-v2-11-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/binfmt_misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 3270c2158552..6a3a16f91051 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -829,9 +829,9 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, * didn't matter much as only a privileged process could open * the register file. 
*/ - old_cred = override_creds(get_new_cred(file->f_cred)); + old_cred = override_creds(file->f_cred); f = open_exec(e->interpreter); - put_cred(revert_creds(old_cred)); + revert_creds(old_cred); if (IS_ERR(f)) { pr_notice("register: failed to install interpreter file %s\n", e->interpreter); -- cgit v1.2.3 From bd05aeb1eedc60dee05f82efe97339330239f37e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:08 +0100 Subject: coredump: avoid pointless cred reference count bump The creds are allocated via prepare_creds() which has already taken a reference. Link: https://lore.kernel.org/r/20241125-work-cred-v2-12-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/coredump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 0d3a65cac546..d48edb37bc35 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -576,7 +576,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) if (retval < 0) goto fail_creds; - old_cred = override_creds(get_new_cred(cred)); + old_cred = override_creds(cred); ispipe = format_corename(&cn, &cprm, &argv, &argc); @@ -781,7 +781,7 @@ fail_unlock: kfree(argv); kfree(cn.corename); coredump_finish(core_dumped); - put_cred(revert_creds(old_cred)); + revert_creds(old_cred); fail_creds: put_cred(cred); fail: -- cgit v1.2.3 From 6c7a0a6afd0e45ba4879d6fb2bafa9807cde6e2c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:09 +0100 Subject: nfs/localio: avoid pointless cred reference count bumps filp->f_cred already holds a reference count that is stable during the operation. Link: https://lore.kernel.org/r/20241125-work-cred-v2-13-68b9d38bb5b2@kernel.org Acked-by: Chuck Lever Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/nfs/localio.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 720a4a99bd8a..4b8618cf114c 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -374,7 +374,7 @@ static void nfs_local_call_read(struct work_struct *work) struct iov_iter iter; ssize_t status; - save_cred = override_creds(get_new_cred(filp->f_cred)); + save_cred = override_creds(filp->f_cred); nfs_local_iter_init(&iter, iocb, READ); @@ -384,7 +384,7 @@ static void nfs_local_call_read(struct work_struct *work) nfs_local_read_done(iocb, status); nfs_local_pgio_release(iocb); - put_cred(revert_creds(save_cred)); + revert_creds(save_cred); } static int @@ -545,7 +545,7 @@ static void nfs_local_call_write(struct work_struct *work) ssize_t status; current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; - save_cred = override_creds(get_new_cred(filp->f_cred)); + save_cred = override_creds(filp->f_cred); nfs_local_iter_init(&iter, iocb, WRITE); @@ -558,7 +558,7 @@ static void nfs_local_call_write(struct work_struct *work) nfs_local_vfs_getattr(iocb); nfs_local_pgio_release(iocb); - put_cred(revert_creds(save_cred)); + revert_creds(save_cred); current->flags = old_flags; } -- cgit v1.2.3 From 3e23a1cd849d96ff3615d731acdafa8332ca2206 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:10 +0100 Subject: nfs/nfs4idmap: avoid pointless reference count bump The override creds are allocated with a long-term reference when the id_resolver is initialized via prepare_kernel_cred() that is put when the id_resolver is destroyed. 
Link: https://lore.kernel.org/r/20241125-work-cred-v2-14-68b9d38bb5b2@kernel.org Acked-by: Chuck Lever Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/nfs/nfs4idmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 25b6a8920a65..25a7c771cfd8 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -311,9 +311,9 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, const struct user_key_payload *payload; ssize_t ret; - saved_cred = override_creds(get_new_cred(id_resolver_cache)); + saved_cred = override_creds(id_resolver_cache); rkey = nfs_idmap_request_key(name, namelen, type, idmap); - put_cred(revert_creds(saved_cred)); + revert_creds(saved_cred); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); -- cgit v1.2.3 From dfce6a462a9490063852c5d91d0f41104c0aad6d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:11 +0100 Subject: nfs/nfs4recover: avoid pointless cred reference count bump The code already got rid of the extra reference count from the old version of override_creds(). Link: https://lore.kernel.org/r/20241125-work-cred-v2-15-68b9d38bb5b2@kernel.org Acked-by: Chuck Lever Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/nfsd/nfs4recover.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index f3837167b6a1..7f2ceeb118a4 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -81,8 +81,7 @@ nfs4_save_creds(const struct cred **original_creds) new->fsuid = GLOBAL_ROOT_UID; new->fsgid = GLOBAL_ROOT_GID; - *original_creds = override_creds(get_new_cred(new)); - put_cred(new); + *original_creds = override_creds(new); return 0; } -- cgit v1.2.3 From 81be9a8a1090cdddea69aa326cd41b658932e84e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:12 +0100 Subject: nfsfh: avoid pointless cred reference count bump The code already got rid of the extra reference count from the old version of override_creds(). Link: https://lore.kernel.org/r/20241125-work-cred-v2-16-68b9d38bb5b2@kernel.org Acked-by: Chuck Lever Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/nfsd/nfsfh.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 1cf52f3d5412..98d6459724a7 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -221,8 +221,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); - put_cred(override_creds(get_new_cred(new))); - put_cred(new); + put_cred(override_creds(new)); } else { error = nfsd_setuser_and_check_port(rqstp, cred, exp); if (error) -- cgit v1.2.3 From 7708f3a7d25f5f1e80a5ef6abc3abde00128f88c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:13 +0100 Subject: open: avoid pointless cred reference count bump The code already got rid of the extra reference count from the old version of override_creds(). 
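For illustration, a minimal hedged sketch of the resulting pattern for a temporary credential override. The helper name do_with_temp_creds() is made up for the example, and fs/open.c itself still releases the prepared creds via put_cred(revert_creds(...)) in its caller; the point is only that prepare_creds() hands back exactly one reference and that single reference is all the override window needs:

	static int do_with_temp_creds(void)
	{
		struct cred *tmp;
		const struct cred *old;

		tmp = prepare_creds();		/* one reference, owned by us */
		if (!tmp)
			return -ENOMEM;
		/* adjust tmp (fsuid, fsgid, capabilities, ...) as needed */

		old = override_creds(tmp);	/* installs tmp, takes no extra reference */
		/* ... perform the check with tmp as the subjective creds ... */
		put_cred(revert_creds(old));	/* revert_creds() returns tmp; drop our only reference */

		return 0;
	}
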
Link: https://lore.kernel.org/r/20241125-work-cred-v2-17-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/open.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 0a5cd8e74fb9..ffcfef67ac86 100644 --- a/fs/open.c +++ b/fs/open.c @@ -402,7 +402,6 @@ static bool access_need_override_creds(int flags) static const struct cred *access_override_creds(void) { - const struct cred *old_cred; struct cred *override_cred; override_cred = prepare_creds(); @@ -447,13 +446,7 @@ static const struct cred *access_override_creds(void) * freeing. */ override_cred->non_rcu = 1; - - old_cred = override_creds(get_new_cred(override_cred)); - - /* override_cred() gets its own ref */ - put_cred(override_cred); - - return old_cred; + return override_creds(override_cred); } static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) -- cgit v1.2.3 From facc239a8c4980e17c832a7d5ee3809a2e2a45bd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:14 +0100 Subject: ovl: avoid pointless cred reference count bump security_inode_copy_up() allocates a set of new credentials and has taken a reference count. Link: https://lore.kernel.org/r/20241125-work-cred-v2-18-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/overlayfs/copy_up.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 439bd9a5ceec..3601ddfeddc2 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -741,7 +741,7 @@ static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc) return err; if (cc->new) - cc->old = override_creds(get_new_cred(cc->new)); + cc->old = override_creds(cc->new); return 0; } @@ -749,7 +749,7 @@ static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc) static void ovl_revert_cu_creds(struct ovl_cu_creds *cc) { if (cc->new) { - put_cred(revert_creds(cc->old)); + revert_creds(cc->old); put_cred(cc->new); } } -- cgit v1.2.3 From 6077c4620daad2c9823c5ed55b4b4226a2883794 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:15 +0100 Subject: cifs: avoid pointless cred reference count bump During module init spnego_cred will be allocated with its own reference which is only destroyed during module exit. 
Link: https://lore.kernel.org/r/20241125-work-cred-v2-19-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/smb/client/cifs_spnego.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index 6284d924fdb1..28f568b5fc27 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -173,9 +173,9 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, } cifs_dbg(FYI, "key description = %s\n", description); - saved_cred = override_creds(get_new_cred(spnego_cred)); + saved_cred = override_creds(spnego_cred); spnego_key = request_key(&cifs_spnego_key_type, description, ""); - put_cred(revert_creds(saved_cred)); + revert_creds(saved_cred); #ifdef CONFIG_CIFS_DEBUG2 if (cifsFYI && !IS_ERR(spnego_key)) { -- cgit v1.2.3 From 2b315eda9e45f21b6c0da298016677b1090df67d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:16 +0100 Subject: cifs: avoid pointless cred reference count bump During module init root_cred will be allocated with its own reference which is only destroyed during module exit. Link: https://lore.kernel.org/r/20241125-work-cred-v2-20-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/smb/client/cifsacl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index 5718906369a9..ba79aa2107cc 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -292,7 +292,7 @@ id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid) return -EINVAL; rc = 0; - saved_cred = override_creds(get_new_cred(root_cred)); + saved_cred = override_creds(root_cred); sidkey = request_key(&cifs_idmap_key_type, desc, ""); if (IS_ERR(sidkey)) { rc = -EINVAL; @@ -327,7 +327,7 @@ id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid) out_key_put: key_put(sidkey); out_revert_creds: - put_cred(revert_creds(saved_cred)); + revert_creds(saved_cred); return rc; invalidate_key: @@ -398,7 +398,7 @@ try_upcall_to_get_id: if (!sidstr) return -ENOMEM; - saved_cred = override_creds(get_new_cred(root_cred)); + saved_cred = override_creds(root_cred); sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); if (IS_ERR(sidkey)) { cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n", @@ -438,7 +438,7 @@ try_upcall_to_get_id: out_key_put: key_put(sidkey); out_revert_creds: - put_cred(revert_creds(saved_cred)); + revert_creds(saved_cred); kfree(sidstr); /* -- cgit v1.2.3 From 62e5396c50ae3942ff94c0a8cec6f476c1c3da2f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:17 +0100 Subject: smb: avoid pointless cred reference count bump The creds are allocated via prepare_kernel_cred() which has already taken a reference. This also removes a pointless check that gives the impression that override_creds() can ever be called on a task with current->cred NULL. That's not possible afaict. Remove the check to not imply that there can be a dangling pointer in current->cred. 
Link: https://lore.kernel.org/r/20241125-work-cred-v2-21-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/smb/server/smb_common.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index ec4106aa1945..d009651fd5a8 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -780,11 +780,7 @@ int __ksmbd_override_fsids(struct ksmbd_work *work, cred->cap_effective = cap_drop_fs_set(cred->cap_effective); WARN_ON(work->saved_cred); - work->saved_cred = override_creds(get_new_cred(cred)); - if (!work->saved_cred) { - abort_creds(cred); - return -EINVAL; - } + work->saved_cred = override_creds(cred); return 0; } @@ -796,13 +792,11 @@ int ksmbd_override_fsids(struct ksmbd_work *work) void ksmbd_revert_fsids(struct ksmbd_work *work) { const struct cred *cred; - WARN_ON(!work->saved_cred); - cred = current_cred(); - put_cred(revert_creds(work->saved_cred)); - put_cred(cred); + cred = revert_creds(work->saved_cred); work->saved_cred = NULL; + put_cred(cred); } __le32 smb_map_generic_desired_access(__le32 daccess) -- cgit v1.2.3 From 76a40086683933431660b348dbfefa80b9eaaa03 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:23 +0100 Subject: cachefiles: avoid pointless cred reference count bump The cache holds a long-term reference to the credentials that's taken when the cache is created and put when the cache becomes unused. Link: https://lore.kernel.org/r/20241125-work-cred-v2-27-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/cachefiles/internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 1cfeb3b38319..7b99bd98de75 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -393,13 +393,13 @@ extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, const struct cred **_saved_cred) { - *_saved_cred = override_creds(get_new_cred(cache->cache_cred)); + *_saved_cred = override_creds(cache->cache_cred); } static inline void cachefiles_end_secure(struct cachefiles_cache *cache, const struct cred *saved_cred) { - put_cred(revert_creds(saved_cred)); + revert_creds(saved_cred); } /* -- cgit v1.2.3 From 4fa6af563d4dcef43ae34a9bce9d1a9440f20867 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:24 +0100 Subject: nfsd: avoid pointless cred reference count bump The code already got rid of the extra reference count from the old version of override_creds(). 
Link: https://lore.kernel.org/r/20241125-work-cred-v2-28-68b9d38bb5b2@kernel.org Acked-by: Chuck Lever Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/nfsd/auth.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index c399a5f030af..4dc327e02456 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -79,8 +79,7 @@ int nfsd_setuser(struct svc_cred *cred, struct svc_export *exp) else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); - put_cred(override_creds(get_new_cred(new))); - put_cred(new); + put_cred(override_creds(new)); return 0; oom: -- cgit v1.2.3 From b905bafdea21a75d75a96855edd9e0b6051eee30 Mon Sep 17 00:00:00 2001 From: Leo Stone Date: Sat, 30 Nov 2024 21:14:19 -0800 Subject: hfs: Sanity check the root record In the syzbot reproducer, the hfs_cat_rec for the root dir has type HFS_CDR_FIL after being read with hfs_bnode_read() in hfs_super_fill(). This indicates it should be used as an hfs_cat_file, which is 102 bytes. Only the first 70 bytes of that struct are initialized, however, because the entrylength passed into hfs_bnode_read() is still the length of a directory record. This causes uninitialized values to be used later on, when the hfs_cat_rec union is treated as the larger hfs_cat_file struct. Add a check to make sure the retrieved record has the correct type for the root directory (HFS_CDR_DIR), and make sure we load the correct number of bytes for a directory record. Reported-by: syzbot+2db3c7526ba68f4ea776@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2db3c7526ba68f4ea776 Tested-by: syzbot+2db3c7526ba68f4ea776@syzkaller.appspotmail.com Tested-by: Leo Stone Signed-off-by: Leo Stone Link: https://lore.kernel.org/r/20241201051420.77858-1-leocstone@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/hfs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 3bee9b5dba5e..fe09c2093a93 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -349,11 +349,13 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc) goto bail_no_root; res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); if (!res) { - if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { + if (fd.entrylength != sizeof(rec.dir)) { res = -EIO; goto bail_hfs_find; } hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); + if (rec.type != HFS_CDR_DIR) + res = -EIO; } if (res) goto bail_hfs_find; -- cgit v1.2.3 From 989e0cdc0f18a594b25cabc60426d29659aeaf58 Mon Sep 17 00:00:00 2001 From: Brahmajit Das Date: Sat, 5 Oct 2024 01:21:32 +0530 Subject: fs/qnx6: Fix building with GCC 15 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qnx6_checkroot() had been using weirdly spelled initializer - it needed to initialize 3-element arrays of char and it used NUL-padded 3-character string literals (i.e. 4-element initializers, with completely pointless zeroes at the end). That had been spotted by gcc-15[*]; prior to that gcc quietly dropped the 4th element of initializers. However, none of that had been needed in the first place - all this array is used for is checking that the first directory entry in root directory is "." and the second - "..". The check had been expressed as a loop, using that match_root[] array. 
Since there is no chance that we ever want to extend that list of entries, the entire thing is much too fancy for its own good; what we need is just a couple of explicit memcmp() and that's it. [*]: fs/qnx6/inode.c: In function ‘qnx6_checkroot’: fs/qnx6/inode.c:182:41: error: initializer-string for array of ‘char’ is too long [-Werror=unterminated-string-initialization] 182 | static char match_root[2][3] = {".\0\0", "..\0"}; | ^~~~~~~ fs/qnx6/inode.c:182:50: error: initializer-string for array of ‘char’ is too long [-Werror=unterminated-string-initialization] 182 | static char match_root[2][3] = {".\0\0", "..\0"}; | ^~~~~~ Signed-off-by: Brahmajit Das Link: https://lore.kernel.org/r/20241004195132.1393968-1-brahmajit.xyz@gmail.com Acked-by: Al Viro Signed-off-by: Christian Brauner --- fs/qnx6/inode.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 85925ec0051a..3310d1ad4d0e 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -179,8 +179,7 @@ static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf) */ static const char *qnx6_checkroot(struct super_block *s) { - static char match_root[2][3] = {".\0\0", "..\0"}; - int i, error = 0; + int error = 0; struct qnx6_dir_entry *dir_entry; struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; @@ -189,11 +188,9 @@ static const char *qnx6_checkroot(struct super_block *s) if (IS_ERR(folio)) return "error reading root directory"; dir_entry = kmap_local_folio(folio, 0); - for (i = 0; i < 2; i++) { - /* maximum 3 bytes - due to match_root limitation */ - if (strncmp(dir_entry[i].de_fname, match_root[i], 3)) - error = 1; - } + if (memcmp(dir_entry[0].de_fname, ".", 2) || + memcmp(dir_entry[1].de_fname, "..", 3)) + error = 1; folio_release_kmap(folio, dir_entry); if (error) return "error reading root directory."; -- cgit v1.2.3 From ac1e21bd8c883aeac2f1835fc93b39c1e6838b35 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 3 Dec 2024 09:44:06 +0800 Subject: jbd2: increase IO priority for writing revoke records Commit '6a3afb6ac6df ("jbd2: increase the journal IO's priority")' increases the priority of journal I/O by marking I/O with the JBD2_JOURNAL_REQ_FLAGS. However, that commit missed the revoke buffers, so this patch also addresses that kind of I/O. Fixes: 6a3afb6ac6df ("jbd2: increase the journal IO's priority") Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20241203014407.805916-2-yi.zhang@huaweicloud.com Reviewed-by: Kemeng Shi Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/jbd2/revoke.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 4556e4689024..ce63d5fde9c3 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -654,7 +654,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, REQ_SYNC); + write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS); } #endif -- cgit v1.2.3 From a0851ea9cd555c333795b85ddd908898b937c4e1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 3 Dec 2024 09:44:07 +0800 Subject: jbd2: flush filesystem device before updating tail sequence When committing a transaction in jbd2_journal_commit_transaction(), the disk caches for the filesystem device should be flushed before updating the journal tail sequence. 
However, this step is missed if the journal is not located on the filesystem device. As a result, the filesystem may become inconsistent following a power failure or system crash. Fix it by ensuring that the filesystem device is flushed appropriately. Fixes: 3339578f0578 ("jbd2: cleanup journal tail after transaction commit") Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20241203014407.805916-3-yi.zhang@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/jbd2/commit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 9153ff3a08e7..e8e80761ac73 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -772,9 +772,9 @@ start_journal_io: /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue - * the commit record + * the commit record and update the journal tail sequence. */ - if (commit_transaction->t_need_data_flush && + if ((commit_transaction->t_need_data_flush || update_tail) && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(journal->j_fs_dev); -- cgit v1.2.3 From 6fba89813ccf333d2bc4d5caea04cd5f3c39eb50 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 23 Oct 2024 14:21:54 -0700 Subject: lsm: ensure the correct LSM context releaser Add a new lsm_context data structure to hold all the information about a "security context", including the string, its size and which LSM allocated the string. The allocation information is necessary because LSMs have different policies regarding the lifecycle of these strings. SELinux allocates and destroys them on each use, whereas Smack provides a pointer to an entry in a list that never goes away. Update security_release_secctx() to use the lsm_context instead of a (char *, len) pair. Change its callers to do likewise. The LSMs supporting this hook have had comments added to remind the developer that there is more work to be done. The BPF security module provides all LSM hooks. While there has yet to be a known instance of a BPF configuration that uses security contexts, the possibility is real. In the existing implementation there is potential for multiple frees in that case. 
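For illustration, a rough sketch of the shape of the new structure and the scaffolding helper as used in the hunks below. The authoritative definition is the one added to include/linux/security.h; the field names here mirror the callers, while the exact layout and the id values are assumptions, and secdata/seclen stand in for an old-style (char *, len) pair:

	struct lsm_context {
		char	*context;	/* the security context string */
		u32	len;		/* length of that string */
		int	id;		/* which LSM allocated it; its lifecycle rules apply */
	};

	struct lsm_context cp;

	/* scaffolding: wrap an old-style (char *, len) pair, allocating LSM not yet recorded */
	lsmcontext_init(&cp, secdata, seclen, 0);
	/* ... use cp.context / cp.len ... */
	/* a single release call, honouring the allocating LSM's policy */
	security_release_secctx(&cp);
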
Cc: linux-integrity@vger.kernel.org Cc: netdev@vger.kernel.org Cc: audit@vger.kernel.org Cc: netfilter-devel@vger.kernel.org To: Pablo Neira Ayuso Cc: linux-nfs@vger.kernel.org Cc: Todd Kjos Signed-off-by: Casey Schaufler [PM: subject tweak] Signed-off-by: Paul Moore --- drivers/android/binder.c | 24 ++++++++++---------- fs/ceph/xattr.c | 6 ++++- fs/nfs/nfs4proc.c | 8 +++++-- fs/nfsd/nfs4xdr.c | 8 +++++-- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 35 +++++++++++++++++++++++++++-- include/net/scm.h | 11 ++++----- kernel/audit.c | 30 ++++++++++++------------- kernel/auditsc.c | 23 ++++++++++--------- net/ipv4/ip_sockglue.c | 10 ++++----- net/netfilter/nf_conntrack_netlink.c | 10 ++++----- net/netfilter/nf_conntrack_standalone.c | 9 ++++---- net/netfilter/nfnetlink_queue.c | 13 +++++++---- net/netlabel/netlabel_unlabeled.c | 40 +++++++++++++++------------------ net/netlabel/netlabel_user.c | 11 +++++---- security/apparmor/include/secid.h | 2 +- security/apparmor/secid.c | 11 +++++++-- security/security.c | 8 +++---- security/selinux/hooks.c | 11 +++++++-- 19 files changed, 165 insertions(+), 107 deletions(-) (limited to 'fs') diff --git a/drivers/android/binder.c b/drivers/android/binder.c index ef353ca13c35..e8245df63289 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3017,8 +3017,7 @@ static void binder_transaction(struct binder_proc *proc, struct binder_context *context = proc->context; int t_debug_id = atomic_inc_return(&binder_last_id); ktime_t t_start_time = ktime_get(); - char *secctx = NULL; - u32 secctx_sz = 0; + struct lsm_context lsmctx; struct list_head sgc_head; struct list_head pf_head; const void __user *user_buffer = (const void __user *) @@ -3297,7 +3296,8 @@ static void binder_transaction(struct binder_proc *proc, size_t added_size; security_cred_getsecid(proc->cred, &secid); - ret = security_secid_to_secctx(secid, &secctx, &secctx_sz); + ret = security_secid_to_secctx(secid, &lsmctx.context, + &lsmctx.len); if (ret) { binder_txn_error("%d:%d failed to get security context\n", thread->pid, proc->pid); @@ -3306,7 +3306,7 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_get_secctx_failed; } - added_size = ALIGN(secctx_sz, sizeof(u64)); + added_size = ALIGN(lsmctx.len, sizeof(u64)); extra_buffers_size += added_size; if (extra_buffers_size < added_size) { binder_txn_error("%d:%d integer overflow of extra_buffers_size\n", @@ -3340,23 +3340,23 @@ static void binder_transaction(struct binder_proc *proc, t->buffer = NULL; goto err_binder_alloc_buf_failed; } - if (secctx) { + if (lsmctx.context) { int err; size_t buf_offset = ALIGN(tr->data_size, sizeof(void *)) + ALIGN(tr->offsets_size, sizeof(void *)) + ALIGN(extra_buffers_size, sizeof(void *)) - - ALIGN(secctx_sz, sizeof(u64)); + ALIGN(lsmctx.len, sizeof(u64)); t->security_ctx = t->buffer->user_data + buf_offset; err = binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, buf_offset, - secctx, secctx_sz); + lsmctx.context, lsmctx.len); if (err) { t->security_ctx = 0; WARN_ON(1); } - security_release_secctx(secctx, secctx_sz); - secctx = NULL; + security_release_secctx(&lsmctx); + lsmctx.context = NULL; } t->buffer->debug_id = t->debug_id; t->buffer->transaction = t; @@ -3400,7 +3400,7 @@ static void binder_transaction(struct binder_proc *proc, off_end_offset = off_start_offset + tr->offsets_size; sg_buf_offset = ALIGN(off_end_offset, sizeof(void *)); sg_buf_end_offset = sg_buf_offset + extra_buffers_size - - ALIGN(secctx_sz, 
sizeof(u64)); + ALIGN(lsmctx.len, sizeof(u64)); off_min = 0; for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { @@ -3779,8 +3779,8 @@ err_copy_data_failed: binder_alloc_free_buf(&target_proc->alloc, t->buffer); err_binder_alloc_buf_failed: err_bad_extra_size: - if (secctx) - security_release_secctx(secctx, secctx_sz); + if (lsmctx.context) + security_release_secctx(&lsmctx); err_get_secctx_failed: kfree(tcomplete); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 1a9f12204666..2a14335a4bb7 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1446,12 +1446,16 @@ out: void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) { +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL + struct lsm_context scaff; /* scaffolding */ +#endif #ifdef CONFIG_CEPH_FS_POSIX_ACL posix_acl_release(as_ctx->acl); posix_acl_release(as_ctx->default_acl); #endif #ifdef CONFIG_CEPH_FS_SECURITY_LABEL - security_release_secctx(as_ctx->sec_ctx, as_ctx->sec_ctxlen); + lsmcontext_init(&scaff, as_ctx->sec_ctx, as_ctx->sec_ctxlen, 0); + security_release_secctx(&scaff); #endif #ifdef CONFIG_FS_ENCRYPTION kfree(as_ctx->fscrypt_auth); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 405f17e6e0b4..2daeb6f663d9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -138,8 +138,12 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, static inline void nfs4_label_release_security(struct nfs4_label *label) { - if (label) - security_release_secctx(label->label, label->len); + struct lsm_context scaff; /* scaffolding */ + + if (label) { + lsmcontext_init(&scaff, label->label, label->len, 0); + security_release_secctx(&scaff); + } } static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) { diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 53fac037611c..95ec6a4b4da3 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3644,8 +3644,12 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, out: #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if (args.context) - security_release_secctx(args.context, args.contextlen); + if (args.context) { + struct lsm_context scaff; /* scaffolding */ + + lsmcontext_init(&scaff, args.context, args.contextlen, 0); + security_release_secctx(&scaff); + } #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ kfree(args.acl); if (tempfh) { diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index eb2937599cb0..c13df23132eb 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -300,7 +300,7 @@ LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata, LSM_HOOK(int, -EOPNOTSUPP, lsmprop_to_secctx, struct lsm_prop *prop, char **secdata, u32 *seclen) LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid) -LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen) +LSM_HOOK(void, LSM_RET_VOID, release_secctx, struct lsm_context *cp) LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode) LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen) LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen) diff --git a/include/linux/security.h b/include/linux/security.h index cbdba435b798..68e56935716b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -225,6 +225,37 @@ extern unsigned long dac_mmap_min_addr; #define dac_mmap_min_addr 0UL #endif +/* + * A 
"security context" is the text representation of + * the information used by LSMs. + * This structure contains the string, its length, and which LSM + * it is useful for. + */ +struct lsm_context { + char *context; /* Provided by the module */ + u32 len; + int id; /* Identifies the module */ +}; + +/** + * lsmcontext_init - initialize an lsmcontext structure. + * @cp: Pointer to the context to initialize + * @context: Initial context, or NULL + * @size: Size of context, or 0 + * @id: Which LSM provided the context + * + * Fill in the lsmcontext from the provided information. + * This is a scaffolding function that will be removed when + * lsm_context integration is complete. + */ +static inline void lsmcontext_init(struct lsm_context *cp, char *context, + u32 size, int id) +{ + cp->id = id; + cp->context = context; + cp->len = size; +} + /* * Values used in the task_security_ops calls */ @@ -556,7 +587,7 @@ int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); int security_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata, u32 *seclen); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); -void security_release_secctx(char *secdata, u32 seclen); +void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); @@ -1545,7 +1576,7 @@ static inline int security_secctx_to_secid(const char *secdata, return -EOPNOTSUPP; } -static inline void security_release_secctx(char *secdata, u32 seclen) +static inline void security_release_secctx(struct lsm_context *cp) { } diff --git a/include/net/scm.h b/include/net/scm.h index 0d35c7c77a74..f75449e1d876 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -105,16 +105,17 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, #ifdef CONFIG_SECURITY_NETWORK static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) { - char *secdata; - u32 seclen; + struct lsm_context ctx; int err; if (test_bit(SOCK_PASSSEC, &sock->flags)) { - err = security_secid_to_secctx(scm->secid, &secdata, &seclen); + err = security_secid_to_secctx(scm->secid, &ctx.context, + &ctx.len); if (!err) { - put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, seclen, secdata); - security_release_secctx(secdata, seclen); + put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, + ctx.context); + security_release_secctx(&ctx); } } } diff --git a/kernel/audit.c b/kernel/audit.c index 6a95a6077953..1d48d0654a46 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1221,8 +1221,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; struct audit_sig_info *sig_data; - char *ctx = NULL; - u32 len; + struct lsm_context lsmctx; err = audit_netlink_ok(skb, msg_type); if (err) @@ -1472,27 +1471,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, break; } case AUDIT_SIGNAL_INFO: - len = 0; if (lsmprop_is_set(&audit_sig_lsm)) { - err = security_lsmprop_to_secctx(&audit_sig_lsm, &ctx, - &len); + err = security_lsmprop_to_secctx(&audit_sig_lsm, + &lsmctx.context, + &lsmctx.len); if (err) return err; } - sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL); + sig_data = kmalloc(struct_size(sig_data, ctx, lsmctx.len), + GFP_KERNEL); if (!sig_data) { if 
(lsmprop_is_set(&audit_sig_lsm)) - security_release_secctx(ctx, len); + security_release_secctx(&lsmctx); return -ENOMEM; } sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; if (lsmprop_is_set(&audit_sig_lsm)) { - memcpy(sig_data->ctx, ctx, len); - security_release_secctx(ctx, len); + memcpy(sig_data->ctx, lsmctx.context, lsmctx.len); + security_release_secctx(&lsmctx); } audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, - sig_data, struct_size(sig_data, ctx, len)); + sig_data, struct_size(sig_data, ctx, + lsmctx.len)); kfree(sig_data); break; case AUDIT_TTY_GET: { @@ -2180,23 +2181,22 @@ void audit_log_key(struct audit_buffer *ab, char *key) int audit_log_task_context(struct audit_buffer *ab) { struct lsm_prop prop; - char *ctx = NULL; - unsigned len; + struct lsm_context ctx; int error; security_current_getlsmprop_subj(&prop); if (!lsmprop_is_set(&prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx, &len); + error = security_lsmprop_to_secctx(&prop, &ctx.context, &ctx.len); if (error) { if (error != -EINVAL) goto error_path; return 0; } - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); + audit_log_format(ab, " subj=%s", ctx.context); + security_release_secctx(&ctx); return 0; error_path: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 279ba5c420a4..de8fac6c5bd3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1098,8 +1098,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, char *comm) { struct audit_buffer *ab; - char *ctx = NULL; - u32 len; + struct lsm_context ctx; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); @@ -1110,12 +1109,12 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx, &len)) { + if (security_lsmprop_to_secctx(prop, &ctx.context, &ctx.len)) { audit_log_format(ab, " obj=(none)"); rc = 1; } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); } } audit_log_format(ab, " ocomm="); @@ -1371,6 +1370,7 @@ static void audit_log_time(struct audit_context *context, struct audit_buffer ** static void show_special(struct audit_context *context, int *call_panic) { + struct lsm_context lsmcxt; struct audit_buffer *ab; int i; @@ -1401,7 +1401,8 @@ static void show_special(struct audit_context *context, int *call_panic) *call_panic = 1; } else { audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); + lsmcontext_init(&lsmcxt, ctx, len, 0); + security_release_secctx(&lsmcxt); } } if (context->ipc.has_perm) { @@ -1560,15 +1561,15 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, MAJOR(n->rdev), MINOR(n->rdev)); if (lsmprop_is_set(&n->oprop)) { - char *ctx = NULL; - u32 len; + struct lsm_context ctx; - if (security_lsmprop_to_secctx(&n->oprop, &ctx, &len)) { + if (security_lsmprop_to_secctx(&n->oprop, &ctx.context, + &ctx.len)) { if (call_panic) *call_panic = 2; } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); } } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index cf377377b52d..a8180dcc2a32 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -128,20 +128,20 @@ 
static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) { - char *secdata; - u32 seclen, secid; + struct lsm_context ctx; + u32 secid; int err; err = security_socket_getpeersec_dgram(NULL, skb, &secid); if (err) return; - err = security_secid_to_secctx(secid, &secdata, &seclen); + err = security_secid_to_secctx(secid, &ctx.context, &ctx.len); if (err) return; - put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata); - security_release_secctx(secdata, seclen); + put_cmsg(msg, SOL_IP, SCM_SECURITY, ctx.len, ctx.context); + security_release_secctx(&ctx); } static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 36168f8b6efa..22f46be33f1e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -357,10 +357,10 @@ nla_put_failure: static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_secctx; - int len, ret; - char *secctx; + struct lsm_context ctx; + int ret; - ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + ret = security_secid_to_secctx(ct->secmark, &ctx.context, &ctx.len); if (ret) return 0; @@ -369,13 +369,13 @@ static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) if (!nest_secctx) goto nla_put_failure; - if (nla_put_string(skb, CTA_SECCTX_NAME, secctx)) + if (nla_put_string(skb, CTA_SECCTX_NAME, ctx.context)) goto nla_put_failure; nla_nest_end(skb, nest_secctx); ret = 0; nla_put_failure: - security_release_secctx(secctx, len); + security_release_secctx(&ctx); return ret; } #else diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 7d4f0fa8b609..5f7fd23b7afe 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -172,17 +172,16 @@ static void ct_seq_stop(struct seq_file *s, void *v) #ifdef CONFIG_NF_CONNTRACK_SECMARK static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { + struct lsm_context ctx; int ret; - u32 len; - char *secctx; - ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + ret = security_secid_to_secctx(ct->secmark, &ctx.context, &ctx.len); if (ret) return; - seq_printf(s, "secctx=%s ", secctx); + seq_printf(s, "secctx=%s ", ctx.context); - security_release_secctx(secctx, len); + security_release_secctx(&ctx); } #else static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index d2773ce9b585..37757cd77cf1 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -567,6 +567,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, enum ip_conntrack_info ctinfo = 0; const struct nfnl_ct_hook *nfnl_ct; bool csum_verify; + struct lsm_context scaff; /* scaffolding */ char *secdata = NULL; u32 seclen = 0; ktime_t tstamp; @@ -810,8 +811,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } nlh->nlmsg_len = skb->len; - if (seclen) - security_release_secctx(secdata, seclen); + if (seclen) { + lsmcontext_init(&scaff, secdata, seclen, 0); + security_release_secctx(&scaff); + } return skb; nla_put_failure: @@ -819,8 +822,10 @@ nla_put_failure: kfree_skb(skb); net_err_ratelimited("nf_queue: error creating packet message\n"); nlmsg_failure: - if (seclen) - 
security_release_secctx(secdata, seclen); + if (seclen) { + lsmcontext_init(&scaff, secdata, seclen, 0); + security_release_secctx(&scaff); + } return NULL; } diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 1bc2d0890a9f..921fa8eeb451 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -374,8 +374,7 @@ int netlbl_unlhsh_add(struct net *net, struct net_device *dev; struct netlbl_unlhsh_iface *iface; struct audit_buffer *audit_buf = NULL; - char *secctx = NULL; - u32 secctx_len; + struct lsm_context ctx; if (addr_len != sizeof(struct in_addr) && addr_len != sizeof(struct in6_addr)) @@ -439,10 +438,10 @@ unlhsh_add_return: rcu_read_unlock(); if (audit_buf != NULL) { if (security_secid_to_secctx(secid, - &secctx, - &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); + &ctx.context, + &ctx.len) == 0) { + audit_log_format(audit_buf, " sec_obj=%s", ctx.context); + security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); @@ -473,8 +472,7 @@ static int netlbl_unlhsh_remove_addr4(struct net *net, struct netlbl_unlhsh_addr4 *entry; struct audit_buffer *audit_buf; struct net_device *dev; - char *secctx; - u32 secctx_len; + struct lsm_context ctx; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr, @@ -495,9 +493,9 @@ static int netlbl_unlhsh_remove_addr4(struct net *net, dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, - &secctx, &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); + &ctx.context, &ctx.len) == 0) { + audit_log_format(audit_buf, " sec_obj=%s", ctx.context); + security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); @@ -534,8 +532,7 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, struct netlbl_unlhsh_addr6 *entry; struct audit_buffer *audit_buf; struct net_device *dev; - char *secctx; - u32 secctx_len; + struct lsm_context ctx; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list); @@ -555,9 +552,9 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, - &secctx, &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); + &ctx.context, &ctx.len) == 0) { + audit_log_format(audit_buf, " sec_obj=%s", ctx.context); + security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 
1 : 0); audit_log_end(audit_buf); @@ -1069,10 +1066,9 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd, int ret_val = -ENOMEM; struct netlbl_unlhsh_walk_arg *cb_arg = arg; struct net_device *dev; + struct lsm_context ctx; void *data; u32 secid; - char *secctx; - u32 secctx_len; data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, cb_arg->seq, &netlbl_unlabel_gnl_family, @@ -1127,14 +1123,14 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd, secid = addr6->secid; } - ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len); + ret_val = security_secid_to_secctx(secid, &ctx.context, &ctx.len); if (ret_val != 0) goto list_cb_failure; ret_val = nla_put(cb_arg->skb, NLBL_UNLABEL_A_SECCTX, - secctx_len, - secctx); - security_release_secctx(secctx, secctx_len); + ctx.len, + ctx.context); + security_release_secctx(&ctx); if (ret_val != 0) goto list_cb_failure; diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 81635a13987b..f5e7a9919df1 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -84,8 +84,7 @@ struct audit_buffer *netlbl_audit_start_common(int type, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; - char *secctx; - u32 secctx_len; + struct lsm_context ctx; if (audit_enabled == AUDIT_OFF) return NULL; @@ -99,10 +98,10 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_info->sessionid); if (lsmprop_is_set(&audit_info->prop) && - security_lsmprop_to_secctx(&audit_info->prop, &secctx, - &secctx_len) == 0) { - audit_log_format(audit_buf, " subj=%s", secctx); - security_release_secctx(secctx, secctx_len); + security_lsmprop_to_secctx(&audit_info->prop, &ctx.context, + &ctx.len) == 0) { + audit_log_format(audit_buf, " subj=%s", ctx.context); + security_release_secctx(&ctx); } return audit_buf; diff --git a/security/apparmor/include/secid.h b/security/apparmor/include/secid.h index f6a515640950..1bcde2f45f24 100644 --- a/security/apparmor/include/secid.h +++ b/security/apparmor/include/secid.h @@ -29,7 +29,7 @@ int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); int apparmor_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata, u32 *seclen); int apparmor_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); -void apparmor_release_secctx(char *secdata, u32 seclen); +void apparmor_release_secctx(struct lsm_context *cp); int aa_alloc_secid(struct aa_label *label, gfp_t gfp); diff --git a/security/apparmor/secid.c b/security/apparmor/secid.c index 47dc08fc583e..43e4dab7f6e6 100644 --- a/security/apparmor/secid.c +++ b/security/apparmor/secid.c @@ -106,9 +106,16 @@ int apparmor_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) return 0; } -void apparmor_release_secctx(char *secdata, u32 seclen) +void apparmor_release_secctx(struct lsm_context *cp) { - kfree(secdata); + /* + * stacking scaffolding: + * When it is possible for more than one LSM to provide a + * release hook, do this check: + * if (cp->id == LSM_ID_APPARMOR || cp->id == LSM_ID_UNDEF) + */ + + kfree(cp->context); } /** diff --git a/security/security.c b/security/security.c index 09664e09fec9..b7f8ec93f6f0 100644 --- a/security/security.c +++ b/security/security.c @@ -4360,14 +4360,14 @@ EXPORT_SYMBOL(security_secctx_to_secid); /** * security_release_secctx() - Free a secctx buffer - * @secdata: secctx - * @seclen: length of secctx + * @cp: the security context * * Release the security context. 
*/ -void security_release_secctx(char *secdata, u32 seclen) +void security_release_secctx(struct lsm_context *cp) { - call_void_hook(release_secctx, secdata, seclen); + call_void_hook(release_secctx, cp); + memset(cp, 0, sizeof(*cp)); } EXPORT_SYMBOL(security_release_secctx); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index f5a08f94e094..aabc724f4d44 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6657,9 +6657,16 @@ static int selinux_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) secid, GFP_KERNEL); } -static void selinux_release_secctx(char *secdata, u32 seclen) +static void selinux_release_secctx(struct lsm_context *cp) { - kfree(secdata); + /* + * stacking scaffolding: + * When it is possible for more than one LSM to provide a + * release hook, do this check: + * if (cp->id == LSM_ID_SELINUX || cp->id == LSM_ID_UNDEF) + */ + + kfree(cp->context); } static void selinux_inode_invalidate_secctx(struct inode *inode) -- cgit v1.2.3 From 76ecf306ae5da84ef8f48c7a2608736e6866440c Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 23 Oct 2024 14:21:56 -0700 Subject: lsm: use lsm_context in security_inode_getsecctx Change the security_inode_getsecctx() interface to fill a lsm_context structure instead of data and length pointers. This provides the information about which LSM created the context so that security_release_secctx() can use the correct hook. Cc: linux-nfs@vger.kernel.org Signed-off-by: Casey Schaufler [PM: subject tweak] Signed-off-by: Paul Moore --- fs/nfsd/nfs4xdr.c | 26 ++++++++++---------------- include/linux/lsm_hook_defs.h | 4 ++-- include/linux/security.h | 5 +++-- security/security.c | 12 ++++++------ security/selinux/hooks.c | 10 ++++++---- security/smack/smack_lsm.c | 7 ++++--- 6 files changed, 31 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 95ec6a4b4da3..8dd2e2ada474 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2818,11 +2818,11 @@ static __be32 nfsd4_encode_nfsace4(struct xdr_stream *xdr, struct svc_rqst *rqst #ifdef CONFIG_NFSD_V4_SECURITY_LABEL static inline __be32 nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, - void *context, int len) + const struct lsm_context *context) { __be32 *p; - p = xdr_reserve_space(xdr, len + 4 + 4 + 4); + p = xdr_reserve_space(xdr, context->len + 4 + 4 + 4); if (!p) return nfserr_resource; @@ -2832,13 +2832,13 @@ nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, */ *p++ = cpu_to_be32(0); /* lfs */ *p++ = cpu_to_be32(0); /* pi */ - p = xdr_encode_opaque(p, context, len); + p = xdr_encode_opaque(p, context->context, context->len); return 0; } #else static inline __be32 nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, - void *context, int len) + struct lsm_context *context) { return 0; } #endif @@ -2920,8 +2920,7 @@ struct nfsd4_fattr_args { struct kstatfs statfs; struct nfs4_acl *acl; #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - void *context; - int contextlen; + struct lsm_context context; #endif u32 rdattr_err; bool contextsupport; @@ -3376,8 +3375,7 @@ static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr, static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr, const struct nfsd4_fattr_args *args) { - return nfsd4_encode_security_label(xdr, args->rqstp, - args->context, args->contextlen); + return nfsd4_encode_security_label(xdr, args->rqstp, &args->context); } #endif @@ -3527,7 +3525,7 @@ 
nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, args.ignore_crossmnt = (ignore_crossmnt != 0); args.acl = NULL; #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - args.context = NULL; + args.context.context = NULL; #endif /* @@ -3607,7 +3605,7 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { if (exp->ex_flags & NFSEXP_SECURITY_LABEL) err = security_inode_getsecctx(d_inode(dentry), - &args.context, &args.contextlen); + &args.context); else err = -EOPNOTSUPP; args.contextsupport = (err == 0); @@ -3644,12 +3642,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, out: #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if (args.context) { - struct lsm_context scaff; /* scaffolding */ - - lsmcontext_init(&scaff, args.context, args.contextlen, 0); - security_release_secctx(&scaff); - } + if (args.context.context) + security_release_secctx(&args.context); #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ kfree(args.acl); if (tempfh) { diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 01e5a8e09bba..69e1076448c6 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -303,8 +303,8 @@ LSM_HOOK(void, LSM_RET_VOID, release_secctx, struct lsm_context *cp) LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode) LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen) LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen) -LSM_HOOK(int, -EOPNOTSUPP, inode_getsecctx, struct inode *inode, void **ctx, - u32 *ctxlen) +LSM_HOOK(int, -EOPNOTSUPP, inode_getsecctx, struct inode *inode, + struct lsm_context *cp) #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) LSM_HOOK(int, 0, post_notification, const struct cred *w_cred, diff --git a/include/linux/security.h b/include/linux/security.h index 58518bbc00a6..29f8100bc7c8 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -591,7 +591,7 @@ void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); -int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); +int security_inode_getsecctx(struct inode *inode, struct lsm_context *cp); int security_locked_down(enum lockdown_reason what); int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len, void *val, size_t val_len, u64 id, u64 flags); @@ -1591,7 +1591,8 @@ static inline int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 { return -EOPNOTSUPP; } -static inline int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) +static inline int security_inode_getsecctx(struct inode *inode, + struct lsm_context *cp) { return -EOPNOTSUPP; } diff --git a/security/security.c b/security/security.c index 1edf8fb2046c..1aecf1696c50 100644 --- a/security/security.c +++ b/security/security.c @@ -4426,17 +4426,17 @@ EXPORT_SYMBOL(security_inode_setsecctx); /** * security_inode_getsecctx() - Get the security label of an inode * @inode: inode - * @ctx: secctx - * @ctxlen: length of secctx + * @cp: security context * - * On success, returns 0 and fills out @ctx and @ctxlen with the security - * context for the given @inode. + * On success, returns 0 and fills out @cp with the security context + * for the given @inode. 
* * Return: Returns 0 on success, error on failure. */ -int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) +int security_inode_getsecctx(struct inode *inode, struct lsm_context *cp) { - return call_int_hook(inode_getsecctx, inode, ctx, ctxlen); + memset(cp, 0, sizeof(*cp)); + return call_int_hook(inode_getsecctx, inode, cp); } EXPORT_SYMBOL(security_inode_getsecctx); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index ddc24db7c0b2..9254570de103 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6711,14 +6711,16 @@ static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) ctx, ctxlen, 0, NULL); } -static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) +static int selinux_inode_getsecctx(struct inode *inode, struct lsm_context *cp) { - int len = 0; + int len; len = selinux_inode_getsecurity(&nop_mnt_idmap, inode, - XATTR_SELINUX_SUFFIX, ctx, true); + XATTR_SELINUX_SUFFIX, + (void **)&cp->context, true); if (len < 0) return len; - *ctxlen = len; + cp->len = len; + cp->id = LSM_ID_SELINUX; return 0; } #ifdef CONFIG_KEYS diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 4084f99c493c..55a556f17ade 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4898,12 +4898,13 @@ static int smack_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) ctx, ctxlen, 0, NULL); } -static int smack_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) +static int smack_inode_getsecctx(struct inode *inode, struct lsm_context *cp) { struct smack_known *skp = smk_of_inode(inode); - *ctx = skp->smk_known; - *ctxlen = strlen(skp->smk_known); + cp->context = skp->smk_known; + cp->len = strlen(skp->smk_known); + cp->id = LSM_ID_SMACK; return 0; } -- cgit v1.2.3 From b530104f50e86db6f187d39fed5821b3cca755ee Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 23 Oct 2024 14:21:57 -0700 Subject: lsm: lsm_context in security_dentry_init_security Replace the (secctx,seclen) pointer pair with a single lsm_context pointer to allow return of the LSM identifier along with the context and context length. This allows security_release_secctx() to know how to release the context. Callers have been modified to use or save the returned data from the new structure. 
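A minimal sketch of the updated calling convention, modeled on the nfs4_label_init_security() hunk below ("dentry" and "mode" stand in for the caller's own arguments and the error handling is trimmed):

	struct lsm_context ctx;
	int err;

	err = security_dentry_init_security(dentry, mode, &dentry->d_name,
					    NULL, &ctx);
	if (!err) {
		/* use or copy ctx.context (ctx.len bytes) ... */
		security_release_secctx(&ctx);
	}

Because ctx.id identifies the LSM that allocated the string, the release call no longer has to guess how the buffer should be freed.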
Cc: ceph-devel@vger.kernel.org Cc: linux-nfs@vger.kernel.org Signed-off-by: Casey Schaufler [PM: subject tweak] Signed-off-by: Paul Moore --- fs/ceph/super.h | 3 +-- fs/ceph/xattr.c | 16 ++++++---------- fs/fuse/dir.c | 35 ++++++++++++++++++----------------- fs/nfs/nfs4proc.c | 20 ++++++++++++-------- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 26 +++----------------------- security/security.c | 9 ++++----- security/selinux/hooks.c | 8 ++++---- 8 files changed, 49 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.h b/fs/ceph/super.h index af14ec382246..7fa1e7be50e4 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1132,8 +1132,7 @@ struct ceph_acl_sec_ctx { void *acl; #endif #ifdef CONFIG_CEPH_FS_SECURITY_LABEL - void *sec_ctx; - u32 sec_ctxlen; + struct lsm_context lsmctx; #endif #ifdef CONFIG_FS_ENCRYPTION struct ceph_fscrypt_auth *fscrypt_auth; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 2a14335a4bb7..537165db4519 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1383,8 +1383,7 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, int err; err = security_dentry_init_security(dentry, mode, &dentry->d_name, - &name, &as_ctx->sec_ctx, - &as_ctx->sec_ctxlen); + &name, &as_ctx->lsmctx); if (err < 0) { WARN_ON_ONCE(err != -EOPNOTSUPP); err = 0; /* do nothing */ @@ -1409,7 +1408,7 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, */ name_len = strlen(name); err = ceph_pagelist_reserve(pagelist, - 4 * 2 + name_len + as_ctx->sec_ctxlen); + 4 * 2 + name_len + as_ctx->lsmctx.len); if (err) goto out; @@ -1432,8 +1431,9 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, ceph_pagelist_encode_32(pagelist, name_len); ceph_pagelist_append(pagelist, name, name_len); - ceph_pagelist_encode_32(pagelist, as_ctx->sec_ctxlen); - ceph_pagelist_append(pagelist, as_ctx->sec_ctx, as_ctx->sec_ctxlen); + ceph_pagelist_encode_32(pagelist, as_ctx->lsmctx.len); + ceph_pagelist_append(pagelist, as_ctx->lsmctx.context, + as_ctx->lsmctx.len); err = 0; out: @@ -1446,16 +1446,12 @@ out: void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) { -#ifdef CONFIG_CEPH_FS_SECURITY_LABEL - struct lsm_context scaff; /* scaffolding */ -#endif #ifdef CONFIG_CEPH_FS_POSIX_ACL posix_acl_release(as_ctx->acl); posix_acl_release(as_ctx->default_acl); #endif #ifdef CONFIG_CEPH_FS_SECURITY_LABEL - lsmcontext_init(&scaff, as_ctx->sec_ctx, as_ctx->sec_ctxlen, 0); - security_release_secctx(&scaff); + security_release_secctx(&as_ctx->lsmctx); #endif #ifdef CONFIG_FS_ENCRYPTION kfree(as_ctx->fscrypt_auth); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 494ac372ace0..0b2f8567ca30 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -467,29 +467,29 @@ static int get_security_context(struct dentry *entry, umode_t mode, { struct fuse_secctx *fctx; struct fuse_secctx_header *header; - void *ctx = NULL, *ptr; - u32 ctxlen, total_len = sizeof(*header); + struct lsm_context lsmctx = { }; + void *ptr; + u32 total_len = sizeof(*header); int err, nr_ctx = 0; - const char *name; + const char *name = NULL; size_t namelen; err = security_dentry_init_security(entry, mode, &entry->d_name, - &name, &ctx, &ctxlen); - if (err) { - if (err != -EOPNOTSUPP) - goto out_err; - /* No LSM is supporting this security hook. 
Ignore error */ - ctxlen = 0; - ctx = NULL; - } + &name, &lsmctx); + + /* If no LSM is supporting this security hook ignore error */ + if (err && err != -EOPNOTSUPP) + goto out_err; - if (ctxlen) { + if (lsmctx.len) { nr_ctx = 1; namelen = strlen(name) + 1; err = -EIO; - if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || ctxlen > S32_MAX)) + if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || + lsmctx.len > S32_MAX)) goto out_err; - total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + ctxlen); + total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + + lsmctx.len); } err = -ENOMEM; @@ -502,19 +502,20 @@ static int get_security_context(struct dentry *entry, umode_t mode, ptr += sizeof(*header); if (nr_ctx) { fctx = ptr; - fctx->size = ctxlen; + fctx->size = lsmctx.len; ptr += sizeof(*fctx); strcpy(ptr, name); ptr += namelen; - memcpy(ptr, ctx, ctxlen); + memcpy(ptr, lsmctx.context, lsmctx.len); } ext->size = total_len; ext->value = header; err = 0; out_err: - kfree(ctx); + if (nr_ctx) + security_release_secctx(&lsmctx); return err; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2daeb6f663d9..d615d520f8cf 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -114,6 +114,7 @@ static inline struct nfs4_label * nfs4_label_init_security(struct inode *dir, struct dentry *dentry, struct iattr *sattr, struct nfs4_label *label) { + struct lsm_context shim; int err; if (label == NULL) @@ -128,21 +129,24 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, label->label = NULL; err = security_dentry_init_security(dentry, sattr->ia_mode, - &dentry->d_name, NULL, - (void **)&label->label, &label->len); - if (err == 0) - return label; + &dentry->d_name, NULL, &shim); + if (err) + return NULL; - return NULL; + label->label = shim.context; + label->len = shim.len; + return label; } static inline void nfs4_label_release_security(struct nfs4_label *label) { - struct lsm_context scaff; /* scaffolding */ + struct lsm_context shim; if (label) { - lsmcontext_init(&scaff, label->label, label->len, 0); - security_release_secctx(&scaff); + shim.context = label->label; + shim.len = label->len; + shim.id = LSM_ID_UNDEF; + security_release_secctx(&shim); } } static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 69e1076448c6..e2f1ce37c41e 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -83,7 +83,7 @@ LSM_HOOK(int, 0, move_mount, const struct path *from_path, const struct path *to_path) LSM_HOOK(int, -EOPNOTSUPP, dentry_init_security, struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, - void **ctx, u32 *ctxlen) + struct lsm_context *cp) LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) diff --git a/include/linux/security.h b/include/linux/security.h index 29f8100bc7c8..980b6c207cad 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -237,25 +237,6 @@ struct lsm_context { int id; /* Identifies the module */ }; -/** - * lsmcontext_init - initialize an lsmcontext structure. - * @cp: Pointer to the context to initialize - * @context: Initial context, or NULL - * @size: Size of context, or 0 - * @id: Which LSM provided the context - * - * Fill in the lsmcontext from the provided information. - * This is a scaffolding function that will be removed when - * lsm_context integration is complete. 
- */ -static inline void lsmcontext_init(struct lsm_context *cp, char *context, - u32 size, int id) -{ - cp->id = id; - cp->context = context; - cp->len = size; -} - /* * Values used in the task_security_ops calls */ @@ -409,8 +390,8 @@ int security_sb_clone_mnt_opts(const struct super_block *oldsb, int security_move_mount(const struct path *from_path, const struct path *to_path); int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, - const char **xattr_name, void **ctx, - u32 *ctxlen); + const char **xattr_name, + struct lsm_context *lsmcxt); int security_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, @@ -883,8 +864,7 @@ static inline int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, - void **ctx, - u32 *ctxlen) + struct lsm_context *lsmcxt) { return -EOPNOTSUPP; } diff --git a/security/security.c b/security/security.c index 1aecf1696c50..7523d14f31fb 100644 --- a/security/security.c +++ b/security/security.c @@ -1735,8 +1735,7 @@ void security_inode_free(struct inode *inode) * @mode: mode used to determine resource type * @name: name of the last path component * @xattr_name: name of the security/LSM xattr - * @ctx: pointer to the resulting LSM context - * @ctxlen: length of @ctx + * @lsmctx: pointer to the resulting LSM context * * Compute a context for a dentry as the inode is not yet available since NFSv4 * has no label backed by an EA anyway. It is important to note that @@ -1746,11 +1745,11 @@ void security_inode_free(struct inode *inode) */ int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, - const char **xattr_name, void **ctx, - u32 *ctxlen) + const char **xattr_name, + struct lsm_context *lsmctx) { return call_int_hook(dentry_init_security, dentry, mode, name, - xattr_name, ctx, ctxlen); + xattr_name, lsmctx); } EXPORT_SYMBOL(security_dentry_init_security); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9254570de103..4f8d37ae769a 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2869,8 +2869,8 @@ static void selinux_inode_free_security(struct inode *inode) static int selinux_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, - const char **xattr_name, void **ctx, - u32 *ctxlen) + const char **xattr_name, + struct lsm_context *cp) { u32 newsid; int rc; @@ -2885,8 +2885,8 @@ static int selinux_dentry_init_security(struct dentry *dentry, int mode, if (xattr_name) *xattr_name = XATTR_NAME_SELINUX; - return security_sid_to_context(newsid, (char **)ctx, - ctxlen); + cp->id = LSM_ID_SELINUX; + return security_sid_to_context(newsid, &cp->context, &cp->len); } static int selinux_dentry_create_files_as(struct dentry *dentry, int mode, -- cgit v1.2.3 From 5c44aa21f0863270efbf03a4d5bd6b75fec4134c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 18:01:14 +0000 Subject: isofs: Partially convert zisofs_read_folio to use a folio Remove several hidden calls to compound_head() and references to page->index. More needs to be done to use folios throughout the zisofs code. 
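The heart of the conversion is the idiom swap visible in the hunk below, roughly:

	/* page-based: two calls, each with a hidden compound_head() */
	SetPageUptodate(page);
	unlock_page(page);

	/* folio-based: a single call marks the read as completed successfully */
	folio_end_read(folio, true);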
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Jan Kara Link: https://patch.msgid.link/20241125180117.2914311-1-willy@infradead.org --- fs/isofs/compress.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 34d5baa5d88a..5f3b6da0e022 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -301,7 +301,6 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, */ static int zisofs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; int err; @@ -311,16 +310,15 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) PAGE_SHIFT <= zisofs_block_shift ? (1 << (zisofs_block_shift - PAGE_SHIFT)) : 0; struct page **pages; - pgoff_t index = page->index, end_index; + pgoff_t index = folio->index, end_index; end_index = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; /* - * If this page is wholly outside i_size we just return zero; + * If this folio is wholly outside i_size we just return zero; * do_generic_file_read() will handle this for us */ if (index >= end_index) { - SetPageUptodate(page); - unlock_page(page); + folio_end_read(folio, true); return 0; } @@ -338,10 +336,10 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) pages = kcalloc(max_t(unsigned int, zisofs_pages_per_cblock, 1), sizeof(*pages), GFP_KERNEL); if (!pages) { - unlock_page(page); + folio_unlock(folio); return -ENOMEM; } - pages[full_page] = page; + pages[full_page] = &folio->page; for (i = 0; i < pcount; i++, index++) { if (i != full_page) -- cgit v1.2.3 From 2ab0837cb91b7de507daa145d17b3b6b2efb3abf Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 8 Dec 2024 13:34:13 -0500 Subject: efivarfs: Fix error on non-existent file When looking up a non-existent file, efivarfs returns -EINVAL if the file does not conform to the NAME-GUID format and -ENOENT if it does. This is caused by efivars_d_hash() returning -EINVAL if the name is not formatted correctly. This error is returned before simple_lookup() returns a negative dentry, and is the error value that the user sees. Fix by removing this check. 
If the file does not exist, simple_lookup() will return a negative dentry leading to -ENOENT and efivarfs_create() already has a validity check before it creates an entry (and will correctly return -EINVAL) Signed-off-by: James Bottomley Cc: [ardb: make efivarfs_valid_name() static] Signed-off-by: Ard Biesheuvel --- fs/efivarfs/inode.c | 2 +- fs/efivarfs/internal.h | 1 - fs/efivarfs/super.c | 3 --- 3 files changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 586446e02ef7..ec23da8405ff 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -51,7 +51,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb, * * VariableName-12345678-1234-1234-1234-1234567891bc */ -bool efivarfs_valid_name(const char *str, int len) +static bool efivarfs_valid_name(const char *str, int len) { const char *s = str + len - EFI_VARIABLE_GUID_LEN; diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index d71d2e08422f..74f0602a9e01 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -60,7 +60,6 @@ bool efivar_variable_is_removable(efi_guid_t vendor, const char *name, extern const struct file_operations efivarfs_file_operations; extern const struct inode_operations efivarfs_dir_inode_operations; -extern bool efivarfs_valid_name(const char *str, int len); extern struct inode *efivarfs_get_inode(struct super_block *sb, const struct inode *dir, int mode, dev_t dev, bool is_removable); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index a929f1b613be..beba15673be8 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -144,9 +144,6 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) const unsigned char *s = qstr->name; unsigned int len = qstr->len; - if (!efivarfs_valid_name(s, len)) - return -EINVAL; - while (len-- > EFI_VARIABLE_GUID_LEN) hash = partial_name_hash(*s++, hash); -- cgit v1.2.3 From ead64b20f16e38443aded90dc8491d25f4f5bd39 Mon Sep 17 00:00:00 2001 From: Christian Göttsche Date: Mon, 25 Nov 2024 11:40:00 +0100 Subject: gfs2: reorder capability check last MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit capable() calls refer to enabled LSMs whether to permit or deny the request. This is relevant in connection with SELinux, where a capability check results in a policy decision and by default a denial message on insufficient permission is issued. It can lead to three undesired cases: 1. A denial message is generated, even in case the operation was an unprivileged one and thus the syscall succeeded, creating noise. 2. To avoid the noise from 1. the policy writer adds a rule to ignore those denial messages, hiding future syscalls, where the task performs an actual privileged operation, leading to hidden limited functionality of that task. 3. To avoid the noise from 1. the policy writer adds a rule to permit the task the requested capability, while it does not need it, violating the principle of least privilege. 
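The resulting rule of thumb, sketched with a hypothetical foo_quota_disabled() helper standing in for the filesystem-specific condition (the real change is the two-line reorder in the hunk below):

	/* cheap, purely local checks first ... */
	if (foo_quota_disabled(sdp))
		return 0;
	/* ... and consult the LSMs only when the capability can change the outcome */
	if (capable(CAP_SYS_RESOURCE))
		return 0;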
Signed-off-by: Christian Göttsche Signed-off-by: Andreas Gruenbacher --- fs/gfs2/quota.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index f462d9cb3087..988f38dc5b2c 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -44,8 +44,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, int ret; ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ - if (capable(CAP_SYS_RESOURCE) || - sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF || + capable(CAP_SYS_RESOURCE)) return 0; ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) -- cgit v1.2.3 From ebe559609d7829b52c6642b581860760984faf9d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Nov 2024 10:30:14 -0500 Subject: fs: get rid of __FMODE_NONOTIFY kludge All it takes to get rid of the __FMODE_NONOTIFY kludge is switching fanotify from anon_inode_getfd() to anon_inode_getfile_fmode() and adding a dentry_open_nonotify() helper to be used by fanotify on the other path. That's it - no more weird shit in OPEN_FMODE(), etc. Signed-off-by: Al Viro Link: https://lore.kernel.org/linux-fsdevel/20241113043003.GH3387508@ZenIV/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/d1231137e7b661a382459e79a764259509a4115d.1731684329.git.josef@toxicpanda.com --- fs/fcntl.c | 4 ++-- fs/notify/fanotify/fanotify_user.c | 25 ++++++++++++++++--------- fs/open.c | 23 +++++++++++++++++++---- include/linux/fs.h | 6 +++--- include/uapi/asm-generic/fcntl.h | 1 - 5 files changed, 40 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 49884fa3c81d..5598e4d57422 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -1158,10 +1158,10 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != + BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | - __FMODE_EXEC | __FMODE_NONOTIFY)); + __FMODE_EXEC)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 2d85c71717d6..919ff59cb802 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -100,8 +100,7 @@ static void __init fanotify_sysctls_init(void) * * Internal and external open flags are stored together in field f_flags of * struct file. Only external open flags shall be allowed in event_f_flags. - * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be - * excluded. + * Internal flags like FMODE_EXEC shall be excluded. */ #define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \ O_ACCMODE | O_APPEND | O_NONBLOCK | \ @@ -258,12 +257,11 @@ static int create_fd(struct fsnotify_group *group, const struct path *path, return client_fd; /* - * we need a new file handle for the userspace program so it can read even if it was - * originally opened O_WRONLY. + * We provide an fd for the userspace program, so it could access the + * file without generating fanotify events itself. 
*/ - new_file = dentry_open(path, - group->fanotify_data.f_flags | __FMODE_NONOTIFY, - current_cred()); + new_file = dentry_open_nonotify(path, group->fanotify_data.f_flags, + current_cred()); if (IS_ERR(new_file)) { put_unused_fd(client_fd); client_fd = PTR_ERR(new_file); @@ -1409,6 +1407,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) unsigned int fid_mode = flags & FANOTIFY_FID_BITS; unsigned int class = flags & FANOTIFY_CLASS_BITS; unsigned int internal_flags = 0; + struct file *file; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); @@ -1477,7 +1476,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID))) return -EINVAL; - f_flags = O_RDWR | __FMODE_NONOTIFY; + f_flags = O_RDWR; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; if (flags & FAN_NONBLOCK) @@ -1555,10 +1554,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) goto out_destroy_group; } - fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); + fd = get_unused_fd_flags(f_flags); if (fd < 0) goto out_destroy_group; + file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group, + f_flags, FMODE_NONOTIFY); + if (IS_ERR(file)) { + fd = PTR_ERR(file); + put_unused_fd(fd); + goto out_destroy_group; + } + fd_install(fd, file); return fd; out_destroy_group: diff --git a/fs/open.c b/fs/open.c index e6911101fe71..c3490286092e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1105,6 +1105,23 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +struct file *dentry_open_nonotify(const struct path *path, int flags, + const struct cred *cred) +{ + struct file *f = alloc_empty_file(flags, cred); + if (!IS_ERR(f)) { + int error; + + f->f_mode |= FMODE_NONOTIFY; + error = vfs_open(path, f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } + return f; +} + /** * dentry_create - Create and open a file * @path: path to create @@ -1202,7 +1219,7 @@ inline struct open_how build_open_how(int flags, umode_t mode) inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; - u64 strip = __FMODE_NONOTIFY | O_CLOEXEC; + u64 strip = O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); @@ -1210,9 +1227,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) "struct open_flags doesn't yet handle flags > 32 bits"); /* - * Strip flags that either shouldn't be set by userspace like - * FMODE_NONOTIFY or that aren't relevant in determining struct - * open_flags like O_CLOEXEC. + * Strip flags that aren't relevant in determining struct open_flags. 
*/ flags &= ~strip; diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..93c2b720271e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2751,6 +2751,8 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt, } struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); +struct file *dentry_open_nonotify(const struct path *path, int flags, + const struct cred *cred); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); struct path *backing_file_user_path(struct file *f); @@ -3707,11 +3709,9 @@ struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) -#define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) -#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ - (flag & __FMODE_NONOTIFY))) +#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) static inline bool is_sxid(umode_t mode) { diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h index 80f37a0d40d7..613475285643 100644 --- a/include/uapi/asm-generic/fcntl.h +++ b/include/uapi/asm-generic/fcntl.h @@ -6,7 +6,6 @@ /* * FMODE_EXEC is 0x20 - * FMODE_NONOTIFY is 0x4000000 * These cannot be used by userspace O_* until internal and external open * flags are split. * -Eric Paris -- cgit v1.2.3 From 6d44a780635b9b3fb6e20f366ee7750a9adfa884 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 9 Dec 2024 11:25:04 -0600 Subject: smb3: fix compiler warning in reparse code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit utf8s_to_utf16s() specifies pwcs as a wchar_t pointer (whether big endian or little endian is passed in as an additional parm), so to remove a distracting compile warning it needs to be cast as (wchar_t *) in parse_reparse_wsl_symlink() as done by other callers. Fixes: 06a7adf318a3 ("cifs: Add support for parsing WSL-style symlinks") Reviewed-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/reparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 50b1aba20008..d88b41133e00 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -676,7 +676,7 @@ static int parse_reparse_wsl_symlink(struct reparse_wsl_symlink_data_buffer *buf return -ENOMEM; symname_utf16_len = utf8s_to_utf16s(buf->PathBuffer, symname_utf8_len, UTF16_LITTLE_ENDIAN, - symname_utf16, symname_utf8_len * 2); + (wchar_t *) symname_utf16, symname_utf8_len * 2); if (symname_utf16_len < 0) { kfree(symname_utf16); return symname_utf16_len; -- cgit v1.2.3 From a94204f4d48e28a711b7ed10399f749286c433e3 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:15 -0500 Subject: fsnotify: opt-in for permission events at file open time Legacy inotify/fanotify listeners can add watches for events on inode, parent or mount and expect to get events (e.g. FS_MODIFY) on files that were already open at the time of setting up the watches. fanotify permission events are typically used by Anti-malware software that is watching the entire mount, and it is not common to have more than one Anti-malware engine installed on a system.
To reduce the overhead of the fsnotify_file_perm() hooks on every file access, relax the semantics of the legacy FAN_ACCESS_PERM event to generate events only if there were *any* permission event listeners on the filesystem at the time that the file was opened. The new semantic is implemented by extending the FMODE_NONOTIFY bit into two FMODE_NONOTIFY_* bits, that are used to store a mode for which of the events types to report. This is going to apply to the new fanotify pre-content events in order to reduce the cost of the new pre-content event vfs hooks. [Thanks to Bert Karwatzki for reporting a bug in this code with CONFIG_FANOTIFY_ACCESS_PERMISSIONS disabled] Suggested-by: Linus Torvalds Link: https://lore.kernel.org/linux-fsdevel/CAHk-=wj8L=mtcRTi=NECHMGfZQgXOp_uix1YVh04fEmrKaMnXA@mail.gmail.com/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/5ea5f8e283d1edb55aa79c35187bfe344056af14.1731684329.git.josef@toxicpanda.com --- fs/notify/fsnotify.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/open.c | 8 +++++++- include/linux/fs.h | 43 ++++++++++++++++++++++++++++++++++++++----- include/linux/fsnotify.h | 39 +++++++++++++++++++++++---------------- 4 files changed, 106 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index f976949d2634..569ec356e4ce 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -623,6 +623,44 @@ out: } EXPORT_SYMBOL_GPL(fsnotify); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +/* + * At open time we check fsnotify_sb_has_priority_watchers() and set the + * FMODE_NONOTIFY_ mode bits accordignly. + * Later, fsnotify permission hooks do not check if there are permission event + * watches, but that there were permission event watches at open time. + */ +void file_set_fsnotify_mode(struct file *file) +{ + struct super_block *sb = file->f_path.dentry->d_sb; + + /* Is it a file opened by fanotify? */ + if (FMODE_FSNOTIFY_NONE(file->f_mode)) + return; + + /* + * Permission events is a super set of pre-content events, so if there + * are no permission event watchers, there are also no pre-content event + * watchers and this is implied from the single FMODE_NONOTIFY_PERM bit. + */ + if (likely(!fsnotify_sb_has_priority_watchers(sb, + FSNOTIFY_PRIO_CONTENT))) { + file->f_mode |= FMODE_NONOTIFY_PERM; + return; + } + + /* + * If there are permission event watchers but no pre-content event + * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. + */ + if (likely(!fsnotify_sb_has_priority_watchers(sb, + FSNOTIFY_PRIO_PRE_CONTENT))) { + file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; + return; + } +} +#endif + static __init int fsnotify_init(void) { int ret; diff --git a/fs/open.c b/fs/open.c index c3490286092e..1a9483872e1f 100644 --- a/fs/open.c +++ b/fs/open.c @@ -901,7 +901,7 @@ static int do_dentry_open(struct file *f, f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { - f->f_mode = FMODE_PATH | FMODE_OPENED; + f->f_mode = FMODE_PATH | FMODE_OPENED | FMODE_NONOTIFY; f->f_op = &empty_fops; return 0; } @@ -929,6 +929,12 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; + /* + * Set FMODE_NONOTIFY_* bits according to existing permission watches. + * If FMODE_NONOTIFY was already set for an fanotify fd, this doesn't + * change anything. 
+ */ + file_set_fsnotify_mode(f); error = fsnotify_open_perm(f); if (error) goto cleanup_all; diff --git a/include/linux/fs.h b/include/linux/fs.h index 93c2b720271e..5f7ac5b548a4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -173,13 +173,20 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) -/* FMODE_* bit 24 */ - /* File is embedded in backing_file object */ -#define FMODE_BACKING ((__force fmode_t)(1 << 25)) +#define FMODE_BACKING ((__force fmode_t)(1 << 24)) + +/* + * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be + * generated (see below) + */ +#define FMODE_NONOTIFY ((__force fmode_t)(1 << 25)) -/* File was opened by fanotify and shouldn't generate fanotify events */ -#define FMODE_NONOTIFY ((__force fmode_t)(1 << 26)) +/* + * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be + * generated (see below) + */ +#define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26)) /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) @@ -190,6 +197,32 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) +/* + * The two FMODE_NONOTIFY* define which fsnotify events should not be generated + * for a file. These are the possible values of (f->f_mode & + * FMODE_FSNOTIFY_MASK) and their meaning: + * + * FMODE_NONOTIFY - suppress all (incl. non-permission) events. + * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events. + * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events. + */ +#define FMODE_FSNOTIFY_MASK \ + (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM) + +#define FMODE_FSNOTIFY_NONE(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY) +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +#define FMODE_FSNOTIFY_PERM(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == 0 || \ + (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)) +#define FMODE_FSNOTIFY_HSM(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == 0) +#else +#define FMODE_FSNOTIFY_PERM(mode) 0 +#define FMODE_FSNOTIFY_HSM(mode) 0 +#endif + + /* * Attribute flags. These should be or-ed together to figure out what * has been changed! diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 278620e063ab..8d1849137a96 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -108,38 +108,35 @@ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY); } -static inline int fsnotify_file(struct file *file, __u32 mask) +static inline int fsnotify_path(const struct path *path, __u32 mask) { - const struct path *path; + return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); +} +static inline int fsnotify_file(struct file *file, __u32 mask) +{ /* * FMODE_NONOTIFY are fds generated by fanotify itself which should not * generate new events. We also don't want to generate events for * FMODE_PATH fds (involves open & close events) as they are just * handle creation / destruction events and not "real" file events. 
*/ - if (file->f_mode & (FMODE_NONOTIFY | FMODE_PATH)) + if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; - path = &file->f_path; - /* Permission events require group prio >= FSNOTIFY_PRIO_CONTENT */ - if (mask & ALL_FSNOTIFY_PERM_EVENTS && - !fsnotify_sb_has_priority_watchers(path->dentry->d_sb, - FSNOTIFY_PRIO_CONTENT)) - return 0; - - return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); + return fsnotify_path(&file->f_path, mask); } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + +void file_set_fsnotify_mode(struct file *file); + /* * fsnotify_file_area_perm - permission hook before access to file range */ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { - __u32 fsnotify_mask = FS_ACCESS_PERM; - /* * filesystem may be modified in the context of permission events * (e.g. by HSM filling a file on access), so sb freeze protection @@ -150,7 +147,10 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, if (!(perm_mask & MAY_READ)) return 0; - return fsnotify_file(file, fsnotify_mask); + if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) + return 0; + + return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } /* @@ -168,16 +168,23 @@ static inline int fsnotify_open_perm(struct file *file) { int ret; + if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) + return 0; + if (file->f_flags & __FMODE_EXEC) { - ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); + ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM); if (ret) return ret; } - return fsnotify_file(file, FS_OPEN_PERM); + return fsnotify_path(&file->f_path, FS_OPEN_PERM); } #else +static inline void file_set_fsnotify_mode(struct file *file) +{ +} + static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { -- cgit v1.2.3 From 318652e07fa5b1743d08eeccd69a1f47f2c15710 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:16 -0500 Subject: fsnotify: check if file is actually being watched for pre-content events on open So far, we set FMODE_NONOTIFY_ flags at open time if we know that there are no permission event watchers at all on the filesystem, but lack of FMODE_NONOTIFY_ flags does not mean that the file is actually watched. For pre-content events, it is possible to optimize things so that we don't bother trying to send pre-content events if file was not watched (through sb, mnt, parent or inode itself) on open. Set FMODE_NONOTIFY_ flags according to that. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/2ddcc9f8d1fde48d085318a6b5a889289d8871d8.1731684329.git.josef@toxicpanda.com --- fs/notify/fsnotify.c | 29 ++++++++++++++++++++++++++--- include/linux/fsnotify_backend.h | 3 +++ 2 files changed, 29 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 569ec356e4ce..9e483fface1e 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -193,7 +193,7 @@ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, return mask & marks_mask; } -/* Are there any inode/mount/sb objects that are interested in this event? */ +/* Are there any inode/mount/sb objects that watch for these events? 
*/ static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, __u32 mask) { @@ -632,7 +632,9 @@ EXPORT_SYMBOL_GPL(fsnotify); */ void file_set_fsnotify_mode(struct file *file) { - struct super_block *sb = file->f_path.dentry->d_sb; + struct dentry *dentry = file->f_path.dentry, *parent; + struct super_block *sb = dentry->d_sb; + __u32 mnt_mask, p_mask; /* Is it a file opened by fanotify? */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) @@ -653,11 +655,32 @@ void file_set_fsnotify_mode(struct file *file) * If there are permission event watchers but no pre-content event * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. */ - if (likely(!fsnotify_sb_has_priority_watchers(sb, + if ((!d_is_dir(dentry) && !d_is_reg(dentry)) || + likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT))) { file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; return; } + + /* + * OK, there are some pre-content watchers. Check if anybody is + * watching for pre-content events on *this* file. + */ + mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); + if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask, + FSNOTIFY_PRE_CONTENT_EVENTS))) + return; + + /* Is parent watching for pre-content events on this file? */ + if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { + parent = dget_parent(dentry); + p_mask = fsnotify_inode_watches_children(d_inode(parent)); + dput(parent); + if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) + return; + } + /* Nobody watching for pre-content events from this file */ + file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; } #endif diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 3ecf7768e577..9c105244815d 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -77,6 +77,9 @@ */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) +/* Pre-content events can be used to fill file content */ +#define FSNOTIFY_PRE_CONTENT_EVENTS 0 + #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ FS_OPEN_EXEC_PERM) -- cgit v1.2.3 From b82c6f5930f65c510f5b6b4b0d7d1913a6dda3db Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Nov 2024 10:30:17 -0500 Subject: fanotify: don't skip extra event info if no info_mode is set Previously we would only include optional information if you requested it via an FAN_ flag at fanotify_init time (FAN_REPORT_FID for example). However this isn't necessary as the event length is encoded in the metadata, and if the user doesn't want to consume the information they don't have to. With the PRE_ACCESS events we will always generate range information, so drop this check in order to allow this extra information to be exported without needing to have another flag. 
Signed-off-by: Josef Bacik Signed-off-by: Jan Kara Link: https://patch.msgid.link/afcbc4e4139dee076ef1757918b037d3b48c3edb.1731684329.git.josef@toxicpanda.com --- fs/notify/fanotify/fanotify_user.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 919ff59cb802..8fca5ec442e4 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -158,9 +158,6 @@ static size_t fanotify_event_len(unsigned int info_mode, int fh_len; int dot_len = 0; - if (!info_mode) - return event_len; - if (fanotify_is_error_event(event->mask)) event_len += FANOTIFY_ERROR_INFO_LEN; @@ -754,12 +751,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, buf += FAN_EVENT_METADATA_LEN; count -= FAN_EVENT_METADATA_LEN; - if (info_mode) { - ret = copy_info_records_to_user(event, info, info_mode, pidfd, - buf, count); - if (ret < 0) - goto out_close_fd; - } + ret = copy_info_records_to_user(event, info, info_mode, pidfd, + buf, count); + if (ret < 0) + goto out_close_fd; if (f) fd_install(fd, f); -- cgit v1.2.3 From 4edcb9f7b7179ef87ca16440da50ff01f05f268c Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:18 -0500 Subject: fanotify: rename a misnamed constant FANOTIFY_PIDFD_INFO_HDR_LEN is not the length of the header. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/8776ab90fe538225aeb561c560296bafd16b97c4.1731684329.git.josef@toxicpanda.com --- fs/notify/fanotify/fanotify_user.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8fca5ec442e4..456cc3e92c88 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -117,7 +117,7 @@ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; #define FANOTIFY_EVENT_ALIGN 4 #define FANOTIFY_FID_INFO_HDR_LEN \ (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) -#define FANOTIFY_PIDFD_INFO_HDR_LEN \ +#define FANOTIFY_PIDFD_INFO_LEN \ sizeof(struct fanotify_event_info_pidfd) #define FANOTIFY_ERROR_INFO_LEN \ (sizeof(struct fanotify_event_info_error)) @@ -172,14 +172,14 @@ static size_t fanotify_event_len(unsigned int info_mode, dot_len = 1; } - if (info_mode & FAN_REPORT_PIDFD) - event_len += FANOTIFY_PIDFD_INFO_HDR_LEN; - if (fanotify_event_has_object_fh(event)) { fh_len = fanotify_event_object_fh_len(event); event_len += fanotify_fid_info_len(fh_len, dot_len); } + if (info_mode & FAN_REPORT_PIDFD) + event_len += FANOTIFY_PIDFD_INFO_LEN; + return event_len; } @@ -501,7 +501,7 @@ static int copy_pidfd_info_to_user(int pidfd, size_t count) { struct fanotify_event_info_pidfd info = { }; - size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; + size_t info_len = FANOTIFY_PIDFD_INFO_LEN; if (WARN_ON_ONCE(info_len > count)) return -EFAULT; -- cgit v1.2.3 From f156524e5d72c81792eee81f828784dc8a37a7f2 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:20 -0500 Subject: fsnotify: introduce pre-content permission events The new FS_PRE_ACCESS permission event is similar to FS_ACCESS_PERM, but it meant for a different use case of filling file content before access to a file range, so it has slightly different semantics. Generate FS_PRE_ACCESS/FS_ACCESS_PERM as two seperate events, so content scanners could inspect the content filled by pre-content event handler. 
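A filesystem has to opt in explicitly before pre-content events can be delivered for it (the flag mentioned at the end of this message); in a filesystem's fill_super or equivalent mount path the opt-in is a one-liner along these lines (sketch only, the function name is made up):

static int example_fill_super(struct super_block *sb, struct fs_context *fc)
{
	/* ... usual superblock setup ... */

	/* allow HSM (pre-content) events on this superblock */
	sb->s_iflags |= SB_I_ALLOW_HSM;

	return 0;
}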
Unlike FS_ACCESS_PERM, FS_PRE_ACCESS is also called before a file is modified by syscalls as write() and fallocate(). FS_ACCESS_PERM is reported also on blockdev and pipes, but the new pre-content events are only reported for regular files and dirs. The pre-content events are meant to be used by hierarchical storage managers that want to fill the content of files on first access. There are some specific requirements from filesystems that could be used with pre-content events, so add a flag for fs to opt-in for pre-content events explicitly before they can be used. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/b934c5e3af205abc4e0e4709f6486815937ddfdf.1731684329.git.josef@toxicpanda.com --- fs/notify/fsnotify.c | 2 +- include/linux/fs.h | 1 + include/linux/fsnotify.h | 19 ++++++++++++++++++- include/linux/fsnotify_backend.h | 11 ++++++++--- security/selinux/hooks.c | 3 ++- 5 files changed, 30 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 9e483fface1e..32b461c5d04b 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -688,7 +688,7 @@ static __init int fsnotify_init(void) { int ret; - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/include/linux/fs.h b/include/linux/fs.h index 5f7ac5b548a4..3f4d59464965 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1265,6 +1265,7 @@ extern int send_sigurg(struct file *file); #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ #define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ +#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ /* Possible states of 'frozen' field */ enum { diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 8d1849137a96..d91aa064f0e4 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -144,12 +144,29 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, */ lockdep_assert_once(file_write_not_started(file)); - if (!(perm_mask & MAY_READ)) + if (!(perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS))) return 0; if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) return 0; + /* + * read()/write() and other types of access generate pre-content events. + */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { + int ret = fsnotify_path(&file->f_path, FS_PRE_ACCESS); + + if (ret) + return ret; + } + + if (!(perm_mask & MAY_READ)) + return 0; + + /* + * read() also generates the legacy FS_ACCESS_PERM event, so content + * scanners can inspect the content filled by pre-content event. + */ return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index c38762b62bf1..9bda354b5538 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -57,6 +57,8 @@ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ /* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ +#define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */ + /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. 
@@ -78,11 +80,14 @@ */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) +/* Content events can be used to inspect file content */ +#define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \ + FS_ACCESS_PERM) /* Pre-content events can be used to fill file content */ -#define FSNOTIFY_PRE_CONTENT_EVENTS 0 +#define FSNOTIFY_PRE_CONTENT_EVENTS (FS_PRE_ACCESS) -#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ - FS_OPEN_EXEC_PERM) +#define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \ + FSNOTIFY_PRE_CONTENT_EVENTS) /* * This is a list of all events that may get sent to a parent that is watching diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index f5a08f94e094..97a2c04c2b37 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3404,7 +3404,8 @@ static int selinux_path_notify(const struct path *path, u64 mask, perm |= FILE__WATCH_WITH_PERM; /* watches on read-like events need the file:watch_reads permission */ - if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE)) + if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_PRE_ACCESS | + FS_CLOSE_NOWRITE)) perm |= FILE__WATCH_READS; return path_has_perm(current_cred(), path, perm); -- cgit v1.2.3 From 9740d17162deca7138fad7dcf3ef52324832c32b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:21 -0500 Subject: fsnotify: pass optional file access range in pre-content event We would like to add file range information to pre-content events. Pass a struct file_range with offset and length to event handler along with pre-content permission event. The offset and length are aligned to page size, but we may need to align them to minimum folio size for filesystems with large block size. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/88eddee301231d814aede27fb4d5b41ae37c9702.1731684329.git.josef@toxicpanda.com --- fs/notify/fanotify/fanotify.c | 11 +++++++++-- fs/notify/fanotify/fanotify.h | 2 ++ fs/notify/fsnotify.c | 18 ++++++++++++++++++ include/linux/fsnotify.h | 4 ++-- include/linux/fsnotify_backend.h | 40 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 71 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 24c7c5df4998..2e6ba94ec405 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -548,9 +548,13 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, return &pevent->fae; } -static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path, +static struct fanotify_event *fanotify_alloc_perm_event(const void *data, + int data_type, gfp_t gfp) { + const struct path *path = fsnotify_data_path(data, data_type); + const struct file_range *range = + fsnotify_data_file_range(data, data_type); struct fanotify_perm_event *pevent; pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); @@ -564,6 +568,9 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path, pevent->hdr.len = 0; pevent->state = FAN_EVENT_INIT; pevent->path = *path; + /* NULL ppos means no range info */ + pevent->ppos = range ? &range->pos : NULL; + pevent->count = range ? 
range->count : 0; path_get(path); return &pevent->fae; @@ -801,7 +808,7 @@ static struct fanotify_event *fanotify_alloc_event( old_memcg = set_active_memcg(group->memcg); if (fanotify_is_perm_event(mask)) { - event = fanotify_alloc_perm_event(path, gfp); + event = fanotify_alloc_perm_event(data, data_type, gfp); } else if (fanotify_is_error_event(mask)) { event = fanotify_alloc_error_event(group, fsid, data, data_type, &hash); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index e5ab33cae6a7..93598b7d5952 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -425,6 +425,8 @@ FANOTIFY_PE(struct fanotify_event *event) struct fanotify_perm_event { struct fanotify_event fae; struct path path; + const loff_t *ppos; /* optional file range info */ + size_t count; u32 response; /* userspace answer to the event */ unsigned short state; /* state of the event */ int fd; /* fd we passed to userspace for this event */ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 32b461c5d04b..8ee495a58d0a 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -203,6 +203,24 @@ static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, return mask & marks_mask & ALL_FSNOTIFY_EVENTS; } +/* Report pre-content event with optional range info */ +int fsnotify_pre_content(const struct path *path, const loff_t *ppos, + size_t count) +{ + struct file_range range; + + /* Report page aligned range only when pos is known */ + if (!ppos) + return fsnotify_path(path, FS_PRE_ACCESS); + + range.path = path; + range.pos = PAGE_ALIGN_DOWN(*ppos); + range.count = PAGE_ALIGN(*ppos + count) - range.pos; + + return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range, + FSNOTIFY_EVENT_FILE_RANGE); +} + /* * Notify this dentry's parent about a child's events with child name info * if parent is watching or if inode/sb/mount are interested in events with diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index d91aa064f0e4..87044acf8e79 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -154,7 +154,7 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, * read()/write() and other types of access generate pre-content events. 
*/ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { - int ret = fsnotify_path(&file->f_path, FS_PRE_ACCESS); + int ret = fsnotify_pre_content(&file->f_path, ppos, count); if (ret) return ret; @@ -171,7 +171,7 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, } /* - * fsnotify_file_perm - permission hook before file access + * fsnotify_file_perm - permission hook before file access (unknown range) */ static inline int fsnotify_file_perm(struct file *file, int perm_mask) { diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 9bda354b5538..0d24a21a8e60 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -294,6 +294,7 @@ static inline void fsnotify_group_assert_locked(struct fsnotify_group *group) /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, + FSNOTIFY_EVENT_FILE_RANGE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, @@ -306,6 +307,17 @@ struct fs_error_report { struct super_block *sb; }; +struct file_range { + const struct path *path; + loff_t pos; + size_t count; +}; + +static inline const struct path *file_range_path(const struct file_range *range) +{ + return range->path; +} + static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { @@ -315,6 +327,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type) return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); + case FSNOTIFY_EVENT_FILE_RANGE: + return d_inode(file_range_path(data)->dentry); case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *)data)->inode; default: @@ -330,6 +344,8 @@ static inline struct dentry *fsnotify_data_dentry(const void *data, int data_typ return (struct dentry *)data; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data)->dentry; default: return NULL; } @@ -341,6 +357,8 @@ static inline const struct path *fsnotify_data_path(const void *data, switch (data_type) { case FSNOTIFY_EVENT_PATH: return data; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data); default: return NULL; } @@ -356,6 +374,8 @@ static inline struct super_block *fsnotify_data_sb(const void *data, return ((struct dentry *)data)->d_sb; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry->d_sb; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data)->dentry->d_sb; case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *) data)->sb; default: @@ -375,6 +395,18 @@ static inline struct fs_error_report *fsnotify_data_error_report( } } +static inline const struct file_range *fsnotify_data_file_range( + const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_FILE_RANGE: + return (struct file_range *)data; + default: + return NULL; + } +} + /* * Index to merged marks iterator array that correlates to a type of watch. 
* The type of watched object can be deduced from the iterator type, but not @@ -863,9 +895,17 @@ static inline void fsnotify_init_event(struct fsnotify_event *event) { INIT_LIST_HEAD(&event->list); } +int fsnotify_pre_content(const struct path *path, const loff_t *ppos, + size_t count); #else +static inline int fsnotify_pre_content(const struct path *path, + const loff_t *ppos, size_t count) +{ + return 0; +} + static inline int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie) -- cgit v1.2.3 From 4acf3bc76e521b47acebcefc6312c97992f4ca29 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:22 -0500 Subject: fsnotify: generate pre-content permission event on truncate Generate FS_PRE_ACCESS event before truncate, without sb_writers held. Move the security hooks also before sb_start_write() to conform with other security hooks (e.g. in write, fallocate). The event will have a range info of the page surrounding the new size to provide an opportunity to fill the conetnt at the end of file before truncating to non-page aligned size. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/23af8201db6ac2efdea94f09ab067d81ba5de7a7.1731684329.git.josef@toxicpanda.com --- fs/open.c | 31 +++++++++++++++++++++---------- include/linux/fsnotify.h | 20 ++++++++++++++++++++ 2 files changed, 41 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 1a9483872e1f..d11d373dca80 100644 --- a/fs/open.c +++ b/fs/open.c @@ -81,14 +81,18 @@ long vfs_truncate(const struct path *path, loff_t length) if (!S_ISREG(inode->i_mode)) return -EINVAL; - error = mnt_want_write(path->mnt); - if (error) - goto out; - idmap = mnt_idmap(path->mnt); error = inode_permission(idmap, inode, MAY_WRITE); if (error) - goto mnt_drop_write_and_out; + return error; + + error = fsnotify_truncate_perm(path, length); + if (error) + return error; + + error = mnt_want_write(path->mnt); + if (error) + return error; error = -EPERM; if (IS_APPEND(inode)) @@ -114,7 +118,7 @@ put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); -out: + return error; } EXPORT_SYMBOL_GPL(vfs_truncate); @@ -175,11 +179,18 @@ long do_ftruncate(struct file *file, loff_t length, int small) /* Check IS_APPEND on real upper inode */ if (IS_APPEND(file_inode(file))) return -EPERM; - sb_start_write(inode->i_sb); + error = security_file_truncate(file); - if (!error) - error = do_truncate(file_mnt_idmap(file), dentry, length, - ATTR_MTIME | ATTR_CTIME, file); + if (error) + return error; + + error = fsnotify_truncate_perm(&file->f_path, length); + if (error) + return error; + + sb_start_write(inode->i_sb); + error = do_truncate(file_mnt_idmap(file), dentry, length, + ATTR_MTIME | ATTR_CTIME, file); sb_end_write(inode->i_sb); return error; diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 87044acf8e79..1a9ef8f6784d 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -170,6 +170,21 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } +/* + * fsnotify_truncate_perm - permission hook before file truncate + */ +static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) +{ + struct inode *inode = d_inode(path->dentry); + + if (!(inode->i_sb->s_iflags & SB_I_ALLOW_HSM) || + !fsnotify_sb_has_priority_watchers(inode->i_sb, + 
FSNOTIFY_PRIO_PRE_CONTENT)) + return 0; + + return fsnotify_pre_content(path, &length, 0); +} + /* * fsnotify_file_perm - permission hook before file access (unknown range) */ @@ -208,6 +223,11 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return 0; } +static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) +{ + return 0; +} + static inline int fsnotify_file_perm(struct file *file, int perm_mask) { return 0; -- cgit v1.2.3 From 4f8afa33817a6420398d1c177c6e220a05081f51 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:23 -0500 Subject: fanotify: introduce FAN_PRE_ACCESS permission event Similar to FAN_ACCESS_PERM permission event, but it is only allowed with class FAN_CLASS_PRE_CONTENT and only allowed on regular files and dirs. Unlike FAN_ACCESS_PERM, it is safe to write to the file being accessed in the context of the event handler. This pre-content event is meant to be used by hierarchical storage managers that want to fill the content of files on first read access. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/b80986f8d5b860acea2c9a73c0acd93587be5fe4.1731684329.git.josef@toxicpanda.com --- fs/notify/fanotify/fanotify.c | 3 ++- fs/notify/fanotify/fanotify_user.c | 35 ++++++++++++++++++++++++++++++----- include/linux/fanotify.h | 14 ++++++++++---- include/uapi/linux/fanotify.h | 2 ++ 4 files changed, 44 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 2e6ba94ec405..da6c3c1c7edf 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -916,8 +916,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR); BUILD_BUG_ON(FAN_RENAME != FS_RENAME); + BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 21); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22); mask = fanotify_group_event_mask(group, iter_info, &match_mask, mask, data, data_type, dir); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 456cc3e92c88..08e4d8659ef5 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1287,7 +1287,7 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group) } static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, - unsigned int fan_flags) + __u32 mask, unsigned int fan_flags) { /* * Non evictable mark cannot be downgraded to evictable mark. @@ -1314,6 +1314,11 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) return -EEXIST; + /* For now pre-content events are not generated for directories */ + mask |= fsn_mark->mask; + if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) + return -EEXIST; + return 0; } @@ -1340,7 +1345,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, /* * Check if requested mark flags conflict with an existing mark flags. 
*/ - ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags); + ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags); if (ret) goto out; @@ -1640,11 +1645,23 @@ static int fanotify_events_supported(struct fsnotify_group *group, unsigned int flags) { unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + bool is_dir = d_is_dir(path->dentry); /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || (mask & FAN_RENAME) || (flags & FAN_MARK_IGNORE); + /* + * Filesystems need to opt-into pre-content evnets (a.k.a HSM) + * and they are only supported on regular files and directories. + */ + if (mask & FANOTIFY_PRE_CONTENT_EVENTS) { + if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM)) + return -EOPNOTSUPP; + if (!is_dir && !d_is_reg(path->dentry)) + return -EINVAL; + } + /* * Some filesystems such as 'proc' acquire unusual locks when opening * files. For them fanotify permission events have high chances of @@ -1677,7 +1694,7 @@ static int fanotify_events_supported(struct fsnotify_group *group, * but because we always allowed it, error only when using new APIs. */ if (strict_dir_events && mark_type == FAN_MARK_INODE && - !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) + !is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) return -ENOTDIR; return 0; @@ -1778,10 +1795,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EPERM; /* - * Permission events require minimum priority FAN_CLASS_CONTENT. + * Permission events are not allowed for FAN_CLASS_NOTIF. + * Pre-content permission events are not allowed for FAN_CLASS_CONTENT. */ if (mask & FANOTIFY_PERM_EVENTS && - group->priority < FSNOTIFY_PRIO_CONTENT) + group->priority == FSNOTIFY_PRIO_NORMAL) + return -EINVAL; + else if (mask & FANOTIFY_PRE_CONTENT_EVENTS && + group->priority == FSNOTIFY_PRIO_CONTENT) return -EINVAL; if (mask & FAN_FS_ERROR && @@ -1816,6 +1837,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) return -EINVAL; + /* Pre-content events are not currently generated for directories. 
*/ + if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) + return -EINVAL; + if (mark_cmd == FAN_MARK_FLUSH) { if (mark_type == FAN_MARK_MOUNT) fsnotify_clear_vfsmount_marks_by_group(group); diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 89ff45bd6f01..c747af064d2c 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -89,6 +89,16 @@ #define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE | \ FAN_RENAME) +/* Content events can be used to inspect file content */ +#define FANOTIFY_CONTENT_PERM_EVENTS (FAN_OPEN_PERM | FAN_OPEN_EXEC_PERM | \ + FAN_ACCESS_PERM) +/* Pre-content events can be used to fill file content */ +#define FANOTIFY_PRE_CONTENT_EVENTS (FAN_PRE_ACCESS) + +/* Events that require a permission response from user */ +#define FANOTIFY_PERM_EVENTS (FANOTIFY_CONTENT_PERM_EVENTS | \ + FANOTIFY_PRE_CONTENT_EVENTS) + /* Events that can be reported with event->fd */ #define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS) @@ -104,10 +114,6 @@ FANOTIFY_INODE_EVENTS | \ FANOTIFY_ERROR_EVENTS) -/* Events that require a permission response from user */ -#define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \ - FAN_OPEN_EXEC_PERM) - /* Extra flags that may be reported with event or control handling of events */ #define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR) diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 79072b6894f2..7596168c80eb 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -27,6 +27,8 @@ #define FAN_OPEN_EXEC_PERM 0x00040000 /* File open/exec in perm check */ /* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ +#define FAN_PRE_ACCESS 0x00100000 /* Pre-content access hook */ + #define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */ #define FAN_RENAME 0x10000000 /* File was renamed */ -- cgit v1.2.3 From 870499bc1d4dc04cba1f63dd5e7bc02b983e2458 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:24 -0500 Subject: fanotify: report file range info with pre-content events With group class FAN_CLASS_PRE_CONTENT, report offset and length info along with FAN_PRE_ACCESS pre-content events. This information is meant to be used by hierarchical storage managers that want to fill partial content of files on first access to range. 
Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/b90a9e6c809dd3cad5684da90f23ea93ec6ce8c8.1731684329.git.josef@toxicpanda.com --- fs/notify/fanotify/fanotify.h | 8 ++++++++ fs/notify/fanotify/fanotify_user.c | 38 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/fanotify.h | 8 ++++++++ 3 files changed, 54 insertions(+) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 93598b7d5952..7f06355afa1f 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -448,6 +448,14 @@ static inline bool fanotify_is_perm_event(u32 mask) mask & FANOTIFY_PERM_EVENTS; } +static inline bool fanotify_event_has_access_range(struct fanotify_event *event) +{ + if (!(event->mask & FANOTIFY_PRE_CONTENT_EVENTS)) + return false; + + return FANOTIFY_PERM(event)->ppos; +} + static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse) { return container_of(fse, struct fanotify_event, fse); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 08e4d8659ef5..6ef3cc7de5e4 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -121,6 +121,8 @@ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; sizeof(struct fanotify_event_info_pidfd) #define FANOTIFY_ERROR_INFO_LEN \ (sizeof(struct fanotify_event_info_error)) +#define FANOTIFY_RANGE_INFO_LEN \ + (sizeof(struct fanotify_event_info_range)) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -180,6 +182,9 @@ static size_t fanotify_event_len(unsigned int info_mode, if (info_mode & FAN_REPORT_PIDFD) event_len += FANOTIFY_PIDFD_INFO_LEN; + if (fanotify_event_has_access_range(event)) + event_len += FANOTIFY_RANGE_INFO_LEN; + return event_len; } @@ -516,6 +521,30 @@ static int copy_pidfd_info_to_user(int pidfd, return info_len; } +static size_t copy_range_info_to_user(struct fanotify_event *event, + char __user *buf, int count) +{ + struct fanotify_perm_event *pevent = FANOTIFY_PERM(event); + struct fanotify_event_info_range info = { }; + size_t info_len = FANOTIFY_RANGE_INFO_LEN; + + if (WARN_ON_ONCE(info_len > count)) + return -EFAULT; + + if (WARN_ON_ONCE(!pevent->ppos)) + return -EINVAL; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_RANGE; + info.hdr.len = info_len; + info.offset = *(pevent->ppos); + info.count = pevent->count; + + if (copy_to_user(buf, &info, info_len)) + return -EFAULT; + + return info_len; +} + static int copy_info_records_to_user(struct fanotify_event *event, struct fanotify_info *info, unsigned int info_mode, int pidfd, @@ -637,6 +666,15 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } + if (fanotify_event_has_access_range(event)) { + ret = copy_range_info_to_user(event, buf, count); + if (ret < 0) + return ret; + buf += ret; + count -= ret; + total_bytes += ret; + } + return total_bytes; } diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 7596168c80eb..0636a9c85dd0 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -146,6 +146,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_DFID 3 #define FAN_EVENT_INFO_TYPE_PIDFD 4 #define FAN_EVENT_INFO_TYPE_ERROR 5 +#define FAN_EVENT_INFO_TYPE_RANGE 6 /* Special info types for FAN_RENAME */ #define FAN_EVENT_INFO_TYPE_OLD_DFID_NAME 10 @@ -192,6 +193,13 @@ struct fanotify_event_info_error { __u32 error_count; }; +struct fanotify_event_info_range { + struct 
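Taken together with FAN_CLASS_PRE_CONTENT and FAN_PRE_ACCESS from the previous patches, a hierarchical storage manager consuming these range records could be structured roughly like the sketch below (the "/mnt" path, buffer size and the fill_range() helper are made up, and error handling is omitted):

#include <fcntl.h>
#include <unistd.h>
#include <sys/fanotify.h>

/* hypothetical helper: fetch [offset, offset + count) from secondary
 * storage and write it into the file behind fd */
static void fill_range(int fd, __u64 offset, __u64 count)
{
}

int main(void)
{
	char buf[4096];
	ssize_t len;
	int fd = fanotify_init(FAN_CLASS_PRE_CONTENT | FAN_CLOEXEC, O_RDWR);

	/* the filesystem backing /mnt must have set SB_I_ALLOW_HSM */
	fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
		      FAN_PRE_ACCESS, AT_FDCWD, "/mnt");

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		struct fanotify_event_metadata *ev = (void *)buf;

		for (; FAN_EVENT_OK(ev, len); ev = FAN_EVENT_NEXT(ev, len)) {
			char *info = (char *)ev + ev->metadata_len;
			char *end = (char *)ev + ev->event_len;

			while (info < end) {
				struct fanotify_event_info_header *hdr = (void *)info;

				if (hdr->info_type == FAN_EVENT_INFO_TYPE_RANGE) {
					struct fanotify_event_info_range *r = (void *)hdr;

					fill_range(ev->fd, r->offset, r->count);
				}
				info += hdr->len;
			}

			struct fanotify_response resp = {
				.fd = ev->fd,
				/* a later patch in this series adds
				 * FAN_DENY_ERRNO(err) as an alternative */
				.response = FAN_ALLOW,
			};

			write(fd, &resp, sizeof(resp));
			close(ev->fd);
		}
	}
	return 0;
}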
fanotify_event_info_header hdr; + __u32 pad; + __u64 offset; + __u64 count; +}; + /* * User space may need to record additional information about its decision. * The extra information type records what kind of information is included. -- cgit v1.2.3 From b4b2ff4f61ded819bfa22e50fdec7693f51cbbee Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:25 -0500 Subject: fanotify: allow to set errno in FAN_DENY permission response With FAN_DENY response, user trying to perform the filesystem operation gets an error with errno set to EPERM. It is useful for hierarchical storage management (HSM) service to be able to deny access for reasons more diverse than EPERM, for example EAGAIN, if HSM could retry the operation later. Allow fanotify groups with priority FAN_CLASSS_PRE_CONTENT to responsd to permission events with the response value FAN_DENY_ERRNO(errno), instead of FAN_DENY to return a custom error. Limit custom error values to errors expected on read(2)/write(2) and open(2) of regular files. This list could be extended in the future. Userspace can test for legitimate values of FAN_DENY_ERRNO(errno) by writing a response to an fanotify group fd with a value of FAN_NOFD in the fd field of the response. The change in fanotify_response is backward compatible, because errno is written in the high 8 bits of the 32bit response field and old kernels reject respose value with high bits set. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/1e5fb6af84b69ca96b5c849fa5f10bdf4d1dc414.1731684329.git.josef@toxicpanda.com --- fs/notify/fanotify/fanotify.c | 17 +++++++++++++---- fs/notify/fanotify/fanotify.h | 5 +++++ fs/notify/fanotify/fanotify_user.c | 29 +++++++++++++++++++++++++++-- include/linux/fanotify.h | 4 +++- include/uapi/linux/fanotify.h | 7 +++++++ 5 files changed, 55 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index da6c3c1c7edf..95646f7c46ca 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -223,7 +223,7 @@ static int fanotify_get_response(struct fsnotify_group *group, struct fanotify_perm_event *event, struct fsnotify_iter_info *iter_info) { - int ret; + int ret, errno; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -262,14 +262,23 @@ static int fanotify_get_response(struct fsnotify_group *group, ret = 0; break; case FAN_DENY: + /* Check custom errno from pre-content events */ + errno = fanotify_get_response_errno(event->response); + if (errno) { + ret = -errno; + break; + } + fallthrough; default: ret = -EPERM; } /* Check if the response should be audited */ - if (event->response & FAN_AUDIT) - audit_fanotify(event->response & ~FAN_AUDIT, - &event->audit_rule); + if (event->response & FAN_AUDIT) { + u32 response = event->response & + (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS); + audit_fanotify(response & ~FAN_AUDIT, &event->audit_rule); + } pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 7f06355afa1f..c12cbc270539 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -528,3 +528,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) return mflags; } + +static inline u32 fanotify_get_response_errno(int res) +{ + return (res >> FAN_ERRNO_SHIFT) & FAN_ERRNO_MASK; +} diff --git a/fs/notify/fanotify/fanotify_user.c 
b/fs/notify/fanotify/fanotify_user.c index 6ef3cc7de5e4..19435cd2c41f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -327,11 +327,12 @@ static int process_access_response(struct fsnotify_group *group, struct fanotify_perm_event *event; int fd = response_struct->fd; u32 response = response_struct->response; + int errno = fanotify_get_response_errno(response); int ret = info_len; struct fanotify_response_info_audit_rule friar; - pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__, - group, fd, response, info, info_len); + pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n", + __func__, group, fd, response, errno, info, info_len); /* * make sure the response is valid, if invalid we do nothing and either * userspace can send a valid response or we will clean it up after the @@ -342,7 +343,31 @@ static int process_access_response(struct fsnotify_group *group, switch (response & FANOTIFY_RESPONSE_ACCESS) { case FAN_ALLOW: + if (errno) + return -EINVAL; + break; case FAN_DENY: + /* Custom errno is supported only for pre-content groups */ + if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT) + return -EINVAL; + + /* + * Limit errno to values expected on open(2)/read(2)/write(2) + * of regular files. + */ + switch (errno) { + case 0: + case EIO: + case EPERM: + case EBUSY: + case ETXTBSY: + case EAGAIN: + case ENOSPC: + case EDQUOT: + break; + default: + return -EINVAL; + } break; default: return -EINVAL; diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index c747af064d2c..78f660ebc318 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -132,7 +132,9 @@ /* These masks check for invalid bits in permission responses. */ #define FANOTIFY_RESPONSE_ACCESS (FAN_ALLOW | FAN_DENY) #define FANOTIFY_RESPONSE_FLAGS (FAN_AUDIT | FAN_INFO) -#define FANOTIFY_RESPONSE_VALID_MASK (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS) +#define FANOTIFY_RESPONSE_VALID_MASK \ + (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS | \ + (FAN_ERRNO_MASK << FAN_ERRNO_SHIFT)) /* Do not use these old uapi constants internally */ #undef FAN_ALL_CLASS_BITS diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 0636a9c85dd0..bd8167979707 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -235,6 +235,13 @@ struct fanotify_response_info_audit_rule { /* Legit userspace responses to a _PERM event */ #define FAN_ALLOW 0x01 #define FAN_DENY 0x02 +/* errno other than EPERM can specified in upper byte of deny response */ +#define FAN_ERRNO_BITS 8 +#define FAN_ERRNO_SHIFT (32 - FAN_ERRNO_BITS) +#define FAN_ERRNO_MASK ((1 << FAN_ERRNO_BITS) - 1) +#define FAN_DENY_ERRNO(err) \ + (FAN_DENY | ((((__u32)(err)) & FAN_ERRNO_MASK) << FAN_ERRNO_SHIFT)) + #define FAN_AUDIT 0x10 /* Bitmask to create audit record for result */ #define FAN_INFO 0x20 /* Bitmask to indicate additional information */ -- cgit v1.2.3 From bb57c81e97e0082abfb0406ed6f67c615c3d206c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Dec 2024 11:06:32 +0000 Subject: cifs: Fix rmdir failure due to ongoing I/O on deleted file The cifs_io_request struct (a wrapper around netfs_io_request) holds open the file on the server, even beyond the local Linux file being closed. 
This can cause problems with Windows-based filesystems as the file's name still exists after deletion until the file is closed, preventing the parent directory from being removed and causing spurious test failures in xfstests due to inability to remove a directory. The symptom looks something like this in the test output: rm: cannot remove '/mnt/scratch/test/p0/d3': Directory not empty rm: cannot remove '/mnt/scratch/test/p1/dc/dae': Directory not empty Fix this by waiting in unlink and rename for any outstanding I/O requests to be completed on the target file before removing that file. Note that this doesn't prevent Linux from trying to start new requests after deletion if it still has the file open locally - something that's perfectly acceptable on a UNIX system. Note also that whilst I've marked this as fixing the commit to make cifs use netfslib, I don't know that it won't occur before that. Fixes: 3ee1a1fc3981 ("cifs: Cut over to using netfslib") Signed-off-by: David Howells Acked-by: Paulo Alcantara (Red Hat) cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 668098d3b108..f146e06c97eb 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1947,6 +1947,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) goto unlink_out; } + netfs_wait_for_outstanding_io(inode); cifs_close_deferred_file_under_dentry(tcon, full_path); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & @@ -2464,8 +2465,10 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, } cifs_close_deferred_file_under_dentry(tcon, from_name); - if (d_inode(target_dentry) != NULL) + if (d_inode(target_dentry) != NULL) { + netfs_wait_for_outstanding_io(d_inode(target_dentry)); cifs_close_deferred_file_under_dentry(tcon, to_name); + } rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); -- cgit v1.2.3 From 8676c4dfae5bea2582f9a999fd7463e3c79a3412 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 9 Dec 2024 12:07:09 +0100 Subject: cifs: Use str_yes_no() helper in cifs_ses_add_channel() Remove hard-coded strings by using the str_yes_no() helper function. Signed-off-by: Thorsten Blum Signed-off-by: Steve French --- fs/smb/client/sess.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 0bb77f9ec686..3306fb655136 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -488,11 +488,11 @@ cifs_ses_add_channel(struct cifs_ses *ses, if (iface->sockaddr.ss_family == AF_INET) cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI4)\n", - ses, iface->speed, iface->rdma_capable ? "yes" : "no", + ses, iface->speed, str_yes_no(iface->rdma_capable), &ipv4->sin_addr); else cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI6)\n", - ses, iface->speed, iface->rdma_capable ? 
"yes" : "no", + ses, iface->speed, str_yes_no(iface->rdma_capable), &ipv6->sin6_addr); /* -- cgit v1.2.3 From 633609c48a358134d3f8ef8241dff24841577f58 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Tue, 10 Dec 2024 10:21:48 -0300 Subject: smb: client: destroy cfid_put_wq on module exit Fix potential problem in rmmod Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index c9f9b6e97964..9d96b833015c 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -2043,6 +2043,7 @@ exit_cifs(void) destroy_workqueue(decrypt_wq); destroy_workqueue(fileinfo_put_wq); destroy_workqueue(serverclose_wq); + destroy_workqueue(cfid_put_wq); destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } -- cgit v1.2.3 From b44679c63e4d3ac820998b6bd59fba89a72ad3e7 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 9 Dec 2024 19:42:39 +0800 Subject: iomap: pass byte granular end position to iomap_add_to_ioend This is a preparatory patch for fixing zero padding issues in concurrent append write scenarios. In the following patches, we need to obtain byte-granular writeback end position for io_size trimming after EOF handling. Due to concurrent writeback and truncate operations, inode size may shrink. Resampling inode size would force writeback code to handle the newly appeared post-EOF blocks, which is undesirable. As Dave explained in [1]: "Really, the issue is that writeback mappings have to be able to handle the range being mapped suddenly appear to be beyond EOF. This behaviour is a longstanding writeback constraint, and is what iomap_writepage_handle_eof() is attempting to handle. We handle this by only sampling i_size_read() whilst we have the folio locked and can determine the action we should take with that folio (i.e. nothing, partial zeroing, or skip altogether). Once we've made the decision that the folio is within EOF and taken action on it (i.e. moved the folio to writeback state), we cannot then resample the inode size because a truncate may have started and changed the inode size." To avoid resampling inode size after EOF handling, we convert end_pos to byte-granular writeback position and return it from EOF handling function. Since iomap_set_range_dirty() can handle unaligned lengths, this conversion has no impact on it. However, iomap_find_dirty_range() requires aligned start and end range to find dirty blocks within the given range, so the end position needs to be rounded up when passed to it. 
LINK [1]: https://lore.kernel.org/linux-xfs/Z1Gg0pAa54MoeYME@localhost.localdomain/ Signed-off-by: Long Li Link: https://lore.kernel.org/r/20241209114241.3725722-2-leo.lilong@huawei.com Reviewed-by: Brian Foster Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 955f19e27e47..bcc7831d03af 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1774,7 +1774,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) */ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, loff_t pos, unsigned len) + struct inode *inode, loff_t pos, loff_t end_pos, + unsigned len) { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); @@ -1800,8 +1801,8 @@ new_ioend: static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, u64 pos, unsigned dirty_len, - unsigned *count) + struct inode *inode, u64 pos, u64 end_pos, + unsigned dirty_len, unsigned *count) { int error; @@ -1826,7 +1827,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, break; default: error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, - map_len); + end_pos, map_len); if (!error) (*count)++; break; @@ -1897,11 +1898,11 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, * remaining memory is zeroed when mapped, and writes to that * region are not written out to the file. * - * Also adjust the writeback range to skip all blocks entirely - * beyond i_size. + * Also adjust the end_pos to the end of file and skip writeback + * for all blocks entirely beyond i_size. */ folio_zero_segment(folio, poff, folio_size(folio)); - *end_pos = round_up(isize, i_blocksize(inode)); + *end_pos = isize; } return true; @@ -1914,6 +1915,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct inode *inode = folio->mapping->host; u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); + u64 end_aligned = 0; unsigned count = 0; int error = 0; u32 rlen; @@ -1955,9 +1957,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, /* * Walk through the folio to find dirty areas to write back. */ - while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) { + end_aligned = round_up(end_pos, i_blocksize(inode)); + while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, - pos, rlen, &count); + pos, end_pos, rlen, &count); if (error) break; pos += rlen; -- cgit v1.2.3 From 51d20d1dacbec589d459e11fc88fbca419f84a99 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 9 Dec 2024 19:42:40 +0800 Subject: iomap: fix zero padding data issue in concurrent append writes During concurrent append writes to XFS filesystem, zero padding data may appear in the file after power failure. This happens due to imprecise disk size updates when handling write completion. 
Consider this scenario with concurrent append writes to the same file: Thread 1: Thread 2: ------------ ----------- write [A, A+B] update inode size to A+B submit I/O [A, A+BS] write [A+B, A+B+C] update inode size to A+B+C After reboot: 1) with A+B+C < A+BS, the file has zero padding in range [A+B, A+B+C] |< Block Size (BS) >| |DDDDDDDDDDDDDDDD0000000000000000| ^ ^ ^ A A+B A+B+C (EOF) 2) with A+B+C > A+BS, the file has zero padding in range [A+B, A+BS] |< Block Size (BS) >|< Block Size (BS) >| |DDDDDDDDDDDDDDDD0000000000000000|00000000000000000000000000000000| ^ ^ ^ ^ A A+B A+BS A+B+C (EOF) D = Valid Data 0 = Zero Padding The issue stems from the disk size being set to min(io_offset + io_size, inode->i_size) at I/O completion. Since io_offset+io_size has block size granularity, it may exceed the actual valid file data size. In the case of concurrent append writes, inode->i_size may be larger than the actual range of valid file data written to disk, leading to inaccurate disk size updates. This patch modifies the meaning of io_size to represent the size of valid data within EOF in an ioend. If the ioend spans beyond i_size, io_size will be trimmed to provide the file with more accurate size information. This is particularly useful for on-disk size updates at completion time. After this change, ioends that span i_size will not grow or merge with other ioends in concurrent scenarios. However, the cases that need growth/merging rarely occur, and there seems to be no noticeable performance impact. Although rounding up io_size could enable ioend growth/merging in these scenarios, we decided to keep the code simple after discussion [1]. Another benefit is that it makes the xfs_ioend_is_append() check more accurate, which can reduce unnecessary end bio callbacks of xfs_end_bio() in certain scenarios, such as repeated writes at the file tail without extending the file size. Link [1]: https://patchwork.kernel.org/project/xfs/patch/20241113091907.56937-1-leo.lilong@huawei.com Fixes: ae259a9c8593 ("fs: introduce iomap infrastructure") # goes further back than this Signed-off-by: Long Li Link: https://lore.kernel.org/r/20241209114241.3725722-3-leo.lilong@huawei.com Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/iomap.h | 2 +- 2 files changed, 46 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index bcc7831d03af..54dc27d92781 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1794,7 +1794,52 @@ new_ioend: if (ifs) atomic_add(len, &ifs->write_bytes_pending); + + /* + * Clamp io_offset and io_size to the incore EOF so that ondisk + * file size updates in the ioend completion are byte-accurate.
+ * This avoids recovering files with zeroed tail regions when + * writeback races with appending writes: + * + * Thread 1: Thread 2: + * ------------ ----------- + * write [A, A+B] + * update inode size to A+B + * submit I/O [A, A+BS] + * write [A+B, A+B+C] + * update inode size to A+B+C + * + * + * + * After reboot: + * 1) with A+B+C < A+BS, the file has zero padding in range + * [A+B, A+B+C] + * + * |< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000| + * ^ ^ ^ + * A A+B A+B+C + * (EOF) + * + * 2) with A+B+C > A+BS, the file has zero padding in range + * [A+B, A+BS] + * + * |< Block Size (BS) >|< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| + * ^ ^ ^ ^ + * A A+B A+BS A+B+C + * (EOF) + * + * D = Valid Data + * 0 = Zero Padding + * + * Note that this defeats the ability to chain the ioends of + * appending writes. + */ wpc->ioend->io_size += len; + if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos) + wpc->ioend->io_size = end_pos - wpc->ioend->io_offset; + wbc_account_cgroup_owner(wbc, folio, len); return 0; } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5675af6b740c..75bf54e76f3b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -335,7 +335,7 @@ struct iomap_ioend { u16 io_type; u16 io_flags; /* IOMAP_F_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of the extent */ + size_t io_size; /* size of data within eof */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio io_bio; /* MUST BE LAST! */ -- cgit v1.2.3 From 7f4796a46571ced5d3d5b0942e1bfea1eedaaecd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Nov 2024 10:30:30 -0500 Subject: xfs: add pre-content fsnotify hook for DAX faults xfs has it's own handling for DAX faults, so we need to add the pre-content fsnotify hook for this case. Other faults go through filemap_fault so they're handled properly there. Signed-off-by: Josef Bacik Signed-off-by: Jan Kara Link: https://patch.msgid.link/9eccdf59a65b72f0a1a5e2f2b9bff8eda2d4f2d9.1731684329.git.josef@toxicpanda.com --- fs/xfs/xfs_file.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4a0b7de4f7ae..c488ae26b23d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1443,6 +1443,9 @@ xfs_dax_read_fault( trace_xfs_read_fault(ip, order); + ret = filemap_fsnotify_fault(vmf); + if (unlikely(ret)) + return ret; xfs_ilock(ip, XFS_MMAPLOCK_SHARED); ret = xfs_dax_fault_locked(vmf, order, false); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); @@ -1471,6 +1474,16 @@ xfs_write_fault( vm_fault_t ret; trace_xfs_write_fault(ip, order); + /* + * Usually we get here from ->page_mkwrite callback but in case of DAX + * we will get here also for ordinary write fault. Handle HSM + * notifications for that case. + */ + if (IS_DAX(inode)) { + ret = filemap_fsnotify_fault(vmf); + if (unlikely(ret)) + return ret; + } sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); -- cgit v1.2.3 From b722e40be2bda7a688f74e1a794121e84f717fdc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Nov 2024 10:30:31 -0500 Subject: btrfs: disable defrag on pre-content watched files We queue up inodes to be defrag'ed asynchronously, which means we do not have their original file for readahead. This means that the code to skip readahead on pre-content watched files will not run, and we could potentially read in empty pages. 
Handle this corner case by disabling defrag on files that are currently being watched for pre-content events. Signed-off-by: Josef Bacik Signed-off-by: Jan Kara Link: https://patch.msgid.link/4cc5bcea13db7904174353d08e85157356282a59.1731684329.git.josef@toxicpanda.com --- fs/btrfs/ioctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c9302d193187..51b439537049 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2635,6 +2635,15 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } + /* + * Don't allow defrag on pre-content watched files, as it could + * populate the page cache with 0's via readahead. + */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { + ret = -EINVAL; + goto out; + } + if (argp) { if (copy_from_user(&range, argp, sizeof(range))) { ret = -EFAULT; -- cgit v1.2.3 From bb480760ffc7018e21ee6f60241c2b99ff26ee0e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 Nov 2024 11:18:23 +0100 Subject: ext4: add pre-content fsnotify hook for DAX faults ext4 has its own handling for DAX faults. Add the pre-content fsnotify hook for this case. Signed-off-by: Jan Kara --- fs/ext4/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3bd96c3d4cd0..a5205149adba 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -756,6 +756,9 @@ retry: return VM_FAULT_SIGBUS; } } else { + result = filemap_fsnotify_fault(vmf); + if (unlikely(result)) + return result; filemap_invalidate_lock_shared(mapping); } result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); -- cgit v1.2.3 From 5121711eb8dbcbed70b1db429a4665f413844164 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Nov 2024 10:30:32 -0500 Subject: fs: enable pre-content events on supported file systems Now that all the code has been added for pre-content events, and the various file systems that need the page fault hooks for fsnotify have been updated, add SB_I_ALLOW_HSM to the supported file systems. Signed-off-by: Josef Bacik Signed-off-by: Jan Kara Link: https://patch.msgid.link/46960dcb2725fa0317895ed66a8409ba1c306a82.1731684329.git.josef@toxicpanda.com --- fs/btrfs/super.c | 2 +- fs/ext4/super.c | 3 +++ fs/xfs/xfs_super.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 97a85d180b61..fe6ecc3f1cab 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -961,7 +961,7 @@ static int btrfs_fill_super(struct super_block *sb, #endif sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; - sb->s_iflags |= SB_I_CGROUPWB; + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; err = super_setup_bdi(sb); if (err) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 785809f33ff4..029fc5fc4ffe 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5312,6 +5312,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) /* i_version is always enabled now */ sb->s_flags |= SB_I_VERSION; + /* HSM events are allowed by default. 
*/ + sb->s_iflags |= SB_I_ALLOW_HSM; + err = ext4_check_feature_compatibility(sb, es, silent); if (err) goto failed_mount; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 394fdf3bb535..8524b9d42873 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1730,7 +1730,7 @@ xfs_fs_fill_super( sb->s_time_max = XFS_LEGACY_TIME_MAX; } trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max); - sb->s_iflags |= SB_I_CGROUPWB; + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; set_posix_acl_flag(sb); -- cgit v1.2.3 From 0357ef03c94ef835bd44a0658b8edb672a9dbf51 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 28 Nov 2024 15:25:32 +0100 Subject: fs: don't block write during exec on pre-content watched files Commit 2a010c412853 ("fs: don't block i_writecount during exec") removed the legacy behavior of getting ETXTBSY on an attempt to open an executable file for write while it is being executed. This commit was reverted because an application that depends on this legacy behavior was broken by the change. We need to allow HSM to write into executable files while they are executed, to fill their content on-the-fly. To that end, disable the ETXTBSY legacy behavior for files that are watched by pre-content events. This change is not expected to cause regressions with existing systems which do not have any pre-content event listeners. Signed-off-by: Amir Goldstein Acked-by: Christian Brauner Signed-off-by: Jan Kara Link: https://patch.msgid.link/20241128142532.465176-1-amir73il@gmail.com --- fs/binfmt_elf.c | 4 ++-- fs/binfmt_elf_fdpic.c | 4 ++-- fs/exec.c | 8 ++++---- include/linux/fs.h | 22 ++++++++++++++++++++++ kernel/fork.c | 12 ++++++------ 5 files changed, 36 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 106f0e8af177..8054f44d39cf 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1257,7 +1257,7 @@ out_free_interp: } reloc_func_desc = interp_load_addr; - allow_write_access(interpreter); + exe_file_allow_write_access(interpreter); fput(interpreter); kfree(interp_elf_ex); @@ -1354,7 +1354,7 @@ out_free_dentry: kfree(interp_elf_ex); kfree(interp_elf_phdata); out_free_file: - allow_write_access(interpreter); + exe_file_allow_write_access(interpreter); if (interpreter) fput(interpreter); out_free_ph: diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index f1a7c4875c4a..c13ee8180b17 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -394,7 +394,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) goto error; } - allow_write_access(interpreter); + exe_file_allow_write_access(interpreter); fput(interpreter); interpreter = NULL; } @@ -467,7 +467,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) error: if (interpreter) { - allow_write_access(interpreter); + exe_file_allow_write_access(interpreter); fput(interpreter); } kfree(interpreter_name); diff --git a/fs/exec.c b/fs/exec.c index 98cb7ba9983c..c41cfd35c74c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -912,7 +912,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) path_noexec(&file->f_path)) return ERR_PTR(-EACCES); - err = deny_write_access(file); + err = exe_file_deny_write_access(file); if (err) return ERR_PTR(err); @@ -927,7 +927,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) * Returns ERR_PTR on failure or allocated struct file on success.
* * As this is a wrapper for the internal do_open_execat(), callers - * must call allow_write_access() before fput() on release. Also see + * must call exe_file_allow_write_access() before fput() on release. Also see * do_close_execat(). */ struct file *open_exec(const char *name) @@ -1471,7 +1471,7 @@ static void do_close_execat(struct file *file) { if (!file) return; - allow_write_access(file); + exe_file_allow_write_access(file); fput(file); } @@ -1797,7 +1797,7 @@ static int exec_binprm(struct linux_binprm *bprm) bprm->file = bprm->interpreter; bprm->interpreter = NULL; - allow_write_access(exec); + exe_file_allow_write_access(exec); if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); diff --git a/include/linux/fs.h b/include/linux/fs.h index 3f4d59464965..a1230c40fef1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3095,6 +3095,28 @@ static inline void allow_write_access(struct file *file) if (file) atomic_inc(&file_inode(file)->i_writecount); } + +/* + * Do not prevent write to executable file when watched by pre-content events. + * + * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at + * the time of file open and remains constant for entire lifetime of the file, + * so if pre-content watches are added post execution or removed before the end + * of the execution, it will not cause i_writecount reference leak. + */ +static inline int exe_file_deny_write_access(struct file *exe_file) +{ + if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode))) + return 0; + return deny_write_access(exe_file); +} +static inline void exe_file_allow_write_access(struct file *exe_file) +{ + if (unlikely(!exe_file || FMODE_FSNOTIFY_HSM(exe_file->f_mode))) + return; + allow_write_access(exe_file); +} + static inline bool inode_is_open_for_write(const struct inode *inode) { return atomic_read(&inode->i_writecount) > 0; diff --git a/kernel/fork.c b/kernel/fork.c index 1450b461d196..015c397f47ca 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -625,8 +625,8 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) * We depend on the oldmm having properly denied write access to the * exe_file already. */ - if (exe_file && deny_write_access(exe_file)) - pr_warn_once("deny_write_access() failed in %s\n", __func__); + if (exe_file && exe_file_deny_write_access(exe_file)) + pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__); } #ifdef CONFIG_MMU @@ -1424,13 +1424,13 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) * We expect the caller (i.e., sys_execve) to already denied * write access, so this is unlikely to fail. 
*/ - if (unlikely(deny_write_access(new_exe_file))) + if (unlikely(exe_file_deny_write_access(new_exe_file))) return -EACCES; get_file(new_exe_file); } rcu_assign_pointer(mm->exe_file, new_exe_file); if (old_exe_file) { - allow_write_access(old_exe_file); + exe_file_allow_write_access(old_exe_file); fput(old_exe_file); } return 0; @@ -1471,7 +1471,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) return ret; } - ret = deny_write_access(new_exe_file); + ret = exe_file_deny_write_access(new_exe_file); if (ret) return -EACCES; get_file(new_exe_file); @@ -1483,7 +1483,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) mmap_write_unlock(mm); if (old_exe_file) { - allow_write_access(old_exe_file); + exe_file_allow_write_access(old_exe_file); fput(old_exe_file); } return 0; -- cgit v1.2.3 From 7a4f541873734f41f9645ec147cfae72ef3ffd00 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 11 Dec 2024 12:55:56 -0800 Subject: fuse: fix direct io folio offset and length calculation For the direct io case, the pages from userspace may be part of a huge folio, even if all folios in the page cache for fuse are small. Fix the logic for calculating the offset and length of the folio for the direct io case, which currently incorrectly assumes that all folios encountered are one page size. Fixes: 3b97c3652d91 ("fuse: convert direct io to use folios") Signed-off-by: Joanne Koong Reviewed-by: Jingbo Xu Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 88d0946b5bc9..15b08d6a5739 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1557,18 +1557,22 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, nbytes += ret; - ret += start; - /* Currently, all folios in FUSE are one page */ - nfolios = DIV_ROUND_UP(ret, PAGE_SIZE); - - ap->descs[ap->num_folios].offset = start; - fuse_folio_descs_length_init(ap->descs, ap->num_folios, nfolios); - for (i = 0; i < nfolios; i++) - ap->folios[i + ap->num_folios] = page_folio(pages[i]); - - ap->num_folios += nfolios; - ap->descs[ap->num_folios - 1].length -= - (PAGE_SIZE - ret) & (PAGE_SIZE - 1); + nfolios = DIV_ROUND_UP(ret + start, PAGE_SIZE); + + for (i = 0; i < nfolios; i++) { + struct folio *folio = page_folio(pages[i]); + unsigned int offset = start + + (folio_page_idx(folio, pages[i]) << PAGE_SHIFT); + unsigned int len = min_t(unsigned int, ret, PAGE_SIZE - start); + + ap->descs[ap->num_folios].offset = offset; + ap->descs[ap->num_folios].length = len; + ap->folios[ap->num_folios] = folio; + start = 0; + ret -= len; + ap->num_folios++; + } + nr_pages += nfolios; } kfree(pages); -- cgit v1.2.3 From b10a1e5643e505c367c7e16aa6d8a9a0dc07354b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 3 Dec 2024 15:28:21 +0800 Subject: erofs: fix rare pcluster memory leak after unmounting There may still exist some pcluster with valid reference counts during unmounting. Instead of introducing another synchronization primitive, just try again as unmounting is relatively rare. This approach is similar to z_erofs_cache_invalidate_folio(). It was also reported by syzbot as a UAF due to commit f5ad9f9a603f ("erofs: free pclusters if no cached folio is attached"): BUG: KASAN: slab-use-after-free in do_raw_spin_trylock+0x72/0x1f0 kernel/locking/spinlock_debug.c:123 .. 
queued_spin_trylock include/asm-generic/qspinlock.h:92 [inline] do_raw_spin_trylock+0x72/0x1f0 kernel/locking/spinlock_debug.c:123 __raw_spin_trylock include/linux/spinlock_api_smp.h:89 [inline] _raw_spin_trylock+0x20/0x80 kernel/locking/spinlock.c:138 spin_trylock include/linux/spinlock.h:361 [inline] z_erofs_put_pcluster fs/erofs/zdata.c:959 [inline] z_erofs_decompress_pcluster fs/erofs/zdata.c:1403 [inline] z_erofs_decompress_queue+0x3798/0x3ef0 fs/erofs/zdata.c:1425 z_erofs_decompressqueue_work+0x99/0xe0 fs/erofs/zdata.c:1437 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa68/0x1840 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f2/0x390 kernel/kthread.c:389 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 However, it seems a long outstanding memory leak. Fix it now. Fixes: f5ad9f9a603f ("erofs: free pclusters if no cached folio is attached") Reported-by: syzbot+7ff87b095e7ca0c5ac39@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/674c1235.050a0220.ad585.0032.GAE@google.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241203072821.1885740-1-hsiangkao@linux.alibaba.com --- fs/erofs/zutil.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 75704f58ecfa..0dd65cefce33 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -230,9 +230,10 @@ void erofs_shrinker_unregister(struct super_block *sb) struct erofs_sb_info *const sbi = EROFS_SB(sb); mutex_lock(&sbi->umount_mutex); - /* clean up all remaining pclusters in memory */ - z_erofs_shrink_scan(sbi, ~0UL); - + while (!xa_empty(&sbi->managed_pslots)) { + z_erofs_shrink_scan(sbi, ~0UL); + cond_resched(); + } spin_lock(&erofs_sb_list_lock); list_del(&sbi->list); spin_unlock(&erofs_sb_list_lock); -- cgit v1.2.3 From 1a2180f6859c73c674809f9f82e36c94084682ba Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 27 Nov 2024 16:52:36 +0800 Subject: erofs: fix PSI memstall accounting Max Kellermann recently reported psi_group_cpu.tasks[NR_MEMSTALL] is incorrect in the 6.11.9 kernel. The root cause appears to be that, since the problematic commit, bio can be NULL, causing psi_memstall_leave() to be skipped in z_erofs_submit_queue(). Reported-by: Max Kellermann Closes: https://lore.kernel.org/r/CAKPOu+8tvSowiJADW2RuKyofL_CSkm_SuyZA7ME5vMLWmL6pqw@mail.gmail.com Fixes: 9e2f9d34dd12 ("erofs: handle overlapped pclusters out of crafted images properly") Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241127085236.3538334-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 01f147505487..19ef4ff2a134 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1792,9 +1792,9 @@ drain_io: erofs_fscache_submit_bio(bio); else submit_bio(bio); - if (memstall) - psi_memstall_leave(&pflags); } + if (memstall) + psi_memstall_leave(&pflags); /* * although background is preferred, no one is pending for submission. -- cgit v1.2.3 From e2de3c1bf6a0c99b089bd706a62da8f988918858 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 12 Dec 2024 21:35:01 +0800 Subject: erofs: add erofs_sb_free() helper Unify the common parts of erofs_fc_free() and erofs_kill_sb() as erofs_sb_free(). Thus, fput() in erofs_fc_get_tree() is no longer needed, too. 
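The consolidation follows the usual pattern of one teardown helper shared by the setup error path and the final destructor; a generic standalone sketch of that shape (all names invented for illustration, not the erofs code):

#include <stdlib.h>

struct ctx {
	char *a;
	char *b;
};

/* one helper that knows how to release everything a ctx owns */
static void ctx_free(struct ctx *c)
{
	if (!c)
		return;
	free(c->a);
	free(c->b);
	free(c);
}

static struct ctx *ctx_create(void)
{
	struct ctx *c = calloc(1, sizeof(*c));

	if (!c)
		return NULL;
	c->a = malloc(16);
	c->b = malloc(16);
	if (!c->a || !c->b) {
		ctx_free(c);		/* error path funnels into the helper */
		return NULL;
	}
	return c;
}

int main(void)
{
	ctx_free(ctx_create());		/* normal teardown uses the same helper */
	return 0;
}

Keeping both paths on a single helper is what lets the duplicated kfree()/fput() sequences in erofs_fc_free() and erofs_kill_sb() be removed without the two copies drifting apart over time.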
Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212133504.2047178-1-hsiangkao@linux.alibaba.com --- fs/erofs/super.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/erofs/super.c b/fs/erofs/super.c index c235a8e4315e..de8e3ecc6381 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -703,16 +703,19 @@ static int erofs_fc_get_tree(struct fs_context *fc) GET_TREE_BDEV_QUIET_LOOKUP : 0); #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE if (ret == -ENOTBLK) { + struct file *file; + if (!fc->source) return invalf(fc, "No source specified"); - sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); - if (IS_ERR(sbi->fdev)) - return PTR_ERR(sbi->fdev); + + file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + sbi->fdev = file; if (S_ISREG(file_inode(sbi->fdev)->i_mode) && sbi->fdev->f_mapping->a_ops->read_folio) return get_tree_nodev(fc, erofs_fc_fill_super); - fput(sbi->fdev); } #endif return ret; @@ -763,19 +766,24 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs) kfree(devs); } -static void erofs_fc_free(struct fs_context *fc) +static void erofs_sb_free(struct erofs_sb_info *sbi) { - struct erofs_sb_info *sbi = fc->s_fs_info; - - if (!sbi) - return; - erofs_free_dev_context(sbi->devs); kfree(sbi->fsid); kfree(sbi->domain_id); + if (sbi->fdev) + fput(sbi->fdev); kfree(sbi); } +static void erofs_fc_free(struct fs_context *fc) +{ + struct erofs_sb_info *sbi = fc->s_fs_info; + + if (sbi) /* free here if an error occurs before transferring to sb */ + erofs_sb_free(sbi); +} + static const struct fs_context_operations erofs_context_ops = { .parse_param = erofs_fc_parse_param, .get_tree = erofs_fc_get_tree, @@ -813,15 +821,9 @@ static void erofs_kill_sb(struct super_block *sb) kill_anon_super(sb); else kill_block_super(sb); - - erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev, NULL); erofs_fscache_unregister_fs(sb); - kfree(sbi->fsid); - kfree(sbi->domain_id); - if (sbi->fdev) - fput(sbi->fdev); - kfree(sbi); + erofs_sb_free(sbi); sb->s_fs_info = NULL; } -- cgit v1.2.3 From a440a28ddbdcb861150987b4d6e828631656b92f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:24 -0800 Subject: xfs: fix off-by-one error in fsmap's end_daddr usage In commit ca6448aed4f10a, we created an "end_daddr" variable to fix fsmap reporting when the end of the range requested falls in the middle of an unknown (aka free on the rmapbt) region. Unfortunately, I didn't notice that the code sets end_daddr to the last sector of the device but then uses that quantity to compute the length of the synthesized mapping. Zizhi Wo later observed that when end_daddr isn't set, we still don't report the last fsblock on a device because in that case (aka when info->last is true), the info->high mapping that we pass to xfs_getfsmap_group_helper has a startblock that points to the last fsblock. This is also wrong because the code uses startblock to compute the length of the synthesized mapping. Fix the second problem by setting end_daddr unconditionally, and fix the first problem by setting start_daddr to one past the end of the range to query. Cc: # v6.11 Fixes: ca6448aed4f10a ("xfs: Fix missing interval for missing_owner in xfs fsmap") Signed-off-by: "Darrick J.
Wong" Reported-by: Zizhi Wo Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_fsmap.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 82f2e0dd2249..3290dd8524a6 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -163,7 +163,8 @@ struct xfs_getfsmap_info { xfs_daddr_t next_daddr; /* next daddr we expect */ /* daddr of low fsmap key when we're using the rtbitmap */ xfs_daddr_t low_daddr; - xfs_daddr_t end_daddr; /* daddr of high fsmap key */ + /* daddr of high fsmap key, or the last daddr on the device */ + xfs_daddr_t end_daddr; u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ /* @@ -387,8 +388,8 @@ xfs_getfsmap_group_helper( * we calculated from userspace's high key to synthesize the record. * Note that if the btree query found a mapping, there won't be a gap. */ - if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) - frec->start_daddr = info->end_daddr; + if (info->last) + frec->start_daddr = info->end_daddr + 1; else frec->start_daddr = xfs_gbno_to_daddr(xg, startblock); @@ -736,11 +737,10 @@ xfs_getfsmap_rtdev_rtbitmap_helper( * we calculated from userspace's high key to synthesize the record. * Note that if the btree query found a mapping, there won't be a gap. */ - if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) { - frec.start_daddr = info->end_daddr; - } else { + if (info->last) + frec.start_daddr = info->end_daddr + 1; + else frec.start_daddr = xfs_rtb_to_daddr(mp, start_rtb); - } frec.len_daddr = XFS_FSB_TO_BB(mp, rtbcount); return xfs_getfsmap_helper(tp, info, &frec); @@ -933,7 +933,10 @@ xfs_getfsmap( struct xfs_trans *tp = NULL; struct xfs_fsmap dkeys[2]; /* per-dev keys */ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS]; - struct xfs_getfsmap_info info = { NULL }; + struct xfs_getfsmap_info info = { + .fsmap_recs = fsmap_recs, + .head = head, + }; bool use_rmap; int i; int error = 0; @@ -998,9 +1001,6 @@ xfs_getfsmap( info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; - info.end_daddr = XFS_BUF_DADDR_NULL; - info.fsmap_recs = fsmap_recs; - info.head = head; /* For each device we support... */ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) { @@ -1013,17 +1013,23 @@ xfs_getfsmap( break; /* - * If this device number matches the high key, we have - * to pass the high key to the handler to limit the - * query results. If the device number exceeds the - * low key, zero out the low key so that we get - * everything from the beginning. + * If this device number matches the high key, we have to pass + * the high key to the handler to limit the query results, and + * set the end_daddr so that we can synthesize records at the + * end of the query range or device. */ if (handlers[i].dev == head->fmh_keys[1].fmr_device) { dkeys[1] = head->fmh_keys[1]; info.end_daddr = min(handlers[i].nr_sectors - 1, dkeys[1].fmr_physical); + } else { + info.end_daddr = handlers[i].nr_sectors - 1; } + + /* + * If the device number exceeds the low key, zero out the low + * key so that we get everything from the beginning. + */ if (handlers[i].dev > head->fmh_keys[0].fmr_device) memset(&dkeys[0], 0, sizeof(struct xfs_fsmap)); -- cgit v1.2.3 From 9b7280010366dbe32791acd498a37dc522f568db Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Mon, 2 Dec 2024 10:57:24 -0800 Subject: xfs: metapath scrubber should use the already loaded inodes Don't waste time in xchk_setup_metapath_dqinode doing a second lookup of the quota inodes, just grab them from the quotainfo structure. The whole point of this scrubber is to make sure that the dirents exist, so it's completely silly to do lookups. Cc: # v6.13-rc1 Fixes: 128a055291ebbc ("xfs: scrub quota file metapaths") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/metapath.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index b78db6513465..80467d6bc763 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -196,36 +196,45 @@ xchk_setup_metapath_dqinode( struct xfs_scrub *sc, xfs_dqtype_t type) { + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; struct xfs_trans *tp = NULL; struct xfs_inode *dp = NULL; struct xfs_inode *ip = NULL; - const char *path; int error; + if (!qi) + return -ENOENT; + + switch (type) { + case XFS_DQTYPE_USER: + ip = qi->qi_uquotaip; + break; + case XFS_DQTYPE_GROUP: + ip = qi->qi_gquotaip; + break; + case XFS_DQTYPE_PROJ: + ip = qi->qi_pquotaip; + break; + default: + ASSERT(0); + return -EINVAL; + } + if (!ip) + return -ENOENT; + error = xfs_trans_alloc_empty(sc->mp, &tp); if (error) return error; error = xfs_dqinode_load_parent(tp, &dp); - if (error) - goto out_cancel; - - error = xfs_dqinode_load(tp, dp, type, &ip); - if (error) - goto out_dp; - xfs_trans_cancel(tp); - tp = NULL; + if (error) + return error; - path = kasprintf(GFP_KERNEL, "%s", xfs_dqinode_path(type)); - error = xchk_setup_metapath_scan(sc, dp, path, ip); + error = xchk_setup_metapath_scan(sc, dp, + kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); - xfs_irele(ip); -out_dp: xfs_irele(dp); -out_cancel: - if (tp) - xfs_trans_cancel(tp); return error; } #else -- cgit v1.2.3 From e1d8602b6cfb757952827d11c7d26f2a1714fe82 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:25 -0800 Subject: xfs: keep quota directory inode loaded In the same vein as the previous patch, there's no point in the metapath scrub setup function doing a lookup on the quota metadir just so it can validate that lookups work correctly. Instead, retain the quota directory inode in memory for the lifetime of the mount so that we can check this meaningfully. Cc: # v6.13-rc1 Fixes: 128a055291ebbc ("xfs: scrub quota file metapaths") Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/metapath.c | 37 ++++++------------------------------- fs/xfs/xfs_qm.c | 47 +++++++++++++++++++++++++---------------------- fs/xfs/xfs_qm.h | 1 + 3 files changed, 32 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index 80467d6bc763..c678cba1ffc3 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -171,23 +171,13 @@ static int xchk_setup_metapath_quotadir( struct xfs_scrub *sc) { - struct xfs_trans *tp; - struct xfs_inode *dp = NULL; - int error; - - error = xfs_trans_alloc_empty(sc->mp, &tp); - if (error) - return error; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; - error = xfs_dqinode_load_parent(tp, &dp); - xfs_trans_cancel(tp); - if (error) - return error; + if (!qi || !qi->qi_dirip) + return -ENOENT; - error = xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, - kasprintf(GFP_KERNEL, "quota"), dp); - xfs_irele(dp); - return error; + return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, + kstrdup("quota", GFP_KERNEL), qi->qi_dirip); } /* Scan a quota inode under the /quota directory. */ @@ -197,10 +187,7 @@ xchk_setup_metapath_dqinode( xfs_dqtype_t type) { struct xfs_quotainfo *qi = sc->mp->m_quotainfo; - struct xfs_trans *tp = NULL; - struct xfs_inode *dp = NULL; struct xfs_inode *ip = NULL; - int error; if (!qi) return -ENOENT; @@ -222,20 +209,8 @@ xchk_setup_metapath_dqinode( if (!ip) return -ENOENT; - error = xfs_trans_alloc_empty(sc->mp, &tp); - if (error) - return error; - - error = xfs_dqinode_load_parent(tp, &dp); - xfs_trans_cancel(tp); - if (error) - return error; - - error = xchk_setup_metapath_scan(sc, dp, + return xchk_setup_metapath_scan(sc, qi->qi_dirip, kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); - - xfs_irele(dp); - return error; } #else # define xchk_setup_metapath_quotadir(...) 
(-ENOENT) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 61ee110b47d7..3c0189831f14 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -241,6 +241,10 @@ xfs_qm_destroy_quotainos( xfs_irele(qi->qi_pquotaip); qi->qi_pquotaip = NULL; } + if (qi->qi_dirip) { + xfs_irele(qi->qi_dirip); + qi->qi_dirip = NULL; + } } /* @@ -646,8 +650,7 @@ xfs_qm_init_timelimits( static int xfs_qm_load_metadir_qinos( struct xfs_mount *mp, - struct xfs_quotainfo *qi, - struct xfs_inode **dpp) + struct xfs_quotainfo *qi) { struct xfs_trans *tp; int error; @@ -656,7 +659,7 @@ xfs_qm_load_metadir_qinos( if (error) return error; - error = xfs_dqinode_load_parent(tp, dpp); + error = xfs_dqinode_load_parent(tp, &qi->qi_dirip); if (error == -ENOENT) { /* no quota dir directory, but we'll create one later */ error = 0; @@ -666,21 +669,21 @@ xfs_qm_load_metadir_qinos( goto out_trans; if (XFS_IS_UQUOTA_ON(mp)) { - error = xfs_dqinode_load(tp, *dpp, XFS_DQTYPE_USER, + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_USER, &qi->qi_uquotaip); if (error && error != -ENOENT) goto out_trans; } if (XFS_IS_GQUOTA_ON(mp)) { - error = xfs_dqinode_load(tp, *dpp, XFS_DQTYPE_GROUP, + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_GROUP, &qi->qi_gquotaip); if (error && error != -ENOENT) goto out_trans; } if (XFS_IS_PQUOTA_ON(mp)) { - error = xfs_dqinode_load(tp, *dpp, XFS_DQTYPE_PROJ, + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_PROJ, &qi->qi_pquotaip); if (error && error != -ENOENT) goto out_trans; @@ -696,34 +699,33 @@ out_trans: STATIC int xfs_qm_create_metadir_qinos( struct xfs_mount *mp, - struct xfs_quotainfo *qi, - struct xfs_inode **dpp) + struct xfs_quotainfo *qi) { int error; - if (!*dpp) { - error = xfs_dqinode_mkdir_parent(mp, dpp); + if (!qi->qi_dirip) { + error = xfs_dqinode_mkdir_parent(mp, &qi->qi_dirip); if (error && error != -EEXIST) return error; } if (XFS_IS_UQUOTA_ON(mp) && !qi->qi_uquotaip) { - error = xfs_dqinode_metadir_create(*dpp, XFS_DQTYPE_USER, - &qi->qi_uquotaip); + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_USER, &qi->qi_uquotaip); if (error) return error; } if (XFS_IS_GQUOTA_ON(mp) && !qi->qi_gquotaip) { - error = xfs_dqinode_metadir_create(*dpp, XFS_DQTYPE_GROUP, - &qi->qi_gquotaip); + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_GROUP, &qi->qi_gquotaip); if (error) return error; } if (XFS_IS_PQUOTA_ON(mp) && !qi->qi_pquotaip) { - error = xfs_dqinode_metadir_create(*dpp, XFS_DQTYPE_PROJ, - &qi->qi_pquotaip); + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_PROJ, &qi->qi_pquotaip); if (error) return error; } @@ -768,7 +770,6 @@ xfs_qm_init_metadir_qinos( struct xfs_mount *mp) { struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_inode *dp = NULL; int error; if (!xfs_has_quota(mp)) { @@ -777,20 +778,22 @@ xfs_qm_init_metadir_qinos( return error; } - error = xfs_qm_load_metadir_qinos(mp, qi, &dp); + error = xfs_qm_load_metadir_qinos(mp, qi); if (error) goto out_err; - error = xfs_qm_create_metadir_qinos(mp, qi, &dp); + error = xfs_qm_create_metadir_qinos(mp, qi); if (error) goto out_err; - xfs_irele(dp); + /* The only user of the quota dir inode is online fsck */ +#if !IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) + xfs_irele(qi->qi_dirip); + qi->qi_dirip = NULL; +#endif return 0; out_err: xfs_qm_destroy_quotainos(mp->m_quotainfo); - if (dp) - xfs_irele(dp); return error; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index e919c7f62f57..35b64bc3a7a8 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -55,6 +55,7 @@ struct 
xfs_quotainfo { struct xfs_inode *qi_uquotaip; /* user quota inode */ struct xfs_inode *qi_gquotaip; /* group quota inode */ struct xfs_inode *qi_pquotaip; /* project quota inode */ + struct xfs_inode *qi_dirip; /* quota metadir */ struct list_lru qi_lru; int qi_dquots; struct mutex qi_quotaofflock;/* to serialize quotaoff */ -- cgit v1.2.3 From bd27c7bcdca25ce8067ebb94ded6ac1bd7b47317 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:26 -0800 Subject: xfs: return a 64-bit block count from xfs_btree_count_blocks With the nrext64 feature enabled, it's possible for a data fork to have 2^48 extent mappings. Even with a 64k fsblock size, that maps out to a bmbt containing more than 2^32 blocks. Therefore, this predicate must return a u64 count to avoid an integer wraparound that will cause scrub to do the wrong thing. It's unlikely that any such filesystem currently exists, because the incore bmbt would consume more than 64GB of kernel memory on its own, and so far nobody except me has driven a filesystem that far, judging from the lack of complaints. Cc: # v5.19 Fixes: df9ad5cc7a5240 ("xfs: Introduce macros to represent new maximum extent counts for data/attr forks") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 4 ++-- fs/xfs/libxfs/xfs_btree.h | 2 +- fs/xfs/libxfs/xfs_ialloc_btree.c | 4 +++- fs/xfs/scrub/agheader.c | 6 +++--- fs/xfs/scrub/agheader_repair.c | 6 +++--- fs/xfs/scrub/fscounters.c | 2 +- fs/xfs/scrub/ialloc.c | 4 ++-- fs/xfs/scrub/refcount.c | 2 +- fs/xfs/xfs_bmap_util.c | 2 +- 9 files changed, 17 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 2b5fc5fd1643..c748866ef923 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -5144,7 +5144,7 @@ xfs_btree_count_blocks_helper( int level, void *data) { - xfs_extlen_t *blocks = data; + xfs_filblks_t *blocks = data; (*blocks)++; return 0; @@ -5154,7 +5154,7 @@ xfs_btree_count_blocks_helper( int xfs_btree_count_blocks( struct xfs_btree_cur *cur, - xfs_extlen_t *blocks) + xfs_filblks_t *blocks) { *blocks = 0; return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 3b739459ebb0..c5bff273cae2 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -484,7 +484,7 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data); -int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); +int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_filblks_t *blocks); union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9b34896dd1a3..6f270d8f4270 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -744,6 +744,7 @@ xfs_finobt_count_blocks( { struct xfs_buf *agbp = NULL; struct xfs_btree_cur *cur; + xfs_filblks_t blocks; int error; error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); @@ -751,9 +752,10 @@ xfs_finobt_count_blocks( return error; cur = xfs_finobt_init_cursor(pag, tp, agbp); - error = xfs_btree_count_blocks(cur, tree_blocks); + error = xfs_btree_count_blocks(cur, &blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); + *tree_blocks = 
blocks; return error; } diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 61f80a6410c7..1d41b85478da 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -458,7 +458,7 @@ xchk_agf_xref_btreeblks( { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_mount *mp = sc->mp; - xfs_agblock_t blocks; + xfs_filblks_t blocks; xfs_agblock_t btreeblks; int error; @@ -507,7 +507,7 @@ xchk_agf_xref_refcblks( struct xfs_scrub *sc) { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; if (!sc->sa.refc_cur) @@ -840,7 +840,7 @@ xchk_agi_xref_fiblocks( struct xfs_scrub *sc) { struct xfs_agi *agi = sc->sa.agi_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error = 0; if (!xfs_has_inobtcounts(sc->mp)) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 0fad0baaba2f..b45d2b32051a 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -256,7 +256,7 @@ xrep_agf_calc_from_btrees( struct xfs_agf *agf = agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t btreeblks; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; /* Update the AGF counters from the bnobt. */ @@ -946,7 +946,7 @@ xrep_agi_calc_from_btrees( if (error) goto err; if (xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; error = xfs_btree_count_blocks(cur, &blocks); if (error) @@ -959,7 +959,7 @@ xrep_agi_calc_from_btrees( agi->agi_freecount = cpu_to_be32(freecount); if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_btree_count_blocks(cur, &blocks); diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 4a50f8e00040..ca23cf4db6c5 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -261,7 +261,7 @@ xchk_fscount_btreeblks( struct xchk_fscounters *fsc, xfs_agnumber_t agno) { - xfs_extlen_t blocks; + xfs_filblks_t blocks; int error; error = xchk_ag_init_existing(sc, agno, &sc->sa); diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index abad54c3621d..4dc7c83dc08a 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -650,8 +650,8 @@ xchk_iallocbt_xref_rmap_btreeblks( struct xfs_scrub *sc) { xfs_filblks_t blocks; - xfs_extlen_t inobt_blocks = 0; - xfs_extlen_t finobt_blocks = 0; + xfs_filblks_t inobt_blocks = 0; + xfs_filblks_t finobt_blocks = 0; int error; if (!sc->sa.ino_cur || !sc->sa.rmap_cur || diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 2b6be75e9424..1c5e45cc6419 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -491,7 +491,7 @@ xchk_refcount_xref_rmap( struct xfs_scrub *sc, xfs_filblks_t cow_blocks) { - xfs_extlen_t refcbt_blocks = 0; + xfs_filblks_t refcbt_blocks = 0; xfs_filblks_t blocks; int error; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a59bbe767a7d..0836fea2d6d8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -103,7 +103,7 @@ xfs_bmap_count_blocks( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; - xfs_extlen_t btblocks = 0; + xfs_filblks_t btblocks = 0; int error; *nextents = 0; -- cgit v1.2.3 From 7ce31f20a0771d71779c3b0ec9cdf474cc3c8e9a Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Mon, 2 Dec 2024 10:57:27 -0800 Subject: xfs: don't drop errno values when we fail to ficlone the entire range Way back when we first implemented FICLONE for XFS, life was simple -- either the the entire remapping completed, or something happened and we had to return an errno explaining what happened. Neither of those ioctls support returning partial results, so it's all or nothing. Then things got complicated when copy_file_range came along, because it actually can return the number of bytes copied, so commit 3f68c1f562f1e4 tried to make it so that we could return a partial result if the REMAP_FILE_CAN_SHORTEN flag is set. This is also how FIDEDUPERANGE can indicate that the kernel performed a partial deduplication. Unfortunately, the logic is wrong if an error stops the remapping and CAN_SHORTEN is not set. Because those callers cannot return partial results, it is an error for ->remap_file_range to return a positive quantity that is less than the @len passed in. Implementations really should be returning a negative errno in this case, because that's what btrfs (which introduced FICLONE{,RANGE}) did. Therefore, ->remap_range implementations cannot silently drop an errno that they might have when the number of bytes remapped is less than the number of bytes requested and CAN_SHORTEN is not set. Found by running generic/562 on a 64k fsblock filesystem and wondering why it reported corrupt files. Cc: # v4.20 Fixes: 3fc9f5e409319e ("xfs: remove xfs_reflink_remap_range") Really-Fixes: 3f68c1f562f1e4 ("xfs: support returning partial reflink results") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_file.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4a0b7de4f7ae..9a435b1ff264 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1242,6 +1242,14 @@ out_unlock: xfs_iunlock2_remapping(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + /* + * If the caller did not set CAN_SHORTEN, then it is not prepared to + * handle partial results -- either the whole remap succeeds, or we + * must say why it did not. In this case, any error should be returned + * to the caller. + */ + if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return ret; return remapped > 0 ? remapped : ret; } -- cgit v1.2.3 From aa7bfb537edf62085d7718845f6644b0e4efb9df Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:27 -0800 Subject: xfs: separate healthy clearing mask during repair In commit d9041681dd2f53 we introduced some XFS_SICK_*ZAPPED flags so that the inode record repair code could clean up a damaged inode record enough to iget the inode but still be able to remember that the higher level repair code needs to be called. As part of that, we introduced a xchk_mark_healthy_if_clean helper that is supposed to cause the ZAPPED state to be removed if that higher level metadata actually checks out. This was done by setting additional bits in sick_mask hoping that xchk_update_health will clear all those bits after a healthy scrub. Unfortunately, that's not quite what sick_mask means -- bits in that mask are indeed cleared if the metadata is healthy, but they're set if the metadata is NOT healthy. fsck is only intended to set the ZAPPED bits explicitly. If something else sets the CORRUPT/XCORRUPT state after the xchk_mark_healthy_if_clean call, we end up marking the metadata zapped. This can happen if the following sequence happens: 1. 
Scrub runs, discovers that the metadata is fine but could be optimized and calls xchk_mark_healthy_if_clean on a ZAPPED flag. That causes the ZAPPED flag to be set in sick_mask because the metadata is not CORRUPT or XCORRUPT. 2. Repair runs to optimize the metadata. 3. Some other metadata used for cross-referencing in (1) becomes corrupt. 4. Post-repair scrub runs, but this time it sets CORRUPT or XCORRUPT due to the events in (3). 5. Now the xchk_health_update sets the ZAPPED flag on the metadata we just repaired. This is not the correct state. Fix this by moving the "if healthy" mask to a separate field, and only ever using it to clear the sick state. Cc: # v6.8 Fixes: d9041681dd2f53 ("xfs: set inode sick state flags when we zap either ondisk fork") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/health.c | 57 +++++++++++++++++++++++++++++---------------------- fs/xfs/scrub/scrub.h | 6 ++++++ 2 files changed, 39 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index ce86bdad37fa..ccc6ca5934ca 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -71,7 +71,8 @@ /* Map our scrub type to a sick mask and a set of health update functions. */ enum xchk_health_group { - XHG_FS = 1, + XHG_NONE = 1, + XHG_FS, XHG_AG, XHG_INO, XHG_RTGROUP, @@ -83,6 +84,7 @@ struct xchk_health_map { }; static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_PROBE] = { XHG_NONE, 0 }, [XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB }, [XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF }, [XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL }, @@ -133,7 +135,7 @@ xchk_mark_healthy_if_clean( { if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_XCORRUPT))) - sc->sick_mask |= mask; + sc->healthy_mask |= mask; } /* @@ -189,6 +191,7 @@ xchk_update_health( { struct xfs_perag *pag; struct xfs_rtgroup *rtg; + unsigned int mask = sc->sick_mask; bool bad; /* @@ -203,50 +206,56 @@ xchk_update_health( return; } - if (!sc->sick_mask) - return; - bad = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_XCORRUPT)); + if (!bad) + mask |= sc->healthy_mask; switch (type_to_health_flag[sc->sm->sm_type].group) { + case XHG_NONE: + break; case XHG_AG: + if (!mask) + return; pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_group_mark_corrupt(pag_group(pag), sc->sick_mask); + xfs_group_mark_corrupt(pag_group(pag), mask); else - xfs_group_mark_healthy(pag_group(pag), sc->sick_mask); + xfs_group_mark_healthy(pag_group(pag), mask); xfs_perag_put(pag); break; case XHG_INO: if (!sc->ip) return; - if (bad) { - unsigned int mask = sc->sick_mask; - - /* - * If we're coming in for repairs then we don't want - * sickness flags to propagate to the incore health - * status if the inode gets inactivated before we can - * fix it. - */ - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) - mask |= XFS_SICK_INO_FORGET; + /* + * If we're coming in for repairs then we don't want sickness + * flags to propagate to the incore health status if the inode + * gets inactivated before we can fix it. 
+ */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mask |= XFS_SICK_INO_FORGET; + if (!mask) + return; + if (bad) xfs_inode_mark_corrupt(sc->ip, mask); - } else - xfs_inode_mark_healthy(sc->ip, sc->sick_mask); + else + xfs_inode_mark_healthy(sc->ip, mask); break; case XHG_FS: + if (!mask) + return; if (bad) - xfs_fs_mark_corrupt(sc->mp, sc->sick_mask); + xfs_fs_mark_corrupt(sc->mp, mask); else - xfs_fs_mark_healthy(sc->mp, sc->sick_mask); + xfs_fs_mark_healthy(sc->mp, mask); break; case XHG_RTGROUP: + if (!mask) + return; rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_group_mark_corrupt(rtg_group(rtg), sc->sick_mask); + xfs_group_mark_corrupt(rtg_group(rtg), mask); else - xfs_group_mark_healthy(rtg_group(rtg), sc->sick_mask); + xfs_group_mark_healthy(rtg_group(rtg), mask); xfs_rtgroup_put(rtg); break; default: diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index a7fda3e2b013..5dbbe93cb49b 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -184,6 +184,12 @@ struct xfs_scrub { */ unsigned int sick_mask; + /* + * Clear these XFS_SICK_* flags but only if the scan is ok. Useful for + * removing ZAPPED flags after a repair. + */ + unsigned int healthy_mask; + /* next time we want to cond_resched() */ struct xchk_relax relax; -- cgit v1.2.3 From 6f4669708a69fd21f0299c2d5c4780a6ce358ab5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:28 -0800 Subject: xfs: set XFS_SICK_INO_SYMLINK_ZAPPED explicitly when zapping a symlink If we need to reset a symlink target to the "durr it's busted" string, then we clear the zapped flag as well. However, this should be using the provided helper so that we don't set the zapped state on an otherwise ok symlink. Cc: # v6.10 Fixes: 2651923d8d8db0 ("xfs: online repair of symbolic links") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/symlink_repair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index d015a86ef460..953ce7be78dc 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -36,6 +36,7 @@ #include "scrub/tempfile.h" #include "scrub/tempexch.h" #include "scrub/reap.h" +#include "scrub/health.h" /* * Symbolic Link Repair @@ -233,7 +234,7 @@ xrep_symlink_salvage( * target zapped flag. */ if (buflen == 0) { - sc->sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); sprintf(target_buf, DUMMY_TARGET); } -- cgit v1.2.3 From dc5a0527398d42e4d3e47abe8a43960fca0314ed Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:29 -0800 Subject: xfs: mark metadir repair tempfiles with IRECOVERY Once in a long while, xfs/566 and xfs/801 report directory corruption in one of the metadata subdirectories while it's forcibly rebuilding all filesystem metadata. I observed the following sequence of events: 1. Initiate a repair of the parent pointers for the /quota/user file. This is the secret file containing user quota data. 2. The pptr repair thread creates a temporary file and begins staging parent pointers in the ondisk metadata in preparation for an exchange-range to commit the new pptr data. 3. At the same time, initiate a repair of the /quota directory itself. 4. The dir repair thread finds the temporary file from (2), scans it for parent pointers, and stages a dirent in its own temporary dir in preparation to commit the fixed directory. 5. 
The parent pointer repair completes and frees the temporary file. 6. The dir repair commits the new directory and scans it again. It finds the dirent that points to the old temporary file in (2) and marks the directory corrupt. Oops! Repair code must never scan the temporary files that other repair functions create to stage new metadata. They're not supposed to do that, but the predicate function xrep_is_tempfile is incorrect because it assumes that any XFS_DIFLAG2_METADATA file cannot ever be a temporary file, but xrep_tempfile_adjust_directory_tree creates exactly that. Fix this by setting the IRECOVERY flag on temporary metadata directory inodes and using that to correct the predicate. Repair code is supposed to erase all the data in temporary files before releasing them, so it's ok if a thread scans the temporary file after we drop IRECOVERY. Cc: # v6.13-rc1 Fixes: bb6cdd5529ff67 ("xfs: hide metadata inodes from everyone because they are special") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/tempfile.c | 10 ++++++++-- fs/xfs/xfs_inode.h | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index 4b7f7860e37e..dc3802c7f678 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -223,6 +223,7 @@ xrep_tempfile_adjust_directory_tree( if (error) goto out_ilock; + xfs_iflags_set(sc->tempip, XFS_IRECOVERY); xfs_qm_dqdetach(sc->tempip); out_ilock: xrep_tempfile_iunlock(sc); @@ -246,6 +247,8 @@ xrep_tempfile_remove_metadir( ASSERT(sc->tp == NULL); + xfs_iflags_clear(sc->tempip, XFS_IRECOVERY); + xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL); sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; @@ -945,10 +948,13 @@ xrep_is_tempfile( /* * Files in the metadata directory tree also have S_PRIVATE set and - * IOP_XATTR unset, so we must distinguish them separately. + * IOP_XATTR unset, so we must distinguish them separately. We (ab)use + * the IRECOVERY flag to mark temporary metadir inodes knowing that the + * end of log recovery clears IRECOVERY, so the only ones that can + * exist during online repair are the ones we create. */ if (xfs_has_metadir(mp) && (ip->i_diflags2 & XFS_DIFLAG2_METADATA)) - return false; + return __xfs_iflags_test(ip, XFS_IRECOVERY); if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR)) return true; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b0de3d924d4c..1648dc5a8068 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -231,7 +231,7 @@ xfs_iflags_clear(xfs_inode_t *ip, unsigned long flags) } static inline int -__xfs_iflags_test(xfs_inode_t *ip, unsigned long flags) +__xfs_iflags_test(const struct xfs_inode *ip, unsigned long flags) { return (ip->i_flags & flags); } -- cgit v1.2.3 From af9f02457f461b23307fe826a37be61ba6e32c92 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:30 -0800 Subject: xfs: fix null bno_hint handling in xfs_rtallocate_rtg xfs_bmap_rtalloc initializes the bno_hint variable to NULLRTBLOCK (aka NULLFSBLOCK). If the allocation request is for a file range that's adjacent to an existing mapping, it will then change bno_hint to the blkno hint in the bmalloca structure. In other words, bno_hint is either a rt block number, or it's all 1s. Unfortunately, commit ec12f97f1b8a8f didn't take the NULLRTBLOCK state into account, which means that it tries to translate that into a realtime extent number. 
We then end up with an obnoxiously high rtx number and pointlessly feed that to the near allocator. This often fails and falls back to the by-size allocator. Seeing as we had no locality hint anyway, this is a waste of time. Fix the code to detect a lack of bno_hint correctly. This was detected by running xfs/009 with metadir enabled and a 28k rt extent size. Cc: # v6.12 Fixes: ec12f97f1b8a8f ("xfs: make the rtalloc start hint a xfs_rtblock_t") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_rtalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 0cb534d71119..fcfa6e0eb3ad 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1827,7 +1827,7 @@ xfs_rtallocate_rtg( * For an allocation to an empty file at offset 0, pick an extent that * will space things out in the rt area. */ - if (bno_hint) + if (bno_hint != NULLFSBLOCK) start = xfs_rtb_to_rtx(args.mp, bno_hint); else if (!xfs_has_rtgroups(args.mp) && initial_user_data) start = xfs_rtpick_extent(args.rtg, tp, maxlen); -- cgit v1.2.3 From 23bee6f390a12d0c4c51fefc083704bc5dac377e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:30 -0800 Subject: xfs: fix error bailout in xfs_rtginode_create smatch reported that we screwed up the error cleanup in this function. Fix it. Cc: # v6.13-rc1 Fixes: ae897e0bed0f54 ("xfs: support creating per-RTG files in growfs") Reported-by: Dan Carpenter Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index e74bb059f24f..4f3bfc884aff 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -496,7 +496,7 @@ xfs_rtginode_create( error = xfs_metadir_create(&upd, S_IFREG); if (error) - return error; + goto out_cancel; xfs_rtginode_lockdep_setup(upd.ip, rtg_rgno(rtg), type); -- cgit v1.2.3 From 6d7b4bc1c3e00b1a25b7a05141a64337b4629337 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:31 -0800 Subject: xfs: update btree keys correctly when _insrec splits an inode root block In commit 2c813ad66a72, I partially fixed a bug wherein xfs_btree_insrec would erroneously try to update the parent's key for a block that had been split if we decided to insert the new record into the new block. The solution was to detect this situation and update the in-core key value that we pass up to the caller so that the caller will (eventually) add the new block to the parent level of the tree with the correct key. However, I missed a subtlety about the way inode-rooted btrees work. If the full block was a maximally sized inode root block, we'll solve that fullness by moving the root block's records to a new block, resizing the root block, and updating the root to point to the new block. We don't pass a pointer to the new block to the caller because that work has already been done. The new record will /always/ land in the new block, so in this case we need to use xfs_btree_update_keys to update the keys. This bug can theoretically manifest itself in the very rare case that we split a bmbt root block and the new record lands in the very first slot of the new block, though I've never managed to trigger it in practice. However, it is very easy to reproduce by running generic/522 with the realtime rmapbt patchset if rtinherit=1. 
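For readers who want the gist before the long in-code comment below, the fixed decision logic condenses to roughly the following (pseudo-C distilled from the patch, not the verbatim kernel code; the "new block was handed back" check stands in for the real !xfs_btree_ptr_is_null(cur, &nptr) test):

	if (a_new_block_was_handed_back && bp && xfs_buf_daddr(bp) != old_bn) {
		/* a split returned a new block and the record landed there:
		 * pass the new block's key back so the caller can link it in
		 * at the next level up with the correct key */
		xfs_btree_get_keys(cur, block, lkey);
	} else if (xfs_btree_needs_key_update(cur, optr)) {
		/* ordinary insert, or an inode root whose contents moved to a
		 * block that is already linked in: walk towards the root and
		 * update keys, including the high key for overlapping btrees */
		error = xfs_btree_update_keys(cur, level);
	}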
Cc: # v4.8 Fixes: 2c813ad66a7218 ("xfs: support btrees with overlapping intervals for keys") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c748866ef923..68ee1c299c25 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -3557,14 +3557,31 @@ xfs_btree_insrec( xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); /* - * If we just inserted into a new tree block, we have to - * recalculate nkey here because nkey is out of date. + * Update btree keys to reflect the newly added record or keyptr. + * There are three cases here to be aware of. Normally, all we have to + * do is walk towards the root, updating keys as necessary. * - * Otherwise we're just updating an existing block (having shoved - * some records into the new tree block), so use the regular key - * update mechanism. + * If the caller had us target a full block for the insertion, we dealt + * with that by calling the _make_block_unfull function. If the + * "make unfull" function splits the block, it'll hand us back the key + * and pointer of the new block. We haven't yet added the new block to + * the next level up, so if we decide to add the new record to the new + * block (bp->b_bn != old_bn), we have to update the caller's pointer + * so that the caller adds the new block with the correct key. + * + * However, there is a third possibility-- if the selected block is the + * root block of an inode-rooted btree and cannot be expanded further, + * the "make unfull" function moves the root block contents to a new + * block and updates the root block to point to the new block. In this + * case, no block pointer is passed back because the block has already + * been added to the btree. In this case, we need to use the regular + * key update function, just like the first case. This is critical for + * overlapping btrees, because the high key must be updated to reflect + * the entire tree, not just the subtree accessible through the first + * child of the root (which is now two levels down from the root). */ - if (bp && xfs_buf_daddr(bp) != old_bn) { + if (!xfs_btree_ptr_is_null(cur, &nptr) && + bp && xfs_buf_daddr(bp) != old_bn) { xfs_btree_get_keys(cur, block, lkey); } else if (xfs_btree_needs_key_update(cur, optr)) { error = xfs_btree_update_keys(cur, level); -- cgit v1.2.3 From ffc3ea4f3c1cc83a86b7497b0c4b0aee7de5480d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:32 -0800 Subject: xfs: fix scrub tracepoints when inode-rooted btrees are involved Fix a minor mistake in the scrub tracepoints that can manifest when inode-rooted btrees are enabled. The existing code worked fine for bmap btrees, but we should tighten the code up to be less sloppy. Cc: # v5.7 Fixes: 92219c292af8dd ("xfs: convert btree cursor inode-private member names") Signed-off-by: "Darrick J.
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 9b38f5ad1eaf..d2ae7e93acb0 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -605,7 +605,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, TP_fast_assign( xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; - __entry->ino = sc->ip->i_ino; + __entry->ino = cur->bc_ino.ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; __assign_str(name); -- cgit v1.2.3 From 53b001a21c9dff73b64e8c909c41991f01d5d00f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:33 -0800 Subject: xfs: unlock inodes when erroring out of xfs_trans_alloc_dir Debugging a filesystem patch with generic/475 caused the system to hang after observing the following sequences in dmesg: XFS (dm-0): metadata I/O error in "xfs_imap_to_bp+0x61/0xe0 [xfs]" at daddr 0x491520 len 32 error 5 XFS (dm-0): metadata I/O error in "xfs_btree_read_buf_block+0xba/0x160 [xfs]" at daddr 0x3445608 len 8 error 5 XFS (dm-0): metadata I/O error in "xfs_imap_to_bp+0x61/0xe0 [xfs]" at daddr 0x138e1c0 len 32 error 5 XFS (dm-0): log I/O error -5 XFS (dm-0): Metadata I/O Error (0x1) detected at xfs_trans_read_buf_map+0x1ea/0x4b0 [xfs] (fs/xfs/xfs_trans_buf.c:311). Shutting down filesystem. XFS (dm-0): Please unmount the filesystem and rectify the problem(s) XFS (dm-0): Internal error dqp->q_ino.reserved < dqp->q_ino.count at line 869 of file fs/xfs/xfs_trans_dquot.c. Caller xfs_trans_dqresv+0x236/0x440 [xfs] XFS (dm-0): Corruption detected. Unmount and run xfs_repair XFS (dm-0): Unmounting Filesystem be6bcbcc-9921-4deb-8d16-7cc94e335fa7 The system is stuck in unmount trying to lock a couple of inodes so that they can be purged. The dquot corruption notice above is a clue to what happened -- a link() call tried to set up a transaction to link a child into a directory. Quota reservation for the transaction failed after IO errors shut down the filesystem, but then we forgot to unlock the inodes on our way out. Fix that. Cc: # v6.10 Fixes: bd5562111d5839 ("xfs: Hold inode locks in xfs_trans_alloc_dir") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 30fbed27cf05..05b18e30368e 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1435,5 +1435,8 @@ done: out_cancel: xfs_trans_cancel(tp); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + if (dp != ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } -- cgit v1.2.3 From 44d9b07e52db25035680713c3428016cadcd2ea1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:33 -0800 Subject: xfs: only run precommits once per transaction object Committing a transaction tx0 with a defer ops chain of (A, B, C) creates a chain of transactions that looks like this: tx0 -> txA -> txB -> txC Prior to commit cb042117488dbf, __xfs_trans_commit would run precommits on tx0, then call xfs_defer_finish_noroll to convert A-C to tx[A-C]. Unfortunately, after the finish_noroll loop we forgot to run precommits on txC. That was fixed by adding the second precommit call. Unfortunately, none of us remembered that xfs_defer_finish_noroll calls __xfs_trans_commit a second time to commit tx0 before finishing work A in txA and committing that. 
In other words, we run precommits twice on tx0: xfs_trans_commit(tx0) __xfs_trans_commit(tx0, false) xfs_trans_run_precommits(tx0) xfs_defer_finish_noroll(tx0) xfs_trans_roll(tx0) txA = xfs_trans_dup(tx0) __xfs_trans_commit(tx0, true) xfs_trans_run_precommits(tx0) This currently isn't an issue because the inode item precommit is idempotent; the iunlink item precommit deletes itself so it can't be called again; and the buffer/dquot item precommits only check the incore objects for corruption. However, it doesn't make sense to run precommits twice. Fix this situation by only running precommits after finish_noroll. Cc: # v6.4 Fixes: cb042117488dbf ("xfs: defered work could create precommits") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 05b18e30368e..4a517250efc9 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -860,13 +860,6 @@ __xfs_trans_commit( trace_xfs_trans_commit(tp, _RET_IP_); - error = xfs_trans_run_precommits(tp); - if (error) { - if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) - xfs_defer_cancel(tp); - goto out_unreserve; - } - /* * Finish deferred items on final commit. Only permanent transactions * should ever have deferred ops. @@ -877,13 +870,12 @@ __xfs_trans_commit( error = xfs_defer_finish_noroll(&tp); if (error) goto out_unreserve; - - /* Run precommits from final tx in defer chain. */ - error = xfs_trans_run_precommits(tp); - if (error) - goto out_unreserve; } + error = xfs_trans_run_precommits(tp); + if (error) + goto out_unreserve; + /* * If there is nothing to be logged by the transaction, * then unlock all of the items associated with the -- cgit v1.2.3 From a004afdc62946d3261f724c6472997085c4f0735 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:34 -0800 Subject: xfs: avoid nested calls to __xfs_trans_commit Currently, __xfs_trans_commit calls xfs_defer_finish_noroll, which calls __xfs_trans_commit again on the same transaction. In other words, there's a nested function call (albeit with slightly different arguments) that has caused minor amounts of confusion in the past. There's no reason to keep this around, since there's only one place where we actually want the xfs_defer_finish_noroll, and that is in the top level xfs_trans_commit call. This also reduces stack usage a little bit. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 4a517250efc9..26bb2343082a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -860,18 +860,6 @@ __xfs_trans_commit( trace_xfs_trans_commit(tp, _RET_IP_); - /* - * Finish deferred items on final commit. Only permanent transactions - * should ever have deferred ops. - */ - WARN_ON_ONCE(!list_empty(&tp->t_dfops) && - !(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); - if (!regrant && (tp->t_flags & XFS_TRANS_PERM_LOG_RES)) { - error = xfs_defer_finish_noroll(&tp); - if (error) - goto out_unreserve; - } - error = xfs_trans_run_precommits(tp); if (error) goto out_unreserve; @@ -950,6 +938,20 @@ int xfs_trans_commit( struct xfs_trans *tp) { + /* + * Finish deferred items on final commit. Only permanent transactions + * should ever have deferred ops. 
+ */ + WARN_ON_ONCE(!list_empty(&tp->t_dfops) && + !(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); + if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) { + int error = xfs_defer_finish_noroll(&tp); + if (error) { + xfs_trans_cancel(tp); + return error; + } + } + return __xfs_trans_commit(tp, false); } -- cgit v1.2.3 From 3762113b597fa600d4e03300eec048256c546b1c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:35 -0800 Subject: xfs: don't lose solo superblock counter update transactions Superblock counter updates are tracked via per-transaction counters in the xfs_trans object. These changes are then turned into dirty log items in xfs_trans_apply_sb_deltas just prior to committing the log items to the CIL. However, updating the per-transaction counter deltas does not cause XFS_TRANS_DIRTY to be set on the transaction. In other words, a pure sb counter update will be silently discarded if there are no other dirty log items attached to the transaction. This is currently not the case anywhere in the filesystem because sb counter updates always dirty at least one other metadata item, but let's not leave a logic bomb. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 26bb2343082a..427a8ba0ab99 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -860,6 +860,13 @@ __xfs_trans_commit( trace_xfs_trans_commit(tp, _RET_IP_); + /* + * Commit per-transaction changes that are not already tracked through + * log items. This can add dirty log items to the transaction. + */ + if (tp->t_flags & XFS_TRANS_SB_DIRTY) + xfs_trans_apply_sb_deltas(tp); + error = xfs_trans_run_precommits(tp); if (error) goto out_unreserve; @@ -890,8 +897,6 @@ __xfs_trans_commit( /* * If we need to update the superblock, then do it now. */ - if (tp->t_flags & XFS_TRANS_SB_DIRTY) - xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); xlog_cil_commit(log, tp, &commit_seq, regrant); -- cgit v1.2.3 From 07137e925fa951646325762bda6bd2503dfe64c6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:36 -0800 Subject: xfs: don't lose solo dquot update transactions Quota counter updates are tracked via incore objects which hang off the xfs_trans object. These changes are then turned into dirty log items in xfs_trans_apply_dquot_deltas just prior to committing the log items to the CIL. However, updating the incore deltas does not cause XFS_TRANS_DIRTY to be set on the transaction. In other words, a pure quota counter update will be silently discarded if there are no other dirty log items attached to the transaction. This is currently not the case anywhere in the filesystem because quota updates always dirty at least one other metadata item, but a subsequent bug fix will add dquot log item precommits, so we actually need a dirty dquot log item prior to xfs_trans_run_precommits. Also let's not leave a logic bomb. Cc: # v2.6.35 Fixes: 0924378a689ccb ("xfs: split out iclog writing from xfs_trans_commit()") Signed-off-by: "Darrick J.
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_quota.h | 5 +++-- fs/xfs/xfs_trans.c | 10 +++------- fs/xfs/xfs_trans_dquot.c | 31 ++++++++++++++++++++++++++----- 3 files changed, 32 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index fa1317cc396c..d7565462af3d 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -101,7 +101,8 @@ extern void xfs_trans_free_dqinfo(struct xfs_trans *); extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, uint, int64_t); extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); -extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); +void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *tp, + bool already_locked); int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip, int64_t dblocks, int64_t rblocks, bool force); extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, @@ -173,7 +174,7 @@ static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp, { } #define xfs_trans_apply_dquot_deltas(tp) -#define xfs_trans_unreserve_and_mod_dquots(tp) +#define xfs_trans_unreserve_and_mod_dquots(tp, a) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip, int64_t dblocks, int64_t rblocks, bool force) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 427a8ba0ab99..4cd25717c9d1 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -866,6 +866,7 @@ __xfs_trans_commit( */ if (tp->t_flags & XFS_TRANS_SB_DIRTY) xfs_trans_apply_sb_deltas(tp); + xfs_trans_apply_dquot_deltas(tp); error = xfs_trans_run_precommits(tp); if (error) @@ -894,11 +895,6 @@ __xfs_trans_commit( ASSERT(tp->t_ticket != NULL); - /* - * If we need to update the superblock, then do it now. - */ - xfs_trans_apply_dquot_deltas(tp); - xlog_cil_commit(log, tp, &commit_seq, regrant); xfs_trans_free(tp); @@ -924,7 +920,7 @@ out_unreserve: * the dqinfo portion to be. All that means is that we have some * (non-persistent) quota reservations that need to be unreserved. */ - xfs_trans_unreserve_and_mod_dquots(tp); + xfs_trans_unreserve_and_mod_dquots(tp, true); if (tp->t_ticket) { if (regrant && !xlog_is_shutdown(log)) xfs_log_ticket_regrant(log, tp->t_ticket); @@ -1018,7 +1014,7 @@ xfs_trans_cancel( } #endif xfs_trans_unreserve_and_mod_sb(tp); - xfs_trans_unreserve_and_mod_dquots(tp); + xfs_trans_unreserve_and_mod_dquots(tp, false); if (tp->t_ticket) { xfs_log_ticket_ungrant(log, tp->t_ticket); diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 481ba3dc9f19..713b6d243e56 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -606,6 +606,24 @@ xfs_trans_apply_dquot_deltas( ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count); ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count); ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count); + + /* + * We've applied the count changes and given back + * whatever reservation we didn't use. Zero out the + * dqtrx fields. 
+ */ + qtrx->qt_blk_res = 0; + qtrx->qt_bcount_delta = 0; + qtrx->qt_delbcnt_delta = 0; + + qtrx->qt_rtblk_res = 0; + qtrx->qt_rtblk_res_used = 0; + qtrx->qt_rtbcount_delta = 0; + qtrx->qt_delrtb_delta = 0; + + qtrx->qt_ino_res = 0; + qtrx->qt_ino_res_used = 0; + qtrx->qt_icount_delta = 0; } } } @@ -642,7 +660,8 @@ xfs_trans_unreserve_and_mod_dquots_hook( */ void xfs_trans_unreserve_and_mod_dquots( - struct xfs_trans *tp) + struct xfs_trans *tp, + bool already_locked) { int i, j; struct xfs_dquot *dqp; @@ -671,10 +690,12 @@ xfs_trans_unreserve_and_mod_dquots( * about the number of blocks used field, or deltas. * Also we don't bother to zero the fields. */ - locked = false; + locked = already_locked; if (qtrx->qt_blk_res) { - xfs_dqlock(dqp); - locked = true; + if (!locked) { + xfs_dqlock(dqp); + locked = true; + } dqp->q_blk.reserved -= (xfs_qcnt_t)qtrx->qt_blk_res; } @@ -695,7 +716,7 @@ xfs_trans_unreserve_and_mod_dquots( dqp->q_rtb.reserved -= (xfs_qcnt_t)qtrx->qt_rtblk_res; } - if (locked) + if (locked && !already_locked) xfs_dqunlock(dqp); } -- cgit v1.2.3 From a40fe30868ba433ac08376e30132400bec067583 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:36 -0800 Subject: xfs: separate dquot buffer reads from xfs_dqflush The first step towards holding the dquot buffer in the li_buf instead of reading it in the AIL is to separate the part that reads the buffer from the actual flush code. There should be no functional changes. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 57 ++++++++++++++++++++++++++++++++----------------- fs/xfs/xfs_dquot.h | 4 +++- fs/xfs/xfs_dquot_item.c | 20 +++++++++++++---- fs/xfs/xfs_qm.c | 37 ++++++++++++++++++++++++++------ 4 files changed, 86 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index ff982d983989..6ec4087e38df 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1238,6 +1238,42 @@ xfs_qm_dqflush_check( return NULL; } +/* + * Get the buffer containing the on-disk dquot. + * + * Requires dquot flush lock, will clear the dirty flag, delete the quota log + * item from the AIL, and shut down the system if something goes wrong. + */ +int +xfs_dquot_read_buf( + struct xfs_trans *tp, + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_buf *bp = NULL; + int error; + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, + &bp, &xfs_dquot_buf_ops); + if (error == -EAGAIN) + return error; + if (xfs_metadata_is_sick(error)) + xfs_dquot_mark_sick(dqp); + if (error) + goto out_abort; + + *bpp = bp; + return 0; + +out_abort: + dqp->q_flags &= ~XFS_DQFLAG_DIRTY; + xfs_trans_ail_delete(&dqp->q_logitem.qli_item, 0); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; +} + /* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. 
@@ -1249,11 +1285,10 @@ xfs_qm_dqflush_check( int xfs_qm_dqflush( struct xfs_dquot *dqp, - struct xfs_buf **bpp) + struct xfs_buf *bp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_log_item *lip = &dqp->q_logitem.qli_item; - struct xfs_buf *bp; struct xfs_dqblk *dqblk; xfs_failaddr_t fa; int error; @@ -1263,28 +1298,12 @@ xfs_qm_dqflush( trace_xfs_dqflush(dqp); - *bpp = NULL; - xfs_qm_dqunpin_wait(dqp); - /* - * Get the buffer containing the on-disk dquot - */ - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, - &bp, &xfs_dquot_buf_ops); - if (error == -EAGAIN) - goto out_unlock; - if (xfs_metadata_is_sick(error)) - xfs_dquot_mark_sick(dqp); - if (error) - goto out_abort; - fa = xfs_qm_dqflush_check(dqp); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", dqp->q_id, fa); - xfs_buf_relse(bp); xfs_dquot_mark_sick(dqp); error = -EFSCORRUPTED; goto out_abort; @@ -1334,14 +1353,12 @@ xfs_qm_dqflush( } trace_xfs_dqflush_done(dqp); - *bpp = bp; return 0; out_abort: dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_delete(lip, 0); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -out_unlock: xfs_dqfunlock(dqp); return error; } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index d73d179df009..50f8404c4117 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -214,7 +214,9 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) void xfs_qm_dqdestroy(struct xfs_dquot *dqp); -int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); +int xfs_dquot_read_buf(struct xfs_trans *tp, struct xfs_dquot *dqp, + struct xfs_buf **bpp); +int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf *bp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); void xfs_qm_adjust_dqtimers(struct xfs_dquot *d); void xfs_qm_adjust_dqlimits(struct xfs_dquot *d); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 7d19091215b0..56ecc5ed0193 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -155,14 +155,26 @@ xfs_qm_dquot_logitem_push( spin_unlock(&lip->li_ailp->ail_lock); - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, &bp); + if (error) { + if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; + xfs_dqfunlock(dqp); + goto out_relock_ail; + } + + /* + * dqflush completes dqflock on error, and the delwri ioend does it on + * success. + */ + error = xfs_qm_dqflush(dqp, bp); if (!error) { if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; - xfs_buf_relse(bp); - } else if (error == -EAGAIN) - rval = XFS_ITEM_LOCKED; + } + xfs_buf_relse(bp); +out_relock_ail: spin_lock(&lip->li_ailp->ail_lock); out_unlock: xfs_dqunlock(dqp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 3c0189831f14..d9ac50a33c57 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -148,17 +148,28 @@ xfs_qm_dqpurge( * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, &bp); + if (error == -EAGAIN) { + xfs_dqfunlock(dqp); + dqp->q_flags &= ~XFS_DQFLAG_FREEING; + goto out_unlock; + } + if (error) + goto out_funlock; + + /* + * dqflush completes dqflock on error, and the bwrite ioend + * does it on success. 
+ */ + error = xfs_qm_dqflush(dqp, bp); if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); - } else if (error == -EAGAIN) { - dqp->q_flags &= ~XFS_DQFLAG_FREEING; - goto out_unlock; } xfs_dqflock(dqp); } +out_funlock: ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(xlog_is_shutdown(dqp->q_logitem.qli_item.li_log) || !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); @@ -494,7 +505,17 @@ xfs_qm_dquot_isolate( /* we have to drop the LRU lock to flush the dquot */ spin_unlock(&lru->lock); - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, &bp); + if (error) { + xfs_dqfunlock(dqp); + goto out_unlock_dirty; + } + + /* + * dqflush completes dqflock on error, and the delwri ioend + * does it on success. + */ + error = xfs_qm_dqflush(dqp, bp); if (error) goto out_unlock_dirty; @@ -1489,11 +1510,13 @@ xfs_qm_flush_one( goto out_unlock; } - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, &bp); if (error) goto out_unlock; - xfs_buf_delwri_queue(bp, buffer_list); + error = xfs_qm_dqflush(dqp, bp); + if (!error) + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); out_unlock: xfs_dqunlock(dqp); -- cgit v1.2.3 From ec88b41b932d5731291dcc0d0d63ea13ab8e07d5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:37 -0800 Subject: xfs: clean up log item accesses in xfs_qm_dqflush{,_done} Clean up these functions a little bit before we move on to the real modifications, and make the variable naming consistent for dquot log items. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 6ec4087e38df..1dc85de58e59 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1142,8 +1142,9 @@ static void xfs_qm_dqflush_done( struct xfs_log_item *lip) { - struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; - struct xfs_dquot *dqp = qip->qli_dquot; + struct xfs_dq_logitem *qlip = + container_of(lip, struct xfs_dq_logitem, qli_item); + struct xfs_dquot *dqp = qlip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; xfs_lsn_t tail_lsn; @@ -1156,12 +1157,12 @@ xfs_qm_dqflush_done( * holding the lock before removing the dquot from the AIL. */ if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) && - ((lip->li_lsn == qip->qli_flush_lsn) || + (lip->li_lsn == qlip->qli_flush_lsn || test_bit(XFS_LI_FAILED, &lip->li_flags))) { spin_lock(&ailp->ail_lock); xfs_clear_li_failed(lip); - if (lip->li_lsn == qip->qli_flush_lsn) { + if (lip->li_lsn == qlip->qli_flush_lsn) { /* xfs_ail_update_finish() drops the AIL lock */ tail_lsn = xfs_ail_delete_one(ailp, lip); xfs_ail_update_finish(ailp, tail_lsn); @@ -1319,7 +1320,7 @@ xfs_qm_dqflush( dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, - &dqp->q_logitem.qli_item.li_lsn); + &lip->li_lsn); /* * copy the lsn into the on-disk dquot now while we have the in memory @@ -1331,7 +1332,7 @@ xfs_qm_dqflush( * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_has_crc(mp)) { - dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + dqblk->dd_lsn = cpu_to_be64(lip->li_lsn); xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } @@ -1341,7 +1342,7 @@ xfs_qm_dqflush( * the AIL and release the flush lock once the dquot is synced to disk. 
*/ bp->b_flags |= _XBF_DQUOTS; - list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list); + list_add_tail(&lip->li_bio_list, &bp->b_li_list); /* * If the buffer is pinned then push on the log so we won't -- cgit v1.2.3 From acc8f8628c3737108f36e5637f4d5daeaf96d90e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:38 -0800 Subject: xfs: attach dquot buffer to dquot log item buffer Ever since 6.12-rc1, I've observed a pile of warnings from the kernel when running fstests with quotas enabled: WARNING: CPU: 1 PID: 458580 at mm/page_alloc.c:4221 __alloc_pages_noprof+0xc9c/0xf18 CPU: 1 UID: 0 PID: 458580 Comm: xfsaild/sda3 Tainted: G W 6.12.0-rc6-djwa #rc6 6ee3e0e531f6457e2d26aa008a3b65ff184b377c Call trace: __alloc_pages_noprof+0xc9c/0xf18 alloc_pages_mpol_noprof+0x94/0x240 alloc_pages_noprof+0x68/0xf8 new_slab+0x3e0/0x568 ___slab_alloc+0x5a0/0xb88 __slab_alloc.constprop.0+0x7c/0xf8 __kmalloc_noprof+0x404/0x4d0 xfs_buf_get_map+0x594/0xde0 [xfs 384cb02810558b4c490343c164e9407332118f88] xfs_buf_read_map+0x64/0x2e0 [xfs 384cb02810558b4c490343c164e9407332118f88] xfs_trans_read_buf_map+0x1dc/0x518 [xfs 384cb02810558b4c490343c164e9407332118f88] xfs_qm_dqflush+0xac/0x468 [xfs 384cb02810558b4c490343c164e9407332118f88] xfs_qm_dquot_logitem_push+0xe4/0x148 [xfs 384cb02810558b4c490343c164e9407332118f88] xfsaild+0x3f4/0xde8 [xfs 384cb02810558b4c490343c164e9407332118f88] kthread+0x110/0x128 ret_from_fork+0x10/0x20 ---[ end trace 0000000000000000 ]--- This corresponds to the line: WARN_ON_ONCE(current->flags & PF_MEMALLOC); within the NOFAIL checks. What's happening here is that the XFS AIL is trying to write a disk quota update back into the filesystem, but for that it needs to read the ondisk buffer for the dquot. The buffer is not in memory anymore, probably because it was evicted. Regardless, the buffer cache tries to allocate a new buffer, but those allocations are NOFAIL. The AIL thread has marked itself PF_MEMALLOC (aka noreclaim) since commit 43ff2122e6492b ("xfs: on-stack delayed write buffer lists") presumably because reclaim can push on XFS to push on the AIL. An easy way to fix this probably would have been to drop the NOFAIL flag from the xfs_buf allocation and open code a retry loop, but then there's still the problem that for bs>ps filesystems, the buffer itself could require up to 64k worth of pages. Inode items had similar behavior (multi-page cluster buffers that we don't want to allocate in the AIL) which we solved by making transaction precommit attach the inode cluster buffers to the dirty log item. Let's solve the dquot problem in the same way. So: Make a real precommit handler to read the dquot buffer and attach it to the log item; pass it to dqflush in the push method; and have the iodone function detach the buffer once we've flushed everything. Add a state flag to the log item to track when a thread has entered the precommit -> push mechanism to skip the detaching if it turns out that the dquot is very busy, as we don't hold the dquot lock between log item commit and AIL push). Reading and attaching the dquot buffer in the precommit hook is inspired by the work done for inode cluster buffers some time ago. Cc: # v6.12 Fixes: 903edea6c53f09 ("mm: warn about illegal __GFP_NOFAIL usage in a more appropriate location and manner") Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 130 ++++++++++++++++++++++++++++++++++++++++++++++-- fs/xfs/xfs_dquot.h | 6 ++- fs/xfs/xfs_dquot_item.c | 39 +++++++++------ fs/xfs/xfs_dquot_item.h | 7 +++ fs/xfs/xfs_qm.c | 9 ++-- fs/xfs/xfs_trans_ail.c | 2 +- 6 files changed, 169 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1dc85de58e59..708fd3358375 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -68,6 +68,30 @@ xfs_dquot_mark_sick( } } +/* + * Detach the dquot buffer if it's still attached, because we can get called + * through dqpurge after a log shutdown. Caller must hold the dqflock or have + * otherwise isolated the dquot. + */ +void +xfs_dquot_detach_buf( + struct xfs_dquot *dqp) +{ + struct xfs_dq_logitem *qlip = &dqp->q_logitem; + struct xfs_buf *bp = NULL; + + spin_lock(&qlip->qli_lock); + if (qlip->qli_item.li_buf) { + bp = qlip->qli_item.li_buf; + qlip->qli_item.li_buf = NULL; + } + spin_unlock(&qlip->qli_lock); + if (bp) { + list_del_init(&qlip->qli_item.li_bio_list); + xfs_buf_rele(bp); + } +} + /* * This is called to free all the memory associated with a dquot */ @@ -76,6 +100,7 @@ xfs_qm_dqdestroy( struct xfs_dquot *dqp) { ASSERT(list_empty(&dqp->q_lru)); + ASSERT(dqp->q_logitem.qli_item.li_buf == NULL); kvfree(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); @@ -1146,6 +1171,7 @@ xfs_qm_dqflush_done( container_of(lip, struct xfs_dq_logitem, qli_item); struct xfs_dquot *dqp = qlip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; + struct xfs_buf *bp = NULL; xfs_lsn_t tail_lsn; /* @@ -1171,6 +1197,20 @@ xfs_qm_dqflush_done( } } + /* + * If this dquot hasn't been dirtied since initiating the last dqflush, + * release the buffer reference. We already unlinked this dquot item + * from the buffer. + */ + spin_lock(&qlip->qli_lock); + if (!qlip->qli_dirty) { + bp = lip->li_buf; + lip->li_buf = NULL; + } + spin_unlock(&qlip->qli_lock); + if (bp) + xfs_buf_rele(bp); + /* * Release the dq's flush lock since we're done with it. */ @@ -1197,7 +1237,7 @@ xfs_buf_dquot_io_fail( spin_lock(&bp->b_mount->m_ail->ail_lock); list_for_each_entry(lip, &bp->b_li_list, li_bio_list) - xfs_set_li_failed(lip, bp); + set_bit(XFS_LI_FAILED, &lip->li_flags); spin_unlock(&bp->b_mount->m_ail->ail_lock); } @@ -1249,6 +1289,7 @@ int xfs_dquot_read_buf( struct xfs_trans *tp, struct xfs_dquot *dqp, + xfs_buf_flags_t xbf_flags, struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; @@ -1256,7 +1297,7 @@ xfs_dquot_read_buf( int error; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, + mp->m_quotainfo->qi_dqchunklen, xbf_flags, &bp, &xfs_dquot_buf_ops); if (error == -EAGAIN) return error; @@ -1275,6 +1316,77 @@ out_abort: return error; } +/* + * Attach a dquot buffer to this dquot to avoid allocating a buffer during a + * dqflush, since dqflush can be called from reclaim context. + */ +int +xfs_dquot_attach_buf( + struct xfs_trans *tp, + struct xfs_dquot *dqp) +{ + struct xfs_dq_logitem *qlip = &dqp->q_logitem; + struct xfs_log_item *lip = &qlip->qli_item; + int error; + + spin_lock(&qlip->qli_lock); + if (!lip->li_buf) { + struct xfs_buf *bp = NULL; + + spin_unlock(&qlip->qli_lock); + error = xfs_dquot_read_buf(tp, dqp, 0, &bp); + if (error) + return error; + + /* + * Attach the dquot to the buffer so that the AIL does not have + * to read the dquot buffer to push this item. 
+ */ + xfs_buf_hold(bp); + spin_lock(&qlip->qli_lock); + lip->li_buf = bp; + xfs_trans_brelse(tp, bp); + } + qlip->qli_dirty = true; + spin_unlock(&qlip->qli_lock); + + return 0; +} + +/* + * Get a new reference the dquot buffer attached to this dquot for a dqflush + * operation. + * + * Returns 0 and a NULL bp if none was attached to the dquot; 0 and a locked + * bp; or -EAGAIN if the buffer could not be locked. + */ +int +xfs_dquot_use_attached_buf( + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_buf *bp = dqp->q_logitem.qli_item.li_buf; + + /* + * A NULL buffer can happen if the dquot dirty flag was set but the + * filesystem shut down before transaction commit happened. In that + * case we're not going to flush anyway. + */ + if (!bp) { + ASSERT(xfs_is_shutdown(dqp->q_mount)); + + *bpp = NULL; + return 0; + } + + if (!xfs_buf_trylock(bp)) + return -EAGAIN; + + xfs_buf_hold(bp); + *bpp = bp; + return 0; +} + /* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. @@ -1289,7 +1401,8 @@ xfs_qm_dqflush( struct xfs_buf *bp) { struct xfs_mount *mp = dqp->q_mount; - struct xfs_log_item *lip = &dqp->q_logitem.qli_item; + struct xfs_dq_logitem *qlip = &dqp->q_logitem; + struct xfs_log_item *lip = &qlip->qli_item; struct xfs_dqblk *dqblk; xfs_failaddr_t fa; int error; @@ -1319,8 +1432,15 @@ xfs_qm_dqflush( */ dqp->q_flags &= ~XFS_DQFLAG_DIRTY; - xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, - &lip->li_lsn); + /* + * We hold the dquot lock, so nobody can dirty it while we're + * scheduling the write out. Clear the dirty-since-flush flag. + */ + spin_lock(&qlip->qli_lock); + qlip->qli_dirty = false; + spin_unlock(&qlip->qli_lock); + + xfs_trans_ail_copy_lsn(mp->m_ail, &qlip->qli_flush_lsn, &lip->li_lsn); /* * copy the lsn into the on-disk dquot now while we have the in memory diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 50f8404c4117..c7e80fc90823 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -215,7 +215,7 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); void xfs_qm_dqdestroy(struct xfs_dquot *dqp); int xfs_dquot_read_buf(struct xfs_trans *tp, struct xfs_dquot *dqp, - struct xfs_buf **bpp); + xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf *bp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); void xfs_qm_adjust_dqtimers(struct xfs_dquot *d); @@ -239,6 +239,10 @@ void xfs_dqlockn(struct xfs_dqtrx *q); void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); +int xfs_dquot_attach_buf(struct xfs_trans *tp, struct xfs_dquot *dqp); +int xfs_dquot_use_attached_buf(struct xfs_dquot *dqp, struct xfs_buf **bpp); +void xfs_dquot_detach_buf(struct xfs_dquot *dqp); + static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { xfs_dqlock(dqp); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 56ecc5ed0193..271b195ebb93 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -123,8 +123,9 @@ xfs_qm_dquot_logitem_push( __releases(&lip->li_ailp->ail_lock) __acquires(&lip->li_ailp->ail_lock) { - struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - struct xfs_buf *bp = lip->li_buf; + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_dquot *dqp = qlip->qli_dquot; + struct xfs_buf *bp; uint rval = XFS_ITEM_SUCCESS; int error; @@ -155,11 +156,10 @@ xfs_qm_dquot_logitem_push( spin_unlock(&lip->li_ailp->ail_lock); - error = xfs_dquot_read_buf(NULL, dqp, &bp); - if (error) { 
- if (error == -EAGAIN) - rval = XFS_ITEM_LOCKED; + error = xfs_dquot_use_attached_buf(dqp, &bp); + if (error == -EAGAIN) { xfs_dqfunlock(dqp); + rval = XFS_ITEM_LOCKED; goto out_relock_ail; } @@ -207,12 +207,10 @@ xfs_qm_dquot_logitem_committing( } #ifdef DEBUG_EXPENSIVE -static int -xfs_qm_dquot_logitem_precommit( - struct xfs_trans *tp, - struct xfs_log_item *lip) +static void +xfs_qm_dquot_logitem_precommit_check( + struct xfs_dquot *dqp) { - struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; struct xfs_mount *mp = dqp->q_mount; struct xfs_disk_dquot ddq = { }; xfs_failaddr_t fa; @@ -228,13 +226,24 @@ xfs_qm_dquot_logitem_precommit( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); ASSERT(fa == NULL); } - - return 0; } #else -# define xfs_qm_dquot_logitem_precommit NULL +# define xfs_qm_dquot_logitem_precommit_check(...) ((void)0) #endif +static int +xfs_qm_dquot_logitem_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_dquot *dqp = qlip->qli_dquot; + + xfs_qm_dquot_logitem_precommit_check(dqp); + + return xfs_dquot_attach_buf(tp, dqp); +} + static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, .iop_precommit = xfs_qm_dquot_logitem_precommit, @@ -259,5 +268,7 @@ xfs_qm_dquot_logitem_init( xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, &xfs_dquot_item_ops); + spin_lock_init(&lp->qli_lock); lp->qli_dquot = dqp; + lp->qli_dirty = false; } diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 794710c24474..d66e52807d76 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -14,6 +14,13 @@ struct xfs_dq_logitem { struct xfs_log_item qli_item; /* common portion */ struct xfs_dquot *qli_dquot; /* dquot ptr */ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ + + /* + * We use this spinlock to coordinate access to the li_buf pointer in + * the log item and the qli_dirty flag. + */ + spinlock_t qli_lock; + bool qli_dirty; /* dirtied since last flush? */ }; void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index d9ac50a33c57..7d07d4b5c339 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -148,7 +148,7 @@ xfs_qm_dqpurge( * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. 
*/ - error = xfs_dquot_read_buf(NULL, dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp); if (error == -EAGAIN) { xfs_dqfunlock(dqp); dqp->q_flags &= ~XFS_DQFLAG_FREEING; @@ -168,6 +168,7 @@ xfs_qm_dqpurge( } xfs_dqflock(dqp); } + xfs_dquot_detach_buf(dqp); out_funlock: ASSERT(atomic_read(&dqp->q_pincount) == 0); @@ -505,7 +506,7 @@ xfs_qm_dquot_isolate( /* we have to drop the LRU lock to flush the dquot */ spin_unlock(&lru->lock); - error = xfs_dquot_read_buf(NULL, dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp); if (error) { xfs_dqfunlock(dqp); goto out_unlock_dirty; @@ -523,6 +524,8 @@ xfs_qm_dquot_isolate( xfs_buf_relse(bp); goto out_unlock_dirty; } + + xfs_dquot_detach_buf(dqp); xfs_dqfunlock(dqp); /* @@ -1510,7 +1513,7 @@ xfs_qm_flush_one( goto out_unlock; } - error = xfs_dquot_read_buf(NULL, dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 8ede9d099d1f..f56d62dced97 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -360,7 +360,7 @@ xfsaild_resubmit_item( /* protected by ail_lock */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (bp->b_flags & _XBF_INODES) + if (bp->b_flags & (_XBF_INODES | _XBF_DQUOTS)) clear_bit(XFS_LI_FAILED, &lip->li_flags); else xfs_clear_li_failed(lip); -- cgit v1.2.3 From ca378189fdfa890a4f0622f85ee41b710bbac271 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:39 -0800 Subject: xfs: convert quotacheck to attach dquot buffers Now that we've converted the dquot logging machinery to attach the dquot buffer to the li_buf pointer so that the AIL dqflush doesn't have to allocate or read buffers in a reclaim path, do the same for the quotacheck code so that the reclaim shrinker dqflush call doesn't have to do that either. Cc: # v6.12 Fixes: 903edea6c53f09 ("mm: warn about illegal __GFP_NOFAIL usage in a more appropriate location and manner") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 9 +++------ fs/xfs/xfs_dquot.h | 2 -- fs/xfs/xfs_qm.c | 18 +++++++++++++----- 3 files changed, 16 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 708fd3358375..f11d475898f2 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1285,11 +1285,10 @@ xfs_qm_dqflush_check( * Requires dquot flush lock, will clear the dirty flag, delete the quota log * item from the AIL, and shut down the system if something goes wrong. 
*/ -int +static int xfs_dquot_read_buf( struct xfs_trans *tp, struct xfs_dquot *dqp, - xfs_buf_flags_t xbf_flags, struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; @@ -1297,10 +1296,8 @@ xfs_dquot_read_buf( int error; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, xbf_flags, + mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); - if (error == -EAGAIN) - return error; if (xfs_metadata_is_sick(error)) xfs_dquot_mark_sick(dqp); if (error) @@ -1334,7 +1331,7 @@ xfs_dquot_attach_buf( struct xfs_buf *bp = NULL; spin_unlock(&qlip->qli_lock); - error = xfs_dquot_read_buf(tp, dqp, 0, &bp); + error = xfs_dquot_read_buf(tp, dqp, &bp); if (error) return error; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index c7e80fc90823..c617bac75361 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -214,8 +214,6 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) void xfs_qm_dqdestroy(struct xfs_dquot *dqp); -int xfs_dquot_read_buf(struct xfs_trans *tp, struct xfs_dquot *dqp, - xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf *bp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); void xfs_qm_adjust_dqtimers(struct xfs_dquot *d); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 7d07d4b5c339..69b70c3e999d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -148,13 +148,13 @@ xfs_qm_dqpurge( * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp); + error = xfs_dquot_use_attached_buf(dqp, &bp); if (error == -EAGAIN) { xfs_dqfunlock(dqp); dqp->q_flags &= ~XFS_DQFLAG_FREEING; goto out_unlock; } - if (error) + if (!bp) goto out_funlock; /* @@ -506,8 +506,8 @@ xfs_qm_dquot_isolate( /* we have to drop the LRU lock to flush the dquot */ spin_unlock(&lru->lock); - error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp); - if (error) { + error = xfs_dquot_use_attached_buf(dqp, &bp); + if (!bp || error == -EAGAIN) { xfs_dqfunlock(dqp); goto out_unlock_dirty; } @@ -1331,6 +1331,10 @@ xfs_qm_quotacheck_dqadjust( return error; } + error = xfs_dquot_attach_buf(NULL, dqp); + if (error) + return error; + trace_xfs_dqadjust(dqp); /* @@ -1513,9 +1517,13 @@ xfs_qm_flush_one( goto out_unlock; } - error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp); + error = xfs_dquot_use_attached_buf(dqp, &bp); if (error) goto out_unlock; + if (!bp) { + error = -EFSCORRUPTED; + goto out_unlock; + } error = xfs_qm_dqflush(dqp, bp); if (!error) -- cgit v1.2.3 From 7f8a44f37229fc76bfcafa341a4b8862368ef44a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:39 -0800 Subject: xfs: fix sb_spino_align checks for large fsblock sizes For a sparse inodes filesystem, mkfs.xfs computes the values of sb_spino_align and sb_inoalignmt with the following code: int cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; if (cfg->sb_feat.crcs_enabled) cluster_size *= cfg->inodesize / XFS_DINODE_MIN_SIZE; sbp->sb_spino_align = cluster_size >> cfg->blocklog; sbp->sb_inoalignmt = XFS_INODES_PER_CHUNK * cfg->inodesize >> cfg->blocklog; On a V5 filesystem with 64k fsblocks and 512 byte inodes, this results in cluster_size = 8192 * (512 / 256) = 16384. As a result, sb_spino_align and sb_inoalignmt are both set to zero. 
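For reference, here is the arithmetic spelled out (a worked example assuming blocklog = 16 for 64k blocks, XFS_INODE_BIG_CLUSTER_SIZE = 8192, XFS_DINODE_MIN_SIZE = 256, and XFS_INODES_PER_CHUNK = 64):

	cluster_size   = 8192 * (512 / 256)  = 16384
	sb_spino_align = 16384 >> 16         = 0
	sb_inoalignmt  = (64 * 512) >> 16    = 32768 >> 16 = 0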
Unfortunately, this trips the new sb_spino_align check that was just added to xfs_validate_sb_common, and the mkfs fails: # mkfs.xfs -f -b size=64k, /dev/sda meta-data=/dev/sda isize=512 agcount=4, agsize=81136 blks = sectsz=512 attr=2, projid32bit=1 = crc=1 finobt=1, sparse=1, rmapbt=1 = reflink=1 bigtime=1 inobtcount=1 nrext64=1 = exchange=0 metadir=0 data = bsize=65536 blocks=324544, imaxpct=25 = sunit=0 swidth=0 blks naming =version 2 bsize=65536 ascii-ci=0, ftype=1, parent=0 log =internal log bsize=65536 blocks=5006, version=2 = sectsz=512 sunit=0 blks, lazy-count=1 realtime =none extsz=65536 blocks=0, rtextents=0 = rgcount=0 rgsize=0 extents Discarding blocks...Sparse inode alignment (0) is invalid. Metadata corruption detected at 0x560ac5a80bbe, xfs_sb block 0x0/0x200 libxfs_bwrite: write verifier failed on xfs_sb bno 0x0/0x1 mkfs.xfs: Releasing dirty buffer to free list! found dirty buffer (bulk) on free list! Sparse inode alignment (0) is invalid. Metadata corruption detected at 0x560ac5a80bbe, xfs_sb block 0x0/0x200 libxfs_bwrite: write verifier failed on xfs_sb bno 0x0/0x1 mkfs.xfs: writing AG headers failed, err=22 Prior to commit 59e43f5479cce1 this all worked fine, even if "sparse" inodes are somewhat meaningless when everything fits in a single fsblock. Adjust the checks to handle existing filesystems. Cc: # v6.13-rc1 Fixes: 59e43f5479cce1 ("xfs: sb_spino_align is not verified") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_sb.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a809513a290c..3b5623611eba 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -494,12 +494,13 @@ xfs_validate_sb_common( return -EINVAL; } - if (!sbp->sb_spino_align || - sbp->sb_spino_align > sbp->sb_inoalignmt || - (sbp->sb_inoalignmt % sbp->sb_spino_align) != 0) { + if (sbp->sb_spino_align && + (sbp->sb_spino_align > sbp->sb_inoalignmt || + (sbp->sb_inoalignmt % sbp->sb_spino_align) != 0)) { xfs_warn(mp, - "Sparse inode alignment (%u) is invalid.", - sbp->sb_spino_align); +"Sparse inode alignment (%u) is invalid, must be integer factor of (%u).", + sbp->sb_spino_align, + sbp->sb_inoalignmt); return -EINVAL; } } else if (sbp->sb_spino_align) { -- cgit v1.2.3 From 3853b5e1d7ccb83f572df8a12619d1a58d266d6d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:40 -0800 Subject: xfs: don't move nondir/nonreg temporary repair files to the metadir namespace Only directories or regular files are allowed in the metadata directory tree. Don't move the repair tempfile to the metadir namespace if this is not true; this will cause the inode verifiers to trip. xrep_tempfile_adjust_directory_tree opportunistically moves sc->tempip from the regular directory tree to the metadata directory tree if sc->ip is part of the metadata directory tree. However, the scrub setup functions grab sc->ip and create sc->tempip before we actually get around to checking if the file mode is the right type for the scrubber. IOWs, you can invoke the symlink scrubber with the file handle of a subdirectory in the metadir. xrep_setup_symlink will create a temporary symlink file, xrep_tempfile_adjust_directory_tree will foolishly try to set the METADATA flag on the temp symlink, which trips the inode verifier in the inode item precommit, which shuts down the filesystem when expensive checks are turned on. 
If they're /not/ turned on, then xchk_symlink will return ENOENT when it sees that it's been passed a symlink, but the invalid inode could still get flushed to disk. We don't want that. Cc: # v6.13-rc1 Fixes: 9dc31acb01a1c7 ("xfs: move repair temporary files to the metadata directory tree") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/tempfile.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index dc3802c7f678..2d7ca7e1bbca 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -184,11 +184,18 @@ out_release_dquots: } /* + * Move sc->tempip from the regular directory tree to the metadata directory + * tree if sc->ip is part of the metadata directory tree and tempip has an + * eligible file mode. + * * Temporary files have to be created before we even know which inode we're * going to scrub, so we assume that they will be part of the regular directory * tree. If it turns out that we're actually scrubbing a file from the * metadata directory tree, we have to subtract the temp file from the root - * dquots and detach the dquots. + * dquots and detach the dquots prior to setting the METADATA iflag. However, + * the scrub setup functions grab sc->ip and create sc->tempip before we + * actually get around to checking if the file mode is the right type for the + * scrubber. */ int xrep_tempfile_adjust_directory_tree( @@ -204,6 +211,9 @@ xrep_tempfile_adjust_directory_tree( if (!sc->ip || !xfs_is_metadir_inode(sc->ip)) return 0; + if (!S_ISDIR(VFS_I(sc->tempip)->i_mode) && + !S_ISREG(VFS_I(sc->tempip)->i_mode)) + return 0; xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL); sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; -- cgit v1.2.3 From e57e083be9b9bc5c341e1245b988f290c09a5ed7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:41 -0800 Subject: xfs: don't crash on corrupt /quotas dirent If the /quotas dirent points to an inode but the inode isn't loadable (and hence mkdir returns -EEXIST), don't crash, just bail out. Cc: # v6.13-rc1 Fixes: e80fbe1ad8eff7 ("xfs: use metadir for quota inodes") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_qm.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 69b70c3e999d..dc8b1010d4d3 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -731,6 +731,13 @@ xfs_qm_create_metadir_qinos( error = xfs_dqinode_mkdir_parent(mp, &qi->qi_dirip); if (error && error != -EEXIST) return error; + /* + * If the /quotas dirent points to an inode that isn't + * loadable, qi_dirip will be NULL but mkdir_parent will return + * -EEXIST. In this case the metadir is corrupt, so bail out. + */ + if (XFS_IS_CORRUPT(mp, qi->qi_dirip == NULL)) + return -EFSCORRUPTED; } if (XFS_IS_UQUOTA_ON(mp) && !qi->qi_uquotaip) { -- cgit v1.2.3 From 06b20ef09ba16374e1e68f1e8dbe434c4ad4e6fd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:42 -0800 Subject: xfs: check pre-metadir fields correctly The checks that were added to the superblock scrubber for metadata directories aren't quite right -- the old inode pointers are now defined to be zeroes until someone else reuses them. Also consolidate the new metadir field checks to one place; they were inexplicably scattered around. Cc: # v6.13-rc1 Fixes: 28d756d4d562dc ("xfs: update sb field checks when metadir is turned on") Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/agheader.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 1d41b85478da..88063d67cb5f 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -145,8 +145,11 @@ xchk_superblock( xchk_block_set_preen(sc, bp); if (xfs_has_metadir(sc->mp)) { - if (sb->sb_metadirino != cpu_to_be64(mp->m_sb.sb_metadirino)) - xchk_block_set_preen(sc, bp); + if (sb->sb_rbmino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rsumino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); } else { if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino)) xchk_block_set_preen(sc, bp); @@ -229,7 +232,13 @@ xchk_superblock( * sb_icount, sb_ifree, sb_fdblocks, sb_frexents */ - if (!xfs_has_metadir(mp)) { + if (xfs_has_metadir(mp)) { + if (sb->sb_uquotino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_gquotino != cpu_to_be64(0)) + xchk_block_set_preen(sc, bp); + } else { if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino)) xchk_block_set_preen(sc, bp); @@ -281,15 +290,8 @@ xchk_superblock( if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok))) xchk_block_set_corrupt(sc, bp); - if (xfs_has_metadir(mp)) { - if (sb->sb_rgblklog != mp->m_sb.sb_rgblklog) - xchk_block_set_corrupt(sc, bp); - if (memchr_inv(sb->sb_pad, 0, sizeof(sb->sb_pad))) - xchk_block_set_preen(sc, bp); - } else { - if (sb->sb_features2 != sb->sb_bad_features2) - xchk_block_set_preen(sc, bp); - } + if (sb->sb_features2 != sb->sb_bad_features2) + xchk_block_set_preen(sc, bp); } /* Check sb_features2 flags that are set at mkfs time. */ @@ -351,7 +353,10 @@ xchk_superblock( if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align)) xchk_block_set_corrupt(sc, bp); - if (!xfs_has_metadir(mp)) { + if (xfs_has_metadir(mp)) { + if (sb->sb_pquotino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); + } else { if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino)) xchk_block_set_preen(sc, bp); } @@ -366,11 +371,20 @@ xchk_superblock( } if (xfs_has_metadir(mp)) { + if (sb->sb_metadirino != cpu_to_be64(mp->m_sb.sb_metadirino)) + xchk_block_set_preen(sc, bp); + if (sb->sb_rgcount != cpu_to_be32(mp->m_sb.sb_rgcount)) xchk_block_set_corrupt(sc, bp); if (sb->sb_rgextents != cpu_to_be32(mp->m_sb.sb_rgextents)) xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rgblklog != mp->m_sb.sb_rgblklog) + xchk_block_set_corrupt(sc, bp); + + if (memchr_inv(sb->sb_pad, 0, sizeof(sb->sb_pad))) + xchk_block_set_corrupt(sc, bp); } /* Everything else must be zero. */ -- cgit v1.2.3 From c004a793e0ec34047c3bd423bcd8966f5fac88dc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:42 -0800 Subject: xfs: fix zero byte checking in the superblock scrubber The logic to check that the region past the end of the superblock is all zeroes is wrong -- we don't want to check only the bytes past the end of the maximally sized ondisk superblock structure as currently defined in xfs_format.h; we want to check the bytes beyond the end of the ondisk as defined by the feature bits. Port the superblock size logic from xfs_repair and then put it to use in xfs_scrub. Cc: # v4.15 Fixes: 21fb4cb1981ef7 ("xfs: scrub the secondary superblocks") Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/agheader.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 88063d67cb5f..9f8c312dfd3c 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -59,6 +59,32 @@ xchk_superblock_xref( /* scrub teardown will take care of sc->sa for us */ } +/* + * Calculate the ondisk superblock size in bytes given the feature set of the + * mounted filesystem (aka the primary sb). This is subtlely different from + * the logic in xfs_repair, which computes the size of a secondary sb given the + * featureset listed in the secondary sb. + */ +STATIC size_t +xchk_superblock_ondisk_size( + struct xfs_mount *mp) +{ + if (xfs_has_metadir(mp)) + return offsetofend(struct xfs_dsb, sb_pad); + if (xfs_has_metauuid(mp)) + return offsetofend(struct xfs_dsb, sb_meta_uuid); + if (xfs_has_crc(mp)) + return offsetofend(struct xfs_dsb, sb_lsn); + if (xfs_sb_version_hasmorebits(&mp->m_sb)) + return offsetofend(struct xfs_dsb, sb_bad_features2); + if (xfs_has_logv2(mp)) + return offsetofend(struct xfs_dsb, sb_logsunit); + if (xfs_has_sector(mp)) + return offsetofend(struct xfs_dsb, sb_logsectsize); + /* only support dirv2 or more recent */ + return offsetofend(struct xfs_dsb, sb_dirblklog); +} + /* * Scrub the filesystem superblock. * @@ -75,6 +101,7 @@ xchk_superblock( struct xfs_buf *bp; struct xfs_dsb *sb; struct xfs_perag *pag; + size_t sblen; xfs_agnumber_t agno; uint32_t v2_ok; __be32 features_mask; @@ -388,8 +415,8 @@ xchk_superblock( } /* Everything else must be zero. */ - if (memchr_inv(sb + 1, 0, - BBTOB(bp->b_length) - sizeof(struct xfs_dsb))) + sblen = xchk_superblock_ondisk_size(mp); + if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen)) xchk_block_set_corrupt(sc, bp); xchk_superblock_xref(sc, bp); -- cgit v1.2.3 From 7f8b718c58783f3ff0810b39e2f62f50ba2549f6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:43 -0800 Subject: xfs: return from xfs_symlink_verify early on V4 filesystems V4 symlink blocks didn't have headers, so return early if this is a V4 filesystem. Cc: # v5.1 Fixes: 39708c20ab5133 ("xfs: miscellaneous verifier magic value fixups") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_symlink_remote.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index f228127a88ff..fb47a76ead18 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -92,8 +92,10 @@ xfs_symlink_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_dsymlink_hdr *dsl = bp->b_addr; + /* no verification of non-crc buffers */ if (!xfs_has_crc(mp)) - return __this_address; + return NULL; + if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) -- cgit v1.2.3 From 12f2930f5f91bc0d67794c69d1961098c7c72040 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:58:11 -0800 Subject: xfs: port xfs_ioc_start_commit to multigrain timestamps Take advantage of the multigrain timestamp APIs to ensure that nobody can sneak in and write things to a file between starting a file update operation and committing the results. 
This should have been part of the multigrain timestamp merge, but I forgot to fling it at jlayton when he resubmitted the patchset due to developer bandwidth problems. Cc: # v6.13-rc1 Fixes: 4e40eff0b5737c ("fs: add infrastructure for multigrain timestamps") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Jeff Layton --- fs/xfs/xfs_exchrange.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c index 9ab05ad224d1..265c42449893 100644 --- a/fs/xfs/xfs_exchrange.c +++ b/fs/xfs/xfs_exchrange.c @@ -854,7 +854,7 @@ xfs_ioc_start_commit( struct xfs_commit_range __user *argp) { struct xfs_commit_range args = { }; - struct timespec64 ts; + struct kstat kstat = { }; struct xfs_commit_range_fresh *kern_f; struct xfs_commit_range_fresh __user *user_f; struct inode *inode2 = file_inode(file); @@ -871,12 +871,12 @@ xfs_ioc_start_commit( memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); xfs_ilock(ip2, lockflags); - ts = inode_get_ctime(inode2); - kern_f->file2_ctime = ts.tv_sec; - kern_f->file2_ctime_nsec = ts.tv_nsec; - ts = inode_get_mtime(inode2); - kern_f->file2_mtime = ts.tv_sec; - kern_f->file2_mtime_nsec = ts.tv_nsec; + /* Force writing of a distinct ctime if any writes happen. */ + fill_mg_cmtime(&kstat, STATX_CTIME | STATX_MTIME, inode2); + kern_f->file2_ctime = kstat.ctime.tv_sec; + kern_f->file2_ctime_nsec = kstat.ctime.tv_nsec; + kern_f->file2_mtime = kstat.mtime.tv_sec; + kern_f->file2_mtime_nsec = kstat.mtime.tv_nsec; kern_f->file2_ino = ip2->i_ino; kern_f->file2_gen = inode2->i_generation; kern_f->magic = XCR_FRESH_MAGIC; -- cgit v1.2.3 From aa21f333c86c8a09d39189de87abb0153d338190 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 11 Dec 2024 13:11:17 +0100 Subject: fs: fix is_mnt_ns_file() Commit 1fa08aece425 ("nsfs: convert to path_from_stashed() helper") reused nsfs dentry's d_fsdata, which no longer contains a pointer to proc_ns_operations. Fix the remaining use in is_mnt_ns_file(). Fixes: 1fa08aece425 ("nsfs: convert to path_from_stashed() helper") Cc: stable@vger.kernel.org # v6.9 Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20241211121118.85268-1-mszeredi@redhat.com Acked-by: Al Viro Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/namespace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 23e81c2a1e3f..6eec7794f707 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2055,9 +2055,15 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) static bool is_mnt_ns_file(struct dentry *dentry) { + struct ns_common *ns; + /* Is this a proxy for a mount namespace? */ - return dentry->d_op == &ns_dentry_operations && - dentry->d_fsdata == &mntns_operations; + if (dentry->d_op != &ns_dentry_operations) + return false; + + ns = d_inode(dentry)->i_private; + + return ns->ops == &mntns_operations; } struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) -- cgit v1.2.3 From 78f2560fc9fa5ccaaf23ac78edb732c08bad7a92 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 3 Dec 2024 00:01:10 +0100 Subject: fuse: Set *nbytesp=0 in fuse_get_user_pages on allocation failure In fuse_get_user_pages(), set *nbytesp to 0 when struct page **pages allocation fails. This prevents the caller (fuse_direct_io) from making incorrect assumptions that could lead to NULL pointer dereferences when processing the request reply. 
Previously, *nbytesp was left unmodified on allocation failure, which could cause issues if the caller assumed pages had been added to ap->descs[] when they hadn't. Reported-by: syzbot+87b8e6ed25dbc41759f7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=87b8e6ed25dbc41759f7 Fixes: 3b97c3652d91 ("fuse: convert direct io to use folios") Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong Tested-by: Dmitry Antipov Tested-by: David Howells Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 15b08d6a5739..7d92a5479998 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1541,8 +1541,10 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, */ struct page **pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); - if (!pages) - return -ENOMEM; + if (!pages) { + ret = -ENOMEM; + goto out; + } while (nbytes < *nbytesp && nr_pages < max_pages) { unsigned nfolios, i; @@ -1588,6 +1590,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, else ap->args.out_pages = true; +out: *nbytesp = nbytes; return ret < 0 ? ret : 0; -- cgit v1.2.3 From 9698d5a4836549d394e6efd858b5200878c9f255 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:02:23 +0100 Subject: pidfs: rework inode number allocation Recently we received a patchset that aims to enable file handle encoding and decoding via name_to_handle_at(2) and open_by_handle_at(2). A crucical step in the patch series is how to go from inode number to struct pid without leaking information into unprivileged contexts. The issue is that in order to find a struct pid the pid number in the initial pid namespace must be encoded into the file handle via name_to_handle_at(2). This can be used by containers using a separate pid namespace to learn what the pid number of a given process in the initial pid namespace is. While this is a weak information leak it could be used in various exploits and in general is an ugly wart in the design. To solve this problem a new way is needed to lookup a struct pid based on the inode number allocated for that struct pid. The other part is to remove the custom inode number allocation on 32bit systems that is also an ugly wart that should go away. So, a new scheme is used that I was discusssing with Tejun some time back. A cyclic ida is used for the lower 32 bits and a the high 32 bits are used for the generation number. This gives a 64 bit inode number that is unique on both 32 bit and 64 bit. The lower 32 bit number is recycled slowly and can be used to lookup struct pids. Link: https://lore.kernel.org/r/20241129-work-pidfs-v2-1-61043d66fbce@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/pidfs.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/pidfs.h | 2 ++ kernel/pid.c | 14 ++++----- 3 files changed, 86 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/pidfs.c b/fs/pidfs.c index 618abb1fa1b8..abfe96be9ffe 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -23,6 +23,79 @@ #include "internal.h" #include "mount.h" +static DEFINE_IDR(pidfs_ino_idr); + +static u32 pidfs_ino_upper_32_bits = 0; + +#if BITS_PER_LONG == 32 +/* + * On 32 bit systems the lower 32 bits are the inode number and + * the higher 32 bits are the generation number. 
The starting + * value for the inode number and the generation number is one. + */ +static u32 pidfs_ino_lower_32_bits = 1; + +static inline unsigned long pidfs_ino(u64 ino) +{ + return lower_32_bits(ino); +} + +/* On 32 bit the generation number are the upper 32 bits. */ +static inline u32 pidfs_gen(u64 ino) +{ + return upper_32_bits(ino); +} + +#else + +static u32 pidfs_ino_lower_32_bits = 0; + +/* On 64 bit simply return ino. */ +static inline unsigned long pidfs_ino(u64 ino) +{ + return ino; +} + +/* On 64 bit the generation number is 1. */ +static inline u32 pidfs_gen(u64 ino) +{ + return 1; +} +#endif + +/* + * Construct an inode number for struct pid in a way that we can use the + * lower 32bit to lookup struct pid independent of any pid numbers that + * could be leaked into userspace (e.g., via file handle encoding). + */ +int pidfs_add_pid(struct pid *pid) +{ + u32 upper; + int lower; + + /* + * Inode numbering for pidfs start at 2. This avoids collisions + * with the root inode which is 1 for pseudo filesystems. + */ + lower = idr_alloc_cyclic(&pidfs_ino_idr, pid, 2, 0, GFP_ATOMIC); + if (lower >= 0 && lower < pidfs_ino_lower_32_bits) + pidfs_ino_upper_32_bits++; + upper = pidfs_ino_upper_32_bits; + pidfs_ino_lower_32_bits = lower; + if (lower < 0) + return lower; + + pid->ino = ((u64)upper << 32) | lower; + pid->stashed = NULL; + return 0; +} + +/* The idr number to remove is the lower 32 bits of the inode. */ +void pidfs_remove_pid(struct pid *pid) +{ + idr_remove(&pidfs_ino_idr, lower_32_bits(pid->ino)); +} + #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd @@ -346,7 +419,7 @@ static inline void pidfs_free_inum(unsigned long ino) #else static inline int pidfs_inum(struct pid *pid, unsigned long *ino) { - *ino = pid->ino; + *ino = pidfs_ino(pid->ino); return 0; } #define pidfs_free_inum(ino) ((void)(ino)) @@ -429,11 +502,14 @@ static const struct dentry_operations pidfs_dentry_operations = { static int pidfs_init_inode(struct inode *inode, void *data) { + const struct pid *pid = data; + inode->i_private = data; inode->i_flags |= S_PRIVATE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; + inode->i_generation = pidfs_gen(pid->ino); /* * Inode numbering for pidfs start at RESERVED_PIDS + 1. This * avoids collisions with the root inode which is 1 for pseudo diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 75bdf9807802..2958652bb108 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -4,5 +4,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); +int pidfs_add_pid(struct pid *pid); +void pidfs_remove_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ diff --git a/kernel/pid.c b/kernel/pid.c index 115448e89c3e..58567d6904b2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -64,11 +64,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -/* - * Pseudo filesystems start inode numbering after one. We use Reserved - * PIDs as a natural offset. 
- */ -static u64 pidfs_ino = RESERVED_PIDS; /* * PID-map pages start out as NULL, they get allocated upon @@ -158,6 +153,7 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } + pidfs_remove_pid(pid); spin_unlock_irqrestore(&pidmap_lock, flags); call_rcu(&pid->rcu, delayed_put_pid); @@ -273,22 +269,26 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, INIT_HLIST_HEAD(&pid->inodes); upid = pid->numbers + ns->level; + idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; - pid->stashed = NULL; - pid->ino = ++pidfs_ino; + retval = pidfs_add_pid(pid); + if (retval) + goto out_unlock; for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); + idr_preload_end(); return pid; out_unlock: spin_unlock_irq(&pidmap_lock); + idr_preload_end(); put_pid_ns(ns); out_free: -- cgit v1.2.3 From 03c212bf3fa86820baa5bcad75cfabb845166ccd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:02:24 +0100 Subject: pidfs: remove 32bit inode number handling Now that we have a unified inode number handling model remove the custom ida-based allocation for 32bit. Link: https://lore.kernel.org/r/20241129-work-pidfs-v2-2-61043d66fbce@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/pidfs.c | 43 ++----------------------------------------- 1 file changed, 2 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/pidfs.c b/fs/pidfs.c index abfe96be9ffe..71af820eafa5 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -391,40 +391,6 @@ struct pid *pidfd_pid(const struct file *file) static struct vfsmount *pidfs_mnt __ro_after_init; -#if BITS_PER_LONG == 32 -/* - * Provide a fallback mechanism for 32-bit systems so processes remain - * reliably comparable by inode number even on those systems. - */ -static DEFINE_IDA(pidfd_inum_ida); - -static int pidfs_inum(struct pid *pid, unsigned long *ino) -{ - int ret; - - ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, - UINT_MAX, GFP_ATOMIC); - if (ret < 0) - return -ENOSPC; - - *ino = ret; - return 0; -} - -static inline void pidfs_free_inum(unsigned long ino) -{ - if (ino > 0) - ida_free(&pidfd_inum_ida, ino); -} -#else -static inline int pidfs_inum(struct pid *pid, unsigned long *ino) -{ - *ino = pidfs_ino(pid->ino); - return 0; -} -#define pidfs_free_inum(ino) ((void)(ino)) -#endif - /* * The vfs falls back to simple_setattr() if i_op->setattr() isn't * implemented. Let's reject it completely until we have a clean @@ -476,7 +442,6 @@ static void pidfs_evict_inode(struct inode *inode) clear_inode(inode); put_pid(pid); - pidfs_free_inum(inode->i_ino); } static const struct super_operations pidfs_sops = { @@ -509,13 +474,9 @@ static int pidfs_init_inode(struct inode *inode, void *data) inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; + inode->i_ino = pidfs_ino(pid->ino); inode->i_generation = pidfs_gen(pid->ino); - /* - * Inode numbering for pidfs start at RESERVED_PIDS + 1. This - * avoids collisions with the root inode which is 1 for pseudo - * filesystems. 
- */ - return pidfs_inum(data, &inode->i_ino); + return 0; } static void pidfs_put_data(void *data) -- cgit v1.2.3 From 230536ff6b06b199995687aa7fbf164970ebda85 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:02:25 +0100 Subject: pidfs: support FS_IOC_GETVERSION This will allow 32 bit userspace to detect when a given inode number has been recycled and also to construct a unique 64 bit identifier. Link: https://lore.kernel.org/r/20241129-work-pidfs-v2-3-61043d66fbce@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/pidfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/pidfs.c b/fs/pidfs.c index 71af820eafa5..8d62d900d20d 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -271,6 +271,14 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; + if (cmd == FS_IOC_GETVERSION) { + if (!arg) + return -EINVAL; + + __u32 __user *argp = (__u32 __user *)arg; + return put_user(file_inode(file)->i_generation, argp); + } + task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; -- cgit v1.2.3 From d2ab36bb115b720c9c738184d4007e1ca01c53da Mon Sep 17 00:00:00 2001 From: Erin Shepherd Date: Fri, 29 Nov 2024 14:38:00 +0100 Subject: pseudofs: add support for export_ops Pseudo-filesystems might reasonably wish to implement the export ops (particularly for name_to_handle_at/open_by_handle_at); plumb this through pseudo_fs_context Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Erin Shepherd Link: https://lore.kernel.org/r/20241113-pidfs_fh-v2-1-9a4d28155a37@e43.eu Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-1-87d803a42495@kernel.org Signed-off-by: Christian Brauner --- fs/libfs.c | 1 + include/linux/pseudo_fs.h | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 748ac5923154..2890a9c4a414 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -673,6 +673,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = ctx->magic; s->s_op = ctx->ops ?: &simple_super_operations; + s->s_export_op = ctx->eops; s->s_xattr = ctx->xattr; s->s_time_gran = 1; root = new_inode(s); diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h index 730f77381d55..2503f7625d65 100644 --- a/include/linux/pseudo_fs.h +++ b/include/linux/pseudo_fs.h @@ -5,6 +5,7 @@ struct pseudo_fs_context { const struct super_operations *ops; + const struct export_operations *eops; const struct xattr_handler * const *xattr; const struct dentry_operations *dops; unsigned long magic; -- cgit v1.2.3 From f07c7cc4684a641032c6bd439d3b91ec336e8cb5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:38:01 +0100 Subject: fhandle: simplify error handling Rely on our cleanup infrastructure. 
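For illustration, the resulting shape of do_handle_open() with the scope-based cleanup helpers is roughly the following (a simplified sketch assembled from the hunks below, not a verbatim copy); every early return releases the path and the reserved fd automatically, so the old path_put()/put_unused_fd() unwind code disappears:

	static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
				   int open_flag)
	{
		struct path path __free(path_put) = {};	/* path_put() runs on scope exit */
		struct file *file;
		long retval;

		retval = handle_to_path(mountdirfd, ufh, &path, open_flag);
		if (retval)
			return retval;

		CLASS(get_unused_fd, fd)(O_CLOEXEC);	/* fd returned to the pool unless taken */
		if (fd < 0)
			return fd;

		file = file_open_root(&path, "", open_flag, 0);
		if (IS_ERR(file))
			return PTR_ERR(file);

		fd_install(fd, file);
		return take_fd(fd);			/* claim the fd so the CLASS() keeps it */
	}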
Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-2-87d803a42495@kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fhandle.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/fhandle.c b/fs/fhandle.c index ec9145047dfc..c00d88fb14e1 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -261,19 +261,20 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path, { int handle_dwords; struct vfsmount *mnt = ctx->root.mnt; + struct dentry *dentry; /* change the handle size to multiple of sizeof(u32) */ handle_dwords = handle->handle_bytes >> 2; - path->dentry = exportfs_decode_fh_raw(mnt, - (struct fid *)handle->f_handle, - handle_dwords, handle->handle_type, - ctx->fh_flags, - vfs_dentry_acceptable, ctx); - if (IS_ERR_OR_NULL(path->dentry)) { - if (path->dentry == ERR_PTR(-ENOMEM)) + dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle, + handle_dwords, handle->handle_type, + ctx->fh_flags, vfs_dentry_acceptable, + ctx); + if (IS_ERR_OR_NULL(dentry)) { + if (dentry == ERR_PTR(-ENOMEM)) return -ENOMEM; return -ESTALE; } + path->dentry = dentry; path->mnt = mntget(mnt); return 0; } @@ -398,29 +399,23 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag) { long retval = 0; - struct path path; + struct path path __free(path_put) = {}; struct file *file; - int fd; retval = handle_to_path(mountdirfd, ufh, &path, open_flag); if (retval) return retval; - fd = get_unused_fd_flags(open_flag); - if (fd < 0) { - path_put(&path); + CLASS(get_unused_fd, fd)(O_CLOEXEC); + if (fd < 0) return fd; - } + file = file_open_root(&path, "", open_flag, 0); - if (IS_ERR(file)) { - put_unused_fd(fd); - retval = PTR_ERR(file); - } else { - retval = fd; - fd_install(fd, file); - } - path_put(&path); - return retval; + if (IS_ERR(file)) + return PTR_ERR(file); + + fd_install(fd, file); + return take_fd(fd); } /** -- cgit v1.2.3 From 50166d57ea8c5042ecba0ee22532617d72ed085a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:38:02 +0100 Subject: exportfs: add open method This allows filesystems such as pidfs to provide their custom open. 
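As a hypothetical example (the filesystem and function names below are invented for illustration), a pseudo filesystem would wire the new method up through its export_operations, and open_by_handle_at(2) then calls it instead of file_open_root():

	static struct file *examplefs_open(struct path *path, unsigned int oflags)
	{
		/* a filesystem could restrict modes or attach private state here */
		if (oflags & (O_WRONLY | O_RDWR))
			return ERR_PTR(-EPERM);
		return file_open_root(path, "", oflags, 0);
	}

	static const struct export_operations examplefs_export_ops = {
		/* .fh_to_dentry and friends omitted */
		.open	= examplefs_open,
	};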
Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-3-87d803a42495@kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fhandle.c | 7 ++++++- include/linux/exportfs.h | 5 +++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fhandle.c b/fs/fhandle.c index c00d88fb14e1..f0b818f08aaa 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -401,6 +401,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, long retval = 0; struct path path __free(path_put) = {}; struct file *file; + const struct export_operations *eops; retval = handle_to_path(mountdirfd, ufh, &path, open_flag); if (retval) @@ -410,7 +411,11 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, if (fd < 0) return fd; - file = file_open_root(&path, "", open_flag, 0); + eops = path.mnt->mnt_sb->s_export_op; + if (eops->open) + file = eops->open(&path, open_flag); + else + file = file_open_root(&path, "", open_flag, 0); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 4cc8801e50e3..c69b79b64466 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -10,6 +10,7 @@ struct inode; struct iomap; struct super_block; struct vfsmount; +struct path; /* limit the handle size to NFSv4 handle size now */ #define MAX_HANDLE_SZ 128 @@ -225,6 +226,9 @@ struct fid { * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * + * open: + * Allow filesystems to specify a custom open function. + * * commit_metadata: * @commit_metadata should commit metadata changes to stable storage. * @@ -251,6 +255,7 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); + struct file * (*open)(struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ -- cgit v1.2.3 From 6ebb05b48e9c555f23a042dcbb45280a0f26def8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:38:03 +0100 Subject: fhandle: pull CAP_DAC_READ_SEARCH check into may_decode_fh() There's no point in keeping it outside of that helper. This way we have all the permission pieces in one place. Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-4-87d803a42495@kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fhandle.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/fhandle.c b/fs/fhandle.c index f0b818f08aaa..e17029b1dc44 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -279,28 +279,32 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path, return 0; } -/* - * Allow relaxed permissions of file handles if the caller has the - * ability to mount the filesystem or create a bind-mount of the - * provided @mountdirfd. - * - * In both cases the caller may be able to get an unobstructed way to - * the encoded file handle. If the caller is only able to create a - * bind-mount we need to verify that there are no locked mounts on top - * of it that could prevent us from getting to the encoded file. 
- * - * In principle, locked mounts can prevent the caller from mounting the - * filesystem but that only applies to procfs and sysfs neither of which - * support decoding file handles. - */ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx, unsigned int o_flags) { struct path *root = &ctx->root; + if (capable(CAP_DAC_READ_SEARCH)) + return true; + /* - * Restrict to O_DIRECTORY to provide a deterministic API that avoids a - * confusing api in the face of disconnected non-dir dentries. + * Allow relaxed permissions of file handles if the caller has + * the ability to mount the filesystem or create a bind-mount of + * the provided @mountdirfd. + * + * In both cases the caller may be able to get an unobstructed + * way to the encoded file handle. If the caller is only able to + * create a bind-mount we need to verify that there are no + * locked mounts on top of it that could prevent us from getting + * to the encoded file. + * + * In principle, locked mounts can prevent the caller from + * mounting the filesystem but that only applies to procfs and + * sysfs neither of which support decoding file handles. + * + * Restrict to O_DIRECTORY to provide a deterministic API that + * avoids a confusing api in the face of disconnected non-dir + * dentries. * * There's only one dentry for each directory inode (VFS rule)... */ @@ -337,7 +341,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, if (retval) goto out_err; - if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) { + if (!may_decode_fh(&ctx, o_flags)) { retval = -EPERM; goto out_path; } -- cgit v1.2.3 From 83c47d9e0ce79b5d7c0b21b9f35402dbde0fa15c Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Sat, 14 Dec 2024 12:16:45 +0900 Subject: ksmbd: count all requests in req_running counter This changes the semantics of req_running to count all in-flight requests on a given connection, rather than the number of elements in the conn->request list. 
The latter is used only in smb2_cancel, and the counter is not used Signed-off-by: Marios Makassikis Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index c14dd72e1b30..be9656d524a4 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -120,8 +120,8 @@ void ksmbd_conn_enqueue_request(struct ksmbd_work *work) if (conn->ops->get_cmd_val(work) != SMB2_CANCEL_HE) requests_queue = &conn->requests; + atomic_inc(&conn->req_running); if (requests_queue) { - atomic_inc(&conn->req_running); spin_lock(&conn->request_lock); list_add_tail(&work->request_entry, requests_queue); spin_unlock(&conn->request_lock); @@ -132,11 +132,12 @@ void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; + atomic_dec(&conn->req_running); + if (list_empty(&work->request_entry) && list_empty(&work->async_request_entry)) return; - atomic_dec(&conn->req_running); spin_lock(&conn->request_lock); list_del_init(&work->request_entry); spin_unlock(&conn->request_lock); -- cgit v1.2.3 From 43fb7bce8866e793275c4f9f25af6a37745f3416 Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Sat, 14 Dec 2024 12:17:23 +0900 Subject: ksmbd: fix broken transfers when exceeding max simultaneous operations Since commit 0a77d947f599 ("ksmbd: check outstanding simultaneous SMB operations"), ksmbd enforces a maximum number of simultaneous operations for a connection. The problem is that reaching the limit causes ksmbd to close the socket, and the client has no indication that it should have slowed down. This behaviour can be reproduced by setting "smb2 max credits = 128" (or lower), and transferring a large file (25GB). smbclient fails as below: $ smbclient //192.168.1.254/testshare -U user%pass smb: \> put file.bin cli_push returned NT_STATUS_USER_SESSION_DELETED putting file file.bin as \file.bin smb2cli_req_compound_submit: Insufficient credits. 0 available, 1 needed NT_STATUS_INTERNAL_ERROR closing remote file \file.bin smb: \> smb2cli_req_compound_submit: Insufficient credits. 0 available, 1 needed Windows clients fail with 0x8007003b (with smaller files even). Fix this by delaying reading from the socket until there's room to allocate a request. This effectively applies backpressure on the client, so the transfer completes, albeit at a slower rate. 
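The core of the change is the wait shown below (a simplified sketch; the actual hunk uses a recheck label): the connection handler stops reading the next PDU from the socket while max_inflight_req requests are already in flight, and is woken through the req_running_q waitqueue when a request completes.

	/* before reading the next PDU off the socket */
	while (atomic_read(&conn->req_running) + 1 > max_req)
		wait_event_interruptible(conn->req_running_q,
					 atomic_read(&conn->req_running) < max_req);

Here max_req is server_conf.max_inflight_req, which defaults to SMB2_MAX_CREDITS and is overridden by the "smb2 max credits" setting passed in over IPC.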
Fixes: 0a77d947f599 ("ksmbd: check outstanding simultaneous SMB operations") Signed-off-by: Marios Makassikis Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 13 +++++++++++-- fs/smb/server/connection.h | 1 - fs/smb/server/server.c | 7 +------ fs/smb/server/server.h | 1 + fs/smb/server/transport_ipc.c | 5 ++++- 5 files changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index be9656d524a4..f8a40f65db6a 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -70,7 +70,6 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); atomic_set(&conn->refcnt, 1); - atomic_set(&conn->mux_smb_requests, 0); conn->total_credits = 1; conn->outstanding_credits = 0; @@ -133,6 +132,8 @@ void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) struct ksmbd_conn *conn = work->conn; atomic_dec(&conn->req_running); + if (waitqueue_active(&conn->req_running_q)) + wake_up(&conn->req_running_q); if (list_empty(&work->request_entry) && list_empty(&work->async_request_entry)) @@ -309,7 +310,7 @@ int ksmbd_conn_handler_loop(void *p) { struct ksmbd_conn *conn = (struct ksmbd_conn *)p; struct ksmbd_transport *t = conn->transport; - unsigned int pdu_size, max_allowed_pdu_size; + unsigned int pdu_size, max_allowed_pdu_size, max_req; char hdr_buf[4] = {0,}; int size; @@ -319,6 +320,7 @@ int ksmbd_conn_handler_loop(void *p) if (t->ops->prepare && t->ops->prepare(t)) goto out; + max_req = server_conf.max_inflight_req; conn->last_active = jiffies; set_freezable(); while (ksmbd_conn_alive(conn)) { @@ -328,6 +330,13 @@ int ksmbd_conn_handler_loop(void *p) kvfree(conn->request_buf); conn->request_buf = NULL; +recheck: + if (atomic_read(&conn->req_running) + 1 > max_req) { + wait_event_interruptible(conn->req_running_q, + atomic_read(&conn->req_running) < max_req); + goto recheck; + } + size = t->ops->read(t, hdr_buf, sizeof(hdr_buf), -1); if (size != sizeof(hdr_buf)) break; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 8ddd5a3c7baf..b379ae4fdcdf 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -107,7 +107,6 @@ struct ksmbd_conn { __le16 signing_algorithm; bool binding; atomic_t refcnt; - atomic_t mux_smb_requests; }; struct ksmbd_conn_ops { diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 3ba95bd8edeb..601e7fcbcf1e 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -270,7 +270,6 @@ static void handle_ksmbd_work(struct work_struct *wk) ksmbd_conn_try_dequeue_request(work); ksmbd_free_work_struct(work); - atomic_dec(&conn->mux_smb_requests); /* * Checking waitqueue to dropping pending requests on * disconnection. 
waitqueue_active is safe because it @@ -300,11 +299,6 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn) if (err) return 0; - if (atomic_inc_return(&conn->mux_smb_requests) >= conn->vals->max_credits) { - atomic_dec_return(&conn->mux_smb_requests); - return -ENOSPC; - } - work = ksmbd_alloc_work_struct(); if (!work) { pr_err("allocation for work failed\n"); @@ -367,6 +361,7 @@ static int server_conf_init(void) server_conf.auth_mechs |= KSMBD_AUTH_KRB5 | KSMBD_AUTH_MSKRB5; #endif + server_conf.max_inflight_req = SMB2_MAX_CREDITS; return 0; } diff --git a/fs/smb/server/server.h b/fs/smb/server/server.h index 4fc529335271..94187628ff08 100644 --- a/fs/smb/server/server.h +++ b/fs/smb/server/server.h @@ -42,6 +42,7 @@ struct ksmbd_server_config { struct smb_sid domain_sid; unsigned int auth_mechs; unsigned int max_connections; + unsigned int max_inflight_req; char *conf[SERVER_CONF_WORK_GROUP + 1]; struct task_struct *dh_task; diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 48cda3350e5a..befaf42b84cc 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -319,8 +319,11 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) init_smb2_max_write_size(req->smb2_max_write); if (req->smb2_max_trans) init_smb2_max_trans_size(req->smb2_max_trans); - if (req->smb2_max_credits) + if (req->smb2_max_credits) { init_smb2_max_credits(req->smb2_max_credits); + server_conf.max_inflight_req = + req->smb2_max_credits; + } if (req->smbd_max_io_size) init_smbd_max_io_size(req->smbd_max_io_size); -- cgit v1.2.3 From fe4ed2f09b492e3507615a053814daa8fafdecb1 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 14 Dec 2024 12:19:03 +0900 Subject: ksmbd: conn lock to serialize smb2 negotiate If client send parallel smb2 negotiate request on same connection, ksmbd_conn can be racy. smb2 negotiate handling that are not performance-related can be serialized with conn lock. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 803b35b89513..23e21845f928 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1097,6 +1097,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) return rc; } + ksmbd_conn_lock(conn); smb2_buf_len = get_rfc1002_len(work->request_buf); smb2_neg_size = offsetof(struct smb2_negotiate_req, Dialects); if (smb2_neg_size > smb2_buf_len) { @@ -1247,6 +1248,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) ksmbd_conn_set_need_negotiate(conn); err_out: + ksmbd_conn_unlock(conn); if (rc) rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; -- cgit v1.2.3 From 7b00af2c5414dc01e0718deef7ead81102867636 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 16 Dec 2024 20:53:08 +0800 Subject: erofs: use `struct erofs_device_info` for the primary device Instead of just listing each one directly in `struct erofs_sb_info` except that we still use `sb->s_bdev` for the primary block device. 
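Illustrative sketch of the resulting layout (fields abridged to those touched by this series): the primary device now reuses the same per-device description that extra devices already use, embedded as dif0.

	struct erofs_device_info {
		struct file *file;
		struct dax_device *dax_dev;
		u64 dax_part_off;
		struct erofs_fscache *fscache;
		u32 blocks;
		/* ... */
	};

	struct erofs_sb_info {
		struct erofs_device_info dif0;	/* primary device */
		/* ... */
	};

With that, the separate sbi->fdev, sbi->dax_dev, sbi->dax_part_off, sbi->s_fscache and sbi->primarydevice_blocks fields all collapse into sbi->dif0.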
Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241216125310.930933-2-hsiangkao@linux.alibaba.com --- fs/erofs/data.c | 12 ++++-------- fs/erofs/fscache.c | 6 +++--- fs/erofs/internal.h | 8 ++------ fs/erofs/super.c | 27 +++++++++++++-------------- 4 files changed, 22 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 1c49f8962021..622017c65958 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -56,10 +56,10 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) buf->file = NULL; if (erofs_is_fileio_mode(sbi)) { - buf->file = sbi->fdev; /* some fs like FUSE needs it */ + buf->file = sbi->dif0.file; /* some fs like FUSE needs it */ buf->mapping = buf->file->f_mapping; } else if (erofs_is_fscache_mode(sb)) - buf->mapping = sbi->s_fscache->inode->i_mapping; + buf->mapping = sbi->dif0.fscache->inode->i_mapping; else buf->mapping = sb->s_bdev->bd_mapping; } @@ -201,12 +201,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) erofs_off_t startoff, length; int id; - map->m_bdev = sb->s_bdev; - map->m_daxdev = EROFS_SB(sb)->dax_dev; - map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; - map->m_fscache = EROFS_SB(sb)->s_fscache; - map->m_fp = EROFS_SB(sb)->fdev; - + erofs_fill_from_devinfo(map, &EROFS_SB(sb)->dif0); + map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */ if (map->m_deviceid) { down_read(&devs->rwsem); dif = idr_find(&devs->tree, map->m_deviceid - 1); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index fda16eedafb5..ce7e38c82719 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -657,7 +657,7 @@ int erofs_fscache_register_fs(struct super_block *sb) if (IS_ERR(fscache)) return PTR_ERR(fscache); - sbi->s_fscache = fscache; + sbi->dif0.fscache = fscache; return 0; } @@ -665,14 +665,14 @@ void erofs_fscache_unregister_fs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - erofs_fscache_unregister_cookie(sbi->s_fscache); + erofs_fscache_unregister_cookie(sbi->dif0.fscache); if (sbi->domain) erofs_fscache_domain_put(sbi->domain); else fscache_relinquish_volume(sbi->volume, NULL, false); - sbi->s_fscache = NULL; + sbi->dif0.fscache = NULL; sbi->volume = NULL; sbi->domain = NULL; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1c847c30a918..3e8d71d516f4 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -107,6 +107,7 @@ struct erofs_xattr_prefix_item { }; struct erofs_sb_info { + struct erofs_device_info dif0; struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ @@ -124,13 +125,9 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ - struct file *fdev; struct inode *packed_inode; struct erofs_dev_context *devs; - struct dax_device *dax_dev; - u64 dax_part_off; u64 total_blocks; - u32 primarydevice_blocks; u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR @@ -166,7 +163,6 @@ struct erofs_sb_info { /* fscache support */ struct fscache_volume *volume; - struct erofs_fscache *s_fscache; struct erofs_domain *domain; char *fsid; char *domain_id; @@ -187,7 +183,7 @@ struct erofs_sb_info { static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi) { - return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->fdev; + return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->dif0.file; } static inline bool erofs_is_fscache_mode(struct super_block *sb) diff --git a/fs/erofs/super.c 
b/fs/erofs/super.c index de8e3ecc6381..9044907354e1 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -203,7 +203,7 @@ static int erofs_scan_devices(struct super_block *sb, struct erofs_device_info *dif; int id, err = 0; - sbi->total_blocks = sbi->primarydevice_blocks; + sbi->total_blocks = sbi->dif0.blocks; if (!erofs_sb_has_device_table(sbi)) ondisk_extradevs = 0; else @@ -307,7 +307,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); + sbi->dif0.blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -602,9 +602,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -EINVAL; } - sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, - &sbi->dax_part_off, - NULL, NULL); + sbi->dif0.dax_dev = fs_dax_get_by_bdev(sb->s_bdev, + &sbi->dif0.dax_part_off, NULL, NULL); } err = erofs_read_superblock(sb); @@ -627,7 +626,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) } if (test_opt(&sbi->opt, DAX_ALWAYS)) { - if (!sbi->dax_dev) { + if (!sbi->dif0.dax_dev) { errorfc(fc, "DAX unsupported by block device. Turning off DAX."); clear_opt(&sbi->opt, DAX_ALWAYS); } else if (sbi->blkszbits != PAGE_SHIFT) { @@ -707,14 +706,13 @@ static int erofs_fc_get_tree(struct fs_context *fc) if (!fc->source) return invalf(fc, "No source specified"); - file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); if (IS_ERR(file)) return PTR_ERR(file); - sbi->fdev = file; + sbi->dif0.file = file; - if (S_ISREG(file_inode(sbi->fdev)->i_mode) && - sbi->fdev->f_mapping->a_ops->read_folio) + if (S_ISREG(file_inode(sbi->dif0.file)->i_mode) && + sbi->dif0.file->f_mapping->a_ops->read_folio) return get_tree_nodev(fc, erofs_fc_fill_super); } #endif @@ -771,8 +769,8 @@ static void erofs_sb_free(struct erofs_sb_info *sbi) erofs_free_dev_context(sbi->devs); kfree(sbi->fsid); kfree(sbi->domain_id); - if (sbi->fdev) - fput(sbi->fdev); + if (sbi->dif0.file) + fput(sbi->dif0.file); kfree(sbi); } @@ -817,11 +815,12 @@ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->fdev) + if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || + sbi->dif0.file) kill_anon_super(sb); else kill_block_super(sb); - fs_put_dax(sbi->dax_dev, NULL); + fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); erofs_sb_free(sbi); sb->s_fs_info = NULL; -- cgit v1.2.3 From f8d920a402aec3482931cb5f1539ed438740fc49 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 13 Dec 2024 07:54:01 +0800 Subject: erofs: reference `struct erofs_device_info` for erofs_map_dev Record `m_sb` and `m_dif` to replace `m_fscache`, `m_daxdev`, `m_fp` and `m_dax_part_off` in order to simplify the codebase. Note that `m_bdev` is still left since it can be assigned from `sb->s_bdev` directly. 
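For reference, the "after" shape of the mapping structure (as added in the internal.h hunk below) keeps only the block device plus a pointer to the owning device description:

	struct erofs_map_dev {
		struct super_block *m_sb;
		struct erofs_device_info *m_dif;
		struct block_device *m_bdev;
		erofs_off_t m_pa;
		unsigned int m_deviceid;
	};

Callers reach the per-device details as mdev.m_dif->dax_dev, mdev.m_dif->file and mdev.m_dif->fscache->cookie instead of keeping their own copies.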
Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212235401.2857246-1-hsiangkao@linux.alibaba.com --- fs/erofs/data.c | 26 ++++++++++---------------- fs/erofs/fileio.c | 2 +- fs/erofs/fscache.c | 4 ++-- fs/erofs/internal.h | 6 ++---- 4 files changed, 15 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 622017c65958..0cd6b5c4df98 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -179,19 +179,13 @@ out: } static void erofs_fill_from_devinfo(struct erofs_map_dev *map, - struct erofs_device_info *dif) + struct super_block *sb, struct erofs_device_info *dif) { + map->m_sb = sb; + map->m_dif = dif; map->m_bdev = NULL; - map->m_fp = NULL; - if (dif->file) { - if (S_ISBLK(file_inode(dif->file)->i_mode)) - map->m_bdev = file_bdev(dif->file); - else - map->m_fp = dif->file; - } - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode)) + map->m_bdev = file_bdev(dif->file); } int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) @@ -201,7 +195,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) erofs_off_t startoff, length; int id; - erofs_fill_from_devinfo(map, &EROFS_SB(sb)->dif0); + erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */ if (map->m_deviceid) { down_read(&devs->rwsem); @@ -215,7 +209,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - erofs_fill_from_devinfo(map, dif); + erofs_fill_from_devinfo(map, sb, dif); up_read(&devs->rwsem); } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); @@ -228,7 +222,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - erofs_fill_from_devinfo(map, dif); + erofs_fill_from_devinfo(map, sb, dif); break; } } @@ -298,7 +292,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->offset = map.m_la; if (flags & IOMAP_DAX) - iomap->dax_dev = mdev.m_daxdev; + iomap->dax_dev = mdev.m_dif->dax_dev; else iomap->bdev = mdev.m_bdev; iomap->length = map.m_llen; @@ -327,7 +321,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_MAPPED; iomap->addr = mdev.m_pa; if (flags & IOMAP_DAX) - iomap->addr += mdev.m_dax_part_off; + iomap->addr += mdev.m_dif->dax_part_off; } return 0; } diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 3af96b1e2c2a..a61b8faec651 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -67,7 +67,7 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) GFP_KERNEL | __GFP_NOFAIL); bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); - rq->iocb.ki_filp = mdev->m_fp; + rq->iocb.ki_filp = mdev->m_dif->file; return rq; } diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index ce7e38c82719..ce3d8737df85 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -198,7 +198,7 @@ struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL); bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ); - io->io.private = mdev->m_fscache->cookie; + io->io.private = mdev->m_dif->fscache->cookie; io->io.end_io = erofs_fscache_bio_endio; refcount_set(&io->io.ref, 1); return 
&io->bio; @@ -316,7 +316,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req) if (!io) return -ENOMEM; iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count); - ret = erofs_fscache_read_io_async(mdev.m_fscache->cookie, + ret = erofs_fscache_read_io_async(mdev.m_dif->fscache->cookie, mdev.m_pa + (pos - map.m_la), io); erofs_fscache_req_io_put(io); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 3e8d71d516f4..7cc8e1be04e8 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -353,11 +353,9 @@ enum { }; struct erofs_map_dev { - struct erofs_fscache *m_fscache; + struct super_block *m_sb; + struct erofs_device_info *m_dif; struct block_device *m_bdev; - struct dax_device *m_daxdev; - struct file *m_fp; - u64 m_dax_part_off; erofs_off_t m_pa; unsigned int m_deviceid; -- cgit v1.2.3 From 6422cde1b0d5a31b206b263417c1c2b3c80fe82c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 12 Dec 2024 21:43:36 +0800 Subject: erofs: use buffered I/O for file-backed mounts by default For many use cases (e.g. container images are just fetched from remote), performance will be impacted if underlay page cache is up-to-date but direct i/o flushes dirty pages first. Instead, let's use buffered I/O by default to keep in sync with loop devices and add a (re)mount option to explicitly give a try to use direct I/O if supported by the underlying files. The container startup time is improved as below: [workload] docker.io/library/workpress:latest unpack 1st run non-1st runs EROFS snapshotter buffered I/O file 4.586404265s 0.308s 0.198s EROFS snapshotter direct I/O file 4.581742849s 2.238s 0.222s EROFS snapshotter loop 4.596023152s 0.346s 0.201s Overlayfs snapshotter 5.382851037s 0.206s 0.214s Fixes: fb176750266a ("erofs: add file-backed mount support") Cc: Derek McGowan Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212134336.2059899-1-hsiangkao@linux.alibaba.com --- fs/erofs/fileio.c | 7 +++++-- fs/erofs/internal.h | 1 + fs/erofs/super.c | 23 +++++++++++++++-------- 3 files changed, 21 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index a61b8faec651..33f8539dda4a 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -9,6 +9,7 @@ struct erofs_fileio_rq { struct bio_vec bvecs[BIO_MAX_VECS]; struct bio bio; struct kiocb iocb; + struct super_block *sb; }; struct erofs_fileio { @@ -52,8 +53,9 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT; rq->iocb.ki_ioprio = get_current_ioprio(); rq->iocb.ki_complete = erofs_fileio_ki_complete; - rq->iocb.ki_flags = (rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) ? 
- IOCB_DIRECT : 0; + if (test_opt(&EROFS_SB(rq->sb)->opt, DIRECT_IO) && + rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) + rq->iocb.ki_flags = IOCB_DIRECT; iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, rq->bio.bi_iter.bi_size); ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); @@ -68,6 +70,7 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); rq->iocb.ki_filp = mdev->m_dif->file; + rq->sb = mdev->m_sb; return rq; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 7cc8e1be04e8..686d835eb533 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -176,6 +176,7 @@ struct erofs_sb_info { #define EROFS_MOUNT_POSIX_ACL 0x00000020 #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 +#define EROFS_MOUNT_DIRECT_IO 0x00000100 #define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) #define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 9044907354e1..f5956474bfde 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -364,14 +364,8 @@ static void erofs_default_options(struct erofs_sb_info *sbi) } enum { - Opt_user_xattr, - Opt_acl, - Opt_cache_strategy, - Opt_dax, - Opt_dax_enum, - Opt_device, - Opt_fsid, - Opt_domain_id, + Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_err }; @@ -398,6 +392,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), fsparam_string("domain_id", Opt_domain_id), + fsparam_flag_no("directio", Opt_directio), {} }; @@ -511,6 +506,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); break; #endif + case Opt_directio: +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (result.boolean) + set_opt(&sbi->opt, DIRECT_IO); + else + clear_opt(&sbi->opt, DIRECT_IO); +#else + errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); +#endif + break; default: return -ENOPARAM; } @@ -948,6 +953,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); + if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO)) + seq_puts(seq, ",directio"); #ifdef CONFIG_EROFS_FS_ONDEMAND if (sbi->fsid) seq_printf(seq, ",fsid=%s", sbi->fsid); -- cgit v1.2.3 From a909c179535383dc72a8f8e155ed3d35f298af86 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 04:58:16 +0000 Subject: f2fs: Use a folio in f2fs_all_cluster_page_ready() Remove references to page->index and use folio_test_uptodate() instead of PageUptodate(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 7f26440e8595..9f8d0f4c8564 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -846,7 +846,7 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, int index, int nr_pages, bool uptodate) { - unsigned long pgidx = pages[index]->index; + unsigned long pgidx = page_folio(pages[index])->index; int i = uptodate ? 
0 : 1; /* @@ -860,9 +860,11 @@ bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, return false; for (; i < cc->cluster_size; i++) { - if (pages[index + i]->index != pgidx + i) + struct folio *folio = page_folio(pages[index + i]); + + if (folio->index != pgidx + i) return false; - if (uptodate && !PageUptodate(pages[index + i])) + if (uptodate && !folio_test_uptodate(folio)) return false; } -- cgit v1.2.3 From ff6c82a934f7b5df8702579d921209c5ca336102 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 04:58:17 +0000 Subject: f2fs: Use a folio in f2fs_compress_write_end() This removes an access of page->index. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 9f8d0f4c8564..c91a2e4fe60c 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1197,7 +1197,8 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, .cluster_size = F2FS_I(inode)->i_cluster_size, .rpages = fsdata, }; - bool first_index = (index == cc.rpages[0]->index); + struct folio *folio = page_folio(cc.rpages[0]); + bool first_index = (index == folio->index); if (copied) set_cluster_dirty(&cc); -- cgit v1.2.3 From 1cda5bc0b2fe93cdcb5f05a02f814a282d32742c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 04:58:18 +0000 Subject: f2fs: Use a folio in f2fs_truncate_partial_cluster() Convert the incoming page to a folio and use it throughout. Removes an access to page->index. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index c91a2e4fe60c..494baa1e8bd3 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1242,13 +1242,14 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) int i; for (i = cluster_size - 1; i >= 0; i--) { - loff_t start = rpages[i]->index << PAGE_SHIFT; + struct folio *folio = page_folio(rpages[i]); + loff_t start = folio->index << PAGE_SHIFT; if (from <= start) { - zero_user_segment(rpages[i], 0, PAGE_SIZE); + folio_zero_segment(folio, 0, folio_size(folio)); } else { - zero_user_segment(rpages[i], from - start, - PAGE_SIZE); + folio_zero_segment(folio, from - start, + folio_size(folio)); break; } } -- cgit v1.2.3 From ac866908d7a92b6b2be1127a2d5e85e23da86fa3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 04:58:19 +0000 Subject: f2fs: Use a folio in f2fs_write_compressed_pages() Remove accesses to page->index and an unnecessary reference to page->mapping. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 494baa1e8bd3..0b55b2695c9b 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1282,6 +1282,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode) ? 
1 : 0, }; + struct folio *folio; struct dnode_of_data dn; struct node_info ni; struct compress_io_ctx *cic; @@ -1293,7 +1294,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, /* we should bypass data pages to proceed the kworker jobs */ if (unlikely(f2fs_cp_error(sbi))) { - mapping_set_error(cc->rpages[0]->mapping, -EIO); + mapping_set_error(inode->i_mapping, -EIO); goto out_free; } @@ -1320,7 +1321,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, goto out_put_dnode; } - psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT; + folio = page_folio(cc->rpages[last_index]); + psize = folio_pos(folio) + folio_size(folio); err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) @@ -1343,7 +1345,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_set_compressed_page(cc->cpages[i], inode, - cc->rpages[i + 1]->index, cic); + page_folio(cc->rpages[i + 1])->index, cic); fio.compressed_page = cc->cpages[i]; fio.old_blkaddr = data_blkaddr(dn.inode, dn.node_page, -- cgit v1.2.3 From 87e2a15bc00840762b082399493597e8d3c1e42c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 04:58:20 +0000 Subject: f2fs: Convert submit tracepoints to take a folio Remove accesses to page->index and page->mapping as well as unnecessary calls to page_file_mapping(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++--- include/trace/events/f2fs.h | 30 +++++++++++++++--------------- 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a2478c2afb3a..7cb2272c723e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -697,7 +697,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) META_GENERIC : DATA_GENERIC_ENHANCE))) return -EFSCORRUPTED; - trace_f2fs_submit_page_bio(page, fio); + trace_f2fs_submit_folio_bio(page_folio(page), fio); /* Allocate a new bio */ bio = __bio_alloc(fio, 1); @@ -894,7 +894,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) __is_meta_io(fio) ? 
META_GENERIC : DATA_GENERIC)) return -EFSCORRUPTED; - trace_f2fs_submit_page_bio(page, fio); + trace_f2fs_submit_folio_bio(page_folio(page), fio); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) @@ -1018,7 +1018,7 @@ alloc_new: io->last_block_in_bio = fio->new_blkaddr; - trace_f2fs_submit_page_write(fio->page, fio); + trace_f2fs_submit_folio_write(page_folio(fio->page), fio); #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && is_end_zone_blkaddr(sbi, fio->new_blkaddr)) { diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 2851c823095b..3253f45508e8 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1119,11 +1119,11 @@ TRACE_EVENT(f2fs_reserve_new_blocks, (unsigned long long)__entry->count) ); -DECLARE_EVENT_CLASS(f2fs__submit_page_bio, +DECLARE_EVENT_CLASS(f2fs__submit_folio_bio, - TP_PROTO(struct page *page, struct f2fs_io_info *fio), + TP_PROTO(struct folio *folio, struct f2fs_io_info *fio), - TP_ARGS(page, fio), + TP_ARGS(folio, fio), TP_STRUCT__entry( __field(dev_t, dev) @@ -1138,9 +1138,9 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, ), TP_fast_assign( - __entry->dev = page_file_mapping(page)->host->i_sb->s_dev; - __entry->ino = page_file_mapping(page)->host->i_ino; - __entry->index = page->index; + __entry->dev = folio->mapping->host->i_sb->s_dev; + __entry->ino = folio->mapping->host->i_ino; + __entry->index = folio->index; __entry->old_blkaddr = fio->old_blkaddr; __entry->new_blkaddr = fio->new_blkaddr; __entry->op = fio->op; @@ -1149,7 +1149,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->type = fio->type; ), - TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " + TP_printk("dev = (%d,%d), ino = %lu, folio_index = 0x%lx, " "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s_%s", show_dev_ino(__entry), (unsigned long)__entry->index, @@ -1160,22 +1160,22 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, show_block_type(__entry->type)) ); -DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_bio, +DEFINE_EVENT_CONDITION(f2fs__submit_folio_bio, f2fs_submit_folio_bio, - TP_PROTO(struct page *page, struct f2fs_io_info *fio), + TP_PROTO(struct folio *folio, struct f2fs_io_info *fio), - TP_ARGS(page, fio), + TP_ARGS(folio, fio), - TP_CONDITION(page->mapping) + TP_CONDITION(folio->mapping) ); -DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_write, +DEFINE_EVENT_CONDITION(f2fs__submit_folio_bio, f2fs_submit_folio_write, - TP_PROTO(struct page *page, struct f2fs_io_info *fio), + TP_PROTO(struct folio *folio, struct f2fs_io_info *fio), - TP_ARGS(page, fio), + TP_ARGS(folio, fio), - TP_CONDITION(page->mapping) + TP_CONDITION(folio->mapping) ); DECLARE_EVENT_CLASS(f2fs__bio, -- cgit v1.2.3 From 1cf746007005593aa51395302ca0d31814f4ce42 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 04:58:21 +0000 Subject: f2fs: Add F2FS_F_SB() This is the folio equivalent of F2FS_P_SB(). Removes a call to page_file_mapping() as we know folios seen by f2fs are never part of the swap cache. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6f2cbf4c5740..f523dd302bf6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2003,9 +2003,14 @@ static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping) return F2FS_I_SB(mapping->host); } +static inline struct f2fs_sb_info *F2FS_F_SB(struct folio *folio) +{ + return F2FS_M_SB(folio->mapping); +} + static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page) { - return F2FS_M_SB(page_file_mapping(page)); + return F2FS_F_SB(page_folio(page)); } static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) -- cgit v1.2.3 From e0821645dd2d79180418dd9389e3e9e7e10e7281 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 04:58:22 +0000 Subject: f2fs: Convert f2fs_finish_read_bio() to use folios Use bio_for_each_folio_all() to iterate over each folio in the bio. This lets us use folio_end_read() which saves an atomic operation and memory barrier compared to marking the folio uptodate and unlocking it as two separate operations. This also removes a few hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7cb2272c723e..aa08ab387e58 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -136,27 +136,22 @@ struct bio_post_read_ctx { */ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; struct bio_post_read_ctx *ctx = bio->bi_private; - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; - if (f2fs_is_compressed_page(page)) { + if (f2fs_is_compressed_page(&folio->page)) { if (ctx && !ctx->decompression_attempted) - f2fs_end_read_compressed_page(page, true, 0, + f2fs_end_read_compressed_page(&folio->page, true, 0, in_task); - f2fs_put_page_dic(page, in_task); + f2fs_put_page_dic(&folio->page, in_task); continue; } - if (bio->bi_status) - ClearPageUptodate(page); - else - SetPageUptodate(page); - dec_page_count(F2FS_P_SB(page), __read_io_type(page)); - unlock_page(page); + dec_page_count(F2FS_F_SB(folio), __read_io_type(&folio->page)); + folio_end_read(folio, bio->bi_status == 0); } if (ctx) -- cgit v1.2.3 From 0765b3f989a7eb757252951b21a244bfa3224561 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 04:58:23 +0000 Subject: f2fs: Use a folio more in f2fs_submit_page_bio() Cache the result of page_folio(fio->page) in a local variable so we don't have to keep calling it. Saves a couple of calls to compound_head() and removes an access to page->mapping. 
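The conversion pattern itself recurs throughout this series, so it is worth spelling out once. A hypothetical call site (illustrative only, not taken from the diff below) hoists the page-to-folio conversion so compound_head() runs once instead of on every access:

        #include <linux/pagemap.h>

        /* Hypothetical helper, not from the patch: convert once, reuse the folio. */
        static struct inode *example_host_of(struct page *page)
        {
                struct folio *folio = page_folio(page); /* one compound_head() call */

                return folio->mapping->host;            /* was page->mapping->host */
        }
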
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa08ab387e58..cae5fa895b97 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -684,6 +684,7 @@ void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; + struct folio *fio_folio = page_folio(fio->page); struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; @@ -697,8 +698,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) /* Allocate a new bio */ bio = __bio_alloc(fio, 1); - f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, - page_folio(fio->page)->index, fio, GFP_NOIO); + f2fs_set_bio_crypt_ctx(bio, fio_folio->mapping->host, + fio_folio->index, fio, GFP_NOIO); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); @@ -706,8 +707,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) } if (fio->io_wbc && !is_read_io(fio->op)) - wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), - PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, fio_folio, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? __read_io_type(page) : WB_DATA_TYPE(fio->page, false)); -- cgit v1.2.3 From f58d8645824b4885caa9e24989f8e601b5e7ed50 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 04:58:24 +0000 Subject: f2fs: Use a data folio in f2fs_submit_page_bio() Remove a call to compound_head(). We can call bio_add_folio_nofail() here because we just allocated the bio, so we know it can't fail and thus the error path can never be taken. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cae5fa895b97..30d727808c92 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -685,32 +685,28 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; struct folio *fio_folio = page_folio(fio->page); - struct page *page = fio->encrypted_page ? - fio->encrypted_page : fio->page; + struct folio *data_folio = fio->encrypted_page ? + page_folio(fio->encrypted_page) : fio_folio; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, fio->is_por ? META_POR : (__is_meta_io(fio) ? META_GENERIC : DATA_GENERIC_ENHANCE))) return -EFSCORRUPTED; - trace_f2fs_submit_folio_bio(page_folio(page), fio); + trace_f2fs_submit_folio_bio(data_folio, fio); /* Allocate a new bio */ bio = __bio_alloc(fio, 1); f2fs_set_bio_crypt_ctx(bio, fio_folio->mapping->host, fio_folio->index, fio, GFP_NOIO); - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_put(bio); - return -EFAULT; - } + bio_add_folio_nofail(bio, data_folio, folio_size(data_folio), 0); if (fio->io_wbc && !is_read_io(fio->op)) wbc_account_cgroup_owner(fio->io_wbc, fio_folio, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? 
- __read_io_type(page) : WB_DATA_TYPE(fio->page, false)); + __read_io_type(&data_folio->page) : WB_DATA_TYPE(fio->page, false)); if (is_read_io(bio_op(bio))) f2fs_submit_read_bio(fio->sbi, bio, fio->type); -- cgit v1.2.3 From 19bbd306ddfd50a2f6cf0c3ccaaa079f22ddf4c5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 04:58:25 +0000 Subject: f2fs: Convert __read_io_type() to take a folio Remove the last call to page_file_mapping() as both callers can now pass in a folio. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 30d727808c92..e68a4dd9bc17 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -70,9 +70,9 @@ bool f2fs_is_cp_guaranteed(struct page *page) return false; } -static enum count_type __read_io_type(struct page *page) +static enum count_type __read_io_type(struct folio *folio) { - struct address_space *mapping = page_file_mapping(page); + struct address_space *mapping = folio->mapping; if (mapping) { struct inode *inode = mapping->host; @@ -150,7 +150,7 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) continue; } - dec_page_count(F2FS_F_SB(folio), __read_io_type(&folio->page)); + dec_page_count(F2FS_F_SB(folio), __read_io_type(folio)); folio_end_read(folio, bio->bi_status == 0); } @@ -706,7 +706,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) wbc_account_cgroup_owner(fio->io_wbc, fio_folio, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? - __read_io_type(&data_folio->page) : WB_DATA_TYPE(fio->page, false)); + __read_io_type(data_folio) : WB_DATA_TYPE(fio->page, false)); if (is_read_io(bio_op(bio))) f2fs_submit_read_bio(fio->sbi, bio, fio->type); -- cgit v1.2.3 From c910a64bc4e21782959221b6ea2d6c4cce0506c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 04:58:26 +0000 Subject: f2fs: Remove calls to folio_file_mapping() All folios that f2fs sees belong to f2fs and not to the swapcache so it can dereference folio->mapping directly like all other filesystems do. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/inline.c | 2 +- include/trace/events/f2fs.h | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e68a4dd9bc17..0657f731d4b7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2455,7 +2455,7 @@ next_page: static int f2fs_read_data_folio(struct file *file, struct folio *folio) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; int ret = -EAGAIN; trace_f2fs_readpage(folio, DATA); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 005babf1bed1..cbd2a0d34804 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -81,7 +81,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) void f2fs_do_read_inline_data(struct folio *folio, struct page *ipage) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; if (folio_test_uptodate(folio)) return; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 3253f45508e8..eb3b2f1326b1 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1322,12 +1322,11 @@ DECLARE_EVENT_CLASS(f2fs__folio, ), TP_fast_assign( - __entry->dev = folio_file_mapping(folio)->host->i_sb->s_dev; - __entry->ino = folio_file_mapping(folio)->host->i_ino; + __entry->dev = folio->mapping->host->i_sb->s_dev; + __entry->ino = folio->mapping->host->i_ino; __entry->type = type; - __entry->dir = - S_ISDIR(folio_file_mapping(folio)->host->i_mode); - __entry->index = folio_index(folio); + __entry->dir = S_ISDIR(folio->mapping->host->i_mode); + __entry->index = folio->index; __entry->dirty = folio_test_dirty(folio); __entry->uptodate = folio_test_uptodate(folio); ), -- cgit v1.2.3 From 5f6594542779e69c6c6e2b57341c352174135eed Mon Sep 17 00:00:00 2001 From: zangyangyang1 Date: Fri, 22 Nov 2024 18:58:22 +0800 Subject: f2fs: cache more dentry pages While traversing dir entries in dentry page, it's better to refresh current accessed page in lru list by using FGP_ACCESSED flag, otherwise, such page may has less chance to survive during memory reclaim, result in causing additional IO when revisiting dentry page. Signed-off-by: zangyangyang1 Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0657f731d4b7..2ec0cfb41260 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1280,7 +1280,7 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; - page = find_get_page(mapping, index); + page = find_get_page_flags(mapping, index, FGP_ACCESSED); if (page && PageUptodate(page)) return page; f2fs_put_page(page, 0); -- cgit v1.2.3 From e9a844f6e487ee0f64d995d1d8ffde2e270e2479 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Wed, 4 Dec 2024 11:31:13 +0800 Subject: f2fs: The GC triggered by ioctl also needs to mark the segno as victim In SSR mode, the segment selected for allocation might be the same as the target segment of the GC triggered by ioctl, resulting in the GC moving the CURSEG_I(sbi, type)->segno. 
Thread A Thread B or Thread A - f2fs_ioc_gc_range - __f2fs_ioc_gc_range(.victim_segno=segno#N) - f2fs_gc - __get_victim - f2fs_get_victim : segno#N is valid, return segno#N as source segment of GC - f2fs_allocate_data_block - need_new_seg - get_ssr_segment - f2fs_get_victim : get segno #N as destination segment - change_curseg Fixes: e066b83c9b40 ("f2fs: add ioctl to flush data from faster device to cold area") Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3e1b6d2ff3a7..8029369bb71d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -806,11 +806,14 @@ retry: goto out; } - if (sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + if (sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) { ret = -EBUSY; - else - p.min_segno = *result; - goto out; + goto out; + } + if (gc_type == FG_GC) + clear_bit(GET_SEC_FROM_SEG(sbi, *result), dirty_i->victim_secmap); + p.min_segno = *result; + goto got_result; } ret = -ENODATA; -- cgit v1.2.3 From 76f01376df398304972bf337ba430a62062add31 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 12 Dec 2024 20:57:48 +0300 Subject: f2fs: ensure that node info flags are always initialized Syzbot has reported the following KMSAN splat: BUG: KMSAN: uninit-value in f2fs_new_node_page+0x1494/0x1630 f2fs_new_node_page+0x1494/0x1630 f2fs_new_inode_page+0xb9/0x100 f2fs_init_inode_metadata+0x176/0x1e90 f2fs_add_inline_entry+0x723/0xc90 f2fs_do_add_link+0x48f/0xa70 f2fs_symlink+0x6af/0xfc0 vfs_symlink+0x1f1/0x470 do_symlinkat+0x471/0xbc0 __x64_sys_symlink+0xcf/0x140 x64_sys_call+0x2fcc/0x3d90 do_syscall_64+0xd9/0x1b0 entry_SYSCALL_64_after_hwframe+0x77/0x7f Local variable new_ni created at: f2fs_new_node_page+0x9d/0x1630 f2fs_new_inode_page+0xb9/0x100 So adjust 'f2fs_get_node_info()' to ensure that 'flag' field of 'struct node_info' is always initialized. Reported-by: syzbot+5141f6db57a2f7614352@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=5141f6db57a2f7614352 Fixes: e05df3b115e7 ("f2fs: add node operations") Suggested-by: Chao Yu Signed-off-by: Dmitry Antipov Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0b900a7a48e5..c04ee1a7ce57 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -558,6 +558,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, block_t blkaddr; int i; + ni->flag = 0; ni->nid = nid; retry: /* Check nat cache */ -- cgit v1.2.3 From d6fd6f8280f0257ba93f16900a0d3d3912f32c79 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 5 Dec 2024 16:49:51 +0100 Subject: ceph: fix memory leaks in __ceph_sync_read() In two `break` statements, the call to ceph_release_page_vector() was missing, leaking the allocation from ceph_alloc_page_vector(). Instead of adding the missing ceph_release_page_vector() calls, the Ceph maintainers preferred to transfer page ownership to the `ceph_osd_request` by passing `own_pages=true` to osd_req_op_extent_osd_data_pages(). This requires postponing the ceph_osdc_put_request() call until after the block that accesses the `pages`. 
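Condensed, the resulting lifetime rule in the read path looks like this (sketch only, reusing the variable names of __ceph_sync_read(); the exact call sites are in the diff below):

        /* pages are owned by the request from this point on */
        pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
        osd_req_op_extent_osd_data_pages(req, 0, pages, read_len,
                                         offset_in_page(read_off),
                                         false, true);  /* own_pages = true */

        /* ... submit the request, wait, copy data out of pages ... */

        ceph_osdc_put_request(req);     /* final put also releases the page vector */

With own_pages set, an early put of the request would free pages that are still about to be read, which is why the put moves past the last access rather than the other way around.
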
Cc: stable@vger.kernel.org Fixes: 03bc06c7b0bd ("ceph: add new mount option to enable sparse reads") Fixes: f0fe1e54cfcf ("ceph: plumb in decryption during reads") Signed-off-by: Max Kellermann Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4b8d59ebda00..ce342a5d4b8b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1127,7 +1127,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, osd_req_op_extent_osd_data_pages(req, 0, pages, read_len, offset_in_page(read_off), - false, false); + false, true); op = &req->r_ops[0]; if (sparse) { @@ -1186,8 +1186,6 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ret = min_t(ssize_t, fret, len); } - ceph_osdc_put_request(req); - /* Short read but not EOF? Zero out the remainder. */ if (ret >= 0 && ret < len && (off + ret < i_size)) { int zlen = min(len - ret, i_size - off - ret); @@ -1221,7 +1219,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, break; } } - ceph_release_page_vector(pages, num_pages); + + ceph_osdc_put_request(req); if (ret < 0) { if (ret == -EBLOCKLISTED) -- cgit v1.2.3 From 550f7ca98ee028a606aa75705a7e77b1bd11720f Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 18 Nov 2024 23:28:28 +0100 Subject: ceph: give up on paths longer than PATH_MAX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the full path to be built by ceph_mdsc_build_path() happens to be longer than PATH_MAX, then this function will enter an endless (retry) loop, effectively blocking the whole task. Most of the machine becomes unusable, making this a very simple and effective DoS vulnerability. I cannot imagine why this retry was ever implemented, but it seems rather useless and harmful to me. Let's remove it and fail with ENAMETOOLONG instead. Cc: stable@vger.kernel.org Reported-by: Dario Weißer Signed-off-by: Max Kellermann Reviewed-by: Alex Markuze Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 219a2cc2bf3c..785fe489ef4b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2800,12 +2800,11 @@ retry: if (pos < 0) { /* - * A rename didn't occur, but somehow we didn't end up where - * we thought we would. Throw a warning and try again. + * The path is longer than PATH_MAX and this function + * cannot ever succeed. Creating paths that long is + * possible with Ceph, but Linux cannot use them. */ - pr_warn_client(cl, "did not end path lookup where expected (pos = %d)\n", - pos); - goto retry; + return ERR_PTR(-ENAMETOOLONG); } *pbase = base; -- cgit v1.2.3 From 12eb22a5a609421b380c3c6ca887474fb2089b2c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 20 Nov 2024 16:43:51 +0100 Subject: ceph: validate snapdirname option length when mounting It becomes a path component, so it shouldn't exceed NAME_MAX characters. This was hardened in commit c152737be22b ("ceph: Use strscpy() instead of strcpy() in __get_snap_name()"), but no actual check was put in place. 
Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- fs/ceph/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index de03cd6eb86e..4344e1f11806 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -431,6 +431,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, switch (token) { case Opt_snapdirname: + if (strlen(param->string) > NAME_MAX) + return invalfc(fc, "snapdirname too long"); kfree(fsopt->snapdir_name); fsopt->snapdir_name = param->string; param->string = NULL; -- cgit v1.2.3 From 9abee475803fab6ad59d4f4fc59c6a75374a7d9d Mon Sep 17 00:00:00 2001 From: Alex Markuze Date: Wed, 27 Nov 2024 15:34:10 +0200 Subject: ceph: improve error handling and short/overflow-read logic in __ceph_sync_read() This patch refines the read logic in __ceph_sync_read() to ensure more predictable and efficient behavior in various edge cases. - Return early if the requested read length is zero or if the file size (`i_size`) is zero. - Initialize the index variable (`idx`) where needed and reorder some code to ensure it is always set before use. - Improve error handling by checking for negative return values earlier. - Remove redundant encrypted file checks after failures. Only attempt filesystem-level decryption if the read succeeded. - Simplify leftover calculations to correctly handle cases where the read extends beyond the end of the file or stops short. This can be hit by continuously reading a file while, on another client, we keep truncating and writing new data into it. - This resolves multiple issues caused by integer and consequent buffer overflow (`pages` array being accessed beyond `num_pages`): - https://tracker.ceph.com/issues/67524 - https://tracker.ceph.com/issues/68980 - https://tracker.ceph.com/issues/68981 Cc: stable@vger.kernel.org Fixes: 1065da21e5df ("ceph: stop copying to iter at EOF on sync reads") Reported-by: Luis Henriques (SUSE) Signed-off-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ce342a5d4b8b..8e0400d461a2 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1066,7 +1066,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, if (ceph_inode_is_shutdown(inode)) return -EIO; - if (!len) + if (!len || !i_size) return 0; /* * flush any page cache pages in this range. this @@ -1086,7 +1086,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, int num_pages; size_t page_off; bool more; - int idx; + int idx = 0; size_t left; struct ceph_osd_req_op *op; u64 read_off = off; @@ -1160,7 +1160,14 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, else if (ret == -ENOENT) ret = 0; - if (ret > 0 && IS_ENCRYPTED(inode)) { + if (ret < 0) { + ceph_osdc_put_request(req); + if (ret == -EBLOCKLISTED) + fsc->blocklisted = true; + break; + } + + if (IS_ENCRYPTED(inode)) { int fret; fret = ceph_fscrypt_decrypt_extents(inode, pages, @@ -1187,7 +1194,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, } /* Short read but not EOF? Zero out the remainder. 
*/ - if (ret >= 0 && ret < len && (off + ret < i_size)) { + if (ret < len && (off + ret < i_size)) { int zlen = min(len - ret, i_size - off - ret); int zoff = page_off + ret; @@ -1197,13 +1204,11 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ret += zlen; } - idx = 0; - if (ret <= 0) - left = 0; - else if (off + ret > i_size) - left = i_size - off; + if (off + ret > i_size) + left = (i_size > off) ? i_size - off : 0; else left = ret; + while (left > 0) { size_t plen, copied; @@ -1222,12 +1227,6 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ceph_osdc_put_request(req); - if (ret < 0) { - if (ret == -EBLOCKLISTED) - fsc->blocklisted = true; - break; - } - if (off >= i_size || !more) break; } -- cgit v1.2.3 From 66e0c4f91461d17d48071695271c824620bed4ef Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 6 Dec 2024 17:32:59 +0100 Subject: ceph: fix memory leak in ceph_direct_read_write() The bvecs array which is allocated in iter_get_bvecs_alloc() is leaked and pages remain pinned if ceph_alloc_sparse_ext_map() fails. There is no need to delay the allocation of sparse_ext map until after the bvecs array is set up, so fix this by moving sparse_ext allocation a bit earlier. Also, make a similar adjustment in __ceph_sync_read() for consistency (a leak of the same kind in __ceph_sync_read() has been addressed differently). Cc: stable@vger.kernel.org Fixes: 03bc06c7b0bd ("ceph: add new mount option to enable sparse reads") Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- fs/ceph/file.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8e0400d461a2..67468d88f139 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1116,6 +1116,16 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, len = read_off + read_len - off; more = len < iov_iter_count(to); + op = &req->r_ops[0]; + if (sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + num_pages = calc_pages_for(read_off, read_len); page_off = offset_in_page(off); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); @@ -1129,16 +1139,6 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, offset_in_page(read_off), false, true); - op = &req->r_ops[0]; - if (sparse) { - extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); - ret = ceph_alloc_sparse_ext_map(op, extent_cnt); - if (ret) { - ceph_osdc_put_request(req); - break; - } - } - ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); @@ -1551,6 +1551,16 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, break; } + op = &req->r_ops[0]; + if (sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, size); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); if (len < 0) { ceph_osdc_put_request(req); @@ -1560,6 +1570,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (len != size) osd_req_op_extent_update(req, 0, len); + osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); + /* * To simplify error handling, allow AIO when IO within i_size * or IO can be satisfied by single OSD request. 
@@ -1591,17 +1603,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req->r_mtime = mtime; } - osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); - op = &req->r_ops[0]; - if (sparse) { - extent_cnt = __ceph_sparse_read_ext_count(inode, size); - ret = ceph_alloc_sparse_ext_map(op, extent_cnt); - if (ret) { - ceph_osdc_put_request(req); - break; - } - } - if (aio_req) { aio_req->total_len += len; aio_req->num_reqs++; -- cgit v1.2.3 From 18d44c5d062b97b97bb0162d9742440518958dc1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 7 Dec 2024 17:33:25 +0100 Subject: ceph: allocate sparse_ext map only for sparse reads If mounted with sparseread option, ceph_direct_read_write() ends up making an unnecessarily allocation for O_DIRECT writes. Fixes: 03bc06c7b0bd ("ceph: add new mount option to enable sparse reads") Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- fs/ceph/file.c | 2 +- net/ceph/osd_client.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 67468d88f139..851d70200c6b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1552,7 +1552,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, } op = &req->r_ops[0]; - if (sparse) { + if (!write && sparse) { extent_cnt = __ceph_sparse_read_ext_count(inode, size); ret = ceph_alloc_sparse_ext_map(op, extent_cnt); if (ret) { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9b1168eb77ab..b24afec24138 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1173,6 +1173,8 @@ EXPORT_SYMBOL(ceph_osdc_new_request); int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) { + WARN_ON(op->op != CEPH_OSD_OP_SPARSE_READ); + op->extent.sparse_ext_cnt = cnt; op->extent.sparse_ext = kmalloc_array(cnt, sizeof(*op->extent.sparse_ext), -- cgit v1.2.3 From 3a3f61ce5e0b4bcf730acc09c1af91012d241f85 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 29 Nov 2024 20:06:55 -0800 Subject: exec: Make sure task->comm is always NUL-terminated Using strscpy() meant that the final character in task->comm may be non-NUL for a moment before the "string too long" truncation happens. Instead of adding a new use of the ambiguous strncpy(), we'd want to use memtostr_pad() which enforces being able to check at compile time that sizes are sensible, but this requires being able to see string buffer lengths. Instead of trying to inline __set_task_comm() (which needs to call trace and perf functions), just open-code it. But to make sure we're always safe, add compile-time checking like we already do for get_task_comm(). Suggested-by: Linus Torvalds Suggested-by: "Eric W. Biederman" Signed-off-by: Kees Cook --- fs/exec.c | 12 ++++++------ include/linux/sched.h | 9 ++++----- io_uring/io-wq.c | 2 +- io_uring/sqpoll.c | 2 +- kernel/kthread.c | 3 ++- 5 files changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index e0435b31a811..5f16500ac325 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1200,16 +1200,16 @@ char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) EXPORT_SYMBOL_GPL(__get_task_comm); /* - * These functions flushes out all traces of the currently running executable - * so that a new one can be started + * This is unlocked -- the string will always be NUL-terminated, but + * may show overlapping contents if racing concurrent reads. 
*/ - void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { - task_lock(tsk); + size_t len = min(strlen(buf), sizeof(tsk->comm) - 1); + trace_task_rename(tsk, buf); - strscpy_pad(tsk->comm, buf, sizeof(tsk->comm)); - task_unlock(tsk); + memcpy(tsk->comm, buf, len); + memset(&tsk->comm[len], 0, sizeof(tsk->comm) - len); perf_event_comm(tsk, exec); } diff --git a/include/linux/sched.h b/include/linux/sched.h index e6ee4258169a..ac9f429ddc17 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1932,11 +1932,10 @@ static inline void kick_process(struct task_struct *tsk) { } #endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); - -static inline void set_task_comm(struct task_struct *tsk, const char *from) -{ - __set_task_comm(tsk, from, false); -} +#define set_task_comm(tsk, from) ({ \ + BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ + __set_task_comm(tsk, from, false); \ +}) extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); #define get_task_comm(buf, tsk) ({ \ diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index a38f36b68060..5d0928f37471 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -634,7 +634,7 @@ static int io_wq_worker(void *data) struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wq *wq = worker->wq; bool exit_mask = false, last_timeout = false; - char buf[TASK_COMM_LEN]; + char buf[TASK_COMM_LEN] = {}; set_mask_bits(&worker->flags, 0, BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING)); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index a26593979887..90011f06c7fb 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -271,7 +271,7 @@ static int io_sq_thread(void *data) struct io_ring_ctx *ctx; struct rusage start; unsigned long timeout = 0; - char buf[TASK_COMM_LEN]; + char buf[TASK_COMM_LEN] = {}; DEFINE_WAIT(wait); /* offload context creation failed, just exit */ diff --git a/kernel/kthread.c b/kernel/kthread.c index db4ceb0f503c..162d55811744 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -736,10 +736,11 @@ EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { + static const char comm[TASK_COMM_LEN] = "kthreadd"; struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ - set_task_comm(tsk, "kthreadd"); + set_task_comm(tsk, comm); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); -- cgit v1.2.3 From 543841d1806029889c2f69f040e88b247aba8e22 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 21 Nov 2024 07:07:05 -0800 Subject: exec: fix up /proc/pid/comm in the execveat(AT_EMPTY_PATH) case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zbigniew mentioned at Linux Plumber's that systemd is interested in switching to execveat() for service execution, but can't, because the contents of /proc/pid/comm are the file descriptor which was used, instead of the path to the binary[1]. This makes the output of tools like top and ps useless, especially in a world where most fds are opened CLOEXEC so the number is truly meaningless. When the filename passed in is empty (e.g. with AT_EMPTY_PATH), use the dentry's filename for "comm" instead of using the useless numeral from the synthetic fdpath construction. This way the actual exec machinery is unchanged, but cosmetically the comm looks reasonable to admins investigating things. 
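For context, the affected pattern from userspace looks roughly like this (illustrative sketch, error handling omitted; execveat() is assumed to be available as a glibc wrapper, otherwise syscall(SYS_execveat, ...) behaves the same):

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <unistd.h>

        extern char **environ;

        int main(void)
        {
                int fd = open("/usr/bin/sleep", O_PATH | O_CLOEXEC);
                char *argv[] = { "sleep", "60", NULL };

                /* Empty path + AT_EMPTY_PATH: the kernel synthesizes "/dev/fd/<n>",
                 * so comm used to become the meaningless fd number; with this patch
                 * it is taken from the dentry name instead. */
                return execveat(fd, "", argv, environ, AT_EMPTY_PATH);
        }
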
Instead of adding TASK_COMM_LEN more bytes to bprm, use one of the unused flag bits to indicate that we need to set "comm" from the dentry. Suggested-by: Zbigniew Jędrzejewski-Szmek Suggested-by: Tycho Andersen Suggested-by: Al Viro Suggested-by: Linus Torvalds Link: https://github.com/uapi-group/kernel-features#set-comm-field-before-exec [1] Reviewed-by: Aleksa Sarai Tested-by: Zbigniew Jędrzejewski-Szmek Signed-off-by: Kees Cook --- fs/exec.c | 29 ++++++++++++++++++++++++++--- include/linux/binfmts.h | 4 +++- 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 5f16500ac325..1843366be6ff 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1347,7 +1347,28 @@ int begin_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); - __set_task_comm(me, kbasename(bprm->filename), true); + + /* + * If the original filename was empty, alloc_bprm() made up a path + * that will probably not be useful to admins running ps or similar. + * Let's fix it up to be something reasonable. + */ + if (bprm->comm_from_dentry) { + /* + * Hold RCU lock to keep the name from being freed behind our back. + * Use acquire semantics to make sure the terminating NUL from + * __d_alloc() is seen. + * + * Note, we're deliberately sloppy here. We don't need to care about + * detecting a concurrent rename and just want a terminated name. + */ + rcu_read_lock(); + __set_task_comm(me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name), + true); + rcu_read_unlock(); + } else { + __set_task_comm(me, kbasename(bprm->filename), true); + } /* An exec changes our domain. We are no longer part of the thread group */ @@ -1521,11 +1542,13 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { - if (filename->name[0] == '\0') + if (filename->name[0] == '\0') { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); - else + bprm->comm_from_dentry = 1; + } else { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); + } if (!bprm->fdpath) goto out_free; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e6c00e860951..3305c849abd6 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -42,7 +42,9 @@ struct linux_binprm { * Set when errors can no longer be returned to the * original userspace. */ - point_of_no_return:1; + point_of_no_return:1, + /* Set when "comm" must come from the dentry. */ + comm_from_dentry:1; struct file *executable; /* Executable to pass to the interpreter */ struct file *interpreter; struct file *file; -- cgit v1.2.3 From c220e216d6bcd52cc7333e38edf43dc66ba0dd13 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:38:04 +0100 Subject: exportfs: add permission method This allows filesystems such as pidfs to provide their custom permission checks. 
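A filesystem opts in by filling the new hook in its export_operations. A minimal, purely hypothetical example (the real user, pidfs, follows later in this series):

        #include <linux/errno.h>
        #include <linux/exportfs.h>
        #include <linux/fcntl.h>

        /* hypothetical_fs: handles may be decoded without CAP_DAC_READ_SEARCH */
        static int hypothetical_export_permission(struct handle_to_path_ctx *ctx,
                                                  unsigned int oflags)
        {
                if (oflags & ~(O_RDONLY | O_WRONLY | O_RDWR | O_CLOEXEC))
                        return -EINVAL; /* reject open flags that make no sense here */
                return 0;               /* no extra privilege required */
        }

        static const struct export_operations hypothetical_export_ops = {
                .permission     = hypothetical_export_permission,
        };

When the hook is absent, handle_to_path() keeps falling back to the existing may_decode_fh() checks, so current filesystems are unaffected.
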
Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-5-87d803a42495@kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fhandle.c | 35 ++++++++++++++--------------------- include/linux/exportfs.h | 17 ++++++++++++++++- 2 files changed, 30 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/fhandle.c b/fs/fhandle.c index e17029b1dc44..3e092ae6d142 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -187,17 +187,6 @@ static int get_path_from_fd(int fd, struct path *root) return 0; } -enum handle_to_path_flags { - HANDLE_CHECK_PERMS = (1 << 0), - HANDLE_CHECK_SUBTREE = (1 << 1), -}; - -struct handle_to_path_ctx { - struct path root; - enum handle_to_path_flags flags; - unsigned int fh_flags; -}; - static int vfs_dentry_acceptable(void *context, struct dentry *dentry) { struct handle_to_path_ctx *ctx = context; @@ -279,13 +268,13 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path, return 0; } -static inline bool may_decode_fh(struct handle_to_path_ctx *ctx, - unsigned int o_flags) +static inline int may_decode_fh(struct handle_to_path_ctx *ctx, + unsigned int o_flags) { struct path *root = &ctx->root; if (capable(CAP_DAC_READ_SEARCH)) - return true; + return 0; /* * Allow relaxed permissions of file handles if the caller has @@ -309,7 +298,7 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx, * There's only one dentry for each directory inode (VFS rule)... */ if (!(o_flags & O_DIRECTORY)) - return false; + return -EPERM; if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) ctx->flags = HANDLE_CHECK_PERMS; @@ -319,14 +308,14 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx, !has_locked_children(real_mount(root->mnt), root->dentry)) ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE; else - return false; + return -EPERM; /* Are we able to override DAC permissions? 
*/ if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH)) - return false; + return -EPERM; ctx->fh_flags = EXPORT_FH_DIR_ONLY; - return true; + return 0; } static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, @@ -336,15 +325,19 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, struct file_handle f_handle; struct file_handle *handle = NULL; struct handle_to_path_ctx ctx = {}; + const struct export_operations *eops; retval = get_path_from_fd(mountdirfd, &ctx.root); if (retval) goto out_err; - if (!may_decode_fh(&ctx, o_flags)) { - retval = -EPERM; + eops = ctx.root.mnt->mnt_sb->s_export_op; + if (eops && eops->permission) + retval = eops->permission(&ctx, o_flags); + else + retval = may_decode_fh(&ctx, o_flags); + if (retval) goto out_path; - } if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { retval = -EFAULT; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index c69b79b64466..a087606ace19 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -3,6 +3,7 @@ #define LINUX_EXPORTFS_H 1 #include +#include struct dentry; struct iattr; @@ -10,7 +11,6 @@ struct inode; struct iomap; struct super_block; struct vfsmount; -struct path; /* limit the handle size to NFSv4 handle size now */ #define MAX_HANDLE_SZ 128 @@ -157,6 +157,17 @@ struct fid { }; }; +enum handle_to_path_flags { + HANDLE_CHECK_PERMS = (1 << 0), + HANDLE_CHECK_SUBTREE = (1 << 1), +}; + +struct handle_to_path_ctx { + struct path root; + enum handle_to_path_flags flags; + unsigned int fh_flags; +}; + #define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */ #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ #define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ @@ -226,6 +237,9 @@ struct fid { * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * + * permission: + * Allow filesystems to specify a custom permission function. + * * open: * Allow filesystems to specify a custom open function. * @@ -255,6 +269,7 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); + int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags); struct file * (*open)(struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ -- cgit v1.2.3 From b3caba8f7a34a2bbaf45ffc6ff3a49b70afeb192 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:38:05 +0100 Subject: pidfs: implement file handle support On 64-bit platforms, userspace can read the pidfd's inode in order to get a never-repeated PID identifier. On 32-bit platforms this identifier is not exposed, as inodes are limited to 32 bits. Instead expose the identifier via export_fh, which makes it available to userspace via name_to_handle_at. In addition we implement fh_to_dentry, which allows userspace to recover a pidfd from a pidfs file handle. 
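From userspace the round trip looks roughly like this (sketch, error handling omitted; any pidfd works as the mount reference because all pidfds live on the pidfs mount):

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdlib.h>

        int stash_and_reopen(int pidfd)
        {
                struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
                int mount_id, new_pidfd;

                fh->handle_bytes = MAX_HANDLE_SZ;

                /* encode: the handle carries the never-reused 64-bit identifier */
                name_to_handle_at(pidfd, "", fh, &mount_id, AT_EMPTY_PATH);

                /* decode: fails (ESTALE) once the process is gone for good */
                new_pidfd = open_by_handle_at(pidfd, fh, O_RDONLY);

                free(fh);
                return new_pidfd;
        }

The permission and open hooks added below let unprivileged callers decode handles for processes within their pid namespace hierarchy, and make the resulting descriptor behave like a regular pidfd.
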
Signed-off-by: Erin Shepherd [brauner: patch heavily rewritten] Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-6-87d803a42495@kernel.org Reviewed-by: Amir Goldstein Co-Developed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/pidfs.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) (limited to 'fs') diff --git a/fs/pidfs.c b/fs/pidfs.c index 8d62d900d20d..cc773875e9e4 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include @@ -473,6 +474,118 @@ static const struct dentry_operations pidfs_dentry_operations = { .d_prune = stashed_dentry_prune, }; +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + const struct pid *pid = inode->i_private; + + if (*max_len < 2) { + *max_len = 2; + return FILEID_INVALID; + } + + *max_len = 2; + *(u64 *)fh = pid->ino; + return FILEID_KERNFS; +} + +/* Find a struct pid based on the inode number. */ +static struct pid *pidfs_ino_get_pid(u64 ino) +{ + unsigned long pid_ino = pidfs_ino(ino); + u32 gen = pidfs_gen(ino); + struct pid *pid; + + guard(rcu)(); + + pid = idr_find(&pidfs_ino_idr, lower_32_bits(pid_ino)); + if (!pid) + return NULL; + + if (pidfs_ino(pid->ino) != pid_ino) + return NULL; + + if (pidfs_gen(pid->ino) != gen) + return NULL; + + /* Within our pid namespace hierarchy? */ + if (pid_vnr(pid) == 0) + return NULL; + + return get_pid(pid); +} + +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) +{ + int ret; + u64 pid_ino; + struct path path; + struct pid *pid; + + if (fh_len < 2) + return NULL; + + switch (fh_type) { + case FILEID_KERNFS: + pid_ino = *(u64 *)fid; + break; + default: + return NULL; + } + + pid = pidfs_ino_get_pid(pid_ino); + if (!pid) + return NULL; + + ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); + if (ret < 0) + return ERR_PTR(ret); + + mntput(path.mnt); + return path.dentry; +} + +/* + * Make sure that we reject any nonsensical flags that users pass via + * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and + * PIDFD_NONBLOCK as O_NONBLOCK. + */ +#define VALID_FILE_HANDLE_OPEN_FLAGS \ + (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL) + +static int pidfs_export_permission(struct handle_to_path_ctx *ctx, + unsigned int oflags) +{ + if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE)) + return -EINVAL; + + /* + * pidfd_ino_get_pid() will verify that the struct pid is part + * of the caller's pid namespace hierarchy. No further + * permission checks are needed. + */ + return 0; +} + +static struct file *pidfs_export_open(struct path *path, unsigned int oflags) +{ + /* + * Clear O_LARGEFILE as open_by_handle_at() forces it and raise + * O_RDWR as pidfds always are. 
+ */ + oflags &= ~O_LARGEFILE; + return dentry_open(path, oflags | O_RDWR, current_cred()); +} + +static const struct export_operations pidfs_export_operations = { + .encode_fh = pidfs_encode_fh, + .fh_to_dentry = pidfs_fh_to_dentry, + .open = pidfs_export_open, + .permission = pidfs_export_permission, +}; + static int pidfs_init_inode(struct inode *inode, void *data) { const struct pid *pid = data; @@ -507,6 +620,7 @@ static int pidfs_init_fs_context(struct fs_context *fc) return -ENOMEM; ctx->ops = &pidfs_sops; + ctx->eops = &pidfs_export_operations; ctx->dops = &pidfs_dentry_operations; fc->s_fs_info = (void *)&pidfs_stashed_ops; return 0; -- cgit v1.2.3 From 8ce3528188207a2e1896cc3173fba6d99a59013a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 21:16:37 +0100 Subject: pidfs: check for valid ioctl commands Prior to doing any work, check whether the provided ioctl command is supported by pidfs. Signed-off-by: Christian Brauner --- fs/pidfs.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'fs') diff --git a/fs/pidfs.c b/fs/pidfs.c index cc773875e9e4..fe10d2a126a2 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -264,6 +264,27 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long return 0; } +static bool pidfs_ioctl_valid(unsigned int cmd) +{ + switch (cmd) { + case FS_IOC_GETVERSION: + case PIDFD_GET_CGROUP_NAMESPACE: + case PIDFD_GET_INFO: + case PIDFD_GET_IPC_NAMESPACE: + case PIDFD_GET_MNT_NAMESPACE: + case PIDFD_GET_NET_NAMESPACE: + case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: + case PIDFD_GET_TIME_NAMESPACE: + case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: + case PIDFD_GET_UTS_NAMESPACE: + case PIDFD_GET_USER_NAMESPACE: + case PIDFD_GET_PID_NAMESPACE: + return true; + } + + return false; +} + static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct task_struct *task __free(put_task) = NULL; @@ -272,6 +293,9 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; + if (!pidfs_ioctl_valid(cmd)) + return -ENOIOCTLCMD; + if (cmd == FS_IOC_GETVERSION) { if (!arg) return -EINVAL; -- cgit v1.2.3 From 16ecd47cb0cd895c7c2f5dd5db50f6c005c51639 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 14 Dec 2024 22:01:28 +0100 Subject: pidfs: lookup pid through rbtree The new pid inode number allocation scheme is neat but I overlooked a possible, even though unlikely, attack that can be used to trigger an overflow on both 32bit and 64bit. An unique 64 bit identifier was constructed for each struct pid by two combining a 32 bit idr with a 32 bit generation number. A 32bit number was allocated using the idr_alloc_cyclic() infrastructure. When the idr wrapped around a 32 bit wraparound counter was incremented. The 32 bit wraparound counter served as the upper 32 bits and the allocated idr number as the lower 32 bits. Since the idr can only allocate up to INT_MAX entries everytime a wraparound happens INT_MAX - 1 entries are lost (Ignoring that numbering always starts at 2 to avoid theoretical collisions with the root inode number.). If userspace fully populates the idr such that and puts itself into control of two entries such that one entry is somewhere in the middle and the other entry is the INT_MAX entry then it is possible to overflow the wraparound counter. That is probably difficult to pull off but the mere possibility is annoying. 
The problem could be contained to 32 bit by switching to a data structure such as the maple tree that allows allocating 64 bit numbers on 64 bit machines. That would leave 32 bit in a lurch but that probably doesn't matter that much. The other problem is that removing entries form the maple tree is somewhat non-trivial because the removal code can be called under the irq write lock of tasklist_lock and irq{save,restore} code. Instead, allocate unique identifiers for struct pid by simply incrementing a 64 bit counter and insert each struct pid into the rbtree so it can be looked up to decode file handles avoiding to leak actual pids across pid namespaces in file handles. On both 64 bit and 32 bit the same 64 bit identifier is used to lookup struct pid in the rbtree. On 64 bit the unique identifier for struct pid simply becomes the inode number. Comparing two pidfds continues to be as simple as comparing inode numbers. On 32 bit the 64 bit number assigned to struct pid is split into two 32 bit numbers. The lower 32 bits are used as the inode number and the upper 32 bits are used as the inode generation number. Whenever a wraparound happens on 32 bit the 64 bit number will be incremented by 2 so inode numbering starts at 2 again. When a wraparound happens on 32 bit multiple pidfds with the same inode number are likely to exist. This isn't a problem since before pidfs pidfds used the anonymous inode meaning all pidfds had the same inode number. On 32 bit sserspace can thus reconstruct the 64 bit identifier by retrieving both the inode number and the inode generation number to compare, or use file handles. This gives the same guarantees on both 32 bit and 64 bit. Link: https://lore.kernel.org/r/20241214-gekoppelt-erdarbeiten-a1f9a982a5a6@brauner Signed-off-by: Christian Brauner --- fs/pidfs.c | 129 +++++++++++++++++++++++++++++++------------------- include/linux/pid.h | 2 + include/linux/pidfs.h | 2 +- kernel/pid.c | 6 +-- 4 files changed, 86 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/pidfs.c b/fs/pidfs.c index fe10d2a126a2..c5a51c69acc8 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -24,18 +24,9 @@ #include "internal.h" #include "mount.h" -static DEFINE_IDR(pidfs_ino_idr); - -static u32 pidfs_ino_upper_32_bits = 0; +static struct rb_root pidfs_ino_tree = RB_ROOT; #if BITS_PER_LONG == 32 -/* - * On 32 bit systems the lower 32 bits are the inode number and - * the higher 32 bits are the generation number. The starting - * value for the inode number and the generation number is one. - */ -static u32 pidfs_ino_lower_32_bits = 1; - static inline unsigned long pidfs_ino(u64 ino) { return lower_32_bits(ino); @@ -49,52 +40,79 @@ static inline u32 pidfs_gen(u64 ino) #else -static u32 pidfs_ino_lower_32_bits = 0; - /* On 64 bit simply return ino. */ static inline unsigned long pidfs_ino(u64 ino) { return ino; } -/* On 64 bit the generation number is 1. */ +/* On 64 bit the generation number is 0. */ static inline u32 pidfs_gen(u64 ino) { - return 1; + return 0; } #endif -/* - * Construct an inode number for struct pid in a way that we can use the - * lower 32bit to lookup struct pid independent of any pid numbers that - * could be leaked into userspace (e.g., via file handle encoding). - */ -int pidfs_add_pid(struct pid *pid) +static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b) { - u32 upper; - int lower; - - /* - * Inode numbering for pidfs start at 2. This avoids collisions - * with the root inode which is 1 for pseudo filesystems. 
- */ - lower = idr_alloc_cyclic(&pidfs_ino_idr, pid, 2, 0, GFP_ATOMIC); - if (lower >= 0 && lower < pidfs_ino_lower_32_bits) - pidfs_ino_upper_32_bits++; - upper = pidfs_ino_upper_32_bits; - pidfs_ino_lower_32_bits = lower; - if (lower < 0) - return lower; - - pid->ino = ((u64)upper << 32) | lower; - pid->stashed = NULL; + struct pid *pid_a = rb_entry(a, struct pid, pidfs_node); + struct pid *pid_b = rb_entry(b, struct pid, pidfs_node); + u64 pid_ino_a = pid_a->ino; + u64 pid_ino_b = pid_b->ino; + + if (pid_ino_a < pid_ino_b) + return -1; + if (pid_ino_a > pid_ino_b) + return 1; return 0; } -/* The idr number to remove is the lower 32 bits of the inode. */ +void pidfs_add_pid(struct pid *pid) +{ + static u64 pidfs_ino_nr = 2; + + /* + * On 64 bit nothing special happens. The 64bit number assigned + * to struct pid is the inode number. + * + * On 32 bit the 64 bit number assigned to struct pid is split + * into two 32 bit numbers. The lower 32 bits are used as the + * inode number and the upper 32 bits are used as the inode + * generation number. + * + * On 32 bit pidfs_ino() will return the lower 32 bit. When + * pidfs_ino() returns zero a wrap around happened. When a + * wraparound happens the 64 bit number will be incremented by 2 + * so inode numbering starts at 2 again. + * + * On 64 bit comparing two pidfds is as simple as comparing + * inode numbers. + * + * When a wraparound happens on 32 bit multiple pidfds with the + * same inode number are likely to exist (This isn't a problem + * since before pidfs pidfds used the anonymous inode meaning + * all pidfds had the same inode number.). Userspace can + * reconstruct the 64 bit identifier by retrieving both the + * inode number and the inode generation number to compare or + * use file handles. + */ + if (pidfs_ino(pidfs_ino_nr) == 0) + pidfs_ino_nr += 2; + + pid->ino = pidfs_ino_nr; + pid->stashed = NULL; + pidfs_ino_nr++; + + write_seqcount_begin(&pidmap_lock_seq); + rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp); + write_seqcount_end(&pidmap_lock_seq); +} + void pidfs_remove_pid(struct pid *pid) { - idr_remove(&pidfs_ino_idr, lower_32_bits(pid->ino)); + write_seqcount_begin(&pidmap_lock_seq); + rb_erase(&pid->pidfs_node, &pidfs_ino_tree); + write_seqcount_end(&pidmap_lock_seq); } #ifdef CONFIG_PROC_FS @@ -513,24 +531,37 @@ static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return FILEID_KERNFS; } +static int pidfs_ino_find(const void *key, const struct rb_node *node) +{ + const u64 pid_ino = *(u64 *)key; + const struct pid *pid = rb_entry(node, struct pid, pidfs_node); + + if (pid_ino < pid->ino) + return -1; + if (pid_ino > pid->ino) + return 1; + return 0; +} + /* Find a struct pid based on the inode number. */ static struct pid *pidfs_ino_get_pid(u64 ino) { - unsigned long pid_ino = pidfs_ino(ino); - u32 gen = pidfs_gen(ino); struct pid *pid; + struct rb_node *node; + unsigned int seq; guard(rcu)(); - - pid = idr_find(&pidfs_ino_idr, lower_32_bits(pid_ino)); - if (!pid) + do { + seq = read_seqcount_begin(&pidmap_lock_seq); + node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find); + if (node) + break; + } while (read_seqcount_retry(&pidmap_lock_seq, seq)); + + if (!node) return NULL; - if (pidfs_ino(pid->ino) != pid_ino) - return NULL; - - if (pidfs_gen(pid->ino) != gen) - return NULL; + pid = rb_entry(node, struct pid, pidfs_node); /* Within our pid namespace hierarchy? 
*/ if (pid_vnr(pid) == 0) diff --git a/include/linux/pid.h b/include/linux/pid.h index a3aad9b4074c..fe575fcdb4af 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -59,6 +59,7 @@ struct pid spinlock_t lock; struct dentry *stashed; u64 ino; + struct rb_node pidfs_node; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; @@ -68,6 +69,7 @@ struct pid struct upid numbers[]; }; +extern seqcount_spinlock_t pidmap_lock_seq; extern struct pid init_struct_pid; struct file; diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 2958652bb108..df574d6708d4 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -4,7 +4,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); -int pidfs_add_pid(struct pid *pid); +void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ diff --git a/kernel/pid.c b/kernel/pid.c index 58567d6904b2..aa2a7d4da455 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -103,6 +104,7 @@ EXPORT_SYMBOL_GPL(init_pid_ns); */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); +seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); void put_pid(struct pid *pid) { @@ -273,9 +275,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; - retval = pidfs_add_pid(pid); - if (retval) - goto out_unlock; + pidfs_add_pid(pid); for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); -- cgit v1.2.3 From 70465acbb0ce1bb69447acf32f136c8153cda0de Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Mon, 2 Dec 2024 09:53:17 +0800 Subject: exfat: fix exfat_find_empty_entry() not returning error on failure On failure, "dentry" is the error code. If the error code indicates that there is no space, a new cluster may need to be allocated; for other errors, it should be returned directly. Only on success, "dentry" is the index of the directory entry, and it needs to be converted into the directory entry index within the cluster where it is located. Fixes: 8a3f5711ad74 ("exfat: reduce FAT chain traversal") Reported-by: syzbot+6f6c9397e0078ef60bce@syzkaller.appspotmail.com Tested-by: syzbot+6f6c9397e0078ef60bce@syzkaller.appspotmail.com Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 97d2774760fe..099f80645072 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -330,8 +330,8 @@ static int exfat_find_empty_entry(struct inode *inode, while ((dentry = exfat_search_empty_slot(sb, &hint_femp, p_dir, num_entries, es)) < 0) { - if (dentry == -EIO) - break; + if (dentry != -ENOSPC) + return dentry; if (exfat_check_max_dentries(inode)) return -ENOSPC; -- cgit v1.2.3 From 2b2fc0be98a828cf33a88a28e9745e8599fb05cf Mon Sep 17 00:00:00 2001 From: Zhang Kunbo Date: Tue, 17 Dec 2024 07:18:36 +0000 Subject: fs: fix missing declaration of init_files fs/file.c should include include/linux/init_task.h for declaration of init_files. This fixes the sparse warning: fs/file.c:501:21: warning: symbol 'init_files' was not declared. Should it be static? 
Signed-off-by: Zhang Kunbo Link: https://lore.kernel.org/r/20241217071836.2634868-1-zhangkunbo@huawei.com Signed-off-by: Christian Brauner --- fs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index fb1011cf6b4a..25c6e53b03f8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "internal.h" -- cgit v1.2.3 From 69d803c40edeaf94089fbc8751c9b746cdc35044 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Mon, 16 Dec 2024 22:21:52 +0800 Subject: nfsd: Revert "nfsd: release svc_expkey/svc_export with rcu_work" This reverts commit f8c989a0c89a75d30f899a7cabdc14d72522bb8d. Before this commit, svc_export_put or expkey_put will call path_put with sync mode. After this commit, path_put will be called with async mode. And this can lead the unexpected results show as follow. mkfs.xfs -f /dev/sda echo "/ *(rw,no_root_squash,fsid=0)" > /etc/exports echo "/mnt *(rw,no_root_squash,fsid=1)" >> /etc/exports exportfs -ra service nfs-server start mount -t nfs -o vers=4.0 127.0.0.1:/mnt /mnt1 mount /dev/sda /mnt/sda touch /mnt1/sda/file exportfs -r umount /mnt/sda # failed unexcepted The touch will finally call nfsd_cross_mnt, add refcount to mount, and then add cache_head. Before this commit, exportfs -r will call cache_flush to cleanup all cache_head, and path_put in svc_export_put/expkey_put will be finished with sync mode. So, the latter umount will always success. However, after this commit, path_put will be called with async mode, the latter umount may failed, and if we add some delay, umount will success too. Personally I think this bug and should be fixed. We first revert before bugfix patch, and then fix the original bug with a different way. Fixes: f8c989a0c89a ("nfsd: release svc_expkey/svc_export with rcu_work") Signed-off-by: Yang Erkun Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 31 ++++++------------------------- fs/nfsd/export.h | 4 ++-- 2 files changed, 8 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index eacafe46e3b6..aa4712362b3b 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -40,24 +40,15 @@ #define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS) #define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) -static void expkey_put_work(struct work_struct *work) +static void expkey_put(struct kref *ref) { - struct svc_expkey *key = - container_of(to_rcu_work(work), struct svc_expkey, ek_rcu_work); + struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); if (test_bit(CACHE_VALID, &key->h.flags) && !test_bit(CACHE_NEGATIVE, &key->h.flags)) path_put(&key->ek_path); auth_domain_put(key->ek_client); - kfree(key); -} - -static void expkey_put(struct kref *ref) -{ - struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); - - INIT_RCU_WORK(&key->ek_rcu_work, expkey_put_work); - queue_rcu_work(system_wq, &key->ek_rcu_work); + kfree_rcu(key, ek_rcu); } static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) @@ -364,26 +355,16 @@ static void export_stats_destroy(struct export_stats *stats) EXP_STATS_COUNTERS_NUM); } -static void svc_export_put_work(struct work_struct *work) +static void svc_export_put(struct kref *ref) { - struct svc_export *exp = - container_of(to_rcu_work(work), struct svc_export, ex_rcu_work); - + struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); export_stats_destroy(exp->ex_stats); 
kfree(exp->ex_stats); kfree(exp->ex_uuid); - kfree(exp); -} - -static void svc_export_put(struct kref *ref) -{ - struct svc_export *exp = container_of(ref, struct svc_export, h.ref); - - INIT_RCU_WORK(&exp->ex_rcu_work, svc_export_put_work); - queue_rcu_work(system_wq, &exp->ex_rcu_work); + kfree_rcu(exp, ex_rcu); } static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 6f2fbaae01fa..4d92b99c1ffd 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -75,7 +75,7 @@ struct svc_export { u32 ex_layout_types; struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; - struct rcu_work ex_rcu_work; + struct rcu_head ex_rcu; unsigned long ex_xprtsec_modes; struct export_stats *ex_stats; }; @@ -92,7 +92,7 @@ struct svc_expkey { u32 ek_fsid[6]; struct path ek_path; - struct rcu_work ek_rcu_work; + struct rcu_head ek_rcu; }; #define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) -- cgit v1.2.3 From 62e2a47ceab8f3f7d2e3f0e03fdd1c5e0059fd8b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 16 Dec 2024 19:28:06 -0500 Subject: NFS/pnfs: Fix a live lock between recalled layouts and layoutget When the server is recalling a layout, we should ignore the count of outstanding layoutget calls, since the server is expected to return either NFS4ERR_RECALLCONFLICT or NFS4ERR_RETURNCONFLICT for as long as the recall is outstanding. Currently, we may end up livelocking, causing the layout to eventually be forcibly revoked. Fixes: bf0291dd2267 ("pNFS: Ensure LAYOUTGET and LAYOUTRETURN are properly serialised") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0d16b383a452..5f582713bf05 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1308,7 +1308,7 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, enum pnfs_iomode *iomode) { /* Serialise LAYOUTGET/LAYOUTRETURN */ - if (atomic_read(&lo->plh_outstanding) != 0) + if (atomic_read(&lo->plh_outstanding) != 0 && lo->plh_return_seq == 0) return false; if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) return false; -- cgit v1.2.3 From bedb4e6088a886f587d2ea44e0c198c8ce2182c9 Mon Sep 17 00:00:00 2001 From: Zhang Kunbo Date: Tue, 17 Dec 2024 07:19:21 +0000 Subject: fs/nfs: fix missing declaration of nfs_idmap_cache_timeout fs/nfs/super.c should include fs/nfs/nfs4idmap.h for declaration of nfs_idmap_cache_timeout. This fixes the sparse warning: fs/nfs/super.c:1397:14: warning: symbol 'nfs_idmap_cache_timeout' was not declared. Should it be static? Signed-off-by: Zhang Kunbo Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ae5c5e39afa0..aeb715b4a690 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -73,6 +73,7 @@ #include "nfs.h" #include "netns.h" #include "sysfs.h" +#include "nfs4idmap.h" #define NFSDBG_FACILITY NFSDBG_VFS -- cgit v1.2.3 From d75d72a858f0c00ca8ae161b48cdb403807be4de Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Nov 2024 11:11:55 -0500 Subject: btrfs: fix improper generation check in snapshot delete We have been using the following check if (generation <= root->root_key.offset) to make decisions about whether or not to visit a node during snapshot delete. This is because for normal subvolumes this is set to 0, and for snapshots it's set to the creation generation. 
The idea being that if the generation of the node is less than or equal to our creation generation then we don't need to visit that node, because it doesn't belong to us, we can simply drop our reference and move on. However reloc roots don't have their generation stored in root->root_key.offset, instead that is the objectid of their corresponding fs root. This means we can incorrectly not walk into nodes that need to be dropped when deleting a reloc root. There are a variety of consequences to making the wrong choice in two distinct areas. visit_node_for_delete() 1. False positive. We think we are newer than the block when we really aren't. We don't visit the node and drop our reference to the node and carry on. This would result in leaked space. 2. False negative. We do decide to walk down into a block that we should have just dropped our reference to. However this means that the child node will have refs > 1, so we will switch to UPDATE_BACKREF, and then the subsequent walk_down_proc() will notice that btrfs_header_owner(node) != root->root_key.objectid and it'll break out of the loop, and then walk_up_proc() will drop our reference, so this appears to be ok. do_walk_down() 1. False positive. We are in UPDATE_BACKREF and incorrectly decide that we are done and don't need to update the backref for our lower nodes. This is another case that simply won't happen with relocation, as we only have to do UPDATE_BACKREF if the node below us was shared and didn't have FULL_BACKREF set, and since we don't own that node because we're a reloc root we actually won't end up in this case. 2. False negative. Again this is tricky because as described above, we simply wouldn't be here from relocation, because we don't own any of the nodes because we never set btrfs_header_owner() to the reloc root objectid, and we always use FULL_BACKREF, we never actually need to set FULL_BACKREF on any children. Having spent a lot of time stressing relocation/snapshot delete recently I've not seen this pop in practice. But this is objectively incorrect, so fix this to get the correct starting generation based on the root we're dropping to keep me from thinking there's a problem here. CC: stable@vger.kernel.org Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 19 +++++++++++++++++++ fs/btrfs/extent-tree.c | 6 +++--- 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 307dedf95c70..2c341956a01c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -370,6 +370,25 @@ static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transi WRITE_ONCE(root->last_trans, transid); } +/* + * Return the generation this root started with. + * + * Every normal root that is created with root->root_key.offset set to it's + * originating generation. If it is a snapshot it is the generation when the + * snapshot was created. + * + * However for TREE_RELOC roots root_key.offset is the objectid of the owning + * tree root. Thankfully we copy the root item of the owning tree root, which + * has it's last_snapshot set to what we would have root_key.offset set to, so + * return that if this is a TREE_RELOC root. 
+ */ +static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root) +{ + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + return btrfs_root_last_snapshot(&root->root_item); + return root->root_key.offset; +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 412e318e4a22..43a771f7bd7a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5285,7 +5285,7 @@ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control * * reference to it. */ generation = btrfs_node_ptr_generation(eb, slot); - if (!wc->update_ref || generation <= root->root_key.offset) + if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) return false; /* @@ -5340,7 +5340,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, goto reada; if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= btrfs_root_origin_generation(root)) continue; /* We don't lock the tree block, it's OK to be racy here */ @@ -5683,7 +5683,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) { + generation <= btrfs_root_origin_generation(root)) { wc->lookup_info = 1; return 1; } -- cgit v1.2.3 From 6c3864e055486fadb5b97793b57688082e14b43b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 07:26:32 +0100 Subject: btrfs: use bio_is_zone_append() in the completion handler Otherwise it won't catch bios turned into regular writes by the block level zone write plugging. The additional test it adds is for emulated zone append. 
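For context, a minimal sketch of what such a helper has to cover (illustrative only; the exact block-layer implementation may differ, but BIO_EMULATES_ZONE_APPEND is the flag used by the zone write plugging emulation referenced in the Fixes tag below):

    /* Illustrative: true for native and for emulated zone append bios. */
    static inline bool zone_append_like(struct bio *bio)
    {
            return bio_op(bio) == REQ_OP_ZONE_APPEND ||
                   bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
    }

A bare bio_op() comparison only covers the first case, which is why the completion handlers converted below would otherwise miss emulated appends.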
Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation") CC: stable@vger.kernel.org # 6.12 Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 1f216d07eff6..011cc97be3b5 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -355,7 +355,7 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) btrfs_record_physical_zoned(bbio); btrfs_bio_end_io(bbio, bbio->bio.bi_status); } @@ -398,7 +398,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; btrfs_bio_end_io(bbio, bbio->bio.bi_status); @@ -412,7 +412,7 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); - } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + } else if (bio_is_zone_append(bio)) { stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } -- cgit v1.2.3 From be691b5e593f2cc8cef67bbc59c1fb91b74a86a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 07:26:33 +0100 Subject: btrfs: split bios to the fs sector size boundary Btrfs like other file systems can't really deal with I/O not aligned to it's internal block size (which strangely is called sector size in btrfs, for historical reasons), but the block layer split helper doesn't even know about that. Round down the split boundary so that all I/Os are aligned. Fixes: d5e4377d5051 ("btrfs: split zone append bios in btrfs_submit_bio") CC: stable@vger.kernel.org # 6.12 Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 011cc97be3b5..78f5606baacb 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -649,8 +649,14 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) map_length = min(map_length, bbio->fs_info->max_zone_append_size); sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, &nr_segs, map_length); - if (sector_offset) - return sector_offset << SECTOR_SHIFT; + if (sector_offset) { + /* + * bio_split_rw_at() could split at a size smaller than our + * sectorsize and thus cause unaligned I/Os. Fix that by + * always rounding down to the nearest boundary. + */ + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + } return map_length; } -- cgit v1.2.3 From dfb92681a19e1d5172420baa242806414b3eff6f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 4 Dec 2024 13:30:46 +1030 Subject: btrfs: tree-checker: reject inline extent items with 0 ref count [BUG] There is a bug report in the mailing list where btrfs_run_delayed_refs() failed to drop the ref count for logical 25870311358464 num_bytes 2113536. 
The involved leaf dump looks like this: item 166 key (25870311358464 168 2113536) itemoff 10091 itemsize 50 extent refs 1 gen 84178 flags 1 ref#0: shared data backref parent 32399126528000 count 0 <<< ref#1: shared data backref parent 31808973717504 count 1 Notice the count number is 0. [CAUSE] There is no concrete evidence yet, but considering 0 -> 1 is also a single bit flipped, it's possible that hardware memory bitflip is involved, causing the on-disk extent tree to be corrupted. [FIX] To prevent us reading such corrupted extent item, or writing such damaged extent item back to disk, enhance the handling of BTRFS_EXTENT_DATA_REF_KEY and BTRFS_SHARED_DATA_REF_KEY keys for both inlined and key items, to detect such 0 ref count and reject them. CC: stable@vger.kernel.org # 5.4+ Link: https://lore.kernel.org/linux-btrfs/7c69dd49-c346-4806-86e7-e6f863a66f48@app.fastmail.com/ Reported-by: Frankie Fisher Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 148d8cefa40e..dfeee033f31f 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1527,6 +1527,11 @@ static int check_extent_item(struct extent_buffer *leaf, dref_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_extent_data_ref_count(leaf, dref); break; /* Contains parent bytenr and ref count */ @@ -1539,6 +1544,11 @@ static int check_extent_item(struct extent_buffer *leaf, inline_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; case BTRFS_EXTENT_OWNER_REF_KEY: @@ -1611,8 +1621,18 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, { u32 expect_item_size = 0; - if (key->type == BTRFS_SHARED_DATA_REF_KEY) + if (key->type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + + sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data backref count, should have non-zero value"); + return -EUCLEAN; + } + expect_item_size = sizeof(struct btrfs_shared_data_ref); + } if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, @@ -1689,6 +1709,11 @@ static int check_extent_data_ref(struct extent_buffer *leaf, offset, leaf->fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid extent data backref count, should have non-zero value"); + return -EUCLEAN; + } } return 0; } -- cgit v1.2.3 From 9048cf05a17a7bc26f0b8e2e53750b1237303970 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 17 Dec 2024 16:18:12 -0500 Subject: NFSD: fix management of pending async copies Currently the pending_async_copies count is decremented just before a struct nfsd4_copy is destroyed. 
After commit aa0ebd21df9c ("NFSD: Add nfsd4_copy time-to-live"), nfsd4_copy structures stick around for 10 lease periods after the COPY itself has completed, so the pending_async_copies count stays high for a long time. This causes NFSD to avoid the use of background copy even though the actual background copy workload might no longer be running. In this patch, decrement pending_async_copies once the async copy thread is done processing the copy work. Fixes: aa0ebd21df9c ("NFSD: Add nfsd4_copy time-to-live") Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f8a10f90bc7a..ad44ad49274f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1347,7 +1347,6 @@ static void nfs4_put_copy(struct nfsd4_copy *copy) { if (!refcount_dec_and_test(&copy->refcount)) return; - atomic_dec(&copy->cp_nn->pending_async_copies); kfree(copy->cp_src); kfree(copy); } @@ -1870,6 +1869,7 @@ do_callback: set_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags); trace_nfsd_copy_async_done(copy); nfsd4_send_cb_offload(copy); + atomic_dec(&copy->cp_nn->pending_async_copies); return 0; } @@ -1927,19 +1927,19 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Arbitrary cap on number of pending async copy operations */ if (atomic_inc_return(&nn->pending_async_copies) > (int)rqstp->rq_pool->sp_nrthreads) - goto out_err; + goto out_dec_async_copy_err; async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL); if (!async_copy->cp_src) - goto out_err; + goto out_dec_async_copy_err; if (!nfs4_init_copy_state(nn, copy)) - goto out_err; + goto out_dec_async_copy_err; memcpy(&result->cb_stateid, &copy->cp_stateid.cs_stid, sizeof(result->cb_stateid)); dup_copy_fields(copy, async_copy); async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); if (IS_ERR(async_copy->copy_task)) - goto out_err; + goto out_dec_async_copy_err; spin_lock(&async_copy->cp_clp->async_lock); list_add(&async_copy->copies, &async_copy->cp_clp->async_copies); @@ -1954,6 +1954,9 @@ out: trace_nfsd_copy_done(copy, status); release_copy_files(copy); return status; +out_dec_async_copy_err: + if (async_copy) + atomic_dec(&nn->pending_async_copies); out_err: if (nfsd4_ssc_is_inter(copy)) { /* -- cgit v1.2.3 From 1f2bf7049f6ef6048b56b18d0033d3e77b28f973 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Dec 2024 14:45:15 -0800 Subject: inotify: Use strscpy() for event->name copies Since we have already allocated "len + 1" space for event->name, make sure that name->name cannot ever accidentally cause a copy overflow by calling strscpy() instead of the unbounded strcpy() routine. This assists in the ongoing efforts to remove the unsafe strcpy() API[1] from the kernel.
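For reference, a short sketch of the bounded copy semantics (illustrative only; as noted above, event->name was allocated with len + 1 bytes and event->name_len is set to len):

    /*
     * strscpy() copies at most event->name_len characters, always
     * NUL-terminates the destination, and returns -E2BIG instead of
     * overflowing if the source were ever too long.
     */
    ssize_t n = strscpy(event->name, name->name, event->name_len + 1);

Since the length was derived from name->name in the first place, truncation is not expected here; the change simply removes the possibility of an overflow.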
Link: https://github.com/KSPP/linux/issues/88 [1] Signed-off-by: Kees Cook Signed-off-by: Jan Kara Link: https://patch.msgid.link/20241216224507.work.859-kees@kernel.org --- fs/notify/inotify/inotify_fsnotify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 993375f0db67..cd7d11b0eb08 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -121,7 +121,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, event->sync_cookie = cookie; event->name_len = len; if (len) - strcpy(event->name, name->name); + strscpy(event->name, name->name, event->name_len + 1); ret = fsnotify_add_event(group, fsn_event, inotify_merge); if (ret) { -- cgit v1.2.3 From 1f2bf7049f6ef6048b56b18d0033d3e77b28f973 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 19:44:28 +0000 Subject: ntfs3: Remove an access to page->index Convert the first page passed to ni_write_frame() to a folio and use folio_pos() on that instead of open-coding the access to folio->index, cast & shift. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Konstantin Komarov --- fs/ntfs3/frecord.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 8b39d0ce5f28..c57f0686b14b 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -2726,9 +2726,10 @@ int ni_write_frame(struct ntfs_inode *ni, struct page **pages, { int err; struct ntfs_sb_info *sbi = ni->mi.sbi; + struct folio *folio = page_folio(pages[0]); u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits; u32 frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT; - u64 frame_vbo = (u64)pages[0]->index << PAGE_SHIFT; + u64 frame_vbo = folio_pos(folio); CLST frame = frame_vbo >> frame_bits; char *frame_ondisk = NULL; struct page **pages_disk = NULL; -- cgit v1.2.3 From a5874fde3c0884a33ed4145101052318c5e17c74 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Thu, 12 Dec 2024 18:42:16 +0100 Subject: exec: Add a new AT_EXECVE_CHECK flag to execveat(2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new AT_EXECVE_CHECK flag to execveat(2) to check if a file would be allowed for execution. The main use case is for script interpreters and dynamic linkers to check execution permission according to the kernel's security policy. Another use case is to add context to access logs e.g., which script (instead of interpreter) accessed a file. As any executable code, scripts could also use this check [1]. This is different from faccessat(2) + X_OK which only checks a subset of access rights (i.e. inode permission and mount options for regular files), but not the full context (e.g. all LSM access checks). The main use case for access(2) is for SUID processes to (partially) check access on behalf of their caller. The main use case for execveat(2) + AT_EXECVE_CHECK is to check if a script execution would be allowed, according to all the different restrictions in place. Because the use of AT_EXECVE_CHECK follows the exact kernel semantic as for a real execution, user space gets the same error codes. An interesting point of using execveat(2) instead of openat2(2) is that it decouples the check from the enforcement. Indeed, the security check can be logged (e.g. with audit) without blocking an execution environment not yet ready to enforce a strict security policy. 
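As a rough illustration of the calling convention an interpreter could use (a sketch only: it assumes an open file descriptor fd for the script, the usual unistd.h/sys/syscall.h/fcntl.h includes, and uses the raw syscall since libc wrappers may lag behind):

    char *const argv[] = { (char *)"script", NULL };
    char *const envp[] = { NULL };
    int err;

    /* Returns 0 if executing the file behind fd would be permitted. */
    err = syscall(__NR_execveat, fd, "", argv, envp,
                  AT_EMPTY_PATH | AT_EXECVE_CHECK);

On success nothing is executed and the interpreter can go on to open and parse the script itself; on failure errno carries the same error a real execution attempt would have produced.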
LSMs can control or log execution requests with security_bprm_creds_for_exec(). However, to enforce a consistent and complete access control (e.g. on binary's dependencies) LSMs should restrict file executability, or measure executed files, with security_file_open() by checking file->f_flags & __FMODE_EXEC. Because AT_EXECVE_CHECK is dedicated to user space interpreters, it doesn't make sense for the kernel to parse the checked files, look for interpreters known to the kernel (e.g. ELF, shebang), and return ENOEXEC if the format is unknown. Because of that, security_bprm_check() is never called when AT_EXECVE_CHECK is used. It should be noted that script interpreters cannot directly use execveat(2) (without this new AT_EXECVE_CHECK flag) because this could lead to unexpected behaviors e.g., `python script.sh` could lead to Bash being executed to interpret the script. Unlike the kernel, script interpreters may just interpret the shebang as a simple comment, which should not change for backward compatibility reasons. Because scripts or libraries files might not currently have the executable permission set, or because we might want specific users to be allowed to run arbitrary scripts, the following patch provides a dynamic configuration mechanism with the SECBIT_EXEC_RESTRICT_FILE and SECBIT_EXEC_DENY_INTERACTIVE securebits. This is a redesign of the CLIP OS 4's O_MAYEXEC: https://github.com/clipos-archive/src_platform_clip-patches/blob/f5cb330d6b684752e403b4e41b39f7004d88e561/1901_open_mayexec.patch This patch has been used for more than a decade with customized script interpreters. Some examples can be found here: https://github.com/clipos-archive/clipos4_portage-overlay/search?q=O_MAYEXEC Cc: Al Viro Cc: Christian Brauner Cc: Kees Cook Acked-by: Paul Moore Reviewed-by: Serge Hallyn Reviewed-by: Jeff Xu Tested-by: Jeff Xu Link: https://docs.python.org/3/library/io.html#io.open_code [1] Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20241212174223.389435-2-mic@digikod.net Signed-off-by: Kees Cook --- Documentation/userspace-api/check_exec.rst | 37 ++++++++++++++++++++++++++++++ Documentation/userspace-api/index.rst | 1 + fs/exec.c | 20 ++++++++++++++-- include/linux/binfmts.h | 7 +++++- include/uapi/linux/fcntl.h | 4 ++++ security/security.c | 10 ++++++++ 6 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 Documentation/userspace-api/check_exec.rst (limited to 'fs') diff --git a/Documentation/userspace-api/check_exec.rst b/Documentation/userspace-api/check_exec.rst new file mode 100644 index 000000000000..393dd7ca19c4 --- /dev/null +++ b/Documentation/userspace-api/check_exec.rst @@ -0,0 +1,37 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. Copyright © 2024 Microsoft Corporation + +=================== +Executability check +=================== + +AT_EXECVE_CHECK +=============== + +Passing the ``AT_EXECVE_CHECK`` flag to :manpage:`execveat(2)` only performs a +check on a regular file and returns 0 if execution of this file would be +allowed, ignoring the file format and then the related interpreter dependencies +(e.g. ELF libraries, script's shebang). + +Programs should always perform this check to apply kernel-level checks against +files that are not directly executed by the kernel but passed to a user space +interpreter instead. All files that contain executable code, from the point of +view of the interpreter, should be checked. 
However the result of this check +should only be enforced according to ``SECBIT_EXEC_RESTRICT_FILE`` or +``SECBIT_EXEC_DENY_INTERACTIVE.``. + +The main purpose of this flag is to improve the security and consistency of an +execution environment to ensure that direct file execution (e.g. +``./script.sh``) and indirect file execution (e.g. ``sh script.sh``) lead to +the same result. For instance, this can be used to check if a file is +trustworthy according to the caller's environment. + +In a secure environment, libraries and any executable dependencies should also +be checked. For instance, dynamic linking should make sure that all libraries +are allowed for execution to avoid trivial bypass (e.g. using ``LD_PRELOAD``). +For such secure execution environment to make sense, only trusted code should +be executable, which also requires integrity guarantees. + +To avoid race conditions leading to time-of-check to time-of-use issues, +``AT_EXECVE_CHECK`` should be used with ``AT_EMPTY_PATH`` to check against a +file descriptor instead of a path. diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index 274cc7546efc..6272bcf11296 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -35,6 +35,7 @@ Security-related interfaces mfd_noexec spec_ctrl tee + check_exec Devices and I/O =============== diff --git a/fs/exec.c b/fs/exec.c index 98cb7ba9983c..e3f461096e84 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -892,7 +892,8 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) .lookup_flags = LOOKUP_FOLLOW, }; - if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + if ((flags & + ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_EXECVE_CHECK)) != 0) return ERR_PTR(-EINVAL); if (flags & AT_SYMLINK_NOFOLLOW) open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; @@ -1541,6 +1542,21 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl } bprm->interp = bprm->filename; + /* + * At this point, security_file_open() has already been called (with + * __FMODE_EXEC) and access control checks for AT_EXECVE_CHECK will + * stop just after the security_bprm_creds_for_exec() call in + * bprm_execve(). Indeed, the kernel should not try to parse the + * content of the file with exec_binprm() nor change the calling + * thread, which means that the following security functions will not + * be called: + * - security_bprm_check() + * - security_bprm_creds_from_file() + * - security_bprm_committing_creds() + * - security_bprm_committed_creds() + */ + bprm->is_check = !!(flags & AT_EXECVE_CHECK); + retval = bprm_mm_init(bprm); if (!retval) return bprm; @@ -1836,7 +1852,7 @@ static int bprm_execve(struct linux_binprm *bprm) /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); - if (retval) + if (retval || bprm->is_check) goto out; retval = exec_binprm(bprm); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e6c00e860951..8ff0eb3644a1 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -42,7 +42,12 @@ struct linux_binprm { * Set when errors can no longer be returned to the * original userspace. */ - point_of_no_return:1; + point_of_no_return:1, + /* + * Set by user space to check executability according to the + * caller's environment. 
+ */ + is_check:1; struct file *executable; /* Executable to pass to the interpreter */ struct file *interpreter; struct file *file; diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 6e6907e63bfc..a15ac2fa4b20 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -155,4 +155,8 @@ #define AT_HANDLE_MNT_ID_UNIQUE 0x001 /* Return the u64 unique mount ID. */ #define AT_HANDLE_CONNECTABLE 0x002 /* Request a connectable file handle */ +/* Flags for execveat2(2). */ +#define AT_EXECVE_CHECK 0x10000 /* Only perform a check if execution + would be allowed. */ + #endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/security/security.c b/security/security.c index 09664e09fec9..dae7e903947f 100644 --- a/security/security.c +++ b/security/security.c @@ -1248,6 +1248,12 @@ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) * to 1 if AT_SECURE should be set to request libc enable secure mode. @bprm * contains the linux_binprm structure. * + * If execveat(2) is called with the AT_EXECVE_CHECK flag, bprm->is_check is + * set. The result must be the same as without this flag even if the execution + * will never really happen and @bprm will always be dropped. + * + * This hook must not change current->cred, only @bprm->cred. + * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_creds_for_exec(struct linux_binprm *bprm) @@ -3098,6 +3104,10 @@ int security_file_receive(struct file *file) * Save open-time permission checking state for later use upon file_permission, * and recheck access if anything has changed since inode_permission. * + * We can check if a file is opened for execution (e.g. execve(2) call), either + * directly or indirectly (e.g. ELF's ld.so) by checking file->f_flags & + * __FMODE_EXEC . + * * Return: Returns 0 if permission is granted. */ int security_file_open(struct file *file) -- cgit v1.2.3 From 1a72d2ebeec51f10e5b0f0609c6754e92b11ee9d Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Thu, 5 Dec 2024 18:48:32 +0800 Subject: ocfs2: revert "ocfs2: fix the la space leak when unmounting an ocfs2 volume" Patch series "Revert ocfs2 commit dfe6c5692fb5 and provide a new fix". SUSE QA team detected a mistake in my commit dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume"). I am very sorry for my error. (If my eyes are correct) From the mailling list mails, this patch shouldn't be applied to 4.19 5.4 5.10 5.15 6.1 6.6, and these branches should perform a revert operation. Reason for revert: In commit dfe6c5692fb5, I mistakenly wrote: "This bug has existed since the initial OCFS2 code.". The statement is wrong. The correct introduction commit is 30dd3478c3cd. IOW, if the branch doesn't include 30dd3478c3cd, dfe6c5692fb5 should also not be included. This reverts commit dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume"). In commit dfe6c5692fb5, the commit log "This bug has existed since the initial OCFS2 code." is wrong. The correct introduction commit is 30dd3478c3cd ("ocfs2: correctly use ocfs2_find_next_zero_bit()"). The influence of commit dfe6c5692fb5 is that it provides a correct fix for the latest kernel. however, it shouldn't be pushed to stable branches. Let's use this commit to revert all branches that include dfe6c5692fb5 and use a new fix method to fix commit 30dd3478c3cd. 
Link: https://lkml.kernel.org/r/20241205104835.18223-1-heming.zhao@suse.com Link: https://lkml.kernel.org/r/20241205104835.18223-2-heming.zhao@suse.com Fixes: dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume") Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/localalloc.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 8ac42ea81a17..5df34561c551 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -1002,25 +1002,6 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, start = bit_off + 1; } - /* clear the contiguous bits until the end boundary */ - if (count) { - blkno = la_start_blk + - ocfs2_clusters_to_blocks(osb->sb, - start - count); - - trace_ocfs2_sync_local_to_main_free( - count, start - count, - (unsigned long long)la_start_blk, - (unsigned long long)blkno); - - status = ocfs2_release_clusters(handle, - main_bm_inode, - main_bm_bh, blkno, - count); - if (status < 0) - mlog_errno(status); - } - bail: if (status) mlog_errno(status); -- cgit v1.2.3 From 7782e3b3b004e8cb94a88621a22cc3c2f33e5b90 Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Thu, 5 Dec 2024 18:48:33 +0800 Subject: ocfs2: fix the space leak in LA when releasing LA Commit 30dd3478c3cd ("ocfs2: correctly use ocfs2_find_next_zero_bit()") introduced an issue, the ocfs2_sync_local_to_main() ignores the last contiguous free bits, which causes an OCFS2 volume to lose the last free clusters of LA window during the release routine. Please note, because commit dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume") was reverted, this commit is a replacement fix for commit dfe6c5692fb5. Link: https://lkml.kernel.org/r/20241205104835.18223-3-heming.zhao@suse.com Fixes: 30dd3478c3cd ("ocfs2: correctly use ocfs2_find_next_zero_bit()") Signed-off-by: Heming Zhao Suggested-by: Joseph Qi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/localalloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 5df34561c551..d1aa04a5af1b 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -971,9 +971,9 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, start = count = 0; left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) < - left) { - if (bit_off == start) { + while (1) { + bit_off = ocfs2_find_next_zero_bit(bitmap, left, start); + if ((bit_off < left) && (bit_off == start)) { count++; start++; continue; @@ -998,6 +998,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, } } + if (bit_off >= left) + break; count = 1; start = bit_off + 1; } -- cgit v1.2.3 From 8aca2bc96c833ba695ede7a45ad7784c836a262e Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 28 Oct 2024 22:56:55 +0800 Subject: mm: use aligned address in clear_gigantic_page() In current kernel, hugetlb_no_page() calls folio_zero_user() with the fault address. Where the fault address may be not aligned with the huge page size. Then, folio_zero_user() may call clear_gigantic_page() with the address, while clear_gigantic_page() requires the address to be huge page size aligned. 
So, this may cause memory corruption or an information leak. Additionally, use the more obvious name 'addr_hint' instead of 'addr' for clear_gigantic_page(). Link: https://lkml.kernel.org/r/20241028145656.932941-1-wangkefeng.wang@huawei.com Fixes: 78fefd04c123 ("mm: memory: convert clear_huge_page() to folio_zero_user()") Signed-off-by: Kefeng Wang Reviewed-by: "Huang, Ying" Reviewed-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- mm/memory.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 90f883d6b8fd..fc1ae5132127 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -825,7 +825,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, error = PTR_ERR(folio); goto out; } - folio_zero_user(folio, ALIGN_DOWN(addr, hpage_size)); + folio_zero_user(folio, addr); __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { diff --git a/mm/memory.c b/mm/memory.c index 75c2dfd04f72..84864387f965 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6815,9 +6815,10 @@ static inline int process_huge_page( return 0; } -static void clear_gigantic_page(struct folio *folio, unsigned long addr, +static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint, unsigned int nr_pages) { + unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio)); int i; might_sleep(); -- cgit v1.2.3 From 901ce9705fbb9f330ff1f19600e5daf9770b0175 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Mon, 9 Dec 2024 15:56:52 +0900 Subject: nilfs2: prevent use of deleted inode syzbot reported a WARNING in nilfs_rmdir. [1] Because the inode bitmap is corrupted, an inode with an inode number that should exist as a ".nilfs" file was reassigned by nilfs_mkdir for "file0", causing an inode duplication during execution. And this causes an underflow of i_nlink in rmdir operations. The inode is used twice by the same task to unmount and remove directories ".nilfs" and "file0", which triggers the warning in nilfs_rmdir. To avoid this issue, check i_nlink in nilfs_iget(); if it is 0, the inode has already been deleted, so call iput() to reclaim it. [1] WARNING: CPU: 1 PID: 5824 at fs/inode.c:407 drop_nlink+0xc4/0x110 fs/inode.c:407 ...
Call Trace: nilfs_rmdir+0x1b0/0x250 fs/nilfs2/namei.c:342 vfs_rmdir+0x3a3/0x510 fs/namei.c:4394 do_rmdir+0x3b5/0x580 fs/namei.c:4453 __do_sys_rmdir fs/namei.c:4472 [inline] __se_sys_rmdir fs/namei.c:4470 [inline] __x64_sys_rmdir+0x47/0x50 fs/namei.c:4470 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Link: https://lkml.kernel.org/r/20241209065759.6781-1-konishi.ryusuke@gmail.com Fixes: d25006523d0b ("nilfs2: pathname operations") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+9260555647a5132edd48@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=9260555647a5132edd48 Tested-by: syzbot+9260555647a5132edd48@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/inode.c | 8 +++++++- fs/nilfs2/namei.c | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index cf9ba481ae37..b7d4105f37bf 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -544,8 +544,14 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, inode = nilfs_iget_locked(sb, root, ino); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + + if (!(inode->i_state & I_NEW)) { + if (!inode->i_nlink) { + iput(inode); + return ERR_PTR(-ESTALE); + } return inode; + } err = __nilfs_read_inode(sb, root, ino, inode); if (unlikely(err)) { diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 9b108052d9f7..1d836a5540f3 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -67,6 +67,11 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) inode = NULL; } else { inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino); + if (inode == ERR_PTR(-ESTALE)) { + nilfs_error(dir->i_sb, + "deleted inode referenced: %lu", ino); + return ERR_PTR(-EIO); + } } return d_splice_alias(inode, dentry); -- cgit v1.2.3 From 6309b8ce98e9a18390b9fd8f03fc412f3c17aee9 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 13 Dec 2024 01:43:28 +0900 Subject: nilfs2: fix buffer head leaks in calls to truncate_inode_pages() When block_invalidatepage was converted to block_invalidate_folio, the fallback to block_invalidatepage in folio_invalidate() if the address_space_operations method invalidatepage (currently invalidate_folio) was not set, was removed. Unfortunately, some pseudo-inodes in nilfs2 use empty_aops set by inode_init_always_gfp() as is, or explicitly set it to address_space_operations. Therefore, with this change, block_invalidatepage() is no longer called from folio_invalidate(), and as a result, the buffer_head structures attached to these pages/folios are no longer freed via try_to_free_buffers(). Thus, these buffer heads are now leaked by truncate_inode_pages(), which cleans up the page cache from inode evict(), etc. Three types of caches use empty_aops: gc inode caches and the DAT shadow inode used by GC, and b-tree node caches. Of these, b-tree node caches explicitly call invalidate_mapping_pages() during cleanup, which involves calling try_to_free_buffers(), so the leak was not visible during normal operation but worsened when GC was performed. Fix this issue by using address_space_operations with invalidate_folio set to block_invalidate_folio instead of empty_aops, which will ensure the same behavior as before. 
Link: https://lkml.kernel.org/r/20241212164556.21338-1-konishi.ryusuke@gmail.com Fixes: 7ba13abbd31e ("fs: Turn block_invalidatepage into block_invalidate_folio") Signed-off-by: Ryusuke Konishi Cc: [5.18+] Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 1 + fs/nilfs2/gcinode.c | 2 +- fs/nilfs2/inode.c | 5 +++++ fs/nilfs2/nilfs.h | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 501ad7be5174..54a3fa0cf67e 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -35,6 +35,7 @@ void nilfs_init_btnc_inode(struct inode *btnc_inode) ii->i_flags = 0; memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap)); mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS); + btnc_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; } void nilfs_btnode_cache_clear(struct address_space *btnc) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index ace22253fed0..2dbb15767df1 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -163,7 +163,7 @@ int nilfs_init_gcinode(struct inode *inode) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - inode->i_mapping->a_ops = &empty_aops; + inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b7d4105f37bf..23f3a75edd50 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -276,6 +276,10 @@ const struct address_space_operations nilfs_aops = { .is_partially_uptodate = block_is_partially_uptodate, }; +const struct address_space_operations nilfs_buffer_cache_aops = { + .invalidate_folio = block_invalidate_folio, +}; + static int nilfs_insert_inode_locked(struct inode *inode, struct nilfs_root *root, unsigned long ino) @@ -681,6 +685,7 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode) NILFS_I(s_inode)->i_flags = 0; memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap)); mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS); + s_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; err = nilfs_attach_btree_node_cache(s_inode); if (unlikely(err)) { diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 45d03826eaf1..dff241c53fc5 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -401,6 +401,7 @@ extern const struct file_operations nilfs_dir_operations; extern const struct inode_operations nilfs_file_inode_operations; extern const struct file_operations nilfs_file_operations; extern const struct address_space_operations nilfs_aops; +extern const struct address_space_operations nilfs_buffer_cache_aops; extern const struct inode_operations nilfs_dir_inode_operations; extern const struct inode_operations nilfs_special_inode_operations; extern const struct inode_operations nilfs_symlink_inode_operations; -- cgit v1.2.3 From 974e3fe0ac61de85015bbe5a4990cf4127b304b2 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 19 Dec 2024 12:53:01 +0100 Subject: fs: relax assertions on failure to encode file handles Encoding file handles is usually performed by a filesystem >encode_fh() method that may fail for various reasons. The legacy users of exportfs_encode_fh(), namely, nfsd and name_to_handle_at(2) syscall are ready to cope with the possibility of failure to encode a file handle. There are a few other users of exportfs_encode_{fh,fid}() that currently have a WARN_ON() assertion when ->encode_fh() fails. Relax those assertions because they are wrong. 
The second linked bug report states commit 16aac5ad1fa9 ("ovl: support encoding non-decodable file handles") in v6.6 as the regressing commit, but this is not accurate. The aforementioned commit only increases the chances of the assertion and allows triggering the assertion with the reproducer using overlayfs, inotify and drop_caches. Triggering this assertion was always possible with other filesystems and other reasons of ->encode_fh() failures and more particularly, it was also possible with the exact same reproducer using overlayfs that is mounted with options index=on,nfs_export=on also on kernels < v6.6. Therefore, I am not listing the aforementioned commit as a Fixes commit. Backport hint: this patch will have a trivial conflict applying to v6.6.y, and other trivial conflicts applying to stable kernels < v6.6. Reported-by: syzbot+ec07f6f5ce62b858579f@syzkaller.appspotmail.com Tested-by: syzbot+ec07f6f5ce62b858579f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-unionfs/671fd40c.050a0220.4735a.024f.GAE@google.com/ Reported-by: Dmitry Safonov Closes: https://lore.kernel.org/linux-fsdevel/CAGrbwDTLt6drB9eaUagnQVgdPBmhLfqqxAf3F+Juqy_o6oP8uw@mail.gmail.com/ Cc: stable@vger.kernel.org Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20241219115301.465396-1-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/notify/fdinfo.c | 4 +--- fs/overlayfs/copy_up.c | 5 ++--- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index dec553034027..e933f9c65d90 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -47,10 +47,8 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f->handle_bytes >> 2; ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size); - if ((ret == FILEID_INVALID) || (ret < 0)) { - WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); + if ((ret == FILEID_INVALID) || (ret < 0)) return; - } f->handle_type = ret; f->handle_bytes = size * sizeof(u32); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 3601ddfeddc2..56eee9f23ea9 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -442,9 +442,8 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, buflen = (dwords << 2); err = -EIO; - if (WARN_ON(fh_type < 0) || - WARN_ON(buflen > MAX_HANDLE_SZ) || - WARN_ON(fh_type == FILEID_INVALID)) + if (fh_type < 0 || fh_type == FILEID_INVALID || + WARN_ON(buflen > MAX_HANDLE_SZ)) goto out_err; fh->fb.version = OVL_FH_VERSION; -- cgit v1.2.3 From a769bee5f9fbca47efd4fa6bc3d726d370cedebe Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Tue, 17 Dec 2024 00:09:36 +0530 Subject: smb: use macros instead of constants for leasekey size and default cifsattrs value Replace default hardcoded value for cifsAttrs with ATTR_ARCHIVE macro Use SMB2_LEASE_KEY_SIZE macro for leasekey size in smb2_lease_break Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 2 +- fs/smb/client/smb2pdu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 9d96b833015c..b800c9f585d8 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -398,7 +398,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode = alloc_inode_sb(sb, cifs_inode_cachep, GFP_KERNEL); if (!cifs_inode) return NULL; - cifs_inode->cifsAttrs = 0x20; /* default */ + cifs_inode->cifsAttrs = ATTR_ARCHIVE; /* default */ 
cifs_inode->time = 0; /* * Until the file is open and we have gotten oplock info back from the diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 010eae9d6c47..c945b94318f8 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -6204,7 +6204,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, req->StructureSize = cpu_to_le16(36); total_len += 12; - memcpy(req->LeaseKey, lease_key, 16); + memcpy(req->LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE); req->LeaseState = lease_state; flags |= CIFS_NO_RSP_BUF; -- cgit v1.2.3 From ee1c8e6b2931811a906b8c78006bfe0a3386fa60 Mon Sep 17 00:00:00 2001 From: Dragan Simic Date: Tue, 17 Dec 2024 10:25:10 +0100 Subject: smb: client: Deduplicate "select NETFS_SUPPORT" in Kconfig Repeating automatically selected options in Kconfig files is redundant, so let's delete repeated "select NETFS_SUPPORT" that was added accidentally. Fixes: 69c3c023af25 ("cifs: Implement netfslib hooks") Signed-off-by: Dragan Simic Signed-off-by: Steve French --- fs/smb/client/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig index 2aff6d1395ce..9f05f94e265a 100644 --- a/fs/smb/client/Kconfig +++ b/fs/smb/client/Kconfig @@ -2,7 +2,6 @@ config CIFS tristate "SMB3 and CIFS support (advanced network filesystem)" depends on INET - select NETFS_SUPPORT select NLS select NLS_UCS2_UTILS select CRYPTO -- cgit v1.2.3 From e9f2517a3e18a54a3943c098d2226b245d488801 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Tue, 10 Dec 2024 18:15:12 -0300 Subject: smb: client: fix TCP timers deadlock after rmmod Commit ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.") fixed a netns UAF by manually enabled socket refcounting (sk->sk_net_refcnt=1 and sock_inuse_add(net, 1)). The reason the patch worked for that bug was because we now hold references to the netns (get_net_track() gets a ref internally) and they're properly released (internally, on __sk_destruct()), but only because sk->sk_net_refcnt was set. Problem: (this happens regardless of CONFIG_NET_NS_REFCNT_TRACKER and regardless if init_net or other) Setting sk->sk_net_refcnt=1 *manually* and *after* socket creation is not only out of cifs scope, but also technically wrong -- it's set conditionally based on user (=1) vs kernel (=0) sockets. And net/ implementations seem to base their user vs kernel space operations on it. e.g. upon TCP socket close, the TCP timers are not cleared because sk->sk_net_refcnt=1: (cf. commit 151c9c724d05 ("tcp: properly terminate timers for kernel sockets")) net/ipv4/tcp.c: void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); if (!sk->sk_net_refcnt) inet_csk_clear_xmit_timers_sync(sk); sock_put(sk); } Which will throw a lockdep warning and then, as expected, deadlock on tcp_write_timer(). A way to reproduce this is by running the reproducer from ef7134c7fc48 and then 'rmmod cifs'. A few seconds later, the deadlock/lockdep warning shows up. Fix: We shouldn't mess with socket internals ourselves, so do not set sk_net_refcnt manually. Also change __sock_create() to sock_create_kern() for explicitness. As for non-init_net network namespaces, we deal with it the best way we can -- hold an extra netns reference for server->ssocket and drop it when it's released. This ensures that the netns still exists whenever we need to create/destroy server->ssocket, but is not directly tied to it. 
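In short, the lifetime rule this patch establishes looks roughly like this (sketch only, error paths and locking trimmed):

    /* create: plain kernel socket, sk_net_refcnt stays 0 */
    rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket);
    if (rc == 0)
            get_net(net);   /* pin the netns for as long as server->ssocket exists */

    /* teardown: release the socket, then drop the pinned netns */
    sock_release(server->ssocket);
    server->ssocket = NULL;
    put_net(cifs_net_ns(server));

The namespace stays alive for the socket's lifetime without flipping the socket into user-socket accounting, so TCP timers are still torn down synchronously on close.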
Fixes: ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.") Cc: stable@vger.kernel.org Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/smb/client/connect.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 2372538a1211..ddcc9e514a0e 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -987,9 +987,13 @@ clean_demultiplex_info(struct TCP_Server_Info *server) msleep(125); if (cifs_rdma_enabled(server)) smbd_destroy(server); + if (server->ssocket) { sock_release(server->ssocket); server->ssocket = NULL; + + /* Release netns reference for the socket. */ + put_net(cifs_net_ns(server)); } if (!list_empty(&server->pending_mid_q)) { @@ -1037,6 +1041,7 @@ clean_demultiplex_info(struct TCP_Server_Info *server) */ } + /* Release netns reference for this server. */ put_net(cifs_net_ns(server)); kfree(server->leaf_fullpath); kfree(server); @@ -1713,6 +1718,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, tcp_ses->ops = ctx->ops; tcp_ses->vals = ctx->vals; + + /* Grab netns reference for this server. */ cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); @@ -1844,6 +1851,7 @@ smbd_connected: out_err_crypto_release: cifs_crypto_secmech_release(tcp_ses); + /* Release netns reference for this server. */ put_net(cifs_net_ns(tcp_ses)); out_err: @@ -1852,8 +1860,10 @@ out_err: cifs_put_tcp_session(tcp_ses->primary_server, false); kfree(tcp_ses->hostname); kfree(tcp_ses->leaf_fullpath); - if (tcp_ses->ssocket) + if (tcp_ses->ssocket) { sock_release(tcp_ses->ssocket); + put_net(cifs_net_ns(tcp_ses)); + } kfree(tcp_ses); } return ERR_PTR(rc); @@ -3131,20 +3141,20 @@ generic_ip_connect(struct TCP_Server_Info *server) socket = server->ssocket; } else { struct net *net = cifs_net_ns(server); - struct sock *sk; - rc = __sock_create(net, sfamily, SOCK_STREAM, - IPPROTO_TCP, &server->ssocket, 1); + rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket); if (rc < 0) { cifs_server_dbg(VFS, "Error %d creating socket\n", rc); return rc; } - sk = server->ssocket->sk; - __netns_tracker_free(net, &sk->ns_tracker, false); - sk->sk_net_refcnt = 1; - get_net_track(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + /* + * Grab netns reference for the socket. + * + * It'll be released here, on error, or in clean_demultiplex_info() upon server + * teardown. + */ + get_net(net); /* BB other socket options to set KEEPALIVE, NODELAY? 
*/ cifs_dbg(FYI, "Socket created\n"); @@ -3158,8 +3168,10 @@ generic_ip_connect(struct TCP_Server_Info *server) } rc = bind_socket(server); - if (rc < 0) + if (rc < 0) { + put_net(cifs_net_ns(server)); return rc; + } /* * Eventually check for other socket options to change from @@ -3196,6 +3208,7 @@ generic_ip_connect(struct TCP_Server_Info *server) if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc); + put_net(cifs_net_ns(server)); sock_release(socket); server->ssocket = NULL; return rc; @@ -3204,6 +3217,9 @@ generic_ip_connect(struct TCP_Server_Info *server) if (sport == htons(RFC1001_PORT)) rc = ip_rfc1001_connect(server); + if (rc < 0) + put_net(cifs_net_ns(server)); + return rc; } -- cgit v1.2.3 From 92941c7f2c9529fac1b2670482d0ced3b46eac70 Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Thu, 19 Dec 2024 23:28:50 +0530 Subject: smb: fix bytes written value in /proc/fs/cifs/Stats With recent netfs apis changes, the bytes written value was not getting updated in /proc/fs/cifs/Stats. Fix this by updating tcon->bytes in write operations. Fixes: 3ee1a1fc3981 ("cifs: Cut over to using netfslib") Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index c945b94318f8..959359301250 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4840,6 +4840,8 @@ smb2_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; + cifs_stats_bytes_written(tcon, written); + if (written < wdata->subreq.len) wdata->result = -ENOSPC; else @@ -5156,6 +5158,7 @@ replay_again: cifs_dbg(VFS, "Send error in write = %d\n", rc); } else { *nbytes = le32_to_cpu(rsp->DataLength); + cifs_stats_bytes_written(io_parms->tcon, *nbytes); trace_smb3_write_done(0, 0, xid, req->PersistentFileId, io_parms->tcon->tid, -- cgit v1.2.3 From 134129520beaf3339482c557361ea0bde709cf36 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 19 Nov 2024 15:56:44 -0500 Subject: dlm: fix removal of rsb struct that is master and dir record An rsb struct was not being removed in the case where it was both the master and the dir record. This case (master and dir node) was missed in the condition for doing add_scan() from deactivate_rsb(). Fixing this triggers a related WARN_ON that needs to be fixed, and requires adjusting where two del_scan() calls are made. Fixes: c217adfc8caa ("dlm: fix add_scan and del_scan usage") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lock.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index fc1d710166e9..c8ff88f1cdcf 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -824,9 +824,12 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, r->res_first_lkid = 0; } - /* A dir record will not be on the scan list. */ - if (r->res_dir_nodeid != our_nodeid) - del_scan(ls, r); + /* we always deactivate scan timer for the rsb, when + * we move it out of the inactive state as rsb state + * can be changed and scan timers are only for inactive + * rsbs. 
+ */ + del_scan(ls, r); list_move(&r->res_slow_list, &ls->ls_slow_active); rsb_clear_flag(r, RSB_INACTIVE); kref_init(&r->res_ref); /* ref is now used in active state */ @@ -989,10 +992,10 @@ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len, r->res_nodeid = 0; } + del_scan(ls, r); list_move(&r->res_slow_list, &ls->ls_slow_active); rsb_clear_flag(r, RSB_INACTIVE); kref_init(&r->res_ref); - del_scan(ls, r); write_unlock_bh(&ls->ls_rsbtbl_lock); goto out; @@ -1337,9 +1340,13 @@ static int _dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *na __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, true, flags, r_nodeid, result); - /* A dir record rsb should never be on scan list. */ - /* Try to fix this with del_scan? */ - WARN_ON(!list_empty(&r->res_scan_list)); + /* A dir record rsb should never be on scan list. + * Except when we are the dir and master node. + * This function should only be called by the dir + * node. + */ + WARN_ON(!list_empty(&r->res_scan_list) && + r->res_master_nodeid != our_nodeid); write_unlock_bh(&ls->ls_rsbtbl_lock); @@ -1430,16 +1437,23 @@ static void deactivate_rsb(struct kref *kref) list_move(&r->res_slow_list, &ls->ls_slow_inactive); /* - * When the rsb becomes unused: - * - If it's not a dir record for a remote master rsb, - * then it is put on the scan list to be freed. - * - If it's a dir record for a remote master rsb, - * then it is kept in the inactive state until - * receive_remove() from the master node. + * When the rsb becomes unused, there are two possibilities: + * 1. Leave the inactive rsb in place (don't remove it). + * 2. Add it to the scan list to be removed. + * + * 1 is done when the rsb is acting as the dir record + * for a remotely mastered rsb. The rsb must be left + * in place as an inactive rsb to act as the dir record. + * + * 2 is done when a) the rsb is not the master and not the + * dir record, b) when the rsb is both the master and the + * dir record, c) when the rsb is master but not dir record. + * + * (If no directory is used, the rsb can always be removed.) */ - if (!dlm_no_directory(ls) && - (r->res_master_nodeid != our_nodeid) && - (dlm_dir_nodeid(r) != our_nodeid)) + if (dlm_no_directory(ls) || + (r->res_master_nodeid == our_nodeid || + dlm_dir_nodeid(r) != our_nodeid)) add_scan(ls, r); if (r->res_lvbptr) { -- cgit v1.2.3 From 57cdd1a1cf1464199678f9338049b63fb5d5b41c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 2 Dec 2024 10:26:37 -0500 Subject: dlm: fix srcu_read_lock() return type to int The return type of srcu_read_lock() is int and not bool. Whereas we using the ret variable only to evaluate a bool type of dlm_lowcomms_con_has_addr() to check if an address is already being set. 
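For reference, a minimal made-up example of the expected pattern: the cookie returned by srcu_read_lock() is an int and must be passed back unchanged to srcu_read_unlock(), while the lookup result itself can stay a bool.

#include <linux/srcu.h>

DEFINE_STATIC_SRCU(demo_srcu);

static bool demo_has_addr(int nodeid)
{
	bool ret;
	int idx;	/* int, not bool: the SRCU read-side cookie */

	idx = srcu_read_lock(&demo_srcu);
	ret = (nodeid != 0);	/* stand-in for the real lookup */
	srcu_read_unlock(&demo_srcu, idx);

	return ret;
}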
Fixes: 6f0b0b5d7ae7 ("fs: dlm: remove dlm_node_addrs lookup list") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index df40c3fd1070..d28141829c05 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -462,7 +462,8 @@ static bool dlm_lowcomms_con_has_addr(const struct connection *con, int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr) { struct connection *con; - bool ret, idx; + bool ret; + int idx; idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, GFP_NOFS); -- cgit v1.2.3 From 6784ed98fde5b7538fff6b329b686b119ca23d8b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 2 Dec 2024 10:26:41 -0500 Subject: dlm: return -ENOENT if no comm was found Currently if no comm can be found dlm_comm_seq() returns -EEXIST which means entry already exists for a lookup it makes no sense to return -EEXIST. We change it to -ENOENT. There is no user that will evaluate the return value on a specific value so this should be fine. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/config.c b/fs/dlm/config.c index b2f21aa00719..cf9ba6fd7a28 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -935,7 +935,7 @@ int dlm_comm_seq(int nodeid, uint32_t *seq, bool locked) mutex_unlock(&clusters_root.subsys.su_mutex); } if (!cm) - return -EEXIST; + return -ENOENT; *seq = cm->seq; put_comm(cm); -- cgit v1.2.3 From 7917f01a286ce01e9c085e24468421f596ee1a0c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 Dec 2024 15:28:18 +1100 Subject: nfsd: restore callback functionality for NFSv4.0 A recent patch inadvertently broke callbacks for NFSv4.0. In the 4.0 case we do not expect a session to be found but still need to call setup_callback_client() which will not try to dereference it. This patch moves the check for failure to find a session into the 4.1+ branch of setup_callback_client() Fixes: 1e02c641c3a4 ("NFSD: Prevent NULL dereference in nfsd4_process_cb_update()") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3877b53e429f..c083e539e898 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1100,7 +1100,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = clp->cl_cred.cr_flavor; clp->cl_cb_ident = conn->cb_ident; } else { - if (!conn->cb_xprt) + if (!conn->cb_xprt || !ses) return -EINVAL; clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; @@ -1522,8 +1522,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) ses = c->cn_session; } spin_unlock(&clp->cl_lock); - if (!c) - return; err = setup_callback_client(clp, &conn, ses); if (err) { -- cgit v1.2.3 From c8b90d40d5bba8e6fba457b8a7c10d3c0d467e37 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:02 +0000 Subject: netfs: Fix non-contiguous donation between completed reads When a read subrequest finishes, if it doesn't have sufficient coverage to complete the folio(s) covering either side of it, it will donate the excess coverage to the adjacent subrequests on either side, offloading responsibility for unlocking the folio(s) covered to them. 
Now, preference is given to donating down to a lower file offset over donating up because that check is done first - but there's no check that the lower subreq is actually contiguous, and so we can end up donating incorrectly. The scenario seen[1] is that an 8MiB readahead request spanning four 2MiB folios is split into eight 1MiB subreqs (numbered 1 through 8). These terminate in the order 1,6,2,5,3,7,4,8. What happens is: - 1 donates to 2 - 6 donates to 5 - 2 completes, unlocking the first folio (with 1). - 5 completes, unlocking the third folio (with 6). - 3 donates to 4 - 7 donates to 4 incorrectly - 4 completes, unlocking the second folio (with 3), but can't use the excess from 7. - 8 donates to 4, also incorrectly. Fix this by preventing downward donation if the subreqs are not contiguous (in the example above, 7 donates to 4 across the gap left by 5 and 6). Reported-by: Shyam Prasad N Closes: https://lore.kernel.org/r/CANT5p=qBwjBm-D8soFVVtswGEfmMtQXVW83=TNfUtvyHeFQZBA@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/526707.1733224486@warthog.procyon.org.uk/ [1] Link: https://lore.kernel.org/r/20241213135013.2964079-3-dhowells@redhat.com cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 3cbb289535a8..b415e3972336 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -247,16 +247,17 @@ donation_changed: /* Deal with the trickiest case: that this subreq is in the middle of a * folio, not touching either edge, but finishes first. In such a - * case, we donate to the previous subreq, if there is one, so that the - * donation is only handled when that completes - and remove this - * subreq from the list. + * case, we donate to the previous subreq, if there is one and if it is + * contiguous, so that the donation is only handled when that completes + * - and remove this subreq from the list. * * If the previous subreq finished first, we will have acquired their * donation and should be able to unlock folios and/or donate nextwards. */ if (!subreq->consumed && !prev_donated && - !list_is_first(&subreq->rreq_link, &rreq->subrequests)) { + !list_is_first(&subreq->rreq_link, &rreq->subrequests) && + subreq->start == prev->start + prev->len) { prev = list_prev_entry(subreq, rreq_link); WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len); subreq->start += subreq->len; -- cgit v1.2.3 From 105549d09a539a876b7c3330ab52d8aceedad358 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:03 +0000 Subject: netfs: Fix enomem handling in buffered reads If netfs_read_to_pagecache() gets an error from either ->prepare_read() or from netfs_prepare_read_iterator(), it needs to decrement ->nr_outstanding, cancel the subrequest and break out of the issuing loop. Currently, it only does this for two of the cases, but there are two more that aren't handled. Fix this by moving the handling to a common place and jumping to it from all four places. This is in preference to inserting a wrapper around netfs_prepare_read_iterator() as proposed by Dmitry Antipov[1]. 
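The shape of the fix is a single shared error path: every failure point undoes the outstanding count and cancels the subrequest in one place. A simplified sketch of that structure (demo_* names are invented, not the netfs API):

#include <linux/atomic.h>
#include <linux/errno.h>

struct demo_req {
	atomic_t nr_outstanding;
};

/* Stand-ins for the two preparation steps that can fail. */
static int demo_prepare(struct demo_req *rreq)			{ return 0; }
static ssize_t demo_prepare_iterator(struct demo_req *rreq)	{ return 0; }

static int demo_issue(struct demo_req *rreq)
{
	ssize_t slice;
	int ret;

	atomic_inc(&rreq->nr_outstanding);

	ret = demo_prepare(rreq);
	if (ret < 0)
		goto prep_failed;

	slice = demo_prepare_iterator(rreq);
	if (slice < 0)
		goto prep_iter_failed;

	/* ... issue the subrequest ... */
	return 0;

prep_iter_failed:
	ret = slice;
prep_failed:
	atomic_dec(&rreq->nr_outstanding);	/* undo the accounting, then cancel */
	return ret;
}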
Link: https://lore.kernel.org/r/20241202093943.227786-1-dmantipov@yandex.ru/ [1] Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Reported-by: syzbot+404b4b745080b6210c6c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=404b4b745080b6210c6c Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-4-dhowells@redhat.com Tested-by: syzbot+404b4b745080b6210c6c@syzkaller.appspotmail.com cc: Dmitry Antipov cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 7ac34550c403..4dc9b8286355 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -275,22 +275,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) netfs_stat(&netfs_n_rh_download); if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); - if (ret < 0) { - atomic_dec(&rreq->nr_outstanding); - netfs_put_subrequest(subreq, false, - netfs_sreq_trace_put_cancel); - break; - } + if (ret < 0) + goto prep_failed; trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); } slice = netfs_prepare_read_iterator(subreq); - if (slice < 0) { - atomic_dec(&rreq->nr_outstanding); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); - ret = slice; - break; - } + if (slice < 0) + goto prep_iter_failed; rreq->netfs_ops->issue_read(subreq); goto done; @@ -302,6 +294,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) trace_netfs_sreq(subreq, netfs_sreq_trace_submit); netfs_stat(&netfs_n_rh_zero); slice = netfs_prepare_read_iterator(subreq); + if (slice < 0) + goto prep_iter_failed; __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); netfs_read_subreq_terminated(subreq, 0, false); goto done; @@ -310,6 +304,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) if (source == NETFS_READ_FROM_CACHE) { trace_netfs_sreq(subreq, netfs_sreq_trace_submit); slice = netfs_prepare_read_iterator(subreq); + if (slice < 0) + goto prep_iter_failed; netfs_read_cache_to_pagecache(rreq, subreq); goto done; } @@ -318,6 +314,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) WARN_ON_ONCE(1); break; + prep_iter_failed: + ret = slice; + prep_failed: + subreq->error = ret; + atomic_dec(&rreq->nr_outstanding); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + break; + done: size -= slice; start += slice; -- cgit v1.2.3 From 86ad1a58f6a9453f49e06ef957a40a8dac00a13f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:04 +0000 Subject: nfs: Fix oops in nfs_netfs_init_request() when copying to cache When netfslib wants to copy some data that has just been read on behalf of nfs, it creates a new write request and calls nfs_netfs_init_request() to initialise it, but with a NULL file pointer. This causes nfs_file_open_context() to oops - however, we don't actually need the nfs context as we're only going to write to the cache. Fix this by just returning if we aren't given a file pointer and emit a warning if the request was for something other than copy-to-cache. Further, fix nfs_netfs_free_request() so that it doesn't try to free the context if the pointer is NULL. 
Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Reported-by: Max Kellermann Closes: https://lore.kernel.org/r/CAKPOu+9DyMbKLhyJb7aMLDTb=Fh0T8Teb9sjuf_pze+XWT1VaQ@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-5-dhowells@redhat.com cc: Trond Myklebust cc: Anna Schumaker cc: Dave Wysochanski cc: Jeff Layton cc: linux-nfs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/nfs/fscache.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 810269ee0a50..d49e4ce27999 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -263,6 +263,12 @@ int nfs_netfs_readahead(struct readahead_control *ractl) static atomic_t nfs_netfs_debug_id; static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file) { + if (!file) { + if (WARN_ON_ONCE(rreq->origin != NETFS_PGPRIV2_COPY_TO_CACHE)) + return -EIO; + return 0; + } + rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ @@ -274,7 +280,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi static void nfs_netfs_free_request(struct netfs_io_request *rreq) { - put_nfs_open_context(rreq->netfs_priv); + if (rreq->netfs_priv) + put_nfs_open_context(rreq->netfs_priv); } static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq) -- cgit v1.2.3 From e5a8b6446c0d370716f193771ccacf3260a57534 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 13 Dec 2024 13:50:05 +0000 Subject: cachefiles: Parse the "secctx" immediately Instead of storing an opaque string, call security_secctx_to_secid() right in the "secctx" command handler and store only the numeric "secid". This eliminates an unnecessary string allocation and allows the daemon to receive errors when writing the "secctx" command instead of postponing the error to the "bind" command handler. For example, if the kernel was built without `CONFIG_SECURITY`, "bind" will return `EOPNOTSUPP`, but the daemon doesn't know why. With this patch, the "secctx" will instead return `EOPNOTSUPP` which is the right context for this error. This patch adds a boolean flag `have_secid` because I'm not sure if we can safely assume that zero is the special secid value for "not set". This appears to be true for SELinux, Smack and AppArmor, but since this attribute is not documented, I'm unable to derive a stable guarantee for that. 
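The effect is that the label is converted exactly once, when the daemon writes the "secctx" command, and only the numeric secid is kept. A condensed sketch of that flow (simplified; the demo_* names are stand-ins for the cachefiles structures changed in the patch below):

#include <linux/security.h>
#include <linux/string.h>
#include <linux/errno.h>

struct demo_cache {
	u32 secid;
	bool have_secid;
};

static int demo_daemon_secctx(struct demo_cache *cache, const char *args)
{
	int err;

	if (cache->have_secid)
		return -EINVAL;		/* second "secctx" command */

	/* Fails right here, e.g. -EOPNOTSUPP without CONFIG_SECURITY. */
	err = security_secctx_to_secid(args, strlen(args), &cache->secid);
	if (err)
		return err;

	cache->have_secid = true;
	return 0;
}

The stored secid is then applied at bind time with set_security_override() instead of re-parsing a string context.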
Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241209141554.638708-1-max.kellermann@ionos.com/ Link: https://lore.kernel.org/r/20241213135013.2964079-6-dhowells@redhat.com Signed-off-by: Christian Brauner --- fs/cachefiles/daemon.c | 14 +++++++------- fs/cachefiles/internal.h | 3 ++- fs/cachefiles/security.c | 6 +++--- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 89b11336a836..1806bff8e59b 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -576,7 +577,7 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) */ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) { - char *secctx; + int err; _enter(",%s", args); @@ -585,16 +586,16 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) return -EINVAL; } - if (cache->secctx) { + if (cache->have_secid) { pr_err("Second security context specified\n"); return -EINVAL; } - secctx = kstrdup(args, GFP_KERNEL); - if (!secctx) - return -ENOMEM; + err = security_secctx_to_secid(args, strlen(args), &cache->secid); + if (err) + return err; - cache->secctx = secctx; + cache->have_secid = true; return 0; } @@ -820,7 +821,6 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache) put_cred(cache->cache_cred); kfree(cache->rootdirname); - kfree(cache->secctx); kfree(cache->tag); _leave(""); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 7b99bd98de75..38c236e38cef 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -122,7 +122,6 @@ struct cachefiles_cache { #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ #define CACHEFILES_ONDEMAND_MODE 4 /* T if in on-demand read mode */ char *rootdirname; /* name of cache root directory */ - char *secctx; /* LSM security context */ char *tag; /* cache binding tag */ refcount_t unbind_pincount;/* refcount to do daemon unbind */ struct xarray reqs; /* xarray of pending on-demand requests */ @@ -130,6 +129,8 @@ struct cachefiles_cache { struct xarray ondemand_ids; /* xarray for ondemand_id allocation */ u32 ondemand_id_next; u32 msg_id_next; + u32 secid; /* LSM security id */ + bool have_secid; /* whether "secid" was set */ }; static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index fe777164f1d8..fc6611886b3b 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -18,7 +18,7 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) struct cred *new; int ret; - _enter("{%s}", cache->secctx); + _enter("{%u}", cache->have_secid ? 
cache->secid : 0); new = prepare_kernel_cred(current); if (!new) { @@ -26,8 +26,8 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) goto error; } - if (cache->secctx) { - ret = set_security_override_from_ctx(new, cache->secctx); + if (cache->have_secid) { + ret = set_security_override(new, cache->secid); if (ret < 0) { put_cred(new); pr_err("Security denies permission to nominate security context: error %d\n", -- cgit v1.2.3 From f4d3cde410cc62b5483f59f0f3454a5c5203a2cb Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Fri, 13 Dec 2024 13:50:06 +0000 Subject: netfs: Remove redundant use of smp_rmb() The function netfs_unbuffered_write_iter_locked() in fs/netfs/direct_write.c contains an unnecessary smp_rmb() call after wait_on_bit(). Since wait_on_bit() already incorporates a memory barrier that ensures the flag update is visible before the function returns, the smp_rmb() provides no additional benefit and incurs unnecessary overhead. This patch removes the redundant barrier to simplify and optimize the code. Signed-off-by: Zilin Guan Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241207021952.2978530-1-zilin@seu.edu.cn/ Link: https://lore.kernel.org/r/20241213135013.2964079-7-dhowells@redhat.com Reviewed-by: Akira Yokosawa cc: Akira Yokosawa cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/direct_write.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 88f2adfab75e..173e8b5e6a93 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -104,7 +104,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); - smp_rmb(); /* Read error/transferred after RIP flag */ ret = wreq->error; if (ret == 0) { ret = wreq->transferred; -- cgit v1.2.3 From aa3956418985bda1f68313eadde3267921847978 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:07 +0000 Subject: netfs: Fix missing barriers by using clear_and_wake_up_bit() Use clear_and_wake_up_bit() rather than something like: clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); as there needs to be a barrier inserted between which is present in clear_and_wake_up_bit(). 
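In other words, the waker must not allow the wake-up to be reordered before the flag clear that the waiter is about to test. A minimal illustration of the pairing (demo names only, not the netfs code itself):

#include <linux/wait_bit.h>
#include <linux/sched.h>

#define DEMO_IN_PROGRESS	0	/* bit number in the flags word */

static void demo_wait_for_completion(unsigned long *flags)
{
	/* Sleeps until DEMO_IN_PROGRESS is clear. */
	wait_on_bit(flags, DEMO_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
}

static void demo_complete(unsigned long *flags)
{
	/*
	 * clear_bit_unlock() + the required barrier + wake_up_bit(), so a
	 * waiter cannot observe the wake-up without also observing the
	 * cleared bit and anything published before it.
	 */
	clear_and_wake_up_bit(DEMO_IN_PROGRESS, flags);
}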
Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-8-dhowells@redhat.com Reviewed-by: Akira Yokosawa cc: Zilin Guan cc: Akira Yokosawa cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 3 +-- fs/netfs/write_collect.c | 9 +++------ 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index b415e3972336..46ce3b7adf07 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -379,8 +379,7 @@ static void netfs_rreq_assess(struct netfs_io_request *rreq) task_io_account_read(rreq->transferred); trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); trace_netfs_rreq(rreq, netfs_rreq_trace_done); netfs_clear_subrequests(rreq, false); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 1d438be2e1b4..82290c92ba7a 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -501,8 +501,7 @@ reassess_streams: goto need_retry; if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { trace_netfs_rreq(wreq, netfs_rreq_trace_unpause); - clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags); - wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE); + clear_and_wake_up_bit(NETFS_RREQ_PAUSE, &wreq->flags); } if (notes & NEED_REASSESS) { @@ -605,8 +604,7 @@ void netfs_write_collection_worker(struct work_struct *work) _debug("finished"); trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); - wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags); if (wreq->iocb) { size_t written = min(wreq->transferred, wreq->len); @@ -714,8 +712,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); /* If we are at the head of the queue, wake up the collector, * transferring a ref to it if we were the ones to do so. -- cgit v1.2.3 From 4acb665cf4f3e5436844f17ece0a8a55ce688c7b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:08 +0000 Subject: netfs: Work around recursion by abandoning retry if nothing read syzkaller reported recursion with a loop of three calls (netfs_rreq_assess, netfs_retry_reads and netfs_rreq_terminated) hitting the limit of the stack during an unbuffered or direct I/O read. There are a number of issues: (1) There is no limit on the number of retries. (2) A subrequest is supposed to be abandoned if it does not transfer anything (NETFS_SREQ_NO_PROGRESS), but that isn't checked under all circumstances. (3) The actual root cause, which is this: if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_terminated(rreq, ...); When we do a retry, we bump the rreq->nr_outstanding counter to prevent the final cleanup phase running before we've finished dispatching the retries. 
The problem is if we hit 0, we have to do the cleanup phase - but we're in the cleanup phase and end up repeating the retry cycle, hence the recursion. Work around the problem by limiting the number of retries. This is based on Lizhi Xu's patch[1], and makes the following changes: (1) Replace NETFS_SREQ_NO_PROGRESS with NETFS_SREQ_MADE_PROGRESS and make the filesystem set it if it managed to read or write at least one byte of data. Clear this bit before issuing a subrequest. (2) Add a ->retry_count member to the subrequest and increment it any time we do a retry. (3) Remove the NETFS_SREQ_RETRYING flag as it is superfluous with ->retry_count. If the latter is non-zero, we're doing a retry. (4) Abandon a subrequest if retry_count is non-zero and we made no progress. (5) Use ->retry_count in both the write-side and the read-size. [?] Question: Should I set a hard limit on retry_count in both read and write? Say it hits 50, we always abandon it. The problem is that these changes only mitigate the issue. As long as it made at least one byte of progress, the recursion is still an issue. This patch mitigates the problem, but does not fix the underlying cause. I have patches that will do that, but it's an intrusive fix that's currently pending for the next merge window. The oops generated by KASAN looks something like: BUG: TASK stack guard page was hit at ffffc9000482ff48 (stack is ffffc90004830000..ffffc90004838000) Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN NOPTI ... RIP: 0010:mark_lock+0x25/0xc60 kernel/locking/lockdep.c:4686 ... mark_usage kernel/locking/lockdep.c:4646 [inline] __lock_acquire+0x906/0x3ce0 kernel/locking/lockdep.c:5156 lock_acquire.part.0+0x11b/0x380 kernel/locking/lockdep.c:5825 local_lock_acquire include/linux/local_lock_internal.h:29 [inline] ___slab_alloc+0x123/0x1880 mm/slub.c:3695 __slab_alloc.constprop.0+0x56/0xb0 mm/slub.c:3908 __slab_alloc_node mm/slub.c:3961 [inline] slab_alloc_node mm/slub.c:4122 [inline] kmem_cache_alloc_noprof+0x2a7/0x2f0 mm/slub.c:4141 radix_tree_node_alloc.constprop.0+0x1e8/0x350 lib/radix-tree.c:253 idr_get_free+0x528/0xa40 lib/radix-tree.c:1506 idr_alloc_u32+0x191/0x2f0 lib/idr.c:46 idr_alloc+0xc1/0x130 lib/idr.c:87 p9_tag_alloc+0x394/0x870 net/9p/client.c:321 p9_client_prepare_req+0x19f/0x4d0 net/9p/client.c:644 p9_client_zc_rpc.constprop.0+0x105/0x880 net/9p/client.c:793 p9_client_read_once+0x443/0x820 net/9p/client.c:1570 p9_client_read+0x13f/0x1b0 net/9p/client.c:1534 v9fs_issue_read+0x115/0x310 fs/9p/vfs_addr.c:74 netfs_retry_read_subrequests fs/netfs/read_retry.c:60 [inline] netfs_retry_reads+0x153a/0x1d00 fs/netfs/read_retry.c:232 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 ... 
netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_dispatch_unbuffered_reads fs/netfs/direct_read.c:103 [inline] netfs_unbuffered_read fs/netfs/direct_read.c:127 [inline] netfs_unbuffered_read_iter_locked+0x12f6/0x19b0 fs/netfs/direct_read.c:221 netfs_unbuffered_read_iter+0xc5/0x100 fs/netfs/direct_read.c:256 v9fs_file_read_iter+0xbf/0x100 fs/9p/vfs_file.c:361 do_iter_readv_writev+0x614/0x7f0 fs/read_write.c:832 vfs_readv+0x4cf/0x890 fs/read_write.c:1025 do_preadv fs/read_write.c:1142 [inline] __do_sys_preadv fs/read_write.c:1192 [inline] __se_sys_preadv fs/read_write.c:1187 [inline] __x64_sys_preadv+0x22d/0x310 fs/read_write.c:1187 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Closes: https://syzkaller.appspot.com/bug?extid=1fc6f64c40a9d143cfb6 Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241108034020.3695718-1-lizhi.xu@windriver.com/ [1] Link: https://lore.kernel.org/r/20241213135013.2964079-9-dhowells@redhat.com Tested-by: syzbot+885c03ad650731743489@syzkaller.appspotmail.com Suggested-by: Lizhi Xu cc: Dominique Martinet cc: Jeff Layton cc: v9fs@lists.linux.dev cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Reported-by: syzbot+885c03ad650731743489@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/9p/vfs_addr.c | 6 +++++- fs/afs/write.c | 5 ++++- fs/netfs/read_collect.c | 15 +++++++++------ fs/netfs/read_retry.c | 6 ++++-- fs/netfs/write_collect.c | 5 ++--- fs/netfs/write_issue.c | 2 ++ fs/smb/client/cifssmb.c | 13 +++++++++---- fs/smb/client/smb2pdu.c | 9 ++++++--- include/linux/netfs.h | 6 +++--- 9 files changed, 44 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 819c75233235..3bc9ce6c575e 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -57,6 +57,8 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq) int err, len; len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err); + if (len > 0) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); netfs_write_subrequest_terminated(subreq, len ?: err, false); } @@ -80,8 +82,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) if (pos + total >= i_size_read(rreq->inode)) __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); - if (!err) + if (!err) { subreq->transferred += total; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + } netfs_read_subreq_terminated(subreq, err, false); } diff --git a/fs/afs/write.c b/fs/afs/write.c index 34107b55f834..ccb6aa8027c5 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -122,7 +122,7 @@ static void afs_issue_write_worker(struct work_struct *work) if (subreq->debug_index == 3) return netfs_write_subrequest_terminated(subreq, -ENOANO, false); - if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) { + if (!subreq->retry_count) { set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); } @@ -149,6 +149,9 @@ static void afs_issue_write_worker(struct work_struct *work) afs_wait_for_operation(op); ret = afs_put_operation(op); switch 
(ret) { + case 0: + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + break; case -EACCES: case -EPERM: case -ENOKEY: diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 46ce3b7adf07..47ed3a5044e2 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -438,7 +438,7 @@ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, rreq->origin == NETFS_READPAGE || rreq->origin == NETFS_READ_FOR_WRITE)) { netfs_consume_read_data(subreq, was_async); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } } EXPORT_SYMBOL(netfs_read_subreq_progress); @@ -497,7 +497,7 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, rreq->origin == NETFS_READPAGE || rreq->origin == NETFS_READ_FOR_WRITE)) { netfs_consume_read_data(subreq, was_async); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } rreq->transferred += subreq->transferred; } @@ -511,10 +511,13 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, } else { trace_netfs_sreq(subreq, netfs_sreq_trace_short); if (subreq->transferred > subreq->consumed) { - __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); - } else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + /* If we didn't read new data, abandon retry. */ + if (subreq->retry_count && + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { + __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); + } + } else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); } else { diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0350592ea804..0e72e9226fc8 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -56,6 +56,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) break; if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + subreq->retry_count++; netfs_reset_iter(subreq); netfs_reissue_read(rreq, subreq); } @@ -137,7 +139,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) stream0->sreq_max_len = subreq->len; __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + subreq->retry_count++; spin_lock_bh(&rreq->lock); list_add_tail(&subreq->rreq_link, &rreq->subrequests); @@ -213,7 +216,6 @@ abandon: subreq->error = -ENOMEM; __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __clear_bit(NETFS_SREQ_RETRYING, &subreq->flags); } spin_lock_bh(&rreq->lock); list_splice_tail_init(&queue, &rreq->subrequests); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 82290c92ba7a..ca3a11ed9b54 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -179,7 +179,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, struct iov_iter source = subreq->io_iter; iov_iter_revert(&source, subreq->len - source.count); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_write(stream, subreq, 
&source); } @@ -234,7 +233,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, /* Renegotiate max_len (wsize) */ trace_netfs_sreq(subreq, netfs_sreq_trace_retry); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + subreq->retry_count++; stream->prepare_write(subreq); part = min(len, stream->sreq_max_len); @@ -279,7 +278,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, subreq->start = start; subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); subreq->stream_nr = to->stream_nr; - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + subreq->retry_count = 1; trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index bf6d507578e5..ff0e82505a0b 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -244,6 +244,8 @@ void netfs_reissue_write(struct netfs_io_stream *stream, iov_iter_advance(source, size); iov_iter_truncate(&subreq->io_iter, size); + subreq->retry_count++; + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); netfs_do_issue_write(stream, subreq); } diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index bd42a419458e..6cb1e81993f8 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1319,14 +1319,16 @@ cifs_readv_callback(struct mid_q_entry *mid) } if (rdata->result == -ENODATA) { - __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && rdata->subreq.start + trans == ictx->remote_i_size) { - __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + } else if (rdata->got_bytes > 0) { + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } } @@ -1670,10 +1672,13 @@ cifs_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; - if (written < wdata->subreq.len) + if (written < wdata->subreq.len) { result = -ENOSPC; - else + } else { result = written; + if (written > 0) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags); + } break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 010eae9d6c47..458b53d1f9cb 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4615,6 +4615,7 @@ smb2_readv_callback(struct mid_q_entry *mid) __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; } + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, server->credits, server->in_flight, @@ -4840,10 +4841,12 @@ smb2_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; - if (written < wdata->subreq.len) + if (written < wdata->subreq.len) { wdata->result = -ENOSPC; - else + } else if (written > 0) { wdata->subreq.len = written; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags); + } break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: @@ -5012,7 +5015,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) } #endif - if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags)) + if (wdata->subreq.retry_count > 0) smb2_set_replay(server, &rqst); cifs_dbg(FYI, "async write at %llu %u 
bytes iter=%zx\n", diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5eaceef41e6c..4083d77e3f39 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -185,6 +185,7 @@ struct netfs_io_subrequest { short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs in io_iter */ + u8 retry_count; /* The number of retries (0 on initial pass) */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ unsigned char curr_folioq_slot; /* Folio currently being read */ @@ -194,14 +195,13 @@ struct netfs_io_subrequest { #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ -#define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ +#define NETFS_SREQ_MADE_PROGRESS 4 /* Set if we transferred at least some data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */ #define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */ #define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ -#define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ -#define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ +#define NETFS_SREQ_FAILED 10 /* Set if the subreq failed unretryably */ }; enum netfs_io_origin { -- cgit v1.2.3 From 38cf8e945721ffe708fa675507465da7f4f2a9f7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:09 +0000 Subject: netfs: Fix ceph copy to cache on write-begin At the end of netfs_unlock_read_folio() in which folios are marked appropriately for copying to the cache (either with by being marked dirty and having their private data set or by having PG_private_2 set) and then unlocked, the folio_queue struct has the entry pointing to the folio cleared. This presents a problem for netfs_pgpriv2_write_to_the_cache(), which is used to write folios marked with PG_private_2 to the cache as it expects to be able to trawl the folio_queue list thereafter to find the relevant folios, leading to a hang. Fix this by not clearing the folio_queue entry if we're going to do the deprecated copy-to-cache. The clearance will be done instead as the folios are written to the cache. This can be reproduced by starting cachefiles, mounting a ceph filesystem with "-o fsc" and writing to it. 
Fixes: 796a4049640b ("netfs: In readahead, put the folio refs as soon extracted") Reported-by: Max Kellermann Closes: https://lore.kernel.org/r/CAKPOu+_4m80thNy5_fvROoxBm689YtA0dZ-=gcmkzwYSY4syqw@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-10-dhowells@redhat.com Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") cc: Jeff Layton cc: Ilya Dryomov cc: Xiubo Li cc: netfs@lists.linux.dev cc: ceph-devel@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 47ed3a5044e2..e8624f5c7fcc 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -62,10 +62,14 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, } else { trace_netfs_folio(folio, netfs_folio_trace_read_done); } + + folioq_clear(folioq, slot); } else { // TODO: Use of PG_private_2 is deprecated. if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot); + else + folioq_clear(folioq, slot); } if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { @@ -77,8 +81,6 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, folio_unlock(folio); } } - - folioq_clear(folioq, slot); } /* -- cgit v1.2.3 From d0327c824338cdccad058723a31d038ecd553409 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:10 +0000 Subject: netfs: Fix the (non-)cancellation of copy when cache is temporarily disabled When the caching for a cookie is temporarily disabled (e.g. due to a DIO write on that file), future copying to the cache for that file is disabled until all fds open on that file are closed. However, if netfslib is using the deprecated PG_private_2 method (such as is currently used by ceph), and decides it wants to copy to the cache, netfs_advance_write() will just bail at the first check seeing that the cache stream is unavailable, and indicate that it dealt with all the content. This means that we have no subrequests to provide notifications to drive the state machine or even to pin the request and the request just gets discarded, leaving the folios with PG_private_2 set. Fix this by jumping directly to cancel the request if the cache is not available. That way, we don't remove mark3 from the folio_queue list and netfs_pgpriv2_cancel() will clean up the folios. This was found by running the generic/013 xfstest against ceph with an active cache and the "-o fsc" option passed to ceph. 
That would usually hang Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Reported-by: Max Kellermann Closes: https://lore.kernel.org/r/CAKPOu+_4m80thNy5_fvROoxBm689YtA0dZ-=gcmkzwYSY4syqw@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-11-dhowells@redhat.com cc: Jeff Layton cc: Ilya Dryomov cc: Xiubo Li cc: netfs@lists.linux.dev cc: ceph-devel@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_pgpriv2.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index ba5af89d37fa..54d5004fec18 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -170,6 +170,10 @@ void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq) trace_netfs_write(wreq, netfs_write_trace_copy_to_cache); netfs_stat(&netfs_n_wh_copy_to_cache); + if (!wreq->io_streams[1].avail) { + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + goto couldnt_start; + } for (;;) { error = netfs_pgpriv2_copy_folio(wreq, folio); -- cgit v1.2.3 From d4e338de17cb6532bf805fae00db8b41e914009b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:34:45 +0000 Subject: netfs: Fix is-caching check in read-retry netfs: Fix is-caching check in read-retry The read-retry code checks the NETFS_RREQ_COPY_TO_CACHE flag to determine if there might be failed reads from the cache that need turning into reads from the server, with the intention of skipping the complicated part if it can. The code that set the flag, however, got lost during the read-side rewrite. Fix the check to see if the cache_resources are valid instead. The flag can then be removed. Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Signed-off-by: David Howells Link: https://lore.kernel.org/r/3752048.1734381285@warthog.procyon.org.uk cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_retry.c | 2 +- include/linux/netfs.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0e72e9226fc8..21b4a54e545e 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -49,7 +49,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) * up to the first permanently failed one. 
*/ if (!rreq->netfs_ops->prepare_read && - !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) { + !rreq->cache_resources.ops) { struct netfs_io_subrequest *subreq; list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 4083d77e3f39..ecdd5ced16a8 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -269,7 +269,6 @@ struct netfs_io_request { size_t prev_donated; /* Fallback for subreq->prev_donated */ refcount_t ref; unsigned long flags; -#define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ -- cgit v1.2.3 From eb1181594417dafad0f75808ead71f6d5170c1ea Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:53 +0000 Subject: netfs: Use a folio_queue allocation and free functions Provide and use folio_queue allocation and free functions to combine the allocation, initialisation and stat (un)accounting steps that are repeated in several places. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-4-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 12 +++--------- fs/netfs/misc.c | 38 ++++++++++++++++++++++++++++++++++---- include/linux/netfs.h | 5 +++++ 3 files changed, 42 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 4dc9b8286355..78b04763bed6 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -131,11 +131,9 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) struct folio_queue *tail = rreq->buffer_tail, *new; size_t added; - new = kmalloc(sizeof(*new), GFP_NOFS); + new = netfs_folioq_alloc(GFP_NOFS); if (!new) return -ENOMEM; - netfs_stat(&netfs_n_folioq); - folioq_init(new); new->prev = tail; tail->next = new; rreq->buffer_tail = new; @@ -363,11 +361,9 @@ static int netfs_prime_buffer(struct netfs_io_request *rreq) struct folio_batch put_batch; size_t added; - folioq = kmalloc(sizeof(*folioq), GFP_KERNEL); + folioq = netfs_folioq_alloc(GFP_KERNEL); if (!folioq) return -ENOMEM; - netfs_stat(&netfs_n_folioq); - folioq_init(folioq); rreq->buffer = folioq; rreq->buffer_tail = folioq; rreq->submitted = rreq->start; @@ -440,12 +436,10 @@ static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct fo { struct folio_queue *folioq; - folioq = kmalloc(sizeof(*folioq), GFP_KERNEL); + folioq = netfs_folioq_alloc(GFP_KERNEL); if (!folioq) return -ENOMEM; - netfs_stat(&netfs_n_folioq); - folioq_init(folioq); folioq_append(folioq, folio); BUG_ON(folioq_folio(folioq, 0) != folio); BUG_ON(folioq_folio_order(folioq, 0) != folio_order(folio)); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 78fe5796b2b2..6cd7e1ee7a14 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -8,6 +8,38 @@ #include #include "internal.h" +/** + * netfs_folioq_alloc - Allocate a folio_queue struct + * @gfp: Allocation constraints + * + * Allocate, initialise and account the folio_queue struct. 
+ */ +struct folio_queue *netfs_folioq_alloc(gfp_t gfp) +{ + struct folio_queue *fq; + + fq = kmalloc(sizeof(*fq), gfp); + if (fq) { + netfs_stat(&netfs_n_folioq); + folioq_init(fq); + } + return fq; +} +EXPORT_SYMBOL(netfs_folioq_alloc); + +/** + * netfs_folioq_free - Free a folio_queue struct + * @folioq: The object to free + * + * Free and unaccount the folio_queue struct. + */ +void netfs_folioq_free(struct folio_queue *folioq) +{ + netfs_stat_d(&netfs_n_folioq); + kfree(folioq); +} +EXPORT_SYMBOL(netfs_folioq_free); + /* * Make sure there's space in the rolling queue. */ @@ -87,8 +119,7 @@ struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq) if (next) next->prev = NULL; - netfs_stat_d(&netfs_n_folioq); - kfree(head); + netfs_folioq_free(head); wreq->buffer = next; return next; } @@ -111,8 +142,7 @@ void netfs_clear_buffer(struct netfs_io_request *rreq) folio_put(folio); } } - netfs_stat_d(&netfs_n_folioq); - kfree(p); + netfs_folioq_free(p); } } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ecdd5ced16a8..c69e0f02c30f 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -21,6 +21,7 @@ enum netfs_sreq_ref_trace; typedef struct mempool_s mempool_t; +struct folio_queue; /** * folio_start_private_2 - Start an fscache write on a folio. [DEPRECATED] @@ -453,6 +454,10 @@ void netfs_end_io_write(struct inode *inode); int netfs_start_io_direct(struct inode *inode); void netfs_end_io_direct(struct inode *inode); +/* Miscellaneous APIs. */ +struct folio_queue *netfs_folioq_alloc(gfp_t gfp); +void netfs_folioq_free(struct folio_queue *folioq); + /** * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query -- cgit v1.2.3 From aabcabf2746062253565b33aa3f8d25999a5ac01 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:54 +0000 Subject: netfs: Add a tracepoint to log the lifespan of folio_queue structs Add a tracepoint to log the lifespan of folio_queue structs. For tracing illustrative purposes, folio_queues are tagged with the debug ID of whatever they're related to (typically a netfs_io_request) and a debug ID of their own. 
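After these two patches, a caller allocates and frees a segment through the accounted helpers, passing the owning request's debug ID and a trace tag. A rough usage sketch, assuming a netfs_io_request "rreq" and the trace tags added by the diff below (error handling and actual buffer wiring elided):

#include <linux/netfs.h>

static int demo_add_segment(struct netfs_io_request *rreq)
{
	struct folio_queue *fq;

	fq = netfs_folioq_alloc(rreq->debug_id, GFP_KERNEL,
				netfs_trace_folioq_alloc_read_prime);
	if (!fq)
		return -ENOMEM;

	/* ... hook fq into the request's rolling buffer ... */

	netfs_folioq_free(fq, netfs_trace_folioq_clear);
	return 0;
}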
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-5-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 10 +++++++--- fs/netfs/internal.h | 3 ++- fs/netfs/misc.c | 31 ++++++++++++++++++++----------- fs/netfs/read_collect.c | 8 ++++++-- fs/netfs/write_issue.c | 2 +- fs/smb/client/smb2ops.c | 2 +- include/linux/folio_queue.h | 12 +++++++++--- include/linux/netfs.h | 6 ++++-- include/trace/events/netfs.h | 41 +++++++++++++++++++++++++++++++++++++++-- lib/kunit_iov_iter.c | 4 ++-- 10 files changed, 91 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 78b04763bed6..7ec04d5628d8 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -131,7 +131,8 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) struct folio_queue *tail = rreq->buffer_tail, *new; size_t added; - new = netfs_folioq_alloc(GFP_NOFS); + new = netfs_folioq_alloc(rreq->debug_id, GFP_NOFS, + netfs_trace_folioq_alloc_read_prep); if (!new) return -ENOMEM; new->prev = tail; @@ -361,9 +362,11 @@ static int netfs_prime_buffer(struct netfs_io_request *rreq) struct folio_batch put_batch; size_t added; - folioq = netfs_folioq_alloc(GFP_KERNEL); + folioq = netfs_folioq_alloc(rreq->debug_id, GFP_KERNEL, + netfs_trace_folioq_alloc_read_prime); if (!folioq) return -ENOMEM; + rreq->buffer = folioq; rreq->buffer_tail = folioq; rreq->submitted = rreq->start; @@ -436,7 +439,8 @@ static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct fo { struct folio_queue *folioq; - folioq = netfs_folioq_alloc(GFP_KERNEL); + folioq = netfs_folioq_alloc(rreq->debug_id, GFP_KERNEL, + netfs_trace_folioq_alloc_read_sing); if (!folioq) return -ENOMEM; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index c562aec3b483..01b013f558f7 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -58,7 +58,8 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} /* * misc.c */ -struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq); +struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq, + enum netfs_folioq_trace trace); int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, bool needs_put); struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 6cd7e1ee7a14..afe032551de5 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -10,18 +10,25 @@ /** * netfs_folioq_alloc - Allocate a folio_queue struct + * @rreq_id: Associated debugging ID for tracing purposes * @gfp: Allocation constraints + * @trace: Trace tag to indicate the purpose of the allocation * - * Allocate, initialise and account the folio_queue struct. + * Allocate, initialise and account the folio_queue struct and log a trace line + * to mark the allocation. 
*/ -struct folio_queue *netfs_folioq_alloc(gfp_t gfp) +struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp, + unsigned int /*enum netfs_folioq_trace*/ trace) { + static atomic_t debug_ids; struct folio_queue *fq; fq = kmalloc(sizeof(*fq), gfp); if (fq) { netfs_stat(&netfs_n_folioq); - folioq_init(fq); + folioq_init(fq, rreq_id); + fq->debug_id = atomic_inc_return(&debug_ids); + trace_netfs_folioq(fq, trace); } return fq; } @@ -30,11 +37,14 @@ EXPORT_SYMBOL(netfs_folioq_alloc); /** * netfs_folioq_free - Free a folio_queue struct * @folioq: The object to free + * @trace: Trace tag to indicate which free * * Free and unaccount the folio_queue struct. */ -void netfs_folioq_free(struct folio_queue *folioq) +void netfs_folioq_free(struct folio_queue *folioq, + unsigned int /*enum netfs_trace_folioq*/ trace) { + trace_netfs_folioq(folioq, trace); netfs_stat_d(&netfs_n_folioq); kfree(folioq); } @@ -43,7 +53,8 @@ EXPORT_SYMBOL(netfs_folioq_free); /* * Make sure there's space in the rolling queue. */ -struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq) +struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq, + enum netfs_folioq_trace trace) { struct folio_queue *tail = rreq->buffer_tail, *prev; unsigned int prev_nr_slots = 0; @@ -59,11 +70,9 @@ struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq) prev_nr_slots = folioq_nr_slots(tail); } - tail = kmalloc(sizeof(*tail), GFP_NOFS); + tail = netfs_folioq_alloc(rreq->debug_id, GFP_NOFS, trace); if (!tail) return ERR_PTR(-ENOMEM); - netfs_stat(&netfs_n_folioq); - folioq_init(tail); tail->prev = prev; if (prev) /* [!] NOTE: After we set prev->next, the consumer is entirely @@ -98,7 +107,7 @@ int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio struct folio_queue *tail; unsigned int slot, order = folio_order(folio); - tail = netfs_buffer_make_space(rreq); + tail = netfs_buffer_make_space(rreq, netfs_trace_folioq_alloc_append_folio); if (IS_ERR(tail)) return PTR_ERR(tail); @@ -119,7 +128,7 @@ struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq) if (next) next->prev = NULL; - netfs_folioq_free(head); + netfs_folioq_free(head, netfs_trace_folioq_delete); wreq->buffer = next; return next; } @@ -142,7 +151,7 @@ void netfs_clear_buffer(struct netfs_io_request *rreq) folio_put(folio); } } - netfs_folioq_free(p); + netfs_folioq_free(p, netfs_trace_folioq_clear); } } diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index e8624f5c7fcc..f7a5cb29dd12 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -103,6 +103,7 @@ static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was subreq->transferred, subreq->len)) subreq->transferred = subreq->len; + trace_netfs_folioq(folioq, netfs_trace_folioq_read_progress); next_folio: fsize = PAGE_SIZE << subreq->curr_folio_order; fpos = round_down(subreq->start + subreq->consumed, fsize); @@ -119,9 +120,11 @@ next_folio: if (folioq) { struct folio *folio = folioq_folio(folioq, slot); - pr_err("folioq: orders=%02x%02x%02x%02x\n", + pr_err("folioq: fq=%x orders=%02x%02x%02x%02x %px\n", + folioq->debug_id, folioq->orders[0], folioq->orders[1], - folioq->orders[2], folioq->orders[3]); + folioq->orders[2], folioq->orders[3], + folioq); if (folio) pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n", fpos, fend - 1, folio_pos(folio), folio_order(folio), @@ -222,6 +225,7 @@ donation_changed: slot = 0; folioq = folioq->next; subreq->curr_folioq = folioq; + 
trace_netfs_folioq(folioq, netfs_trace_folioq_read_progress); } subreq->curr_folioq_slot = slot; if (folioq && folioq_folio(folioq, slot)) diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index ff0e82505a0b..87e5cf4a0957 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -161,7 +161,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, */ if (iov_iter_is_folioq(wreq_iter) && wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq)) { - netfs_buffer_make_space(wreq); + netfs_buffer_make_space(wreq, netfs_trace_folioq_prep_write); } subreq = netfs_alloc_subrequest(wreq); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 87cb1872db28..7121d9e0f404 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4388,7 +4388,7 @@ static struct folio_queue *cifs_alloc_folioq_buffer(ssize_t size) p = kmalloc(sizeof(*p), GFP_NOFS); if (!p) goto nomem; - folioq_init(p); + folioq_init(p, 0); if (tail) { tail->next = p; p->prev = tail; diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index 3abe614ef5f0..4d3f8074c137 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -37,16 +37,20 @@ struct folio_queue { #if PAGEVEC_SIZE > BITS_PER_LONG #error marks is not big enough #endif + unsigned int rreq_id; + unsigned int debug_id; }; /** * folioq_init - Initialise a folio queue segment * @folioq: The segment to initialise + * @rreq_id: The request identifier to use in tracelines. * - * Initialise a folio queue segment. Note that the folio pointers are - * left uninitialised. + * Initialise a folio queue segment and set an identifier to be used in traces. + * + * Note that the folio pointers are left uninitialised. */ -static inline void folioq_init(struct folio_queue *folioq) +static inline void folioq_init(struct folio_queue *folioq, unsigned int rreq_id) { folio_batch_init(&folioq->vec); folioq->next = NULL; @@ -54,6 +58,8 @@ static inline void folioq_init(struct folio_queue *folioq) folioq->marks = 0; folioq->marks2 = 0; folioq->marks3 = 0; + folioq->rreq_id = rreq_id; + folioq->debug_id = 0; } /** diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c69e0f02c30f..5b2f427f8e3e 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -455,8 +455,10 @@ int netfs_start_io_direct(struct inode *inode); void netfs_end_io_direct(struct inode *inode); /* Miscellaneous APIs. 
*/ -struct folio_queue *netfs_folioq_alloc(gfp_t gfp); -void netfs_folioq_free(struct folio_queue *folioq); +struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp, + unsigned int trace /*enum netfs_folioq_trace*/); +void netfs_folioq_free(struct folio_queue *folioq, + unsigned int trace /*enum netfs_trace_folioq*/); /** * netfs_inode - Get the netfs inode context from the inode diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index c3c309f8fbe1..50aa6745df95 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -191,6 +191,16 @@ EM(netfs_trace_donate_to_next, "to-next") \ E_(netfs_trace_donate_to_deferred_next, "defer-next") +#define netfs_folioq_traces \ + EM(netfs_trace_folioq_alloc_append_folio, "alloc-apf") \ + EM(netfs_trace_folioq_alloc_read_prep, "alloc-r-prep") \ + EM(netfs_trace_folioq_alloc_read_prime, "alloc-r-prime") \ + EM(netfs_trace_folioq_alloc_read_sing, "alloc-r-sing") \ + EM(netfs_trace_folioq_clear, "clear") \ + EM(netfs_trace_folioq_delete, "delete") \ + EM(netfs_trace_folioq_prep_write, "prep-wr") \ + E_(netfs_trace_folioq_read_progress, "r-progress") + #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY @@ -209,6 +219,7 @@ enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } __mode(byte); enum netfs_folio_trace { netfs_folio_traces } __mode(byte); enum netfs_collect_contig_trace { netfs_collect_contig_traces } __mode(byte); enum netfs_donate_trace { netfs_donate_traces } __mode(byte); +enum netfs_folioq_trace { netfs_folioq_traces } __mode(byte); #endif @@ -232,6 +243,7 @@ netfs_sreq_ref_traces; netfs_folio_traces; netfs_collect_contig_traces; netfs_donate_traces; +netfs_folioq_traces; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -317,6 +329,7 @@ TRACE_EVENT(netfs_sreq, __field(unsigned short, flags) __field(enum netfs_io_source, source) __field(enum netfs_sreq_trace, what) + __field(u8, slot) __field(size_t, len) __field(size_t, transferred) __field(loff_t, start) @@ -332,15 +345,16 @@ TRACE_EVENT(netfs_sreq, __entry->len = sreq->len; __entry->transferred = sreq->transferred; __entry->start = sreq->start; + __entry->slot = sreq->curr_folioq_slot; ), - TP_printk("R=%08x[%x] %s %s f=%02x s=%llx %zx/%zx e=%d", + TP_printk("R=%08x[%x] %s %s f=%02x s=%llx %zx/%zx s=%u e=%d", __entry->rreq, __entry->index, __print_symbolic(__entry->source, netfs_sreq_sources), __print_symbolic(__entry->what, netfs_sreq_traces), __entry->flags, __entry->start, __entry->transferred, __entry->len, - __entry->error) + __entry->slot, __entry->error) ); TRACE_EVENT(netfs_failure, @@ -745,6 +759,29 @@ TRACE_EVENT(netfs_donate, __entry->amount) ); +TRACE_EVENT(netfs_folioq, + TP_PROTO(const struct folio_queue *fq, + enum netfs_folioq_trace trace), + + TP_ARGS(fq, trace), + + TP_STRUCT__entry( + __field(unsigned int, rreq) + __field(unsigned int, id) + __field(enum netfs_folioq_trace, trace) + ), + + TP_fast_assign( + __entry->rreq = fq ? fq->rreq_id : 0; + __entry->id = fq ? 
fq->debug_id : 0; + __entry->trace = trace; + ), + + TP_printk("R=%08x fq=%x %s", + __entry->rreq, __entry->id, + __print_symbolic(__entry->trace, netfs_folioq_traces)) + ); + #undef EM #undef E_ #endif /* _TRACE_NETFS_H */ diff --git a/lib/kunit_iov_iter.c b/lib/kunit_iov_iter.c index 13e15687675a..10a560feb66e 100644 --- a/lib/kunit_iov_iter.c +++ b/lib/kunit_iov_iter.c @@ -392,7 +392,7 @@ static void __init iov_kunit_load_folioq(struct kunit *test, if (folioq_full(p)) { p->next = kzalloc(sizeof(struct folio_queue), GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p->next); - folioq_init(p->next); + folioq_init(p->next, 0); p->next->prev = p; p = p->next; } @@ -409,7 +409,7 @@ static struct folio_queue *iov_kunit_create_folioq(struct kunit *test) folioq = kzalloc(sizeof(struct folio_queue), GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, folioq); kunit_add_action_or_reset(test, iov_kunit_destroy_folioq, folioq); - folioq_init(folioq); + folioq_init(folioq, 0); return folioq; } -- cgit v1.2.3 From 06fa229ceb36898e68022b5654c017d2c6582d7d Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:55 +0000 Subject: netfs: Abstract out a rolling folio buffer implementation A rolling buffer is a series of folios held in a list of folio_queues. New folios and folio_queue structs may be inserted at the head simultaneously with spent ones being removed from the tail without the need for locking. The rolling buffer includes an iov_iter and it has to be careful managing this as the list of folio_queues is extended such that an oops doesn't incurred because the iterator was pointing to the end of a folio_queue segment that got appended to and then removed. We need to use the mechanism twice, once for read and once for write, and, in future patches, we will use a second rolling buffer to handle bounce buffering for content encryption. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-6-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/Makefile | 1 + fs/netfs/buffered_read.c | 119 +++++----------------- fs/netfs/direct_read.c | 14 +-- fs/netfs/direct_write.c | 10 +- fs/netfs/internal.h | 4 - fs/netfs/misc.c | 147 --------------------------- fs/netfs/objects.c | 2 +- fs/netfs/read_pgpriv2.c | 32 +++--- fs/netfs/read_retry.c | 2 +- fs/netfs/rolling_buffer.c | 226 +++++++++++++++++++++++++++++++++++++++++ fs/netfs/write_collect.c | 19 ++-- fs/netfs/write_issue.c | 26 +++-- include/linux/netfs.h | 10 +- include/linux/rolling_buffer.h | 61 +++++++++++ include/trace/events/netfs.h | 2 + 15 files changed, 375 insertions(+), 300 deletions(-) create mode 100644 fs/netfs/rolling_buffer.c create mode 100644 include/linux/rolling_buffer.h (limited to 'fs') diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index d08b0bfb6756..7492c4aa331e 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -13,6 +13,7 @@ netfs-y := \ read_collect.o \ read_pgpriv2.o \ read_retry.o \ + rolling_buffer.o \ write_collect.o \ write_issue.o diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 7ec04d5628d8..db874fea8794 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -63,37 +63,6 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx)); } -/* - * Decant the list of folios to read into a rolling buffer. 
- */ -static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq, - struct folio_queue *folioq, - struct folio_batch *put_batch) -{ - unsigned int order, nr; - size_t size = 0; - - nr = __readahead_batch(rreq->ractl, (struct page **)folioq->vec.folios, - ARRAY_SIZE(folioq->vec.folios)); - folioq->vec.nr = nr; - for (int i = 0; i < nr; i++) { - struct folio *folio = folioq_folio(folioq, i); - - trace_netfs_folio(folio, netfs_folio_trace_read); - order = folio_order(folio); - folioq->orders[i] = order; - size += PAGE_SIZE << order; - - if (!folio_batch_add(put_batch, folio)) - folio_batch_release(put_batch); - } - - for (int i = nr; i < folioq_nr_slots(folioq); i++) - folioq_clear(folioq, i); - - return size; -} - /* * netfs_prepare_read_iterator - Prepare the subreq iterator for I/O * @subreq: The subrequest to be set up @@ -128,18 +97,12 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) folio_batch_init(&put_batch); while (rreq->submitted < subreq->start + rsize) { - struct folio_queue *tail = rreq->buffer_tail, *new; - size_t added; - - new = netfs_folioq_alloc(rreq->debug_id, GFP_NOFS, - netfs_trace_folioq_alloc_read_prep); - if (!new) - return -ENOMEM; - new->prev = tail; - tail->next = new; - rreq->buffer_tail = new; - added = netfs_load_buffer_from_ra(rreq, new, &put_batch); - rreq->iter.count += added; + ssize_t added; + + added = rolling_buffer_load_from_ra(&rreq->buffer, rreq->ractl, + &put_batch); + if (added < 0) + return added; rreq->submitted += added; } folio_batch_release(&put_batch); @@ -147,7 +110,7 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) subreq->len = rsize; if (unlikely(rreq->io_streams[0].sreq_max_segs)) { - size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize, + size_t limit = netfs_limit_iter(&rreq->buffer.iter, 0, rsize, rreq->io_streams[0].sreq_max_segs); if (limit < rsize) { @@ -156,20 +119,16 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) } } - subreq->io_iter = rreq->iter; + subreq->io_iter = rreq->buffer.iter; if (iov_iter_is_folioq(&subreq->io_iter)) { - if (subreq->io_iter.folioq_slot >= folioq_nr_slots(subreq->io_iter.folioq)) { - subreq->io_iter.folioq = subreq->io_iter.folioq->next; - subreq->io_iter.folioq_slot = 0; - } subreq->curr_folioq = (struct folio_queue *)subreq->io_iter.folioq; subreq->curr_folioq_slot = subreq->io_iter.folioq_slot; subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot]; } iov_iter_truncate(&subreq->io_iter, subreq->len); - iov_iter_advance(&rreq->iter, subreq->len); + rolling_buffer_advance(&rreq->buffer, subreq->len); return subreq->len; } @@ -352,34 +311,6 @@ static int netfs_wait_for_read(struct netfs_io_request *rreq) return ret; } -/* - * Set up the initial folioq of buffer folios in the rolling buffer and set the - * iterator to refer to it. 
- */ -static int netfs_prime_buffer(struct netfs_io_request *rreq) -{ - struct folio_queue *folioq; - struct folio_batch put_batch; - size_t added; - - folioq = netfs_folioq_alloc(rreq->debug_id, GFP_KERNEL, - netfs_trace_folioq_alloc_read_prime); - if (!folioq) - return -ENOMEM; - - rreq->buffer = folioq; - rreq->buffer_tail = folioq; - rreq->submitted = rreq->start; - iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, 0); - - folio_batch_init(&put_batch); - added = netfs_load_buffer_from_ra(rreq, folioq, &put_batch); - folio_batch_release(&put_batch); - rreq->iter.count += added; - rreq->submitted += added; - return 0; -} - /** * netfs_readahead - Helper to manage a read request * @ractl: The description of the readahead request @@ -419,7 +350,8 @@ void netfs_readahead(struct readahead_control *ractl) netfs_rreq_expand(rreq, ractl); rreq->ractl = ractl; - if (netfs_prime_buffer(rreq) < 0) + rreq->submitted = rreq->start; + if (rolling_buffer_init(&rreq->buffer, rreq->debug_id, ITER_DEST) < 0) goto cleanup_free; netfs_read_to_pagecache(rreq); @@ -435,22 +367,18 @@ EXPORT_SYMBOL(netfs_readahead); /* * Create a rolling buffer with a single occupying folio. */ -static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio) +static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio, + unsigned int rollbuf_flags) { - struct folio_queue *folioq; + ssize_t added; - folioq = netfs_folioq_alloc(rreq->debug_id, GFP_KERNEL, - netfs_trace_folioq_alloc_read_sing); - if (!folioq) + if (rolling_buffer_init(&rreq->buffer, rreq->debug_id, ITER_DEST) < 0) return -ENOMEM; - folioq_append(folioq, folio); - BUG_ON(folioq_folio(folioq, 0) != folio); - BUG_ON(folioq_folio_order(folioq, 0) != folio_order(folio)); - rreq->buffer = folioq; - rreq->buffer_tail = folioq; - rreq->submitted = rreq->start + rreq->len; - iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, rreq->len); + added = rolling_buffer_append(&rreq->buffer, folio, rollbuf_flags); + if (added < 0) + return added; + rreq->submitted = rreq->start + added; rreq->ractl = (struct readahead_control *)1UL; return 0; } @@ -518,7 +446,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) } if (to < flen) bvec_set_folio(&bvec[i++], folio, flen - to, to); - iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len); + iov_iter_bvec(&rreq->buffer.iter, ITER_DEST, bvec, i, rreq->len); rreq->submitted = rreq->start + flen; netfs_read_to_pagecache(rreq); @@ -586,7 +514,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); /* Set up the output buffer */ - ret = netfs_create_singular_buffer(rreq, folio); + ret = netfs_create_singular_buffer(rreq, folio, 0); if (ret < 0) goto discard; @@ -743,7 +671,7 @@ retry: trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); /* Set up the output buffer */ - ret = netfs_create_singular_buffer(rreq, folio); + ret = netfs_create_singular_buffer(rreq, folio, 0); if (ret < 0) goto error_put; @@ -808,11 +736,10 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write); /* Set up the output buffer */ - ret = netfs_create_singular_buffer(rreq, folio); + ret = netfs_create_singular_buffer(rreq, folio, NETFS_ROLLBUF_PAGECACHE_MARK); if (ret < 0) goto error_put; - folioq_mark2(rreq->buffer, 0); netfs_read_to_pagecache(rreq); ret = netfs_wait_for_read(rreq); 
netfs_put_request(rreq, false, netfs_rreq_trace_put_return); diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index b1a66a6e6bc2..a3f23adbae0f 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -25,7 +25,7 @@ static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq) subreq->len = rsize; if (unlikely(rreq->io_streams[0].sreq_max_segs)) { - size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize, + size_t limit = netfs_limit_iter(&rreq->buffer.iter, 0, rsize, rreq->io_streams[0].sreq_max_segs); if (limit < rsize) { @@ -36,9 +36,9 @@ static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq) trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); - subreq->io_iter = rreq->iter; + subreq->io_iter = rreq->buffer.iter; iov_iter_truncate(&subreq->io_iter, subreq->len); - iov_iter_advance(&rreq->iter, subreq->len); + iov_iter_advance(&rreq->buffer.iter, subreq->len); } /* @@ -199,15 +199,15 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i * the request. */ if (user_backed_iter(iter)) { - ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0); + ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0); if (ret < 0) goto out; - rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec; + rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec; rreq->direct_bv_count = ret; rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); - rreq->len = iov_iter_count(&rreq->iter); + rreq->len = iov_iter_count(&rreq->buffer.iter); } else { - rreq->iter = *iter; + rreq->buffer.iter = *iter; rreq->len = orig_count; rreq->direct_bv_unpin = false; iov_iter_advance(iter, orig_count); diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 173e8b5e6a93..eded8afaa60b 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -68,19 +68,17 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * * request. 
*/ if (async || user_backed_iter(iter)) { - n = netfs_extract_user_iter(iter, len, &wreq->iter, 0); + n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0); if (n < 0) { ret = n; goto out; } - wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec; + wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec; wreq->direct_bv_count = n; wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); } else { - wreq->iter = *iter; + wreq->buffer.iter = *iter; } - - wreq->io_iter = wreq->iter; } __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags); @@ -92,7 +90,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); if (async) wreq->iocb = iocb; - wreq->len = iov_iter_count(&wreq->io_iter); + wreq->len = iov_iter_count(&wreq->buffer.iter); wreq->cleanup = netfs_cleanup_dio_write; ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len); if (ret < 0) { diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 01b013f558f7..ccd9058acb61 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -60,10 +60,6 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} */ struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq, enum netfs_folioq_trace trace); -int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, - bool needs_put); -struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq); -void netfs_clear_buffer(struct netfs_io_request *rreq); void netfs_reset_iter(struct netfs_io_subrequest *subreq); /* diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index afe032551de5..4249715f4171 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -8,153 +8,6 @@ #include #include "internal.h" -/** - * netfs_folioq_alloc - Allocate a folio_queue struct - * @rreq_id: Associated debugging ID for tracing purposes - * @gfp: Allocation constraints - * @trace: Trace tag to indicate the purpose of the allocation - * - * Allocate, initialise and account the folio_queue struct and log a trace line - * to mark the allocation. - */ -struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp, - unsigned int /*enum netfs_folioq_trace*/ trace) -{ - static atomic_t debug_ids; - struct folio_queue *fq; - - fq = kmalloc(sizeof(*fq), gfp); - if (fq) { - netfs_stat(&netfs_n_folioq); - folioq_init(fq, rreq_id); - fq->debug_id = atomic_inc_return(&debug_ids); - trace_netfs_folioq(fq, trace); - } - return fq; -} -EXPORT_SYMBOL(netfs_folioq_alloc); - -/** - * netfs_folioq_free - Free a folio_queue struct - * @folioq: The object to free - * @trace: Trace tag to indicate which free - * - * Free and unaccount the folio_queue struct. - */ -void netfs_folioq_free(struct folio_queue *folioq, - unsigned int /*enum netfs_trace_folioq*/ trace) -{ - trace_netfs_folioq(folioq, trace); - netfs_stat_d(&netfs_n_folioq); - kfree(folioq); -} -EXPORT_SYMBOL(netfs_folioq_free); - -/* - * Make sure there's space in the rolling queue. 
- */ -struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq, - enum netfs_folioq_trace trace) -{ - struct folio_queue *tail = rreq->buffer_tail, *prev; - unsigned int prev_nr_slots = 0; - - if (WARN_ON_ONCE(!rreq->buffer && tail) || - WARN_ON_ONCE(rreq->buffer && !tail)) - return ERR_PTR(-EIO); - - prev = tail; - if (prev) { - if (!folioq_full(tail)) - return tail; - prev_nr_slots = folioq_nr_slots(tail); - } - - tail = netfs_folioq_alloc(rreq->debug_id, GFP_NOFS, trace); - if (!tail) - return ERR_PTR(-ENOMEM); - tail->prev = prev; - if (prev) - /* [!] NOTE: After we set prev->next, the consumer is entirely - * at liberty to delete prev. - */ - WRITE_ONCE(prev->next, tail); - - rreq->buffer_tail = tail; - if (!rreq->buffer) { - rreq->buffer = tail; - iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0); - } else { - /* Make sure we don't leave the master iterator pointing to a - * block that might get immediately consumed. - */ - if (rreq->io_iter.folioq == prev && - rreq->io_iter.folioq_slot == prev_nr_slots) { - rreq->io_iter.folioq = tail; - rreq->io_iter.folioq_slot = 0; - } - } - rreq->buffer_tail_slot = 0; - return tail; -} - -/* - * Append a folio to the rolling queue. - */ -int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, - bool needs_put) -{ - struct folio_queue *tail; - unsigned int slot, order = folio_order(folio); - - tail = netfs_buffer_make_space(rreq, netfs_trace_folioq_alloc_append_folio); - if (IS_ERR(tail)) - return PTR_ERR(tail); - - rreq->io_iter.count += PAGE_SIZE << order; - - slot = folioq_append(tail, folio); - /* Store the counter after setting the slot. */ - smp_store_release(&rreq->buffer_tail_slot, slot); - return 0; -} - -/* - * Delete the head of a rolling queue. - */ -struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq) -{ - struct folio_queue *head = wreq->buffer, *next = head->next; - - if (next) - next->prev = NULL; - netfs_folioq_free(head, netfs_trace_folioq_delete); - wreq->buffer = next; - return next; -} - -/* - * Clear out a rolling queue. - */ -void netfs_clear_buffer(struct netfs_io_request *rreq) -{ - struct folio_queue *p; - - while ((p = rreq->buffer)) { - rreq->buffer = p->next; - for (int slot = 0; slot < folioq_count(p); slot++) { - struct folio *folio = folioq_folio(p, slot); - if (!folio) - continue; - if (folioq_is_marked(p, slot)) { - trace_netfs_folio(folio, netfs_folio_trace_put); - folio_put(folio); - } - } - netfs_folioq_free(p, netfs_trace_folioq_clear); - } -} - /* * Reset the subrequest iterator to refer just to the region remaining to be * read. The iterator may or may not have been advanced by socket ops or diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 31e388ec6e48..5cdddaf1f978 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -143,7 +143,7 @@ static void netfs_free_request(struct work_struct *work) } kvfree(rreq->direct_bv); } - netfs_clear_buffer(rreq); + rolling_buffer_clear(&rreq->buffer); if (atomic_dec_and_test(&ictx->io_count)) wake_up_var(&ictx->io_count); diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index 54d5004fec18..9eee5af6b327 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -34,8 +34,9 @@ void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq, * [DEPRECATED] Cancel PG_private_2 on all marked folios in the event of an * unrecoverable error. 
*/ -static void netfs_pgpriv2_cancel(struct folio_queue *folioq) +static void netfs_pgpriv2_cancel(struct rolling_buffer *buffer) { + struct folio_queue *folioq = buffer->tail; struct folio *folio; int slot; @@ -94,7 +95,7 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio trace_netfs_folio(folio, netfs_folio_trace_store_copy); /* Attach the folio to the rolling buffer. */ - if (netfs_buffer_append_folio(wreq, folio, false) < 0) + if (rolling_buffer_append(&wreq->buffer, folio, 0) < 0) return -ENOMEM; cache->submit_extendable_to = fsize; @@ -109,7 +110,7 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio do { ssize_t part; - wreq->io_iter.iov_offset = cache->submit_off; + wreq->buffer.iter.iov_offset = cache->submit_off; atomic64_set(&wreq->issued_to, fpos + cache->submit_off); cache->submit_extendable_to = fsize - cache->submit_off; @@ -122,8 +123,8 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio cache->submit_len -= part; } while (cache->submit_len > 0); - wreq->io_iter.iov_offset = 0; - iov_iter_advance(&wreq->io_iter, fsize); + wreq->buffer.iter.iov_offset = 0; + rolling_buffer_advance(&wreq->buffer, fsize); atomic64_set(&wreq->issued_to, fpos + fsize); if (flen < fsize) @@ -151,7 +152,7 @@ void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq) goto couldnt_start; /* Need the first folio to be able to set up the op. */ - for (folioq = rreq->buffer; folioq; folioq = folioq->next) { + for (folioq = rreq->buffer.tail; folioq; folioq = folioq->next) { if (folioq->marks3) { slot = __ffs(folioq->marks3); break; @@ -198,7 +199,7 @@ void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq) netfs_put_request(wreq, false, netfs_rreq_trace_put_return); _leave(" = %d", error); couldnt_start: - netfs_pgpriv2_cancel(rreq->buffer); + netfs_pgpriv2_cancel(&rreq->buffer); } /* @@ -207,13 +208,13 @@ couldnt_start: */ bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) { - struct folio_queue *folioq = wreq->buffer; + struct folio_queue *folioq = wreq->buffer.tail; unsigned long long collected_to = wreq->collected_to; - unsigned int slot = wreq->buffer_head_slot; + unsigned int slot = wreq->buffer.first_tail_slot; bool made_progress = false; if (slot >= folioq_nr_slots(folioq)) { - folioq = netfs_delete_buffer_head(wreq); + folioq = rolling_buffer_delete_spent(&wreq->buffer); slot = 0; } @@ -252,9 +253,9 @@ bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) folioq_clear(folioq, slot); slot++; if (slot >= folioq_nr_slots(folioq)) { - if (READ_ONCE(wreq->buffer_tail) == folioq) - break; - folioq = netfs_delete_buffer_head(wreq); + folioq = rolling_buffer_delete_spent(&wreq->buffer); + if (!folioq) + goto done; slot = 0; } @@ -262,7 +263,8 @@ bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) break; } - wreq->buffer = folioq; - wreq->buffer_head_slot = slot; + wreq->buffer.tail = folioq; +done: + wreq->buffer.first_tail_slot = slot; return made_progress; } diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 21b4a54e545e..0983234c2183 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -245,7 +245,7 @@ void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq) { struct folio_queue *p; - for (p = rreq->buffer; p; p = p->next) { + for (p = rreq->buffer.tail; p; p = p->next) { for (int slot = 0; slot < folioq_count(p); slot++) { struct folio *folio = folioq_folio(p, slot); diff --git 
a/fs/netfs/rolling_buffer.c b/fs/netfs/rolling_buffer.c new file mode 100644 index 000000000000..75d97af14b4a --- /dev/null +++ b/fs/netfs/rolling_buffer.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Rolling buffer helpers + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include +#include "internal.h" + +static atomic_t debug_ids; + +/** + * netfs_folioq_alloc - Allocate a folio_queue struct + * @rreq_id: Associated debugging ID for tracing purposes + * @gfp: Allocation constraints + * @trace: Trace tag to indicate the purpose of the allocation + * + * Allocate, initialise and account the folio_queue struct and log a trace line + * to mark the allocation. + */ +struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp, + unsigned int /*enum netfs_folioq_trace*/ trace) +{ + struct folio_queue *fq; + + fq = kmalloc(sizeof(*fq), gfp); + if (fq) { + netfs_stat(&netfs_n_folioq); + folioq_init(fq, rreq_id); + fq->debug_id = atomic_inc_return(&debug_ids); + trace_netfs_folioq(fq, trace); + } + return fq; +} +EXPORT_SYMBOL(netfs_folioq_alloc); + +/** + * netfs_folioq_free - Free a folio_queue struct + * @folioq: The object to free + * @trace: Trace tag to indicate which free + * + * Free and unaccount the folio_queue struct. + */ +void netfs_folioq_free(struct folio_queue *folioq, + unsigned int /*enum netfs_trace_folioq*/ trace) +{ + trace_netfs_folioq(folioq, trace); + netfs_stat_d(&netfs_n_folioq); + kfree(folioq); +} +EXPORT_SYMBOL(netfs_folioq_free); + +/* + * Initialise a rolling buffer. We allocate an empty folio queue struct to so + * that the pointers can be independently driven by the producer and the + * consumer. + */ +int rolling_buffer_init(struct rolling_buffer *roll, unsigned int rreq_id, + unsigned int direction) +{ + struct folio_queue *fq; + + fq = netfs_folioq_alloc(rreq_id, GFP_NOFS, netfs_trace_folioq_rollbuf_init); + if (!fq) + return -ENOMEM; + + roll->head = fq; + roll->tail = fq; + iov_iter_folio_queue(&roll->iter, direction, fq, 0, 0, 0); + return 0; +} + +/* + * Add another folio_queue to a rolling buffer if there's no space left. + */ +int rolling_buffer_make_space(struct rolling_buffer *roll) +{ + struct folio_queue *fq, *head = roll->head; + + if (!folioq_full(head)) + return 0; + + fq = netfs_folioq_alloc(head->rreq_id, GFP_NOFS, netfs_trace_folioq_make_space); + if (!fq) + return -ENOMEM; + fq->prev = head; + + roll->head = fq; + if (folioq_full(head)) { + /* Make sure we don't leave the master iterator pointing to a + * block that might get immediately consumed. + */ + if (roll->iter.folioq == head && + roll->iter.folioq_slot == folioq_nr_slots(head)) { + roll->iter.folioq = fq; + roll->iter.folioq_slot = 0; + } + } + + /* Make sure the initialisation is stored before the next pointer. + * + * [!] NOTE: After we set head->next, the consumer is at liberty to + * immediately delete the old head. + */ + smp_store_release(&head->next, fq); + return 0; +} + +/* + * Decant the list of folios to read into a rolling buffer. 
+ */ +ssize_t rolling_buffer_load_from_ra(struct rolling_buffer *roll, + struct readahead_control *ractl, + struct folio_batch *put_batch) +{ + struct folio_queue *fq; + struct page **vec; + int nr, ix, to; + ssize_t size = 0; + + if (rolling_buffer_make_space(roll) < 0) + return -ENOMEM; + + fq = roll->head; + vec = (struct page **)fq->vec.folios; + nr = __readahead_batch(ractl, vec + folio_batch_count(&fq->vec), + folio_batch_space(&fq->vec)); + ix = fq->vec.nr; + to = ix + nr; + fq->vec.nr = to; + for (; ix < to; ix++) { + struct folio *folio = folioq_folio(fq, ix); + unsigned int order = folio_order(folio); + + fq->orders[ix] = order; + size += PAGE_SIZE << order; + trace_netfs_folio(folio, netfs_folio_trace_read); + if (!folio_batch_add(put_batch, folio)) + folio_batch_release(put_batch); + } + WRITE_ONCE(roll->iter.count, roll->iter.count + size); + + /* Store the counter after setting the slot. */ + smp_store_release(&roll->next_head_slot, to); + + for (; ix < folioq_nr_slots(fq); ix++) + folioq_clear(fq, ix); + + return size; +} + +/* + * Append a folio to the rolling buffer. + */ +ssize_t rolling_buffer_append(struct rolling_buffer *roll, struct folio *folio, + unsigned int flags) +{ + ssize_t size = folio_size(folio); + int slot; + + if (rolling_buffer_make_space(roll) < 0) + return -ENOMEM; + + slot = folioq_append(roll->head, folio); + if (flags & ROLLBUF_MARK_1) + folioq_mark(roll->head, slot); + if (flags & ROLLBUF_MARK_2) + folioq_mark2(roll->head, slot); + + WRITE_ONCE(roll->iter.count, roll->iter.count + size); + + /* Store the counter after setting the slot. */ + smp_store_release(&roll->next_head_slot, slot); + return size; +} + +/* + * Delete a spent buffer from a rolling queue and return the next in line. We + * don't return the last buffer to keep the pointers independent, but return + * NULL instead. + */ +struct folio_queue *rolling_buffer_delete_spent(struct rolling_buffer *roll) +{ + struct folio_queue *spent = roll->tail, *next = READ_ONCE(spent->next); + + if (!next) + return NULL; + next->prev = NULL; + netfs_folioq_free(spent, netfs_trace_folioq_delete); + roll->tail = next; + return next; +} + +/* + * Clear out a rolling queue. Folios that have mark 1 set are put. 
+ */ +void rolling_buffer_clear(struct rolling_buffer *roll) +{ + struct folio_batch fbatch; + struct folio_queue *p; + + folio_batch_init(&fbatch); + + while ((p = roll->tail)) { + roll->tail = p->next; + for (int slot = 0; slot < folioq_count(p); slot++) { + struct folio *folio = folioq_folio(p, slot); + + if (!folio) + continue; + if (folioq_is_marked(p, slot)) { + trace_netfs_folio(folio, netfs_folio_trace_put); + if (!folio_batch_add(&fbatch, folio)) + folio_batch_release(&fbatch); + } + } + + netfs_folioq_free(p, netfs_trace_folioq_clear); + } + + folio_batch_release(&fbatch); +} diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index ca3a11ed9b54..364c1f9d5815 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -83,9 +83,9 @@ end_wb: static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, unsigned int *notes) { - struct folio_queue *folioq = wreq->buffer; + struct folio_queue *folioq = wreq->buffer.tail; unsigned long long collected_to = wreq->collected_to; - unsigned int slot = wreq->buffer_head_slot; + unsigned int slot = wreq->buffer.first_tail_slot; if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) { if (netfs_pgpriv2_unlock_copied_folios(wreq)) @@ -94,7 +94,9 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, } if (slot >= folioq_nr_slots(folioq)) { - folioq = netfs_delete_buffer_head(wreq); + folioq = rolling_buffer_delete_spent(&wreq->buffer); + if (!folioq) + return; slot = 0; } @@ -134,9 +136,9 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, folioq_clear(folioq, slot); slot++; if (slot >= folioq_nr_slots(folioq)) { - if (READ_ONCE(wreq->buffer_tail) == folioq) - break; - folioq = netfs_delete_buffer_head(wreq); + folioq = rolling_buffer_delete_spent(&wreq->buffer); + if (!folioq) + goto done; slot = 0; } @@ -144,8 +146,9 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, break; } - wreq->buffer = folioq; - wreq->buffer_head_slot = slot; + wreq->buffer.tail = folioq; +done: + wreq->buffer.first_tail_slot = slot; } /* diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 87e5cf4a0957..88ceba49ff69 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -107,6 +107,8 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, ictx = netfs_inode(wreq->inode); if (is_buffered && netfs_is_cache_enabled(ictx)) fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); + if (rolling_buffer_init(&wreq->buffer, wreq->debug_id, ITER_SOURCE) < 0) + goto nomem; wreq->cleaned_to = wreq->start; @@ -129,6 +131,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, } return wreq; +nomem: + wreq->error = -ENOMEM; + netfs_put_request(wreq, false, netfs_rreq_trace_put_failed); + return ERR_PTR(-ENOMEM); } /** @@ -153,16 +159,15 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, loff_t start) { struct netfs_io_subrequest *subreq; - struct iov_iter *wreq_iter = &wreq->io_iter; + struct iov_iter *wreq_iter = &wreq->buffer.iter; /* Make sure we don't point the iterator at a used-up folio_queue * struct being used as a placeholder to prevent the queue from * collapsing. In such a case, extend the queue. 
*/ if (iov_iter_is_folioq(wreq_iter) && - wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq)) { - netfs_buffer_make_space(wreq, netfs_trace_folioq_prep_write); - } + wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq)) + rolling_buffer_make_space(&wreq->buffer); subreq = netfs_alloc_subrequest(wreq); subreq->source = stream->source; @@ -327,6 +332,9 @@ static int netfs_write_folio(struct netfs_io_request *wreq, _enter(""); + if (rolling_buffer_make_space(&wreq->buffer) < 0) + return -ENOMEM; + /* netfs_perform_write() may shift i_size around the page or from out * of the page to beyond it, but cannot move i_size into or through the * page since we have it locked. @@ -431,7 +439,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, } /* Attach the folio to the rolling buffer. */ - netfs_buffer_append_folio(wreq, folio, false); + rolling_buffer_append(&wreq->buffer, folio, 0); /* Move the submission point forward to allow for write-streaming data * not starting at the front of the page. We don't do write-streaming @@ -478,7 +486,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, /* Advance the iterator(s). */ if (stream->submit_off > iter_off) { - iov_iter_advance(&wreq->io_iter, stream->submit_off - iter_off); + rolling_buffer_advance(&wreq->buffer, stream->submit_off - iter_off); iter_off = stream->submit_off; } @@ -496,7 +504,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, } if (fsize > iter_off) - iov_iter_advance(&wreq->io_iter, fsize - iter_off); + rolling_buffer_advance(&wreq->buffer, fsize - iter_off); atomic64_set(&wreq->issued_to, fpos + fsize); if (!debug) @@ -635,7 +643,7 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c struct folio **writethrough_cache) { _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u", - wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end); + wreq->debug_id, wreq->buffer.iter.count, wreq->wsize, copied, to_page_end); if (!*writethrough_cache) { if (folio_test_dirty(folio)) @@ -710,7 +718,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t part = netfs_advance_write(wreq, upload, start, len, false); start += part; len -= part; - iov_iter_advance(&wreq->io_iter, part); + rolling_buffer_advance(&wreq->buffer, part); if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause); wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5b2f427f8e3e..bd922f0936e3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -18,6 +18,7 @@ #include #include #include +#include enum netfs_sreq_ref_trace; typedef struct mempool_s mempool_t; @@ -238,10 +239,9 @@ struct netfs_io_request { struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ #define NR_IO_STREAMS 2 //wreq->nr_io_streams struct netfs_group *group; /* Writeback group being written back */ - struct folio_queue *buffer; /* Head of I/O buffer */ - struct folio_queue *buffer_tail; /* Tail of I/O buffer */ - struct iov_iter iter; /* Unencrypted-side iterator */ - struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ + struct rolling_buffer buffer; /* Unencrypted buffer */ +#define NETFS_ROLLBUF_PUT_MARK ROLLBUF_MARK_1 +#define NETFS_ROLLBUF_PAGECACHE_MARK ROLLBUF_MARK_2 void *netfs_priv; /* Private data for the netfs */ void *netfs_priv2; /* Private data for the netfs */ struct bio_vec *direct_bv; /* DIO buffer list (when handling 
iovec-iter) */ @@ -259,8 +259,6 @@ struct netfs_io_request { long error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ - u8 buffer_head_slot; /* First slot in ->buffer */ - u8 buffer_tail_slot; /* Next slot in ->buffer_tail */ unsigned long long i_size; /* Size of the file */ unsigned long long start; /* Start position */ atomic64_t issued_to; /* Write issuer folio cursor */ diff --git a/include/linux/rolling_buffer.h b/include/linux/rolling_buffer.h new file mode 100644 index 000000000000..ac15b1ffdd83 --- /dev/null +++ b/include/linux/rolling_buffer.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Rolling buffer of folios + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#ifndef _ROLLING_BUFFER_H +#define _ROLLING_BUFFER_H + +#include +#include + +/* + * Rolling buffer. Whilst the buffer is live and in use, folios and folio + * queue segments can be added to one end by one thread and removed from the + * other end by another thread. The buffer isn't allowed to be empty; it must + * always have at least one folio_queue in it so that neither side has to + * modify both queue pointers. + * + * The iterator in the buffer is extended as buffers are inserted. It can be + * snapshotted to use a segment of the buffer. + */ +struct rolling_buffer { + struct folio_queue *head; /* Producer's insertion point */ + struct folio_queue *tail; /* Consumer's removal point */ + struct iov_iter iter; /* Iterator tracking what's left in the buffer */ + u8 next_head_slot; /* Next slot in ->head */ + u8 first_tail_slot; /* First slot in ->tail */ +}; + +/* + * Snapshot of a rolling buffer. + */ +struct rolling_buffer_snapshot { + struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */ + unsigned char curr_slot; /* Folio currently being read */ + unsigned char curr_order; /* Order of folio */ +}; + +/* Marks to store per-folio in the internal folio_queue structs. 
*/ +#define ROLLBUF_MARK_1 BIT(0) +#define ROLLBUF_MARK_2 BIT(1) + +int rolling_buffer_init(struct rolling_buffer *roll, unsigned int rreq_id, + unsigned int direction); +int rolling_buffer_make_space(struct rolling_buffer *roll); +ssize_t rolling_buffer_load_from_ra(struct rolling_buffer *roll, + struct readahead_control *ractl, + struct folio_batch *put_batch); +ssize_t rolling_buffer_append(struct rolling_buffer *roll, struct folio *folio, + unsigned int flags); +struct folio_queue *rolling_buffer_delete_spent(struct rolling_buffer *roll); +void rolling_buffer_clear(struct rolling_buffer *roll); + +static inline void rolling_buffer_advance(struct rolling_buffer *roll, size_t amount) +{ + iov_iter_advance(&roll->iter, amount); +} + +#endif /* _ROLLING_BUFFER_H */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 50aa6745df95..2dfc9f716e3b 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -198,7 +198,9 @@ EM(netfs_trace_folioq_alloc_read_sing, "alloc-r-sing") \ EM(netfs_trace_folioq_clear, "clear") \ EM(netfs_trace_folioq_delete, "delete") \ + EM(netfs_trace_folioq_make_space, "make-space") \ EM(netfs_trace_folioq_prep_write, "prep-wr") \ + EM(netfs_trace_folioq_rollbuf_init, "roll-init") \ E_(netfs_trace_folioq_read_progress, "r-progress") #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY -- cgit v1.2.3 From d606c36294f46747b4fa34f79fccea6562d14aa7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:56 +0000 Subject: netfs: Make netfs_advance_write() return size_t netfs_advance_write() calculates the amount of data it's attaching to a stream with size_t, but then returns this as an int. Switch the return value to size_t for consistency. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-7-dhowells@redhat.com cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: Christian Brauner --- fs/netfs/internal.h | 6 +++--- fs/netfs/write_issue.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index ccd9058acb61..6aa2a8d49b37 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -178,9 +178,9 @@ void netfs_reissue_write(struct netfs_io_stream *stream, struct iov_iter *source); void netfs_issue_write(struct netfs_io_request *wreq, struct netfs_io_stream *stream); -int netfs_advance_write(struct netfs_io_request *wreq, - struct netfs_io_stream *stream, - loff_t start, size_t len, bool to_eof); +size_t netfs_advance_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + loff_t start, size_t len, bool to_eof); struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len); int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, struct folio *folio, size_t copied, bool to_page_end, diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 88ceba49ff69..7a14a48e62ee 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -273,9 +273,9 @@ void netfs_issue_write(struct netfs_io_request *wreq, * we can avoid overrunning the credits obtained (cifs) and try to parallelise * content-crypto preparation with network writes. 
*/ -int netfs_advance_write(struct netfs_io_request *wreq, - struct netfs_io_stream *stream, - loff_t start, size_t len, bool to_eof) +size_t netfs_advance_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + loff_t start, size_t len, bool to_eof) { struct netfs_io_subrequest *subreq = stream->construct; size_t part; -- cgit v1.2.3 From 751e213f9f8a24e1bd5988b9cb8043b0b2f017f0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:57 +0000 Subject: netfs: Split retry code out of fs/netfs/write_collect.c Split write-retry code out of fs/netfs/write_collect.c as it will become more elaborate when content crypto is introduced. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-8-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/Makefile | 3 +- fs/netfs/internal.h | 5 ++ fs/netfs/write_collect.c | 214 -------------------------------------------- fs/netfs/write_retry.c | 226 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 233 insertions(+), 215 deletions(-) create mode 100644 fs/netfs/write_retry.c (limited to 'fs') diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index 7492c4aa331e..cbb30bdeacc4 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -15,7 +15,8 @@ netfs-y := \ read_retry.o \ rolling_buffer.o \ write_collect.o \ - write_issue.o + write_issue.o \ + write_retry.o netfs-$(CONFIG_NETFS_STATS) += stats.o diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 6aa2a8d49b37..73887525e939 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -189,6 +189,11 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr struct folio *writethrough_cache); int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len); +/* + * write_retry.c + */ +void netfs_retry_writes(struct netfs_io_request *wreq); + /* * Miscellaneous functions. */ diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 364c1f9d5815..237018caba27 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -151,220 +151,6 @@ done: wreq->buffer.first_tail_slot = slot; } -/* - * Perform retries on the streams that need it. - */ -static void netfs_retry_write_stream(struct netfs_io_request *wreq, - struct netfs_io_stream *stream) -{ - struct list_head *next; - - _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr); - - if (list_empty(&stream->subrequests)) - return; - - if (stream->source == NETFS_UPLOAD_TO_SERVER && - wreq->netfs_ops->retry_request) - wreq->netfs_ops->retry_request(wreq, stream); - - if (unlikely(stream->failed)) - return; - - /* If there's no renegotiation to do, just resend each failed subreq. 
*/ - if (!stream->prepare_write) { - struct netfs_io_subrequest *subreq; - - list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) - break; - if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { - struct iov_iter source = subreq->io_iter; - - iov_iter_revert(&source, subreq->len - source.count); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - netfs_reissue_write(stream, subreq, &source); - } - } - return; - } - - next = stream->subrequests.next; - - do { - struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp; - struct iov_iter source; - unsigned long long start, len; - size_t part; - bool boundary = false; - - /* Go through the stream and find the next span of contiguous - * data that we then rejig (cifs, for example, needs the wsize - * renegotiating) and reissue. - */ - from = list_entry(next, struct netfs_io_subrequest, rreq_link); - to = from; - start = from->start + from->transferred; - len = from->len - from->transferred; - - if (test_bit(NETFS_SREQ_FAILED, &from->flags) || - !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) - return; - - list_for_each_continue(next, &stream->subrequests) { - subreq = list_entry(next, struct netfs_io_subrequest, rreq_link); - if (subreq->start + subreq->transferred != start + len || - test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) || - !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) - break; - to = subreq; - len += to->len; - } - - /* Determine the set of buffers we're going to use. Each - * subreq gets a subset of a single overall contiguous buffer. - */ - netfs_reset_iter(from); - source = from->io_iter; - source.count = len; - - /* Work through the sublist. */ - subreq = from; - list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { - if (!len) - break; - /* Renegotiate max_len (wsize) */ - trace_netfs_sreq(subreq, netfs_sreq_trace_retry); - __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - subreq->retry_count++; - stream->prepare_write(subreq); - - part = min(len, stream->sreq_max_len); - subreq->len = part; - subreq->start = start; - subreq->transferred = 0; - len -= part; - start += part; - if (len && subreq == to && - __test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags)) - boundary = true; - - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - netfs_reissue_write(stream, subreq, &source); - if (subreq == to) - break; - } - - /* If we managed to use fewer subreqs, we can discard the - * excess; if we used the same number, then we're done. - */ - if (!len) { - if (subreq == to) - continue; - list_for_each_entry_safe_from(subreq, tmp, - &stream->subrequests, rreq_link) { - trace_netfs_sreq(subreq, netfs_sreq_trace_discard); - list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); - if (subreq == to) - break; - } - continue; - } - - /* We ran out of subrequests, so we need to allocate some more - * and insert them after. 
- */ - do { - subreq = netfs_alloc_subrequest(wreq); - subreq->source = to->source; - subreq->start = start; - subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); - subreq->stream_nr = to->stream_nr; - subreq->retry_count = 1; - - trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, - refcount_read(&subreq->ref), - netfs_sreq_trace_new); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - - list_add(&subreq->rreq_link, &to->rreq_link); - to = list_next_entry(to, rreq_link); - trace_netfs_sreq(subreq, netfs_sreq_trace_retry); - - stream->sreq_max_len = len; - stream->sreq_max_segs = INT_MAX; - switch (stream->source) { - case NETFS_UPLOAD_TO_SERVER: - netfs_stat(&netfs_n_wh_upload); - stream->sreq_max_len = umin(len, wreq->wsize); - break; - case NETFS_WRITE_TO_CACHE: - netfs_stat(&netfs_n_wh_write); - break; - default: - WARN_ON_ONCE(1); - } - - stream->prepare_write(subreq); - - part = umin(len, stream->sreq_max_len); - subreq->len = subreq->transferred + part; - len -= part; - start += part; - if (!len && boundary) { - __set_bit(NETFS_SREQ_BOUNDARY, &to->flags); - boundary = false; - } - - netfs_reissue_write(stream, subreq, &source); - if (!len) - break; - - } while (len); - - } while (!list_is_head(next, &stream->subrequests)); -} - -/* - * Perform retries on the streams that need it. If we're doing content - * encryption and the server copy changed due to a third-party write, we may - * need to do an RMW cycle and also rewrite the data to the cache. - */ -static void netfs_retry_writes(struct netfs_io_request *wreq) -{ - struct netfs_io_subrequest *subreq; - struct netfs_io_stream *stream; - int s; - - /* Wait for all outstanding I/O to quiesce before performing retries as - * we may need to renegotiate the I/O sizes. - */ - for (s = 0; s < NR_IO_STREAMS; s++) { - stream = &wreq->io_streams[s]; - if (!stream->active) - continue; - - list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - } - } - - // TODO: Enc: Fetch changed partial pages - // TODO: Enc: Reencrypt content if needed. - // TODO: Enc: Wind back transferred point. - // TODO: Enc: Mark cache pages for retry. - - for (s = 0; s < NR_IO_STREAMS; s++) { - stream = &wreq->io_streams[s]; - if (stream->need_retry) { - stream->need_retry = false; - netfs_retry_write_stream(wreq, stream); - } - } -} - /* * Collect and assess the results of various write subrequests. We may need to * retry some of the results - or even do an RMW cycle for content crypto. diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c new file mode 100644 index 000000000000..f3d5e37d4698 --- /dev/null +++ b/fs/netfs/write_retry.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem write retrying. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include +#include "internal.h" + +/* + * Perform retries on the streams that need it. 
+ */ +static void netfs_retry_write_stream(struct netfs_io_request *wreq, + struct netfs_io_stream *stream) +{ + struct list_head *next; + + _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr); + + if (list_empty(&stream->subrequests)) + return; + + if (stream->source == NETFS_UPLOAD_TO_SERVER && + wreq->netfs_ops->retry_request) + wreq->netfs_ops->retry_request(wreq, stream); + + if (unlikely(stream->failed)) + return; + + /* If there's no renegotiation to do, just resend each failed subreq. */ + if (!stream->prepare_write) { + struct netfs_io_subrequest *subreq; + + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) + break; + if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + struct iov_iter source = subreq->io_iter; + + iov_iter_revert(&source, subreq->len - source.count); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + netfs_reissue_write(stream, subreq, &source); + } + } + return; + } + + next = stream->subrequests.next; + + do { + struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp; + struct iov_iter source; + unsigned long long start, len; + size_t part; + bool boundary = false; + + /* Go through the stream and find the next span of contiguous + * data that we then rejig (cifs, for example, needs the wsize + * renegotiating) and reissue. + */ + from = list_entry(next, struct netfs_io_subrequest, rreq_link); + to = from; + start = from->start + from->transferred; + len = from->len - from->transferred; + + if (test_bit(NETFS_SREQ_FAILED, &from->flags) || + !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) + return; + + list_for_each_continue(next, &stream->subrequests) { + subreq = list_entry(next, struct netfs_io_subrequest, rreq_link); + if (subreq->start + subreq->transferred != start + len || + test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) || + !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) + break; + to = subreq; + len += to->len; + } + + /* Determine the set of buffers we're going to use. Each + * subreq gets a subset of a single overall contiguous buffer. + */ + netfs_reset_iter(from); + source = from->io_iter; + source.count = len; + + /* Work through the sublist. */ + subreq = from; + list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { + if (!len) + break; + /* Renegotiate max_len (wsize) */ + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + subreq->retry_count++; + stream->prepare_write(subreq); + + part = min(len, stream->sreq_max_len); + subreq->len = part; + subreq->start = start; + subreq->transferred = 0; + len -= part; + start += part; + if (len && subreq == to && + __test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags)) + boundary = true; + + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + netfs_reissue_write(stream, subreq, &source); + if (subreq == to) + break; + } + + /* If we managed to use fewer subreqs, we can discard the + * excess; if we used the same number, then we're done. + */ + if (!len) { + if (subreq == to) + continue; + list_for_each_entry_safe_from(subreq, tmp, + &stream->subrequests, rreq_link) { + trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + list_del(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + if (subreq == to) + break; + } + continue; + } + + /* We ran out of subrequests, so we need to allocate some more + * and insert them after. 
+ */ + do { + subreq = netfs_alloc_subrequest(wreq); + subreq->source = to->source; + subreq->start = start; + subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); + subreq->stream_nr = to->stream_nr; + subreq->retry_count = 1; + + trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, + refcount_read(&subreq->ref), + netfs_sreq_trace_new); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + + list_add(&subreq->rreq_link, &to->rreq_link); + to = list_next_entry(to, rreq_link); + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + + stream->sreq_max_len = len; + stream->sreq_max_segs = INT_MAX; + switch (stream->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload); + stream->sreq_max_len = umin(len, wreq->wsize); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write); + break; + default: + WARN_ON_ONCE(1); + } + + stream->prepare_write(subreq); + + part = umin(len, stream->sreq_max_len); + subreq->len = subreq->transferred + part; + len -= part; + start += part; + if (!len && boundary) { + __set_bit(NETFS_SREQ_BOUNDARY, &to->flags); + boundary = false; + } + + netfs_reissue_write(stream, subreq, &source); + if (!len) + break; + + } while (len); + + } while (!list_is_head(next, &stream->subrequests)); +} + +/* + * Perform retries on the streams that need it. If we're doing content + * encryption and the server copy changed due to a third-party write, we may + * need to do an RMW cycle and also rewrite the data to the cache. + */ +void netfs_retry_writes(struct netfs_io_request *wreq) +{ + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream; + int s; + + /* Wait for all outstanding I/O to quiesce before performing retries as + * we may need to renegotiate the I/O sizes. + */ + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (!stream->active) + continue; + + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + } + } + + // TODO: Enc: Fetch changed partial pages + // TODO: Enc: Reencrypt content if needed. + // TODO: Enc: Wind back transferred point. + // TODO: Enc: Mark cache pages for retry. + + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->need_retry) { + stream->need_retry = false; + netfs_retry_write_stream(wreq, stream); + } + } +} -- cgit v1.2.3 From 360157829ee3dba848ffa817792d9a07969e0a95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:58 +0000 Subject: netfs: Drop the error arg from netfs_read_subreq_terminated() Drop the error argument from netfs_read_subreq_terminated() in favour of passing the value in subreq->error. 
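For illustration, a filesystem's read-completion path under the new
convention looks roughly like the sketch below (the helper name and the
"bytes"/"err" values are hypothetical; the real per-filesystem conversions
are in the diff that follows):

	static void example_issue_read_done(struct netfs_io_subrequest *subreq,
					    ssize_t bytes, int err)
	{
		if (bytes > 0) {
			subreq->transferred += bytes;
			__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
		}

		/* The outcome now travels in subreq->error... */
		subreq->error = err;
		/* ...rather than as a second argument to the terminator. */
		netfs_read_subreq_terminated(subreq, false);
	}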
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-9-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/9p/vfs_addr.c | 3 ++- fs/afs/file.c | 15 ++++++++++----- fs/ceph/addr.c | 13 +++++++++---- fs/netfs/buffered_read.c | 16 ++++++++-------- fs/netfs/objects.c | 15 ++++++++++++++- fs/netfs/read_collect.c | 47 ++++++++++++++++++++++++++++++----------------- fs/nfs/fscache.c | 6 ++++-- fs/nfs/fscache.h | 3 ++- fs/smb/client/cifssmb.c | 10 +--------- fs/smb/client/file.c | 3 ++- fs/smb/client/smb2pdu.c | 10 +--------- include/linux/netfs.h | 7 +++---- 12 files changed, 86 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 3bc9ce6c575e..40f5acd7b452 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -87,7 +87,8 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } - netfs_read_subreq_terminated(subreq, err, false); + subreq->error = err; + netfs_read_subreq_terminated(subreq, false); } /** diff --git a/fs/afs/file.c b/fs/afs/file.c index 6762eff97517..56248a078bca 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -246,7 +246,8 @@ static void afs_fetch_data_notify(struct afs_operation *op) subreq->rreq->i_size = req->file_size; if (req->pos + req->actual_len >= req->file_size) __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); - netfs_read_subreq_terminated(subreq, error, false); + subreq->error = error; + netfs_read_subreq_terminated(subreq, false); req->subreq = NULL; } else if (req->done) { req->done(req); @@ -301,8 +302,10 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) op = afs_alloc_operation(req->key, vnode->volume); if (IS_ERR(op)) { - if (req->subreq) - netfs_read_subreq_terminated(req->subreq, PTR_ERR(op), false); + if (req->subreq) { + req->subreq->error = PTR_ERR(op); + netfs_read_subreq_terminated(req->subreq, false); + } return PTR_ERR(op); } @@ -320,8 +323,10 @@ static void afs_read_worker(struct work_struct *work) struct afs_read *fsreq; fsreq = afs_alloc_read(GFP_NOFS); - if (!fsreq) - return netfs_read_subreq_terminated(subreq, -ENOMEM, false); + if (!fsreq) { + subreq->error = -ENOMEM; + return netfs_read_subreq_terminated(subreq, false); + } fsreq->subreq = subreq; fsreq->pos = subreq->start + subreq->transferred; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 85936f6d2bf7..b1b47af94deb 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -253,8 +253,9 @@ static void finish_netfs_read(struct ceph_osd_request *req) subreq->transferred = err; err = 0; } + subreq->error = err; trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress); - netfs_read_subreq_terminated(subreq, err, false); + netfs_read_subreq_terminated(subreq, false); iput(req->r_inode); ceph_dec_osd_stopping_blocker(fsc->mdsc); } @@ -314,7 +315,9 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) ceph_mdsc_put_request(req); out: - netfs_read_subreq_terminated(subreq, err, false); + subreq->error = err; + trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress); + netfs_read_subreq_terminated(subreq, false); return true; } @@ -426,8 +429,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) ceph_osdc_start_request(req->r_osdc, req); out: ceph_osdc_put_request(req); - if (err) - netfs_read_subreq_terminated(subreq, err, false); + if (err) { + subreq->error = err; + netfs_read_subreq_terminated(subreq, false); + } 
doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err); } diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index db874fea8794..d420d623711c 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -148,14 +148,13 @@ static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error { struct netfs_io_subrequest *subreq = priv; - if (transferred_or_error < 0) { - netfs_read_subreq_terminated(subreq, transferred_or_error, was_async); - return; - } - - if (transferred_or_error > 0) + if (transferred_or_error > 0) { subreq->transferred += transferred_or_error; - netfs_read_subreq_terminated(subreq, 0, was_async); + subreq->error = 0; + } else { + subreq->error = transferred_or_error; + } + netfs_read_subreq_terminated(subreq, was_async); } /* @@ -255,7 +254,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) if (slice < 0) goto prep_iter_failed; __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - netfs_read_subreq_terminated(subreq, 0, false); + subreq->error = 0; + netfs_read_subreq_terminated(subreq, false); goto done; } diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 5cdddaf1f978..f10fd56efa17 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -191,7 +191,20 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq } memset(subreq, 0, kmem_cache_size(cache)); - INIT_WORK(&subreq->work, NULL); + + switch (rreq->origin) { + case NETFS_READAHEAD: + case NETFS_READPAGE: + case NETFS_READ_GAPS: + case NETFS_READ_FOR_WRITE: + case NETFS_DIO_READ: + INIT_WORK(&subreq->work, netfs_read_subreq_termination_worker); + break; + default: + INIT_WORK(&subreq->work, NULL); + break; + } + INIT_LIST_HEAD(&subreq->rreq_link); refcount_set(&subreq->ref, 2); subreq->rreq = rreq; diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index f7a5cb29dd12..16770a317b22 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -452,28 +452,26 @@ EXPORT_SYMBOL(netfs_read_subreq_progress); /** * netfs_read_subreq_terminated - Note the termination of an I/O operation. * @subreq: The I/O request that has terminated. - * @error: Error code indicating type of completion. - * @was_async: The termination was asynchronous + * @was_async: True if we're in an asynchronous context. * * This tells the read helper that a contributory I/O operation has terminated, * one way or another, and that it should integrate the results. * - * The caller indicates the outcome of the operation through @error, supplying - * 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY - * is set) or a negative error code. The helper will look after reissuing I/O - * operations as appropriate and writing downloaded data to the cache. + * The caller indicates the outcome of the operation through @subreq->error, + * supplying 0 to indicate a successful or retryable transfer (if + * NETFS_SREQ_NEED_RETRY is set) or a negative error code. The helper will + * look after reissuing I/O operations as appropriate and writing downloaded + * data to the cache. * * Before calling, the filesystem should update subreq->transferred to track * the amount of data copied into the output buffer. - * - * If @was_async is true, the caller might be running in softirq or interrupt - * context and we can't sleep. 
*/ -void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, - int error, bool was_async) +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, bool was_async) { struct netfs_io_request *rreq = subreq->rreq; + might_sleep(); + switch (subreq->source) { case NETFS_READ_FROM_CACHE: netfs_stat(&netfs_n_rh_read_done); @@ -491,7 +489,7 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, * If the read completed validly short, then we can clear the * tail before going on to unlock the folios. */ - if (error == 0 && subreq->transferred < subreq->len && + if (subreq->error == 0 && subreq->transferred < subreq->len && (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) || test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) { netfs_clear_unread(subreq); @@ -511,7 +509,7 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, /* Deal with retry requests, short reads and errors. If we retry * but don't make progress, we abandon the attempt. */ - if (!error && subreq->transferred < subreq->len) { + if (!subreq->error && subreq->transferred < subreq->len) { if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) { trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof); } else { @@ -528,16 +526,15 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); } else { __set_bit(NETFS_SREQ_FAILED, &subreq->flags); - error = -ENODATA; + subreq->error = -ENODATA; } } } - subreq->error = error; trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - if (unlikely(error < 0)) { - trace_netfs_failure(rreq, subreq, error, netfs_fail_read); + if (unlikely(subreq->error < 0)) { + trace_netfs_failure(rreq, subreq, subreq->error, netfs_fail_read); if (subreq->source == NETFS_READ_FROM_CACHE) { netfs_stat(&netfs_n_rh_read_failed); } else { @@ -553,3 +550,19 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); } EXPORT_SYMBOL(netfs_read_subreq_terminated); + +/** + * netfs_read_subreq_termination_worker - Workqueue helper for read termination + * @work: The subreq->work in the I/O request that has been terminated. + * + * Helper function to jump to netfs_read_subreq_terminated() from the + * subrequest work item. 
+ */ +void netfs_read_subreq_termination_worker(struct work_struct *work) +{ + struct netfs_io_subrequest *subreq = + container_of(work, struct netfs_io_subrequest, work); + + netfs_read_subreq_terminated(subreq, false); +} +EXPORT_SYMBOL(netfs_read_subreq_termination_worker); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index d49e4ce27999..be14d30608f6 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -314,8 +314,10 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) &nfs_async_read_completion_ops); netfs = nfs_netfs_alloc(sreq); - if (!netfs) - return netfs_read_subreq_terminated(sreq, -ENOMEM, false); + if (!netfs) { + sreq->error = -ENOMEM; + return netfs_read_subreq_terminated(sreq, false); + } pgio.pg_netfs = netfs; /* used in completion */ diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 772d485e96d3..1d86f7cc7195 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -74,7 +74,8 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) */ netfs->sreq->transferred = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred)); - netfs_read_subreq_terminated(netfs->sreq, netfs->error, false); + netfs->sreq->error = netfs->error; + netfs_read_subreq_terminated(netfs->sreq, false); kfree(netfs); } static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 6cb1e81993f8..7c9cc6945d18 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1258,14 +1258,6 @@ openRetry: return rc; } -static void cifs_readv_worker(struct work_struct *work) -{ - struct cifs_io_subrequest *rdata = - container_of(work, struct cifs_io_subrequest, subreq.work); - - netfs_read_subreq_terminated(&rdata->subreq, rdata->result, false); -} - static void cifs_readv_callback(struct mid_q_entry *mid) { @@ -1333,8 +1325,8 @@ cifs_readv_callback(struct mid_q_entry *mid) } rdata->credits.value = 0; + rdata->subreq.error = rdata->result; rdata->subreq.transferred += rdata->got_bytes; - INIT_WORK(&rdata->subreq.work, cifs_readv_worker); queue_work(cifsiod_wq, &rdata->subreq.work); release_mid(mid); add_credits(server, &credits, 0); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index a58a3333ecc3..10dd440f8178 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -227,7 +227,8 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq) return; failed: - netfs_read_subreq_terminated(subreq, rc, false); + subreq->error = rc; + netfs_read_subreq_terminated(subreq, false); } /* diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 458b53d1f9cb..ce57d8697c7c 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4500,14 +4500,6 @@ smb2_new_read_req(void **buf, unsigned int *total_len, return rc; } -static void smb2_readv_worker(struct work_struct *work) -{ - struct cifs_io_subrequest *rdata = - container_of(work, struct cifs_io_subrequest, subreq.work); - - netfs_read_subreq_terminated(&rdata->subreq, rdata->result, false); -} - static void smb2_readv_callback(struct mid_q_entry *mid) { @@ -4621,9 +4613,9 @@ smb2_readv_callback(struct mid_q_entry *mid) server->credits, server->in_flight, 0, cifs_trace_rw_credits_read_response_clear); rdata->credits.value = 0; + rdata->subreq.error = rdata->result; rdata->subreq.transferred += rdata->got_bytes; trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress); - INIT_WORK(&rdata->subreq.work, smb2_readv_worker); queue_work(cifsiod_wq, &rdata->subreq.work); release_mid(mid); 
trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0, diff --git a/include/linux/netfs.h b/include/linux/netfs.h index bd922f0936e3..a882921460a9 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -427,10 +427,9 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp); vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); /* (Sub)request management API. */ -void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, - bool was_async); -void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, - int error, bool was_async); +void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, bool was_async); +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, bool was_async); +void netfs_read_subreq_termination_worker(struct work_struct *work); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, -- cgit v1.2.3 From 31fc366aa7aa911ebc0744e99c82caee4e97315a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:59 +0000 Subject: netfs: Drop the was_async arg from netfs_read_subreq_terminated() Drop the was_async argument from netfs_read_subreq_terminated(). Almost every caller is either in process context and passes false. Some filesystems delegate the call to a workqueue to avoid doing the work in their network message queue parsing thread. The only exception is netfs_cache_read_terminated() which handles completion in the cache - which is usually a callback from the backing filesystem in softirq context, though it can be from process context if an error occurred. In this case, delegate to a workqueue. Suggested-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=wiVC5Cgyz6QKXFu6fTaA6h4CjexDR-OV9kL6Vo5x9v8=A@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-10-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/9p/vfs_addr.c | 2 +- fs/afs/file.c | 6 +++--- fs/afs/fsclient.c | 2 +- fs/afs/yfsclient.c | 2 +- fs/ceph/addr.c | 6 +++--- fs/netfs/buffered_read.c | 6 +++--- fs/netfs/direct_read.c | 2 +- fs/netfs/internal.h | 2 +- fs/netfs/objects.c | 2 +- fs/netfs/read_collect.c | 53 +++++++++++------------------------------------- fs/netfs/read_retry.c | 2 +- fs/nfs/fscache.c | 2 +- fs/nfs/fscache.h | 2 +- fs/smb/client/file.c | 2 +- include/linux/netfs.h | 4 ++-- 15 files changed, 33 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 40f5acd7b452..b38be6ff90bc 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -88,7 +88,7 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) } subreq->error = err; - netfs_read_subreq_terminated(subreq, false); + netfs_read_subreq_terminated(subreq); } /** diff --git a/fs/afs/file.c b/fs/afs/file.c index 56248a078bca..f717168da4ab 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -247,7 +247,7 @@ static void afs_fetch_data_notify(struct afs_operation *op) if (req->pos + req->actual_len >= req->file_size) __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); subreq->error = error; - netfs_read_subreq_terminated(subreq, false); + netfs_read_subreq_terminated(subreq); req->subreq = NULL; } else if (req->done) { req->done(req); @@ -304,7 +304,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) if (IS_ERR(op)) { if (req->subreq) { 
req->subreq->error = PTR_ERR(op); - netfs_read_subreq_terminated(req->subreq, false); + netfs_read_subreq_terminated(req->subreq); } return PTR_ERR(op); } @@ -325,7 +325,7 @@ static void afs_read_worker(struct work_struct *work) fsreq = afs_alloc_read(GFP_NOFS); if (!fsreq) { subreq->error = -ENOMEM; - return netfs_read_subreq_terminated(subreq, false); + return netfs_read_subreq_terminated(subreq); } fsreq->subreq = subreq; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 098fa034a1cc..784f7daab112 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -352,7 +352,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) ret = afs_extract_data(call, true); if (req->subreq) { req->subreq->transferred += count_before - call->iov_len; - netfs_read_subreq_progress(req->subreq, false); + netfs_read_subreq_progress(req->subreq); } if (ret < 0) return ret; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 024227aba4cd..368cf277d801 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -398,7 +398,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) ret = afs_extract_data(call, true); if (req->subreq) { req->subreq->transferred += count_before - call->iov_len; - netfs_read_subreq_progress(req->subreq, false); + netfs_read_subreq_progress(req->subreq); } if (ret < 0) return ret; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b1b47af94deb..4deb38fa470e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -255,7 +255,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) } subreq->error = err; trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress); - netfs_read_subreq_terminated(subreq, false); + netfs_read_subreq_terminated(subreq); iput(req->r_inode); ceph_dec_osd_stopping_blocker(fsc->mdsc); } @@ -317,7 +317,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) out: subreq->error = err; trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress); - netfs_read_subreq_terminated(subreq, false); + netfs_read_subreq_terminated(subreq); return true; } @@ -431,7 +431,7 @@ out: ceph_osdc_put_request(req); if (err) { subreq->error = err; - netfs_read_subreq_terminated(subreq, false); + netfs_read_subreq_terminated(subreq); } doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err); } diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index d420d623711c..fa1013020ac9 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -154,7 +154,7 @@ static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error } else { subreq->error = transferred_or_error; } - netfs_read_subreq_terminated(subreq, was_async); + schedule_work(&subreq->work); } /* @@ -255,7 +255,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) goto prep_iter_failed; __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); subreq->error = 0; - netfs_read_subreq_terminated(subreq, false); + netfs_read_subreq_terminated(subreq); goto done; } @@ -287,7 +287,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) } while (size > 0); if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_terminated(rreq, false); + netfs_rreq_terminated(rreq); /* Defer error return as we may need to wait for outstanding I/O. 
*/ cmpxchg(&rreq->error, 0, ret); diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index a3f23adbae0f..54027fd14904 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -100,7 +100,7 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) } while (size > 0); if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_terminated(rreq, false); + netfs_rreq_terminated(rreq); return ret; } diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 73887525e939..ba32ca61063c 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -85,7 +85,7 @@ static inline void netfs_see_request(struct netfs_io_request *rreq, * read_collect.c */ void netfs_read_termination_worker(struct work_struct *work); -void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async); +void netfs_rreq_terminated(struct netfs_io_request *rreq); /* * read_pgpriv2.c diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index f10fd56efa17..8c98b70eb3a4 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -56,7 +56,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, origin == NETFS_READ_GAPS || origin == NETFS_READ_FOR_WRITE || origin == NETFS_DIO_READ) - INIT_WORK(&rreq->work, netfs_read_termination_worker); + INIT_WORK(&rreq->work, NULL); else INIT_WORK(&rreq->work, netfs_write_collection_worker); diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 16770a317b22..454a5bbdd6f8 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -87,7 +87,7 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, * Unlock any folios that are now completely read. Returns true if the * subrequest is removed from the list. */ -static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async) +static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq) { struct netfs_io_subrequest *prev, *next; struct netfs_io_request *rreq = subreq->rreq; @@ -230,8 +230,7 @@ donation_changed: subreq->curr_folioq_slot = slot; if (folioq && folioq_folio(folioq, slot)) subreq->curr_folio_order = folioq->orders[slot]; - if (!was_async) - cond_resched(); + cond_resched(); goto next_folio; } @@ -368,7 +367,7 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) * Note that we're in normal kernel thread context at this point, possibly * running on a workqueue. */ -static void netfs_rreq_assess(struct netfs_io_request *rreq) +void netfs_rreq_terminated(struct netfs_io_request *rreq) { trace_netfs_rreq(rreq, netfs_rreq_trace_assess); @@ -394,56 +393,29 @@ static void netfs_rreq_assess(struct netfs_io_request *rreq) netfs_pgpriv2_write_to_the_cache(rreq); } -void netfs_read_termination_worker(struct work_struct *work) -{ - struct netfs_io_request *rreq = - container_of(work, struct netfs_io_request, work); - netfs_see_request(rreq, netfs_rreq_trace_see_work); - netfs_rreq_assess(rreq); - netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete); -} - -/* - * Handle the completion of all outstanding I/O operations on a read request. - * We inherit a ref from the caller. 
- */ -void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async) -{ - if (!was_async) - return netfs_rreq_assess(rreq); - if (!work_pending(&rreq->work)) { - netfs_get_request(rreq, netfs_rreq_trace_get_work); - if (!queue_work(system_unbound_wq, &rreq->work)) - netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq); - } -} - /** * netfs_read_subreq_progress - Note progress of a read operation. * @subreq: The read request that has terminated. - * @was_async: True if we're in an asynchronous context. * * This tells the read side of netfs lib that a contributory I/O operation has * made some progress and that it may be possible to unlock some folios. * * Before calling, the filesystem should update subreq->transferred to track * the amount of data copied into the output buffer. - * - * If @was_async is true, the caller might be running in softirq or interrupt - * context and we can't sleep. */ -void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, - bool was_async) +void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; + might_sleep(); + trace_netfs_sreq(subreq, netfs_sreq_trace_progress); if (subreq->transferred > subreq->consumed && (rreq->origin == NETFS_READAHEAD || rreq->origin == NETFS_READPAGE || rreq->origin == NETFS_READ_FOR_WRITE)) { - netfs_consume_read_data(subreq, was_async); + netfs_consume_read_data(subreq); __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } } @@ -452,7 +424,6 @@ EXPORT_SYMBOL(netfs_read_subreq_progress); /** * netfs_read_subreq_terminated - Note the termination of an I/O operation. * @subreq: The I/O request that has terminated. - * @was_async: True if we're in an asynchronous context. * * This tells the read helper that a contributory I/O operation has terminated, * one way or another, and that it should integrate the results. @@ -466,7 +437,7 @@ EXPORT_SYMBOL(netfs_read_subreq_progress); * Before calling, the filesystem should update subreq->transferred to track * the amount of data copied into the output buffer. 
*/ -void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, bool was_async) +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; @@ -500,7 +471,7 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, bool was_a (rreq->origin == NETFS_READAHEAD || rreq->origin == NETFS_READPAGE || rreq->origin == NETFS_READ_FOR_WRITE)) { - netfs_consume_read_data(subreq, was_async); + netfs_consume_read_data(subreq); __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } rreq->transferred += subreq->transferred; @@ -545,9 +516,9 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, bool was_a } if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_terminated(rreq, was_async); + netfs_rreq_terminated(rreq); - netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_terminated); } EXPORT_SYMBOL(netfs_read_subreq_terminated); @@ -563,6 +534,6 @@ void netfs_read_subreq_termination_worker(struct work_struct *work) struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work); - netfs_read_subreq_terminated(subreq, false); + netfs_read_subreq_terminated(subreq); } EXPORT_SYMBOL(netfs_read_subreq_termination_worker); diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0983234c2183..a2021efa44c0 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -234,7 +234,7 @@ void netfs_retry_reads(struct netfs_io_request *rreq) netfs_retry_read_subrequests(rreq); if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_terminated(rreq, false); + netfs_rreq_terminated(rreq); } /* diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index be14d30608f6..e278a1ad1ca3 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -316,7 +316,7 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) netfs = nfs_netfs_alloc(sreq); if (!netfs) { sreq->error = -ENOMEM; - return netfs_read_subreq_terminated(sreq, false); + return netfs_read_subreq_terminated(sreq); } pgio.pg_netfs = netfs; /* used in completion */ diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 1d86f7cc7195..9d86868f4998 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -75,7 +75,7 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) netfs->sreq->transferred = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred)); netfs->sreq->error = netfs->error; - netfs_read_subreq_terminated(netfs->sreq, false); + netfs_read_subreq_terminated(netfs->sreq); kfree(netfs); } static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 10dd440f8178..27a1757a278e 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -228,7 +228,7 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq) failed: subreq->error = rc; - netfs_read_subreq_terminated(subreq, false); + netfs_read_subreq_terminated(subreq); } /* diff --git a/include/linux/netfs.h b/include/linux/netfs.h index a882921460a9..374e54beacbe 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -427,8 +427,8 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp); vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); /* (Sub)request management API. 
*/ -void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, bool was_async); -void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, bool was_async); +void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq); +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq); void netfs_read_subreq_termination_worker(struct work_struct *work); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); -- cgit v1.2.3 From 627cf645277b6f8e6128e2c86907a81970bce87a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:00 +0000 Subject: netfs: Don't use bh spinlock All the accessing of the subrequest lists is now done in process context, possibly in a workqueue, but not now in a BH context, so we don't need the lock against BH interference when taking the netfs_io_request::lock spinlock. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-11-dhowells@redhat.com cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 4 ++-- fs/netfs/direct_read.c | 4 ++-- fs/netfs/read_collect.c | 20 ++++++++++---------- fs/netfs/read_retry.c | 8 ++++---- fs/netfs/write_collect.c | 4 ++-- fs/netfs/write_issue.c | 4 ++-- 6 files changed, 22 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index fa1013020ac9..4ff4b587dc4b 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -200,12 +200,12 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) subreq->len = size; atomic_inc(&rreq->nr_outstanding); - spin_lock_bh(&rreq->lock); + spin_lock(&rreq->lock); list_add_tail(&subreq->rreq_link, &rreq->subrequests); subreq->prev_donated = rreq->prev_donated; rreq->prev_donated = 0; trace_netfs_sreq(subreq, netfs_sreq_trace_added); - spin_unlock_bh(&rreq->lock); + spin_unlock(&rreq->lock); source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size); subreq->source = source; diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index 54027fd14904..1a20cc3979c7 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -68,12 +68,12 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) subreq->len = size; atomic_inc(&rreq->nr_outstanding); - spin_lock_bh(&rreq->lock); + spin_lock(&rreq->lock); list_add_tail(&subreq->rreq_link, &rreq->subrequests); subreq->prev_donated = rreq->prev_donated; rreq->prev_donated = 0; trace_netfs_sreq(subreq, netfs_sreq_trace_added); - spin_unlock_bh(&rreq->lock); + spin_unlock(&rreq->lock); netfs_stat(&netfs_n_rh_download); if (rreq->netfs_ops->prepare_read) { diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 454a5bbdd6f8..26e430baeb5a 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -144,7 +144,7 @@ donation_changed: prev_donated = READ_ONCE(subreq->prev_donated); next_donated = READ_ONCE(subreq->next_donated); if (prev_donated || next_donated) { - spin_lock_bh(&rreq->lock); + spin_lock(&rreq->lock); prev_donated = subreq->prev_donated; next_donated = subreq->next_donated; subreq->start -= prev_donated; @@ -157,7 +157,7 @@ donation_changed: next_donated = subreq->next_donated = 0; } trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations); - spin_unlock_bh(&rreq->lock); + spin_unlock(&rreq->lock); } avail = subreq->transferred; @@ -186,18 +186,18 @@ donation_changed: } else if (fpos < 
start) { excess = fend - subreq->start; - spin_lock_bh(&rreq->lock); + spin_lock(&rreq->lock); /* If we complete first on a folio split with the * preceding subreq, donate to that subreq - otherwise * we get the responsibility. */ if (subreq->prev_donated != prev_donated) { - spin_unlock_bh(&rreq->lock); + spin_unlock(&rreq->lock); goto donation_changed; } if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) { - spin_unlock_bh(&rreq->lock); + spin_unlock(&rreq->lock); pr_err("Can't donate prior to front\n"); goto bad; } @@ -213,7 +213,7 @@ donation_changed: if (subreq->consumed >= subreq->len) goto remove_subreq_locked; - spin_unlock_bh(&rreq->lock); + spin_unlock(&rreq->lock); } else { pr_err("fpos > start\n"); goto bad; @@ -241,11 +241,11 @@ donation_changed: /* Donate the remaining downloaded data to one of the neighbouring * subrequests. Note that we may race with them doing the same thing. */ - spin_lock_bh(&rreq->lock); + spin_lock(&rreq->lock); if (subreq->prev_donated != prev_donated || subreq->next_donated != next_donated) { - spin_unlock_bh(&rreq->lock); + spin_unlock(&rreq->lock); cond_resched(); goto donation_changed; } @@ -296,11 +296,11 @@ donation_changed: goto remove_subreq_locked; remove_subreq: - spin_lock_bh(&rreq->lock); + spin_lock(&rreq->lock); remove_subreq_locked: subreq->consumed = subreq->len; list_del(&subreq->rreq_link); - spin_unlock_bh(&rreq->lock); + spin_unlock(&rreq->lock); netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed); return true; diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index a2021efa44c0..a33bd06e80f8 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -142,12 +142,12 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); subreq->retry_count++; - spin_lock_bh(&rreq->lock); + spin_lock(&rreq->lock); list_add_tail(&subreq->rreq_link, &rreq->subrequests); subreq->prev_donated += rreq->prev_donated; rreq->prev_donated = 0; trace_netfs_sreq(subreq, netfs_sreq_trace_retry); - spin_unlock_bh(&rreq->lock); + spin_unlock(&rreq->lock); BUG_ON(!len); @@ -217,9 +217,9 @@ abandon: __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); } - spin_lock_bh(&rreq->lock); + spin_lock(&rreq->lock); list_splice_tail_init(&queue, &rreq->subrequests); - spin_unlock_bh(&rreq->lock); + spin_unlock(&rreq->lock); } /* diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 237018caba27..f026cbc0e2fe 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -238,14 +238,14 @@ reassess_streams: cancel: /* Remove if completely consumed. */ - spin_lock_bh(&wreq->lock); + spin_lock(&wreq->lock); remove = front; list_del_init(&front->rreq_link); front = list_first_entry_or_null(&stream->subrequests, struct netfs_io_subrequest, rreq_link); stream->front = front; - spin_unlock_bh(&wreq->lock); + spin_unlock(&wreq->lock); netfs_put_subrequest(remove, false, notes & SAW_FAILURE ? netfs_sreq_trace_put_cancel : diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 7a14a48e62ee..286bc2aa3ca0 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -203,7 +203,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, * the list. The collector only goes nextwards and uses the lock to * remove entries off of the front. 
*/ - spin_lock_bh(&wreq->lock); + spin_lock(&wreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { stream->front = subreq; @@ -214,7 +214,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, } } - spin_unlock_bh(&wreq->lock); + spin_unlock(&wreq->lock); stream->construct = subreq; } -- cgit v1.2.3 From 6e0b503dc65c89b83fbfafb4dac5201c844da1de Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:01 +0000 Subject: afs: Don't use mutex for I/O operation lock Don't use the standard mutex for the I/O operation lock, but rather implement our own as the standard mutex must be released in the same thread as locked it. This is a problem when it comes to doing async FetchData where the lock will be dropped from the workqueue that processed the incoming data and not from the issuing thread. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-12-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/fs_operation.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++--- fs/afs/internal.h | 3 +- fs/afs/super.c | 2 +- 3 files changed, 108 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 428721bbe4f6..0175d7a31332 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -49,6 +49,105 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo return op; } +struct afs_io_locker { + struct list_head link; + struct task_struct *task; + unsigned long have_lock; +}; + +/* + * Unlock the I/O lock on a vnode. + */ +static void afs_unlock_for_io(struct afs_vnode *vnode) +{ + struct afs_io_locker *locker; + + spin_lock(&vnode->lock); + locker = list_first_entry_or_null(&vnode->io_lock_waiters, + struct afs_io_locker, link); + if (locker) { + list_del(&locker->link); + smp_store_release(&locker->have_lock, 1); /* The unlock barrier. */ + smp_mb__after_atomic(); /* Store have_lock before task state */ + wake_up_process(locker->task); + } else { + clear_bit(AFS_VNODE_IO_LOCK, &vnode->flags); + } + spin_unlock(&vnode->lock); +} + +/* + * Lock the I/O lock on a vnode uninterruptibly. We can't use an ordinary + * mutex as lockdep will complain if we unlock it in the wrong thread. + */ +static void afs_lock_for_io(struct afs_vnode *vnode) +{ + struct afs_io_locker myself = { .task = current, }; + + spin_lock(&vnode->lock); + + if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) { + spin_unlock(&vnode->lock); + return; + } + + list_add_tail(&myself.link, &vnode->io_lock_waiters); + spin_unlock(&vnode->lock); + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (smp_load_acquire(&myself.have_lock)) /* The lock barrier */ + break; + schedule(); + } + __set_current_state(TASK_RUNNING); +} + +/* + * Lock the I/O lock on a vnode interruptibly. We can't use an ordinary mutex + * as lockdep will complain if we unlock it in the wrong thread. 
+ */ +static int afs_lock_for_io_interruptible(struct afs_vnode *vnode) +{ + struct afs_io_locker myself = { .task = current, }; + int ret = 0; + + spin_lock(&vnode->lock); + + if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) { + spin_unlock(&vnode->lock); + return 0; + } + + list_add_tail(&myself.link, &vnode->io_lock_waiters); + spin_unlock(&vnode->lock); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (smp_load_acquire(&myself.have_lock) || /* The lock barrier */ + signal_pending(current)) + break; + schedule(); + } + __set_current_state(TASK_RUNNING); + + /* If we got a signal, try to transfer the lock onto the next + * waiter. + */ + if (unlikely(signal_pending(current))) { + spin_lock(&vnode->lock); + if (myself.have_lock) { + spin_unlock(&vnode->lock); + afs_unlock_for_io(vnode); + } else { + list_del(&myself.link); + spin_unlock(&vnode->lock); + } + ret = -ERESTARTSYS; + } + return ret; +} + /* * Lock the vnode(s) being operated upon. */ @@ -60,7 +159,7 @@ static bool afs_get_io_locks(struct afs_operation *op) _enter(""); if (op->flags & AFS_OPERATION_UNINTR) { - mutex_lock(&vnode->io_lock); + afs_lock_for_io(vnode); op->flags |= AFS_OPERATION_LOCK_0; _leave(" = t [1]"); return true; @@ -72,7 +171,7 @@ static bool afs_get_io_locks(struct afs_operation *op) if (vnode2 > vnode) swap(vnode, vnode2); - if (mutex_lock_interruptible(&vnode->io_lock) < 0) { + if (afs_lock_for_io_interruptible(vnode) < 0) { afs_op_set_error(op, -ERESTARTSYS); op->flags |= AFS_OPERATION_STOP; _leave(" = f [I 0]"); @@ -81,10 +180,10 @@ static bool afs_get_io_locks(struct afs_operation *op) op->flags |= AFS_OPERATION_LOCK_0; if (vnode2) { - if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) { + if (afs_lock_for_io_interruptible(vnode2) < 0) { afs_op_set_error(op, -ERESTARTSYS); op->flags |= AFS_OPERATION_STOP; - mutex_unlock(&vnode->io_lock); + afs_unlock_for_io(vnode); op->flags &= ~AFS_OPERATION_LOCK_0; _leave(" = f [I 1]"); return false; @@ -104,9 +203,9 @@ static void afs_drop_io_locks(struct afs_operation *op) _enter(""); if (op->flags & AFS_OPERATION_LOCK_1) - mutex_unlock(&vnode2->io_lock); + afs_unlock_for_io(vnode2); if (op->flags & AFS_OPERATION_LOCK_0) - mutex_unlock(&vnode->io_lock); + afs_unlock_for_io(vnode); } static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param *vp, diff --git a/fs/afs/internal.h b/fs/afs/internal.h index c9d620175e80..07b8f7083e73 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -702,13 +702,14 @@ struct afs_vnode { struct afs_file_status status; /* AFS status info for this file */ afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ - struct mutex io_lock; /* Lock for serialising I/O on this mutex */ + struct list_head io_lock_waiters; /* Threads waiting for the I/O lock */ struct rw_semaphore validate_lock; /* lock for validating this vnode */ struct rw_semaphore rmdir_lock; /* Lock for rmdir vs sillyrename */ struct key *silly_key; /* Silly rename key */ spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; +#define AFS_VNODE_IO_LOCK 0 /* Set if the I/O serialisation lock is held */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ #define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ diff --git a/fs/afs/super.c b/fs/afs/super.c index 
f3ba1c3e72f5..7631302c1984 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -663,7 +663,7 @@ static void afs_i_init_once(void *_vnode) memset(vnode, 0, sizeof(*vnode)); inode_init_once(&vnode->netfs.inode); - mutex_init(&vnode->io_lock); + INIT_LIST_HEAD(&vnode->io_lock_waiters); init_rwsem(&vnode->validate_lock); spin_lock_init(&vnode->wb_lock); spin_lock_init(&vnode->lock); -- cgit v1.2.3 From b49194da2aff2c879dec9c59ef8dec0f2b0809ef Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:02 +0000 Subject: afs: Fix EEXIST error returned from afs_rmdir() to be ENOTEMPTY AFS servers pass back a code indicating EEXIST when they're asked to remove a directory that is not empty rather than ENOTEMPTY because not all the systems that an AFS server can run on have the latter error available and AFS preexisted the addition of that error in general. Fix afs_rmdir() to translate EEXIST to ENOTEMPTY. Fixes: 260a980317da ("[AFS]: Add "directory write" support.") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-13-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/dir.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index ada363af5aab..50edd1cae28a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1472,7 +1472,12 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) op->file[1].vnode = vnode; } - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + + /* Not all systems that can host afs servers have ENOTEMPTY. */ + if (ret == -EEXIST) + ret = -ENOTEMPTY; + return ret; error: return afs_put_operation(op); -- cgit v1.2.3 From 07a10767853adcbdbf436dc91393b729b52c4e81 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:03 +0000 Subject: afs: Fix directory format encoding struct The AFS directory format structure, union afs_xdr_dir_block::meta, has too many alloc counter slots declared and so pushes the hash table along and over the data. This doesn't cause a problem at the moment because I'm currently ignoring the hash table and only using the correct number of alloc_ctrs in the code anyway. In future, however, I should start using the hash table to try and speed up afs_lookup(). Fix this by using the correct constant to declare the counter array. Fixes: 4ea219a839bf ("afs: Split the directory content defs into a header") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-14-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/xdr_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/xdr_fs.h b/fs/afs/xdr_fs.h index 8ca868164507..cc5f143d21a3 100644 --- a/fs/afs/xdr_fs.h +++ b/fs/afs/xdr_fs.h @@ -88,7 +88,7 @@ union afs_xdr_dir_block { struct { struct afs_xdr_dir_hdr hdr; - u8 alloc_ctrs[AFS_DIR_MAX_BLOCKS]; + u8 alloc_ctrs[AFS_DIR_BLOCKS_WITH_CTR]; __be16 hashtable[AFS_DIR_HASHTBL_SIZE]; } meta; -- cgit v1.2.3 From 30f878fa0fac932ea811303aacf73c6a44b12bd2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:04 +0000 Subject: netfs: Remove some extraneous directory invalidations In the directory editing code, we shouldn't re-invalidate the directory if it is already invalidated. 
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-15-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/dir_edit.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index fe223fb78111..13fb236a3f50 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -247,7 +247,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, */ index = b / AFS_DIR_BLOCKS_PER_PAGE; if (nr_blocks >= AFS_DIR_MAX_BLOCKS) - goto error; + goto error_too_many_blocks; if (index >= folio_nr_pages(folio0)) { folio = afs_dir_get_folio(vnode, index); if (!folio) @@ -260,7 +260,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, /* Abandon the edit if we got a callback break. */ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - goto invalidated; + goto already_invalidated; _debug("block %u: %2u %3u %u", b, @@ -348,9 +348,8 @@ out_unmap: _leave(""); return; -invalidated: +already_invalidated: trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); kunmap_local(block); if (folio != folio0) { folio_unlock(folio); @@ -358,9 +357,10 @@ invalidated: } goto out_unmap; +error_too_many_blocks: + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); error: trace_afs_edit_dir(vnode, why, afs_edit_dir_create_error, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); goto out_unmap; } @@ -421,7 +421,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, /* Abandon the edit if we got a callback break. */ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - goto invalidated; + goto already_invalidated; if (b > AFS_DIR_BLOCKS_WITH_CTR || meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) { @@ -475,10 +475,9 @@ out_unmap: _leave(""); return; -invalidated: +already_invalidated: trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); kunmap_local(block); if (folio != folio0) { folio_unlock(folio); @@ -489,7 +488,6 @@ invalidated: error: trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_error, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); goto out_unmap; } @@ -530,7 +528,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d /* Abandon the edit if we got a callback break. */ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - goto invalidated; + goto already_invalidated; slot = afs_dir_scan_block(block, &dotdot_name, b); if (slot >= 0) @@ -564,18 +562,16 @@ out: _leave(""); return; -invalidated: +already_invalidated: kunmap_local(block); folio_unlock(folio); folio_put(folio); trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval, 0, 0, 0, 0, ".."); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); goto out; error: trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error, 0, 0, 0, 0, ".."); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); goto out; } -- cgit v1.2.3 From bcb33f79e15d0e4dc4b86106ceb01d64bfab9e35 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:05 +0000 Subject: cachefiles: Add some subrequest tracepoints Add some tracepoints into the cachefiles write paths. 
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-16-dhowells@redhat.com cc: netfs@lists.linux.dev Signed-off-by: Christian Brauner --- fs/cachefiles/io.c | 4 ++++ include/trace/events/netfs.h | 3 +++ 2 files changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 6a821a959b59..92058ae43488 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "internal.h" struct cachefiles_kiocb { @@ -366,6 +367,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres, if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) { if (term_func) term_func(term_func_priv, -ENOBUFS, false); + trace_netfs_sreq(term_func_priv, netfs_sreq_trace_cache_nowrite); return -ENOBUFS; } @@ -695,6 +697,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) iov_iter_truncate(&subreq->io_iter, len); } + trace_netfs_sreq(subreq, netfs_sreq_trace_cache_prepare); cachefiles_begin_secure(cache, &saved_cred); ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres), &start, &len, len, true); @@ -704,6 +707,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) return; } + trace_netfs_sreq(subreq, netfs_sreq_trace_cache_write); cachefiles_write(&subreq->rreq->cache_resources, subreq->start, &subreq->io_iter, netfs_write_subrequest_terminated, subreq); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 2dfc9f716e3b..02f6e179b7bc 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -74,6 +74,9 @@ #define netfs_sreq_traces \ EM(netfs_sreq_trace_add_donations, "+DON ") \ EM(netfs_sreq_trace_added, "ADD ") \ + EM(netfs_sreq_trace_cache_nowrite, "CA-NW") \ + EM(netfs_sreq_trace_cache_prepare, "CA-PR") \ + EM(netfs_sreq_trace_cache_write, "CA-WR") \ EM(netfs_sreq_trace_clear, "CLEAR") \ EM(netfs_sreq_trace_discard, "DSCRD") \ EM(netfs_sreq_trace_donate_to_prev, "DON-P") \ -- cgit v1.2.3 From 229105e5cfd9832a9ef1368c96e0098ec3a5fbf0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:06 +0000 Subject: cachefiles: Add auxiliary data trace Add a display of the first 8 bytes of the downloaded auxiliary data and of the on-disk stored auxiliary data as these are used in coherency management. In the case of afs, this holds the data version number. 
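The values are decoded the same way the tracepoint does it: the first 8
bytes of the aux blob are read as a big-endian 64-bit number (the
be64_to_cpup() calls in the patch below).  A minimal userspace sketch of
that decoding, assuming the blob layout described above, would be:

	#include <stdint.h>
	#include <string.h>
	#include <endian.h>

	/* Interpret the first 8 bytes of an aux blob as the (afs) data
	 * version, mirroring the be64 conversion used by the tracepoint.
	 */
	static uint64_t aux_to_data_version(const unsigned char aux[8])
	{
		uint64_t be;

		memcpy(&be, aux, sizeof(be));
		return be64toh(be);
	}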
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-17-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/cachefiles/xattr.c | 9 ++++++++- include/trace/events/cachefiles.h | 13 ++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 7c6f260a3be5..52383b1d0ba6 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -77,6 +77,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object) trace_cachefiles_vfs_error(object, file_inode(file), ret, cachefiles_trace_setxattr_error); trace_cachefiles_coherency(object, file_inode(file)->i_ino, + be64_to_cpup((__be64 *)buf->data), buf->content, cachefiles_coherency_set_fail); if (ret != -ENOMEM) @@ -85,6 +86,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object) "Failed to set xattr with error %d", ret); } else { trace_cachefiles_coherency(object, file_inode(file)->i_ino, + be64_to_cpup((__be64 *)buf->data), buf->content, cachefiles_coherency_set_ok); } @@ -126,7 +128,10 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file object, "Failed to read aux with error %zd", xlen); why = cachefiles_coherency_check_xattr; - } else if (buf->type != CACHEFILES_COOKIE_TYPE_DATA) { + goto out; + } + + if (buf->type != CACHEFILES_COOKIE_TYPE_DATA) { why = cachefiles_coherency_check_type; } else if (memcmp(buf->data, p, len) != 0) { why = cachefiles_coherency_check_aux; @@ -141,7 +146,9 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file ret = 0; } +out: trace_cachefiles_coherency(object, file_inode(file)->i_ino, + be64_to_cpup((__be64 *)buf->data), buf->content, why); kfree(buf); return ret; diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index 74114c261bcd..a743b2a35ea7 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -380,10 +380,11 @@ TRACE_EVENT(cachefiles_rename, TRACE_EVENT(cachefiles_coherency, TP_PROTO(struct cachefiles_object *obj, ino_t ino, + u64 disk_aux, enum cachefiles_content content, enum cachefiles_coherency_trace why), - TP_ARGS(obj, ino, content, why), + TP_ARGS(obj, ino, disk_aux, content, why), /* Note that obj may be NULL */ TP_STRUCT__entry( @@ -391,6 +392,8 @@ TRACE_EVENT(cachefiles_coherency, __field(enum cachefiles_coherency_trace, why) __field(enum cachefiles_content, content) __field(u64, ino) + __field(u64, aux) + __field(u64, disk_aux) ), TP_fast_assign( @@ -398,13 +401,17 @@ TRACE_EVENT(cachefiles_coherency, __entry->why = why; __entry->content = content; __entry->ino = ino; + __entry->aux = be64_to_cpup((__be64 *)obj->cookie->inline_aux); + __entry->disk_aux = disk_aux; ), - TP_printk("o=%08x %s B=%llx c=%u", + TP_printk("o=%08x %s B=%llx c=%u aux=%llx dsk=%llx", __entry->obj, __print_symbolic(__entry->why, cachefiles_coherency_traces), __entry->ino, - __entry->content) + __entry->content, + __entry->aux, + __entry->disk_aux) ); TRACE_EVENT(cachefiles_vol_coherency, -- cgit v1.2.3 From 9e705016eb8f3d4a58f2000e560ea2c7517e081b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:07 +0000 Subject: afs: Add more tracepoints to do with tracking validity Add wrappers to set and clear the callback promise and to mark a directory as invalidated, and add tracepoints to track these events: (1) afs_cb_promise: Log when a callback promise is set 
on a vnode. (2) afs_vnode_invalid: Log when the server's callback promise for a vnode is no longer valid and we need to refetch the vnode metadata. (3) afs_dir_invalid: Log when the contents of a directory are marked invalid and requiring refetching from the server and the cache invalidating. and two tracepoints to record data version number management: (4) afs_set_dv: Log when the DV is recorded on a vnode. (5) afs_dv_mismatch: Log when the DV recorded on a vnode plus the expected delta for the operation does not match the DV we got back from the server. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-18-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/callback.c | 4 +- fs/afs/dir.c | 14 ++-- fs/afs/dir_edit.c | 16 ++--- fs/afs/inode.c | 23 +++--- fs/afs/internal.h | 32 +++++++++ fs/afs/rotate.c | 4 +- fs/afs/validation.c | 31 ++++++--- include/trace/events/afs.h | 169 +++++++++++++++++++++++++++++++++++++++++++-- 8 files changed, 248 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 99b2c8172021..69e1dd55b160 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -41,7 +41,7 @@ static void afs_volume_init_callback(struct afs_volume *volume) list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) { if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) { - atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); + afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb); queue_work(system_unbound_wq, &vnode->cb_work); } } @@ -79,7 +79,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas _enter(""); clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) { + if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_cb_break)) { vnode->cb_break++; vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); afs_clear_permits(vnode); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 50edd1cae28a..f36a28a8f27b 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -324,8 +324,7 @@ expand: folio = filemap_get_folio(mapping, i); if (IS_ERR(folio)) { - if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - afs_stat_v(dvnode, n_inval); + afs_invalidate_dir(dvnode, afs_dir_invalid_reclaimed_folio); folio = __filemap_get_folio(mapping, i, FGP_LOCK | FGP_CREAT, mapping->gfp_mask); @@ -1388,8 +1387,8 @@ static void afs_dir_remove_subdir(struct dentry *dentry) clear_nlink(&vnode->netfs.inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); - atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_clear_cb_promise(vnode, afs_cb_promise_clear_rmdir); + afs_invalidate_dir(vnode, afs_dir_invalid_subdir_removed); } } @@ -1851,6 +1850,7 @@ static void afs_rename_success(struct afs_operation *op) write_seqlock(&vnode->cb_lock); new_dv = vnode->status.data_version + 1; + trace_afs_set_dv(vnode, new_dv); vnode->status.data_version = new_dv; inode_set_iversion_raw(&vnode->netfs.inode, new_dv); @@ -2063,8 +2063,7 @@ static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags) folio_detach_private(folio); /* The directory will need reloading. 
*/ - if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - afs_stat_v(dvnode, n_relpg); + afs_invalidate_dir(dvnode, afs_dir_invalid_release_folio); return true; } @@ -2081,8 +2080,7 @@ static void afs_dir_invalidate_folio(struct folio *folio, size_t offset, BUG_ON(!folio_test_locked(folio)); /* The directory will need reloading. */ - if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - afs_stat_v(dvnode, n_inval); + afs_invalidate_dir(dvnode, afs_dir_invalid_inval_folio); /* we clean up only if the entire folio is being invalidated */ if (offset == 0 && length == folio_size(folio)) diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 13fb236a3f50..5d092c8c0157 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -116,7 +116,7 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping->gfp_mask); if (IS_ERR(folio)) { - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_get_block); return NULL; } if (!folio_test_private(folio)) @@ -220,7 +220,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, i_size = i_size_read(&vnode->netfs.inode); if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_bad_size); return; } @@ -299,7 +299,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, * succeeded. Download the directory again. */ trace_afs_edit_dir(vnode, why, afs_edit_dir_create_nospc, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_no_slots); goto out_unmap; new_directory: @@ -358,7 +358,7 @@ already_invalidated: goto out_unmap; error_too_many_blocks: - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_too_many_blocks); error: trace_afs_edit_dir(vnode, why, afs_edit_dir_create_error, 0, 0, 0, 0, name->name); goto out_unmap; @@ -388,7 +388,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, if (i_size < AFS_DIR_BLOCK_SIZE || i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_bad_size); return; } nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; @@ -440,7 +440,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, /* Didn't find the dirent to clobber. Download the directory again. */ trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_wrong_name); goto out_unmap; found_dirent: @@ -510,7 +510,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d i_size = i_size_read(&vnode->netfs.inode); if (i_size < AFS_DIR_BLOCK_SIZE) { - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_bad_size); return; } nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; @@ -542,7 +542,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d /* Didn't find the dirent to clobber. Download the directory again. 
*/ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd, 0, 0, 0, 0, ".."); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd); goto out; found_dirent: diff --git a/fs/afs/inode.c b/fs/afs/inode.c index a95e77670b49..495ecef91679 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -140,15 +140,17 @@ static int afs_inode_init_from_status(struct afs_operation *op, afs_set_netfs_context(vnode); vnode->invalid_before = status->data_version; + trace_afs_set_dv(vnode, status->data_version); inode_set_iversion_raw(&vnode->netfs.inode, status->data_version); if (!vp->scb.have_cb) { /* it's a symlink we just created (the fileserver * didn't give us a callback) */ - atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); + afs_clear_cb_promise(vnode, afs_cb_promise_set_new_symlink); } else { vnode->cb_server = op->server; - atomic64_set(&vnode->cb_expires_at, vp->scb.callback.expires_at); + afs_set_cb_promise(vnode, vp->scb.callback.expires_at, + afs_cb_promise_set_new_inode); } write_sequnlock(&vnode->cb_lock); @@ -207,12 +209,17 @@ static void afs_apply_status(struct afs_operation *op, if (vp->update_ctime) inode_set_ctime_to_ts(inode, op->ctime); - if (vnode->status.data_version != status->data_version) + if (vnode->status.data_version != status->data_version) { + trace_afs_set_dv(vnode, status->data_version); data_changed = true; + } vnode->status = *status; if (vp->dv_before + vp->dv_delta != status->data_version) { + trace_afs_dv_mismatch(vnode, vp->dv_before, vp->dv_delta, + status->data_version); + if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) && atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE) pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n", @@ -223,12 +230,10 @@ static void afs_apply_status(struct afs_operation *op, op->debug_id); vnode->invalid_before = status->data_version; - if (vnode->status.type == AFS_FTYPE_DIR) { - if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - afs_stat_v(vnode, n_inval); - } else { + if (vnode->status.type == AFS_FTYPE_DIR) + afs_invalidate_dir(vnode, afs_dir_invalid_dv_mismatch); + else set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); - } change_size = true; data_changed = true; unexpected_jump = true; @@ -273,7 +278,7 @@ static void afs_apply_callback(struct afs_operation *op, if (!afs_cb_is_broken(vp->cb_break_before, vnode)) { if (op->volume->type == AFSVL_RWVOL) vnode->cb_server = op->server; - atomic64_set(&vnode->cb_expires_at, cb->expires_at); + afs_set_cb_promise(vnode, cb->expires_at, afs_cb_promise_set_apply_cb); } } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 07b8f7083e73..20d2f723948d 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1713,6 +1713,38 @@ static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where) return -EIO; } +/* + * Set the callback promise on a vnode. + */ +static inline void afs_set_cb_promise(struct afs_vnode *vnode, time64_t expires_at, + enum afs_cb_promise_trace trace) +{ + atomic64_set(&vnode->cb_expires_at, expires_at); + trace_afs_cb_promise(vnode, trace); +} + +/* + * Clear the callback promise on a vnode, returning true if it was promised. + */ +static inline bool afs_clear_cb_promise(struct afs_vnode *vnode, + enum afs_cb_promise_trace trace) +{ + trace_afs_cb_promise(vnode, trace); + return atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE; +} + +/* + * Mark a directory as being invalid. 
+ */ +static inline void afs_invalidate_dir(struct afs_vnode *dvnode, + enum afs_dir_invalid_trace trace) +{ + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { + trace_afs_dir_invalid(dvnode, trace); + afs_stat_v(dvnode, n_inval); + } +} + /*****************************************************************************/ /* * debug tracing diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index d612983d6f38..a1c24f589d9e 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -99,7 +99,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op, write_seqlock(&vnode->cb_lock); ASSERTCMP(cb_server, ==, vnode->cb_server); vnode->cb_server = NULL; - if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) + if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_rotate_server)) vnode->cb_break++; write_sequnlock(&vnode->cb_lock); } @@ -583,7 +583,7 @@ selected_server: if (vnode->cb_server != server) { vnode->cb_server = server; vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); - atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); + afs_clear_cb_promise(vnode, afs_cb_promise_clear_server_change); } retry_server: diff --git a/fs/afs/validation.c b/fs/afs/validation.c index bef8af12ebe2..0ba8336c9025 100644 --- a/fs/afs/validation.c +++ b/fs/afs/validation.c @@ -120,22 +120,31 @@ bool afs_check_validity(const struct afs_vnode *vnode) { const struct afs_volume *volume = vnode->volume; + enum afs_vnode_invalid_trace trace = afs_vnode_valid_trace; + time64_t cb_expires_at = atomic64_read(&vnode->cb_expires_at); time64_t deadline = ktime_get_real_seconds() + 10; if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) return true; - if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || - atomic64_read(&vnode->cb_expires_at) <= deadline || - volume->cb_expires_at <= deadline || - vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) || - vnode->cb_scrub != atomic_read(&volume->cb_scrub) || - test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { - _debug("inval"); - return false; - } - - return true; + if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) + trace = afs_vnode_invalid_trace_cb_v_break; + else if (cb_expires_at == AFS_NO_CB_PROMISE) + trace = afs_vnode_invalid_trace_no_cb_promise; + else if (cb_expires_at <= deadline) + trace = afs_vnode_invalid_trace_expired; + else if (volume->cb_expires_at <= deadline) + trace = afs_vnode_invalid_trace_vol_expired; + else if (vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot)) + trace = afs_vnode_invalid_trace_cb_ro_snapshot; + else if (vnode->cb_scrub != atomic_read(&volume->cb_scrub)) + trace = afs_vnode_invalid_trace_cb_scrub; + else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) + trace = afs_vnode_invalid_trace_zap_data; + else + return true; + trace_afs_vnode_invalid(vnode, trace); + return false; } /* diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index a0aed1a428a1..7cb5583efb91 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -323,6 +323,43 @@ enum yfs_cm_operation { EM(yfs_CB_TellMeAboutYourself, "YFSCB.TellMeAboutYourself") \ E_(yfs_CB_CallBack, "YFSCB.CallBack") +#define afs_cb_promise_traces \ + EM(afs_cb_promise_clear_cb_break, "CLEAR cb-break") \ + EM(afs_cb_promise_clear_rmdir, "CLEAR rmdir") \ + EM(afs_cb_promise_clear_rotate_server, "CLEAR rot-srv") \ + EM(afs_cb_promise_clear_server_change, "CLEAR srv-chg") \ + EM(afs_cb_promise_clear_vol_init_cb, "CLEAR vol-init-cb") \ + 
EM(afs_cb_promise_set_apply_cb, "SET apply-cb") \ + EM(afs_cb_promise_set_new_inode, "SET new-inode") \ + E_(afs_cb_promise_set_new_symlink, "SET new-symlink") + +#define afs_vnode_invalid_traces \ + EM(afs_vnode_invalid_trace_cb_ro_snapshot, "cb-ro-snapshot") \ + EM(afs_vnode_invalid_trace_cb_scrub, "cb-scrub") \ + EM(afs_vnode_invalid_trace_cb_v_break, "cb-v-break") \ + EM(afs_vnode_invalid_trace_expired, "expired") \ + EM(afs_vnode_invalid_trace_no_cb_promise, "no-cb-promise") \ + EM(afs_vnode_invalid_trace_vol_expired, "vol-expired") \ + EM(afs_vnode_invalid_trace_zap_data, "zap-data") \ + E_(afs_vnode_valid_trace, "valid") + +#define afs_dir_invalid_traces \ + EM(afs_dir_invalid_edit_add_bad_size, "edit-add-bad-size") \ + EM(afs_dir_invalid_edit_add_no_slots, "edit-add-no-slots") \ + EM(afs_dir_invalid_edit_add_too_many_blocks, "edit-add-too-many-blocks") \ + EM(afs_dir_invalid_edit_get_block, "edit-get-block") \ + EM(afs_dir_invalid_edit_rem_bad_size, "edit-rem-bad-size") \ + EM(afs_dir_invalid_edit_rem_wrong_name, "edit-rem-wrong_name") \ + EM(afs_dir_invalid_edit_upd_bad_size, "edit-upd-bad-size") \ + EM(afs_dir_invalid_edit_upd_no_dd, "edit-upd-no-dotdot") \ + EM(afs_dir_invalid_dv_mismatch, "dv-mismatch") \ + EM(afs_dir_invalid_inval_folio, "inv-folio") \ + EM(afs_dir_invalid_iter_stale, "iter-stale") \ + EM(afs_dir_invalid_reclaimed_folio, "reclaimed-folio") \ + EM(afs_dir_invalid_release_folio, "rel-folio") \ + EM(afs_dir_invalid_remote, "remote") \ + E_(afs_dir_invalid_subdir_removed, "subdir-removed") + #define afs_edit_dir_ops \ EM(afs_edit_dir_create, "create") \ EM(afs_edit_dir_create_error, "c_fail") \ @@ -487,7 +524,9 @@ enum yfs_cm_operation { enum afs_alist_trace { afs_alist_traces } __mode(byte); enum afs_call_trace { afs_call_traces } __mode(byte); enum afs_cb_break_reason { afs_cb_break_reasons } __mode(byte); +enum afs_cb_promise_trace { afs_cb_promise_traces } __mode(byte); enum afs_cell_trace { afs_cell_traces } __mode(byte); +enum afs_dir_invalid_trace { afs_dir_invalid_traces} __mode(byte); enum afs_edit_dir_op { afs_edit_dir_ops } __mode(byte); enum afs_edit_dir_reason { afs_edit_dir_reasons } __mode(byte); enum afs_eproto_cause { afs_eproto_causes } __mode(byte); @@ -498,6 +537,7 @@ enum afs_flock_operation { afs_flock_operations } __mode(byte); enum afs_io_error { afs_io_errors } __mode(byte); enum afs_rotate_trace { afs_rotate_traces } __mode(byte); enum afs_server_trace { afs_server_traces } __mode(byte); +enum afs_vnode_invalid_trace { afs_vnode_invalid_traces} __mode(byte); enum afs_volume_trace { afs_volume_traces } __mode(byte); #endif /* end __AFS_GENERATE_TRACE_ENUMS_ONCE_ONLY */ @@ -513,8 +553,10 @@ enum afs_volume_trace { afs_volume_traces } __mode(byte); afs_alist_traces; afs_call_traces; afs_cb_break_reasons; +afs_cb_promise_traces; afs_cell_traces; afs_cm_operations; +afs_dir_invalid_traces; afs_edit_dir_ops; afs_edit_dir_reasons; afs_eproto_causes; @@ -526,6 +568,7 @@ afs_fs_operations; afs_io_errors; afs_rotate_traces; afs_server_traces; +afs_vnode_invalid_traces; afs_vl_operations; yfs_cm_operations; @@ -670,7 +713,7 @@ TRACE_EVENT(afs_make_fs_call, } ), - TP_printk("c=%08x %06llx:%06llx:%06x %s", + TP_printk("c=%08x V=%llx i=%llx:%x %s", __entry->call, __entry->fid.vid, __entry->fid.vnode, @@ -704,7 +747,7 @@ TRACE_EVENT(afs_make_fs_calli, } ), - TP_printk("c=%08x %06llx:%06llx:%06x %s i=%u", + TP_printk("c=%08x V=%llx i=%llx:%x %s i=%u", __entry->call, __entry->fid.vid, __entry->fid.vnode, @@ -741,7 +784,7 @@ TRACE_EVENT(afs_make_fs_call1, 
__entry->name[__len] = 0; ), - TP_printk("c=%08x %06llx:%06llx:%06x %s \"%s\"", + TP_printk("c=%08x V=%llx i=%llx:%x %s \"%s\"", __entry->call, __entry->fid.vid, __entry->fid.vnode, @@ -782,7 +825,7 @@ TRACE_EVENT(afs_make_fs_call2, __entry->name2[__len2] = 0; ), - TP_printk("c=%08x %06llx:%06llx:%06x %s \"%s\" \"%s\"", + TP_printk("c=%08x V=%llx i=%llx:%x %s \"%s\" \"%s\"", __entry->call, __entry->fid.vid, __entry->fid.vnode, @@ -1002,7 +1045,7 @@ TRACE_EVENT(afs_edit_dir, __entry->name[__len] = 0; ), - TP_printk("d=%x:%x %s %s %u[%u] f=%x:%x \"%s\"", + TP_printk("di=%x:%x %s %s %u[%u] fi=%x:%x \"%s\"", __entry->vnode, __entry->unique, __print_symbolic(__entry->why, afs_edit_dir_reasons), __print_symbolic(__entry->op, afs_edit_dir_ops), @@ -1011,6 +1054,122 @@ TRACE_EVENT(afs_edit_dir, __entry->name) ); +TRACE_EVENT(afs_dir_invalid, + TP_PROTO(const struct afs_vnode *dvnode, enum afs_dir_invalid_trace trace), + + TP_ARGS(dvnode, trace), + + TP_STRUCT__entry( + __field(unsigned int, vnode) + __field(unsigned int, unique) + __field(enum afs_dir_invalid_trace, trace) + ), + + TP_fast_assign( + __entry->vnode = dvnode->fid.vnode; + __entry->unique = dvnode->fid.unique; + __entry->trace = trace; + ), + + TP_printk("di=%x:%x %s", + __entry->vnode, __entry->unique, + __print_symbolic(__entry->trace, afs_dir_invalid_traces)) + ); + +TRACE_EVENT(afs_cb_promise, + TP_PROTO(const struct afs_vnode *vnode, enum afs_cb_promise_trace trace), + + TP_ARGS(vnode, trace), + + TP_STRUCT__entry( + __field(unsigned int, vnode) + __field(unsigned int, unique) + __field(enum afs_cb_promise_trace, trace) + ), + + TP_fast_assign( + __entry->vnode = vnode->fid.vnode; + __entry->unique = vnode->fid.unique; + __entry->trace = trace; + ), + + TP_printk("di=%x:%x %s", + __entry->vnode, __entry->unique, + __print_symbolic(__entry->trace, afs_cb_promise_traces)) + ); + +TRACE_EVENT(afs_vnode_invalid, + TP_PROTO(const struct afs_vnode *vnode, enum afs_vnode_invalid_trace trace), + + TP_ARGS(vnode, trace), + + TP_STRUCT__entry( + __field(unsigned int, vnode) + __field(unsigned int, unique) + __field(enum afs_vnode_invalid_trace, trace) + ), + + TP_fast_assign( + __entry->vnode = vnode->fid.vnode; + __entry->unique = vnode->fid.unique; + __entry->trace = trace; + ), + + TP_printk("di=%x:%x %s", + __entry->vnode, __entry->unique, + __print_symbolic(__entry->trace, afs_vnode_invalid_traces)) + ); + +TRACE_EVENT(afs_set_dv, + TP_PROTO(const struct afs_vnode *dvnode, u64 new_dv), + + TP_ARGS(dvnode, new_dv), + + TP_STRUCT__entry( + __field(unsigned int, vnode) + __field(unsigned int, unique) + __field(u64, old_dv) + __field(u64, new_dv) + ), + + TP_fast_assign( + __entry->vnode = dvnode->fid.vnode; + __entry->unique = dvnode->fid.unique; + __entry->old_dv = dvnode->status.data_version; + __entry->new_dv = new_dv; + ), + + TP_printk("di=%x:%x dv=%llx -> dv=%llx", + __entry->vnode, __entry->unique, + __entry->old_dv, __entry->new_dv) + ); + +TRACE_EVENT(afs_dv_mismatch, + TP_PROTO(const struct afs_vnode *dvnode, u64 before_dv, int delta, u64 new_dv), + + TP_ARGS(dvnode, before_dv, delta, new_dv), + + TP_STRUCT__entry( + __field(unsigned int, vnode) + __field(unsigned int, unique) + __field(int, delta) + __field(u64, before_dv) + __field(u64, new_dv) + ), + + TP_fast_assign( + __entry->vnode = dvnode->fid.vnode; + __entry->unique = dvnode->fid.unique; + __entry->delta = delta; + __entry->before_dv = before_dv; + __entry->new_dv = new_dv; + ), + + TP_printk("di=%x:%x xdv=%llx+%d dv=%llx", + __entry->vnode, __entry->unique, + 
__entry->before_dv, __entry->delta, __entry->new_dv) + ); + TRACE_EVENT(afs_protocol_error, TP_PROTO(struct afs_call *call, enum afs_eproto_cause cause), -- cgit v1.2.3 From e61bfaad8fd86ac84eac633e0bbaac47a5dfd358 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:08 +0000 Subject: netfs: Add functions to build/clean a buffer in a folio_queue Add two netfslib functions to build up or clean up a buffer in a folio_queue. The first, netfs_alloc_folioq_buffer() will add folios to a buffer, extending up at least to the given size. If it can, it will add multipage folios. The folios are optionally have the mapping set and will have the index set according to the distance from the front of the folio queue. The second function will free up a folio queue and put any folios in the queue that have the first mark set. The netfs_folio tracepoint is also altered to cope with folios that have a NULL mapping, and the folios being added/put will have trace lines emitted and will be accounted in the stats. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-19-dhowells@redhat.com cc: Jeff Layton cc: Marc Dionne cc: netfs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/misc.c | 96 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 6 +++ include/trace/events/netfs.h | 6 +-- 3 files changed, 104 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 4249715f4171..7099aa07737a 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -8,6 +8,102 @@ #include #include "internal.h" +/** + * netfs_alloc_folioq_buffer - Allocate buffer space into a folio queue + * @mapping: Address space to set on the folio (or NULL). + * @_buffer: Pointer to the folio queue to add to (may point to a NULL; updated). + * @_cur_size: Current size of the buffer (updated). + * @size: Target size of the buffer. + * @gfp: The allocation constraints. + */ +int netfs_alloc_folioq_buffer(struct address_space *mapping, + struct folio_queue **_buffer, + size_t *_cur_size, ssize_t size, gfp_t gfp) +{ + struct folio_queue *tail = *_buffer, *p; + + size = round_up(size, PAGE_SIZE); + if (*_cur_size >= size) + return 0; + + if (tail) + while (tail->next) + tail = tail->next; + + do { + struct folio *folio; + int order = 0, slot; + + if (!tail || folioq_full(tail)) { + p = netfs_folioq_alloc(0, GFP_NOFS, netfs_trace_folioq_alloc_buffer); + if (!p) + return -ENOMEM; + if (tail) { + tail->next = p; + p->prev = tail; + } else { + *_buffer = p; + } + tail = p; + } + + if (size - *_cur_size > PAGE_SIZE) + order = umin(ilog2(size - *_cur_size) - PAGE_SHIFT, + MAX_PAGECACHE_ORDER); + + folio = folio_alloc(gfp, order); + if (!folio && order > 0) + folio = folio_alloc(gfp, 0); + if (!folio) + return -ENOMEM; + + folio->mapping = mapping; + folio->index = *_cur_size / PAGE_SIZE; + trace_netfs_folio(folio, netfs_folio_trace_alloc_buffer); + slot = folioq_append_mark(tail, folio); + *_cur_size += folioq_folio_size(tail, slot); + } while (*_cur_size < size); + + return 0; +} +EXPORT_SYMBOL(netfs_alloc_folioq_buffer); + +/** + * netfs_free_folioq_buffer - Free a folio queue. + * @fq: The start of the folio queue to free + * + * Free up a chain of folio_queues and, if marked, the marked folios they point + * to. 
+ */ +void netfs_free_folioq_buffer(struct folio_queue *fq) +{ + struct folio_queue *next; + struct folio_batch fbatch; + + folio_batch_init(&fbatch); + + for (; fq; fq = next) { + for (int slot = 0; slot < folioq_count(fq); slot++) { + struct folio *folio = folioq_folio(fq, slot); + + if (!folio || + !folioq_is_marked(fq, slot)) + continue; + + trace_netfs_folio(folio, netfs_folio_trace_put); + if (folio_batch_add(&fbatch, folio)) + folio_batch_release(&fbatch); + } + + netfs_stat_d(&netfs_n_folioq); + next = fq->next; + kfree(fq); + } + + folio_batch_release(&fbatch); +} +EXPORT_SYMBOL(netfs_free_folioq_buffer); + /* * Reset the subrequest iterator to refer just to the region remaining to be * read. The iterator may or may not have been advanced by socket ops or diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 374e54beacbe..dd737344cff3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -457,6 +457,12 @@ struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp, void netfs_folioq_free(struct folio_queue *folioq, unsigned int trace /*enum netfs_trace_folioq*/); +/* Buffer wrangling helpers API. */ +int netfs_alloc_folioq_buffer(struct address_space *mapping, + struct folio_queue **_buffer, + size_t *_cur_size, ssize_t size, gfp_t gfp); +void netfs_free_folioq_buffer(struct folio_queue *fq); + /** * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 02f6e179b7bc..fc237ff23a33 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -155,6 +155,7 @@ EM(netfs_streaming_filled_page, "mod-streamw-f") \ EM(netfs_streaming_cont_filled_page, "mod-streamw-f+") \ EM(netfs_folio_trace_abandon, "abandon") \ + EM(netfs_folio_trace_alloc_buffer, "alloc-buf") \ EM(netfs_folio_trace_cancel_copy, "cancel-copy") \ EM(netfs_folio_trace_cancel_store, "cancel-store") \ EM(netfs_folio_trace_clear, "clear") \ @@ -195,10 +196,7 @@ E_(netfs_trace_donate_to_deferred_next, "defer-next") #define netfs_folioq_traces \ - EM(netfs_trace_folioq_alloc_append_folio, "alloc-apf") \ - EM(netfs_trace_folioq_alloc_read_prep, "alloc-r-prep") \ - EM(netfs_trace_folioq_alloc_read_prime, "alloc-r-prime") \ - EM(netfs_trace_folioq_alloc_read_sing, "alloc-r-sing") \ + EM(netfs_trace_folioq_alloc_buffer, "alloc-buf") \ EM(netfs_trace_folioq_clear, "clear") \ EM(netfs_trace_folioq_delete, "delete") \ EM(netfs_trace_folioq_make_space, "make-space") \ -- cgit v1.2.3 From 49866ce7ea8d41a3dc198f519cc9caa2d6be1891 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:09 +0000 Subject: netfs: Add support for caching single monolithic objects such as AFS dirs Add support for caching the content of a file that contains a single monolithic object that must be read/written with a single I/O operation, such as an AFS directory. 
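For illustration only, not part of this patch: a filesystem that has set up a netfs context on the inode can pull such an object in by building a folio_queue buffer with netfs_alloc_folioq_buffer() (added earlier in this series) and issuing one read with netfs_read_single(). The myfs_read_blob() helper and the use of GFP_KERNEL below are assumptions:

	static ssize_t myfs_read_blob(struct inode *inode, struct file *file,
				      struct folio_queue **_fq)
	{
		struct iov_iter iter;
		size_t cur_size = 0;
		int ret;

		/* Build (or extend) a folio_queue buffer covering the whole object. */
		ret = netfs_alloc_folioq_buffer(inode->i_mapping, _fq, &cur_size,
						i_size_read(inode), GFP_KERNEL);
		if (ret < 0)
			return ret;

		/* Read the whole object from the cache or the server in one go. */
		iov_iter_folio_queue(&iter, ITER_DEST, *_fq, 0, 0, cur_size);
		return netfs_read_single(inode, file, &iter);
	}

The buffer would later be released with netfs_free_folioq_buffer(), typically when the object is discarded or the inode is evicted.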
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-20-dhowells@redhat.com cc: Jeff Layton cc: Marc Dionne cc: netfs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/Makefile | 1 + fs/netfs/buffered_read.c | 11 ++- fs/netfs/internal.h | 2 + fs/netfs/main.c | 2 + fs/netfs/objects.c | 2 + fs/netfs/read_collect.c | 45 +++++++++- fs/netfs/read_single.c | 202 ++++++++++++++++++++++++++++++++++++++++++ fs/netfs/stats.c | 4 +- fs/netfs/write_collect.c | 6 +- fs/netfs/write_issue.c | 203 +++++++++++++++++++++++++++++++++++++++++-- include/linux/netfs.h | 10 +++ include/trace/events/netfs.h | 4 + 12 files changed, 478 insertions(+), 14 deletions(-) create mode 100644 fs/netfs/read_single.c (limited to 'fs') diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index cbb30bdeacc4..b43188d64bd8 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -13,6 +13,7 @@ netfs-y := \ read_collect.o \ read_pgpriv2.o \ read_retry.o \ + read_single.o \ rolling_buffer.o \ write_collect.o \ write_issue.o \ diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 4ff4b587dc4b..c420e9dee0e4 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -137,14 +137,17 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_request *rr loff_t i_size) { struct netfs_cache_resources *cres = &rreq->cache_resources; + enum netfs_io_source source; if (!cres->ops) return NETFS_DOWNLOAD_FROM_SERVER; - return cres->ops->prepare_read(subreq, i_size); + source = cres->ops->prepare_read(subreq, i_size); + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + return source; + } -static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, - bool was_async) +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async) { struct netfs_io_subrequest *subreq = priv; @@ -213,6 +216,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) unsigned long long zp = umin(ictx->zero_point, rreq->i_size); size_t len = subreq->len; + if (unlikely(rreq->origin == NETFS_READ_SINGLE)) + zp = rreq->i_size; if (subreq->start >= zp) { subreq->source = source = NETFS_FILL_WITH_ZEROES; goto fill_with_zeroes; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index ba32ca61063c..e236f752af88 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -23,6 +23,7 @@ /* * buffered_read.c */ +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async); int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len); @@ -110,6 +111,7 @@ void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq); extern atomic_t netfs_n_rh_dio_read; extern atomic_t netfs_n_rh_readahead; extern atomic_t netfs_n_rh_read_folio; +extern atomic_t netfs_n_rh_read_single; extern atomic_t netfs_n_rh_rreq; extern atomic_t netfs_n_rh_sreq; extern atomic_t netfs_n_rh_download; diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 6c7be1377ee0..8c1922c0cb42 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -37,9 +37,11 @@ static const char *netfs_origins[nr__netfs_io_origin] = { [NETFS_READAHEAD] = "RA", [NETFS_READPAGE] = "RP", [NETFS_READ_GAPS] = "RG", + [NETFS_READ_SINGLE] = "R1", [NETFS_READ_FOR_WRITE] = "RW", [NETFS_DIO_READ] = "DR", [NETFS_WRITEBACK] = "WB", + [NETFS_WRITEBACK_SINGLE] = "W1", [NETFS_WRITETHROUGH] = "WT", [NETFS_UNBUFFERED_WRITE] = "UW", [NETFS_DIO_WRITE] = "DW", 
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 8c98b70eb3a4..dde4a679d9e2 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -54,6 +54,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, if (origin == NETFS_READAHEAD || origin == NETFS_READPAGE || origin == NETFS_READ_GAPS || + origin == NETFS_READ_SINGLE || origin == NETFS_READ_FOR_WRITE || origin == NETFS_DIO_READ) INIT_WORK(&rreq->work, NULL); @@ -196,6 +197,7 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq case NETFS_READAHEAD: case NETFS_READPAGE: case NETFS_READ_GAPS: + case NETFS_READ_SINGLE: case NETFS_READ_FOR_WRITE: case NETFS_DIO_READ: INIT_WORK(&subreq->work, netfs_read_subreq_termination_worker); diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 26e430baeb5a..2e9291ab1d62 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -361,6 +361,39 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) inode_dio_end(rreq->inode); } +/* + * Do processing after reading a monolithic single object. + */ +static void netfs_rreq_assess_single(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream = &rreq->io_streams[0]; + + subreq = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + if (subreq) { + if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) + rreq->error = subreq->error; + else + rreq->transferred = subreq->transferred; + + if (!rreq->error && subreq->source == NETFS_DOWNLOAD_FROM_SERVER && + fscache_resources_valid(&rreq->cache_resources)) { + trace_netfs_rreq(rreq, netfs_rreq_trace_dirty); + netfs_single_mark_inode_dirty(rreq->inode); + } + } + + if (rreq->iocb) { + rreq->iocb->ki_pos += rreq->transferred; + if (rreq->iocb->ki_complete) + rreq->iocb->ki_complete( + rreq->iocb, rreq->error ? rreq->error : rreq->transferred); + } + if (rreq->netfs_ops->done) + rreq->netfs_ops->done(rreq); +} + /* * Assess the state of a read request and decide what to do next. * @@ -378,9 +411,17 @@ void netfs_rreq_terminated(struct netfs_io_request *rreq) return; } - if (rreq->origin == NETFS_DIO_READ || - rreq->origin == NETFS_READ_GAPS) + switch (rreq->origin) { + case NETFS_DIO_READ: + case NETFS_READ_GAPS: netfs_rreq_assess_dio(rreq); + break; + case NETFS_READ_SINGLE: + netfs_rreq_assess_single(rreq); + break; + default: + break; + } task_io_account_read(rreq->transferred); trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c new file mode 100644 index 000000000000..2a66c5fde071 --- /dev/null +++ b/fs/netfs/read_single.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Single, monolithic object support (e.g. AFS directory). + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/** + * netfs_single_mark_inode_dirty - Mark a single, monolithic object inode dirty + * @inode: The inode to mark + * + * Mark an inode that contains a single, monolithic object as dirty so that its + * writepages op will get called. If set, the SINGLE_NO_UPLOAD flag indicates + * that the object will only be written to the cache and not uploaded (e.g. AFS + * directory contents). 
+ */ +void netfs_single_mark_inode_dirty(struct inode *inode) +{ + struct netfs_inode *ictx = netfs_inode(inode); + bool cache_only = test_bit(NETFS_ICTX_SINGLE_NO_UPLOAD, &ictx->flags); + bool caching = fscache_cookie_enabled(netfs_i_cookie(netfs_inode(inode))); + + if (cache_only && !caching) + return; + + mark_inode_dirty(inode); + + if (caching && !(inode->i_state & I_PINNING_NETFS_WB)) { + bool need_use = false; + + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_PINNING_NETFS_WB)) { + inode->i_state |= I_PINNING_NETFS_WB; + need_use = true; + } + spin_unlock(&inode->i_lock); + + if (need_use) + fscache_use_cookie(netfs_i_cookie(ictx), true); + } + +} +EXPORT_SYMBOL(netfs_single_mark_inode_dirty); + +static int netfs_single_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx) +{ + return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx)); +} + +static void netfs_single_cache_prepare_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + + if (!cres->ops) { + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + return; + } + subreq->source = cres->ops->prepare_read(subreq, rreq->i_size); + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + +} + +static void netfs_single_read_cache(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + + netfs_stat(&netfs_n_rh_read); + cres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_FAIL, + netfs_cache_read_terminated, subreq); +} + +/* + * Perform a read to a buffer from the cache or the server. Only a single + * subreq is permitted as the object must be fetched in a single transaction. + */ +static int netfs_single_dispatch_read(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + int ret = 0; + + atomic_set(&rreq->nr_outstanding, 1); + + subreq = netfs_alloc_subrequest(rreq); + if (!subreq) { + ret = -ENOMEM; + goto out; + } + + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + subreq->start = 0; + subreq->len = rreq->len; + subreq->io_iter = rreq->buffer.iter; + + atomic_inc(&rreq->nr_outstanding); + + spin_lock_bh(&rreq->lock); + list_add_tail(&subreq->rreq_link, &rreq->subrequests); + trace_netfs_sreq(subreq, netfs_sreq_trace_added); + spin_unlock_bh(&rreq->lock); + + netfs_single_cache_prepare_read(rreq, subreq); + switch (subreq->source) { + case NETFS_DOWNLOAD_FROM_SERVER: + netfs_stat(&netfs_n_rh_download); + if (rreq->netfs_ops->prepare_read) { + ret = rreq->netfs_ops->prepare_read(subreq); + if (ret < 0) + goto cancel; + } + + rreq->netfs_ops->issue_read(subreq); + rreq->submitted += subreq->len; + break; + case NETFS_READ_FROM_CACHE: + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + netfs_single_read_cache(rreq, subreq); + rreq->submitted += subreq->len; + ret = 0; + break; + default: + pr_warn("Unexpected single-read source %u\n", subreq->source); + WARN_ON_ONCE(true); + ret = -EIO; + break; + } + +out: + if (atomic_dec_and_test(&rreq->nr_outstanding)) + netfs_rreq_terminated(rreq); + return ret; +cancel: + atomic_dec(&rreq->nr_outstanding); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + goto out; +} + +/** + * netfs_read_single - Synchronously read a single blob of pages. + * @inode: The inode to read from. + * @file: The file we're using to read or NULL. + * @iter: The buffer we're reading into. 
+ * + * Fulfil a read request for a single monolithic object by drawing data from + * the cache if possible, or the netfs if not. The buffer may be larger than + * the file content; unused beyond the EOF will be zero-filled. The content + * will be read with a single I/O request (though this may be retried). + * + * The calling netfs must initialise a netfs context contiguous to the vfs + * inode before calling this. + * + * This is usable whether or not caching is enabled. If caching is enabled, + * the data will be stored as a single object into the cache. + */ +ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_iter *iter) +{ + struct netfs_io_request *rreq; + struct netfs_inode *ictx = netfs_inode(inode); + ssize_t ret; + + rreq = netfs_alloc_request(inode->i_mapping, file, 0, iov_iter_count(iter), + NETFS_READ_SINGLE); + if (IS_ERR(rreq)) + return PTR_ERR(rreq); + + ret = netfs_single_begin_cache_read(rreq, ictx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto cleanup_free; + + netfs_stat(&netfs_n_rh_read_single); + trace_netfs_read(rreq, 0, rreq->len, netfs_read_trace_read_single); + + rreq->buffer.iter = *iter; + netfs_single_dispatch_read(rreq); + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); + wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + + ret = rreq->error; + if (ret == 0) + ret = rreq->transferred; + netfs_put_request(rreq, true, netfs_rreq_trace_put_return); + return ret; + +cleanup_free: + netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); + return ret; +} +EXPORT_SYMBOL(netfs_read_single); diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index 8e63516b40f6..f1af344266cc 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -12,6 +12,7 @@ atomic_t netfs_n_rh_dio_read; atomic_t netfs_n_rh_readahead; atomic_t netfs_n_rh_read_folio; +atomic_t netfs_n_rh_read_single; atomic_t netfs_n_rh_rreq; atomic_t netfs_n_rh_sreq; atomic_t netfs_n_rh_download; @@ -46,10 +47,11 @@ atomic_t netfs_n_folioq; int netfs_stats_show(struct seq_file *m, void *v) { - seq_printf(m, "Reads : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n", + seq_printf(m, "Reads : DR=%u RA=%u RF=%u RS=%u WB=%u WBZ=%u\n", atomic_read(&netfs_n_rh_dio_read), atomic_read(&netfs_n_rh_readahead), atomic_read(&netfs_n_rh_read_folio), + atomic_read(&netfs_n_rh_read_single), atomic_read(&netfs_n_rh_write_begin), atomic_read(&netfs_n_rh_write_zskip)); seq_printf(m, "Writes : BW=%u WT=%u DW=%u WP=%u 2C=%u\n", diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index f026cbc0e2fe..d54526d2e751 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -17,7 +17,7 @@ #define HIT_PENDING 0x01 /* A front op was still pending */ #define NEED_REASSESS 0x02 /* Need to loop round and reassess */ #define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */ -#define BUFFERED 0x08 /* The pagecache needs cleaning up */ +#define NEED_UNLOCK 0x08 /* The pagecache needs unlocking */ #define NEED_RETRY 0x10 /* A front op requests retrying */ #define SAW_FAILURE 0x20 /* One stream or hit a permanent failure */ @@ -179,7 +179,7 @@ reassess_streams: if (wreq->origin == NETFS_WRITEBACK || wreq->origin == NETFS_WRITETHROUGH || wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) - notes = BUFFERED; + notes = NEED_UNLOCK; else notes = 0; @@ -276,7 +276,7 @@ reassess_streams: trace_netfs_collect_state(wreq, wreq->collected_to, notes); /* Unlock any folios that we have now finished with. 
*/ - if (notes & BUFFERED) { + if (notes & NEED_UNLOCK) { if (wreq->cleaned_to < wreq->collected_to) netfs_writeback_unlock_folios(wreq, ¬es); } else { diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 286bc2aa3ca0..6f14a7c2f040 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -94,9 +94,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, { struct netfs_io_request *wreq; struct netfs_inode *ictx; - bool is_buffered = (origin == NETFS_WRITEBACK || - origin == NETFS_WRITETHROUGH || - origin == NETFS_PGPRIV2_COPY_TO_CACHE); + bool is_cacheable = (origin == NETFS_WRITEBACK || + origin == NETFS_WRITEBACK_SINGLE || + origin == NETFS_WRITETHROUGH || + origin == NETFS_PGPRIV2_COPY_TO_CACHE); wreq = netfs_alloc_request(mapping, file, start, 0, origin); if (IS_ERR(wreq)) @@ -105,7 +106,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, _enter("R=%x", wreq->debug_id); ictx = netfs_inode(wreq->inode); - if (is_buffered && netfs_is_cache_enabled(ictx)) + if (is_cacheable && netfs_is_cache_enabled(ictx)) fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); if (rolling_buffer_init(&wreq->buffer, wreq->debug_id, ITER_SOURCE) < 0) goto nomem; @@ -452,7 +453,8 @@ static int netfs_write_folio(struct netfs_io_request *wreq, stream = &wreq->io_streams[s]; stream->submit_off = foff; stream->submit_len = flen; - if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) || + if (!stream->avail || + (stream->source == NETFS_WRITE_TO_CACHE && streamw) || (stream->source == NETFS_UPLOAD_TO_SERVER && fgroup == NETFS_FOLIO_COPY_TO_CACHE)) { stream->submit_off = UINT_MAX; @@ -731,3 +733,194 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t _leave(" = %d", error); return error; } + +/* + * Write some of a pending folio data back to the server and/or the cache. + */ +static int netfs_write_folio_single(struct netfs_io_request *wreq, + struct folio *folio) +{ + struct netfs_io_stream *upload = &wreq->io_streams[0]; + struct netfs_io_stream *cache = &wreq->io_streams[1]; + struct netfs_io_stream *stream; + size_t iter_off = 0; + size_t fsize = folio_size(folio), flen; + loff_t fpos = folio_pos(folio); + bool to_eof = false; + bool no_debug = false; + + _enter(""); + + flen = folio_size(folio); + if (flen > wreq->i_size - fpos) { + flen = wreq->i_size - fpos; + folio_zero_segment(folio, flen, fsize); + to_eof = true; + } else if (flen == wreq->i_size - fpos) { + to_eof = true; + } + + _debug("folio %zx/%zx", flen, fsize); + + if (!upload->avail && !cache->avail) { + trace_netfs_folio(folio, netfs_folio_trace_cancel_store); + return 0; + } + + if (!upload->construct) + trace_netfs_folio(folio, netfs_folio_trace_store); + else + trace_netfs_folio(folio, netfs_folio_trace_store_plus); + + /* Attach the folio to the rolling buffer. */ + folio_get(folio); + rolling_buffer_append(&wreq->buffer, folio, NETFS_ROLLBUF_PUT_MARK); + + /* Move the submission point forward to allow for write-streaming data + * not starting at the front of the page. We don't do write-streaming + * with the cache as the cache requires DIO alignment. + * + * Also skip uploading for data that's been read and just needs copying + * to the cache. 
+ */ + for (int s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + stream->submit_off = 0; + stream->submit_len = flen; + if (!stream->avail) { + stream->submit_off = UINT_MAX; + stream->submit_len = 0; + } + } + + /* Attach the folio to one or more subrequests. For a big folio, we + * could end up with thousands of subrequests if the wsize is small - + * but we might need to wait during the creation of subrequests for + * network resources (eg. SMB credits). + */ + for (;;) { + ssize_t part; + size_t lowest_off = ULONG_MAX; + int choose_s = -1; + + /* Always add to the lowest-submitted stream first. */ + for (int s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->submit_len > 0 && + stream->submit_off < lowest_off) { + lowest_off = stream->submit_off; + choose_s = s; + } + } + + if (choose_s < 0) + break; + stream = &wreq->io_streams[choose_s]; + + /* Advance the iterator(s). */ + if (stream->submit_off > iter_off) { + rolling_buffer_advance(&wreq->buffer, stream->submit_off - iter_off); + iter_off = stream->submit_off; + } + + atomic64_set(&wreq->issued_to, fpos + stream->submit_off); + stream->submit_extendable_to = fsize - stream->submit_off; + part = netfs_advance_write(wreq, stream, fpos + stream->submit_off, + stream->submit_len, to_eof); + stream->submit_off += part; + if (part > stream->submit_len) + stream->submit_len = 0; + else + stream->submit_len -= part; + if (part > 0) + no_debug = true; + } + + wreq->buffer.iter.iov_offset = 0; + if (fsize > iter_off) + rolling_buffer_advance(&wreq->buffer, fsize - iter_off); + atomic64_set(&wreq->issued_to, fpos + fsize); + + if (!no_debug) + kdebug("R=%x: No submit", wreq->debug_id); + _leave(" = 0"); + return 0; +} + +/** + * netfs_writeback_single - Write back a monolithic payload + * @mapping: The mapping to write from + * @wbc: Hints from the VM + * @iter: Data to write, must be ITER_FOLIOQ. + * + * Write a monolithic, non-pagecache object back to the server and/or + * the cache. + */ +int netfs_writeback_single(struct address_space *mapping, + struct writeback_control *wbc, + struct iov_iter *iter) +{ + struct netfs_io_request *wreq; + struct netfs_inode *ictx = netfs_inode(mapping->host); + struct folio_queue *fq; + size_t size = iov_iter_count(iter); + int ret; + + if (WARN_ON_ONCE(!iov_iter_is_folioq(iter))) + return -EIO; + + if (!mutex_trylock(&ictx->wb_lock)) { + if (wbc->sync_mode == WB_SYNC_NONE) { + netfs_stat(&netfs_n_wb_lock_skip); + return 0; + } + netfs_stat(&netfs_n_wb_lock_wait); + mutex_lock(&ictx->wb_lock); + } + + wreq = netfs_create_write_req(mapping, NULL, 0, NETFS_WRITEBACK_SINGLE); + if (IS_ERR(wreq)) { + ret = PTR_ERR(wreq); + goto couldnt_start; + } + + trace_netfs_write(wreq, netfs_write_trace_writeback); + netfs_stat(&netfs_n_wh_writepages); + + if (__test_and_set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) + wreq->netfs_ops->begin_writeback(wreq); + + for (fq = (struct folio_queue *)iter->folioq; fq; fq = fq->next) { + for (int slot = 0; slot < folioq_count(fq); slot++) { + struct folio *folio = folioq_folio(fq, slot); + size_t part = umin(folioq_folio_size(fq, slot), size); + + _debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to)); + + ret = netfs_write_folio_single(wreq, folio); + if (ret < 0) + goto stop; + size -= part; + if (size <= 0) + goto stop; + } + } + +stop: + for (int s = 0; s < NR_IO_STREAMS; s++) + netfs_issue_write(wreq, &wreq->io_streams[s]); + smp_wmb(); /* Write lists before ALL_QUEUED. 
*/ + set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); + + mutex_unlock(&ictx->wb_lock); + + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + _leave(" = %d", ret); + return ret; + +couldnt_start: + mutex_unlock(&ictx->wb_lock); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(netfs_writeback_single); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index dd737344cff3..27e62f7d2940 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -73,6 +73,7 @@ struct netfs_inode { #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ #define NETFS_ICTX_MODIFIED_ATTR 3 /* Indicate change in mtime/ctime */ +#define NETFS_ICTX_SINGLE_NO_UPLOAD 4 /* Monolithic payload, cache but no upload */ }; /* @@ -210,9 +211,11 @@ enum netfs_io_origin { NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */ + NETFS_READ_SINGLE, /* This read should be treated as a single object */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_WRITEBACK, /* This write was triggered by writepages */ + NETFS_WRITEBACK_SINGLE, /* This monolithic write was triggered by writepages */ NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_WRITE, /* This is a direct I/O write */ @@ -408,6 +411,13 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * struct netfs_group *netfs_group); ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from); +/* Single, monolithic object read/write API. 
*/ +void netfs_single_mark_inode_dirty(struct inode *inode); +ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_iter *iter); +int netfs_writeback_single(struct address_space *mapping, + struct writeback_control *wbc, + struct iov_iter *iter); + /* Address operations API */ struct readahead_control; void netfs_readahead(struct readahead_control *); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index fc237ff23a33..6df2e7313371 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -21,6 +21,7 @@ EM(netfs_read_trace_readahead, "READAHEAD") \ EM(netfs_read_trace_readpage, "READPAGE ") \ EM(netfs_read_trace_read_gaps, "READ-GAPS") \ + EM(netfs_read_trace_read_single, "READ-SNGL") \ EM(netfs_read_trace_prefetch_for_write, "PREFETCHW") \ E_(netfs_read_trace_write_begin, "WRITEBEGN") @@ -35,9 +36,11 @@ EM(NETFS_READAHEAD, "RA") \ EM(NETFS_READPAGE, "RP") \ EM(NETFS_READ_GAPS, "RG") \ + EM(NETFS_READ_SINGLE, "R1") \ EM(NETFS_READ_FOR_WRITE, "RW") \ EM(NETFS_DIO_READ, "DR") \ EM(NETFS_WRITEBACK, "WB") \ + EM(NETFS_WRITEBACK_SINGLE, "W1") \ EM(NETFS_WRITETHROUGH, "WT") \ EM(NETFS_UNBUFFERED_WRITE, "UW") \ EM(NETFS_DIO_WRITE, "DW") \ @@ -47,6 +50,7 @@ EM(netfs_rreq_trace_assess, "ASSESS ") \ EM(netfs_rreq_trace_copy, "COPY ") \ EM(netfs_rreq_trace_collect, "COLLECT") \ + EM(netfs_rreq_trace_dirty, "DIRTY ") \ EM(netfs_rreq_trace_done, "DONE ") \ EM(netfs_rreq_trace_free, "FREE ") \ EM(netfs_rreq_trace_redirty, "REDIRTY") \ -- cgit v1.2.3 From b2604315e87a3fa3c35561e1c37836f915c4e3d8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:10 +0000 Subject: afs: Make afs_init_request() get a key if not given a file In a future patch, AFS directory caching will go through netfslib and this will involve, at times, running on behalf of ->lookup(), which doesn't provide us with a file from which we can get an authentication key. If a file isn't provided, make afs_init_request() get a key from the process's keyrings instead when setting up a read. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-21-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/file.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/afs/file.c b/fs/afs/file.c index f717168da4ab..a9d98d18407c 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -372,10 +372,26 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio) static int afs_init_request(struct netfs_io_request *rreq, struct file *file) { + struct afs_vnode *vnode = AFS_FS_I(rreq->inode); + if (file) rreq->netfs_priv = key_get(afs_file_key(file)); rreq->rsize = 256 * 1024; rreq->wsize = 256 * 1024 * 1024; + + switch (rreq->origin) { + case NETFS_READ_SINGLE: + if (!file) { + struct key *key = afs_request_key(vnode->volume->cell); + + if (IS_ERR(key)) + return PTR_ERR(key); + rreq->netfs_priv = key; + } + break; + default: + break; + } return 0; } -- cgit v1.2.3 From 6dd80936618c4ff852d4db73aca400351d9bd9f0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:11 +0000 Subject: afs: Use netfslib for directories In the AFS ecosystem, directories are just a special type of file that is downloaded and parsed locally. Download is done by the same mechanism as ordinary files and the data can be cached. 
There is one important semantic restriction on directories over files: the client must download the entire directory in one go because, for example, the server could fabricate the contents of the blob on the fly with each download and give a different image each time.

So that we can cache the directory download, switch AFS directory support over to using the netfslib single-object API, thereby allowing directory content to be stored in the local cache.

To make this work, the following changes are made:

 (1) A directory's contents are now stored in a folio_queue chain attached to the afs_vnode (inode) struct rather than its associated pagecache, though multipage folios are still used to hold the data. The folio queue is discarded when the directory inode is evicted.

     This also helps with the phasing out of ITER_XARRAY.

 (2) Various directory operations are made to use and unuse the cache cookie.

 (3) The content checking, content dumping and content iteration are now performed with a standard iov_iter iterator over the contents of the folio queue.

 (4) Iteration and modification must be done with the vnode's validate_lock held. In conjunction with (1), this means that the iteration can be done without the need to lock pages or take extra refs on them, unlike when accessing ->i_pages.

 (5) Convert to using netfs_read_single() to read data.

 (6) Provide a ->writepages() to call netfs_writeback_single() to save the data to the cache according to the VM's scheduling whilst holding the validate_lock read-locked as (4).

 (7) Change local directory image editing functions:

     (a) Provide a function to get a specific block by number from the folio_queue as we can no longer use the i_pages xarray to locate folios by index. This uses a cursor to remember the current position as we need to iterate through the directory contents. The block is kmapped before being returned.

     (b) Make the function in (a) extend the directory by an extra folio if we run out of space.

     (c) Raise the check of the block free space counter, for those blocks that have one, higher in the function to eliminate a call to get a block.

     (d) Remove the page unlocking and putting done during the editing loops. This is no longer necessary as the folio_queue holds the references and the pages are no longer in the pagecache.

     (e) Mark the inode dirty and pin the cache usage till writeback at the end of a successful edit.

 (8) Don't set the large_folios flag on the inode as we do the allocation ourselves rather than the VM doing it automatically.

 (9) Mark the inode as being a single object that isn't uploaded to the server.

(10) Enable caching on directories.

(11) Only set the upload key for writeback for regular files.

Notes:

 (*) We keep the ->release_folio(), ->invalidate_folio() and ->migrate_folio() ops as we set the mapping pointer on the folio.
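For illustration, not part of this patch: with the contents held in a folio_queue, the iteration described in (3) wraps the queue in an iov_iter and hands a step function to iterate_folioq(), as afs_dir_dump() does below. The myfs_* names are hypothetical, and the sketch assumes the caller holds the validate_lock and that, as for AFS directories, each segment handed to the step function is a whole number of AFS_DIR_BLOCK_SIZE blocks:

	static size_t myfs_count_blocks_step(void *iter_base, size_t progress,
					     size_t len, void *priv, void *priv2)
	{
		size_t *nr_blocks = priv;

		*nr_blocks += len / AFS_DIR_BLOCK_SIZE;
		return 0;	/* Residual 0: the whole segment was consumed. */
	}

	static size_t myfs_count_blocks(struct afs_vnode *dvnode)
	{
		struct iov_iter iter;
		size_t nr_blocks = 0;
		loff_t i_size = i_size_read(&dvnode->netfs.inode);

		/* Wrap the directory's folio_queue in an iov_iter and walk it. */
		iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory,
				     0, 0, i_size);
		iterate_folioq(&iter, iov_iter_count(&iter), &nr_blocks, NULL,
			       myfs_count_blocks_step);
		return nr_blocks;
	}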
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-22-dhowells@redhat.com cc: Marc Dionne cc: Jeff Layton cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/afs/dir.c | 747 +++++++++++++++++++++++---------------------- fs/afs/dir_edit.c | 183 +++++------ fs/afs/file.c | 8 + fs/afs/inode.c | 21 +- fs/afs/internal.h | 16 + fs/afs/super.c | 2 + fs/afs/write.c | 4 +- include/trace/events/afs.h | 6 +- 8 files changed, 517 insertions(+), 470 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index f36a28a8f27b..3fa095429794 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "internal.h" #include "afs_fs.h" @@ -42,15 +43,6 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); -static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags); -static void afs_dir_invalidate_folio(struct folio *folio, size_t offset, - size_t length); - -static bool afs_dir_dirty_folio(struct address_space *mapping, - struct folio *folio) -{ - BUG(); /* This should never happen. */ -} const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, @@ -75,10 +67,7 @@ const struct inode_operations afs_dir_inode_operations = { }; const struct address_space_operations afs_dir_aops = { - .dirty_folio = afs_dir_dirty_folio, - .release_folio = afs_dir_release_folio, - .invalidate_folio = afs_dir_invalidate_folio, - .migrate_folio = filemap_migrate_folio, + .writepages = afs_single_writepages, }; const struct dentry_operations afs_fs_dentry_operations = { @@ -105,146 +94,120 @@ struct afs_lookup_cookie { struct afs_fid fids[50]; }; +static void afs_dir_unuse_cookie(struct afs_vnode *dvnode, int ret) +{ + if (ret == 0) { + struct afs_vnode_cache_aux aux; + loff_t i_size = i_size_read(&dvnode->netfs.inode); + + afs_set_cache_aux(dvnode, &aux); + fscache_unuse_cookie(afs_vnode_cache(dvnode), &aux, &i_size); + } else { + fscache_unuse_cookie(afs_vnode_cache(dvnode), NULL, NULL); + } +} + /* - * Drop the refs that we're holding on the folios we were reading into. We've - * got refs on the first nr_pages pages. + * Iterate through a kmapped directory segment, dumping a summary of + * the contents. */ -static void afs_dir_read_cleanup(struct afs_read *req) +static size_t afs_dir_dump_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) { - struct address_space *mapping = req->vnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t last = req->nr_pages - 1; + do { + union afs_xdr_dir_block *block = iter_base; - XA_STATE(xas, &mapping->i_pages, 0); + pr_warn("[%05zx] %32phN\n", progress, block); + iter_base += AFS_DIR_BLOCK_SIZE; + progress += AFS_DIR_BLOCK_SIZE; + len -= AFS_DIR_BLOCK_SIZE; + } while (len > 0); - if (unlikely(!req->nr_pages)) - return; + return len; +} - rcu_read_lock(); - xas_for_each(&xas, folio, last) { - if (xas_retry(&xas, folio)) - continue; - BUG_ON(xa_is_value(folio)); - ASSERTCMP(folio->mapping, ==, mapping); +/* + * Dump the contents of a directory. 
+ */ +static void afs_dir_dump(struct afs_vnode *dvnode) +{ + struct iov_iter iter; + unsigned long long i_size = i_size_read(&dvnode->netfs.inode); - folio_put(folio); - } + pr_warn("DIR %llx:%llx is=%llx\n", + dvnode->fid.vid, dvnode->fid.vnode, i_size); - rcu_read_unlock(); + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size); + iterate_folioq(&iter, iov_iter_count(&iter), NULL, NULL, + afs_dir_dump_step); } /* * check that a directory folio is valid */ -static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio, - loff_t i_size) +static bool afs_dir_check_block(struct afs_vnode *dvnode, size_t progress, + union afs_xdr_dir_block *block) { - union afs_xdr_dir_block *block; - size_t offset, size; - loff_t pos; + if (block->hdr.magic != AFS_DIR_MAGIC) { + pr_warn("%s(%lx): [%zx] bad magic %04x\n", + __func__, dvnode->netfs.inode.i_ino, + progress, ntohs(block->hdr.magic)); + trace_afs_dir_check_failed(dvnode, progress); + trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic); + return false; + } - /* Determine how many magic numbers there should be in this folio, but - * we must take care because the directory may change size under us. + /* Make sure each block is NUL terminated so we can reasonably + * use string functions on it. The filenames in the folio + * *should* be NUL-terminated anyway. */ - pos = folio_pos(folio); - if (i_size <= pos) - goto checked; - - size = min_t(loff_t, folio_size(folio), i_size - pos); - for (offset = 0; offset < size; offset += sizeof(*block)) { - block = kmap_local_folio(folio, offset); - if (block->hdr.magic != AFS_DIR_MAGIC) { - printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n", - __func__, dvnode->netfs.inode.i_ino, - pos, offset, size, ntohs(block->hdr.magic)); - trace_afs_dir_check_failed(dvnode, pos + offset, i_size); - kunmap_local(block); - trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic); - goto error; - } - - /* Make sure each block is NUL terminated so we can reasonably - * use string functions on it. The filenames in the folio - * *should* be NUL-terminated anyway. - */ - ((u8 *)block)[AFS_DIR_BLOCK_SIZE - 1] = 0; - - kunmap_local(block); - } -checked: + ((u8 *)block)[AFS_DIR_BLOCK_SIZE - 1] = 0; afs_stat_v(dvnode, n_read_dir); return true; - -error: - return false; } /* - * Dump the contents of a directory. + * Iterate through a kmapped directory segment, checking the content. 
*/ -static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) +static size_t afs_dir_check_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) { - union afs_xdr_dir_block *block; - struct address_space *mapping = dvnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t last = req->nr_pages - 1; - size_t offset, size; - - XA_STATE(xas, &mapping->i_pages, 0); - - pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx\n", - dvnode->fid.vid, dvnode->fid.vnode, - req->file_size, req->len, req->actual_len); - pr_warn("DIR %llx %x %zx %zx\n", - req->pos, req->nr_pages, - req->iter->iov_offset, iov_iter_count(req->iter)); - - xas_for_each(&xas, folio, last) { - if (xas_retry(&xas, folio)) - continue; + struct afs_vnode *dvnode = priv; - BUG_ON(folio->mapping != mapping); + if (WARN_ON_ONCE(progress % AFS_DIR_BLOCK_SIZE || + len % AFS_DIR_BLOCK_SIZE)) + return len; - size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio)); - for (offset = 0; offset < size; offset += sizeof(*block)) { - block = kmap_local_folio(folio, offset); - pr_warn("[%02lx] %32phN\n", folio->index + offset, block); - kunmap_local(block); - } - } + do { + if (!afs_dir_check_block(dvnode, progress, iter_base)) + break; + iter_base += AFS_DIR_BLOCK_SIZE; + len -= AFS_DIR_BLOCK_SIZE; + } while (len > 0); + + return len; } /* - * Check all the blocks in a directory. All the folios are held pinned. + * Check all the blocks in a directory. */ -static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) +static int afs_dir_check(struct afs_vnode *dvnode) { - struct address_space *mapping = dvnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t last = req->nr_pages - 1; - int ret = 0; + struct iov_iter iter; + unsigned long long i_size = i_size_read(&dvnode->netfs.inode); + size_t checked = 0; - XA_STATE(xas, &mapping->i_pages, 0); - - if (unlikely(!req->nr_pages)) + if (unlikely(!i_size)) return 0; - rcu_read_lock(); - xas_for_each(&xas, folio, last) { - if (xas_retry(&xas, folio)) - continue; - - BUG_ON(folio->mapping != mapping); - - if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) { - afs_dir_dump(dvnode, req); - ret = -EIO; - break; - } + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size); + checked = iterate_folioq(&iter, iov_iter_count(&iter), dvnode, NULL, + afs_dir_check_step); + if (checked != i_size) { + afs_dir_dump(dvnode); + return -EIO; } - - rcu_read_unlock(); - return ret; + return 0; } /* @@ -264,133 +227,136 @@ static int afs_dir_open(struct inode *inode, struct file *file) } /* - * Read the directory into the pagecache in one go, scrubbing the previous - * contents. The list of folios is returned, pinning them so that they don't - * get reclaimed during the iteration. + * Read a file in a single download. 
*/ -static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) - __acquires(&dvnode->validate_lock) +static ssize_t afs_do_read_single(struct afs_vnode *dvnode, struct file *file) { - struct address_space *mapping = dvnode->netfs.inode.i_mapping; - struct afs_read *req; + struct iov_iter iter; + ssize_t ret; loff_t i_size; - int nr_pages, i; - int ret; - loff_t remote_size = 0; - - _enter(""); - - req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) - return ERR_PTR(-ENOMEM); + bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) && + !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags)); - refcount_set(&req->usage, 1); - req->vnode = dvnode; - req->key = key_get(key); - req->cleanup = afs_dir_read_cleanup; - -expand: i_size = i_size_read(&dvnode->netfs.inode); - if (i_size < remote_size) - i_size = remote_size; - if (i_size < 2048) { - ret = afs_bad(dvnode, afs_file_error_dir_small); - goto error; - } - if (i_size > 2048 * 1024) { - trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); - ret = -EFBIG; - goto error; + if (is_dir) { + if (i_size < AFS_DIR_BLOCK_SIZE) + return afs_bad(dvnode, afs_file_error_dir_small); + if (i_size > AFS_DIR_BLOCK_SIZE * 1024) { + trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); + return -EFBIG; + } + } else { + if (i_size > AFSPATHMAX) { + trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); + return -EFBIG; + } } - _enter("%llu", i_size); + /* Expand the storage. TODO: Shrink the storage too. */ + if (dvnode->directory_size < i_size) { + size_t cur_size = dvnode->directory_size; - nr_pages = (i_size + PAGE_SIZE - 1) / PAGE_SIZE; + ret = netfs_alloc_folioq_buffer(NULL, + &dvnode->directory, &cur_size, i_size, + mapping_gfp_mask(dvnode->netfs.inode.i_mapping)); + dvnode->directory_size = cur_size; + if (ret < 0) + return ret; + } - req->actual_len = i_size; /* May change */ - req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ - req->data_version = dvnode->status.data_version; /* May change */ - iov_iter_xarray(&req->def_iter, ITER_DEST, &dvnode->netfs.inode.i_mapping->i_pages, - 0, i_size); - req->iter = &req->def_iter; + iov_iter_folio_queue(&iter, ITER_DEST, dvnode->directory, 0, 0, dvnode->directory_size); - /* Fill in any gaps that we might find where the memory reclaimer has - * been at work and pin all the folios. If there are any gaps, we will - * need to reread the entire directory contents. + /* AFS requires us to perform the read of a directory synchronously as + * a single unit to avoid issues with the directory contents being + * changed between reads. */ - i = req->nr_pages; - while (i < nr_pages) { - struct folio *folio; - - folio = filemap_get_folio(mapping, i); - if (IS_ERR(folio)) { - afs_invalidate_dir(dvnode, afs_dir_invalid_reclaimed_folio); - folio = __filemap_get_folio(mapping, - i, FGP_LOCK | FGP_CREAT, - mapping->gfp_mask); - if (IS_ERR(folio)) { - ret = PTR_ERR(folio); - goto error; - } - folio_attach_private(folio, (void *)1); - folio_unlock(folio); + ret = netfs_read_single(&dvnode->netfs.inode, file, &iter); + if (ret >= 0) { + i_size = i_size_read(&dvnode->netfs.inode); + if (i_size > ret) { + /* The content has grown, so we need to expand the + * buffer. + */ + ret = -ESTALE; + } else if (is_dir) { + int ret2 = afs_dir_check(dvnode); + + if (ret2 < 0) + ret = ret2; + } else if (i_size < folioq_folio_size(dvnode->directory, 0)) { + /* NUL-terminate a symlink. 
*/ + char *symlink = kmap_local_folio(folioq_folio(dvnode->directory, 0), 0); + + symlink[i_size] = 0; + kunmap_local(symlink); } - - req->nr_pages += folio_nr_pages(folio); - i += folio_nr_pages(folio); } - /* If we're going to reload, we need to lock all the pages to prevent - * races. - */ + return ret; +} + +ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file) +{ + ssize_t ret; + + fscache_use_cookie(afs_vnode_cache(dvnode), false); + ret = afs_do_read_single(dvnode, file); + fscache_unuse_cookie(afs_vnode_cache(dvnode), NULL, NULL); + return ret; +} + +/* + * Read the directory into a folio_queue buffer in one go, scrubbing the + * previous contents. We return -ESTALE if the caller needs to call us again. + */ +static ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) + __acquires(&dvnode->validate_lock) +{ + ssize_t ret; + loff_t i_size; + + i_size = i_size_read(&dvnode->netfs.inode); + ret = -ERESTARTSYS; if (down_read_killable(&dvnode->validate_lock) < 0) goto error; - if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - goto success; + /* We only need to reread the data if it became invalid - or if we + * haven't read it yet. + */ + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + test_bit(AFS_VNODE_DIR_READ, &dvnode->flags)) + goto valid; up_read(&dvnode->validate_lock); if (down_write_killable(&dvnode->validate_lock) < 0) goto error; - if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { - trace_afs_reload_dir(dvnode); - ret = afs_fetch_data(dvnode, req); - if (ret < 0) - goto error_unlock; - - task_io_account_read(PAGE_SIZE * req->nr_pages); + if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_invalidate_cache(dvnode, 0); - if (req->len < req->file_size) { - /* The content has grown, so we need to expand the - * buffer. - */ - up_write(&dvnode->validate_lock); - remote_size = req->file_size; - goto expand; - } - - /* Validate the data we just read. */ - ret = afs_dir_check(dvnode, req); + if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) || + !test_bit(AFS_VNODE_DIR_READ, &dvnode->flags)) { + trace_afs_reload_dir(dvnode); + ret = afs_read_single(dvnode, file); if (ret < 0) goto error_unlock; // TODO: Trim excess pages set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); + set_bit(AFS_VNODE_DIR_READ, &dvnode->flags); } downgrade_write(&dvnode->validate_lock); -success: - return req; +valid: + return i_size; error_unlock: up_write(&dvnode->validate_lock); error: - afs_put_read(req); - _leave(" = %d", ret); - return ERR_PTR(ret); + _leave(" = %zd", ret); + return ret; } /* @@ -398,79 +364,69 @@ error: */ static int afs_dir_iterate_block(struct afs_vnode *dvnode, struct dir_context *ctx, - union afs_xdr_dir_block *block, - unsigned blkoff) + union afs_xdr_dir_block *block) { union afs_xdr_dirent *dire; - unsigned offset, next, curr, nr_slots; + unsigned int blknum, base, hdr, pos, next, nr_slots; size_t nlen; int tmp; - _enter("%llx,%x", ctx->pos, blkoff); + blknum = ctx->pos / AFS_DIR_BLOCK_SIZE; + base = blknum * AFS_DIR_SLOTS_PER_BLOCK; + hdr = (blknum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + pos = DIV_ROUND_UP(ctx->pos, AFS_DIR_DIRENT_SIZE) - base; - curr = (ctx->pos - blkoff) / sizeof(union afs_xdr_dirent); + _enter("%llx,%x", ctx->pos, blknum); /* walk through the block, an entry at a time */ - for (offset = (blkoff == 0 ? 
AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); - offset < AFS_DIR_SLOTS_PER_BLOCK; - offset = next - ) { + for (unsigned int slot = hdr; slot < AFS_DIR_SLOTS_PER_BLOCK; slot = next) { /* skip entries marked unused in the bitmap */ - if (!(block->hdr.bitmap[offset / 8] & - (1 << (offset % 8)))) { - _debug("ENT[%zu.%u]: unused", - blkoff / sizeof(union afs_xdr_dir_block), offset); - next = offset + 1; - if (offset >= curr) - ctx->pos = blkoff + - next * sizeof(union afs_xdr_dirent); + if (!(block->hdr.bitmap[slot / 8] & + (1 << (slot % 8)))) { + _debug("ENT[%x]: Unused", base + slot); + next = slot + 1; + if (next >= pos) + ctx->pos = (base + next) * sizeof(union afs_xdr_dirent); continue; } /* got a valid entry */ - dire = &block->dirents[offset]; + dire = &block->dirents[slot]; nlen = strnlen(dire->u.name, - sizeof(*block) - - offset * sizeof(union afs_xdr_dirent)); + (unsigned long)(block + 1) - (unsigned long)dire->u.name - 1); if (nlen > AFSNAMEMAX - 1) { - _debug("ENT[%zu]: name too long (len %u/%zu)", - blkoff / sizeof(union afs_xdr_dir_block), - offset, nlen); + _debug("ENT[%x]: Name too long (len %zx)", + base + slot, nlen); return afs_bad(dvnode, afs_file_error_dir_name_too_long); } - _debug("ENT[%zu.%u]: %s %zu \"%s\"", - blkoff / sizeof(union afs_xdr_dir_block), offset, - (offset < curr ? "skip" : "fill"), + _debug("ENT[%x]: %s %zx \"%s\"", + base + slot, (slot < pos ? "skip" : "fill"), nlen, dire->u.name); nr_slots = afs_dir_calc_slots(nlen); - next = offset + nr_slots; + next = slot + nr_slots; if (next > AFS_DIR_SLOTS_PER_BLOCK) { - _debug("ENT[%zu.%u]:" - " %u extends beyond end dir block" - " (len %zu)", - blkoff / sizeof(union afs_xdr_dir_block), - offset, next, nlen); + _debug("ENT[%x]: extends beyond end dir block (len %zx)", + base + slot, nlen); return afs_bad(dvnode, afs_file_error_dir_over_end); } /* Check that the name-extension dirents are all allocated */ for (tmp = 1; tmp < nr_slots; tmp++) { - unsigned int ix = offset + tmp; - if (!(block->hdr.bitmap[ix / 8] & (1 << (ix % 8)))) { - _debug("ENT[%zu.u]:" - " %u unmarked extension (%u/%u)", - blkoff / sizeof(union afs_xdr_dir_block), - offset, tmp, nr_slots); + unsigned int xslot = slot + tmp; + + if (!(block->hdr.bitmap[xslot / 8] & (1 << (xslot % 8)))) { + _debug("ENT[%x]: Unmarked extension (%x/%x)", + base + slot, tmp, nr_slots); return afs_bad(dvnode, afs_file_error_dir_unmarked_ext); } } /* skip if starts before the current position */ - if (offset < curr) { - if (next > curr) - ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); + if (slot < pos) { + if (next > pos) + ctx->pos = (base + next) * sizeof(union afs_xdr_dirent); continue; } @@ -484,75 +440,110 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, return 0; } - ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); + ctx->pos = (base + next) * sizeof(union afs_xdr_dirent); } _leave(" = 1 [more]"); return 1; } +struct afs_dir_iteration_ctx { + struct dir_context *dir_ctx; + int error; +}; + /* - * iterate through the data blob that lists the contents of an AFS directory + * Iterate through a kmapped directory segment. 
*/ -static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, - struct key *key, afs_dataversion_t *_dir_version) +static size_t afs_dir_iterate_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) { - struct afs_vnode *dvnode = AFS_FS_I(dir); - union afs_xdr_dir_block *dblock; - struct afs_read *req; - struct folio *folio; - unsigned offset, size; + struct afs_dir_iteration_ctx *ctx = priv2; + struct afs_vnode *dvnode = priv; int ret; - _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos); - - if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { - _leave(" = -ESTALE"); - return -ESTALE; + if (WARN_ON_ONCE(progress % AFS_DIR_BLOCK_SIZE || + len % AFS_DIR_BLOCK_SIZE)) { + pr_err("Mis-iteration prog=%zx len=%zx\n", + progress % AFS_DIR_BLOCK_SIZE, + len % AFS_DIR_BLOCK_SIZE); + return len; } - req = afs_read_dir(dvnode, key); - if (IS_ERR(req)) - return PTR_ERR(req); - *_dir_version = req->data_version; + do { + ret = afs_dir_iterate_block(dvnode, ctx->dir_ctx, iter_base); + if (ret != 1) + break; - /* round the file position up to the next entry boundary */ - ctx->pos += sizeof(union afs_xdr_dirent) - 1; - ctx->pos &= ~(sizeof(union afs_xdr_dirent) - 1); + ctx->dir_ctx->pos = round_up(ctx->dir_ctx->pos, AFS_DIR_BLOCK_SIZE); + iter_base += AFS_DIR_BLOCK_SIZE; + len -= AFS_DIR_BLOCK_SIZE; + } while (len > 0); - /* walk through the blocks in sequence */ - ret = 0; - while (ctx->pos < req->actual_len) { - /* Fetch the appropriate folio from the directory and re-add it - * to the LRU. We have all the pages pinned with an extra ref. - */ - folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE, - FGP_ACCESSED, 0); - if (IS_ERR(folio)) { - ret = afs_bad(dvnode, afs_file_error_dir_missing_page); - break; - } + return len; +} - offset = round_down(ctx->pos, sizeof(*dblock)) - folio_pos(folio); - size = min_t(loff_t, folio_size(folio), - req->actual_len - folio_pos(folio)); +/* + * Iterate through the directory folios. 
+ */ +static int afs_dir_iterate_contents(struct inode *dir, struct dir_context *dir_ctx) +{ + struct afs_dir_iteration_ctx ctx = { .dir_ctx = dir_ctx }; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct iov_iter iter; + unsigned long long i_size = i_size_read(dir); - do { - dblock = kmap_local_folio(folio, offset); - ret = afs_dir_iterate_block(dvnode, ctx, dblock, - folio_pos(folio) + offset); - kunmap_local(dblock); - if (ret != 1) - goto out; + /* Round the file position up to the next entry boundary */ + dir_ctx->pos = round_up(dir_ctx->pos, sizeof(union afs_xdr_dirent)); - } while (offset += sizeof(*dblock), offset < size); + if (i_size <= 0 || dir_ctx->pos >= i_size) + return 0; - ret = 0; - } + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size); + iov_iter_advance(&iter, round_down(dir_ctx->pos, AFS_DIR_BLOCK_SIZE)); + + iterate_folioq(&iter, iov_iter_count(&iter), dvnode, &ctx, + afs_dir_iterate_step); + + if (ctx.error == -ESTALE) + afs_invalidate_dir(dvnode, afs_dir_invalid_iter_stale); + return ctx.error; +} + +/* + * iterate through the data blob that lists the contents of an AFS directory + */ +static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, + struct file *file, afs_dataversion_t *_dir_version) +{ + struct afs_vnode *dvnode = AFS_FS_I(dir); + int retry_limit = 100; + int ret; + + _enter("{%lu},%llx,,", dir->i_ino, ctx->pos); + + do { + if (--retry_limit < 0) { + pr_warn("afs_read_dir(): Too many retries\n"); + ret = -ESTALE; + break; + } + ret = afs_read_dir(dvnode, file); + if (ret < 0) { + if (ret != -ESTALE) + break; + if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { + ret = -ESTALE; + break; + } + continue; + } + *_dir_version = inode_peek_iversion_raw(dir); + + ret = afs_dir_iterate_contents(dir, ctx); + up_read(&dvnode->validate_lock); + } while (ret == -ESTALE); -out: - up_read(&dvnode->validate_lock); - afs_put_read(req); _leave(" = %d", ret); return ret; } @@ -564,8 +555,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx) { afs_dataversion_t dir_version; - return afs_dir_iterate(file_inode(file), ctx, afs_file_key(file), - &dir_version); + return afs_dir_iterate(file_inode(file), ctx, file, &dir_version); } /* @@ -606,7 +596,7 @@ static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, * - just returns the FID the dentry name maps to if found */ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, - struct afs_fid *fid, struct key *key, + struct afs_fid *fid, afs_dataversion_t *_dir_version) { struct afs_super_info *as = dir->i_sb->s_fs_info; @@ -620,7 +610,7 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); /* search the directory */ - ret = afs_dir_iterate(dir, &cookie.ctx, key, _dir_version); + ret = afs_dir_iterate(dir, &cookie.ctx, NULL, _dir_version); if (ret < 0) { _leave(" = %d [iter]", ret); return ret; @@ -787,8 +777,7 @@ static bool afs_server_supports_ibulk(struct afs_vnode *dvnode) * files in one go and create inodes for them. The inode of the file we were * asked for is returned. 
*/ -static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, - struct key *key) +static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) { struct afs_lookup_cookie *cookie; struct afs_vnode_param *vp; @@ -816,7 +805,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, cookie->one_only = true; /* search the directory */ - ret = afs_dir_iterate(dir, &cookie->ctx, key, &data_version); + ret = afs_dir_iterate(dir, &cookie->ctx, NULL, &data_version); if (ret < 0) goto out; @@ -925,8 +914,7 @@ out: /* * Look up an entry in a directory with @sys substitution. */ -static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry, - struct key *key) +static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry) { struct afs_sysnames *subs; struct afs_net *net = afs_i2net(dir); @@ -974,7 +962,6 @@ out_s: afs_put_sysnames(subs); kfree(buf); out_p: - key_put(key); return ret; } @@ -988,7 +975,6 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, struct afs_fid fid = {}; struct inode *inode; struct dentry *d; - struct key *key; int ret; _enter("{%llx:%llu},%p{%pd},", @@ -1006,15 +992,9 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ESTALE); } - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - return ERR_CAST(key); - } - - ret = afs_validate(dvnode, key); + ret = afs_validate(dvnode, NULL); if (ret < 0) { - key_put(key); + afs_dir_unuse_cookie(dvnode, ret); _leave(" = %d [val]", ret); return ERR_PTR(ret); } @@ -1024,11 +1004,10 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, dentry->d_name.name[dentry->d_name.len - 3] == 's' && dentry->d_name.name[dentry->d_name.len - 2] == 'y' && dentry->d_name.name[dentry->d_name.len - 1] == 's') - return afs_lookup_atsys(dir, dentry, key); + return afs_lookup_atsys(dir, dentry); afs_stat_v(dvnode, n_lookup); - inode = afs_do_lookup(dir, dentry, key); - key_put(key); + inode = afs_do_lookup(dir, dentry); if (inode == ERR_PTR(-ENOENT)) inode = afs_try_auto_mntpt(dentry, dir); @@ -1154,7 +1133,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version); + ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, &dir_version); switch (ret) { case 0: /* the filename maps to something */ @@ -1316,18 +1295,21 @@ static void afs_create_success(struct afs_operation *op) static void afs_create_edit_dir(struct afs_operation *op) { + struct netfs_cache_resources cres = {}; struct afs_vnode_param *dvp = &op->file[0]; struct afs_vnode_param *vp = &op->file[1]; struct afs_vnode *dvnode = dvp->vnode; _enter("op=%08x", op->debug_id); + fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode)); down_write(&dvnode->validate_lock); if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) afs_edit_dir_add(dvnode, &op->dentry->d_name, &vp->fid, op->create.reason); up_write(&dvnode->validate_lock); + fscache_end_operation(&cres); } static void afs_create_put(struct afs_operation *op) @@ -1355,6 +1337,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, { struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); + int ret; _enter("{%llx:%llu},{%pd},%ho", dvnode->fid.vid, 
dvnode->fid.vnode, dentry, mode); @@ -1365,6 +1348,8 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, return PTR_ERR(op); } + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; op->file[0].modification = true; @@ -1374,7 +1359,9 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, op->create.reason = afs_edit_dir_for_mkdir; op->mtime = current_time(dir); op->ops = &afs_mkdir_operation; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; } /* @@ -1402,18 +1389,21 @@ static void afs_rmdir_success(struct afs_operation *op) static void afs_rmdir_edit_dir(struct afs_operation *op) { + struct netfs_cache_resources cres = {}; struct afs_vnode_param *dvp = &op->file[0]; struct afs_vnode *dvnode = dvp->vnode; _enter("op=%08x", op->debug_id); afs_dir_remove_subdir(op->dentry); + fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode)); down_write(&dvnode->validate_lock); if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) afs_edit_dir_remove(dvnode, &op->dentry->d_name, afs_edit_dir_for_rmdir); up_write(&dvnode->validate_lock); + fscache_end_operation(&cres); } static void afs_rmdir_put(struct afs_operation *op) @@ -1448,6 +1438,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) if (IS_ERR(op)) return PTR_ERR(op); + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; op->file[0].modification = true; @@ -1476,10 +1468,13 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) /* Not all systems that can host afs servers have ENOTEMPTY. */ if (ret == -EEXIST) ret = -ENOTEMPTY; +out: + afs_dir_unuse_cookie(dvnode, ret); return ret; error: - return afs_put_operation(op); + ret = afs_put_operation(op); + goto out; } /* @@ -1542,16 +1537,19 @@ static void afs_unlink_success(struct afs_operation *op) static void afs_unlink_edit_dir(struct afs_operation *op) { + struct netfs_cache_resources cres = {}; struct afs_vnode_param *dvp = &op->file[0]; struct afs_vnode *dvnode = dvp->vnode; _enter("op=%08x", op->debug_id); + fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode)); down_write(&dvnode->validate_lock); if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) afs_edit_dir_remove(dvnode, &op->dentry->d_name, afs_edit_dir_for_unlink); up_write(&dvnode->validate_lock); + fscache_end_operation(&cres); } static void afs_unlink_put(struct afs_operation *op) @@ -1590,6 +1588,8 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) if (IS_ERR(op)) return PTR_ERR(op); + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; op->file[0].modification = true; @@ -1636,10 +1636,10 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) afs_wait_for_operation(op); } - return afs_put_operation(op); - error: - return afs_put_operation(op); + ret = afs_put_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; } static const struct afs_operation_ops afs_create_operation = { @@ -1673,6 +1673,8 @@ static int afs_create(struct mnt_idmap *idmap, struct inode *dir, goto error; } + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; op->file[0].modification = true; @@ -1683,7 +1685,9 @@ static int 
afs_create(struct mnt_idmap *idmap, struct inode *dir, op->create.reason = afs_edit_dir_for_create; op->mtime = current_time(dir); op->ops = &afs_create_operation; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; error: d_drop(dentry); @@ -1748,6 +1752,8 @@ static int afs_link(struct dentry *from, struct inode *dir, goto error; } + fscache_use_cookie(afs_vnode_cache(dvnode), true); + ret = afs_validate(vnode, op->key); if (ret < 0) goto error_op; @@ -1763,10 +1769,13 @@ static int afs_link(struct dentry *from, struct inode *dir, op->dentry_2 = from; op->ops = &afs_link_operation; op->create.reason = afs_edit_dir_for_link; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; error_op: afs_put_operation(op); + afs_dir_unuse_cookie(dvnode, ret); error: d_drop(dentry); _leave(" = %d", ret); @@ -1810,6 +1819,8 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto error; } + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; @@ -1818,7 +1829,9 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, op->create.reason = afs_edit_dir_for_symlink; op->create.symlink = content; op->mtime = current_time(dir); - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; error: d_drop(dentry); @@ -1860,6 +1873,7 @@ static void afs_rename_success(struct afs_operation *op) static void afs_rename_edit_dir(struct afs_operation *op) { + struct netfs_cache_resources orig_cres = {}, new_cres = {}; struct afs_vnode_param *orig_dvp = &op->file[0]; struct afs_vnode_param *new_dvp = &op->file[1]; struct afs_vnode *orig_dvnode = orig_dvp->vnode; @@ -1876,6 +1890,10 @@ static void afs_rename_edit_dir(struct afs_operation *op) op->rename.rehash = NULL; } + fscache_begin_write_operation(&orig_cres, afs_vnode_cache(orig_dvnode)); + if (new_dvnode != orig_dvnode) + fscache_begin_write_operation(&new_cres, afs_vnode_cache(new_dvnode)); + down_write(&orig_dvnode->validate_lock); if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) @@ -1925,6 +1943,9 @@ static void afs_rename_edit_dir(struct afs_operation *op) d_move(old_dentry, new_dentry); up_write(&new_dvnode->validate_lock); + fscache_end_operation(&orig_cres); + if (new_dvnode != orig_dvnode) + fscache_end_operation(&new_cres); } static void afs_rename_put(struct afs_operation *op) @@ -1977,6 +1998,10 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (IS_ERR(op)) return PTR_ERR(op); + fscache_use_cookie(afs_vnode_cache(orig_dvnode), true); + if (new_dvnode != orig_dvnode) + fscache_use_cookie(afs_vnode_cache(new_dvnode), true); + ret = afs_validate(vnode, op->key); afs_op_set_error(op, ret); if (ret < 0) @@ -2044,45 +2069,43 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, */ d_drop(old_dentry); - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); +out: + afs_dir_unuse_cookie(orig_dvnode, ret); + if (new_dvnode != orig_dvnode) + afs_dir_unuse_cookie(new_dvnode, ret); + return ret; error: - return afs_put_operation(op); -} - -/* - * Release a directory folio and clean up its private state if it's not busy - * - return true if the folio can now be released, false if not - */ -static bool afs_dir_release_folio(struct folio 
*folio, gfp_t gfp_flags) -{ - struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - - _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index); - - folio_detach_private(folio); - - /* The directory will need reloading. */ - afs_invalidate_dir(dvnode, afs_dir_invalid_release_folio); - return true; + ret = afs_put_operation(op); + goto out; } /* - * Invalidate part or all of a folio. + * Write the file contents to the cache as a single blob. */ -static void afs_dir_invalidate_folio(struct folio *folio, size_t offset, - size_t length) +int afs_single_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - - _enter("{%lu},%zu,%zu", folio->index, offset, length); - - BUG_ON(!folio_test_locked(folio)); + struct afs_vnode *dvnode = AFS_FS_I(mapping->host); + struct iov_iter iter; + bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) && + !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags)); + int ret = 0; - /* The directory will need reloading. */ - afs_invalidate_dir(dvnode, afs_dir_invalid_inval_folio); + /* Need to lock to prevent the folio queue and folios from being thrown + * away. + */ + down_read(&dvnode->validate_lock); + + if (is_dir ? + test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) : + atomic64_read(&dvnode->cb_expires_at) != AFS_NO_CB_PROMISE) { + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, + i_size_read(&dvnode->netfs.inode)); + ret = netfs_writeback_single(mapping, wbc, &iter); + } - /* we clean up only if the entire folio is being invalidated */ - if (offset == 0 && length == folio_size(folio)) - folio_detach_private(folio); + up_read(&dvnode->validate_lock); + return ret; } diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 5d092c8c0157..71cce884e434 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "internal.h" #include "xdr_fs.h" @@ -105,23 +106,57 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block, } /* - * Get a new directory folio. + * Get a specific block, extending the directory storage to cover it as needed. */ -static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) +static union afs_xdr_dir_block *afs_dir_get_block(struct afs_dir_iter *iter, size_t block) { - struct address_space *mapping = vnode->netfs.inode.i_mapping; + struct folio_queue *fq; + struct afs_vnode *dvnode = iter->dvnode; struct folio *folio; + size_t blpos = block * AFS_DIR_BLOCK_SIZE; + size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos; + int ret; + + if (dvnode->directory_size < blend) { + size_t cur_size = dvnode->directory_size; + + ret = netfs_alloc_folioq_buffer( + NULL, &dvnode->directory, &cur_size, blend, + mapping_gfp_mask(dvnode->netfs.inode.i_mapping)); + dvnode->directory_size = cur_size; + if (ret < 0) + goto fail; + } - folio = __filemap_get_folio(mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, - mapping->gfp_mask); - if (IS_ERR(folio)) { - afs_invalidate_dir(vnode, afs_dir_invalid_edit_get_block); - return NULL; + fq = iter->fq; + if (!fq) + fq = dvnode->directory; + + /* Search the folio queue for the folio containing the block... */ + for (; fq; fq = fq->next) { + for (int s = iter->fq_slot; s < folioq_count(fq); s++) { + size_t fsize = folioq_folio_size(fq, s); + + if (blend <= fpos + fsize) { + /* ... and then return the mapped block. 
*/ + folio = folioq_folio(fq, s); + if (WARN_ON_ONCE(folio_pos(folio) != fpos)) + goto fail; + iter->fq = fq; + iter->fq_slot = s; + iter->fpos = fpos; + return kmap_local_folio(folio, blpos - fpos); + } + fpos += fsize; + } + iter->fq_slot = 0; } - if (!folio_test_private(folio)) - folio_attach_private(folio, (void *)1); - return folio; + +fail: + iter->fq = NULL; + iter->fq_slot = 0; + afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block); + return NULL; } /* @@ -209,9 +244,8 @@ void afs_edit_dir_add(struct afs_vnode *vnode, { union afs_xdr_dir_block *meta, *block; union afs_xdr_dirent *de; - struct folio *folio0, *folio; + struct afs_dir_iter iter = { .dvnode = vnode }; unsigned int need_slots, nr_blocks, b; - pgoff_t index; loff_t i_size; int slot; @@ -224,16 +258,13 @@ void afs_edit_dir_add(struct afs_vnode *vnode, return; } - folio0 = afs_dir_get_folio(vnode, 0); - if (!folio0) { - _leave(" [fgp]"); + meta = afs_dir_get_block(&iter, 0); + if (!meta) return; - } /* Work out how many slots we're going to need. */ need_slots = afs_dir_calc_slots(name->len); - meta = kmap_local_folio(folio0, 0); if (i_size == 0) goto new_directory; nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; @@ -245,18 +276,17 @@ void afs_edit_dir_add(struct afs_vnode *vnode, /* If the directory extended into a new folio, then we need to * tack a new folio on the end. */ - index = b / AFS_DIR_BLOCKS_PER_PAGE; if (nr_blocks >= AFS_DIR_MAX_BLOCKS) goto error_too_many_blocks; - if (index >= folio_nr_pages(folio0)) { - folio = afs_dir_get_folio(vnode, index); - if (!folio) - goto error; - } else { - folio = folio0; - } - block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio)); + /* Lower dir blocks have a counter in the header we can check. */ + if (b < AFS_DIR_BLOCKS_WITH_CTR && + meta->meta.alloc_ctrs[b] < need_slots) + continue; + + block = afs_dir_get_block(&iter, b); + if (!block) + goto error; /* Abandon the edit if we got a callback break. */ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) @@ -275,24 +305,16 @@ void afs_edit_dir_add(struct afs_vnode *vnode, afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE); } - /* Only lower dir blocks have a counter in the header. */ - if (b >= AFS_DIR_BLOCKS_WITH_CTR || - meta->meta.alloc_ctrs[b] >= need_slots) { - /* We need to try and find one or more consecutive - * slots to hold the entry. - */ - slot = afs_find_contig_bits(block, need_slots); - if (slot >= 0) { - _debug("slot %u", slot); - goto found_space; - } + /* We need to try and find one or more consecutive slots to + * hold the entry. + */ + slot = afs_find_contig_bits(block, need_slots); + if (slot >= 0) { + _debug("slot %u", slot); + goto found_space; } kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } } /* There are no spare slots of sufficient size, yet the operation @@ -307,8 +329,7 @@ new_directory: i_size = AFS_DIR_BLOCK_SIZE; afs_set_i_size(vnode, i_size); slot = AFS_DIR_RESV_BLOCKS0; - folio = folio0; - block = kmap_local_folio(folio, 0); + block = afs_dir_get_block(&iter, 0); nr_blocks = 1; b = 0; @@ -328,10 +349,6 @@ found_space: /* Adjust the bitmap. */ afs_set_contig_bits(block, slot, need_slots); kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } /* Adjust the allocation counter. 
*/ if (b < AFS_DIR_BLOCKS_WITH_CTR) @@ -341,20 +358,16 @@ found_space: afs_stat_v(vnode, n_dir_cr); _debug("Insert %s in %u[%u]", name->name, b, slot); + netfs_single_mark_inode_dirty(&vnode->netfs.inode); + out_unmap: kunmap_local(meta); - folio_unlock(folio0); - folio_put(folio0); _leave(""); return; already_invalidated: trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name); kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } goto out_unmap; error_too_many_blocks: @@ -376,9 +389,8 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, { union afs_xdr_dir_block *meta, *block; union afs_xdr_dirent *de; - struct folio *folio0, *folio; + struct afs_dir_iter iter = { .dvnode = vnode }; unsigned int need_slots, nr_blocks, b; - pgoff_t index; loff_t i_size; int slot; @@ -393,31 +405,20 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, } nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; - folio0 = afs_dir_get_folio(vnode, 0); - if (!folio0) { - _leave(" [fgp]"); + meta = afs_dir_get_block(&iter, 0); + if (!meta) return; - } /* Work out how many slots we're going to discard. */ need_slots = afs_dir_calc_slots(name->len); - meta = kmap_local_folio(folio0, 0); - /* Find a block that has sufficient slots available. Each folio * contains two or more directory blocks. */ for (b = 0; b < nr_blocks; b++) { - index = b / AFS_DIR_BLOCKS_PER_PAGE; - if (index >= folio_nr_pages(folio0)) { - folio = afs_dir_get_folio(vnode, index); - if (!folio) - goto error; - } else { - folio = folio0; - } - - block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio)); + block = afs_dir_get_block(&iter, b); + if (!block) + goto error; /* Abandon the edit if we got a callback break. */ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) @@ -431,10 +432,6 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, } kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } } /* Didn't find the dirent to clobber. Download the directory again. */ @@ -455,34 +452,26 @@ found_dirent: /* Adjust the bitmap. */ afs_clear_contig_bits(block, slot, need_slots); kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } /* Adjust the allocation counter. */ if (b < AFS_DIR_BLOCKS_WITH_CTR) meta->meta.alloc_ctrs[b] += need_slots; + netfs_single_mark_inode_dirty(&vnode->netfs.inode); + inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); afs_stat_v(vnode, n_dir_rm); _debug("Remove %s from %u[%u]", name->name, b, slot); out_unmap: kunmap_local(meta); - folio_unlock(folio0); - folio_put(folio0); _leave(""); return; already_invalidated: + kunmap_local(block); trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval, 0, 0, 0, 0, name->name); - kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } goto out_unmap; error: @@ -500,9 +489,8 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d { union afs_xdr_dir_block *block; union afs_xdr_dirent *de; - struct folio *folio; + struct afs_dir_iter iter = { .dvnode = vnode }; unsigned int nr_blocks, b; - pgoff_t index; loff_t i_size; int slot; @@ -513,19 +501,17 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_bad_size); return; } + nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; /* Find a block that has sufficient slots available. Each folio * contains two or more directory blocks. 
*/ for (b = 0; b < nr_blocks; b++) { - index = b / AFS_DIR_BLOCKS_PER_PAGE; - folio = afs_dir_get_folio(vnode, index); - if (!folio) + block = afs_dir_get_block(&iter, b); + if (!block) goto error; - block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio)); - /* Abandon the edit if we got a callback break. */ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) goto already_invalidated; @@ -535,8 +521,6 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d goto found_dirent; kunmap_local(block); - folio_unlock(folio); - folio_put(folio); } /* Didn't find the dirent to clobber. Download the directory again. */ @@ -554,8 +538,7 @@ found_dirent: ntohl(de->u.vnode), ntohl(de->u.unique), ".."); kunmap_local(block); - folio_unlock(folio); - folio_put(folio); + netfs_single_mark_inode_dirty(&vnode->netfs.inode); inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); out: @@ -564,8 +547,6 @@ out: already_invalidated: kunmap_local(block); - folio_unlock(folio); - folio_put(folio); trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval, 0, 0, 0, 0, ".."); goto out; diff --git a/fs/afs/file.c b/fs/afs/file.c index a9d98d18407c..5bc36bfaa173 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -389,6 +389,14 @@ static int afs_init_request(struct netfs_io_request *rreq, struct file *file) rreq->netfs_priv = key; } break; + case NETFS_WRITEBACK: + case NETFS_WRITETHROUGH: + case NETFS_UNBUFFERED_WRITE: + case NETFS_DIO_WRITE: + if (S_ISREG(rreq->inode->i_mode)) + rreq->io_streams[0].avail = true; + break; + case NETFS_WRITEBACK_SINGLE: default: break; } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 495ecef91679..0ed1e5c35fef 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -110,7 +110,9 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_op = &afs_dir_inode_operations; inode->i_fop = &afs_dir_file_operations; inode->i_mapping->a_ops = &afs_dir_aops; - mapping_set_large_folios(inode->i_mapping); + __set_bit(NETFS_ICTX_SINGLE_NO_UPLOAD, &vnode->netfs.flags); + /* Assume locally cached directory data will be valid. */ + __set_bit(AFS_VNODE_DIR_VALID, &vnode->flags); break; case AFS_FTYPE_SYMLINK: /* Symlinks with a mode of 0644 are actually mountpoints. 
*/ @@ -440,7 +442,8 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) } __packed key; struct afs_vnode_cache_aux aux; - if (vnode->status.type != AFS_FTYPE_FILE) { + if (vnode->status.type != AFS_FTYPE_FILE && + vnode->status.type != AFS_FTYPE_DIR) { vnode->netfs.cache = NULL; return; } @@ -642,6 +645,7 @@ int afs_drop_inode(struct inode *inode) void afs_evict_inode(struct inode *inode) { struct afs_vnode_cache_aux aux; + struct afs_super_info *sbi = AFS_FS_S(inode->i_sb); struct afs_vnode *vnode = AFS_FS_I(inode); _enter("{%llx:%llu.%d}", @@ -653,8 +657,21 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); + if ((S_ISDIR(inode->i_mode)) && + (inode->i_state & I_DIRTY) && + !sbi->dyn_root) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .for_sync = true, + .range_end = LLONG_MAX, + }; + + afs_single_writepages(inode->i_mapping, &wbc); + } + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); + netfs_free_folioq_buffer(vnode->directory); afs_set_cache_aux(vnode, &aux); netfs_clear_inode_writeback(inode, &aux); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 20d2f723948d..1744a93aae27 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -720,7 +720,9 @@ struct afs_vnode { #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ #define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */ +#define AFS_VNODE_DIR_READ 11 /* Set if we've read a dir's contents */ + struct folio_queue *directory; /* Directory contents */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ struct list_head granted_locks; /* locks granted on this file */ @@ -729,6 +731,7 @@ struct afs_vnode { ktime_t locked_at; /* Time at which lock obtained */ enum afs_lock_state lock_state : 8; afs_lock_type_t lock_type : 8; + unsigned int directory_size; /* Amount of space in ->directory */ /* outstanding callback notification on this file */ struct work_struct cb_work; /* Work for mmap'd files */ @@ -984,6 +987,16 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl i_size_read(&vnode->netfs.inode), flags); } +/* + * Directory iteration management. 
+ */ +struct afs_dir_iter { + struct afs_vnode *dvnode; + struct folio_queue *fq; + unsigned int fpos; + int fq_slot; +}; + #include /*****************************************************************************/ @@ -1065,8 +1078,11 @@ extern const struct inode_operations afs_dir_inode_operations; extern const struct address_space_operations afs_dir_aops; extern const struct dentry_operations afs_fs_dentry_operations; +ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file); extern void afs_d_release(struct dentry *); extern void afs_check_for_remote_deletion(struct afs_operation *); +int afs_single_writepages(struct address_space *mapping, + struct writeback_control *wbc); /* * dir_edit.c diff --git a/fs/afs/super.c b/fs/afs/super.c index 7631302c1984..a9bee610674e 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -696,6 +696,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb) vnode->volume = NULL; vnode->lock_key = NULL; vnode->permit_cache = NULL; + vnode->directory = NULL; + vnode->directory_size = 0; vnode->flags = 1 << AFS_VNODE_UNSET; vnode->lock_state = AFS_VNODE_LOCK_NONE; diff --git a/fs/afs/write.c b/fs/afs/write.c index ccb6aa8027c5..62570357a0bd 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -182,8 +182,8 @@ void afs_issue_write(struct netfs_io_subrequest *subreq) */ void afs_begin_writeback(struct netfs_io_request *wreq) { - afs_get_writeback_key(wreq); - wreq->io_streams[0].avail = true; + if (S_ISREG(wreq->inode->i_mode)) + afs_get_writeback_key(wreq); } /* diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 7cb5583efb91..d05f2c09efe3 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -930,9 +930,9 @@ TRACE_EVENT(afs_sent_data, ); TRACE_EVENT(afs_dir_check_failed, - TP_PROTO(struct afs_vnode *vnode, loff_t off, loff_t i_size), + TP_PROTO(struct afs_vnode *vnode, loff_t off), - TP_ARGS(vnode, off, i_size), + TP_ARGS(vnode, off), TP_STRUCT__entry( __field(struct afs_vnode *, vnode) @@ -943,7 +943,7 @@ TRACE_EVENT(afs_dir_check_failed, TP_fast_assign( __entry->vnode = vnode; __entry->off = off; - __entry->i_size = i_size; + __entry->i_size = i_size_read(&vnode->netfs.inode); ), TP_printk("vn=%p %llx/%llx", -- cgit v1.2.3 From eae9e78951bb02a7b94a9adef6e981413d13c564 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:12 +0000 Subject: afs: Use netfslib for symlinks, allowing them to be cached Use netfslib to read symlinks, thereby allowing them to be cached by fscache and cachefiles. 
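For reference, a minimal sketch of how an in-kernel caller might consume the
delayed_call-based ->get_link() that AFS switches to here; example_read_target()
is a made-up name, and the pattern simply mirrors afs_readlink() in the diff
below:

static int example_read_target(struct dentry *dentry, char *buf, size_t buflen)
{
	DEFINE_DELAYED_CALL(done);
	const char *content;
	size_t len;

	if (!buflen)
		return -EINVAL;

	/* afs_get_link() returns the kmapped symlink body and arranges for
	 * afs_put_link() to unmap and drop the folio via the delayed call.
	 */
	content = afs_get_link(dentry, d_inode(dentry), &done);
	if (IS_ERR(content)) {
		do_delayed_call(&done);
		return PTR_ERR(content);
	}

	len = min(strlen(content), buflen - 1);
	memcpy(buf, content, len);
	buf[len] = 0;
	do_delayed_call(&done);
	return 0;
}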
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-23-dhowells@redhat.com cc: Marc Dionne cc: Jeff Layton cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/afs/file.c | 32 ----------------------- fs/afs/inode.c | 64 ++++++++++++++++++++++++++++++++++++++++++---- fs/afs/internal.h | 4 ++- fs/afs/mntpt.c | 22 ++++++++-------- include/trace/events/afs.h | 1 + 5 files changed, 74 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/afs/file.c b/fs/afs/file.c index 5bc36bfaa173..48695a50d2f9 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -20,7 +20,6 @@ #include "internal.h" static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); -static int afs_symlink_read_folio(struct file *file, struct folio *folio); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, @@ -61,13 +60,6 @@ const struct address_space_operations afs_file_aops = { .writepages = afs_writepages, }; -const struct address_space_operations afs_symlink_aops = { - .read_folio = afs_symlink_read_folio, - .release_folio = netfs_release_folio, - .invalidate_folio = netfs_invalidate_folio, - .migrate_folio = filemap_migrate_folio, -}; - static const struct vm_operations_struct afs_vm_ops = { .open = afs_vm_open, .close = afs_vm_close, @@ -346,30 +338,6 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) queue_work(system_long_wq, &subreq->work); } -static int afs_symlink_read_folio(struct file *file, struct folio *folio) -{ - struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host); - struct afs_read *fsreq; - int ret; - - fsreq = afs_alloc_read(GFP_NOFS); - if (!fsreq) - return -ENOMEM; - - fsreq->pos = folio_pos(folio); - fsreq->len = folio_size(folio); - fsreq->vnode = vnode; - fsreq->iter = &fsreq->def_iter; - iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &folio->mapping->i_pages, - fsreq->pos, fsreq->len); - - ret = afs_fetch_data(fsreq->vnode, fsreq); - if (ret == 0) - folio_mark_uptodate(folio); - folio_unlock(folio); - return ret; -} - static int afs_init_request(struct netfs_io_request *rreq, struct file *file) { struct afs_vnode *vnode = AFS_FS_I(rreq->inode); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0ed1e5c35fef..6934cc30a4ca 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -25,8 +25,60 @@ #include "internal.h" #include "afs_fs.h" +static void afs_put_link(void *arg) +{ + struct folio *folio = virt_to_folio(arg); + + kunmap_local(arg); + folio_put(folio); +} + +const char *afs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct folio *folio; + char *content; + ssize_t ret; + + if (atomic64_read(&vnode->cb_expires_at) == AFS_NO_CB_PROMISE || + !test_bit(AFS_VNODE_DIR_READ, &vnode->flags)) { + if (!dentry) + return ERR_PTR(-ECHILD); + ret = afs_read_single(vnode, NULL); + if (ret < 0) + return ERR_PTR(ret); + } + + folio = folioq_folio(vnode->directory, 0); + folio_get(folio); + content = kmap_local_folio(folio, 0); + set_delayed_call(callback, afs_put_link, content); + return content; +} + +int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + DEFINE_DELAYED_CALL(done); + const char *content; + int len; + + content = afs_get_link(dentry, d_inode(dentry), &done); + if (IS_ERR(content)) { + do_delayed_call(&done); + return PTR_ERR(content); + } + + len = 
umin(strlen(content), buflen); + if (copy_to_user(buffer, content, len)) + len = -EFAULT; + do_delayed_call(&done); + return len; +} + static const struct inode_operations afs_symlink_inode_operations = { - .get_link = page_get_link, + .get_link = afs_get_link, + .readlink = afs_readlink, }; static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode) @@ -124,13 +176,13 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_mode = S_IFDIR | 0555; inode->i_op = &afs_mntpt_inode_operations; inode->i_fop = &afs_mntpt_file_operations; - inode->i_mapping->a_ops = &afs_symlink_aops; } else { inode->i_mode = S_IFLNK | status->mode; inode->i_op = &afs_symlink_inode_operations; - inode->i_mapping->a_ops = &afs_symlink_aops; } + inode->i_mapping->a_ops = &afs_dir_aops; inode_nohighmem(inode); + mapping_set_release_always(inode->i_mapping); break; default: dump_vnode(vnode, op->file[0].vnode != vnode ? op->file[0].vnode : NULL); @@ -443,7 +495,8 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) struct afs_vnode_cache_aux aux; if (vnode->status.type != AFS_FTYPE_FILE && - vnode->status.type != AFS_FTYPE_DIR) { + vnode->status.type != AFS_FTYPE_DIR && + vnode->status.type != AFS_FTYPE_SYMLINK) { vnode->netfs.cache = NULL; return; } @@ -657,7 +710,8 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); - if ((S_ISDIR(inode->i_mode)) && + if ((S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) && (inode->i_state & I_DIRTY) && !sbi->dyn_root) { struct writeback_control wbc = { diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 1744a93aae27..7f170455cf25 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1116,7 +1116,6 @@ extern void afs_dynroot_depopulate(struct super_block *); * file.c */ extern const struct address_space_operations afs_file_aops; -extern const struct address_space_operations afs_symlink_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; extern const struct netfs_request_ops afs_req_ops; @@ -1222,6 +1221,9 @@ extern void afs_fs_probe_cleanup(struct afs_net *); */ extern const struct afs_operation_ops afs_fetch_status_operation; +const char *afs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback); +int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen); extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); extern int afs_ilookup5_test_by_fid(struct inode *, void *); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 297487ee8323..507c25a5b2cb 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -30,7 +30,7 @@ const struct file_operations afs_mntpt_file_operations = { const struct inode_operations afs_mntpt_inode_operations = { .lookup = afs_mntpt_lookup, - .readlink = page_readlink, + .readlink = afs_readlink, .getattr = afs_getattr, }; @@ -118,9 +118,9 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ctx->volnamesz = sizeof(afs_root_volume) - 1; } else { /* read the contents of the AFS special symlink */ - struct page *page; + DEFINE_DELAYED_CALL(cleanup); + const char *content; loff_t size = i_size_read(d_inode(mntpt)); - char *buf; if (src_as->cell) ctx->cell = afs_use_cell(src_as->cell, afs_cell_trace_use_mntpt); @@ -128,16 +128,16 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry 
*mntpt) if (size < 2 || size > PAGE_SIZE - 1) return -EINVAL; - page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); + content = afs_get_link(mntpt, d_inode(mntpt), &cleanup); + if (IS_ERR(content)) { + do_delayed_call(&cleanup); + return PTR_ERR(content); + } - buf = kmap(page); ret = -EINVAL; - if (buf[size - 1] == '.') - ret = vfs_parse_fs_string(fc, "source", buf, size - 1); - kunmap(page); - put_page(page); + if (content[size - 1] == '.') + ret = vfs_parse_fs_string(fc, "source", content, size - 1); + do_delayed_call(&cleanup); if (ret < 0) return ret; diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index d05f2c09efe3..49a749672e38 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -422,6 +422,7 @@ enum yfs_cm_operation { EM(afs_file_error_dir_over_end, "DIR_ENT_OVER_END") \ EM(afs_file_error_dir_small, "DIR_SMALL") \ EM(afs_file_error_dir_unmarked_ext, "DIR_UNMARKED_EXT") \ + EM(afs_file_error_symlink_big, "SYM_BIG") \ EM(afs_file_error_mntpt, "MNTPT_READ_FAILED") \ E_(afs_file_error_writeback_fail, "WRITEBACK_FAILED") -- cgit v1.2.3 From f28fc2010d622a2f1f3fe8fcd2ce2376ecf3430f Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:13 +0000 Subject: afs: Eliminate afs_read Now that directory and symlink reads go through netfslib, the afs_read struct is mostly redundant with almost all data duplicated in the netfs_io_request and netfs_io_subrequest structs that are also available any time we're doing a fetch. Eliminate afs_read by moving the one field we still need there to the afs_call struct (we may be given a different amount of data than what we asked for and have to track what remains of that) and using the netfs_io_subrequest directly instead. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-24-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/file.c | 96 ++++++++++-------------------------------------------- fs/afs/fsclient.c | 55 +++++++++++++++---------------- fs/afs/inode.c | 2 ++ fs/afs/internal.h | 35 ++------------------ fs/afs/yfsclient.c | 47 +++++++++++++------------- fs/netfs/main.c | 2 +- 6 files changed, 72 insertions(+), 165 deletions(-) (limited to 'fs') diff --git a/fs/afs/file.c b/fs/afs/file.c index 48695a50d2f9..b996f4419c0c 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -200,50 +200,12 @@ int afs_release(struct inode *inode, struct file *file) return ret; } -/* - * Allocate a new read record. - */ -struct afs_read *afs_alloc_read(gfp_t gfp) -{ - struct afs_read *req; - - req = kzalloc(sizeof(struct afs_read), gfp); - if (req) - refcount_set(&req->usage, 1); - - return req; -} - -/* - * Dispose of a ref to a read record. 
- */ -void afs_put_read(struct afs_read *req) -{ - if (refcount_dec_and_test(&req->usage)) { - if (req->cleanup) - req->cleanup(req); - key_put(req->key); - kfree(req); - } -} - static void afs_fetch_data_notify(struct afs_operation *op) { - struct afs_read *req = op->fetch.req; - struct netfs_io_subrequest *subreq = req->subreq; - int error = afs_op_error(op); - - req->error = error; - if (subreq) { - subreq->rreq->i_size = req->file_size; - if (req->pos + req->actual_len >= req->file_size) - __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); - subreq->error = error; - netfs_read_subreq_terminated(subreq); - req->subreq = NULL; - } else if (req->done) { - req->done(req); - } + struct netfs_io_subrequest *subreq = op->fetch.subreq; + + subreq->error = afs_op_error(op); + netfs_read_subreq_terminated(subreq); } static void afs_fetch_data_success(struct afs_operation *op) @@ -253,7 +215,7 @@ static void afs_fetch_data_success(struct afs_operation *op) _enter("op=%08x", op->debug_id); afs_vnode_commit_status(op, &op->file[0]); afs_stat_v(vnode, n_fetches); - atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes); + atomic_long_add(op->fetch.subreq->transferred, &op->net->n_fetch_bytes); afs_fetch_data_notify(op); } @@ -265,11 +227,10 @@ static void afs_fetch_data_aborted(struct afs_operation *op) static void afs_fetch_data_put(struct afs_operation *op) { - op->fetch.req->error = afs_op_error(op); - afs_put_read(op->fetch.req); + op->fetch.subreq->error = afs_op_error(op); } -static const struct afs_operation_ops afs_fetch_data_operation = { +const struct afs_operation_ops afs_fetch_data_operation = { .issue_afs_rpc = afs_fs_fetch_data, .issue_yfs_rpc = yfs_fs_fetch_data, .success = afs_fetch_data_success, @@ -281,55 +242,34 @@ static const struct afs_operation_ops afs_fetch_data_operation = { /* * Fetch file data from the volume. 
*/ -int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) +static void afs_read_worker(struct work_struct *work) { + struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work); struct afs_operation *op; + struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); + struct key *key = subreq->rreq->netfs_priv; _enter("%s{%llx:%llu.%u},%x,,,", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, - key_serial(req->key)); + key_serial(key)); - op = afs_alloc_operation(req->key, vnode->volume); + op = afs_alloc_operation(key, vnode->volume); if (IS_ERR(op)) { - if (req->subreq) { - req->subreq->error = PTR_ERR(op); - netfs_read_subreq_terminated(req->subreq); - } - return PTR_ERR(op); + subreq->error = PTR_ERR(op); + netfs_read_subreq_terminated(subreq); + return; } afs_op_set_vnode(op, 0, vnode); - op->fetch.req = afs_get_read(req); + op->fetch.subreq = subreq; op->ops = &afs_fetch_data_operation; - return afs_do_sync_operation(op); -} - -static void afs_read_worker(struct work_struct *work) -{ - struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work); - struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); - struct afs_read *fsreq; - - fsreq = afs_alloc_read(GFP_NOFS); - if (!fsreq) { - subreq->error = -ENOMEM; - return netfs_read_subreq_terminated(subreq); - } - - fsreq->subreq = subreq; - fsreq->pos = subreq->start + subreq->transferred; - fsreq->len = subreq->len - subreq->transferred; - fsreq->key = key_get(subreq->rreq->netfs_priv); - fsreq->vnode = vnode; - fsreq->iter = &subreq->io_iter; trace_netfs_sreq(subreq, netfs_sreq_trace_submit); - afs_fetch_data(fsreq->vnode, fsreq); - afs_put_read(fsreq); + afs_do_sync_operation(op); } static void afs_issue_read(struct netfs_io_subrequest *subreq) diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 784f7daab112..d9d224c95454 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -301,19 +301,19 @@ void afs_fs_fetch_status(struct afs_operation *op) static int afs_deliver_fs_fetch_data(struct afs_call *call) { struct afs_operation *op = call->op; + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; - struct afs_read *req = op->fetch.req; const __be32 *bp; size_t count_before; int ret; _enter("{%u,%zu,%zu/%llu}", call->unmarshall, call->iov_len, iov_iter_count(call->iter), - req->actual_len); + call->remaining); switch (call->unmarshall) { case 0: - req->actual_len = 0; + call->remaining = 0; call->unmarshall++; if (call->operation_ID == FSFETCHDATA64) { afs_extract_to_tmp64(call); @@ -323,8 +323,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) } fallthrough; - /* Extract the returned data length into - * ->actual_len. This may indicate more or less data than was + /* Extract the returned data length into ->remaining. + * This may indicate more or less data than was * requested will be returned. 
*/ case 1: @@ -333,42 +333,41 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) if (ret < 0) return ret; - req->actual_len = be64_to_cpu(call->tmp64); - _debug("DATA length: %llu", req->actual_len); + call->remaining = be64_to_cpu(call->tmp64); + _debug("DATA length: %llu", call->remaining); - if (req->actual_len == 0) + if (call->remaining == 0) goto no_more_data; - call->iter = req->iter; - call->iov_len = min(req->actual_len, req->len); + call->iter = &subreq->io_iter; + call->iov_len = umin(call->remaining, subreq->len - subreq->transferred); call->unmarshall++; fallthrough; /* extract the returned data */ case 2: count_before = call->iov_len; - _debug("extract data %zu/%llu", count_before, req->actual_len); + _debug("extract data %zu/%llu", count_before, call->remaining); ret = afs_extract_data(call, true); - if (req->subreq) { - req->subreq->transferred += count_before - call->iov_len; - netfs_read_subreq_progress(req->subreq); - } + subreq->transferred += count_before - call->iov_len; + call->remaining -= count_before - call->iov_len; + netfs_read_subreq_progress(subreq); if (ret < 0) return ret; call->iter = &call->def_iter; - if (req->actual_len <= req->len) + if (call->remaining) goto no_more_data; /* Discard any excess data the server gave us */ - afs_extract_discard(call, req->actual_len - req->len); + afs_extract_discard(call, call->remaining); call->unmarshall = 3; fallthrough; case 3: _debug("extract discard %zu/%llu", - iov_iter_count(call->iter), req->actual_len - req->len); + iov_iter_count(call->iter), call->remaining); ret = afs_extract_data(call, true); if (ret < 0) @@ -390,8 +389,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) xdr_decode_AFSCallBack(&bp, call, &vp->scb); xdr_decode_AFSVolSync(&bp, &op->volsync); - req->data_version = vp->scb.status.data_version; - req->file_size = vp->scb.status.size; + if (subreq->start + subreq->transferred >= vp->scb.status.size) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); call->unmarshall++; fallthrough; @@ -426,8 +425,8 @@ static const struct afs_call_type afs_RXFSFetchData64 = { */ static void afs_fs_fetch_data64(struct afs_operation *op) { + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; - struct afs_read *req = op->fetch.req; struct afs_call *call; __be32 *bp; @@ -443,10 +442,10 @@ static void afs_fs_fetch_data64(struct afs_operation *op) bp[1] = htonl(vp->fid.vid); bp[2] = htonl(vp->fid.vnode); bp[3] = htonl(vp->fid.unique); - bp[4] = htonl(upper_32_bits(req->pos)); - bp[5] = htonl(lower_32_bits(req->pos)); + bp[4] = htonl(upper_32_bits(subreq->start + subreq->transferred)); + bp[5] = htonl(lower_32_bits(subreq->start + subreq->transferred)); bp[6] = 0; - bp[7] = htonl(lower_32_bits(req->len)); + bp[7] = htonl(lower_32_bits(subreq->len - subreq->transferred)); call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); @@ -458,9 +457,9 @@ static void afs_fs_fetch_data64(struct afs_operation *op) */ void afs_fs_fetch_data(struct afs_operation *op) { + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_read *req = op->fetch.req; __be32 *bp; if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags)) @@ -472,16 +471,14 @@ void afs_fs_fetch_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); - req->call_debug_id = call->debug_id; - /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHDATA); bp[1] = htonl(vp->fid.vid); bp[2] = 
htonl(vp->fid.vnode); bp[3] = htonl(vp->fid.unique); - bp[4] = htonl(lower_32_bits(req->pos)); - bp[5] = htonl(lower_32_bits(req->len)); + bp[4] = htonl(lower_32_bits(subreq->start + subreq->transferred)); + bp[5] = htonl(lower_32_bits(subreq->len + subreq->transferred)); call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 6934cc30a4ca..0e3c43c40632 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -317,6 +317,8 @@ static void afs_apply_status(struct afs_operation *op, inode_set_ctime_to_ts(inode, t); inode_set_atime_to_ts(inode, t); } + if (op->ops == &afs_fetch_data_operation) + op->fetch.subreq->rreq->i_size = status->size; } } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 7f170455cf25..39d2e29ed0e0 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -163,6 +163,7 @@ struct afs_call { spinlock_t state_lock; int error; /* error code */ u32 abort_code; /* Remote abort ID or 0 */ + unsigned long long remaining; /* How much is left to receive */ unsigned int max_lifespan; /* Maximum lifespan in secs to set if not 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ @@ -232,28 +233,6 @@ static inline struct key *afs_file_key(struct file *file) return af->key; } -/* - * Record of an outstanding read operation on a vnode. - */ -struct afs_read { - loff_t pos; /* Where to start reading */ - loff_t len; /* How much we're asking for */ - loff_t actual_len; /* How much we're actually getting */ - loff_t file_size; /* File size returned by server */ - struct key *key; /* The key to use to reissue the read */ - struct afs_vnode *vnode; /* The file being read into. */ - struct netfs_io_subrequest *subreq; /* Fscache helper read request this belongs to */ - afs_dataversion_t data_version; /* Version number returned by server */ - refcount_t usage; - unsigned int call_debug_id; - unsigned int nr_pages; - int error; - void (*done)(struct afs_read *); - void (*cleanup)(struct afs_read *); - struct iov_iter *iter; /* Iterator representing the buffer */ - struct iov_iter def_iter; /* Default iterator */ -}; - /* * AFS superblock private data * - there's one superblock per volume @@ -911,7 +890,7 @@ struct afs_operation { bool new_negative; } rename; struct { - struct afs_read *req; + struct netfs_io_subrequest *subreq; } fetch; struct { afs_lock_type_t type; @@ -1118,21 +1097,13 @@ extern void afs_dynroot_depopulate(struct super_block *); extern const struct address_space_operations afs_file_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; +extern const struct afs_operation_ops afs_fetch_data_operation; extern const struct netfs_request_ops afs_req_ops; extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern void afs_put_wb_key(struct afs_wb_key *); extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); -extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); -extern struct afs_read *afs_alloc_read(gfp_t); -extern void afs_put_read(struct afs_read *); - -static inline struct afs_read *afs_get_read(struct afs_read *req) -{ - refcount_inc(&req->usage); - return req; -} /* * flock.c diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 368cf277d801..3718d852fabc 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -352,19 +352,19 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call) static int 
yfs_deliver_fs_fetch_data64(struct afs_call *call) { struct afs_operation *op = call->op; + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; - struct afs_read *req = op->fetch.req; const __be32 *bp; size_t count_before; int ret; _enter("{%u,%zu, %zu/%llu}", call->unmarshall, call->iov_len, iov_iter_count(call->iter), - req->actual_len); + call->remaining); switch (call->unmarshall) { case 0: - req->actual_len = 0; + call->remaining = 0; afs_extract_to_tmp64(call); call->unmarshall++; fallthrough; @@ -379,42 +379,40 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) if (ret < 0) return ret; - req->actual_len = be64_to_cpu(call->tmp64); - _debug("DATA length: %llu", req->actual_len); + call->remaining = be64_to_cpu(call->tmp64); + _debug("DATA length: %llu", call->remaining); - if (req->actual_len == 0) + if (call->remaining == 0) goto no_more_data; - call->iter = req->iter; - call->iov_len = min(req->actual_len, req->len); + call->iter = &subreq->io_iter; + call->iov_len = min(call->remaining, subreq->len - subreq->transferred); call->unmarshall++; fallthrough; /* extract the returned data */ case 2: count_before = call->iov_len; - _debug("extract data %zu/%llu", count_before, req->actual_len); + _debug("extract data %zu/%llu", count_before, call->remaining); ret = afs_extract_data(call, true); - if (req->subreq) { - req->subreq->transferred += count_before - call->iov_len; - netfs_read_subreq_progress(req->subreq); - } + subreq->transferred += count_before - call->iov_len; + netfs_read_subreq_progress(subreq); if (ret < 0) return ret; call->iter = &call->def_iter; - if (req->actual_len <= req->len) + if (call->remaining) goto no_more_data; /* Discard any excess data the server gave us */ - afs_extract_discard(call, req->actual_len - req->len); + afs_extract_discard(call, call->remaining); call->unmarshall = 3; fallthrough; case 3: _debug("extract discard %zu/%llu", - iov_iter_count(call->iter), req->actual_len - req->len); + iov_iter_count(call->iter), call->remaining); ret = afs_extract_data(call, true); if (ret < 0) @@ -439,8 +437,8 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) xdr_decode_YFSCallBack(&bp, call, &vp->scb); xdr_decode_YFSVolSync(&bp, &op->volsync); - req->data_version = vp->scb.status.data_version; - req->file_size = vp->scb.status.size; + if (subreq->start + subreq->transferred >= vp->scb.status.size) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); call->unmarshall++; fallthrough; @@ -468,14 +466,15 @@ static const struct afs_call_type yfs_RXYFSFetchData64 = { */ void yfs_fs_fetch_data(struct afs_operation *op) { + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; - struct afs_read *req = op->fetch.req; struct afs_call *call; __be32 *bp; - _enter(",%x,{%llx:%llu},%llx,%llx", + _enter(",%x,{%llx:%llu},%llx,%zx", key_serial(op->key), vp->fid.vid, vp->fid.vnode, - req->pos, req->len); + subreq->start + subreq->transferred, + subreq->len - subreq->transferred); call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchData64, sizeof(__be32) * 2 + @@ -487,15 +486,13 @@ void yfs_fs_fetch_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); - req->call_debug_id = call->debug_id; - /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSFETCHDATA64); bp = xdr_encode_u32(bp, 0); /* RPC flags */ bp = xdr_encode_YFSFid(bp, &vp->fid); - bp = xdr_encode_u64(bp, req->pos); - bp = xdr_encode_u64(bp, req->len); + bp = 
xdr_encode_u64(bp, subreq->start + subreq->transferred); + bp = xdr_encode_u64(bp, subreq->len - subreq->transferred); yfs_check_req(call, bp); call->fid = vp->fid; diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 8c1922c0cb42..16760695e667 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -118,7 +118,7 @@ static int __init netfs_init(void) goto error_reqpool; netfs_subrequest_slab = kmem_cache_create("netfs_subrequest", - sizeof(struct netfs_io_subrequest), 0, + sizeof(struct netfs_io_subrequest) + 16, 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!netfs_subrequest_slab) -- cgit v1.2.3 From 9750be93b2be12b6d92323b97d7c055099d279e6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:14 +0000 Subject: afs: Fix cleanup of immediately failed async calls If we manage to begin an async call, but fail to transmit any data on it due to a signal, we then abort it which causes a race between the notification of call completion from rxrpc and our attempt to cancel the notification. The notification will be necessary, however, for async FetchData to terminate the netfs subrequest. However, since we get a notification from rxrpc upon completion of a call (aborted or otherwise), we can just leave it to that. This leads to calls not getting cleaned up, but appearing in /proc/net/rxrpc/calls as being aborted with code 6. Fix this by making the "error_do_abort:" case of afs_make_call() abort the call and then abandon it to the notification handler. Fixes: 34fa47612bfe ("afs: Fix race in async call refcounting") Reported-by: Marc Dionne Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-25-dhowells@redhat.com cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/internal.h | 9 +++++++++ fs/afs/rxrpc.c | 12 +++++++++--- include/trace/events/afs.h | 2 ++ 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 39d2e29ed0e0..96fc466efd10 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1336,6 +1336,15 @@ extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, bool); extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); +static inline void afs_see_call(struct afs_call *call, enum afs_call_trace why) +{ + int r = refcount_read(&call->ref); + + trace_afs_call(call->debug_id, why, r, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); +} + static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call, gfp_t gfp) { diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 9f2a3bb56ec6..a122c6366ce1 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -430,11 +430,16 @@ void afs_make_call(struct afs_call *call, gfp_t gfp) return; error_do_abort: - if (ret != -ECONNABORTED) { + if (ret != -ECONNABORTED) rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, afs_abort_send_data_error); - } else { + if (call->async) { + afs_see_call(call, afs_call_trace_async_abort); + return; + } + + if (ret == -ECONNABORTED) { len = 0; iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, @@ -445,6 +450,8 @@ error_do_abort: call->error = ret; trace_afs_call_done(call); error_kill_call: + if (call->async) + afs_see_call(call, afs_call_trace_async_kill); if (call->type->done) call->type->done(call); @@ -602,7 +609,6 @@ local_abort: abort_code = 0; call_complete: 
afs_set_call_complete(call, ret, remote_abort); - state = AFS_CALL_COMPLETE; goto done; } diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 49a749672e38..cdb5f2af7799 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -118,6 +118,8 @@ enum yfs_cm_operation { */ #define afs_call_traces \ EM(afs_call_trace_alloc, "ALLOC") \ + EM(afs_call_trace_async_abort, "ASYAB") \ + EM(afs_call_trace_async_kill, "ASYKL") \ EM(afs_call_trace_free, "FREE ") \ EM(afs_call_trace_get, "GET ") \ EM(afs_call_trace_put, "PUT ") \ -- cgit v1.2.3 From eddf51f2bb2c28b082199c6f5fd95611ca511135 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:15 +0000 Subject: afs: Make {Y,}FS.FetchData an asynchronous operation Make FS.FetchData and YFS.FetchData an asynchronous operation in that the request is queued in AF_RXRPC and then we return to the caller rather than waiting. Processing of the returning packets is then done inline if it's a synchronous VFS/VM call (readdir, read_folio, sync DIO, prep for write) or offloaded to a workqueue if asynchronous VM calls (eg. readahead, async DIO). This reduces the chain of workqueues invoking workqueues and cuts out some of the overhead, driving rxrpc data extraction and netfslib read collection from a thread that's going to block to completion anyway if possible. The ->done() call op is also split with ->immediate_cancel() handling the cancellation on failure to begin the call and ->done() handling the rest. This means that the AFS async FetchData code doesn't try to terminate the netfs subrequest twice. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-26-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/file.c | 126 ++++++++++++++++++++++++++++++++++++++++++++------ fs/afs/fs_operation.c | 2 +- fs/afs/fsclient.c | 9 +++- fs/afs/internal.h | 24 ++++++++++ fs/afs/main.c | 2 +- fs/afs/rxrpc.c | 25 +++------- fs/afs/vlclient.c | 1 + fs/afs/write.c | 12 +++++ fs/afs/yfsclient.c | 6 ++- 9 files changed, 170 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/afs/file.c b/fs/afs/file.c index b996f4419c0c..c296efebb491 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -225,26 +225,111 @@ static void afs_fetch_data_aborted(struct afs_operation *op) afs_fetch_data_notify(op); } -static void afs_fetch_data_put(struct afs_operation *op) -{ - op->fetch.subreq->error = afs_op_error(op); -} - const struct afs_operation_ops afs_fetch_data_operation = { .issue_afs_rpc = afs_fs_fetch_data, .issue_yfs_rpc = yfs_fs_fetch_data, .success = afs_fetch_data_success, .aborted = afs_fetch_data_aborted, .failed = afs_fetch_data_notify, - .put = afs_fetch_data_put, }; +static void afs_issue_read_call(struct afs_operation *op) +{ + op->call_responded = false; + op->call_error = 0; + op->call_abort_code = 0; + if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags)) + yfs_fs_fetch_data(op); + else + afs_fs_fetch_data(op); +} + +static void afs_end_read(struct afs_operation *op) +{ + if (op->call_responded && op->server) + set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags); + + if (!afs_op_error(op)) + afs_fetch_data_success(op); + else if (op->cumul_error.aborted) + afs_fetch_data_aborted(op); + else + afs_fetch_data_notify(op); + + afs_end_vnode_operation(op); + afs_put_operation(op); +} + +/* + * Perform I/O processing on an asynchronous call. 
The work item carries a ref + * to the call struct that we either need to release or to pass on. + */ +static void afs_read_receive(struct afs_call *call) +{ + struct afs_operation *op = call->op; + enum afs_call_state state; + + _enter(""); + + state = READ_ONCE(call->state); + if (state == AFS_CALL_COMPLETE) + return; + + while (state < AFS_CALL_COMPLETE && READ_ONCE(call->need_attention)) { + WRITE_ONCE(call->need_attention, false); + afs_deliver_to_call(call); + state = READ_ONCE(call->state); + } + + if (state < AFS_CALL_COMPLETE) { + netfs_read_subreq_progress(op->fetch.subreq); + if (rxrpc_kernel_check_life(call->net->socket, call->rxcall)) + return; + /* rxrpc terminated the call. */ + afs_set_call_complete(call, call->error, call->abort_code); + } + + op->call_abort_code = call->abort_code; + op->call_error = call->error; + op->call_responded = call->responded; + op->call = NULL; + call->op = NULL; + afs_put_call(call); + + /* If the call failed, then we need to crank the server rotation + * handle and try the next. + */ + if (afs_select_fileserver(op)) { + afs_issue_read_call(op); + return; + } + + afs_end_read(op); +} + +void afs_fetch_data_async_rx(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, async_work); + + afs_read_receive(call); + afs_put_call(call); +} + +void afs_fetch_data_immediate_cancel(struct afs_call *call) +{ + if (call->async) { + afs_get_call(call, afs_call_trace_wake); + if (!queue_work(afs_async_calls, &call->async_work)) + afs_deferred_put_call(call); + flush_work(&call->async_work); + } +} + /* * Fetch file data from the volume. */ -static void afs_read_worker(struct work_struct *work) +static void afs_issue_read(struct netfs_io_subrequest *subreq) { - struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work); struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); struct key *key = subreq->rreq->netfs_priv; @@ -269,13 +354,26 @@ static void afs_read_worker(struct work_struct *work) op->ops = &afs_fetch_data_operation; trace_netfs_sreq(subreq, netfs_sreq_trace_submit); - afs_do_sync_operation(op); -} -static void afs_issue_read(struct netfs_io_subrequest *subreq) -{ - INIT_WORK(&subreq->work, afs_read_worker); - queue_work(system_long_wq, &subreq->work); + if (subreq->rreq->origin == NETFS_READAHEAD || + subreq->rreq->iocb) { + op->flags |= AFS_OPERATION_ASYNC; + + if (!afs_begin_vnode_operation(op)) { + subreq->error = afs_put_operation(op); + netfs_read_subreq_terminated(subreq); + return; + } + + if (!afs_select_fileserver(op)) { + afs_end_read(op); + return; + } + + afs_issue_read_call(op); + } else { + afs_do_sync_operation(op); + } } static int afs_init_request(struct netfs_io_request *rreq, struct file *file) diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 0175d7a31332..8418813ee043 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -256,7 +256,7 @@ bool afs_begin_vnode_operation(struct afs_operation *op) /* * Tidy up a filesystem cursor and unlock the vnode. 
*/ -static void afs_end_vnode_operation(struct afs_operation *op) +void afs_end_vnode_operation(struct afs_operation *op) { _enter(""); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index d9d224c95454..1d9ecd5418d8 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -352,7 +352,6 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) ret = afs_extract_data(call, true); subreq->transferred += count_before - call->iov_len; call->remaining -= count_before - call->iov_len; - netfs_read_subreq_progress(subreq); if (ret < 0) return ret; @@ -409,14 +408,18 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) static const struct afs_call_type afs_RXFSFetchData = { .name = "FS.FetchData", .op = afs_FS_FetchData, + .async_rx = afs_fetch_data_async_rx, .deliver = afs_deliver_fs_fetch_data, + .immediate_cancel = afs_fetch_data_immediate_cancel, .destructor = afs_flat_call_destructor, }; static const struct afs_call_type afs_RXFSFetchData64 = { .name = "FS.FetchData64", .op = afs_FS_FetchData64, + .async_rx = afs_fetch_data_async_rx, .deliver = afs_deliver_fs_fetch_data, + .immediate_cancel = afs_fetch_data_immediate_cancel, .destructor = afs_flat_call_destructor, }; @@ -436,6 +439,9 @@ static void afs_fs_fetch_data64(struct afs_operation *op) if (!call) return afs_op_nomem(op); + if (op->flags & AFS_OPERATION_ASYNC) + call->async = true; + /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHDATA64); @@ -1730,6 +1736,7 @@ static const struct afs_call_type afs_RXFSGetCapabilities = { .op = afs_FS_GetCapabilities, .deliver = afs_deliver_fs_get_capabilities, .done = afs_fileserver_probe_result, + .immediate_cancel = afs_fileserver_probe_result, .destructor = afs_fs_get_capabilities_destructor, }; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 96fc466efd10..cd2c4f85117d 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -202,11 +202,17 @@ struct afs_call_type { /* clean up a call */ void (*destructor)(struct afs_call *call); + /* Async receive processing function */ + void (*async_rx)(struct work_struct *work); + /* Work function */ void (*work)(struct work_struct *work); /* Call done function (gets called immediately on success or failure) */ void (*done)(struct afs_call *call); + + /* Handle a call being immediately cancelled. 
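/*
 * Illustrative sketch, not part of the patch: the take-a-ref-then-queue
 * idiom that afs_fetch_data_immediate_cancel() (earlier in this patch) uses
 * when driving this hook for async calls.  queue_work() returns false if
 * the work item was already pending, in which case the reference taken on
 * behalf of the queue must be returned.  'struct foo' and its helpers are
 * hypothetical stand-ins, assuming the usual kernel headers.
 */
#include <linux/workqueue.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct foo {
        refcount_t              ref;
        struct work_struct      work;
};

static void foo_put(struct foo *f)
{
        if (refcount_dec_and_test(&f->ref))
                kfree(f);
}

static void foo_queue(struct workqueue_struct *wq, struct foo *f)
{
        refcount_inc(&f->ref);          /* ref to be owned by the work item */
        if (!queue_work(wq, &f->work))
                foo_put(f);             /* already queued: return the ref */
}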
*/ + void (*immediate_cancel)(struct afs_call *call); }; /* @@ -942,6 +948,7 @@ struct afs_operation { #define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */ #define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */ #define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */ +#define AFS_OPERATION_ASYNC 0x2000 /* Set if should run asynchronously */ }; /* @@ -1104,6 +1111,8 @@ extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern void afs_put_wb_key(struct afs_wb_key *); extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); +void afs_fetch_data_async_rx(struct work_struct *work); +void afs_fetch_data_immediate_cancel(struct afs_call *call); /* * flock.c @@ -1155,6 +1164,7 @@ extern void afs_fs_store_acl(struct afs_operation *); extern struct afs_operation *afs_alloc_operation(struct key *, struct afs_volume *); extern int afs_put_operation(struct afs_operation *); extern bool afs_begin_vnode_operation(struct afs_operation *); +extern void afs_end_vnode_operation(struct afs_operation *op); extern void afs_wait_for_operation(struct afs_operation *); extern int afs_do_sync_operation(struct afs_operation *); @@ -1326,6 +1336,7 @@ extern void afs_charge_preallocation(struct work_struct *); extern void afs_put_call(struct afs_call *); void afs_deferred_put_call(struct afs_call *call); void afs_make_call(struct afs_call *call, gfp_t gfp); +void afs_deliver_to_call(struct afs_call *call); void afs_wait_for_call_to_complete(struct afs_call *call); extern struct afs_call *afs_alloc_flat_call(struct afs_net *, const struct afs_call_type *, @@ -1336,6 +1347,19 @@ extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, bool); extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); +static inline struct afs_call *afs_get_call(struct afs_call *call, + enum afs_call_trace why) +{ + int r; + + __refcount_inc(&call->ref, &r); + + trace_afs_call(call->debug_id, why, r + 1, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); + return call; +} + static inline void afs_see_call(struct afs_call *call, enum afs_call_trace why) { int r = refcount_read(&call->ref); diff --git a/fs/afs/main.c b/fs/afs/main.c index a14f6013e316..1ae0067f772d 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -177,7 +177,7 @@ static int __init afs_init(void) afs_wq = alloc_workqueue("afs", 0, 0); if (!afs_wq) goto error_afs_wq; - afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0); + afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!afs_async_calls) goto error_async; afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0); diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index a122c6366ce1..886416ea1d96 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -149,7 +149,8 @@ static struct afs_call *afs_alloc_call(struct afs_net *net, call->net = net; call->debug_id = atomic_inc_return(&rxrpc_debug_id); refcount_set(&call->ref, 1); - INIT_WORK(&call->async_work, afs_process_async_call); + INIT_WORK(&call->async_work, type->async_rx ?: afs_process_async_call); + INIT_WORK(&call->work, call->type->work); INIT_WORK(&call->free_work, afs_deferred_free_worker); init_waitqueue_head(&call->waitq); spin_lock_init(&call->state_lock); @@ -235,27 +236,12 @@ void afs_deferred_put_call(struct afs_call *call) 
schedule_work(&call->free_work); } -static struct afs_call *afs_get_call(struct afs_call *call, - enum afs_call_trace why) -{ - int r; - - __refcount_inc(&call->ref, &r); - - trace_afs_call(call->debug_id, why, r + 1, - atomic_read(&call->net->nr_outstanding_calls), - __builtin_return_address(0)); - return call; -} - /* * Queue the call for actual work. */ static void afs_queue_call_work(struct afs_call *call) { if (call->type->work) { - INIT_WORK(&call->work, call->type->work); - afs_get_call(call, afs_call_trace_work); if (!queue_work(afs_wq, &call->work)) afs_put_call(call); @@ -452,8 +438,8 @@ error_do_abort: error_kill_call: if (call->async) afs_see_call(call, afs_call_trace_async_kill); - if (call->type->done) - call->type->done(call); + if (call->type->immediate_cancel) + call->type->immediate_cancel(call); /* We need to dispose of the extra ref we grabbed for an async call. * The call, however, might be queued on afs_async_calls and we need to @@ -508,7 +494,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort) /* * deliver messages to a call */ -static void afs_deliver_to_call(struct afs_call *call) +void afs_deliver_to_call(struct afs_call *call) { enum afs_call_state state; size_t len; @@ -809,6 +795,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call) return -ENOTSUPP; trace_afs_cb_call(call); + call->work.func = call->type->work; /* pass responsibility for the remainer of this message off to the * cache manager op */ diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index cac75f89b64a..adc617a82a86 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -370,6 +370,7 @@ static const struct afs_call_type afs_RXVLGetCapabilities = { .name = "VL.GetCapabilities", .op = afs_VL_GetCapabilities, .deliver = afs_deliver_vl_get_capabilities, + .immediate_cancel = afs_vlserver_probe_result, .done = afs_vlserver_probe_result, .destructor = afs_destroy_vl_get_capabilities, }; diff --git a/fs/afs/write.c b/fs/afs/write.c index 62570357a0bd..18b0a9f1615e 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -196,6 +196,18 @@ void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *st list_first_entry(&stream->subrequests, struct netfs_io_subrequest, rreq_link); + switch (wreq->origin) { + case NETFS_READAHEAD: + case NETFS_READPAGE: + case NETFS_READ_GAPS: + case NETFS_READ_SINGLE: + case NETFS_READ_FOR_WRITE: + case NETFS_DIO_READ: + return; + default: + break; + } + switch (subreq->error) { case -EACCES: case -EPERM: diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 3718d852fabc..f57c089f26ee 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -397,7 +397,6 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) ret = afs_extract_data(call, true); subreq->transferred += count_before - call->iov_len; - netfs_read_subreq_progress(subreq); if (ret < 0) return ret; @@ -457,7 +456,9 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) static const struct afs_call_type yfs_RXYFSFetchData64 = { .name = "YFS.FetchData64", .op = yfs_FS_FetchData64, + .async_rx = afs_fetch_data_async_rx, .deliver = yfs_deliver_fs_fetch_data64, + .immediate_cancel = afs_fetch_data_immediate_cancel, .destructor = afs_flat_call_destructor, }; @@ -486,6 +487,9 @@ void yfs_fs_fetch_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); + if (op->flags & AFS_OPERATION_ASYNC) + call->async = true; + /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSFETCHDATA64); -- cgit v1.2.3 From 
e2d46f2ec332533816417b60933954173f602121 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:17 +0000 Subject: netfs: Change the read result collector to only use one work item Change the way netfslib collects read results to do all the collection for a particular read request using a single work item that walks along the subrequest queue as subrequests make progress or complete, unlocking folios progressively rather than doing the unlock in parallel as parallel requests come in. The code is remodelled to be more like the write-side code, though only using a single stream. This makes it more directly comparable and thus easier to duplicate fixes between the two sides. This has a number of advantages: (1) It's simpler. There doesn't need to be a complex donation mechanism to handle mismatches between the size and alignment of subrequests and folios. The collector unlocks folios as the subrequests covering each complete. (2) It should cause less scheduler overhead as there's a single work item in play unlocking pages in parallel when a read gets split up into a lot of subrequests instead of one per subrequest. Whilst the parallellism is nice in theory, in practice, the vast majority of loads are sequential reads of the whole file, so committing a bunch of threads to unlocking folios out of order doesn't help in those cases. (3) It should make it easier to implement content decryption. A folio cannot be decrypted until all the requests that contribute to it have completed - and, again, most loads are sequential and so, most of the time, we want to begin decryption sequentially (though it's great if the decryption can happen in parallel). There is a disadvantage in that we're losing the ability to decrypt and unlock things on an as-things-arrive basis which may affect some applications. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-28-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/9p/vfs_addr.c | 3 +- fs/afs/dir.c | 8 +- fs/ceph/addr.c | 9 +- fs/netfs/buffered_read.c | 160 +++++----- fs/netfs/direct_read.c | 60 ++-- fs/netfs/internal.h | 21 +- fs/netfs/main.c | 2 +- fs/netfs/objects.c | 34 +- fs/netfs/read_collect.c | 716 +++++++++++++++++++++++++------------------ fs/netfs/read_pgpriv2.c | 203 +++++------- fs/netfs/read_retry.c | 207 +++++++------ fs/netfs/read_single.c | 37 +-- fs/netfs/write_collect.c | 4 +- fs/netfs/write_issue.c | 2 +- fs/netfs/write_retry.c | 14 +- fs/smb/client/cifssmb.c | 2 + fs/smb/client/smb2pdu.c | 5 +- include/linux/netfs.h | 16 +- include/trace/events/netfs.h | 79 +---- 19 files changed, 819 insertions(+), 763 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index b38be6ff90bc..32619d146cbc 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -81,8 +81,7 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (pos + total >= i_size_read(rreq->inode)) __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); - - if (!err) { + if (!err && total) { subreq->transferred += total; __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 3fa095429794..a386b4649f3e 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -325,8 +325,10 @@ static ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) * haven't read it yet. 
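/*
 * Illustrative sketch, not part of the patch: the lock-upgrade idiom used
 * by afs_read_dir() in this hunk - check the cached state under the shared
 * lock, and only if it is stale drop it, retake it exclusively, rebuild,
 * then downgrade so the caller continues with the lock held shared either
 * way (the caller releases it with up_read()).  Names are hypothetical.
 */
#include <linux/rwsem.h>

static int check_or_rebuild(struct rw_semaphore *sem, bool *valid)
{
        int ret;

        down_read(sem);
        if (*valid)
                return 0;               /* fast path: still held shared */
        up_read(sem);

        ret = down_write_killable(sem);
        if (ret < 0)
                return ret;
        if (!*valid) {
                /* ... rebuild the cached state here ... */
                *valid = true;
        }
        downgrade_write(sem);           /* exclusive -> shared */
        return 0;
}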
*/ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && - test_bit(AFS_VNODE_DIR_READ, &dvnode->flags)) + test_bit(AFS_VNODE_DIR_READ, &dvnode->flags)) { + ret = i_size; goto valid; + } up_read(&dvnode->validate_lock); if (down_write_killable(&dvnode->validate_lock) < 0) @@ -346,11 +348,13 @@ static ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); set_bit(AFS_VNODE_DIR_READ, &dvnode->flags); + } else { + ret = i_size; } downgrade_write(&dvnode->validate_lock); valid: - return i_size; + return ret; error_unlock: up_write(&dvnode->validate_lock); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4deb38fa470e..f5224a566b69 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -223,10 +223,13 @@ static void finish_netfs_read(struct ceph_osd_request *req) subreq->len, i_size_read(req->r_inode)); /* no object means success but no data */ - if (err == -ENOENT) + if (err == -ENOENT) { + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); err = 0; - else if (err == -EBLOCKLISTED) + } else if (err == -EBLOCKLISTED) { fsc->blocklisted = true; + } if (err >= 0) { if (sparse && err > 0) @@ -242,6 +245,8 @@ static void finish_netfs_read(struct ceph_osd_request *req) if (err > subreq->len) err = subreq->len; } + if (err > 0) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); } if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index c420e9dee0e4..0245943d974d 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -121,12 +121,6 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) subreq->io_iter = rreq->buffer.iter; - if (iov_iter_is_folioq(&subreq->io_iter)) { - subreq->curr_folioq = (struct folio_queue *)subreq->io_iter.folioq; - subreq->curr_folioq_slot = subreq->io_iter.folioq_slot; - subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot]; - } - iov_iter_truncate(&subreq->io_iter, subreq->len); rolling_buffer_advance(&rreq->buffer, subreq->len); return subreq->len; @@ -147,19 +141,6 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_request *rr } -void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async) -{ - struct netfs_io_subrequest *subreq = priv; - - if (transferred_or_error > 0) { - subreq->transferred += transferred_or_error; - subreq->error = 0; - } else { - subreq->error = transferred_or_error; - } - schedule_work(&subreq->work); -} - /* * Issue a read against the cache. * - Eats the caller's ref on subreq. @@ -174,6 +155,47 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, netfs_cache_read_terminated, subreq); } +static void netfs_issue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + struct netfs_io_stream *stream = &rreq->io_streams[0]; + + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + + /* We add to the end of the list whilst the collector may be walking + * the list. The collector only goes nextwards and uses the lock to + * remove entries off of the front. 
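/*
 * Illustrative sketch, not part of the patch, of the queue discipline this
 * comment describes: submitters append to the tail under the lock,
 * completion marks an entry done with release semantics, and the single
 * collector only ever detaches finished entries from the front, stopping at
 * the first one still in flight.  Types and helpers here are hypothetical.
 */
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct sub {
        struct list_head        link;
        bool                    done;
};

static void sub_submit(struct list_head *queue, spinlock_t *lock, struct sub *s)
{
        spin_lock(lock);
        list_add_tail(&s->link, queue);
        spin_unlock(lock);
}

static void sub_complete(struct sub *s)
{
        smp_store_release(&s->done, true);      /* publish results before 'done' */
}

static void collect_front(struct list_head *queue, spinlock_t *lock)
{
        struct sub *s;

        for (;;) {
                spin_lock(lock);
                s = list_first_entry_or_null(queue, struct sub, link);
                if (!s || !smp_load_acquire(&s->done)) {
                        spin_unlock(lock);
                        break;                  /* front entry not finished yet */
                }
                list_del(&s->link);             /* only ever pop from the front */
                spin_unlock(lock);
                kfree(s);
        }
}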
+ */ + spin_lock(&rreq->lock); + list_add_tail(&subreq->rreq_link, &stream->subrequests); + if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { + stream->front = subreq; + if (!stream->active) { + stream->collected_to = stream->front->start; + /* Store list pointers before active flag */ + smp_store_release(&stream->active, true); + } + } + + spin_unlock(&rreq->lock); + + switch (subreq->source) { + case NETFS_DOWNLOAD_FROM_SERVER: + rreq->netfs_ops->issue_read(subreq); + break; + case NETFS_READ_FROM_CACHE: + netfs_read_cache_to_pagecache(rreq, subreq); + break; + default: + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + subreq->error = 0; + iov_iter_zero(subreq->len, &subreq->io_iter); + subreq->transferred = subreq->len; + netfs_read_subreq_terminated(subreq); + break; + } +} + /* * Perform a read to the pagecache from a series of sources of different types, * slicing up the region to be read according to available cache blocks and @@ -186,8 +208,6 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) ssize_t size = rreq->len; int ret = 0; - atomic_inc(&rreq->nr_outstanding); - do { struct netfs_io_subrequest *subreq; enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; @@ -202,14 +222,6 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) subreq->start = start; subreq->len = size; - atomic_inc(&rreq->nr_outstanding); - spin_lock(&rreq->lock); - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - subreq->prev_donated = rreq->prev_donated; - rreq->prev_donated = 0; - trace_netfs_sreq(subreq, netfs_sreq_trace_added); - spin_unlock(&rreq->lock); - source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size); subreq->source = source; if (source == NETFS_DOWNLOAD_FROM_SERVER) { @@ -237,17 +249,18 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) netfs_stat(&netfs_n_rh_download); if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); - if (ret < 0) - goto prep_failed; + if (ret < 0) { + subreq->error = ret; + /* Not queued - release both refs. 
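/*
 * Illustrative sketch, not part of the patch: why two puts here.  A netfs
 * subrequest is created with a refcount of 2 (see netfs_alloc_subrequest()
 * later in this patch) - one ref for the issuer and one to be owned by the
 * collection queue once the subrequest is queued.  If it is cancelled
 * before ever being queued, the queue's ref has to be dropped as well.
 * 'struct thing' is a hypothetical stand-in.
 */
#include <linux/refcount.h>
#include <linux/slab.h>

struct thing {
        refcount_t ref;
};

static struct thing *thing_create(void)
{
        struct thing *t = kzalloc(sizeof(*t), GFP_KERNEL);

        if (t)
                refcount_set(&t->ref, 2);       /* issuer ref + queue ref */
        return t;
}

static void thing_put(struct thing *t)
{
        if (refcount_dec_and_test(&t->ref))
                kfree(t);
}

static void thing_cancel_unqueued(struct thing *t)
{
        thing_put(t);           /* the ref the queue would have owned */
        thing_put(t);           /* the issuer's own ref */
}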
*/ + netfs_put_subrequest(subreq, false, + netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, false, + netfs_sreq_trace_put_cancel); + break; + } trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); } - - slice = netfs_prepare_read_iterator(subreq); - if (slice < 0) - goto prep_iter_failed; - - rreq->netfs_ops->issue_read(subreq); - goto done; + goto issue; } fill_with_zeroes: @@ -255,67 +268,50 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) subreq->source = NETFS_FILL_WITH_ZEROES; trace_netfs_sreq(subreq, netfs_sreq_trace_submit); netfs_stat(&netfs_n_rh_zero); - slice = netfs_prepare_read_iterator(subreq); - if (slice < 0) - goto prep_iter_failed; - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - subreq->error = 0; - netfs_read_subreq_terminated(subreq); - goto done; + goto issue; } if (source == NETFS_READ_FROM_CACHE) { trace_netfs_sreq(subreq, netfs_sreq_trace_submit); - slice = netfs_prepare_read_iterator(subreq); - if (slice < 0) - goto prep_iter_failed; - netfs_read_cache_to_pagecache(rreq, subreq); - goto done; + goto issue; } pr_err("Unexpected read source %u\n", source); WARN_ON_ONCE(1); break; - prep_iter_failed: - ret = slice; - prep_failed: - subreq->error = ret; - atomic_dec(&rreq->nr_outstanding); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); - break; - - done: + issue: + slice = netfs_prepare_read_iterator(subreq); + if (slice < 0) { + ret = slice; + subreq->error = ret; + trace_netfs_sreq(subreq, netfs_sreq_trace_cancel); + /* Not queued - release both refs. */ + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + break; + } size -= slice; start += slice; + if (size <= 0) { + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + } + + netfs_issue_read(rreq, subreq); cond_resched(); } while (size > 0); - if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_terminated(rreq); + if (unlikely(size > 0)) { + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + netfs_wake_read_collector(rreq); + } /* Defer error return as we may need to wait for outstanding I/O. */ cmpxchg(&rreq->error, 0, ret); } -/* - * Wait for the read operation to complete, successfully or otherwise. 
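/*
 * Illustrative sketch, not part of the patch: the synchronous wait being
 * removed here (its replacement is declared under the read_collect.c
 * section of internal.h) is built on the kernel's bit-waitqueue pattern.
 * The waiter sleeps on a flag bit; the completing side clears it with
 * release semantics, inserts a full barrier and then wakes the bit
 * waitqueue.  MY_IN_PROGRESS is a hypothetical flag bit.
 */
#include <linux/wait_bit.h>
#include <linux/sched.h>

#define MY_IN_PROGRESS  0

static void op_wait(unsigned long *flags)
{
        wait_on_bit(flags, MY_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
}

static void op_complete(unsigned long *flags)
{
        clear_bit_unlock(MY_IN_PROGRESS, flags);
        smp_mb__after_atomic();         /* order the clear before the wake */
        wake_up_bit(flags, MY_IN_PROGRESS);
}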
- */ -static int netfs_wait_for_read(struct netfs_io_request *rreq) -{ - int ret; - - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); - wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); - ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); - ret = -EIO; - } - - return ret; -} - /** * netfs_readahead - Helper to manage a read request * @ractl: The description of the readahead request @@ -344,6 +340,8 @@ void netfs_readahead(struct readahead_control *ractl) if (IS_ERR(rreq)) return; + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags); + ret = netfs_begin_cache_read(rreq, ictx); if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) goto cleanup_free; @@ -460,7 +458,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) folio_put(sink); ret = netfs_wait_for_read(rreq); - if (ret == 0) { + if (ret >= 0) { flush_dcache_folio(folio); folio_mark_uptodate(folio); } @@ -748,7 +746,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, netfs_read_to_pagecache(rreq); ret = netfs_wait_for_read(rreq); netfs_put_request(rreq, false, netfs_rreq_trace_put_return); - return ret; + return ret < 0 ? ret : 0; error_put: netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index 1a20cc3979c7..0bf3c2f5a710 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -47,12 +47,11 @@ static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq) */ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) { + struct netfs_io_stream *stream = &rreq->io_streams[0]; unsigned long long start = rreq->start; ssize_t size = rreq->len; int ret = 0; - atomic_set(&rreq->nr_outstanding, 1); - do { struct netfs_io_subrequest *subreq; ssize_t slice; @@ -67,11 +66,18 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) subreq->start = start; subreq->len = size; - atomic_inc(&rreq->nr_outstanding); + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + spin_lock(&rreq->lock); - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - subreq->prev_donated = rreq->prev_donated; - rreq->prev_donated = 0; + list_add_tail(&subreq->rreq_link, &stream->subrequests); + if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { + stream->front = subreq; + if (!stream->active) { + stream->collected_to = stream->front->start; + /* Store list pointers before active flag */ + smp_store_release(&stream->active, true); + } + } trace_netfs_sreq(subreq, netfs_sreq_trace_added); spin_unlock(&rreq->lock); @@ -79,7 +85,6 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); if (ret < 0) { - atomic_dec(&rreq->nr_outstanding); netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); break; } @@ -87,20 +92,32 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) netfs_prepare_dio_read_iterator(subreq); slice = subreq->len; - rreq->netfs_ops->issue_read(subreq); - size -= slice; start += slice; rreq->submitted += slice; + if (size <= 0) { + smp_wmb(); /* Write lists before ALL_QUEUED. 
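/*
 * Illustrative sketch, not part of the patch: the publication pattern this
 * barrier comment refers to.  The issuer finishes building the subrequest
 * list(s), then sets the ALL_QUEUED flag; the smp_wmb() ensures the list
 * writes are visible before the flag is.  Whoever consumes the flag must
 * pair it with a read barrier (or an acquire load) before trusting that the
 * list is complete.  MY_ALL_QUEUED is a hypothetical flag bit.
 */
#include <linux/bitops.h>
#include <asm/barrier.h>

#define MY_ALL_QUEUED   0

/* Issuer side: the last subrequest has just been appended. */
static void my_publish_all_queued(unsigned long *flags)
{
        smp_wmb();                      /* write the lists ... */
        set_bit(MY_ALL_QUEUED, flags);  /* ... before the flag */
}

/* Collector side: only walk the list as "complete" after a paired barrier. */
static bool my_all_queued(unsigned long *flags)
{
        if (!test_bit(MY_ALL_QUEUED, flags))
                return false;
        smp_rmb();      /* pairs with the smp_wmb() in my_publish_all_queued() */
        return true;
}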
*/ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + } + + rreq->netfs_ops->issue_read(subreq); + if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) + netfs_wait_for_pause(rreq); + if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) + break; if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) break; cond_resched(); } while (size > 0); - if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_terminated(rreq); + if (unlikely(size > 0)) { + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + netfs_wake_read_collector(rreq); + } + return ret; } @@ -133,21 +150,10 @@ static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) goto out; } - if (sync) { - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); - wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - - ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len && - rreq->origin != NETFS_DIO_READ) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); - ret = -EIO; - } - } else { + if (sync) + ret = netfs_wait_for_read(rreq); + else ret = -EIOCBQUEUED; - } - out: _leave(" = %d", ret); return ret; @@ -215,8 +221,10 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i // TODO: Set up bounce buffer if needed - if (!sync) + if (!sync) { rreq->iocb = iocb; + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags); + } ret = netfs_unbuffered_read(rreq, sync); if (ret < 0) diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index e236f752af88..eb76f98c894b 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -82,20 +82,27 @@ static inline void netfs_see_request(struct netfs_io_request *rreq, trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what); } +static inline void netfs_see_subrequest(struct netfs_io_subrequest *subreq, + enum netfs_sreq_ref_trace what) +{ + trace_netfs_sreq_ref(subreq->rreq->debug_id, subreq->debug_index, + refcount_read(&subreq->ref), what); +} + /* * read_collect.c */ -void netfs_read_termination_worker(struct work_struct *work); -void netfs_rreq_terminated(struct netfs_io_request *rreq); +void netfs_read_collection_worker(struct work_struct *work); +void netfs_wake_read_collector(struct netfs_io_request *rreq); +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async); +ssize_t netfs_wait_for_read(struct netfs_io_request *rreq); +void netfs_wait_for_pause(struct netfs_io_request *rreq); /* * read_pgpriv2.c */ -void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq, - struct netfs_io_request *rreq, - struct folio_queue *folioq, - int slot); -void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq); +void netfs_pgpriv2_copy_to_cache(struct netfs_io_request *rreq, struct folio *folio); +void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq); bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq); /* diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 16760695e667..4e3e62040831 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -71,7 +71,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v) refcount_read(&rreq->ref), rreq->flags, rreq->error, - atomic_read(&rreq->nr_outstanding), + 0, rreq->start, rreq->submitted, rreq->len); seq_putc(m, '\n'); return 0; diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index dde4a679d9e2..dc6b41ef18b0 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -48,7 +48,7 @@ struct 
netfs_io_request *netfs_alloc_request(struct address_space *mapping, spin_lock_init(&rreq->lock); INIT_LIST_HEAD(&rreq->io_streams[0].subrequests); INIT_LIST_HEAD(&rreq->io_streams[1].subrequests); - INIT_LIST_HEAD(&rreq->subrequests); + init_waitqueue_head(&rreq->waitq); refcount_set(&rreq->ref, 1); if (origin == NETFS_READAHEAD || @@ -56,10 +56,12 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, origin == NETFS_READ_GAPS || origin == NETFS_READ_SINGLE || origin == NETFS_READ_FOR_WRITE || - origin == NETFS_DIO_READ) - INIT_WORK(&rreq->work, NULL); - else + origin == NETFS_DIO_READ) { + INIT_WORK(&rreq->work, netfs_read_collection_worker); + rreq->io_streams[0].avail = true; + } else { INIT_WORK(&rreq->work, netfs_write_collection_worker); + } __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); if (file && file->f_flags & O_NONBLOCK) @@ -93,14 +95,6 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) struct netfs_io_stream *stream; int s; - while (!list_empty(&rreq->subrequests)) { - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, was_async, - netfs_sreq_trace_put_clear); - } - for (s = 0; s < ARRAY_SIZE(rreq->io_streams); s++) { stream = &rreq->io_streams[s]; while (!list_empty(&stream->subrequests)) { @@ -192,21 +186,7 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq } memset(subreq, 0, kmem_cache_size(cache)); - - switch (rreq->origin) { - case NETFS_READAHEAD: - case NETFS_READPAGE: - case NETFS_READ_GAPS: - case NETFS_READ_SINGLE: - case NETFS_READ_FOR_WRITE: - case NETFS_DIO_READ: - INIT_WORK(&subreq->work, netfs_read_subreq_termination_worker); - break; - default: - INIT_WORK(&subreq->work, NULL); - break; - } - + INIT_WORK(&subreq->work, NULL); INIT_LIST_HEAD(&subreq->rreq_link); refcount_set(&subreq->ref, 2); subreq->rreq = rreq; diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 2e9291ab1d62..f65affa5a9e4 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -14,6 +14,14 @@ #include #include "internal.h" +/* Notes made in the collector */ +#define HIT_PENDING 0x01 /* A front op was still pending */ +#define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */ +#define BUFFERED 0x08 /* The pagecache needs cleaning up */ +#define NEED_RETRY 0x10 /* A front op requests retrying */ +#define COPY_TO_CACHE 0x40 /* Need to copy subrequest to cache */ +#define ABANDON_SREQ 0x80 /* Need to abandon untransferred part of subrequest */ + /* * Clear the unread part of an I/O request. */ @@ -31,14 +39,18 @@ static void netfs_clear_unread(struct netfs_io_subrequest *subreq) * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it * dirty and let writeback handle it. 
*/ -static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, - struct netfs_io_request *rreq, +static void netfs_unlock_read_folio(struct netfs_io_request *rreq, struct folio_queue *folioq, int slot) { struct netfs_folio *finfo; struct folio *folio = folioq_folio(folioq, slot); + if (unlikely(folio_pos(folio) < rreq->abandon_to)) { + trace_netfs_folio(folio, netfs_folio_trace_abandon); + goto just_unlock; + } + flush_dcache_folio(folio); folio_mark_uptodate(folio); @@ -53,7 +65,7 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, kfree(finfo); } - if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + if (test_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags)) { if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) { trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE); @@ -66,12 +78,11 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, folioq_clear(folioq, slot); } else { // TODO: Use of PG_private_2 is deprecated. - if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) - netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot); - else - folioq_clear(folioq, slot); + if (test_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags)) + netfs_pgpriv2_copy_to_cache(rreq, folio); } +just_unlock: if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { if (folio->index == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { @@ -81,241 +92,249 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, folio_unlock(folio); } } + + folioq_clear(folioq, slot); } /* - * Unlock any folios that are now completely read. Returns true if the - * subrequest is removed from the list. + * Unlock any folios we've finished with. 
*/ -static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq) +static void netfs_read_unlock_folios(struct netfs_io_request *rreq, + unsigned int *notes) { - struct netfs_io_subrequest *prev, *next; - struct netfs_io_request *rreq = subreq->rreq; - struct folio_queue *folioq = subreq->curr_folioq; - size_t avail, prev_donated, next_donated, fsize, part, excess; - loff_t fpos, start; - loff_t fend; - int slot = subreq->curr_folioq_slot; - - if (WARN(subreq->transferred > subreq->len, - "Subreq overread: R%x[%x] %zu > %zu", - rreq->debug_id, subreq->debug_index, - subreq->transferred, subreq->len)) - subreq->transferred = subreq->len; - - trace_netfs_folioq(folioq, netfs_trace_folioq_read_progress); -next_folio: - fsize = PAGE_SIZE << subreq->curr_folio_order; - fpos = round_down(subreq->start + subreq->consumed, fsize); - fend = fpos + fsize; - - if (WARN_ON_ONCE(!folioq) || - WARN_ON_ONCE(!folioq_folio(folioq, slot)) || - WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) { - pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n", - rreq->debug_id, subreq->debug_index, - subreq->start, subreq->start + subreq->transferred - 1, - subreq->consumed, subreq->transferred, subreq->len, - slot); - if (folioq) { - struct folio *folio = folioq_folio(folioq, slot); - - pr_err("folioq: fq=%x orders=%02x%02x%02x%02x %px\n", - folioq->debug_id, - folioq->orders[0], folioq->orders[1], - folioq->orders[2], folioq->orders[3], - folioq); - if (folio) - pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n", - fpos, fend - 1, folio_pos(folio), folio_order(folio), - folioq_folio_order(folioq, slot)); - } - } + struct folio_queue *folioq = rreq->buffer.tail; + unsigned long long collected_to = rreq->collected_to; + unsigned int slot = rreq->buffer.first_tail_slot; -donation_changed: - /* Try to consume the current folio if we've hit or passed the end of - * it. There's a possibility that this subreq doesn't start at the - * beginning of the folio, in which case we need to donate to/from the - * preceding subreq. - * - * We also need to include any potential donation back from the - * following subreq. - */ - prev_donated = READ_ONCE(subreq->prev_donated); - next_donated = READ_ONCE(subreq->next_donated); - if (prev_donated || next_donated) { - spin_lock(&rreq->lock); - prev_donated = subreq->prev_donated; - next_donated = subreq->next_donated; - subreq->start -= prev_donated; - subreq->len += prev_donated; - subreq->transferred += prev_donated; - prev_donated = subreq->prev_donated = 0; - if (subreq->transferred == subreq->len) { - subreq->len += next_donated; - subreq->transferred += next_donated; - next_donated = subreq->next_donated = 0; + if (rreq->cleaned_to >= rreq->collected_to) + return; + + // TODO: Begin decryption + + if (slot >= folioq_nr_slots(folioq)) { + folioq = rolling_buffer_delete_spent(&rreq->buffer); + if (!folioq) { + rreq->front_folio_order = 0; + return; } - trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations); - spin_unlock(&rreq->lock); + slot = 0; } - avail = subreq->transferred; - if (avail == subreq->len) - avail += next_donated; - start = subreq->start; - if (subreq->consumed == 0) { - start -= prev_donated; - avail += prev_donated; - } else { - start += subreq->consumed; - avail -= subreq->consumed; - } - part = umin(avail, fsize); - - trace_netfs_progress(subreq, start, avail, part); - - if (start + avail >= fend) { - if (fpos == start) { - /* Flush, unlock and mark for caching any folio we've just read. 
*/ - subreq->consumed = fend - subreq->start; - netfs_unlock_read_folio(subreq, rreq, folioq, slot); - folioq_mark2(folioq, slot); - if (subreq->consumed >= subreq->len) - goto remove_subreq; - } else if (fpos < start) { - excess = fend - subreq->start; - - spin_lock(&rreq->lock); - /* If we complete first on a folio split with the - * preceding subreq, donate to that subreq - otherwise - * we get the responsibility. - */ - if (subreq->prev_donated != prev_donated) { - spin_unlock(&rreq->lock); - goto donation_changed; - } + for (;;) { + struct folio *folio; + unsigned long long fpos, fend; + unsigned int order; + size_t fsize; - if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) { - spin_unlock(&rreq->lock); - pr_err("Can't donate prior to front\n"); - goto bad; - } + if (*notes & COPY_TO_CACHE) + set_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags); - prev = list_prev_entry(subreq, rreq_link); - WRITE_ONCE(prev->next_donated, prev->next_donated + excess); - subreq->start += excess; - subreq->len -= excess; - subreq->transferred -= excess; - trace_netfs_donate(rreq, subreq, prev, excess, - netfs_trace_donate_tail_to_prev); - trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev); - - if (subreq->consumed >= subreq->len) - goto remove_subreq_locked; - spin_unlock(&rreq->lock); - } else { - pr_err("fpos > start\n"); - goto bad; - } + folio = folioq_folio(folioq, slot); + if (WARN_ONCE(!folio_test_locked(folio), + "R=%08x: folio %lx is not locked\n", + rreq->debug_id, folio->index)) + trace_netfs_folio(folio, netfs_folio_trace_not_locked); - /* Advance the rolling buffer to the next folio. */ + order = folioq_folio_order(folioq, slot); + rreq->front_folio_order = order; + fsize = PAGE_SIZE << order; + fpos = folio_pos(folio); + fend = umin(fpos + fsize, rreq->i_size); + + trace_netfs_collect_folio(rreq, folio, fend, collected_to); + + /* Unlock any folio we've transferred all of. */ + if (collected_to < fend) + break; + + netfs_unlock_read_folio(rreq, folioq, slot); + WRITE_ONCE(rreq->cleaned_to, fpos + fsize); + *notes |= MADE_PROGRESS; + + clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags); + + /* Clean up the head folioq. If we clear an entire folioq, then + * we can get rid of it provided it's not also the tail folioq + * being filled by the issuer. + */ + folioq_clear(folioq, slot); slot++; if (slot >= folioq_nr_slots(folioq)) { + folioq = rolling_buffer_delete_spent(&rreq->buffer); + if (!folioq) + goto done; slot = 0; - folioq = folioq->next; - subreq->curr_folioq = folioq; trace_netfs_folioq(folioq, netfs_trace_folioq_read_progress); } - subreq->curr_folioq_slot = slot; - if (folioq && folioq_folio(folioq, slot)) - subreq->curr_folio_order = folioq->orders[slot]; - cond_resched(); - goto next_folio; + + if (fpos + fsize >= collected_to) + break; } - /* Deal with partial progress. */ - if (subreq->transferred < subreq->len) - return false; + rreq->buffer.tail = folioq; +done: + rreq->buffer.first_tail_slot = slot; +} - /* Donate the remaining downloaded data to one of the neighbouring - * subrequests. Note that we may race with them doing the same thing. +/* + * Collect and assess the results of various read subrequests. We may need to + * retry some of the results. + * + * Note that we have a sequence of subrequests, which may be drawing on + * different sources and may or may not be the same size or starting position + * and may not even correspond in boundary alignment. 
+ */ +static void netfs_collect_read_results(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *front, *remove; + struct netfs_io_stream *stream = &rreq->io_streams[0]; + unsigned int notes; + + _enter("%llx-%llx", rreq->start, rreq->start + rreq->len); + trace_netfs_rreq(rreq, netfs_rreq_trace_collect); + trace_netfs_collect(rreq); + +reassess: + if (rreq->origin == NETFS_READAHEAD || + rreq->origin == NETFS_READPAGE || + rreq->origin == NETFS_READ_FOR_WRITE) + notes = BUFFERED; + else + notes = 0; + + /* Remove completed subrequests from the front of the stream and + * advance the completion point. We stop when we hit something that's + * in progress. The issuer thread may be adding stuff to the tail + * whilst we're doing this. */ - spin_lock(&rreq->lock); + front = READ_ONCE(stream->front); + while (front) { + size_t transferred; + + trace_netfs_collect_sreq(rreq, front); + _debug("sreq [%x] %llx %zx/%zx", + front->debug_index, front->start, front->transferred, front->len); + + if (stream->collected_to < front->start) { + trace_netfs_collect_gap(rreq, stream, front->start, 'F'); + stream->collected_to = front->start; + } + + if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) + notes |= HIT_PENDING; + smp_rmb(); /* Read counters after IN_PROGRESS flag. */ + transferred = READ_ONCE(front->transferred); + + /* If we can now collect the next folio, do so. We don't want + * to defer this as we have to decide whether we need to copy + * to the cache or not, and that may differ between adjacent + * subreqs. + */ + if (notes & BUFFERED) { + size_t fsize = PAGE_SIZE << rreq->front_folio_order; + + /* Clear the tail of a short read. */ + if (!(notes & HIT_PENDING) && + front->error == 0 && + transferred < front->len && + (test_bit(NETFS_SREQ_HIT_EOF, &front->flags) || + test_bit(NETFS_SREQ_CLEAR_TAIL, &front->flags))) { + netfs_clear_unread(front); + transferred = front->transferred = front->len; + trace_netfs_sreq(front, netfs_sreq_trace_clear); + } - if (subreq->prev_donated != prev_donated || - subreq->next_donated != next_donated) { + stream->collected_to = front->start + transferred; + rreq->collected_to = stream->collected_to; + + if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &front->flags)) + notes |= COPY_TO_CACHE; + + if (test_bit(NETFS_SREQ_FAILED, &front->flags)) { + rreq->abandon_to = front->start + front->len; + front->transferred = front->len; + transferred = front->len; + trace_netfs_rreq(rreq, netfs_rreq_trace_set_abandon); + } + if (front->start + transferred >= rreq->cleaned_to + fsize || + test_bit(NETFS_SREQ_HIT_EOF, &front->flags)) + netfs_read_unlock_folios(rreq, ¬es); + } else { + stream->collected_to = front->start + transferred; + rreq->collected_to = stream->collected_to; + } + + /* Stall if the front is still undergoing I/O. */ + if (notes & HIT_PENDING) + break; + + if (test_bit(NETFS_SREQ_FAILED, &front->flags)) { + if (!stream->failed) { + stream->error = front->error; + rreq->error = front->error; + set_bit(NETFS_RREQ_FAILED, &rreq->flags); + stream->failed = true; + } + notes |= MADE_PROGRESS | ABANDON_SREQ; + } else if (test_bit(NETFS_SREQ_NEED_RETRY, &front->flags)) { + stream->need_retry = true; + notes |= NEED_RETRY | MADE_PROGRESS; + break; + } else { + if (!stream->failed) + stream->transferred = stream->collected_to - rreq->start; + notes |= MADE_PROGRESS; + } + + /* Remove if completely consumed. 
*/ + stream->source = front->source; + spin_lock(&rreq->lock); + + remove = front; + trace_netfs_sreq(front, netfs_sreq_trace_discard); + list_del_init(&front->rreq_link); + front = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + stream->front = front; spin_unlock(&rreq->lock); - cond_resched(); - goto donation_changed; + netfs_put_subrequest(remove, false, + notes & ABANDON_SREQ ? + netfs_sreq_trace_put_abandon : + netfs_sreq_trace_put_done); } - /* Deal with the trickiest case: that this subreq is in the middle of a - * folio, not touching either edge, but finishes first. In such a - * case, we donate to the previous subreq, if there is one and if it is - * contiguous, so that the donation is only handled when that completes - * - and remove this subreq from the list. - * - * If the previous subreq finished first, we will have acquired their - * donation and should be able to unlock folios and/or donate nextwards. - */ - if (!subreq->consumed && - !prev_donated && - !list_is_first(&subreq->rreq_link, &rreq->subrequests) && - subreq->start == prev->start + prev->len) { - prev = list_prev_entry(subreq, rreq_link); - WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len); - subreq->start += subreq->len; - subreq->len = 0; - subreq->transferred = 0; - trace_netfs_donate(rreq, subreq, prev, subreq->len, - netfs_trace_donate_to_prev); - trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev); - goto remove_subreq_locked; - } + trace_netfs_collect_stream(rreq, stream); + trace_netfs_collect_state(rreq, rreq->collected_to, notes); - /* If we can't donate down the chain, donate up the chain instead. */ - excess = subreq->len - subreq->consumed + next_donated; + if (!(notes & BUFFERED)) + rreq->cleaned_to = rreq->collected_to; - if (!subreq->consumed) - excess += prev_donated; + if (notes & NEED_RETRY) + goto need_retry; + if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) { + trace_netfs_rreq(rreq, netfs_rreq_trace_unpause); + clear_bit_unlock(NETFS_RREQ_PAUSE, &rreq->flags); + smp_mb__after_atomic(); /* Set PAUSE before task state */ + wake_up(&rreq->waitq); + } - if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) { - rreq->prev_donated = excess; - trace_netfs_donate(rreq, subreq, NULL, excess, - netfs_trace_donate_to_deferred_next); - } else { - next = list_next_entry(subreq, rreq_link); - WRITE_ONCE(next->prev_donated, excess); - trace_netfs_donate(rreq, subreq, next, excess, - netfs_trace_donate_to_next); + if (notes & MADE_PROGRESS) { + //cond_resched(); + goto reassess; } - trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next); - subreq->len = subreq->consumed; - subreq->transferred = subreq->consumed; - goto remove_subreq_locked; - -remove_subreq: - spin_lock(&rreq->lock); -remove_subreq_locked: - subreq->consumed = subreq->len; - list_del(&subreq->rreq_link); - spin_unlock(&rreq->lock); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed); - return true; - -bad: - /* Errr... prev and next both donated to us, but insufficient to finish - * the folio. + +out: + _leave(" = %x", notes); + return; + +need_retry: + /* Okay... We're going to have to retry parts of the stream. Note + * that any partially completed op will have had any wholly transferred + * folios removed from it. 
*/ - printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n", - rreq->debug_id, subreq->debug_index, - subreq->start, subreq->start + subreq->transferred - 1, - subreq->consumed, subreq->transferred, subreq->len); - printk("folio: %llx-%llx\n", fpos, fend - 1); - printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated); - printk("s=%llx av=%zx part=%zx\n", start, avail, part); - BUG(); + _debug("retry"); + netfs_retry_reads(rreq); + goto out; } /* @@ -324,12 +343,13 @@ bad: static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream = &rreq->io_streams[0]; unsigned int i; /* Collect unbuffered reads and direct reads, adding up the transfer * sizes until we find the first short or failed subrequest. */ - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { rreq->transferred += subreq->transferred; if (subreq->transferred < subreq->len || @@ -366,22 +386,12 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) */ static void netfs_rreq_assess_single(struct netfs_io_request *rreq) { - struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream = &rreq->io_streams[0]; - subreq = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); - if (subreq) { - if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) - rreq->error = subreq->error; - else - rreq->transferred = subreq->transferred; - - if (!rreq->error && subreq->source == NETFS_DOWNLOAD_FROM_SERVER && - fscache_resources_valid(&rreq->cache_resources)) { - trace_netfs_rreq(rreq, netfs_rreq_trace_dirty); - netfs_single_mark_inode_dirty(rreq->inode); - } + if (!rreq->error && stream->source == NETFS_DOWNLOAD_FROM_SERVER && + fscache_resources_valid(&rreq->cache_resources)) { + trace_netfs_rreq(rreq, netfs_rreq_trace_dirty); + netfs_single_mark_inode_dirty(rreq->inode); } if (rreq->iocb) { @@ -395,21 +405,32 @@ static void netfs_rreq_assess_single(struct netfs_io_request *rreq) } /* - * Assess the state of a read request and decide what to do next. + * Perform the collection of subrequests and folios. * * Note that we're in normal kernel thread context at this point, possibly * running on a workqueue. */ -void netfs_rreq_terminated(struct netfs_io_request *rreq) +static void netfs_read_collection(struct netfs_io_request *rreq) { - trace_netfs_rreq(rreq, netfs_rreq_trace_assess); + struct netfs_io_stream *stream = &rreq->io_streams[0]; - //netfs_rreq_is_still_valid(rreq); + netfs_collect_read_results(rreq); - if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) { - netfs_retry_reads(rreq); + /* We're done when the app thread has finished posting subreqs and the + * queue is empty. + */ + if (!test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags)) + return; + smp_rmb(); /* Read ALL_QUEUED before subreq lists. */ + + if (!list_empty(&stream->subrequests)) return; - } + + /* Okay, declare that all I/O is complete. 
*/ + rreq->transferred = stream->transferred; + trace_netfs_rreq(rreq, netfs_rreq_trace_complete); + + //netfs_rreq_is_still_valid(rreq); switch (rreq->origin) { case NETFS_DIO_READ: @@ -430,8 +451,35 @@ void netfs_rreq_terminated(struct netfs_io_request *rreq) trace_netfs_rreq(rreq, netfs_rreq_trace_done); netfs_clear_subrequests(rreq, false); netfs_unlock_abandoned_read_pages(rreq); - if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags))) - netfs_pgpriv2_write_to_the_cache(rreq); + if (unlikely(rreq->copy_to_cache)) + netfs_pgpriv2_end_copy_to_cache(rreq); +} + +void netfs_read_collection_worker(struct work_struct *work) +{ + struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + + netfs_see_request(rreq, netfs_rreq_trace_see_work); + if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) + netfs_read_collection(rreq); + netfs_put_request(rreq, false, netfs_rreq_trace_put_work); +} + +/* + * Wake the collection work item. + */ +void netfs_wake_read_collector(struct netfs_io_request *rreq) +{ + if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + if (!work_pending(&rreq->work)) { + netfs_get_request(rreq, netfs_rreq_trace_get_work); + if (!queue_work(system_unbound_wq, &rreq->work)) + netfs_put_request(rreq, true, netfs_rreq_trace_put_work_nq); + } + } else { + trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue); + wake_up(&rreq->waitq); + } } /** @@ -447,17 +495,22 @@ void netfs_rreq_terminated(struct netfs_io_request *rreq) void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; - - might_sleep(); + struct netfs_io_stream *stream = &rreq->io_streams[0]; + size_t fsize = PAGE_SIZE << rreq->front_folio_order; trace_netfs_sreq(subreq, netfs_sreq_trace_progress); - if (subreq->transferred > subreq->consumed && + /* If we are at the head of the queue, wake up the collector, + * getting a ref to it if we were the ones to do so. + */ + if (subreq->start + subreq->transferred > rreq->cleaned_to + fsize && (rreq->origin == NETFS_READAHEAD || rreq->origin == NETFS_READPAGE || - rreq->origin == NETFS_READ_FOR_WRITE)) { - netfs_consume_read_data(subreq); + rreq->origin == NETFS_READ_FOR_WRITE) && + list_is_first(&subreq->rreq_link, &stream->subrequests) + ) { __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + netfs_wake_read_collector(rreq); } } EXPORT_SYMBOL(netfs_read_subreq_progress); @@ -481,8 +534,7 @@ EXPORT_SYMBOL(netfs_read_subreq_progress); void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; - - might_sleep(); + struct netfs_io_stream *stream = &rreq->io_streams[0]; switch (subreq->source) { case NETFS_READ_FROM_CACHE: @@ -495,86 +547,156 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) break; } - if (rreq->origin != NETFS_DIO_READ) { - /* Collect buffered reads. - * - * If the read completed validly short, then we can clear the - * tail before going on to unlock the folios. 
- */ - if (subreq->error == 0 && subreq->transferred < subreq->len && - (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) || - test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) { - netfs_clear_unread(subreq); - subreq->transferred = subreq->len; - trace_netfs_sreq(subreq, netfs_sreq_trace_clear); - } - if (subreq->transferred > subreq->consumed && - (rreq->origin == NETFS_READAHEAD || - rreq->origin == NETFS_READPAGE || - rreq->origin == NETFS_READ_FOR_WRITE)) { - netfs_consume_read_data(subreq); - __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); - } - rreq->transferred += subreq->transferred; - } - /* Deal with retry requests, short reads and errors. If we retry * but don't make progress, we abandon the attempt. */ if (!subreq->error && subreq->transferred < subreq->len) { if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) { trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof); + } else if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) { + trace_netfs_sreq(subreq, netfs_sreq_trace_need_clear); + } else if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + trace_netfs_sreq(subreq, netfs_sreq_trace_need_retry); + } else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { + __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + trace_netfs_sreq(subreq, netfs_sreq_trace_partial_read); } else { + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); + subreq->error = -ENODATA; trace_netfs_sreq(subreq, netfs_sreq_trace_short); - if (subreq->transferred > subreq->consumed) { - /* If we didn't read new data, abandon retry. */ - if (subreq->retry_count && - test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { - __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); - } - } else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { - __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); - } else { - __set_bit(NETFS_SREQ_FAILED, &subreq->flags); - subreq->error = -ENODATA; - } } } - trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - if (unlikely(subreq->error < 0)) { trace_netfs_failure(rreq, subreq, subreq->error, netfs_fail_read); if (subreq->source == NETFS_READ_FROM_CACHE) { netfs_stat(&netfs_n_rh_read_failed); + __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); } else { netfs_stat(&netfs_n_rh_download_failed); - set_bit(NETFS_RREQ_FAILED, &rreq->flags); - rreq->error = subreq->error; + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); } + trace_netfs_rreq(rreq, netfs_rreq_trace_set_pause); + set_bit(NETFS_RREQ_PAUSE, &rreq->flags); } - if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_terminated(rreq); + trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); + + clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_terminated); + /* If we are at the head of the queue, wake up the collector. */ + if (list_is_first(&subreq->rreq_link, &stream->subrequests)) + netfs_wake_read_collector(rreq); + + netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated); } EXPORT_SYMBOL(netfs_read_subreq_terminated); -/** - * netfs_read_subreq_termination_worker - Workqueue helper for read termination - * @work: The subreq->work in the I/O request that has been terminated. - * - * Helper function to jump to netfs_read_subreq_terminated() from the - * subrequest work item. +/* + * Handle termination of a read from the cache. 
*/ -void netfs_read_subreq_termination_worker(struct work_struct *work) +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async) { - struct netfs_io_subrequest *subreq = - container_of(work, struct netfs_io_subrequest, work); + struct netfs_io_subrequest *subreq = priv; + if (transferred_or_error > 0) { + subreq->error = 0; + if (transferred_or_error > 0) { + subreq->transferred += transferred_or_error; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + } + } else { + subreq->error = transferred_or_error; + } netfs_read_subreq_terminated(subreq); } -EXPORT_SYMBOL(netfs_read_subreq_termination_worker); + +/* + * Wait for the read operation to complete, successfully or otherwise. + */ +ssize_t netfs_wait_for_read(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream = &rreq->io_streams[0]; + DEFINE_WAIT(myself); + ssize_t ret; + + for (;;) { + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + subreq = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + if (subreq && + (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { + __set_current_state(TASK_RUNNING); + netfs_read_collection(rreq); + continue; + } + + if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) + break; + + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + + finish_wait(&rreq->waitq, &myself); + + ret = rreq->error; + if (ret == 0) { + ret = rreq->transferred; + switch (rreq->origin) { + case NETFS_DIO_READ: + case NETFS_READ_SINGLE: + ret = rreq->transferred; + break; + default: + if (rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } + break; + } + } + + return ret; +} + +/* + * Wait for a paused read operation to unpause or complete in some manner. + */ +void netfs_wait_for_pause(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream = &rreq->io_streams[0]; + DEFINE_WAIT(myself); + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause); + + for (;;) { + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + subreq = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + if (subreq && + (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { + __set_current_state(TASK_RUNNING); + netfs_read_collection(rreq); + continue; + } + + if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) || + !test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) + break; + + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + + finish_wait(&rreq->waitq, &myself); +} diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index 9eee5af6b327..cf7727060215 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -13,54 +13,12 @@ #include #include "internal.h" -/* - * [DEPRECATED] Mark page as requiring copy-to-cache using PG_private_2. The - * third mark in the folio queue is used to indicate that this folio needs - * writing. 
- */ -void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq, - struct netfs_io_request *rreq, - struct folio_queue *folioq, - int slot) -{ - struct folio *folio = folioq_folio(folioq, slot); - - trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_start_private_2(folio); - folioq_mark3(folioq, slot); -} - -/* - * [DEPRECATED] Cancel PG_private_2 on all marked folios in the event of an - * unrecoverable error. - */ -static void netfs_pgpriv2_cancel(struct rolling_buffer *buffer) -{ - struct folio_queue *folioq = buffer->tail; - struct folio *folio; - int slot; - - while (folioq) { - if (!folioq->marks3) { - folioq = folioq->next; - continue; - } - - slot = __ffs(folioq->marks3); - folio = folioq_folio(folioq, slot); - - trace_netfs_folio(folio, netfs_folio_trace_cancel_copy); - folio_end_private_2(folio); - folioq_unmark3(folioq, slot); - } -} - /* * [DEPRECATED] Copy a folio to the cache with PG_private_2 set. */ -static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio *folio) +static void netfs_pgpriv2_copy_folio(struct netfs_io_request *creq, struct folio *folio) { - struct netfs_io_stream *cache = &wreq->io_streams[1]; + struct netfs_io_stream *cache = &creq->io_streams[1]; size_t fsize = folio_size(folio), flen = fsize; loff_t fpos = folio_pos(folio), i_size; bool to_eof = false; @@ -71,17 +29,17 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio * of the page to beyond it, but cannot move i_size into or through the * page since we have it locked. */ - i_size = i_size_read(wreq->inode); + i_size = i_size_read(creq->inode); if (fpos >= i_size) { /* mmap beyond eof. */ _debug("beyond eof"); folio_end_private_2(folio); - return 0; + return; } - if (fpos + fsize > wreq->i_size) - wreq->i_size = i_size; + if (fpos + fsize > creq->i_size) + creq->i_size = i_size; if (flen > i_size - fpos) { flen = i_size - fpos; @@ -95,8 +53,10 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio trace_netfs_folio(folio, netfs_folio_trace_store_copy); /* Attach the folio to the rolling buffer. 
*/ - if (rolling_buffer_append(&wreq->buffer, folio, 0) < 0) - return -ENOMEM; + if (rolling_buffer_append(&creq->buffer, folio, 0) < 0) { + clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &creq->flags); + return; + } cache->submit_extendable_to = fsize; cache->submit_off = 0; @@ -110,11 +70,11 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio do { ssize_t part; - wreq->buffer.iter.iov_offset = cache->submit_off; + creq->buffer.iter.iov_offset = cache->submit_off; - atomic64_set(&wreq->issued_to, fpos + cache->submit_off); + atomic64_set(&creq->issued_to, fpos + cache->submit_off); cache->submit_extendable_to = fsize - cache->submit_off; - part = netfs_advance_write(wreq, cache, fpos + cache->submit_off, + part = netfs_advance_write(creq, cache, fpos + cache->submit_off, cache->submit_len, to_eof); cache->submit_off += part; if (part > cache->submit_len) @@ -123,98 +83,95 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio cache->submit_len -= part; } while (cache->submit_len > 0); - wreq->buffer.iter.iov_offset = 0; - rolling_buffer_advance(&wreq->buffer, fsize); - atomic64_set(&wreq->issued_to, fpos + fsize); + creq->buffer.iter.iov_offset = 0; + rolling_buffer_advance(&creq->buffer, fsize); + atomic64_set(&creq->issued_to, fpos + fsize); if (flen < fsize) - netfs_issue_write(wreq, cache); - - _leave(" = 0"); - return 0; + netfs_issue_write(creq, cache); } /* - * [DEPRECATED] Go through the buffer and write any folios that are marked with - * the third mark to the cache. + * [DEPRECATED] Set up copying to the cache. */ -void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq) +static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache( + struct netfs_io_request *rreq, struct folio *folio) { - struct netfs_io_request *wreq; - struct folio_queue *folioq; - struct folio *folio; - int error = 0; - int slot = 0; - - _enter(""); + struct netfs_io_request *creq; if (!fscache_resources_valid(&rreq->cache_resources)) - goto couldnt_start; + goto cancel; - /* Need the first folio to be able to set up the op. */ - for (folioq = rreq->buffer.tail; folioq; folioq = folioq->next) { - if (folioq->marks3) { - slot = __ffs(folioq->marks3); - break; - } - } - if (!folioq) - return; - folio = folioq_folio(folioq, slot); - - wreq = netfs_create_write_req(rreq->mapping, NULL, folio_pos(folio), + creq = netfs_create_write_req(rreq->mapping, NULL, folio_pos(folio), NETFS_PGPRIV2_COPY_TO_CACHE); - if (IS_ERR(wreq)) { - kleave(" [create %ld]", PTR_ERR(wreq)); - goto couldnt_start; - } + if (IS_ERR(creq)) + goto cancel; - trace_netfs_write(wreq, netfs_write_trace_copy_to_cache); + if (!creq->io_streams[1].avail) + goto cancel_put; + + trace_netfs_write(creq, netfs_write_trace_copy_to_cache); netfs_stat(&netfs_n_wh_copy_to_cache); - if (!wreq->io_streams[1].avail) { - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); - goto couldnt_start; - } + rreq->copy_to_cache = creq; + return creq; + +cancel_put: + netfs_put_request(creq, false, netfs_rreq_trace_put_return); +cancel: + rreq->copy_to_cache = ERR_PTR(-ENOBUFS); + clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags); + return ERR_PTR(-ENOBUFS); +} - for (;;) { - error = netfs_pgpriv2_copy_folio(wreq, folio); - if (error < 0) - break; +/* + * [DEPRECATED] Mark page as requiring copy-to-cache using PG_private_2 and add + * it to the copy write request. 
+ */ +void netfs_pgpriv2_copy_to_cache(struct netfs_io_request *rreq, struct folio *folio) +{ + struct netfs_io_request *creq = rreq->copy_to_cache; - folioq_unmark3(folioq, slot); - if (!folioq->marks3) { - folioq = folioq->next; - if (!folioq) - break; - } + if (!creq) + creq = netfs_pgpriv2_begin_copy_to_cache(rreq, folio); + if (IS_ERR(creq)) + return; - slot = __ffs(folioq->marks3); - folio = folioq_folio(folioq, slot); - } + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_start_private_2(folio); + netfs_pgpriv2_copy_folio(creq, folio); +} + +/* + * [DEPRECATED] End writing to the cache, flushing out any outstanding writes. + */ +void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq) +{ + struct netfs_io_request *creq = rreq->copy_to_cache; + + if (IS_ERR_OR_NULL(creq)) + return; - netfs_issue_write(wreq, &wreq->io_streams[1]); + netfs_issue_write(creq, &creq->io_streams[1]); smp_wmb(); /* Write lists before ALL_QUEUED. */ - set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); + set_bit(NETFS_RREQ_ALL_QUEUED, &creq->flags); - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); - _leave(" = %d", error); -couldnt_start: - netfs_pgpriv2_cancel(&rreq->buffer); + netfs_put_request(creq, false, netfs_rreq_trace_put_return); + creq->copy_to_cache = NULL; } /* * [DEPRECATED] Remove the PG_private_2 mark from any folios we've finished * copying. */ -bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) +bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *creq) { - struct folio_queue *folioq = wreq->buffer.tail; - unsigned long long collected_to = wreq->collected_to; - unsigned int slot = wreq->buffer.first_tail_slot; + struct folio_queue *folioq = creq->buffer.tail; + unsigned long long collected_to = creq->collected_to; + unsigned int slot = creq->buffer.first_tail_slot; bool made_progress = false; if (slot >= folioq_nr_slots(folioq)) { - folioq = rolling_buffer_delete_spent(&wreq->buffer); + folioq = rolling_buffer_delete_spent(&creq->buffer); slot = 0; } @@ -226,16 +183,16 @@ bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) folio = folioq_folio(folioq, slot); if (WARN_ONCE(!folio_test_private_2(folio), "R=%08x: folio %lx is not marked private_2\n", - wreq->debug_id, folio->index)) + creq->debug_id, folio->index)) trace_netfs_folio(folio, netfs_folio_trace_not_under_wback); fpos = folio_pos(folio); fsize = folio_size(folio); flen = fsize; - fend = min_t(unsigned long long, fpos + flen, wreq->i_size); + fend = min_t(unsigned long long, fpos + flen, creq->i_size); - trace_netfs_collect_folio(wreq, folio, fend, collected_to); + trace_netfs_collect_folio(creq, folio, fend, collected_to); /* Unlock any folio we've transferred all of. */ if (collected_to < fend) @@ -243,7 +200,7 @@ bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) trace_netfs_folio(folio, netfs_folio_trace_end_copy); folio_end_private_2(folio); - wreq->cleaned_to = fpos + fsize; + creq->cleaned_to = fpos + fsize; made_progress = true; /* Clean up the head folioq. 
If we clear an entire folioq, then @@ -253,7 +210,7 @@ bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) folioq_clear(folioq, slot); slot++; if (slot >= folioq_nr_slots(folioq)) { - folioq = rolling_buffer_delete_spent(&wreq->buffer); + folioq = rolling_buffer_delete_spent(&creq->buffer); if (!folioq) goto done; slot = 0; @@ -263,8 +220,8 @@ bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) break; } - wreq->buffer.tail = folioq; + creq->buffer.tail = folioq; done: - wreq->buffer.first_tail_slot = slot; + creq->buffer.first_tail_slot = slot; return made_progress; } diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index a33bd06e80f8..bf6f26525b0d 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -12,15 +12,7 @@ static void netfs_reissue_read(struct netfs_io_request *rreq, struct netfs_io_subrequest *subreq) { - struct iov_iter *io_iter = &subreq->io_iter; - - if (iov_iter_is_folioq(io_iter)) { - subreq->curr_folioq = (struct folio_queue *)io_iter->folioq; - subreq->curr_folioq_slot = io_iter->folioq_slot; - subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot]; - } - - atomic_inc(&rreq->nr_outstanding); + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); subreq->rreq->netfs_ops->issue_read(subreq); @@ -33,13 +25,12 @@ static void netfs_reissue_read(struct netfs_io_request *rreq, static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; - struct netfs_io_stream *stream0 = &rreq->io_streams[0]; - LIST_HEAD(sublist); - LIST_HEAD(queue); + struct netfs_io_stream *stream = &rreq->io_streams[0]; + struct list_head *next; _enter("R=%x", rreq->debug_id); - if (list_empty(&rreq->subrequests)) + if (list_empty(&stream->subrequests)) return; if (rreq->netfs_ops->retry_request) @@ -50,9 +41,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) */ if (!rreq->netfs_ops->prepare_read && !rreq->cache_resources.ops) { - struct netfs_io_subrequest *subreq; - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) break; if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { @@ -75,48 +64,44 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) * populating with smaller subrequests. In the event that the subreq * we just launched finishes before we insert the next subreq, it'll * fill in rreq->prev_donated instead. - + * * Note: Alternatively, we could split the tail subrequest right before * we reissue it and fix up the donations under lock. */ - list_splice_init(&rreq->subrequests, &queue); + next = stream->subrequests.next; do { - struct netfs_io_subrequest *from; + struct netfs_io_subrequest *from, *to, *tmp; struct iov_iter source; unsigned long long start, len; - size_t part, deferred_next_donated = 0; + size_t part; bool boundary = false; /* Go through the subreqs and find the next span of contiguous * buffer that we then rejig (cifs, for example, needs the * rsize renegotiating) and reissue. 
*/ - from = list_first_entry(&queue, struct netfs_io_subrequest, rreq_link); - list_move_tail(&from->rreq_link, &sublist); + from = list_entry(next, struct netfs_io_subrequest, rreq_link); + to = from; start = from->start + from->transferred; len = from->len - from->transferred; - _debug("from R=%08x[%x] s=%llx ctl=%zx/%zx/%zx", + _debug("from R=%08x[%x] s=%llx ctl=%zx/%zx", rreq->debug_id, from->debug_index, - from->start, from->consumed, from->transferred, from->len); + from->start, from->transferred, from->len); if (test_bit(NETFS_SREQ_FAILED, &from->flags) || !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) goto abandon; - deferred_next_donated = from->next_donated; - while ((subreq = list_first_entry_or_null( - &queue, struct netfs_io_subrequest, rreq_link))) { - if (subreq->start != start + len || - subreq->transferred > 0 || + list_for_each_continue(next, &stream->subrequests) { + subreq = list_entry(next, struct netfs_io_subrequest, rreq_link); + if (subreq->start + subreq->transferred != start + len || + test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) || !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) break; - list_move_tail(&subreq->rreq_link, &sublist); - len += subreq->len; - deferred_next_donated = subreq->next_donated; - if (test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags)) - break; + to = subreq; + len += to->len; } _debug(" - range: %llx-%llx %llx", start, start + len - 1, len); @@ -129,37 +114,30 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) source.count = len; /* Work through the sublist. */ - while ((subreq = list_first_entry_or_null( - &sublist, struct netfs_io_subrequest, rreq_link))) { - list_del(&subreq->rreq_link); - + subreq = from; + list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { + if (!len) + break; subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start - subreq->transferred; subreq->len = len + subreq->transferred; - stream0->sreq_max_len = subreq->len; - __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); subreq->retry_count++; - spin_lock(&rreq->lock); - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - subreq->prev_donated += rreq->prev_donated; - rreq->prev_donated = 0; trace_netfs_sreq(subreq, netfs_sreq_trace_retry); - spin_unlock(&rreq->lock); - - BUG_ON(!len); /* Renegotiate max_len (rsize) */ + stream->sreq_max_len = subreq->len; if (rreq->netfs_ops->prepare_read(subreq) < 0) { trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed); __set_bit(NETFS_SREQ_FAILED, &subreq->flags); + goto abandon; } - part = umin(len, stream0->sreq_max_len); - if (unlikely(rreq->io_streams[0].sreq_max_segs)) - part = netfs_limit_iter(&source, 0, part, stream0->sreq_max_segs); + part = umin(len, stream->sreq_max_len); + if (unlikely(stream->sreq_max_segs)) + part = netfs_limit_iter(&source, 0, part, stream->sreq_max_segs); subreq->len = subreq->transferred + part; subreq->io_iter = source; iov_iter_truncate(&subreq->io_iter, part); @@ -169,57 +147,105 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) if (!len) { if (boundary) __set_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); - subreq->next_donated = deferred_next_donated; } else { __clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); - subreq->next_donated = 0; } + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_read(rreq, subreq); - if (!len) + if (subreq == to) break; - - /* If we ran out of subrequests, allocate another. 
*/ - if (list_empty(&sublist)) { - subreq = netfs_alloc_subrequest(rreq); - if (!subreq) - goto abandon; - subreq->source = NETFS_DOWNLOAD_FROM_SERVER; - subreq->start = start; - - /* We get two refs, but need just one. */ - netfs_put_subrequest(subreq, false, netfs_sreq_trace_new); - trace_netfs_sreq(subreq, netfs_sreq_trace_split); - list_add_tail(&subreq->rreq_link, &sublist); - } } /* If we managed to use fewer subreqs, we can discard the - * excess. + * excess; if we used the same number, then we're done. */ - while ((subreq = list_first_entry_or_null( - &sublist, struct netfs_io_subrequest, rreq_link))) { - trace_netfs_sreq(subreq, netfs_sreq_trace_discard); - list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + if (!len) { + if (subreq == to) + continue; + list_for_each_entry_safe_from(subreq, tmp, + &stream->subrequests, rreq_link) { + trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + list_del(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + if (subreq == to) + break; + } + continue; } - } while (!list_empty(&queue)); + /* We ran out of subrequests, so we need to allocate some more + * and insert them after. + */ + do { + subreq = netfs_alloc_subrequest(rreq); + if (!subreq) { + subreq = to; + goto abandon_after; + } + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + subreq->start = start; + subreq->len = len; + subreq->debug_index = atomic_inc_return(&rreq->subreq_counter); + subreq->stream_nr = stream->stream_nr; + subreq->retry_count = 1; + + trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index, + refcount_read(&subreq->ref), + netfs_sreq_trace_new); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + + list_add(&subreq->rreq_link, &to->rreq_link); + to = list_next_entry(to, rreq_link); + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + + stream->sreq_max_len = umin(len, rreq->rsize); + stream->sreq_max_segs = 0; + if (unlikely(stream->sreq_max_segs)) + part = netfs_limit_iter(&source, 0, part, stream->sreq_max_segs); + + netfs_stat(&netfs_n_rh_download); + if (rreq->netfs_ops->prepare_read(subreq) < 0) { + trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed); + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); + goto abandon; + } + + part = umin(len, stream->sreq_max_len); + subreq->len = subreq->transferred + part; + subreq->io_iter = source; + iov_iter_truncate(&subreq->io_iter, part); + iov_iter_advance(&source, part); + + len -= part; + start += part; + if (!len && boundary) { + __set_bit(NETFS_SREQ_BOUNDARY, &to->flags); + boundary = false; + } + + netfs_reissue_read(rreq, subreq); + } while (len); + + } while (!list_is_head(next, &stream->subrequests)); return; - /* If we hit ENOMEM, fail all remaining subrequests */ + /* If we hit an error, fail all remaining incomplete subrequests */ +abandon_after: + if (list_is_last(&subreq->rreq_link, &stream->subrequests)) + return; + subreq = list_next_entry(subreq, rreq_link); abandon: - list_splice_init(&sublist, &queue); - list_for_each_entry(subreq, &queue, rreq_link) { - if (!subreq->error) - subreq->error = -ENOMEM; - __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); + list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { + if (!subreq->error && + !test_bit(NETFS_SREQ_FAILED, &subreq->flags) && + !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) + continue; + subreq->error = -ENOMEM; + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); } - spin_lock(&rreq->lock); 
- list_splice_tail_init(&queue, &rreq->subrequests); - spin_unlock(&rreq->lock); } /* @@ -227,14 +253,19 @@ abandon: */ void netfs_retry_reads(struct netfs_io_request *rreq) { - trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream = &rreq->io_streams[0]; - atomic_inc(&rreq->nr_outstanding); + /* Wait for all outstanding I/O to quiesce before performing retries as + * we may need to renegotiate the I/O sizes. + */ + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + } + trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); netfs_retry_read_subrequests(rreq); - - if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_terminated(rreq); } /* diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index 2a66c5fde071..14bc61107182 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -77,6 +77,7 @@ static void netfs_single_read_cache(struct netfs_io_request *rreq, { struct netfs_cache_resources *cres = &rreq->cache_resources; + _enter("R=%08x[%x]", rreq->debug_id, subreq->debug_index); netfs_stat(&netfs_n_rh_read); cres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_FAIL, netfs_cache_read_terminated, subreq); @@ -88,28 +89,28 @@ static void netfs_single_read_cache(struct netfs_io_request *rreq, */ static int netfs_single_dispatch_read(struct netfs_io_request *rreq) { + struct netfs_io_stream *stream = &rreq->io_streams[0]; struct netfs_io_subrequest *subreq; int ret = 0; - atomic_set(&rreq->nr_outstanding, 1); - subreq = netfs_alloc_subrequest(rreq); - if (!subreq) { - ret = -ENOMEM; - goto out; - } + if (!subreq) + return -ENOMEM; subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = 0; subreq->len = rreq->len; subreq->io_iter = rreq->buffer.iter; - atomic_inc(&rreq->nr_outstanding); + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - spin_lock_bh(&rreq->lock); - list_add_tail(&subreq->rreq_link, &rreq->subrequests); + spin_lock(&rreq->lock); + list_add_tail(&subreq->rreq_link, &stream->subrequests); trace_netfs_sreq(subreq, netfs_sreq_trace_added); - spin_unlock_bh(&rreq->lock); + stream->front = subreq; + /* Store list pointers before active flag */ + smp_store_release(&stream->active, true); + spin_unlock(&rreq->lock); netfs_single_cache_prepare_read(rreq, subreq); switch (subreq->source) { @@ -137,14 +138,12 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq) break; } -out: - if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_terminated(rreq); + smp_wmb(); /* Write lists before ALL_QUEUED. 
*/ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); return ret; cancel: - atomic_dec(&rreq->nr_outstanding); netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); - goto out; + return ret; } /** @@ -185,13 +184,7 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite rreq->buffer.iter = *iter; netfs_single_dispatch_read(rreq); - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); - wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - - ret = rreq->error; - if (ret == 0) - ret = rreq->transferred; + ret = netfs_wait_for_read(rreq); netfs_put_request(rreq, true, netfs_rreq_trace_put_return); return ret; diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index d54526d2e751..1b7f53d01b8d 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -289,7 +289,9 @@ reassess_streams: goto need_retry; if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { trace_netfs_rreq(wreq, netfs_rreq_trace_unpause); - clear_and_wake_up_bit(NETFS_RREQ_PAUSE, &wreq->flags); + clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags); + smp_mb__after_atomic(); /* Set PAUSE before task state */ + wake_up(&wreq->waitq); } if (notes & NEED_REASSESS) { diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 6f14a7c2f040..69727411683e 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -723,7 +723,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t rolling_buffer_advance(&wreq->buffer, part); if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause); - wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE); + wait_event(wreq->waitq, !test_bit(NETFS_RREQ_PAUSE, &wreq->flags)); } if (test_bit(NETFS_RREQ_FAILED, &wreq->flags)) break; diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c index f3d5e37d4698..c841a851dd73 100644 --- a/fs/netfs/write_retry.c +++ b/fs/netfs/write_retry.c @@ -93,15 +93,21 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { if (!len) break; - /* Renegotiate max_len (wsize) */ - trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + + subreq->start = start; + subreq->len = len; __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); subreq->retry_count++; + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + + /* Renegotiate max_len (wsize) */ + stream->sreq_max_len = len; stream->prepare_write(subreq); - part = min(len, stream->sreq_max_len); + part = umin(len, stream->sreq_max_len); + if (unlikely(stream->sreq_max_segs)) + part = netfs_limit_iter(&source, 0, part, stream->sreq_max_segs); subreq->len = part; - subreq->start = start; subreq->transferred = 0; len -= part; start += part; diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 7c9cc6945d18..000522a34985 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1322,6 +1322,8 @@ cifs_readv_callback(struct mid_q_entry *mid) } else if (rdata->got_bytes > 0) { __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } + if (rdata->got_bytes) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } rdata->credits.value = 0; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index ce57d8697c7c..81c76e1b7209 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4607,7 +4607,8 @@ smb2_readv_callback(struct mid_q_entry *mid) __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); 
rdata->result = 0; } - __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); + if (rdata->got_bytes) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, server->credits, server->in_flight, @@ -4616,7 +4617,7 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->subreq.error = rdata->result; rdata->subreq.transferred += rdata->got_bytes; trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress); - queue_work(cifsiod_wq, &rdata->subreq.work); + netfs_read_subreq_terminated(&rdata->subreq); release_mid(mid); trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0, server->credits, server->in_flight, diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 27e62f7d2940..071d05d81d38 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -181,9 +181,6 @@ struct netfs_io_subrequest { unsigned long long start; /* Where to start the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ - size_t consumed; /* Amount of read data consumed */ - size_t prev_donated; /* Amount of data donated from previous subreq */ - size_t next_donated; /* Amount of data donated from next subreq */ refcount_t ref; short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ @@ -191,9 +188,6 @@ struct netfs_io_subrequest { u8 retry_count; /* The number of retries (0 on initial pass) */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ - unsigned char curr_folioq_slot; /* Folio currently being read */ - unsigned char curr_folio_order; /* Order of folio */ - struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */ unsigned long flags; #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ @@ -236,15 +230,16 @@ struct netfs_io_request { struct address_space *mapping; /* The mapping being accessed */ struct kiocb *iocb; /* AIO completion vector */ struct netfs_cache_resources cache_resources; + struct netfs_io_request *copy_to_cache; /* Request to write just-read data to the cache */ struct readahead_control *ractl; /* Readahead descriptor */ struct list_head proc_link; /* Link in netfs_iorequests */ - struct list_head subrequests; /* Contributory I/O operations */ struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ #define NR_IO_STREAMS 2 //wreq->nr_io_streams struct netfs_group *group; /* Writeback group being written back */ struct rolling_buffer buffer; /* Unencrypted buffer */ #define NETFS_ROLLBUF_PUT_MARK ROLLBUF_MARK_1 #define NETFS_ROLLBUF_PAGECACHE_MARK ROLLBUF_MARK_2 + wait_queue_head_t waitq; /* Processor waiter */ void *netfs_priv; /* Private data for the netfs */ void *netfs_priv2; /* Private data for the netfs */ struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ @@ -255,7 +250,6 @@ struct netfs_io_request { atomic_t subreq_counter; /* Next subreq->debug_index */ unsigned int nr_group_rel; /* Number of refs to release on ->group */ spinlock_t lock; /* Lock for queuing subreqs */ - atomic_t nr_outstanding; /* Number of ops in progress */ unsigned long long submitted; /* Amount submitted for I/O so far */ unsigned long long len; /* Length of the request */ size_t transferred; /* Amount to be indicated as transferred */ @@ -267,14 +261,17 @@ 
struct netfs_io_request { atomic64_t issued_to; /* Write issuer folio cursor */ unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ + unsigned long long abandon_to; /* Position to abandon folios to */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ - size_t prev_donated; /* Fallback for subreq->prev_donated */ + unsigned char front_folio_order; /* Order (size) of front folio */ refcount_t ref; unsigned long flags; +#define NETFS_RREQ_OFFLOAD_COLLECTION 0 /* Offload collection to workqueue */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ +#define NETFS_RREQ_FOLIO_COPY_TO_CACHE 6 /* Copy current folio to cache from read */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ #define NETFS_RREQ_BLOCKED 10 /* We blocked */ @@ -439,7 +436,6 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr /* (Sub)request management API. */ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq); void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq); -void netfs_read_subreq_termination_worker(struct work_struct *work); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 6df2e7313371..6e699cadcb29 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -50,18 +50,23 @@ EM(netfs_rreq_trace_assess, "ASSESS ") \ EM(netfs_rreq_trace_copy, "COPY ") \ EM(netfs_rreq_trace_collect, "COLLECT") \ + EM(netfs_rreq_trace_complete, "COMPLET") \ EM(netfs_rreq_trace_dirty, "DIRTY ") \ EM(netfs_rreq_trace_done, "DONE ") \ EM(netfs_rreq_trace_free, "FREE ") \ EM(netfs_rreq_trace_redirty, "REDIRTY") \ EM(netfs_rreq_trace_resubmit, "RESUBMT") \ + EM(netfs_rreq_trace_set_abandon, "S-ABNDN") \ EM(netfs_rreq_trace_set_pause, "PAUSE ") \ EM(netfs_rreq_trace_unlock, "UNLOCK ") \ EM(netfs_rreq_trace_unlock_pgpriv2, "UNLCK-2") \ EM(netfs_rreq_trace_unmark, "UNMARK ") \ EM(netfs_rreq_trace_wait_ip, "WAIT-IP") \ EM(netfs_rreq_trace_wait_pause, "WT-PAUS") \ + EM(netfs_rreq_trace_wait_queue, "WAIT-Q ") \ EM(netfs_rreq_trace_wake_ip, "WAKE-IP") \ + EM(netfs_rreq_trace_wake_queue, "WAKE-Q ") \ + EM(netfs_rreq_trace_woke_queue, "WOKE-Q ") \ EM(netfs_rreq_trace_unpause, "UNPAUSE") \ E_(netfs_rreq_trace_write_done, "WR-DONE") @@ -81,6 +86,7 @@ EM(netfs_sreq_trace_cache_nowrite, "CA-NW") \ EM(netfs_sreq_trace_cache_prepare, "CA-PR") \ EM(netfs_sreq_trace_cache_write, "CA-WR") \ + EM(netfs_sreq_trace_cancel, "CANCL") \ EM(netfs_sreq_trace_clear, "CLEAR") \ EM(netfs_sreq_trace_discard, "DSCRD") \ EM(netfs_sreq_trace_donate_to_prev, "DON-P") \ @@ -91,6 +97,9 @@ EM(netfs_sreq_trace_hit_eof, "EOF ") \ EM(netfs_sreq_trace_io_progress, "IO ") \ EM(netfs_sreq_trace_limited, "LIMIT") \ + EM(netfs_sreq_trace_need_clear, "N-CLR") \ + EM(netfs_sreq_trace_partial_read, "PARTR") \ + EM(netfs_sreq_trace_need_retry, "NRTRY") \ EM(netfs_sreq_trace_prepare, "PREP ") \ EM(netfs_sreq_trace_prep_failed, "PRPFL") \ EM(netfs_sreq_trace_progress, "PRGRS") \ @@ -136,6 
+145,7 @@ EM(netfs_sreq_trace_get_submit, "GET SUBMIT") \ EM(netfs_sreq_trace_get_short_read, "GET SHORTRD") \ EM(netfs_sreq_trace_new, "NEW ") \ + EM(netfs_sreq_trace_put_abandon, "PUT ABANDON") \ EM(netfs_sreq_trace_put_cancel, "PUT CANCEL ") \ EM(netfs_sreq_trace_put_clear, "PUT CLEAR ") \ EM(netfs_sreq_trace_put_consumed, "PUT CONSUME") \ @@ -176,6 +186,7 @@ EM(netfs_folio_trace_mkwrite, "mkwrite") \ EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \ EM(netfs_folio_trace_not_under_wback, "!wback") \ + EM(netfs_folio_trace_not_locked, "!locked") \ EM(netfs_folio_trace_put, "put") \ EM(netfs_folio_trace_read, "read") \ EM(netfs_folio_trace_read_done, "read-done") \ @@ -204,7 +215,6 @@ EM(netfs_trace_folioq_clear, "clear") \ EM(netfs_trace_folioq_delete, "delete") \ EM(netfs_trace_folioq_make_space, "make-space") \ - EM(netfs_trace_folioq_prep_write, "prep-wr") \ EM(netfs_trace_folioq_rollbuf_init, "roll-init") \ E_(netfs_trace_folioq_read_progress, "r-progress") @@ -352,7 +362,7 @@ TRACE_EVENT(netfs_sreq, __entry->len = sreq->len; __entry->transferred = sreq->transferred; __entry->start = sreq->start; - __entry->slot = sreq->curr_folioq_slot; + __entry->slot = sreq->io_iter.folioq_slot; ), TP_printk("R=%08x[%x] %s %s f=%02x s=%llx %zx/%zx s=%u e=%d", @@ -701,71 +711,6 @@ TRACE_EVENT(netfs_collect_stream, __entry->collected_to, __entry->front) ); -TRACE_EVENT(netfs_progress, - TP_PROTO(const struct netfs_io_subrequest *subreq, - unsigned long long start, size_t avail, size_t part), - - TP_ARGS(subreq, start, avail, part), - - TP_STRUCT__entry( - __field(unsigned int, rreq) - __field(unsigned int, subreq) - __field(unsigned int, consumed) - __field(unsigned int, transferred) - __field(unsigned long long, f_start) - __field(unsigned int, f_avail) - __field(unsigned int, f_part) - __field(unsigned char, slot) - ), - - TP_fast_assign( - __entry->rreq = subreq->rreq->debug_id; - __entry->subreq = subreq->debug_index; - __entry->consumed = subreq->consumed; - __entry->transferred = subreq->transferred; - __entry->f_start = start; - __entry->f_avail = avail; - __entry->f_part = part; - __entry->slot = subreq->curr_folioq_slot; - ), - - TP_printk("R=%08x[%02x] s=%llx ct=%x/%x pa=%x/%x sl=%x", - __entry->rreq, __entry->subreq, __entry->f_start, - __entry->consumed, __entry->transferred, - __entry->f_part, __entry->f_avail, __entry->slot) - ); - -TRACE_EVENT(netfs_donate, - TP_PROTO(const struct netfs_io_request *rreq, - const struct netfs_io_subrequest *from, - const struct netfs_io_subrequest *to, - size_t amount, - enum netfs_donate_trace trace), - - TP_ARGS(rreq, from, to, amount, trace), - - TP_STRUCT__entry( - __field(unsigned int, rreq) - __field(unsigned int, from) - __field(unsigned int, to) - __field(unsigned int, amount) - __field(enum netfs_donate_trace, trace) - ), - - TP_fast_assign( - __entry->rreq = rreq->debug_id; - __entry->from = from->debug_index; - __entry->to = to ? 
to->debug_index : -1; - __entry->amount = amount; - __entry->trace = trace; - ), - - TP_printk("R=%08x[%02x] -> [%02x] %s am=%x", - __entry->rreq, __entry->from, __entry->to, - __print_symbolic(__entry->trace, netfs_donate_traces), - __entry->amount) - ); - TRACE_EVENT(netfs_folioq, TP_PROTO(const struct folio_queue *fq, enum netfs_folioq_trace trace), -- cgit v1.2.3 From 836bb70bde6a24a0069866b69d23eb61a00c422a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:18 +0000 Subject: afs: Make afs_mkdir() locally initialise a new directory's content Initialise a new directory's content when it is created by mkdir locally rather than downloading the content from the server as we can predict what it's going to look like. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-29-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/dir.c | 3 +++ fs/afs/dir_edit.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ fs/afs/internal.h | 1 + include/trace/events/afs.h | 2 ++ 4 files changed, 55 insertions(+) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index a386b4649f3e..bf46485d12f8 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1264,6 +1264,7 @@ void afs_check_for_remote_deletion(struct afs_operation *op) */ static void afs_vnode_new_inode(struct afs_operation *op) { + struct afs_vnode_param *dvp = &op->file[0]; struct afs_vnode_param *vp = &op->file[1]; struct afs_vnode *vnode; struct inode *inode; @@ -1283,6 +1284,8 @@ static void afs_vnode_new_inode(struct afs_operation *op) vnode = AFS_FS_I(inode); set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + if (S_ISDIR(inode->i_mode)) + afs_mkdir_init_dir(vnode, dvp->vnode); if (!afs_op_error(op)) afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb); d_instantiate(op->dentry, inode); diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 71cce884e434..53178bb2d1a6 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -556,3 +556,52 @@ error: 0, 0, 0, 0, ".."); goto out; } + +/* + * Initialise a new directory. We need to fill in the "." and ".." entries. 
+ */ +void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_dvnode) +{ + union afs_xdr_dir_block *meta; + struct afs_dir_iter iter = { .dvnode = dvnode }; + union afs_xdr_dirent *de; + unsigned int slot = AFS_DIR_RESV_BLOCKS0; + loff_t i_size; + + i_size = i_size_read(&dvnode->netfs.inode); + if (i_size != AFS_DIR_BLOCK_SIZE) { + afs_invalidate_dir(dvnode, afs_dir_invalid_edit_add_bad_size); + return; + } + + meta = afs_dir_get_block(&iter, 0); + if (!meta) + return; + + afs_edit_init_block(meta, meta, 0); + + de = &meta->dirents[slot]; + de->u.valid = 1; + de->u.vnode = htonl(dvnode->fid.vnode); + de->u.unique = htonl(dvnode->fid.unique); + memcpy(de->u.name, ".", 2); + trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot, + dvnode->fid.vnode, dvnode->fid.unique, "."); + slot++; + + de = &meta->dirents[slot]; + de->u.valid = 1; + de->u.vnode = htonl(parent_dvnode->fid.vnode); + de->u.unique = htonl(parent_dvnode->fid.unique); + memcpy(de->u.name, "..", 3); + trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot, + parent_dvnode->fid.vnode, parent_dvnode->fid.unique, ".."); + + afs_set_contig_bits(meta, AFS_DIR_RESV_BLOCKS0, 2); + meta->meta.alloc_ctrs[0] -= 2; + kunmap_local(meta); + + netfs_single_mark_inode_dirty(&dvnode->netfs.inode); + set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); + set_bit(AFS_VNODE_DIR_READ, &dvnode->flags); +} diff --git a/fs/afs/internal.h b/fs/afs/internal.h index cd2c4f85117d..acae1b5bfc63 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1078,6 +1078,7 @@ extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid * extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why); +void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); /* * dir_silly.c diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index cdb5f2af7799..c52fd83ca9b7 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -350,6 +350,7 @@ enum yfs_cm_operation { EM(afs_dir_invalid_edit_add_no_slots, "edit-add-no-slots") \ EM(afs_dir_invalid_edit_add_too_many_blocks, "edit-add-too-many-blocks") \ EM(afs_dir_invalid_edit_get_block, "edit-get-block") \ + EM(afs_dir_invalid_edit_mkdir, "edit-mkdir") \ EM(afs_dir_invalid_edit_rem_bad_size, "edit-rem-bad-size") \ EM(afs_dir_invalid_edit_rem_wrong_name, "edit-rem-wrong_name") \ EM(afs_dir_invalid_edit_upd_bad_size, "edit-upd-bad-size") \ @@ -371,6 +372,7 @@ enum yfs_cm_operation { EM(afs_edit_dir_delete_error, "d_err ") \ EM(afs_edit_dir_delete_inval, "d_invl") \ EM(afs_edit_dir_delete_noent, "d_nent") \ + EM(afs_edit_dir_mkdir, "mk_ent") \ EM(afs_edit_dir_update_dd, "u_ddot") \ EM(afs_edit_dir_update_error, "u_fail") \ EM(afs_edit_dir_update_inval, "u_invl") \ -- cgit v1.2.3 From a5b5beebcf96d5e8a2fc79856c2ac1e93f82478e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:19 +0000 Subject: afs: Use the contained hashtable to search a directory Each directory image contains a hashtable with 128 buckets to speed up searching. Currently, kafs does not use this, but rather iterates over all the occupied slots in the image as it can share this with readdir. Switch kafs to use the hashtable for lookups to reduce the latency. Care must be taken that the hash chains are acyclic. 
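As an illustration of the lookup pattern being adopted (hypothetical types and sizes, not the kafs structures or the on-disk format), the chain walk has to be bounded precisely because a corrupt image could contain a cycle:

	#include <errno.h>
	#include <string.h>

	/* Hypothetical image layout, for illustration only. */
	struct dir_slot {
		char		name[20];
		unsigned short	hash_next;	/* next entry number in this bucket, 0 = end */
	};

	struct dir_image {
		unsigned short	hashtable[128];	/* bucket -> first entry number */
		struct dir_slot	slots[1024];
	};

	/* Walk one bucket; the step counter keeps a cyclic chain from hanging us. */
	static int dir_bucket_lookup(const struct dir_image *dir, unsigned int bucket,
				     const char *name, unsigned int max_steps)
	{
		unsigned int entry = dir->hashtable[bucket % 128];

		while (entry != 0 && entry < 1024) {
			const struct dir_slot *slot = &dir->slots[entry];

			if (strcmp(slot->name, name) == 0)
				return entry;		/* found */
			entry = slot->hash_next;
			if (--max_steps == 0)
				return -ESTALE;		/* assume cyclic/corrupt chain */
		}
		return -ENOENT;
	}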
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-30-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/Makefile | 1 + fs/afs/dir.c | 42 ++++------ fs/afs/dir_edit.c | 135 ++++++++++++++++++++----------- fs/afs/dir_search.c | 227 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/afs/internal.h | 18 +++++ 5 files changed, 350 insertions(+), 73 deletions(-) create mode 100644 fs/afs/dir_search.c (limited to 'fs') diff --git a/fs/afs/Makefile b/fs/afs/Makefile index dcdc0f1bb76f..5efd7e13b304 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -11,6 +11,7 @@ kafs-y := \ cmservice.o \ dir.o \ dir_edit.o \ + dir_search.o \ dir_silly.o \ dynroot.o \ file.o \ diff --git a/fs/afs/dir.c b/fs/afs/dir.c index bf46485d12f8..dd0c98d25270 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -88,8 +88,6 @@ struct afs_lookup_one_cookie { struct afs_lookup_cookie { struct dir_context ctx; struct qstr name; - bool found; - bool one_only; unsigned short nr_fids; struct afs_fid fids[50]; }; @@ -309,7 +307,7 @@ ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file) * Read the directory into a folio_queue buffer in one go, scrubbing the * previous contents. We return -ESTALE if the caller needs to call us again. */ -static ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) +ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) __acquires(&dvnode->validate_lock) { ssize_t ret; @@ -649,19 +647,10 @@ static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); - if (cookie->found) { - if (cookie->nr_fids < 50) { - cookie->fids[cookie->nr_fids].vnode = ino; - cookie->fids[cookie->nr_fids].unique = dtype; - cookie->nr_fids++; - } - } else if (cookie->name.len == nlen && - memcmp(cookie->name.name, name, nlen) == 0) { - cookie->fids[1].vnode = ino; - cookie->fids[1].unique = dtype; - cookie->found = 1; - if (cookie->one_only) - return false; + if (cookie->nr_fids < 50) { + cookie->fids[cookie->nr_fids].vnode = ino; + cookie->fids[cookie->nr_fids].unique = dtype; + cookie->nr_fids++; } return cookie->nr_fids < 50; @@ -789,6 +778,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct inode *inode = NULL, *ti; afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version); + bool supports_ibulk; long ret; int i; @@ -805,19 +795,19 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) cookie->nr_fids = 2; /* slot 1 is saved for the fid we actually want * and slot 0 for the directory */ - if (!afs_server_supports_ibulk(dvnode)) - cookie->one_only = true; - - /* search the directory */ - ret = afs_dir_iterate(dir, &cookie->ctx, NULL, &data_version); + /* Search the directory for the named entry using the hash table... */ + ret = afs_dir_search(dvnode, &dentry->d_name, &cookie->fids[1], &data_version); if (ret < 0) goto out; - dentry->d_fsdata = (void *)(unsigned long)data_version; + supports_ibulk = afs_server_supports_ibulk(dvnode); + if (supports_ibulk) { + /* ...then scan linearly from that point for entries to lookup-ahead. 
*/ + cookie->ctx.pos = (ret + 1) * AFS_DIR_DIRENT_SIZE; + afs_dir_iterate(dir, &cookie->ctx, NULL, &data_version); + } - ret = -ENOENT; - if (!cookie->found) - goto out; + dentry->d_fsdata = (void *)(unsigned long)data_version; /* Check to see if we already have an inode for the primary fid. */ inode = ilookup5(dir->i_sb, cookie->fids[1].vnode, @@ -876,7 +866,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) * the whole operation. */ afs_op_set_error(op, -ENOTSUPP); - if (!cookie->one_only) { + if (supports_ibulk) { op->ops = &afs_inline_bulk_status_operation; afs_begin_vnode_operation(op); afs_wait_for_operation(op); diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 53178bb2d1a6..60a549f1d9c5 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -245,7 +245,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, union afs_xdr_dir_block *meta, *block; union afs_xdr_dirent *de; struct afs_dir_iter iter = { .dvnode = vnode }; - unsigned int need_slots, nr_blocks, b; + unsigned int nr_blocks, b, entry; loff_t i_size; int slot; @@ -263,7 +263,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, return; /* Work out how many slots we're going to need. */ - need_slots = afs_dir_calc_slots(name->len); + iter.nr_slots = afs_dir_calc_slots(name->len); if (i_size == 0) goto new_directory; @@ -281,7 +281,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, /* Lower dir blocks have a counter in the header we can check. */ if (b < AFS_DIR_BLOCKS_WITH_CTR && - meta->meta.alloc_ctrs[b] < need_slots) + meta->meta.alloc_ctrs[b] < iter.nr_slots) continue; block = afs_dir_get_block(&iter, b); @@ -308,7 +308,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, /* We need to try and find one or more consecutive slots to * hold the entry. */ - slot = afs_find_contig_bits(block, need_slots); + slot = afs_find_contig_bits(block, iter.nr_slots); if (slot >= 0) { _debug("slot %u", slot); goto found_space; @@ -347,12 +347,18 @@ found_space: de->u.name[name->len] = 0; /* Adjust the bitmap. */ - afs_set_contig_bits(block, slot, need_slots); - kunmap_local(block); + afs_set_contig_bits(block, slot, iter.nr_slots); /* Adjust the allocation counter. */ if (b < AFS_DIR_BLOCKS_WITH_CTR) - meta->meta.alloc_ctrs[b] -= need_slots; + meta->meta.alloc_ctrs[b] -= iter.nr_slots; + + /* Adjust the hash chain. */ + entry = b * AFS_DIR_SLOTS_PER_BLOCK + slot; + iter.bucket = afs_dir_hash_name(name); + de->u.hash_next = meta->meta.hashtable[iter.bucket]; + meta->meta.hashtable[iter.bucket] = htons(entry); + kunmap_local(block); inode_inc_iversion_raw(&vnode->netfs.inode); afs_stat_v(vnode, n_dir_cr); @@ -387,12 +393,14 @@ error: void afs_edit_dir_remove(struct afs_vnode *vnode, struct qstr *name, enum afs_edit_dir_reason why) { - union afs_xdr_dir_block *meta, *block; - union afs_xdr_dirent *de; + union afs_xdr_dir_block *meta, *block, *pblock; + union afs_xdr_dirent *de, *pde; struct afs_dir_iter iter = { .dvnode = vnode }; - unsigned int need_slots, nr_blocks, b; + struct afs_fid fid; + unsigned int b, slot, entry; loff_t i_size; - int slot; + __be16 next; + int found; _enter(",,{%d,%s},", name->len, name->name); @@ -403,59 +411,90 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_bad_size); return; } - nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; - meta = afs_dir_get_block(&iter, 0); - if (!meta) + if (!afs_dir_init_iter(&iter, name)) return; - /* Work out how many slots we're going to discard. 
*/ - need_slots = afs_dir_calc_slots(name->len); - - /* Find a block that has sufficient slots available. Each folio - * contains two or more directory blocks. - */ - for (b = 0; b < nr_blocks; b++) { - block = afs_dir_get_block(&iter, b); - if (!block) - goto error; - - /* Abandon the edit if we got a callback break. */ - if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - goto already_invalidated; - - if (b > AFS_DIR_BLOCKS_WITH_CTR || - meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) { - slot = afs_dir_scan_block(block, name, b); - if (slot >= 0) - goto found_dirent; - } + meta = afs_dir_find_block(&iter, 0); + if (!meta) + return; - kunmap_local(block); + /* Find the entry in the blob. */ + found = afs_dir_search_bucket(&iter, name, &fid); + if (found < 0) { + /* Didn't find the dirent to clobber. Re-download. */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent, + 0, 0, 0, 0, name->name); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_wrong_name); + goto out_unmap; } - /* Didn't find the dirent to clobber. Download the directory again. */ - trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent, - 0, 0, 0, 0, name->name); - afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_wrong_name); - goto out_unmap; + entry = found; + b = entry / AFS_DIR_SLOTS_PER_BLOCK; + slot = entry % AFS_DIR_SLOTS_PER_BLOCK; -found_dirent: + block = afs_dir_find_block(&iter, b); + if (!block) + goto error; + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto already_invalidated; + + /* Check and clear the entry. */ de = &block->dirents[slot]; + if (de->u.valid != 1) + goto error_unmap; trace_afs_edit_dir(vnode, why, afs_edit_dir_delete, b, slot, ntohl(de->u.vnode), ntohl(de->u.unique), name->name); - memset(de, 0, sizeof(*de) * need_slots); - /* Adjust the bitmap. */ - afs_clear_contig_bits(block, slot, need_slots); - kunmap_local(block); + afs_clear_contig_bits(block, slot, iter.nr_slots); /* Adjust the allocation counter. */ if (b < AFS_DIR_BLOCKS_WITH_CTR) - meta->meta.alloc_ctrs[b] += need_slots; + meta->meta.alloc_ctrs[b] += iter.nr_slots; + + /* Clear the constituent entries. */ + next = de->u.hash_next; + memset(de, 0, sizeof(*de) * iter.nr_slots); + kunmap_local(block); + + /* Adjust the hash chain: if iter->prev_entry is 0, the hashtable head + * index is previous; otherwise it's slot number of the previous entry. 
+ */ + if (!iter.prev_entry) { + __be16 prev_next = meta->meta.hashtable[iter.bucket]; + + if (unlikely(prev_next != htons(entry))) { + pr_warn("%llx:%llx:%x: not head of chain b=%x p=%x,%x e=%x %*s", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, + iter.bucket, iter.prev_entry, prev_next, entry, + name->len, name->name); + goto error; + } + meta->meta.hashtable[iter.bucket] = next; + } else { + unsigned int pb = iter.prev_entry / AFS_DIR_SLOTS_PER_BLOCK; + unsigned int ps = iter.prev_entry % AFS_DIR_SLOTS_PER_BLOCK; + __be16 prev_next; + + pblock = afs_dir_find_block(&iter, pb); + if (!pblock) + goto error; + pde = &pblock->dirents[ps]; + prev_next = pde->u.hash_next; + if (prev_next != htons(entry)) { + kunmap_local(pblock); + pr_warn("%llx:%llx:%x: not prev in chain b=%x p=%x,%x e=%x %*s", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, + iter.bucket, iter.prev_entry, prev_next, entry, + name->len, name->name); + goto error; + } + pde->u.hash_next = next; + kunmap_local(pblock); + } netfs_single_mark_inode_dirty(&vnode->netfs.inode); @@ -474,6 +513,8 @@ already_invalidated: 0, 0, 0, 0, name->name); goto out_unmap; +error_unmap: + kunmap_local(block); error: trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_error, 0, 0, 0, 0, name->name); diff --git a/fs/afs/dir_search.c b/fs/afs/dir_search.c new file mode 100644 index 000000000000..b25bd892db4d --- /dev/null +++ b/fs/afs/dir_search.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Search a directory's hash table. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * https://tools.ietf.org/html/draft-keiser-afs3-directory-object-00 + */ + +#include +#include +#include +#include +#include "internal.h" +#include "afs_fs.h" +#include "xdr_fs.h" + +/* + * Calculate the name hash. + */ +unsigned int afs_dir_hash_name(const struct qstr *name) +{ + const unsigned char *p = name->name; + unsigned int hash = 0, i; + int bucket; + + for (i = 0; i < name->len; i++) + hash = (hash * 173) + p[i]; + bucket = hash & (AFS_DIR_HASHTBL_SIZE - 1); + if (hash > INT_MAX) { + bucket = AFS_DIR_HASHTBL_SIZE - bucket; + bucket &= (AFS_DIR_HASHTBL_SIZE - 1); + } + return bucket; +} + +/* + * Reset a directory iterator. + */ +static bool afs_dir_reset_iter(struct afs_dir_iter *iter) +{ + unsigned long long i_size = i_size_read(&iter->dvnode->netfs.inode); + unsigned int nblocks; + + /* Work out the maximum number of steps we can take. */ + nblocks = umin(i_size / AFS_DIR_BLOCK_SIZE, AFS_DIR_MAX_BLOCKS); + if (!nblocks) + return false; + iter->loop_check = nblocks * (AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS); + iter->prev_entry = 0; /* Hash head is previous */ + return true; +} + +/* + * Initialise a directory iterator for looking up a name. + */ +bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name) +{ + iter->nr_slots = afs_dir_calc_slots(name->len); + iter->bucket = afs_dir_hash_name(name); + return afs_dir_reset_iter(iter); +} + +/* + * Get a specific block. 
+ */ +union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block) +{ + struct folio_queue *fq = iter->fq; + struct afs_vnode *dvnode = iter->dvnode; + struct folio *folio; + size_t blpos = block * AFS_DIR_BLOCK_SIZE; + size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos; + int slot = iter->fq_slot; + + _enter("%zx,%d", block, slot); + + if (iter->block) { + kunmap_local(iter->block); + iter->block = NULL; + } + + if (dvnode->directory_size < blend) + goto fail; + + if (!fq || blpos < fpos) { + fq = dvnode->directory; + slot = 0; + fpos = 0; + } + + /* Search the folio queue for the folio containing the block... */ + for (; fq; fq = fq->next) { + for (; slot < folioq_count(fq); slot++) { + size_t fsize = folioq_folio_size(fq, slot); + + if (blend <= fpos + fsize) { + /* ... and then return the mapped block. */ + folio = folioq_folio(fq, slot); + if (WARN_ON_ONCE(folio_pos(folio) != fpos)) + goto fail; + iter->fq = fq; + iter->fq_slot = slot; + iter->fpos = fpos; + iter->block = kmap_local_folio(folio, blpos - fpos); + return iter->block; + } + fpos += fsize; + } + slot = 0; + } + +fail: + iter->fq = NULL; + iter->fq_slot = 0; + afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block); + return NULL; +} + +/* + * Search through a directory bucket. + */ +int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name, + struct afs_fid *_fid) +{ + const union afs_xdr_dir_block *meta; + unsigned int entry; + int ret = -ESTALE; + + meta = afs_dir_find_block(iter, 0); + if (!meta) + return -ESTALE; + + entry = ntohs(meta->meta.hashtable[iter->bucket & (AFS_DIR_HASHTBL_SIZE - 1)]); + _enter("%x,%x", iter->bucket, entry); + + while (entry) { + const union afs_xdr_dir_block *block; + const union afs_xdr_dirent *dire; + unsigned int blnum = entry / AFS_DIR_SLOTS_PER_BLOCK; + unsigned int slot = entry % AFS_DIR_SLOTS_PER_BLOCK; + unsigned int resv = (blnum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + + _debug("search %x", entry); + + if (slot < resv) { + kdebug("slot out of range h=%x rs=%2x sl=%2x-%2x", + iter->bucket, resv, slot, slot + iter->nr_slots - 1); + goto bad; + } + + block = afs_dir_find_block(iter, blnum); + if (!block) + goto bad; + dire = &block->dirents[slot]; + + if (slot + iter->nr_slots <= AFS_DIR_SLOTS_PER_BLOCK && + memcmp(dire->u.name, name->name, name->len) == 0 && + dire->u.name[name->len] == '\0') { + _fid->vnode = ntohl(dire->u.vnode); + _fid->unique = ntohl(dire->u.unique); + ret = entry; + goto found; + } + + iter->prev_entry = entry; + entry = ntohs(dire->u.hash_next); + if (!--iter->loop_check) { + kdebug("dir chain loop h=%x", iter->bucket); + goto bad; + } + } + + ret = -ENOENT; +found: + if (iter->block) { + kunmap_local(iter->block); + iter->block = NULL; + } + +bad: + if (ret == -ESTALE) + afs_invalidate_dir(iter->dvnode, afs_dir_invalid_iter_stale); + _leave(" = %d", ret); + return ret; +} + +/* + * Search the appropriate hash chain in the contents of an AFS directory. 
+ */ +int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name, + struct afs_fid *_fid, afs_dataversion_t *_dir_version) +{ + struct afs_dir_iter iter = { .dvnode = dvnode, }; + int ret, retry_limit = 3; + + _enter("{%lu},,,", dvnode->netfs.inode.i_ino); + + if (!afs_dir_init_iter(&iter, name)) + return -ENOENT; + do { + if (--retry_limit < 0) { + pr_warn("afs_read_dir(): Too many retries\n"); + ret = -ESTALE; + break; + } + ret = afs_read_dir(dvnode, NULL); + if (ret < 0) { + if (ret != -ESTALE) + break; + if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) { + ret = -ESTALE; + break; + } + continue; + } + *_dir_version = inode_peek_iversion_raw(&dvnode->netfs.inode); + + ret = afs_dir_search_bucket(&iter, name, _fid); + up_read(&dvnode->validate_lock); + if (ret == -ESTALE) + afs_dir_reset_iter(&iter); + } while (ret == -ESTALE); + + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/internal.h b/fs/afs/internal.h index acae1b5bfc63..b7d02c105340 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -978,9 +978,14 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl */ struct afs_dir_iter { struct afs_vnode *dvnode; + union afs_xdr_dir_block *block; struct folio_queue *fq; unsigned int fpos; int fq_slot; + unsigned int loop_check; + u8 nr_slots; + u8 bucket; + unsigned int prev_entry; }; #include @@ -1065,6 +1070,8 @@ extern const struct address_space_operations afs_dir_aops; extern const struct dentry_operations afs_fs_dentry_operations; ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file); +ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) + __acquires(&dvnode->validate_lock); extern void afs_d_release(struct dentry *); extern void afs_check_for_remote_deletion(struct afs_operation *); int afs_single_writepages(struct address_space *mapping, @@ -1080,6 +1087,17 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d enum afs_edit_dir_reason why); void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); +/* + * dir_search.c + */ +unsigned int afs_dir_hash_name(const struct qstr *name); +bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name); +union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block); +int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name, + struct afs_fid *_fid); +int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name, + struct afs_fid *_fid, afs_dataversion_t *_dir_version); + /* * dir_silly.c */ -- cgit v1.2.3 From 6698c02d64b240861c20d15a531445942600c8ae Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:20 +0000 Subject: afs: Locally initialise the contents of a new symlink on creation Since we know what the contents of a symlink will be when we create it on the server, initialise its contents locally too to avoid the need to download it. 
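In spirit this follows the same pattern as the mkdir change: copy the data we already know into the locally cached content and mark it valid, so the first readlink() is served without a fetch. A rough sketch of that pattern with hypothetical types (not the kafs vnode or the netfs buffer code):

	#include <errno.h>
	#include <stdbool.h>
	#include <stdlib.h>
	#include <string.h>

	/* Hypothetical cache structure, for illustration only. */
	struct cached_symlink {
		char	*content;	/* locally cached target string */
		size_t	len;
		bool	valid;		/* set => readlink needs no server fetch */
	};

	/* After a successful create the target is already known, so prime the cache. */
	static int symlink_prime_cache(struct cached_symlink *sym, const char *target)
	{
		size_t size = strlen(target) + 1;
		char *buf = malloc(size);

		if (!buf)
			return -ENOMEM;	/* cache stays invalid; first readlink fetches */
		memcpy(buf, target, size);
		sym->content = buf;
		sym->len = size - 1;
		sym->valid = true;
		return 0;
	}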
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-31-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/dir.c | 2 ++ fs/afs/inode.c | 46 ++++++++++++++++++++++++++++++++++++++++------ fs/afs/internal.h | 1 + fs/netfs/buffered_read.c | 2 +- fs/netfs/read_single.c | 2 +- 5 files changed, 45 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index dd0c98d25270..a843c36fc471 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1276,6 +1276,8 @@ static void afs_vnode_new_inode(struct afs_operation *op) set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); if (S_ISDIR(inode->i_mode)) afs_mkdir_init_dir(vnode, dvp->vnode); + else if (S_ISLNK(inode->i_mode)) + afs_init_new_symlink(vnode, op); if (!afs_op_error(op)) afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb); d_instantiate(op->dentry, inode); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0e3c43c40632..e9538e91f848 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -25,6 +25,24 @@ #include "internal.h" #include "afs_fs.h" +void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op) +{ + size_t size = strlen(op->create.symlink) + 1; + size_t dsize = 0; + char *p; + + if (netfs_alloc_folioq_buffer(NULL, &vnode->directory, &dsize, size, + mapping_gfp_mask(vnode->netfs.inode.i_mapping)) < 0) + return; + + vnode->directory_size = dsize; + p = kmap_local_folio(folioq_folio(vnode->directory, 0), 0); + memcpy(p, op->create.symlink, size); + kunmap_local(p); + set_bit(AFS_VNODE_DIR_READ, &vnode->flags); + netfs_single_mark_inode_dirty(&vnode->netfs.inode); +} + static void afs_put_link(void *arg) { struct folio *folio = virt_to_folio(arg); @@ -41,15 +59,31 @@ const char *afs_get_link(struct dentry *dentry, struct inode *inode, char *content; ssize_t ret; - if (atomic64_read(&vnode->cb_expires_at) == AFS_NO_CB_PROMISE || - !test_bit(AFS_VNODE_DIR_READ, &vnode->flags)) { - if (!dentry) + if (!dentry) { + /* RCU pathwalk. 
*/ + if (!test_bit(AFS_VNODE_DIR_READ, &vnode->flags) || !afs_check_validity(vnode)) return ERR_PTR(-ECHILD); - ret = afs_read_single(vnode, NULL); - if (ret < 0) - return ERR_PTR(ret); + goto good; } + if (test_bit(AFS_VNODE_DIR_READ, &vnode->flags)) + goto fetch; + + ret = afs_validate(vnode, NULL); + if (ret < 0) + return ERR_PTR(ret); + + if (!test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && + test_bit(AFS_VNODE_DIR_READ, &vnode->flags)) + goto good; + +fetch: + ret = afs_read_single(vnode, NULL); + if (ret < 0) + return ERR_PTR(ret); + set_bit(AFS_VNODE_DIR_READ, &vnode->flags); + +good: folio = folioq_folio(vnode->directory, 0); folio_get(folio); content = kmap_local_folio(folio, 0); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index b7d02c105340..90f407774a9a 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1221,6 +1221,7 @@ extern void afs_fs_probe_cleanup(struct afs_net *); */ extern const struct afs_operation_ops afs_fetch_status_operation; +void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op); const char *afs_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback); int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 0245943d974d..f761d44b3436 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -210,7 +210,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) do { struct netfs_io_subrequest *subreq; - enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; + enum netfs_io_source source = NETFS_SOURCE_UNKNOWN; ssize_t slice; subreq = netfs_alloc_subrequest(rreq); diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index 14bc61107182..fea0ecdecc53 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -97,7 +97,7 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq) if (!subreq) return -ENOMEM; - subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + subreq->source = NETFS_SOURCE_UNKNOWN; subreq->start = 0; subreq->len = rreq->len; subreq->io_iter = rreq->buffer.iter; -- cgit v1.2.3 From 3c49e529e1c6aa71cb9b874fd60b72c97dae7ede Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:21 +0000 Subject: afs: Add a tracepoint for afs_read_receive() Add a tracepoint for afs_read_receive() to allow potential missed wakeups to be debugged. 
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-32-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/file.c | 1 + include/trace/events/afs.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'fs') diff --git a/fs/afs/file.c b/fs/afs/file.c index c296efebb491..fc15497608c6 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -274,6 +274,7 @@ static void afs_read_receive(struct afs_call *call) state = READ_ONCE(call->state); if (state == AFS_CALL_COMPLETE) return; + trace_afs_read_recv(op, call); while (state < AFS_CALL_COMPLETE && READ_ONCE(call->need_attention)) { WRITE_ONCE(call->need_attention, false); diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index c52fd83ca9b7..2e92487f3f34 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -1775,6 +1775,36 @@ TRACE_EVENT(afs_make_call, __entry->fid.unique) ); +TRACE_EVENT(afs_read_recv, + TP_PROTO(const struct afs_operation *op, const struct afs_call *call), + + TP_ARGS(op, call), + + TP_STRUCT__entry( + __field(unsigned int, rreq) + __field(unsigned int, sreq) + __field(unsigned int, op) + __field(unsigned int, op_flags) + __field(unsigned int, call) + __field(enum afs_call_state, call_state) + ), + + TP_fast_assign( + __entry->op = op->debug_id; + __entry->sreq = op->fetch.subreq->debug_index; + __entry->rreq = op->fetch.subreq->rreq->debug_id; + __entry->op_flags = op->flags; + __entry->call = call->debug_id; + __entry->call_state = call->state; + ), + + TP_printk("R=%08x[%x] OP=%08x c=%08x cs=%x of=%x", + __entry->rreq, __entry->sreq, + __entry->op, + __entry->call, __entry->call_state, + __entry->op_flags) + ); + #endif /* _TRACE_AFS_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 794d8cf3a87a6b958d520a7c32d142f7ec30cb92 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:22 +0000 Subject: netfs: Report on NULL folioq in netfs_writeback_unlock_folios() It seems that it's possible to get to netfs_writeback_unlock_folios() with an empty rolling buffer during buffered writes. This should not be possible as the rolling buffer is initialised as the write request is set up and thereafter maintains at least one folio_queue struct therein until it gets destroyed. This allows lockless addition and removal of folio_queue structs in the buffer because, unlike with a ring buffer, the producer and consumer each only need to look at and alter one pointer into the buffer. Now, the rolling buffer is only used for buffered I/O operations as netfs_collect_write_results() should only call netfs_writeback_unlock_folios() if the request is of origin type NETFS_WRITEBACK, NETFS_WRITETHROUGH or NETFS_PGPRIV2_COPY_TO_CACHE. So it would seem that one of the following occurred: (1) I/O started before the request was fully initialised, (2) the origin got switched mid-flow or (3) the request has already been freed and this is a UAF error. I think the last is the most likely. Make netfs_writeback_unlock_folios() report information about the request and subrequests if folioq is seen to be NULL to try and help debug this, throw a warning and return. Note that this does not try to fix the problem. 
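The invariant described above can be pictured with a hypothetical single-producer/single-consumer segment queue (memory barriers, refcounting and the real folio_queue layout omitted); the point is that the consumer never removes the final segment, so a live request can never legitimately see an empty buffer:

	#include <stddef.h>

	struct segment {
		struct segment	*next;
		/* ... folio payload ... */
	};

	struct rolling {
		struct segment	*head;	/* consumer-owned */
		struct segment	*tail;	/* producer-owned */
	};

	/* Producer: only tail (and tail->next) are written. */
	static void rolling_append(struct rolling *r, struct segment *seg)
	{
		seg->next = NULL;
		r->tail->next = seg;
		r->tail = seg;
	}

	/* Consumer: only head is written, and the final segment is never
	 * removed, so the buffer cannot become empty while the request is
	 * live; a NULL segment here points at a lifetime bug. */
	static struct segment *rolling_pop_front(struct rolling *r)
	{
		struct segment *seg = r->head;

		if (!seg->next)
			return NULL;	/* refuse to empty the buffer */
		r->head = seg->next;
		return seg;
	}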
Reported-by: syzbot+af5c06208fa71bf31b16@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=af5c06208fa71bf31b16 Signed-off-by: David Howells Link: https://lore.kernel.org/r/ZxshMEW4U7MTgQYa@gmail.com/ Link: https://lore.kernel.org/r/20241216204124.3752367-33-dhowells@redhat.com cc: Chang Yu cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/write_collect.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'fs') diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 1b7f53d01b8d..294f67795f79 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -21,6 +21,34 @@ #define NEED_RETRY 0x10 /* A front op requests retrying */ #define SAW_FAILURE 0x20 /* One stream or hit a permanent failure */ +static void netfs_dump_request(const struct netfs_io_request *rreq) +{ + pr_err("Request R=%08x r=%d fl=%lx or=%x e=%ld\n", + rreq->debug_id, refcount_read(&rreq->ref), rreq->flags, + rreq->origin, rreq->error); + pr_err(" st=%llx tsl=%zx/%llx/%llx\n", + rreq->start, rreq->transferred, rreq->submitted, rreq->len); + pr_err(" cci=%llx/%llx/%llx\n", + rreq->cleaned_to, rreq->collected_to, atomic64_read(&rreq->issued_to)); + pr_err(" iw=%pSR\n", rreq->netfs_ops->issue_write); + for (int i = 0; i < NR_IO_STREAMS; i++) { + const struct netfs_io_subrequest *sreq; + const struct netfs_io_stream *s = &rreq->io_streams[i]; + + pr_err(" str[%x] s=%x e=%d acnf=%u,%u,%u,%u\n", + s->stream_nr, s->source, s->error, + s->avail, s->active, s->need_retry, s->failed); + pr_err(" str[%x] ct=%llx t=%zx\n", + s->stream_nr, s->collected_to, s->transferred); + list_for_each_entry(sreq, &s->subrequests, rreq_link) { + pr_err(" sreq[%x:%x] sc=%u s=%llx t=%zx/%zx r=%d f=%lx\n", + sreq->stream_nr, sreq->debug_index, sreq->source, + sreq->start, sreq->transferred, sreq->len, + refcount_read(&sreq->ref), sreq->flags); + } + } +} + /* * Successful completion of write of a folio to the server and/or cache. Note * that we are not allowed to lock the folio here on pain of deadlocking with @@ -87,6 +115,12 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, unsigned long long collected_to = wreq->collected_to; unsigned int slot = wreq->buffer.first_tail_slot; + if (WARN_ON_ONCE(!folioq)) { + pr_err("[!] Writeback unlock found empty rolling buffer!\n"); + netfs_dump_request(wreq); + return; + } + if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) { if (netfs_pgpriv2_unlock_copied_folios(wreq)) *notes |= MADE_PROGRESS; -- cgit v1.2.3 From 135c0c85248a10512a9c5d17dccf65e220398cf4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2024 16:53:59 -0400 Subject: bcachefs: Fix racy use of jiffies Calculate the timeout, then check if it's positive before calling schedule_timeout(). 
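To spell out the race (schematic only, with `deadline` standing in for j->next_reclaim; task state setup omitted):

	#include <linux/jiffies.h>
	#include <linux/sched.h>

	/* Racy: jiffies is read twice, so it can pass the deadline between the
	 * test and the subtraction, and the wrapped difference is then handed
	 * to schedule_timeout() as a huge timeout. */
	static void wait_for_deadline_racy(unsigned long deadline)
	{
		if (time_after(deadline, jiffies))
			schedule_timeout(deadline - jiffies);
	}

	/* Fixed pattern, as in this patch: sample the delta once and test the
	 * signed result, so a deadline that has just passed falls through. */
	static void wait_for_deadline_fixed(unsigned long deadline)
	{
		long timeout = deadline - jiffies;

		if (timeout > 0)
			schedule_timeout(timeout);
	}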
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ace291f175dd..3d8fc2642425 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -758,10 +758,12 @@ static int bch2_journal_reclaim_thread(void *arg) journal_empty = fifo_empty(&j->pin); spin_unlock(&j->lock); + long timeout = j->next_reclaim - jiffies; + if (journal_empty) schedule(); - else if (time_after(j->next_reclaim, jiffies)) - schedule_timeout(j->next_reclaim - jiffies); + else if (timeout > 0) + schedule_timeout(timeout); else break; } -- cgit v1.2.3 From 1c6d5841aebcad7d1a63e242780e400db6efcf97 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 7 Oct 2024 09:11:21 +0100 Subject: bcachefs: remove superfluous ; after statements There are a several statements with two following semicolons, replace these with just one semicolon. Signed-off-by: Colin Ian King Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 2 +- fs/bcachefs/ec.c | 2 +- fs/bcachefs/super.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 5d809e8bd170..79a274dcd17b 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -144,7 +144,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, !(ret = bkey_err(old_k)) && bkey_eq(old_pos, old_k.k->p)) { struct bpos whiteout_pos = - SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; + SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot); if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 6094afb0c6be..075bfd1cbb15 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -909,7 +909,7 @@ err: bch2_bkey_val_to_text(&msgbuf, c, orig_k); bch_err_ratelimited(c, "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); - printbuf_exit(&msgbuf);; + printbuf_exit(&msgbuf); ret = -BCH_ERR_stripe_reconstruct; goto out; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a6ed9a0bf1c7..17442df7326d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1120,12 +1120,12 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, prt_bdevname(&buf, fs->bdev); prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));; + bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time)); prt_newline(&buf); prt_bdevname(&buf, sb->bdev); prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; + bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time)); prt_newline(&buf); if (!opts->no_splitbrain_check) -- cgit v1.2.3 From de92b1ee679bfdf97e6dcd7d4815c53606ea5f01 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Sep 2024 14:27:24 -0400 Subject: bcachefs: bch2_inode_should_have_bp -> bch2_inode_should_have_single_bp Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/inode.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index a41d0d8a2f7b..646b74494a3f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -628,7 +628,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, goto err; /* regular files may have hardlinks: */ - if 
(bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) && + if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), c, "dirent points to inode that does not point back:\n %s", diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 75c8a97a6954..285de12436dd 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2156,7 +2156,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, return __bch2_fsck_write_inode(trans, target); } - if (bch2_inode_should_have_bp(target) && + if (bch2_inode_should_have_single_bp(target) && !fsck_err(trans, inode_wrong_backpointer, "dirent points to inode that does not point back:\n %s", (bch2_bkey_val_to_text(&buf, c, d.s_c), diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index eab82b5eb897..bdeb6be76038 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -249,7 +249,7 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); -static inline bool bch2_inode_should_have_bp(struct bch_inode_unpacked *inode) +static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode) { bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; -- cgit v1.2.3 From 1325ccf27e7eb247295d84ae8e6fd0f1f3f0e445 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Oct 2024 17:45:58 -0400 Subject: bcachefs: remove_backpointer() now uses dirent_get_by_pos() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 285de12436dd..6b2ddbabe3e7 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -482,6 +482,13 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * return ret; } +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); +} + static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { @@ -490,13 +497,11 @@ static int remove_backpointer(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c_dirent d = - bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, - SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0, - dirent); - int ret = bkey_err(d) ?: - dirent_points_to_inode(c, d, inode) ?: - __remove_dirent(trans, d.k->p); + struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); + int ret = bkey_err(d) ?: + dirent_points_to_inode(c, d, inode) ?: + __remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1166,13 +1171,6 @@ duplicate_entries: goto out; } -static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) -{ - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); -} - static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, -- cgit v1.2.3 From b836f220146967a0931d0dff58d5c90797c8b88e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Sep 2024 00:14:09 -0400 Subject: bcachefs: 
__bch2_key_has_snapshot_overwrites uses for_each_btree_key_reverse_norestart() Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index ae57638506c3..feaf2aa0d900 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1735,18 +1735,10 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, id, pos, - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); - while (1) { - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (ret) - break; - - if (!k.k) - break; - + for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos), + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots, + k, ret) { if (!bkey_eq(pos, k.k->p)) break; -- cgit v1.2.3 From a55e2d78eac840cf156445492403ea3ac0a1b1eb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Sep 2024 01:11:36 -0400 Subject: bcachefs: rcu_pending: don't invoke __call_rcu() under lock In userspace we don't (yet) have an SRCU implementation, so call_srcu() recurses. But we don't want to be invoking it under the lock anyways. Signed-off-by: Kent Overstreet --- fs/bcachefs/rcu_pending.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c index 40a20192eee8..67522aa344a7 100644 --- a/fs/bcachefs/rcu_pending.c +++ b/fs/bcachefs/rcu_pending.c @@ -478,7 +478,9 @@ start_gp: */ if (!p->cb_armed) { p->cb_armed = true; + spin_unlock_irqrestore(&p->lock, flags); __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb); + goto free_node; } else { __start_poll_synchronize_rcu(pending->srcu); } -- cgit v1.2.3 From 179cdecf225dfe2ad88ca1fbcf776d7e6fc10c26 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Sep 2024 23:10:48 -0400 Subject: bcachefs: bch_verbose_ratelimited ratelimit "deleting unlinked inode" messages Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 8 ++++++++ fs/bcachefs/inode.c | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e94a83b8113e..7db81e182c3c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -293,6 +293,8 @@ do { \ #define bch_info(c, fmt, ...) \ bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_info_ratelimited(c, fmt, ...) \ + bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_notice(c, fmt, ...) \ bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn(c, fmt, ...) \ @@ -352,6 +354,12 @@ do { \ bch_info(c, fmt, ##__VA_ARGS__); \ } while (0) +#define bch_verbose_ratelimited(c, fmt, ...) \ +do { \ + if ((c)->opts.verbose) \ + bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \ +} while (0) + #define pr_verbose_init(opts, fmt, ...) 
\ do { \ if (opt_get(opts, verbose)) \ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 039cb7a22244..43653cf050e9 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1380,7 +1380,8 @@ again: NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); if (ret > 0) { - bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); + bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", + k.k->p.offset, k.k->p.snapshot); ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); /* -- cgit v1.2.3 From c07beca44ff181bad5928abccff6358ca9d9590b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Oct 2024 16:59:08 -0400 Subject: bcachefs: Pull disk accounting hooks out of trans_commit.c Also, fix a minor bug in the revert path, where we weren't checking the journal entry type correctly. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 35 ++++++----------------------------- fs/bcachefs/disk_accounting.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 9bf471fa4361..3d951846a1be 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -609,14 +609,6 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) return 0; } -static struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) -{ - return (struct bversion) { - .hi = res->seq >> 32, - .lo = (res->seq << 32) | (res->offset + offset), - }; -} - static inline int bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct btree_insert_entry **stopped_at, @@ -701,25 +693,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct jset_entry *entry = trans->journal_entries; percpu_down_read(&c->mark_lock); - for (entry = trans->journal_entries; entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); entry = vstruct_next(entry)) if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && entry->start->k.type == KEY_TYPE_accounting) { - BUG_ON(!trans->journal_res.ref); - - struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start); - - a->k.bversion = journal_pos_to_bversion(&trans->journal_res, - (u64 *) entry - (u64 *) trans->journal_entries); - BUG_ON(bversion_zero(a->k.bversion)); - - if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { - ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal); - if (ret) - goto revert_fs_usage; - } + ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags); + if (ret) + goto revert_fs_usage; } percpu_up_read(&c->mark_lock); @@ -833,13 +814,9 @@ revert_fs_usage: entry2 != entry; entry2 = vstruct_next(entry2)) if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys && - entry2->start->k.type == KEY_TYPE_accounting) { - struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); - - bch2_accounting_neg(a); - bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); - bch2_accounting_neg(a); - } + entry2->start->k.type == KEY_TYPE_accounting) + bch2_accounting_trans_commit_revert(trans, + bkey_i_to_accounting(entry2->start), flags); percpu_up_read(&c->mark_lock); return ret; } diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 
4ea6c8a092bc..6639535dc91c 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_DISK_ACCOUNTING_H #define _BCACHEFS_DISK_ACCOUNTING_H +#include "btree_update.h" #include "eytzinger.h" #include "sb-members.h" @@ -204,6 +205,43 @@ static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, bch2_accounting_mem_read_counters(acc, idx, v, nr, false); } +static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) +{ + EBUG_ON(!res->ref); + + return (struct bversion) { + .hi = res->seq >> 32, + .lo = (res->seq << 32) | (res->offset + offset), + }; +} + +static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, + struct bkey_i_accounting *a, + unsigned commit_flags) +{ + a->k.bversion = journal_pos_to_bversion(&trans->journal_res, + (u64 *) a - (u64 *) trans->journal_entries); + + EBUG_ON(bversion_zero(a->k.bversion)); + + return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply)) + ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal) + : 0; +} + +static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans, + struct bkey_i_accounting *a_i, + unsigned commit_flags) +{ + if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { + struct bkey_s_accounting a = accounting_i_to_s(a_i); + + bch2_accounting_neg(a); + bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); + bch2_accounting_neg(a); + } +} + int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); -- cgit v1.2.3 From bf4e42d158baff9b67ef8f7bd3caa0801bee6374 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Fri, 27 Sep 2024 22:26:53 +0800 Subject: bcachefs: Delete dead code lock_fail_root_changed has not been used since commit 0d7009d7ca99 ("bcachefs: Delete old deadlock avoidance code") Remove it. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 -- fs/bcachefs/errcode.h | 1 - 2 files changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index eef9b89c561d..01152fd5ac57 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -748,8 +748,6 @@ static inline int btree_path_lock_root(struct btree_trans *trans, ret = btree_node_lock(trans, path, &b->c, path->level, lock_type, trace_ip); if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) - continue; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; BUG(); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 9c4fe5cdbfb7..e3b0ec7a0f73 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -164,7 +164,6 @@ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ x(0, backpointer_to_overwritten_btree_node) \ - x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ x(EINVAL, fsck) \ x(BCH_ERR_fsck, fsck_fix) \ -- cgit v1.2.3 From fe818d2039e74fac314e4032b51f057a7f313ad0 Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Fri, 13 Sep 2024 18:11:22 -0600 Subject: bcachefs: move bch2_xattr_handlers to .rodata A series posted previously moved all of the `struct xattr_handler` tables to .rodata for each filesystem [1]. 
However, this appears to have been done shortly before bcachefs was merged, so bcachefs was missed at that time. Link: https://lkml.kernel.org/r/20230930050033.41174-1-wedsonaf@gmail.com [1] Cc: Wedson Almeida Filho Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 2 +- fs/bcachefs/xattr.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 952aca400faf..bf3c6bb50495 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -609,7 +609,7 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { #endif /* NO_BCACHEFS_FS */ -const struct xattr_handler *bch2_xattr_handlers[] = { +const struct xattr_handler * const bch2_xattr_handlers[] = { &bch_xattr_user_handler, &bch_xattr_trusted_handler, &bch_xattr_security_handler, diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index c188a5ad64ce..2c96de051f3e 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -44,6 +44,6 @@ int bch2_xattr_set(struct btree_trans *, subvol_inum, ssize_t bch2_xattr_list(struct dentry *, char *, size_t); -extern const struct xattr_handler *bch2_xattr_handlers[]; +extern const struct xattr_handler * const bch2_xattr_handlers[]; #endif /* _BCACHEFS_XATTR_H */ -- cgit v1.2.3 From d6cf895847f60af83bad62b15f1da14abd331fae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Sep 2024 05:08:39 -0400 Subject: bcachefs: Remove unnecessary peek_slot() hash_lookup() used to return an errorcode, and a peek_slot() call was required to get the key it looked up. But we're adding fault injection for transaction restarts, so fix this old unconverted code. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6b2ddbabe3e7..c96025b8b65d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -170,7 +170,7 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, if (ret) return ret; - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); *target = le64_to_cpu(d.v->d_inum); *type = d.v->d_type; bch2_trans_iter_exit(trans, &iter); -- cgit v1.2.3 From 4e1c6ac05a2348be6b74db63b406c10dd553f1ae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 22:11:41 -0400 Subject: bcachefs: kill btree_trans_restart_nounlock() Redundant, the normal btree_trans_restart() doesn't unlock. 
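For illustration, the call shape that results (this snippet is not from the patch, it only restates what the hunk below does): one restart helper that records the caller's instruction pointer, plus an _ip variant for callers that want to pass one through explicitly.

	/* normal callers: the restart site is recorded as _THIS_IP_ */
	return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);

	/* callers that have already captured an ip can forward it */
	return btree_trans_restart_ip(trans, err, ip);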
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 7 +++---- fs/bcachefs/btree_trans_commit.c | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 0bda054f80d7..24406f723283 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -341,21 +341,20 @@ static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans) } __always_inline -static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) +static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) { BUG_ON(err <= 0); BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); trans->restarted = err; - trans->last_restarted_ip = _THIS_IP_; + trans->last_restarted_ip = ip; return -err; } __always_inline static int btree_trans_restart(struct btree_trans *trans, int err) { - btree_trans_restart_nounlock(trans, err); - return -err; + return btree_trans_restart_ip(trans, err, _THIS_IP_); } bool bch2_btree_node_upgrade(struct btree_trans *, diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 3d951846a1be..b47f11881fe4 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -624,7 +624,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (race_fault()) { trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); - return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); } /* -- cgit v1.2.3 From 03525de50638ac0801e5296826e3cdebe4cb553f Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 23 Sep 2024 16:20:29 +0200 Subject: bcachefs: Remove duplicate included headers The header files dirent_format.h and disk_groups_format.h are included twice. Remove the redundant includes and the following warnings reported by make includecheck: disk_groups_format.h is included more than once dirent_format.h is included more than once Reviewed-by: Hongbo Li Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5004f6ba997c..6a67df2a2fcd 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -499,8 +499,6 @@ struct bch_sb_field { #include "disk_groups_format.h" #include "extents_format.h" #include "ec_format.h" -#include "dirent_format.h" -#include "disk_groups_format.h" #include "inode_format.h" #include "journal_seq_blacklist_format.h" #include "logged_ops_format.h" -- cgit v1.2.3 From 55f524b706b48229685a61e8d5349b484f683e34 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 23 Sep 2024 16:44:53 +0200 Subject: bcachefs: Use FOREACH_ACL_ENTRY() macro to iterate over acl entries Use the existing FOREACH_ACL_ENTRY() macro to iterate over POSIX acl entries and remove the custom acl_for_each_entry() macro. 
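For reference, the generic macro being switched to lives in include/linux/posix_acl.h and looks roughly like this (quoted from memory, not from this patch):

	#define FOREACH_ACL_ENTRY(pa, acl, pe) \
		for (pa = (acl)->a_entries, pe = pa + (acl)->a_count; pa < pe; pa++)

which is why the hunk below gains a second iteration pointer (*pe) next to *acl_e.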
Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 87f1be9d4db4..99487727ae64 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -184,11 +184,6 @@ invalid: return ERR_PTR(-EINVAL); } -#define acl_for_each_entry(acl, acl_e) \ - for (acl_e = acl->a_entries; \ - acl_e < acl->a_entries + acl->a_count; \ - acl_e++) - /* * Convert from in-memory to filesystem representation. */ @@ -199,11 +194,11 @@ bch2_acl_to_xattr(struct btree_trans *trans, { struct bkey_i_xattr *xattr; bch_acl_header *acl_header; - const struct posix_acl_entry *acl_e; + const struct posix_acl_entry *acl_e, *pe; void *outptr; unsigned nr_short = 0, nr_long = 0, acl_len, u64s; - acl_for_each_entry(acl, acl_e) { + FOREACH_ACL_ENTRY(acl_e, acl, pe) { switch (acl_e->e_tag) { case ACL_USER: case ACL_GROUP: @@ -241,7 +236,7 @@ bch2_acl_to_xattr(struct btree_trans *trans, outptr = (void *) acl_header + sizeof(*acl_header); - acl_for_each_entry(acl, acl_e) { + FOREACH_ACL_ENTRY(acl_e, acl, pe) { bch_acl_entry *entry = outptr; entry->e_tag = cpu_to_le16(acl_e->e_tag); -- cgit v1.2.3 From eba3d7e57d2a4e0c1f28b5c2e3bb691279ab6eaf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 18:11:07 -0400 Subject: bcachefs: add more path idx debug asserts Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 24406f723283..550db3654f2c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -23,6 +23,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path { unsigned idx = path - trans->paths; + EBUG_ON(idx >= trans->nr_paths); EBUG_ON(!test_bit(idx, trans->paths_allocated)); if (unlikely(path->ref == U8_MAX)) { bch2_dump_trans_paths_updates(trans); @@ -36,6 +37,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) { + EBUG_ON(path - trans->paths >= trans->nr_paths); EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated)); EBUG_ON(!path->ref); EBUG_ON(!path->intent_ref && intent); -- cgit v1.2.3 From 18f5b84a04707565b926b3dcdfbc6f88ef53988a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Sep 2024 20:21:18 -0400 Subject: bcachefs: bch2_run_explicit_recovery_pass() returns different error when not in recovery if we're not in recovery then there's no way to rewind recovery - give this a different errcode so that any error messages will give us a better idea of what happened. 
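With distinct error codes the failure modes can be told apart by callers and in log output; a hypothetical caller-side check (not part of the patch) might look like:

	int ret = bch2_run_explicit_recovery_pass(c, pass);
	if (bch2_err_matches(ret, BCH_ERR_cannot_rewind_recovery)) {
		/* the pass is needed, but recovery is already past it and cannot rewind */
	}

and bch2_err_str() will now name the specific condition in any error message.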
Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 4 +++- fs/bcachefs/recovery_passes.c | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index e3b0ec7a0f73..40bf1e5775a9 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -172,7 +172,9 @@ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ - x(0, restart_recovery) \ + x(EINVAL, restart_recovery) \ + x(EINVAL, not_in_recovery) \ + x(EINVAL, cannot_rewind_recovery) \ x(0, data_update_done) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index dff589ddc984..1cc010bf1695 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -106,6 +106,9 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, if (c->opts.recovery_passes & BIT_ULL(pass)) return 0; + if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) + return -BCH_ERR_not_in_recovery; + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", bch2_recovery_passes[pass], pass, bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); -- cgit v1.2.3 From 26c79fdc580d27c08c050789c523ce89e9a0da44 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Sep 2024 23:22:48 -0400 Subject: bcachefs: lru, accounting are alloc btrees They can be regenerated by fsck and don't require a btree node scan, like other alloc btrees. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 6a67df2a2fcd..79a80a78c2d8 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1359,6 +1359,8 @@ static inline bool btree_id_is_alloc(enum btree_id id) case BTREE_ID_need_discard: case BTREE_ID_freespace: case BTREE_ID_bucket_gens: + case BTREE_ID_lru: + case BTREE_ID_accounting: return true; default: return false; -- cgit v1.2.3 From d65d126c0256cf2349e118a3e8627d8281295eee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Sep 2024 23:27:59 -0400 Subject: bcachefs: Add locking for bch_fs.curr_recovery_pass Recovery can rewind in certain situations - when we discover we need to run a pass that doesn't normally run. This can happen from another thread for btree node read errors, so we need a bit of locking. 
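The rewind decision can now race with btree node read completion running in another context, which is presumably why the new spinlock is taken irqsave; the wrapper added below boils down to (restating the hunk, nothing new):

	unsigned long flags;
	spin_lock_irqsave(&c->recovery_pass_lock, flags);
	int ret = __bch2_run_explicit_recovery_pass(c, pass);
	spin_unlock_irqrestore(&c->recovery_pass_lock, flags);

and bch2_run_recovery_passes() re-reads curr_recovery_pass under the same lock after each pass so it notices a rewind requested from elsewhere.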
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/recovery_passes.c | 76 ++++++++++++++++++++++++++++++++----------- fs/bcachefs/super.c | 1 + 3 files changed, 59 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7db81e182c3c..fbd89f91625d 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1060,6 +1060,7 @@ struct bch_fs { u64 recovery_passes_complete; /* never rewinds version of curr_recovery_pass */ enum bch_recovery_pass recovery_pass_done; + spinlock_t recovery_pass_lock; struct semaphore online_fsck_mutex; /* DEBUG JUNK */ diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 1cc010bf1695..5e7722cc0879 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -100,8 +100,8 @@ u64 bch2_recovery_passes_from_stable(u64 v) /* * For when we need to rewind recovery passes and run a pass we skipped: */ -int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) +static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) { if (c->opts.recovery_passes & BIT_ULL(pass)) return 0; @@ -109,6 +109,13 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) return -BCH_ERR_not_in_recovery; + if (pass < BCH_RECOVERY_PASS_set_may_go_rw && + c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { + bch_info(c, "need recovery pass %s (%u), but already rw", + bch2_recovery_passes[pass], pass); + return -BCH_ERR_cannot_rewind_recovery; + } + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", bch2_recovery_passes[pass], pass, bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); @@ -124,6 +131,16 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, } } +int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + unsigned long flags; + spin_lock_irqsave(&c->recovery_pass_lock, flags); + int ret = __bch2_run_explicit_recovery_pass(c, pass); + spin_unlock_irqrestore(&c->recovery_pass_lock, flags); + return ret; +} + int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, enum bch_recovery_pass pass) { @@ -237,30 +254,51 @@ int bch2_run_recovery_passes(struct bch_fs *c) c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + spin_lock_irq(&c->recovery_pass_lock); + unsigned pass = c->curr_recovery_pass; + if (c->opts.recovery_pass_last && - c->curr_recovery_pass > c->opts.recovery_pass_last) + c->curr_recovery_pass > c->opts.recovery_pass_last) { + spin_unlock_irq(&c->recovery_pass_lock); break; + } - if (should_run_recovery_pass(c, c->curr_recovery_pass)) { - unsigned pass = c->curr_recovery_pass; - - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass) ?: - bch2_journal_flush(&c->journal); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || - (ret && c->curr_recovery_pass < pass)) - continue; - if (ret) - break; - - c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); + if (!should_run_recovery_pass(c, pass)) { + c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, pass); + spin_unlock_irq(&c->recovery_pass_lock); + continue; + } + spin_unlock_irq(&c->recovery_pass_lock); + + ret = bch2_run_recovery_pass(c, pass) ?: + bch2_journal_flush(&c->journal); + + spin_lock_irq(&c->recovery_pass_lock); + if (c->curr_recovery_pass < pass) 
{ + /* + * bch2_run_explicit_recovery_pass() was called: we + * can't always catch -BCH_ERR_restart_recovery because + * it may have been called from another thread (btree + * node read completion) + */ + spin_unlock_irq(&c->recovery_pass_lock); + continue; + } else if (c->curr_recovery_pass == pass) { + c->curr_recovery_pass++; + } else { + BUG(); } + spin_unlock_irq(&c->recovery_pass_lock); - c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); + if (ret) + break; - if (!test_bit(BCH_FS_error, &c->flags)) - bch2_clear_recovery_pass_required(c, c->curr_recovery_pass); + c->recovery_passes_complete |= BIT_ULL(pass); + c->recovery_pass_done = max(c->recovery_pass_done, pass); - c->curr_recovery_pass++; + if (!test_bit(BCH_FS_error, &c->flags)) + bch2_clear_recovery_pass_required(c, pass); } return ret; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 17442df7326d..d6411324cd3f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -766,6 +766,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) refcount_set(&c->ro_ref, 1); init_waitqueue_head(&c->ro_ref_wait); + spin_lock_init(&c->recovery_pass_lock); sema_init(&c->online_fsck_mutex, 1); init_rwsem(&c->gc_lock); -- cgit v1.2.3 From e3c43dbe8e5ff64e77b6f927b32f489bccc7d75e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Sep 2024 23:40:01 -0400 Subject: bcachefs: bch2_btree_lost_data() now uses run_explicit_rceovery_pass_persistent() Also get a bit more fine grained about which passes to run for which btrees. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 63 ++++++++++++++++++++++++++++--------------- fs/bcachefs/recovery.h | 2 +- fs/bcachefs/recovery_passes.c | 11 ++++++++ fs/bcachefs/recovery_passes.h | 1 + 4 files changed, 54 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 3c7f941dde39..b1c83e72c0d8 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -34,21 +34,52 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) +int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) { - if (btree >= BTREE_ID_NR_MAX) - return; - u64 b = BIT_ULL(btree); + int ret = 0; + + mutex_lock(&c->sb_lock); if (!(c->sb.btrees_lost_data & b)) { bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree)); - - mutex_lock(&c->sb_lock); bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); } + + switch (btree) { + case BTREE_ID_alloc: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_backpointers: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; + goto out; + case BTREE_ID_need_discard: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_freespace: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_bucket_gens: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: 
ret; + goto out; + case BTREE_ID_lru: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_accounting: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + goto out; + default: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; + goto out; + } +out: + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret; } /* for -o reconstruct_alloc: */ @@ -524,22 +555,10 @@ static int read_btree_roots(struct bch_fs *c) c, btree_root_read_error, "error reading btree root %s l=%u: %s", bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { - if (btree_id_is_alloc(i)) { - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + if (btree_id_is_alloc(i)) r->error = 0; - } else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { - bch_info(c, "will run btree node scan"); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); - } - ret = 0; - bch2_btree_lost_data(c, i); + ret = bch2_btree_lost_data(c, i); } } diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 4bf818de1f2f..b0d55754b21b 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -void bch2_btree_lost_data(struct bch_fs *, enum btree_id); +int bch2_btree_lost_data(struct bch_fs *, enum btree_id); int bch2_journal_replay(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 5e7722cc0879..1240c5c19fea 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -141,6 +141,17 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, return ret; } +int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); + + return bch2_run_explicit_recovery_pass(c, pass); +} + int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, enum bch_recovery_pass pass) { diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 99b464e127b8..7d7339c8fa29 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -9,6 +9,7 @@ u64 bch2_recovery_passes_from_stable(u64 v); u64 bch2_fsck_recovery_passes(void); int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); +int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass); int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); int bch2_run_online_recovery_passes(struct bch_fs *); -- cgit v1.2.3 From 0269e27ce3f7be2bd1e565cc17a88e4074facad1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet 
Date: Wed, 9 Oct 2024 21:26:05 -0400 Subject: bcachefs: improved bkey_val_copy() Factor out some common code, add typechecking. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 550db3654f2c..dda07a320488 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -594,13 +594,18 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ _btree_id, _pos, _flags, KEY_TYPE_##_type)) +static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k) +{ + unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k)); + memcpy(dst_v, src_k.v, b); + if (unlikely(b < dst_size)) + memset(dst_v + b, 0, dst_size - b); +} + #define bkey_val_copy(_dst_v, _src_k) \ do { \ - unsigned b = min_t(unsigned, sizeof(*_dst_v), \ - bkey_val_bytes(_src_k.k)); \ - memcpy(_dst_v, _src_k.v, b); \ - if (b < sizeof(*_dst_v)) \ - memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b); \ + BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \ + __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \ } while (0) static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, @@ -609,17 +614,10 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, unsigned val_size, void *val) { struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); - ret = bkey_err(k); + struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); + int ret = bkey_err(k); if (!ret) { - unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size); - - memcpy(val, k.v, b); - if (unlikely(b < sizeof(*val))) - memset((void *) val + b, 0, sizeof(*val) - b); + __bkey_val_copy(val, val_size, k); bch2_trans_iter_exit(trans, &iter); } -- cgit v1.2.3 From 106480e9a869e8d2dd2db34819d04e15ccfd896c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2024 21:51:05 -0400 Subject: bcachefs: Factor out jset_entry_log_msg_bytes() Needed for improved userspace cmd_list_journal Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 9 +++++++++ fs/bcachefs/journal_io.c | 3 +-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 79a80a78c2d8..c5e3824d5771 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1219,6 +1219,15 @@ struct jset_entry_log { u8 d[]; } __packed __aligned(8); +static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l) +{ + unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d); + + while (b && !l->d[b - 1]) + --b; + return b; +} + struct jset_entry_datetime { struct jset_entry entry; __le64 seconds; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index fb35dd336331..7c7595e5369b 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -738,9 +738,8 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); - prt_printf(out, "%.*s", bytes, l->d); + prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d); } static int 
journal_entry_overwrite_validate(struct bch_fs *c, -- cgit v1.2.3 From 9e2f5f79882b855156cd522acb7354e5a7901418 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2024 21:27:11 -0400 Subject: bcachefs: better error message in check_snapshot_tree() If we find a snapshot node and it didn't match the snapshot tree, we should print it. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index feaf2aa0d900..34e01bd8127f 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -506,7 +506,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, break; } } - bch2_trans_iter_exit(trans, &iter); if (!ret && !found) { @@ -536,6 +535,7 @@ static int check_snapshot_tree(struct btree_trans *trans, struct bch_snapshot s; struct bch_subvolume subvol; struct printbuf buf = PRINTBUF; + struct btree_iter snapshot_iter = {}; u32 root_id; int ret; @@ -545,16 +545,27 @@ static int check_snapshot_tree(struct btree_trans *trans, st = bkey_s_c_to_snapshot_tree(k); root_id = le32_to_cpu(st.v->root_snapshot); - ret = bch2_snapshot_lookup(trans, root_id, &s); + struct bkey_s_c_snapshot snapshot_k = + bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, + POS(0, root_id), 0, snapshot); + ret = bkey_err(snapshot_k); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; + if (!ret) + bkey_val_copy(&s, snapshot_k); + if (fsck_err_on(ret || root_id != bch2_snapshot_root(c, root_id) || st.k->p.offset != le32_to_cpu(s.tree), trans, snapshot_tree_to_missing_snapshot, "snapshot tree points to missing/incorrect snapshot:\n %s", - (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + (bch2_bkey_val_to_text(&buf, c, st.s_c), + prt_newline(&buf), + ret + ? prt_printf(&buf, "(%s)", bch2_err_str(ret)) + : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c), + buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); goto err; } @@ -605,6 +616,7 @@ static int check_snapshot_tree(struct btree_trans *trans, } err: fsck_err: + bch2_trans_iter_exit(trans, &snapshot_iter); printbuf_exit(&buf); return ret; } -- cgit v1.2.3 From db514cf6775fa58b45780969e407f678e0a5132c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2024 23:02:04 -0400 Subject: bcachefs: Avoid bch2_btree_id_str() Prefer bch2_btree_id_to_text() - it prints out the integer ID when unknown. 
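Most of the diff below is this mechanical conversion; the typical before/after shape at a call site (abridged, not an actual hunk):

	/* before: only the name is printed, with no numeric id for unknown btrees */
	prt_printf(out, "btree=%s l=%u", bch2_btree_id_str(id), level);

	/* after: unknown ids render as "(unknown btree %u)" */
	bch2_btree_id_level_to_text(out, id, level);

bch2_btree_id_str() itself is kept for now but marked "/* avoid */" in btree_cache.h.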
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 24 +++++++++++--------- fs/bcachefs/bbpos.h | 2 +- fs/bcachefs/btree_cache.c | 37 ++++++++++++++++++------------ fs/bcachefs/btree_cache.h | 3 ++- fs/bcachefs/btree_gc.c | 45 ++++++++++++++++++++++--------------- fs/bcachefs/btree_io.c | 13 ++++++----- fs/bcachefs/btree_iter.c | 32 +++++++++++++------------- fs/bcachefs/btree_journal_iter.c | 5 ++++- fs/bcachefs/btree_node_scan.c | 10 +++++---- fs/bcachefs/btree_update_interior.c | 23 ++++++++++--------- fs/bcachefs/debug.c | 4 +++- fs/bcachefs/disk_accounting.c | 3 ++- fs/bcachefs/journal_io.c | 3 ++- fs/bcachefs/recovery.c | 25 +++++++++++++++------ fs/bcachefs/sysfs.c | 3 ++- 15 files changed, 140 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 654a58132a4d..f323ce4b0b33 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -81,12 +81,11 @@ fsck_err: void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) { - prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", - bch2_btree_id_str(bp->btree_id), - bp->level, - (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), - (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), - bp->bucket_len); + bch2_btree_id_level_to_text(out, bp->btree_id, bp->level); + prt_printf(out, " offset=%llu:%u len=%u pos=", + (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), + (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + bp->bucket_len); bch2_bpos_to_text(out, bp->pos); } @@ -501,9 +500,13 @@ found: goto err; prt_str(&buf, "extents pointing to same space, but first extent checksum bad:"); - prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree)); + prt_printf(&buf, "\n "); + bch2_btree_id_to_text(&buf, btree); + prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, extent); - prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree)); + prt_printf(&buf, "\n "); + bch2_btree_id_to_text(&buf, o_btree); + prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, extent2); struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); @@ -638,8 +641,9 @@ check_existing_bp: goto err; missing: printbuf_reset(&buf); - prt_printf(&buf, "missing backpointer for btree=%s l=%u ", - bch2_btree_id_str(bp.btree_id), bp.level); + prt_str(&buf, "missing backpointer for btree="); + bch2_btree_id_to_text(&buf, bp.btree_id); + prt_printf(&buf, " l=%u ", bp.level); bch2_bkey_val_to_text(&buf, c, orig_k); prt_printf(&buf, "\n got: "); bch2_bkey_val_to_text(&buf, c, bp_k); diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h index be2edced5213..63abe17f35ea 100644 --- a/fs/bcachefs/bbpos.h +++ b/fs/bcachefs/bbpos.h @@ -29,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos) static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) { - prt_str(out, bch2_btree_id_str(pos.btree)); + bch2_btree_id_to_text(out, pos.btree); prt_char(out, ':'); bch2_bpos_to_text(out, pos.pos); } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 7123019ab3bc..a0a406b0c7bc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1004,16 +1004,14 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) return; prt_printf(&buf, - "btree node header doesn't match ptr\n" - "btree %s level %u\n" - "ptr: ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + "btree node header doesn't match ptr: "); + 
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, "\nptr: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_printf(&buf, "\nheader: btree %s level %llu\n" - "min ", - bch2_btree_id_str(BTREE_NODE_ID(b->data)), - BTREE_NODE_LEVEL(b->data)); + prt_str(&buf, "\nheader: "); + bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data)); + prt_str(&buf, "\nmin "); bch2_bpos_to_text(&buf, b->data->min_key); prt_printf(&buf, "\nmax "); @@ -1398,12 +1396,19 @@ void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) prt_printf(out, "(unknown btree %u)", btree); } +void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level) +{ + prt_str(out, "btree="); + bch2_btree_id_to_text(out, btree); + prt_printf(out, " level=%u", level); +} + void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { - prt_printf(out, "%s level %u/%u\n ", - bch2_btree_id_str(b->c.btree_id), - b->c.level, - bch2_btree_id_root(c, b->c.btree_id)->level); + bch2_btree_id_to_text(out, b->c.btree_id); + prt_printf(out, " level %u/%u\n ", + b->c.level, + bch2_btree_id_root(c, b->c.btree_id)->level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); } @@ -1478,8 +1483,12 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); prt_newline(out); - for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) - prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]); + for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { + bch2_btree_id_to_text(out, i); + prt_printf(out, "\t"); + prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); + prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); + } prt_newline(out); prt_printf(out, "freed:\t%zu\n", bc->nr_freed); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 66e86d1a178d..6cfacacb6769 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -138,8 +138,9 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) return bch2_btree_id_root(c, b->c.btree_id)->b; } -const char *bch2_btree_id_str(enum btree_id); +const char *bch2_btree_id_str(enum btree_id); /* avoid */ void bch2_btree_id_to_text(struct printbuf *, enum btree_id); +void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned); void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 81dcf9e512c0..3c4e66da1ca4 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -56,8 +56,8 @@ void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p) { prt_str(out, bch2_gc_phase_strs[p->phase]); prt_char(out, ' '); - bch2_btree_id_to_text(out, p->btree); - prt_printf(out, " l=%u ", p->level); + bch2_btree_id_level_to_text(out, p->btree, p->level); + prt_char(out, ' '); bch2_bpos_to_text(out, p->pos); } @@ -209,8 +209,9 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * if (bpos_eq(expected_start, cur->data->min_key)) return 0; - prt_printf(&buf, " at btree %s level %u:\n parent: ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_printf(&buf, " at "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_printf(&buf, ":\n parent: "); 
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); if (prev) { @@ -277,8 +278,9 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, if (bpos_eq(child->key.k.p, b->key.k.p)) return 0; - prt_printf(&buf, "at btree %s level %u:\n parent: ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_printf(&buf, " at "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_printf(&buf, ":\n parent: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_str(&buf, "\n child: "); @@ -341,14 +343,14 @@ again: ret = PTR_ERR_OR_ZERO(cur); printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1); + prt_char(&buf, ' '); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), trans, btree_node_unreadable, - "Topology repair: unreadable btree node at btree %s level %u:\n" + "Topology repair: unreadable btree node at\n" " %s", - bch2_btree_id_str(b->c.btree_id), - b->c.level - 1, buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); cur = NULL; @@ -370,7 +372,7 @@ again: break; if (bch2_btree_node_is_stale(c, cur)) { - bch_info(c, "btree node %s older than nodes found by scanning", buf.buf); + bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf); six_unlock_read(&cur->c.lock); bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, @@ -478,14 +480,13 @@ again: } printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); if (mustfix_fsck_err_on(!have_child, trans, btree_node_topology_interior_node_empty, - "empty interior btree node at btree %s level %u\n" - " %s", - bch2_btree_id_str(b->c.btree_id), - b->c.level, buf.buf)) + "empty interior btree node at %s", buf.buf)) ret = DROP_THIS_NODE; err: fsck_err: @@ -511,6 +512,7 @@ int bch2_check_topology(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct bpos pulled_from_scan = POS_MIN; + struct printbuf buf = PRINTBUF; int ret = 0; bch2_trans_srcu_unlock(trans); @@ -519,19 +521,21 @@ int bch2_check_topology(struct bch_fs *c) struct btree_root *r = bch2_btree_id_root(c, i); bool reconstructed_root = false; + bch2_btree_id_to_text(&buf, i); + if (r->error) { ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); if (ret) break; reconstruct_root: - bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i)); + bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); r->alive = false; r->error = 0; if (!bch2_btree_has_scanned_nodes(c, i)) { mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing, - "no nodes found for btree %s, continue?", bch2_btree_id_str(i)); + "no nodes found for btree %s, continue?", buf.buf); bch2_btree_root_alloc_fake_trans(trans, i, 0); } else { bch2_btree_root_alloc_fake_trans(trans, i, 1); @@ -560,13 +564,14 @@ reconstruct_root: if (!reconstructed_root) goto reconstruct_root; - bch_err(c, "empty btree root %s", bch2_btree_id_str(i)); + bch_err(c, "empty btree root %s", buf.buf); bch2_btree_root_alloc_fake_trans(trans, i, 0); r->alive = false; ret = 0; } } fsck_err: + printbuf_exit(&buf); bch2_trans_put(trans); return ret; } @@ -713,6 +718,7 @@ static int bch2_gc_btrees(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id ids[BTREE_ID_NR]; + struct printbuf buf = PRINTBUF; unsigned i; int ret = 0; @@ -731,10 +737,13 @@ 
static int bch2_gc_btrees(struct bch_fs *c) if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), trans, btree_node_read_error, "btree node read error for %s", - bch2_btree_id_str(btree))) + (printbuf_reset(&buf), + bch2_btree_id_to_text(&buf, btree), + buf.buf))) ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); } fsck_err: + printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 839d68802e42..89a42ee81e5c 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -25,9 +25,8 @@ static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) { - prt_printf(out, "btree=%s l=%u seq %llux\n", - bch2_btree_id_str(BTREE_NODE_ID(bn)), - (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq); + bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); + prt_printf(out, " seq %llux\n", bn->keys.seq); prt_str(out, "min: "); bch2_bpos_to_text(out, bn->min_key); prt_newline(out); @@ -1343,9 +1342,11 @@ start: !btree_node_read_error(b) && c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->key.k.p); - bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", - __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", + __func__, buf.buf); bch2_btree_node_rewrite_async(c, b); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 01152fd5ac57..07bce85dafaf 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1448,10 +1448,11 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) trans_for_each_update(trans, i) { struct bkey_s_c old = { &i->old_k, i->old_v }; - prt_printf(buf, "update: btree=%s cached=%u %pS\n", - bch2_btree_id_str(i->btree_id), - i->cached, - (void *) i->ip_allocated); + prt_str(buf, "update: btree="); + bch2_btree_id_to_text(buf, i->btree_id); + prt_printf(buf, " cached=%u %pS\n", + i->cached, + (void *) i->ip_allocated); prt_printf(buf, " old "); bch2_bkey_val_to_text(buf, trans->c, old); @@ -1484,13 +1485,13 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra { struct btree_path *path = trans->paths + path_idx; - prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ", + prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ", path_idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 'S' : ' ', - path->cached ? 'C' : 'B', - bch2_btree_id_str(path->btree_id), - path->level); + path->cached ? 'C' : 'B'); + bch2_btree_id_level_to_text(out, path->btree_id, path->level); + prt_str(out, " pos "); bch2_bpos_to_text(out, path->pos); if (!path->cached && btree_node_locked(path, path->level)) { @@ -3336,8 +3337,9 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, pid = owner ? owner->pid : 0; rcu_read_unlock(); - prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b', - b->level, bch2_btree_id_str(b->btree_id)); + prt_printf(out, "\t%px %c ", b, b->cached ? 
'c' : 'b'); + bch2_btree_id_to_text(out, b->btree_id); + prt_printf(out, " l=%u:", b->level); bch2_bpos_to_text(out, btree_node_pos(b)); prt_printf(out, "\t locks %u:%u:%u held by pid %u", @@ -3376,11 +3378,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) if (!path->nodes_locked) continue; - prt_printf(out, " path %u %c l=%u %s:", - idx, - path->cached ? 'c' : 'b', - path->level, - bch2_btree_id_str(path->btree_id)); + prt_printf(out, " path %u %c ", + idx, + path->cached ? 'c' : 'b'); + bch2_btree_id_to_text(out, path->btree_id); + prt_printf(out, " l=%u:", path->level); bch2_bpos_to_text(out, path->pos); prt_newline(out); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index c1657182c275..924b5e3a4390 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -628,8 +628,11 @@ void bch2_journal_keys_dump(struct bch_fs *c) darray_for_each(*keys, i) { printbuf_reset(&buf); + prt_printf(&buf, "btree="); + bch2_btree_id_to_text(&buf, i->btree_id); + prt_printf(&buf, " l=%u ", i->level); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf); + pr_err("%s", buf.buf); } printbuf_exit(&buf); } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 30131c3bdd97..4b4df31d4b95 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -22,9 +22,9 @@ struct find_btree_nodes_worker { static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) { - prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ", - bch2_btree_id_str(n->btree_id), n->level, n->seq, - n->journal_seq, n->cookie); + bch2_btree_id_level_to_text(out, n->btree_id, n->level); + prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ", + n->seq, n->journal_seq, n->cookie); bch2_bpos_to_text(out, n->min_key); prt_str(out, "-"); bch2_bpos_to_text(out, n->max_key); @@ -499,7 +499,9 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, if (c->opts.verbose) { struct printbuf buf = PRINTBUF; - prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level); + prt_str(&buf, "recovery "); + bch2_btree_id_level_to_text(&buf, btree, level); + prt_str(&buf, " "); bch2_bpos_to_text(&buf, node_min); prt_str(&buf, " - "); bch2_bpos_to_text(&buf, node_max); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d596ef93239f..d62de3f79b29 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -97,9 +97,9 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) bch2_topology_error(c); printbuf_reset(&buf); - prt_str(&buf, "end of prev node doesn't match start of next node\n"), - prt_printf(&buf, " in btree %s level %u node ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_str(&buf, "end of prev node doesn't match start of next node\n in "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_str(&buf, "\n prev "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); @@ -118,9 +118,9 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) bch2_topology_error(c); printbuf_reset(&buf); - prt_str(&buf, "empty interior node\n"); - prt_printf(&buf, " in btree %s level %u node ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + 
prt_str(&buf, "empty interior node\n in "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); @@ -129,9 +129,9 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) bch2_topology_error(c); printbuf_reset(&buf); - prt_str(&buf, "last child node doesn't end at end of parent node\n"); - prt_printf(&buf, " in btree %s level %u node ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_str(&buf, "last child node doesn't end at end of parent node\n in "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_str(&buf, "\n last key "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); @@ -2575,8 +2575,9 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update prt_printf(out, "%ps: ", (void *) as->ip_started); bch2_trans_commit_flags_to_text(out, as->flags); - prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", - bch2_btree_id_str(as->btree_id), + prt_str(out, " "); + bch2_btree_id_to_text(out, as->btree_id); + prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", as->update_level_start, as->update_level_end, bch2_btree_update_modes[as->mode], diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 45aec1afdb0e..b5de52a50d10 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -472,7 +472,9 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); - prt_printf(out, "%px btree=%s l=%u\n", b, bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_printf(out, "%px ", b); + bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); + prt_printf(out, "\n"); printbuf_indent_add(out, 2); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 07eb8fa1b026..38b563113cfb 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -217,7 +217,8 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po prt_printf(out, "id=%u", k->snapshot.id); break; case BCH_DISK_ACCOUNTING_btree: - prt_printf(out, "btree=%s", bch2_btree_id_str(k->btree.id)); + prt_str(out, "btree="); + bch2_btree_id_to_text(out, k->btree.id); break; } } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7c7595e5369b..9bc0caa9d5e4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -421,7 +421,8 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs bch2_prt_jset_entry_type(out, entry->type); prt_str(out, ": "); } - prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); + bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); + prt_char(out, ' '); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); first = false; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b1c83e72c0d8..0e5a53541ce4 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -42,7 +42,10 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) mutex_lock(&c->sb_lock); if (!(c->sb.btrees_lost_data & b)) { - bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree)); + struct printbuf buf = PRINTBUF; + bch2_btree_id_to_text(&buf, btree); + bch_err(c, 
"flagging btree %s lost data", buf.buf); + printbuf_exit(&buf); bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); } @@ -385,10 +388,13 @@ int bch2_journal_replay(struct bch_fs *c) ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim : 0), bch2_journal_replay_key(trans, k)); - bch_err_msg(c, ret, "while replaying key at btree %s level %u:", - bch2_btree_id_str(k->btree_id), k->level); - if (ret) + if (ret) { + struct printbuf buf = PRINTBUF; + bch2_btree_id_level_to_text(&buf, k->btree_id, k->level); + bch_err_msg(c, ret, "while replaying key at %s:", buf.buf); + printbuf_exit(&buf); goto err; + } BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); } @@ -536,6 +542,7 @@ static int journal_replay_early(struct bch_fs *c, static int read_btree_roots(struct bch_fs *c) { + struct printbuf buf = PRINTBUF; int ret = 0; for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { @@ -547,14 +554,17 @@ static int read_btree_roots(struct bch_fs *c) if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) continue; + printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, i, r->level); + if (mustfix_fsck_err_on((ret = r->error), c, btree_root_bkey_invalid, "invalid btree root %s", - bch2_btree_id_str(i)) || + buf.buf) || mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), c, btree_root_read_error, - "error reading btree root %s l=%u: %s", - bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { + "error reading btree root %s: %s", + buf.buf, bch2_err_str(ret))) { if (btree_id_is_alloc(i)) r->error = 0; @@ -572,6 +582,7 @@ static int read_btree_roots(struct bch_fs *c) } } fsck_err: + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 03e59f86f360..3270bfab9466 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -302,7 +302,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) { - prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree)); + bch2_btree_id_to_text(out, c->gc_gens_btree); + prt_printf(out, ": "); bch2_bpos_to_text(out, c->gc_gens_pos); prt_printf(out, "\n"); } -- cgit v1.2.3 From d55d4a0ca27adea2e6bb404eb9b65a19036dd047 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Sep 2024 14:57:26 -0400 Subject: bcachefs: Refactor new stripe path to reduce dependencies on ec_stripe_head We need to add a path for reshaping existing stripes (for e.g. device removal), and this new path won't necessarily use ec_stripe_head. Refactor the code to avoid unnecessary references to it for clarity. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 196 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 104 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 075bfd1cbb15..8b727d63af3e 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1707,7 +1707,7 @@ static void ec_stripe_key_init(struct bch_fs *c, set_bkey_val_u64s(&s->k, u64s); } -static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) +static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s; @@ -1715,7 +1715,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) - return -BCH_ERR_ENOMEM_ec_new_stripe_alloc; + return NULL; mutex_init(&s->lock); closure_init(&s->iodone, NULL); @@ -1730,10 +1730,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, s->nr_parity, h->blocksize, h->disk_label); - - h->s = s; - h->nr_created++; - return 0; + return s; } static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) @@ -1878,25 +1875,26 @@ err: return h; } -static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, +static int new_stripe_alloc_buckets(struct btree_trans *trans, + struct ec_stripe_head *h, struct ec_stripe_new *s, enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; struct bch_devs_mask devs = h->devs; struct open_bucket *ob; struct open_buckets buckets; - struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; int ret = 0; - BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); - BUG_ON(v->nr_redundant != h->s->nr_parity); + BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); + BUG_ON(v->nr_redundant != s->nr_parity); /* * We bypass the sector allocator which normally does this: */ bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); - for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { + for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { /* * Note: we don't yet repair invalid blocks (failed/removed * devices) when reusing stripes - we still need a codepath to @@ -1906,21 +1904,21 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) __clear_bit(v->ptrs[i].dev, devs.d); - if (i < h->s->nr_data) + if (i < s->nr_data) nr_have_data++; else nr_have_parity++; } - BUG_ON(nr_have_data > h->s->nr_data); - BUG_ON(nr_have_parity > h->s->nr_parity); + BUG_ON(nr_have_data > s->nr_data); + BUG_ON(nr_have_parity > s->nr_parity); buckets.nr = 0; - if (nr_have_parity < h->s->nr_parity) { + if (nr_have_parity < s->nr_parity) { ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->parity_stripe, &devs, - h->s->nr_parity, + s->nr_parity, &nr_have_parity, &have_cache, 0, BCH_DATA_parity, @@ -1928,14 +1926,14 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ cl); open_bucket_for_each(c, &buckets, ob, i) { - j = find_next_zero_bit(h->s->blocks_gotten, - h->s->nr_data + h->s->nr_parity, - h->s->nr_data); - BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + j = find_next_zero_bit(s->blocks_gotten, + s->nr_data + s->nr_parity, + s->nr_data); + BUG_ON(j >= s->nr_data + 
s->nr_parity); - h->s->blocks[j] = buckets.v[i]; + s->blocks[j] = buckets.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, h->s->blocks_gotten); + __set_bit(j, s->blocks_gotten); } if (ret) @@ -1943,11 +1941,11 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ } buckets.nr = 0; - if (nr_have_data < h->s->nr_data) { + if (nr_have_data < s->nr_data) { ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->block_stripe, &devs, - h->s->nr_data, + s->nr_data, &nr_have_data, &have_cache, 0, BCH_DATA_user, @@ -1955,13 +1953,13 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ cl); open_bucket_for_each(c, &buckets, ob, i) { - j = find_next_zero_bit(h->s->blocks_gotten, - h->s->nr_data, 0); - BUG_ON(j >= h->s->nr_data); + j = find_next_zero_bit(s->blocks_gotten, + s->nr_data, 0); + BUG_ON(j >= s->nr_data); - h->s->blocks[j] = buckets.v[i]; + s->blocks[j] = buckets.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, h->s->blocks_gotten); + __set_bit(j, s->blocks_gotten); } if (ret) @@ -2007,73 +2005,78 @@ static s64 get_existing_stripe(struct bch_fs *c, return ret; } -static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) +static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s) { - struct bch_fs *c = trans->c; - struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; - struct bch_stripe *existing_v; + struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v; unsigned i; - s64 idx; - int ret; - /* - * If we can't allocate a new stripe, and there's no stripes with empty - * blocks for us to reuse, that means we have to wait on copygc: - */ - idx = get_existing_stripe(c, h); - if (idx < 0) - return -BCH_ERR_stripe_alloc_blocked; - - ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); - bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, - "reading stripe key: %s", bch2_err_str(ret)); - if (ret) { - bch2_stripe_close(c, h->s); - return ret; - } - - existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; - - BUG_ON(existing_v->nr_redundant != h->s->nr_parity); - h->s->nr_data = existing_v->nr_blocks - + BUG_ON(existing_v->nr_redundant != s->nr_parity); + s->nr_data = existing_v->nr_blocks - existing_v->nr_redundant; - ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); + int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); if (ret) { - bch2_stripe_close(c, h->s); + bch2_stripe_close(c, s); return ret; } - BUG_ON(h->s->existing_stripe.size != h->blocksize); - BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); + BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); /* * Free buckets we initially allocated - they might conflict with * blocks from the stripe we're reusing: */ - for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { - bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); - h->s->blocks[i] = 0; + for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) { + bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]); + s->blocks[i] = 0; } - memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); - memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); + memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten)); + memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated)); - for (i = 0; i < 
existing_v->nr_blocks; i++) { + for (unsigned i = 0; i < existing_v->nr_blocks; i++) { if (stripe_blockcount_get(existing_v, i)) { - __set_bit(i, h->s->blocks_gotten); - __set_bit(i, h->s->blocks_allocated); + __set_bit(i, s->blocks_gotten); + __set_bit(i, s->blocks_allocated); } - ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); + ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone); } - bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); - h->s->have_existing_stripe = true; + bkey_copy(&s->new_stripe.key, &s->existing_stripe.key); + s->have_existing_stripe = true; return 0; } -static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) +static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h, + struct ec_stripe_new *s) +{ + struct bch_fs *c = trans->c; + s64 idx; + int ret; + + /* + * If we can't allocate a new stripe, and there's no stripes with empty + * blocks for us to reuse, that means we have to wait on copygc: + */ + idx = get_existing_stripe(c, h); + if (idx < 0) + return -BCH_ERR_stripe_alloc_blocked; + + ret = get_stripe_key_trans(trans, idx, &s->existing_stripe); + bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, + "reading stripe key: %s", bch2_err_str(ret)); + if (ret) { + bch2_stripe_close(c, s); + return ret; + } + + return init_new_stripe_from_existing(c, s); +} + +static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h, + struct ec_stripe_new *s) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -2082,15 +2085,19 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; - if (!h->s->res.sectors) { - ret = bch2_disk_reservation_get(c, &h->s->res, + if (!s->res.sectors) { + ret = bch2_disk_reservation_get(c, &s->res, h->blocksize, - h->s->nr_parity, + s->nr_parity, BCH_DISK_RESERVATION_NOFAIL); if (ret) return ret; } + /* + * Allocate stripe slot + * XXX: we're going to need a bitrange btree of free stripes + */ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { if (bkey_gt(k.k->p, POS(0, U32_MAX))) { @@ -2105,7 +2112,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st } if (bkey_deleted(k.k) && - bch2_try_open_stripe(c, h->s, k.k->p.offset)) + bch2_try_open_stripe(c, s, k.k->p.offset)) break; } @@ -2116,16 +2123,16 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st ret = ec_stripe_mem_alloc(trans, &iter); if (ret) { - bch2_stripe_close(c, h->s); + bch2_stripe_close(c, s); goto err; } - h->s->new_stripe.key.k.p = iter.pos; + s->new_stripe.key.k.p = iter.pos; out: bch2_trans_iter_exit(trans, &iter); return ret; err: - bch2_disk_reservation_put(c, &h->s->res); + bch2_disk_reservation_put(c, &s->res); goto out; } @@ -2156,22 +2163,27 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, return h; if (!h->s) { - ret = ec_new_stripe_alloc(c, h); - if (ret) { + h->s = ec_new_stripe_alloc(c, h); + if (!h->s) { + ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc; bch_err(c, "failed to allocate new stripe"); goto err; } + + h->nr_created++; } - if (h->s->allocated) + struct ec_stripe_new *s = h->s; + + if (s->allocated) goto allocated; - if (h->s->have_existing_stripe) + if (s->have_existing_stripe) goto alloc_existing; /* First, try to allocate a full 
stripe: */ - ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h); + ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h, s); if (!ret) goto allocate_buf; if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || @@ -2183,15 +2195,15 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, * existing stripe: */ while (1) { - ret = __bch2_ec_stripe_head_reuse(trans, h); + ret = __bch2_ec_stripe_head_reuse(trans, h, s); if (!ret) break; if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; if (watermark == BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h); + ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h, s); if (ret) goto err; goto allocate_buf; @@ -2209,19 +2221,19 @@ alloc_existing: * Retry allocating buckets, with the watermark for this * particular write: */ - ret = new_stripe_alloc_buckets(trans, h, watermark, cl); + ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); if (ret) goto err; allocate_buf: - ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize); + ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize); if (ret) goto err; - h->s->allocated = true; + s->allocated = true; allocated: - BUG_ON(!h->s->idx); - BUG_ON(!h->s->new_stripe.data[0]); + BUG_ON(!s->idx); + BUG_ON(!s->new_stripe.data[0]); BUG_ON(trans->restarted); return h; err: -- cgit v1.2.3 From 8b22abb4c84058e9533d71a4814e54316ba2621f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Oct 2024 22:53:09 -0400 Subject: bcachefs: -o norecovery now bails out of recovery earlier -o norecovery (used by the dump tool) should be doing the absolute minimum amount of work to get the filesystem up and readable; we shouldn't be running check and repair code, or going read-write. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0e5a53541ce4..bc2fd174bb32 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -690,8 +690,13 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (c->opts.norecovery) - c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1; + if (c->opts.norecovery) { + c->opts.recovery_pass_last = c->opts.recovery_pass_last + ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) + : BCH_RECOVERY_PASS_snapshots_read; + c->opts.nochanges = true; + c->opts.read_only = true; + } mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); -- cgit v1.2.3 From fb8c835b18d48dac953a5d755a8e90b0d8fb9c29 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Oct 2024 22:50:48 -0400 Subject: bcachefs: bch2_journal_meta() takes ref on c->writes This part of addressing https://github.com/koverstreet/bcachefs/issues/656 where we're getting stuck in bch2_journal_meta() in the dump tool. We shouldn't be invoking the journal without a ref on c->writes (if we're not RW), and there's no reason for the dump tool to be going read-write. 
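The shape of the fix, condensed from the diff below: the journal write itself moves into __bch2_journal_meta(), and the exported bch2_journal_meta() only runs it under a new BCH_WRITE_REF_journal write ref, bailing out with -EROFS if the filesystem isn't accepting writes:

	int bch2_journal_meta(struct journal *j)
	{
		struct bch_fs *c = container_of(j, struct bch_fs, journal);

		/* refuse to touch the journal unless we can take a write ref */
		if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal))
			return -EROFS;

		int ret = __bch2_journal_meta(j);
		bch2_write_ref_put(c, BCH_WRITE_REF_journal);
		return ret;
	}

Callers that run outside the normal write-ref lifetime (e.g. bch2_fs_journal_stop()) use __bch2_journal_meta() directly.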
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/journal.c | 27 +++++++++++++++++---------- fs/bcachefs/recovery.c | 4 +--- 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index fbd89f91625d..d4d95ef6791f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -688,6 +688,7 @@ struct btree_trans_buf { ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) #define BCH_WRITE_REFS() \ + x(journal) \ x(trans) \ x(write) \ x(promote) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2dc0d60c1745..2cf8f24d50cc 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -831,19 +831,14 @@ out: return ret; } -int bch2_journal_meta(struct journal *j) +static int __bch2_journal_meta(struct journal *j) { - struct journal_buf *buf; - struct journal_res res; - int ret; - - memset(&res, 0, sizeof(res)); - - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + struct journal_res res = {}; + int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); if (ret) return ret; - buf = j->buf + (res.seq & JOURNAL_BUF_MASK); + struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK); buf->must_flush = true; if (!buf->flush_time) { @@ -856,6 +851,18 @@ int bch2_journal_meta(struct journal *j) return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE); } +int bch2_journal_meta(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) + return -EROFS; + + int ret = __bch2_journal_meta(j); + bch2_write_ref_put(c, BCH_WRITE_REF_journal); + return ret; +} + /* block/unlock the journal: */ void bch2_journal_unblock(struct journal *j) @@ -1193,7 +1200,7 @@ void bch2_fs_journal_stop(struct journal *j) * Always write a new journal entry, to make sure the clock hands are up * to date (and match the superblock) */ - bch2_journal_meta(j); + __bch2_journal_meta(j); journal_quiesce(j); cancel_delayed_work_sync(&j->write_work); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index bc2fd174bb32..431698189090 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -910,11 +910,9 @@ use_clean: set_bit(BCH_FS_accounting_replay_done, &c->flags); /* fsync if we fixed errors */ - if (test_bit(BCH_FS_errors_fixed, &c->flags) && - bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) { + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { bch2_journal_flush_all_pins(&c->journal); bch2_journal_meta(&c->journal); - bch2_write_ref_put(c, BCH_WRITE_REF_fsync); } /* If we fixed errors, verify that fs is actually clean now: */ -- cgit v1.2.3 From be5a7be1062b2e588519d7ed68ff2e8f4ed0a42a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Oct 2024 14:07:44 -0400 Subject: bcachefs: Fix warning about passing flex array member by value this showed up when building in userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 38b563113cfb..55a00018dc8b 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -244,10 +244,10 @@ void bch2_accounting_swab(struct bkey_s k) } static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, - struct disk_accounting_pos acc) + struct disk_accounting_pos *acc) { - unsafe_memcpy(r, &acc.replicas, - 
replicas_entry_bytes(&acc.replicas), + unsafe_memcpy(r, &acc->replicas, + replicas_entry_bytes(&acc->replicas), "variable length struct"); } @@ -258,7 +258,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc switch (acc_k.type) { case BCH_DISK_ACCOUNTING_replicas: - __accounting_to_replicas(r, acc_k); + __accounting_to_replicas(r, &acc_k); return true; default: return false; @@ -626,7 +626,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, switch (acc.type) { case BCH_DISK_ACCOUNTING_replicas: { struct bch_replicas_padded r; - __accounting_to_replicas(&r.e, acc); + __accounting_to_replicas(&r.e, &acc); for (unsigned i = 0; i < r.e.nr_devs; i++) if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && -- cgit v1.2.3 From 27de0ee39f810dab2e948d2c465f8fcf8cbf9f8c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Oct 2024 21:35:44 -0400 Subject: bcachefs: Add block plugging to read paths This will help with some of the btree_trans srcu lock hold time warnings that are still turning up; submit_bio() can block for awhile if the device is sufficiently congested. It's not a perfect solution since blk_plug bios are submitted when scheduling; we might want a way to disable the "submit on context switch" behaviour, or switch to our own plugging in the future. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 19 ++++++++++++++++++- fs/bcachefs/fs-io-direct.c | 5 +++++ 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 95972809e76d..0923f38a2fcd 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -248,6 +248,7 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_io_opts opts; struct folio *folio; struct readpages_iter readpages_iter; + struct blk_plug plug; bch2_inode_opts_get(&opts, c, &inode->ei_inode); @@ -255,6 +256,16 @@ void bch2_readahead(struct readahead_control *ractl) if (ret) return; + /* + * Besides being a general performance optimization, plugging helps with + * avoiding btree transaction srcu warnings - submitting a bio can + * block, and we don't want todo that with the transaction locked. + * + * However, plugged bios are submitted when we schedule; we ideally + * would have our own scheduler hook to call unlock_long() before + * scheduling. 
+ */ + blk_start_plug(&plug); bch2_pagecache_add_get(inode); struct btree_trans *trans = bch2_trans_get(c); @@ -281,7 +292,7 @@ void bch2_readahead(struct readahead_control *ractl) bch2_trans_put(trans); bch2_pagecache_add_put(inode); - + blk_finish_plug(&plug); darray_exit(&readpages_iter.folios); } @@ -296,9 +307,13 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_read_bio *rbio; struct bch_io_opts opts; + struct blk_plug plug; int ret; DECLARE_COMPLETION_ONSTACK(done); + BUG_ON(folio_test_uptodate(folio)); + BUG_ON(folio_test_dirty(folio)); + if (!bch2_folio_create(folio, GFP_KERNEL)) return -ENOMEM; @@ -313,7 +328,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + blk_start_plug(&plug); bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); + blk_finish_plug(&plug); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 6d3a05ae5da8..2089c36b5866 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -70,6 +70,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct bch_io_opts opts; struct dio_read *dio; struct bio *bio; + struct blk_plug plug; loff_t offset = req->ki_pos; bool sync = is_sync_kiocb(req); size_t shorten; @@ -128,6 +129,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) */ dio->should_dirty = iter_is_iovec(iter); + blk_start_plug(&plug); + goto start; while (iter->count) { bio = bio_alloc_bioset(NULL, @@ -160,6 +163,8 @@ start: bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); } + blk_finish_plug(&plug); + iter->count += shorten; if (sync) { -- cgit v1.2.3 From e0c8369bc8444daf0d68d23bcae472d11680d49f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 19:02:44 -0400 Subject: bcachefs: Add version check for bch_btree_ptr_v2.sectors_written validate A user popped up with a very old (0.11) filesystem that needed repair and wasn't recently backed up. Reported-by: Manoa Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 37e3d69bec06..85b98c782e1b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -203,7 +203,8 @@ int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, c, btree_ptr_v2_min_key_bad, "min_key > key"); - if (flags & BCH_VALIDATE_write) + if ((flags & BCH_VALIDATE_write) && + c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written) bkey_fsck_err_on(!bp.v->sectors_written, c, btree_ptr_v2_written_0, "sectors_written == 0"); -- cgit v1.2.3 From de902e3b4a9881ae02d414d427ad56cc384c9bf1 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 19 Oct 2024 14:25:27 +0200 Subject: bcachefs: Use str_write_read() helper function Remove hard-coded strings by using the helper function str_write_read(). 
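For reference, str_write_read() amounts to the following string-choice helper - a sketch inferred from the replacement in the diff below, which swaps it in for the open-coded rw ? "write" : "read" ternary:

	static inline const char *str_write_read(bool v)
	{
		/* map a write/read direction flag to the matching string */
		return v ? "write" : "read";
	}

The printed output of the clock entry is unchanged; only the hard-coded strings go away.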
Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9bc0caa9d5e4..768a3b950997 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,6 +17,8 @@ #include "sb-clean.h" #include "trace.h" +#include + void bch2_journal_pos_from_member_info_set(struct bch_fs *c) { lockdep_assert_held(&c->sb_lock); @@ -666,7 +668,7 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); - prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); + prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time)); } static int journal_entry_dev_usage_validate(struct bch_fs *c, -- cgit v1.2.3 From 751d869710ca91b6b2c6f4235137c71eb054ce02 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 20 Oct 2024 13:20:46 +0200 Subject: bcachefs: Use str_write_read() helper in ec_block_endio() Remove hard-coded strings by using the helper function str_write_read(). Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 8b727d63af3e..b46bf00c4a67 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -26,6 +26,7 @@ #include "util.h" #include +#include #ifdef __KERNEL__ @@ -732,7 +733,7 @@ static void ec_block_endio(struct bio *bio) ? BCH_MEMBER_ERROR_write : BCH_MEMBER_ERROR_read, "erasure coding %s error: %s", - bio_data_dir(bio) ? "write" : "read", + str_write_read(bio_data_dir(bio)), bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); -- cgit v1.2.3 From ac9826f14739023bccf1345e6e4ddb0461fa9a2e Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 26 Oct 2024 12:47:23 +0200 Subject: bcachefs: Use str_write_read() helper in write_super_endio() Remove hard-coded strings by using the str_write_read() helper function. Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 7c71594f6a8b..c83bd3dedb1b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -23,6 +23,7 @@ #include #include +#include static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { }; @@ -878,7 +879,7 @@ static void write_super_endio(struct bio *bio) ? BCH_MEMBER_ERROR_write : BCH_MEMBER_ERROR_read, "superblock %s error: %s", - bio_data_dir(bio) ? "write" : "read", + str_write_read(bio_data_dir(bio)), bch2_blk_status_to_str(bio->bi_status))) ca->sb_write_error = 1; -- cgit v1.2.3 From 901ff6555ba02dd917aa65b1105c9715e25dc994 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 26 Oct 2024 17:47:04 +0200 Subject: bcachefs: Annotate struct bucket_gens with __counted_by() Add the __counted_by compiler attribute to the flexible array member b to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Use struct_size() to calculate the number of bytes to be allocated. Update bucket_gens->nbuckets and bucket_gens->nbuckets_minus_first when resizing. Compile-tested only. 
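The resulting pattern, taken from the diff below: the flexible array names its length field in the annotation, and the allocation computes its size with struct_size(), so the bound the compiler checks against and the bytes actually allocated cannot drift apart:

	struct bucket_gens {
		...
		size_t		nbuckets;
		size_t		nbuckets_minus_first;
		u8		b[] __counted_by(nbuckets);
	};

	/* allocate header plus nbuckets gen bytes in one go */
	bucket_gens = kvmalloc(struct_size(bucket_gens, b, nbuckets),
			       GFP_KERNEL|__GFP_ZERO);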
Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 13 ++++++++----- fs/bcachefs/buckets_types.h | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ec7d9a59bea9..8bd17667e243 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1266,8 +1266,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) BUG_ON(resize && ca->buckets_nouse); - if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, - GFP_KERNEL|__GFP_ZERO))) { + bucket_gens = kvmalloc(struct_size(bucket_gens, b, nbuckets), + GFP_KERNEL|__GFP_ZERO); + if (!bucket_gens) { ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; } @@ -1285,11 +1286,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { - size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); - + bucket_gens->nbuckets = min(bucket_gens->nbuckets, + old_bucket_gens->nbuckets); + bucket_gens->nbuckets_minus_first = + bucket_gens->nbuckets - bucket_gens->first_bucket; memcpy(bucket_gens->b, old_bucket_gens->b, - n); + bucket_gens->nbuckets); } rcu_assign_pointer(ca->bucket_gens, bucket_gens); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 28bd09a253c8..7174047b8e92 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -24,7 +24,7 @@ struct bucket_gens { u16 first_bucket; size_t nbuckets; size_t nbuckets_minus_first; - u8 b[]; + u8 b[] __counted_by(nbuckets); }; struct bch_dev_usage { -- cgit v1.2.3 From a1ca525b82312ad42adb11a59fb103243b28407b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 01:11:29 -0400 Subject: bcachefs: avoid 'unsigned flags' flags should have actual types, where possible: fix btree_update.h helpers Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 70b3c989fac2..7e71c4d1111d 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -244,7 +244,8 @@ static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *tra KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k, unsigned flags, + struct bkey_s_c *k, + enum btree_iter_update_trigger_flags flags, unsigned type, unsigned min_bytes) { struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); @@ -261,8 +262,9 @@ static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, str return mut; } -static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k, unsigned flags) +static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c *k, + enum btree_iter_update_trigger_flags flags) { return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); } @@ -274,7 +276,8 @@ static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struc static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type, unsigned min_bytes) + enum 
btree_iter_update_trigger_flags flags, + unsigned type, unsigned min_bytes) { struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags|BTREE_ITER_intent, type); @@ -289,7 +292,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); } @@ -297,7 +300,8 @@ static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *tran static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type, unsigned min_bytes) + enum btree_iter_update_trigger_flags flags, + unsigned type, unsigned min_bytes) { struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); @@ -318,7 +322,8 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags, unsigned min_bytes) + enum btree_iter_update_trigger_flags flags, + unsigned min_bytes) { return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); } @@ -326,7 +331,7 @@ static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); } @@ -337,7 +342,8 @@ static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, - unsigned flags, unsigned type, unsigned val_size) + enum btree_iter_update_trigger_flags flags, + unsigned type, unsigned val_size) { struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); int ret; -- cgit v1.2.3 From 16de2c856af0d93bb798a5e55d790103d5fd888c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 01:16:16 -0400 Subject: bcachefs: use bch2_data_update_opts_to_text() in trace_move_extent_fail() Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 8e75a852b358..5ea432bc0052 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -110,11 +110,8 @@ static void trace_move_extent_fail2(struct data_update *m, { struct bch_fs *c = m->op.c; struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - const union bch_extent_entry *entry; - struct bch_extent_ptr *ptr; - struct extent_ptr_decoded p; struct printbuf buf = PRINTBUF; - unsigned i, rewrites_found = 0; + unsigned rewrites_found = 0; if (!trace_move_extent_fail_enabled()) return; @@ -122,27 +119,25 @@ static void trace_move_extent_fail2(struct data_update *m, prt_str(&buf, msg); if (insert) { - i = 0; + const union bch_extent_entry *entry; + struct bch_extent_ptr *ptr; + struct extent_ptr_decoded p; + + unsigned ptr_bit = 1; 
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { - if (((1U << i) & m->data_opts.rewrite_ptrs) && + if ((ptr_bit & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) - rewrites_found |= 1U << i; - i++; + rewrites_found |= ptr_bit; + ptr_bit <<= 1; } } - prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u", - (m->data_opts.rewrite_ptrs & (1 << 0)) != 0, - (m->data_opts.rewrite_ptrs & (1 << 1)) != 0, - (m->data_opts.rewrite_ptrs & (1 << 2)) != 0, - (m->data_opts.rewrite_ptrs & (1 << 3)) != 0); - - prt_printf(&buf, "\nrewrites found: %u%u%u%u", - (rewrites_found & (1 << 0)) != 0, - (rewrites_found & (1 << 1)) != 0, - (rewrites_found & (1 << 2)) != 0, - (rewrites_found & (1 << 3)) != 0); + prt_str(&buf, "rewrites found:\t"); + bch2_prt_u64_base2(&buf, rewrites_found); + prt_newline(&buf); + + bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); prt_str(&buf, "\nold: "); bch2_bkey_val_to_text(&buf, c, old); -- cgit v1.2.3 From eacb755568d68337de6e6f734b00df40780c4e30 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 01:21:43 -0400 Subject: bcachefs: bch2_io_opts_fixups() Centralize some io path option fixups - they weren't always being applied correctly: - background_compression uses compression if unset - background_target uses foreground_target if unset - nocow disables most fancy io path options Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 4 ++-- fs/bcachefs/extents.c | 2 +- fs/bcachefs/inode.c | 4 ++-- fs/bcachefs/opts.c | 5 ++++- fs/bcachefs/opts.h | 12 ++++++++++-- fs/bcachefs/rebalance.c | 4 ++-- 6 files changed, 21 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 5ea432bc0052..a176e5439cbf 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -535,7 +535,7 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, prt_newline(out); prt_str(out, "compression:\t"); - bch2_compression_opt_to_text(out, background_compression(*io_opts)); + bch2_compression_opt_to_text(out, io_opts->background_compression); prt_newline(out); prt_str(out, "opts.replicas:\t"); @@ -647,7 +647,7 @@ int bch2_data_update_init(struct btree_trans *trans, BCH_WRITE_DATA_ENCODED| BCH_WRITE_MOVE| m->data_opts.write_flags; - m->op.compression_opt = background_compression(io_opts); + m->op.compression_opt = io_opts.background_compression; m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; unsigned durability_have = 0, durability_removing = 0; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 85b98c782e1b..45a67daf0d64 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1504,7 +1504,7 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, struct bkey_s k = bkey_i_to_s(_k); struct bch_extent_rebalance *r; unsigned target = opts->background_target; - unsigned compression = background_compression(*opts); + unsigned compression = opts->background_compression; bool needs_rebalance; if (!bkey_extent_is_direct_data(k.k)) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 43653cf050e9..fbdd11802bdf 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -14,6 +14,7 @@ #include "extent_update.h" #include "fs.h" #include "inode.h" +#include "opts.h" #include "str_hash.h" #include "snapshot.h" #include "subvolume.h" @@ -1145,8 +1146,7 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, BCH_INODE_OPTS() 
#undef x - if (opts->nocow) - opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; + bch2_io_opts_fixups(opts); } int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 0e2ee262fbd4..1f5843284e9e 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -710,11 +710,14 @@ void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) { - return (struct bch_io_opts) { + struct bch_io_opts opts = { #define x(_name, _bits) ._name = src._name, BCH_INODE_OPTS() #undef x }; + + bch2_io_opts_fixups(&opts); + return opts; } bool bch2_opt_is_inode_opt(enum bch_opt_id id) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 23dda014e331..dd27ef556611 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -626,9 +626,17 @@ struct bch_io_opts { #undef x }; -static inline unsigned background_compression(struct bch_io_opts opts) +static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) { - return opts.background_compression ?: opts.compression; + if (!opts->background_target) + opts->background_target = opts->foreground_target; + if (!opts->background_compression) + opts->background_compression = opts->compression; + if (opts->nocow) { + opts->compression = opts->background_compression = 0; + opts->data_checksum = 0; + opts->erasure_code = 0; + } } struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index cd6647374353..2f93ae8781bb 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -257,12 +257,12 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, if (k.k->p.inode) { target = io_opts->background_target; - compression = background_compression(*io_opts); + compression = io_opts->background_compression; } else { const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); target = r ? r->target : io_opts->background_target; - compression = r ? r->compression : background_compression(*io_opts); + compression = r ? 
r->compression : io_opts->background_compression; } data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); -- cgit v1.2.3 From 015fafc49bbf81380b65190219e530b4015cb13e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 01:32:55 -0400 Subject: bcachefs: small cleanup for extent ptr bitmasks Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 30 +++++++++++++++--------------- fs/bcachefs/extents.c | 12 ++++++------ fs/bcachefs/io_read.c | 6 +++--- 3 files changed, 24 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index a176e5439cbf..90da57a26962 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -189,7 +189,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, struct bpos next_pos; bool should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; - unsigned rewrites_found = 0, durability, i; + unsigned rewrites_found = 0, durability, ptr_bit; bch2_trans_begin(trans); @@ -226,16 +226,16 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, * * Fist, drop rewrite_ptrs from @new: */ - i = 0; + ptr_bit = 1; bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { - if (((1U << i) & m->data_opts.rewrite_ptrs) && + if ((ptr_bit & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) { bch2_extent_ptr_set_cached(c, &m->op.opts, bkey_i_to_s(insert), ptr); - rewrites_found |= 1U << i; + rewrites_found |= ptr_bit; } - i++; + ptr_bit <<= 1; } if (m->data_opts.rewrite_ptrs && @@ -609,7 +609,7 @@ int bch2_data_update_init(struct btree_trans *trans, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; + unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; int ret = 0; /* @@ -652,17 +652,17 @@ int bch2_data_update_init(struct btree_trans *trans, unsigned durability_have = 0, durability_removing = 0; - i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (!p.ptr.cached) { rcu_read_lock(); - if (BIT(i) & m->data_opts.rewrite_ptrs) { + if (ptr_bit & m->data_opts.rewrite_ptrs) { if (crc_is_compressed(p.crc)) reserve_sectors += k.k->size; m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); durability_removing += bch2_extent_ptr_desired_durability(c, &p); - } else if (!(BIT(i) & m->data_opts.kill_ptrs)) { + } else if (!(ptr_bit & m->data_opts.kill_ptrs)) { bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); durability_have += bch2_extent_ptr_durability(c, &p); } @@ -682,7 +682,7 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; - i++; + ptr_bit <<= 1; } unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); @@ -745,14 +745,14 @@ out: void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { - if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { - opts->kill_ptrs |= 1U << i; - opts->rewrite_ptrs ^= 1U << i; + if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) { + opts->kill_ptrs |= ptr_bit; + opts->rewrite_ptrs ^= ptr_bit; } - i++; + ptr_bit <<= 1; } } diff --git a/fs/bcachefs/extents.c 
b/fs/bcachefs/extents.c index 45a67daf0d64..243cb15b74b3 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1414,7 +1414,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, unsigned compression_type = bch2_compression_opt_to_type(compression); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - unsigned i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || @@ -1424,18 +1424,18 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, } if (!p.ptr.cached && p.crc.compression_type != compression_type) - rewrite_ptrs |= 1U << i; - i++; + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; } } incompressible: if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { - unsigned i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) - rewrite_ptrs |= 1U << i; - i++; + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; } } diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index b3b934a87c6d..cbc3cc1f6d03 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -231,11 +231,11 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, update_opts.target = opts.foreground_target; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { if (bch2_dev_io_failures(failed, ptr->dev)) - update_opts.rewrite_ptrs |= BIT(i); - i++; + update_opts.rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; } } -- cgit v1.2.3 From c8908959aead268223baaf17e990a200a085bb5c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 01:40:19 -0400 Subject: bcachefs: kill bch2_bkey_needs_rebalance() Dead code Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 17 ----------------- fs/bcachefs/extents.h | 1 - 2 files changed, 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 243cb15b74b3..6ad5ff7c8239 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1442,23 +1442,6 @@ incompressible: return rewrite_ptrs; } -bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) -{ - const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - - /* - * If it's an indirect extent, we don't delete the rebalance entry when - * done so that we know what options were applied - check if it still - * needs work done: - */ - if (r && - k.k->type == KEY_TYPE_reflink_v && - !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression)) - r = NULL; - - return r != NULL; -} - static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k, unsigned target, unsigned compression) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index bcffcf60aaaf..9374599b384d 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -713,7 +713,6 @@ void bch2_ptr_swab(struct bkey_s); const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, unsigned, unsigned); -bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, -- cgit v1.2.3 From 282faf952474a3125d3f3fd7d32ae77f8b4c96c3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 02:14:53 -0400 Subject: 
bcachefs: kill __bch2_bkey_sectors_need_rebalance() Single caller, fold into bch2_bkey_sectors_need_rebalance() Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 6ad5ff7c8239..5ec0fec597f7 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1442,16 +1442,19 @@ incompressible: return rewrite_ptrs; } -static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k, - unsigned target, unsigned compression) +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) { + const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); + if (!opts) + return 0; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 sectors = 0; - if (compression) { - unsigned compression_type = bch2_compression_opt_to_type(compression); + if (opts->compression) { + unsigned compression_type = bch2_compression_opt_to_type(opts->compression); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || @@ -1465,22 +1468,16 @@ static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c } } incompressible: - if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { + if (opts->target && + bch2_target_accepts_data(c, BCH_DATA_user, opts->target)) { bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target)) + if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->target)) sectors += p.crc.compressed_size; } return sectors; } -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) -{ - const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - - return r ? 
__bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0; -} - int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, struct bch_io_opts *opts) { -- cgit v1.2.3 From 5fffe1a3c3992cdcfc2f22e8e80afb91885840fa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 02:21:28 -0400 Subject: bcachefs: rename bch_extent_rebalance fields to match other opts structs Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 36 ++++++++++++++++++------------------ fs/bcachefs/extents_format.h | 8 ++++---- fs/bcachefs/rebalance.c | 12 ++++++------ 3 files changed, 28 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 5ec0fec597f7..ee0e86f0becd 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1161,11 +1161,11 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, prt_str(out, "rebalance: target "); if (c) - bch2_target_to_text(out, c, r->target); + bch2_target_to_text(out, c, r->background_target); else - prt_printf(out, "%u", r->target); + prt_printf(out, "%u", r->background_target); prt_str(out, " compression "); - bch2_compression_opt_to_text(out, r->compression); + bch2_compression_opt_to_text(out, r->background_compression); break; } default: @@ -1453,8 +1453,8 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; u64 sectors = 0; - if (opts->compression) { - unsigned compression_type = bch2_compression_opt_to_type(opts->compression); + if (opts->background_compression) { + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || @@ -1468,10 +1468,10 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) } } incompressible: - if (opts->target && - bch2_target_accepts_data(c, BCH_DATA_user, opts->target)) { + if (opts->background_target && + bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->target)) + if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) sectors += p.crc.compressed_size; } @@ -1500,14 +1500,14 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, * they're referenced by different inodes with different * options: */ - if (r->target) - target = r->target; - if (r->compression) - compression = r->compression; + if (r->background_target) + target = r->background_target; + if (r->background_compression) + compression = r->background_compression; } - r->target = target; - r->compression = compression; + r->background_target = target; + r->background_compression = compression; } needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); @@ -1515,10 +1515,10 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, if (needs_rebalance && !r) { union bch_extent_entry *new = bkey_val_end(k); - new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; - new->rebalance.compression = compression; - new->rebalance.target = target; - new->rebalance.unused = 0; + new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; + new->rebalance.background_compression = compression; + new->rebalance.background_target = target; + new->rebalance.unused = 0; k.k->u64s += extent_entry_u64s(new); } else if (!needs_rebalance && r && k.k->type != 
KEY_TYPE_reflink_v) { /* diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index 3bd2fdbb0817..2cc3e60f3b12 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -205,11 +205,11 @@ struct bch_extent_rebalance { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:6, unused:34, - compression:8, /* enum bch_compression_opt */ - target:16; + background_compression:8, /* enum bch_compression_opt */ + background_target:16; #elif defined (__BIG_ENDIAN_BITFIELD) - __u64 target:16, - compression:8, + __u64 background_target:16, + background_compression:8, unused:34, type:6; #endif diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 2f93ae8781bb..dc70e6feaf79 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -157,8 +157,8 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, memset(data_opts, 0, sizeof(*data_opts)); data_opts->rewrite_ptrs = - bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); - data_opts->target = r->target; + bch2_bkey_ptrs_need_rebalance(c, k, r->background_target, r->background_compression); + data_opts->target = r->background_target; data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; if (!data_opts->rewrite_ptrs) { @@ -179,9 +179,9 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct printbuf buf = PRINTBUF; prt_str(&buf, "target="); - bch2_target_to_text(&buf, c, r->target); + bch2_target_to_text(&buf, c, r->background_target); prt_str(&buf, " compression="); - bch2_compression_opt_to_text(&buf, r->compression); + bch2_compression_opt_to_text(&buf, r->background_compression); prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, k); @@ -261,8 +261,8 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, } else { const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - target = r ? r->target : io_opts->background_target; - compression = r ? r->compression : io_opts->background_compression; + target = r ? r->background_target : io_opts->background_target; + compression = r ? 
r->background_compression : io_opts->background_compression; } data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); -- cgit v1.2.3 From 3000855cab65831f39ecdedd9447c396893287f6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 02:28:51 -0400 Subject: bcachefs: io_opts_to_rebalance_opts() New helper to simplify bch2_bkey_set_needs_rebalance() Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/extents.c | 60 ++++++++++++-------------------------------- fs/bcachefs/extents.h | 3 +-- fs/bcachefs/extents_format.h | 5 ++++ fs/bcachefs/io_misc.c | 2 +- fs/bcachefs/io_write.c | 2 +- fs/bcachefs/opts.h | 13 ++++++++++ fs/bcachefs/reflink.c | 2 +- 8 files changed, 39 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 90da57a26962..e4af2ccdf4c8 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -357,7 +357,7 @@ restart_drop_extra_replicas: k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: + bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, &op->res, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index ee0e86f0becd..a134aa5a76bb 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1478,55 +1478,27 @@ incompressible: return sectors; } -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, - struct bch_io_opts *opts) +int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, + struct bkey_i *_k) { - struct bkey_s k = bkey_i_to_s(_k); - struct bch_extent_rebalance *r; - unsigned target = opts->background_target; - unsigned compression = opts->background_compression; - bool needs_rebalance; - - if (!bkey_extent_is_direct_data(k.k)) + if (!bkey_extent_is_direct_data(&_k->k)) return 0; - /* get existing rebalance entry: */ - r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - if (r) { - if (k.k->type == KEY_TYPE_reflink_v) { - /* - * indirect extents: existing options take precedence, - * so that we don't move extents back and forth if - * they're referenced by different inodes with different - * options: - */ - if (r->background_target) - target = r->background_target; - if (r->background_compression) - compression = r->background_compression; + struct bkey_s k = bkey_i_to_s(_k); + struct bch_extent_rebalance *old = + (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); + + if (k.k->type == KEY_TYPE_reflink_v || + bch2_bkey_ptrs_need_rebalance(c, k.s_c, opts->background_target, opts->background_compression)) { + if (!old) { + old = bkey_val_end(k); + k.k->u64s += sizeof(*old) / sizeof(u64); } - r->background_target = target; - r->background_compression = compression; - } - - needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); - - if (needs_rebalance && !r) { - union bch_extent_entry *new = bkey_val_end(k); - - new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; - new->rebalance.background_compression = compression; - new->rebalance.background_target = target; - new->rebalance.unused = 0; - k.k->u64s += extent_entry_u64s(new); - } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) { - /* - * For indirect extents, don't delete the rebalance entry when - * we're finished 
so that we know we specifically moved it or - * compressed it to its current location/compression type - */ - extent_entry_drop(k, (union bch_extent_entry *) r); + *old = io_opts_to_rebalance_opts(opts); + } else { + if (old) + extent_entry_drop(k, (union bch_extent_entry *) old); } return 0; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 9374599b384d..97af0d6e4319 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -715,8 +715,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, unsigned, unsigned); u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, - struct bch_io_opts *); +int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); /* Generic extent code: */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index 2cc3e60f3b12..520697f236c0 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -215,6 +215,11 @@ struct bch_extent_rebalance { #endif }; +/* subset of BCH_INODE_OPTS */ +#define BCH_REBALANCE_OPTS() \ + x(background_compression) \ + x(background_target) + union bch_extent_entry { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 unsigned long type; diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index f283051758d6..e2acf21ac9b0 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -461,7 +461,7 @@ case LOGGED_OP_FINSERT_shift_extents: op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); - ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?: + ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, &op->k_i) ?: diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 96720adcfee0..f2f69e5e0910 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -369,7 +369,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bkey_start_pos(&sk.k->k), BTREE_ITER_slots|BTREE_ITER_intent); - ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: + ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?: bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index dd27ef556611..13555bc35f00 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -642,4 +642,17 @@ static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); +/* rebalance opts: */ + +static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_io_opts *opts) +{ + return (struct bch_extent_rebalance) { + .type = BIT(BCH_EXTENT_ENTRY_rebalance), +#define x(_name) \ + ._name = opts->_name, + BCH_REBALANCE_OPTS() +#undef x + }; +}; + #endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index f457925fa362..8a36ebd9dd9c 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -547,7 +547,7 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?: + ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?: bch2_extent_update(trans, dst_inum, &dst_iter, 
new_dst.k, &disk_res, new_i_size, i_sectors_delta, -- cgit v1.2.3 From 7a7c43a0c1ecf174218f88cc46f5af361314b3c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Oct 2024 23:26:11 -0400 Subject: bcachefs: Add bch_io_opts fields for indicating whether the opts came from the inode This is going to be used in the bch_extent_rebalance improvements, which propagate io_path options into the extent (important for rebalance, which needs something present in the extent for transactionally tagging them in the rebalance_work btree, and also for indirect extents). By tracking in bch_extent_rebalance whether the option came from the filesystem or the inode we can correctly handle options being changed on indirect extents. Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 8 +++++++- fs/bcachefs/opts.h | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index fbdd11802bdf..5dd9d3edae77 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1142,7 +1142,13 @@ struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, struct bch_inode_unpacked *inode) { -#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); +#define x(_name, _bits) \ + if ((inode)->bi_##_name) { \ + opts->_name = inode->bi_##_name - 1; \ + opts->_name##_from_inode = true; \ + } else { \ + opts->_name = c->opts._name; \ + } BCH_INODE_OPTS() #undef x diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 13555bc35f00..918eb6730117 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -624,6 +624,9 @@ struct bch_io_opts { #define x(_name, _bits) u##_bits _name; BCH_INODE_OPTS() #undef x +#define x(_name, _bits) u64 _name##_from_inode:1; + BCH_INODE_OPTS() +#undef x }; static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) -- cgit v1.2.3 From c22508470439014d0b902adda1f0d97567b574d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2024 01:06:53 -0400 Subject: bcachefs: copygc_enabled, rebalance_enabled now opts.h options They can now be set at mount time Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 -- fs/bcachefs/movinggc.c | 4 ++-- fs/bcachefs/opts.h | 12 ++++++++++++ fs/bcachefs/rebalance.c | 4 ++-- fs/bcachefs/rebalance_types.h | 2 -- fs/bcachefs/super.c | 3 --- fs/bcachefs/sysfs.c | 31 +++++++------------------------ 7 files changed, 23 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d4d95ef6791f..e1ab67c533f0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1096,8 +1096,6 @@ struct bch_fs { u64 counters_on_mount[BCH_COUNTER_NR]; u64 __percpu *counters; - unsigned copy_gc_enabled:1; - struct bch2_time_stats times[BCH_TIME_STAT_NR]; struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index d658be90f737..725292d69fd6 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -350,9 +350,9 @@ static int bch2_copygc_thread(void *arg) bch2_trans_unlock_long(ctxt.trans); cond_resched(); - if (!c->copy_gc_enabled) { + if (!c->opts.copygc_enabled) { move_buckets_wait(&ctxt, buckets, true); - kthread_wait_freezable(c->copy_gc_enabled || + kthread_wait_freezable(c->opts.copygc_enabled || kthread_should_stop()); } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 918eb6730117..e0e23c29c2d6 100644 --- 
a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -473,6 +473,18 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, true, \ NULL, "Enable nocow mode: enables runtime locking in\n"\ "data move path needed if nocow will ever be in use\n")\ + x(copygc_enabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable copygc: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ + x(rebalance_enabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable rebalance: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index dc70e6feaf79..d8cb346ac138 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -338,9 +338,9 @@ static int do_rebalance(struct moving_context *ctxt) BTREE_ITER_all_snapshots); while (!bch2_move_ratelimit(ctxt)) { - if (!r->enabled) { + if (!c->opts.rebalance_enabled) { bch2_moving_ctxt_flush_all(ctxt); - kthread_wait_freezable(r->enabled || + kthread_wait_freezable(c->opts.rebalance_enabled || kthread_should_stop()); } diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index 0fffb536c1d0..fe5098c17dfc 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -30,8 +30,6 @@ struct bch_fs_rebalance { struct bbpos scan_start; struct bbpos scan_end; struct bch_move_stats scan_stats; - - unsigned enabled:1; }; #endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d6411324cd3f..7e2431de3a94 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -810,9 +810,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->vfs_inodes_list); mutex_init(&c->vfs_inodes_lock); - c->copy_gc_enabled = 1; - c->rebalance.enabled = 1; - c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 3270bfab9466..4ab0ccba2ab5 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -213,10 +213,8 @@ BCH_PERSISTENT_COUNTERS() rw_attribute(discard); rw_attribute(label); -rw_attribute(copy_gc_enabled); read_attribute(copy_gc_wait); -rw_attribute(rebalance_enabled); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); @@ -340,9 +338,6 @@ SHOW(bch2_fs) if (attr == &sysfs_gc_gens_pos) bch2_gc_gens_pos_to_text(out, c); - sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); - - sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ if (attr == &sysfs_copy_gc_wait) @@ -419,23 +414,6 @@ STORE(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - if (attr == &sysfs_copy_gc_enabled) { - ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) - ?: (ssize_t) size; - - if (c->copygc_thread) - wake_up_process(c->copygc_thread); - return ret; - } - - if (attr == &sysfs_rebalance_enabled) { - ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) - ?: (ssize_t) size; - - rebalance_wakeup(c); - return ret; - } - sysfs_pd_controller_store(rebalance, &c->rebalance.pd); /* Debugging: */ @@ -611,10 +589,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_gc_gens_pos, - &sysfs_copy_gc_enabled, 
&sysfs_copy_gc_wait, - &sysfs_rebalance_enabled, sysfs_pd_controller_files(rebalance), &sysfs_moving_ctxts, @@ -683,6 +659,13 @@ STORE(bch2_fs_opts_dir) (id == Opt_compression && !c->opts.background_compression))) bch2_set_rebalance_needs_scan(c, 0); + if (v && id == Opt_rebalance_enabled) + rebalance_wakeup(c); + + if (v && id == Opt_copygc_enabled && + c->copygc_thread) + wake_up_process(c->copygc_thread); + ret = size; err: bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); -- cgit v1.2.3 From ed13bb5726ee344d1c1a5593e474c1ccc6fd8c5a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Oct 2024 01:14:53 -0400 Subject: bcachefs: bch2_prt_csum_opt() bounds checking helper Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 +- fs/bcachefs/checksum.h | 2 +- fs/bcachefs/opts.c | 3 ++- fs/bcachefs/opts.h | 7 ++++--- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c5e3824d5771..dc14bfe37e3b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1030,7 +1030,7 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) x(crc64, 2) \ x(xxhash, 3) -enum bch_csum_opts { +enum bch_csum_opt { #define x(t, n) BCH_CSUM_OPT_##t = n, BCH_CSUM_OPTS() #undef x diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index e40499fde9a4..43b9d71f2f2b 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -109,7 +109,7 @@ int bch2_enable_encryption(struct bch_fs *, bool); void bch2_fs_encryption_exit(struct bch_fs *); int bch2_fs_encryption_init(struct bch_fs *); -static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, +static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type, bool data) { switch (type) { diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 1f5843284e9e..49c59aec6954 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -48,7 +48,7 @@ static const char * const __bch2_csum_types[] = { NULL }; -const char * const bch2_csum_opts[] = { +const char * const __bch2_csum_opts[] = { BCH_CSUM_OPTS() NULL }; @@ -113,6 +113,7 @@ void bch2_prt_##name(struct printbuf *out, type t) \ PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); +PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index e0e23c29c2d6..f6dc0628b025 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -16,7 +16,7 @@ extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; -extern const char * const bch2_csum_opts[]; +extern const char * const __bch2_csum_opts[]; extern const char * const bch2_compression_opts[]; extern const char * const __bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; @@ -27,6 +27,7 @@ extern const char * const bch2_d_types[]; void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); void bch2_prt_data_type(struct printbuf *, enum bch_data_type); 
+void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); @@ -171,12 +172,12 @@ enum fsck_err_opts { "size", "Maximum size of checksummed/compressed extents")\ x(metadata_checksum, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_csum_opts), \ + OPT_STR(__bch2_csum_opts), \ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(data_checksum, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_csum_opts), \ + OPT_STR(__bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(compression, u8, \ -- cgit v1.2.3 From 2d21d112538ea3b5f35a92fded43324dc693af25 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Oct 2024 21:41:20 -0400 Subject: bcachefs: New bch_extent_rebalance fields - Add more io path options to bch_extent_rebalance - For each option, track whether it came from the filesystem or the inode This will be used for improved rebalance support for reflinked data. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 65 ++++++++++++++++++++++++++++++++++++-------- fs/bcachefs/extents_format.h | 34 +++++++++++++++++++++-- fs/bcachefs/opts.h | 3 +- 3 files changed, 87 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index a134aa5a76bb..467ffed0809e 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1121,6 +1121,57 @@ void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_cr bch2_prt_compression_type(out, crc->compression_type); } +static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, + const struct bch_extent_rebalance *r) +{ + prt_str(out, "rebalance:"); + + prt_printf(out, " replicas=%u", r->data_replicas); + if (r->data_replicas_from_inode) + prt_str(out, " (inode)"); + + prt_str(out, " checksum="); + bch2_prt_csum_opt(out, r->data_checksum); + if (r->data_checksum_from_inode) + prt_str(out, " (inode)"); + + if (r->background_compression || r->background_compression_from_inode) { + prt_str(out, " background_compression="); + bch2_compression_opt_to_text(out, r->background_compression); + + if (r->background_compression_from_inode) + prt_str(out, " (inode)"); + } + + if (r->background_target || r->background_target_from_inode) { + prt_str(out, " background_target="); + if (c) + bch2_target_to_text(out, c, r->background_target); + else + prt_printf(out, "%u", r->background_target); + + if (r->background_target_from_inode) + prt_str(out, " (inode)"); + } + + if (r->promote_target || r->promote_target_from_inode) { + prt_str(out, " promote_target="); + if (c) + bch2_target_to_text(out, c, r->promote_target); + else + prt_printf(out, "%u", r->promote_target); + + if (r->promote_target_from_inode) + prt_str(out, " (inode)"); + } + + if (r->erasure_code || r->erasure_code_from_inode) { + prt_printf(out, " ec=%u", r->erasure_code); + if (r->erasure_code_from_inode) + prt_str(out, " (inode)"); + } +} + void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -1156,18 +1207,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, (u64) ec->idx, ec->block); break; } - case BCH_EXTENT_ENTRY_rebalance: { - const struct bch_extent_rebalance *r = &entry->rebalance; - - prt_str(out, "rebalance: target "); - if (c) - 
bch2_target_to_text(out, c, r->background_target); - else - prt_printf(out, "%u", r->background_target); - prt_str(out, " compression "); - bch2_compression_opt_to_text(out, r->background_compression); + case BCH_EXTENT_ENTRY_rebalance: + bch2_extent_rebalance_to_text(out, c, &entry->rebalance); break; - } + default: prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index 520697f236c0..222eed6b46d8 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -204,21 +204,49 @@ struct bch_extent_stripe_ptr { struct bch_extent_rebalance { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:6, - unused:34, + unused:3, + + promote_target_from_inode:1, + erasure_code_from_inode:1, + data_checksum_from_inode:1, + background_compression_from_inode:1, + data_replicas_from_inode:1, + background_target_from_inode:1, + + promote_target:16, + erasure_code:1, + data_checksum:4, + data_replicas:4, background_compression:8, /* enum bch_compression_opt */ background_target:16; #elif defined (__BIG_ENDIAN_BITFIELD) __u64 background_target:16, background_compression:8, - unused:34, + data_replicas:4, + data_checksum:4, + erasure_code:1, + promote_target:16, + + background_target_from_inode:1, + data_replicas_from_inode:1, + background_compression_from_inode:1, + data_checksum_from_inode:1, + erasure_code_from_inode:1, + promote_target_from_inode:1, + + unused:3, type:6; #endif }; /* subset of BCH_INODE_OPTS */ #define BCH_REBALANCE_OPTS() \ + x(data_checksum) \ x(background_compression) \ - x(background_target) + x(data_replicas) \ + x(promote_target) \ + x(background_target) \ + x(erasure_code) union bch_extent_entry { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index f6dc0628b025..39cdc185fa73 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -665,7 +665,8 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_i return (struct bch_extent_rebalance) { .type = BIT(BCH_EXTENT_ENTRY_rebalance), #define x(_name) \ - ._name = opts->_name, + ._name = opts->_name, \ + ._name##_from_inode = opts->_name##_from_inode, BCH_REBALANCE_OPTS() #undef x }; -- cgit v1.2.3 From 4ae6bbb522f59aef34bb7fb3f7048ecaf72f0ab7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 20:53:53 -0400 Subject: bcachefs: bch2_write_inode() now checks for changing rebalance options Previously, BCHFS_IOC_REINHERIT_ATTRS didn't trigger rebalance scans when changing rebalance options - it had been missed, only the xattr interface triggered them. Ideally they'd be done by the transactional trigger, but unpacking the inode to get the options is too heavy to be done in the low level trigger - the inode trigger is run on every extent update, since the bch_inode.bi_journal_seq has to be updated for fsync. bch2_write_inode() is a good compromise, it already unpacks and repacks and is not run in any super-fast paths. Additionally, creating the new rebalance entry to trigger the scan is now done in the same transaction as the inode update that changed the options. 
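In rough terms, the updated bch2_write_inode() now snapshots the packed rebalance options before and after the caller's update callback and, if they differ, queues the rebalance scan in the same transaction. A condensed, non-authoritative sketch of that flow, paraphrasing the diff that follows (the real patch uses goto-based error handling and a retry loop, collapsed here into ?: chaining):

```
	/* inside bch2_write_inode()'s commit loop; error paths simplified */
	ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
			      BTREE_ITER_intent);

	/* rebalance-relevant options before the caller's update... */
	struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u);

	ret = ret ?: (set ? set(trans, inode, &inode_u, p) : 0);

	/* ...and after; any difference means this inode needs a rebalance scan */
	struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);

	if (!ret && memcmp(&old_r, &new_r, sizeof(new_r)))
		ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);

	ret = ret ?: bch2_inode_write(trans, &iter, &inode_u) ?:
		     bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
```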
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 26 +++++++++++++++++++++----- fs/bcachefs/inode.h | 8 ++++++++ fs/bcachefs/rebalance.c | 4 ++-- fs/bcachefs/rebalance.h | 1 + fs/bcachefs/xattr.c | 7 ------- 5 files changed, 32 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 646b74494a3f..e0ffe4648bb8 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -23,6 +23,7 @@ #include "journal.h" #include "keylist.h" #include "quota.h" +#include "rebalance.h" #include "snapshot.h" #include "super.h" #include "xattr.h" @@ -89,10 +90,25 @@ int __must_check bch2_write_inode(struct bch_fs *c, retry: bch2_trans_begin(trans); - ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), - BTREE_ITER_intent) ?: - (set ? set(trans, inode, &inode_u, p) : 0) ?: - bch2_inode_write(trans, &iter, &inode_u) ?: + ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); + if (ret) + goto err; + + struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); + + ret = (set ? set(trans, inode, &inode_u, p) : 0); + if (ret) + goto err; + + struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); + + if (memcmp(&old_r, &new_r, sizeof(new_r))) { + ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); + if (ret) + goto err; + } + + ret = bch2_inode_write(trans, &iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); /* @@ -101,7 +117,7 @@ retry: */ if (!ret) bch2_inode_update_after_write(trans, inode, &inode_u, fields); - +err: bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index bdeb6be76038..f52336cb298f 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -262,6 +262,14 @@ void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); +static inline struct bch_extent_rebalance +bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) +{ + struct bch_io_opts io_opts; + bch2_inode_opts_get(&io_opts, c, inode); + return io_opts_to_rebalance_opts(&io_opts); +} + int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); int bch2_delete_dead_inodes(struct bch_fs *); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index d8cb346ac138..926b9d5eba45 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -33,7 +33,7 @@ static const char * const bch2_rebalance_state_strs[] = { #undef x }; -static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) +int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) { struct btree_iter iter; struct bkey_s_c k; @@ -73,7 +73,7 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_lazy_rw, - __bch2_set_rebalance_needs_scan(trans, inum)); + bch2_set_rebalance_needs_scan_trans(trans, inum)); rebalance_wakeup(c); return ret; } diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 28a52638f16c..791649c04ff5 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -4,6 +4,7 @@ #include "rebalance_types.h" +int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); int bch2_set_fs_needs_rebalance(struct bch_fs 
*); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index bf3c6bb50495..ed418a747cdd 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -565,13 +565,6 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); err: mutex_unlock(&inode->ei_update_lock); - - if (value && - (opt_id == Opt_background_target || - opt_id == Opt_background_compression || - (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) - bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); - err_class_exit: return bch2_err_class(ret); } -- cgit v1.2.3 From 6aa0bd0fd59a1ab9ecd5c51332c29fac0d8969c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Oct 2024 21:41:20 -0400 Subject: bcachefs: get_update_rebalance_opts() bch2_move_get_io_opts() now synchronizes options loaded from the filesystem and inode (if present, i.e. not walking the reflink btree directly) with options from the bch_extent_rebalance_entry, updating the extent if necessary. Since bch_extent_rebalance tracks where its option came from we can preserve "inode options override filesystem options", even for indirect extents where we don't have access to the inode the options came from. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 17 +++++++++ fs/bcachefs/extents.h | 1 + fs/bcachefs/move.c | 94 ++++++++++++++++++++++++++++++++++++------------- fs/bcachefs/move.h | 5 ++- fs/bcachefs/rebalance.c | 2 +- 5 files changed, 91 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 467ffed0809e..4988056ab4f1 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1521,6 +1521,23 @@ incompressible: return sectors; } +bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, + struct bkey_s_c k) +{ + if (!bkey_extent_is_direct_data(k.k)) + return 0; + + const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); + + if (k.k->type == KEY_TYPE_reflink_v || + bch2_bkey_ptrs_need_rebalance(c, k, opts->background_target, opts->background_compression)) { + struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); + return old == NULL || memcmp(old, &new, sizeof(new)); + } else { + return old != NULL; + } +} + int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, struct bkey_i *_k) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 97af0d6e4319..abe7d4b2fc6b 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -715,6 +715,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, unsigned, unsigned); u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); +bool bch2_bkey_rebalance_needs_update(struct bch_fs *, struct bch_io_opts *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); /* Generic extent code: */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 0ef4a86850bb..1003f7fe4f50 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -379,14 +379,57 @@ err: return ret; } -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, +static int get_update_rebalance_opts(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct btree_iter *iter, + struct bkey_s_c k) +{ + BUG_ON(iter->flags & BTREE_ITER_is_extents); + BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); + + const struct bch_extent_rebalance *r = k.k->type == 
KEY_TYPE_reflink_v + ? bch2_bkey_rebalance_opts(k) : NULL; + if (r) { +#define x(_name) \ + if (r->_name##_from_inode) { \ + io_opts->_name = r->_name; \ + io_opts->_name##_from_inode = true; \ + } + BCH_REBALANCE_OPTS() +#undef x + } + + if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k)) + return 0; + + struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(n, k); + + /* On successfull transaction commit, @k was invalidated: */ + + return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; +} + +static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct per_snapshot_io_opts *io_opts, + struct btree_iter *extent_iter, struct bkey_s_c extent_k) { struct bch_fs *c = trans->c; u32 restart_count = trans->restart_count; + struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; int ret = 0; + if (extent_k.k->type == KEY_TYPE_reflink_v) + goto out; + if (io_opts->cur_inum != extent_k.k->p.inode) { io_opts->d.nr = 0; @@ -415,43 +458,46 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, if (extent_k.k->p.snapshot) darray_for_each(io_opts->d, i) - if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) - return &i->io_opts; - - return &io_opts->fs_io_opts; + if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) { + opts_ret = &i->io_opts; + break; + } +out: + ret = get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); + if (ret) + return ERR_PTR(ret); + return opts_ret; } int bch2_move_get_io_opts_one(struct btree_trans *trans, struct bch_io_opts *io_opts, + struct btree_iter *extent_iter, struct bkey_s_c extent_k) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; + struct bch_fs *c = trans->c; + + *io_opts = bch2_opts_to_inode_opts(c->opts); /* reflink btree? 
*/ - if (!extent_k.k->p.inode) { - *io_opts = bch2_opts_to_inode_opts(trans->c->opts); - return 0; - } + if (!extent_k.k->p.inode) + goto out; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + struct btree_iter inode_iter; + struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), BTREE_ITER_cached); - ret = bkey_err(k); + int ret = bkey_err(inode_k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; - if (!ret && bkey_is_inode(k.k)) { + if (!ret && bkey_is_inode(inode_k.k)) { struct bch_inode_unpacked inode; - bch2_inode_unpack(k, &inode); - bch2_inode_opts_get(io_opts, trans->c, &inode); - } else { - *io_opts = bch2_opts_to_inode_opts(trans->c->opts); + bch2_inode_unpack(inode_k, &inode); + bch2_inode_opts_get(io_opts, c, &inode); } - - bch2_trans_iter_exit(trans, &iter); - return 0; + bch2_trans_iter_exit(trans, &inode_iter); +out: + return get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); } int bch2_move_ratelimit(struct moving_context *ctxt) @@ -552,7 +598,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k); + io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, &iter, k); ret = PTR_ERR_OR_ZERO(io_opts); if (ret) continue; @@ -728,7 +774,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret = bch2_move_get_io_opts_one(trans, &io_opts, k); + ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); if (ret) { bch2_trans_iter_exit(trans, &iter); continue; diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 9baf3093a678..51e0505a8156 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -110,9 +110,8 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt darray_exit(&io_opts->d); } -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, - struct per_snapshot_io_opts *, struct bkey_s_c); -int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c); +int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, + struct btree_iter *, struct bkey_s_c); int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 926b9d5eba45..e79459a5891d 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -216,7 +216,7 @@ static int do_rebalance_extent(struct moving_context *ctxt, if (ret || !k.k) goto out; - ret = bch2_move_get_io_opts_one(trans, &io_opts, k); + ret = bch2_move_get_io_opts_one(trans, &io_opts, extent_iter, k); if (ret) goto out; -- cgit v1.2.3 From 3de8b72731dbb41b980af44805ff5bf032a19bc1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Oct 2024 21:41:20 -0400 Subject: bcachefs: Simplify option logic in rebalance Since bch2_move_get_io_opts() now synchronizes io_opts with options from bch_extent_rebalance, delete the ad-hoc logic in rebalance.c that previously did this. 
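The net effect, paraphrasing the diff below: callers such as rebalance_pred() and next_rebalance_extent() no longer choose between inode options and the options embedded in the extent's bch_extent_rebalance entry; they simply consume the io_opts that bch2_move_get_io_opts_one() has already reconciled. A sketch of the simplified predicate body (names as in the diff; signature elided):

```
	/* io_opts already reflects bch_extent_rebalance for indirect extents */
	data_opts->rewrite_ptrs  = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
	data_opts->target        = io_opts->background_target;
	data_opts->write_flags  |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
	return data_opts->rewrite_ptrs != 0;
```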
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 20 ++++++++++---------- fs/bcachefs/extents.h | 3 +-- fs/bcachefs/rebalance.c | 47 +++++++++++++++-------------------------------- 3 files changed, 26 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4988056ab4f1..bee083d787f2 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1447,14 +1447,15 @@ const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) return NULL; } -unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, - unsigned target, unsigned compression) +unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); unsigned rewrite_ptrs = 0; - if (compression) { - unsigned compression_type = bch2_compression_opt_to_type(compression); + if (opts->background_compression) { + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned ptr_bit = 1; @@ -1472,11 +1473,12 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, } } incompressible: - if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { + if (opts->background_target && + bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) rewrite_ptrs |= ptr_bit; ptr_bit <<= 1; } @@ -1529,8 +1531,7 @@ bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); - if (k.k->type == KEY_TYPE_reflink_v || - bch2_bkey_ptrs_need_rebalance(c, k, opts->background_target, opts->background_compression)) { + if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); return old == NULL || memcmp(old, &new, sizeof(new)); } else { @@ -1548,8 +1549,7 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, struct bch_extent_rebalance *old = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - if (k.k->type == KEY_TYPE_reflink_v || - bch2_bkey_ptrs_need_rebalance(c, k.s_c, opts->background_target, opts->background_compression)) { + if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) { if (!old) { old = bkey_val_end(k); k.k->u64s += sizeof(*old) / sizeof(u64); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index abe7d4b2fc6b..156fbb8e04d5 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -711,8 +711,7 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, void bch2_ptr_swab(struct bkey_s); const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); -unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, - unsigned, unsigned); +unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_s_c); u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); bool bch2_bkey_rebalance_needs_update(struct bch_fs *, struct bch_io_opts *, struct bkey_s_c); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index e79459a5891d..3be9c85dd55d 100644 --- 
a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -121,6 +121,9 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { + if (!bch2_bkey_rebalance_opts(k)) + return 0; + struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); int ret = PTR_ERR_OR_ZERO(n); if (ret) @@ -134,31 +137,27 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct bpos work_pos, struct btree_iter *extent_iter, + struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { struct bch_fs *c = trans->c; - struct bkey_s_c k; bch2_trans_iter_exit(trans, extent_iter); bch2_trans_iter_init(trans, extent_iter, work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, work_pos, BTREE_ITER_all_snapshots); - k = bch2_btree_iter_peek_slot(extent_iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter); if (bkey_err(k)) return k; - const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL; - if (!r) { - /* raced due to btree write buffer, nothing to do */ - return bkey_s_c_null; - } + int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k); + if (ret) + return bkey_s_c_err(ret); memset(data_opts, 0, sizeof(*data_opts)); - - data_opts->rewrite_ptrs = - bch2_bkey_ptrs_need_rebalance(c, k, r->background_target, r->background_compression); - data_opts->target = r->background_target; + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); + data_opts->target = io_opts->background_target; data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; if (!data_opts->rewrite_ptrs) { @@ -179,9 +178,9 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct printbuf buf = PRINTBUF; prt_str(&buf, "target="); - bch2_target_to_text(&buf, c, r->background_target); + bch2_target_to_text(&buf, c, io_opts->background_target); prt_str(&buf, " compression="); - bch2_compression_opt_to_text(&buf, r->background_compression); + bch2_compression_opt_to_text(&buf, io_opts->background_compression); prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, k); @@ -212,14 +211,10 @@ static int do_rebalance_extent(struct moving_context *ctxt, bch2_bkey_buf_init(&sk); ret = bkey_err(k = next_rebalance_extent(trans, work_pos, - extent_iter, &data_opts)); + extent_iter, &io_opts, &data_opts)); if (ret || !k.k) goto out; - ret = bch2_move_get_io_opts_one(trans, &io_opts, extent_iter, k); - if (ret) - goto out; - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); /* @@ -253,20 +248,8 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - unsigned target, compression; - - if (k.k->p.inode) { - target = io_opts->background_target; - compression = io_opts->background_compression; - } else { - const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - - target = r ? r->background_target : io_opts->background_target; - compression = r ? 
r->background_compression : io_opts->background_compression; - } - - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); - data_opts->target = target; + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); + data_opts->target = io_opts->background_target; data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; return data_opts->rewrite_ptrs != 0; } -- cgit v1.2.3 From a652c56590a912144579684c508f4c36c252f245 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Oct 2024 01:42:57 -0400 Subject: bcachefs: Improve trace_rebalance_extent We now say explicitly which pointers are being moved or compressed Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 35 +++-------------------------------- fs/bcachefs/rebalance.c | 26 +++++++++++++++++++++----- fs/bcachefs/rebalance.h | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index bee083d787f2..6f9514c19b2f 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -21,6 +21,7 @@ #include "extents.h" #include "inode.h" #include "journal.h" +#include "rebalance.h" #include "replicas.h" #include "super.h" #include "super-io.h" @@ -1452,39 +1453,9 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned rewrite_ptrs = 0; - if (opts->background_compression) { - unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned ptr_bit = 1; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) { - rewrite_ptrs = 0; - goto incompressible; - } - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - } -incompressible: - if (opts->background_target && - bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { - unsigned ptr_bit = 1; - - bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - } - - return rewrite_ptrs; + return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | + bch2_bkey_ptrs_need_move(c, opts, ptrs); } u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 3be9c85dd55d..124da250cbe7 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -177,12 +177,28 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, if (trace_rebalance_extent_enabled()) { struct printbuf buf = PRINTBUF; - prt_str(&buf, "target="); - bch2_target_to_text(&buf, c, io_opts->background_target); - prt_str(&buf, " compression="); - bch2_compression_opt_to_text(&buf, io_opts->background_compression); - prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs); + if (p) { + prt_str(&buf, "compression="); + bch2_compression_opt_to_text(&buf, io_opts->background_compression); + prt_str(&buf, " "); + bch2_prt_u64_base2(&buf, p); + prt_newline(&buf); + } + + p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs); + if (p) { + prt_str(&buf, "move="); + bch2_target_to_text(&buf, c, 
io_opts->background_target); + prt_str(&buf, " "); + bch2_prt_u64_base2(&buf, p); + prt_newline(&buf); + } trace_rebalance_extent(c, buf.buf); printbuf_exit(&buf); diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 791649c04ff5..606c88f49f7f 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -2,8 +2,57 @@ #ifndef _BCACHEFS_REBALANCE_H #define _BCACHEFS_REBALANCE_H +#include "compress.h" +#include "disk_groups.h" #include "rebalance_types.h" +static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s_c k, + struct bkey_ptrs_c ptrs) +{ + if (!opts->background_compression) + return 0; + + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ptr_bit = 1; + unsigned rewrite_ptrs = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) + return 0; + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + return rewrite_ptrs; +} + +static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_ptrs_c ptrs) +{ + if (!opts->background_target || + !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) + return 0; + + unsigned ptr_bit = 1; + unsigned rewrite_ptrs = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + return rewrite_ptrs; +} + int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); int bch2_set_fs_needs_rebalance(struct bch_fs *); -- cgit v1.2.3 From 161d13835e38e4803f9c44e0912aaec9c35f127f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Oct 2024 23:23:18 -0400 Subject: bcachefs: Move bch_extent_rebalance code to rebalance.c Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 1 + fs/bcachefs/extents.c | 99 ---------------------- fs/bcachefs/extents.h | 7 -- fs/bcachefs/extents_format.h | 48 +---------- fs/bcachefs/move.c | 43 +--------- fs/bcachefs/rebalance.c | 186 +++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/rebalance.h | 52 ++---------- fs/bcachefs/rebalance_format.h | 53 ++++++++++++ 8 files changed, 251 insertions(+), 238 deletions(-) create mode 100644 fs/bcachefs/rebalance_format.h (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 8bd17667e243..c4123fa4f250 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -18,6 +18,7 @@ #include "error.h" #include "inode.h" #include "movinggc.h" +#include "rebalance.h" #include "recovery.h" #include "reflink.h" #include "replicas.h" diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 6f9514c19b2f..bc7cfdb66687 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1436,105 +1436,6 @@ void bch2_ptr_swab(struct bkey_s k) } } -const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - - bkey_extent_entry_for_each(ptrs, entry) - if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) - return &entry->rebalance; - - return NULL; -} - -unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, - struct bch_io_opts *opts, - struct 
bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | - bch2_bkey_ptrs_need_move(c, opts, ptrs); -} - -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) -{ - const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); - if (!opts) - return 0; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - u64 sectors = 0; - - if (opts->background_compression) { - unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) { - sectors = 0; - goto incompressible; - } - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - sectors += p.crc.compressed_size; - } - } -incompressible: - if (opts->background_target && - bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) - sectors += p.crc.compressed_size; - } - - return sectors; -} - -bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, - struct bkey_s_c k) -{ - if (!bkey_extent_is_direct_data(k.k)) - return 0; - - const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); - - if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); - return old == NULL || memcmp(old, &new, sizeof(new)); - } else { - return old != NULL; - } -} - -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, - struct bkey_i *_k) -{ - if (!bkey_extent_is_direct_data(&_k->k)) - return 0; - - struct bkey_s k = bkey_i_to_s(_k); - struct bch_extent_rebalance *old = - (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - - if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) { - if (!old) { - old = bkey_val_end(k); - k.k->u64s += sizeof(*old) / sizeof(u64); - } - - *old = io_opts_to_rebalance_opts(opts); - } else { - if (old) - extent_entry_drop(k, (union bch_extent_entry *) old); - } - - return 0; -} - /* Generic extent code: */ int bch2_cut_front_s(struct bpos where, struct bkey_s k) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 156fbb8e04d5..ba33788fee36 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -710,13 +710,6 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, void bch2_ptr_swab(struct bkey_s); -const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); -unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_s_c); -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); - -bool bch2_bkey_rebalance_needs_update(struct bch_fs *, struct bch_io_opts *, struct bkey_s_c); -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); - /* Generic extent code: */ enum bch_extent_overlap { diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index 222eed6b46d8..c198dfc376d6 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -201,52 +201,8 @@ struct bch_extent_stripe_ptr { #endif }; -struct bch_extent_rebalance { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:3, - - 
promote_target_from_inode:1, - erasure_code_from_inode:1, - data_checksum_from_inode:1, - background_compression_from_inode:1, - data_replicas_from_inode:1, - background_target_from_inode:1, - - promote_target:16, - erasure_code:1, - data_checksum:4, - data_replicas:4, - background_compression:8, /* enum bch_compression_opt */ - background_target:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 background_target:16, - background_compression:8, - data_replicas:4, - data_checksum:4, - erasure_code:1, - promote_target:16, - - background_target_from_inode:1, - data_replicas_from_inode:1, - background_compression_from_inode:1, - data_checksum_from_inode:1, - erasure_code_from_inode:1, - promote_target_from_inode:1, - - unused:3, - type:6; -#endif -}; - -/* subset of BCH_INODE_OPTS */ -#define BCH_REBALANCE_OPTS() \ - x(data_checksum) \ - x(background_compression) \ - x(data_replicas) \ - x(promote_target) \ - x(background_target) \ - x(erasure_code) +/* bch_extent_rebalance: */ +#include "rebalance_format.h" union bch_extent_entry { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 1003f7fe4f50..d6e68265e039 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -21,6 +21,7 @@ #include "journal_reclaim.h" #include "keylist.h" #include "move.h" +#include "rebalance.h" #include "replicas.h" #include "snapshot.h" #include "super-io.h" @@ -379,44 +380,6 @@ err: return ret; } -static int get_update_rebalance_opts(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct btree_iter *iter, - struct bkey_s_c k) -{ - BUG_ON(iter->flags & BTREE_ITER_is_extents); - BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); - - const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v - ? 
bch2_bkey_rebalance_opts(k) : NULL; - if (r) { -#define x(_name) \ - if (r->_name##_from_inode) { \ - io_opts->_name = r->_name; \ - io_opts->_name##_from_inode = true; \ - } - BCH_REBALANCE_OPTS() -#undef x - } - - if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k)) - return 0; - - struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - bkey_reassemble(n, k); - - /* On successfull transaction commit, @k was invalidated: */ - - return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; -} - static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct per_snapshot_io_opts *io_opts, struct btree_iter *extent_iter, @@ -463,7 +426,7 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, break; } out: - ret = get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); + ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); if (ret) return ERR_PTR(ret); return opts_ret; @@ -497,7 +460,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans, } bch2_trans_iter_exit(trans, &inode_iter); out: - return get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); + return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); } int bch2_move_ratelimit(struct moving_context *ctxt) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 124da250cbe7..d1b580e76ba4 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -24,6 +24,192 @@ #include #include +/* bch_extent_rebalance: */ + +static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + + bkey_extent_entry_for_each(ptrs, entry) + if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) + return &entry->rebalance; + + return NULL; +} + +static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s_c k, + struct bkey_ptrs_c ptrs) +{ + if (!opts->background_compression) + return 0; + + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ptr_bit = 1; + unsigned rewrite_ptrs = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) + return 0; + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + return rewrite_ptrs; +} + +static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_ptrs_c ptrs) +{ + if (!opts->background_target || + !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) + return 0; + + unsigned ptr_bit = 1; + unsigned rewrite_ptrs = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + return rewrite_ptrs; +} + +static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | + 
bch2_bkey_ptrs_need_move(c, opts, ptrs); +} + +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); + if (!opts) + return 0; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 sectors = 0; + + if (opts->background_compression) { + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) { + sectors = 0; + goto incompressible; + } + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + sectors += p.crc.compressed_size; + } + } +incompressible: + if (opts->background_target && + bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) + sectors += p.crc.compressed_size; + } + + return sectors; +} + +static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, + struct bkey_s_c k) +{ + if (!bkey_extent_is_direct_data(k.k)) + return 0; + + const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); + + if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { + struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); + return old == NULL || memcmp(old, &new, sizeof(new)); + } else { + return old != NULL; + } +} + +int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, + struct bkey_i *_k) +{ + if (!bkey_extent_is_direct_data(&_k->k)) + return 0; + + struct bkey_s k = bkey_i_to_s(_k); + struct bch_extent_rebalance *old = + (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); + + if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) { + if (!old) { + old = bkey_val_end(k); + k.k->u64s += sizeof(*old) / sizeof(u64); + } + + *old = io_opts_to_rebalance_opts(opts); + } else { + if (old) + extent_entry_drop(k, (union bch_extent_entry *) old); + } + + return 0; +} + +int bch2_get_update_rebalance_opts(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct btree_iter *iter, + struct bkey_s_c k) +{ + BUG_ON(iter->flags & BTREE_ITER_is_extents); + BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); + + const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v + ? 
bch2_bkey_rebalance_opts(k) : NULL; + if (r) { +#define x(_name) \ + if (r->_name##_from_inode) { \ + io_opts->_name = r->_name; \ + io_opts->_name##_from_inode = true; \ + } + BCH_REBALANCE_OPTS() +#undef x + } + + if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k)) + return 0; + + struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(n, k); + + /* On successfull transaction commit, @k was invalidated: */ + + return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; +} + #define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) static const char * const bch2_rebalance_state_strs[] = { diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 606c88f49f7f..0a0821ab895d 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -6,52 +6,12 @@ #include "disk_groups.h" #include "rebalance_types.h" -static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s_c k, - struct bkey_ptrs_c ptrs) -{ - if (!opts->background_compression) - return 0; - - unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned ptr_bit = 1; - unsigned rewrite_ptrs = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) - return 0; - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - - return rewrite_ptrs; -} - -static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_ptrs_c ptrs) -{ - if (!opts->background_target || - !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) - return 0; - - unsigned ptr_bit = 1; - unsigned rewrite_ptrs = 0; - - bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - - return rewrite_ptrs; -} +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); +int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); +int bch2_get_update_rebalance_opts(struct btree_trans *, + struct bch_io_opts *, + struct btree_iter *, + struct bkey_s_c); int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h new file mode 100644 index 000000000000..ff9a1342a22b --- /dev/null +++ b/fs/bcachefs/rebalance_format.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REBALANCE_FORMAT_H +#define _BCACHEFS_REBALANCE_FORMAT_H + +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:3, + + promote_target_from_inode:1, + erasure_code_from_inode:1, + data_checksum_from_inode:1, + background_compression_from_inode:1, + data_replicas_from_inode:1, + background_target_from_inode:1, + + promote_target:16, + erasure_code:1, + data_checksum:4, + data_replicas:4, + background_compression:8, /* enum bch_compression_opt */ + background_target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 background_target:16, + 
background_compression:8, + data_replicas:4, + data_checksum:4, + erasure_code:1, + promote_target:16, + + background_target_from_inode:1, + data_replicas_from_inode:1, + background_compression_from_inode:1, + data_checksum_from_inode:1, + erasure_code_from_inode:1, + promote_target_from_inode:1, + + unused:3, + type:6; +#endif +}; + +/* subset of BCH_INODE_OPTS */ +#define BCH_REBALANCE_OPTS() \ + x(data_checksum) \ + x(background_compression) \ + x(data_replicas) \ + x(promote_target) \ + x(background_target) \ + x(erasure_code) + +#endif /* _BCACHEFS_REBALANCE_FORMAT_H */ + -- cgit v1.2.3 From d3d8ec90babcaf8b925cd63c668573abb423e715 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 29 Oct 2024 20:53:50 +0800 Subject: bcachefs: remove write permission for gc_gens_pos sysfs interface The gc_gens_pos is used to show the status of bucket gen gc. There is no need to assign write permissions for this attribute. Here we can use read_attribute helper to define this attribute. ``` [Before] $ ll internal/gc_gens_pos -rw-r--r-- 1 root root 4096 Oct 28 15:27 internal/gc_gens_pos [After] $ ll internal/gc_gens_pos -r--r--r-- 1 root root 4096 Oct 28 17:27 internal/gc_gens_pos ``` Fixes: ac516d0e7db7 ("bcachefs: Add the status of bucket gen gc to sysfs") Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 4ab0ccba2ab5..47ac8d5ab562 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -146,7 +146,7 @@ write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); write_attribute(trigger_btree_key_cache_shrink); write_attribute(trigger_freelist_wakeup); -rw_attribute(gc_gens_pos); +read_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); -- cgit v1.2.3 From 32e573c362db3d15b8046d10cd194c314adf7b82 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 29 Oct 2024 20:54:08 +0800 Subject: bcachefs: use attribute define helper for sysfs attribute The sysfs attribute definition has been wrapped into macro: rw_attribute, read_attribute and write_attribute, we can use these helpers to uniform the attribute definition. 
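Concretely, per the diff below, the one remaining open-coded attribute is replaced by the existing read_attribute() helper (the macro's body is defined elsewhere in sysfs.c and is not shown in this series, so its exact expansion is assumed to yield a read-only attribute named sysfs_state):

```
/* before: hand-rolled attribute, read-only despite the _rw name */
static struct attribute sysfs_state_rw = {
	.name = "state",
	.mode = 0444,
};

/* after: same read-only attribute via the helper, referenced as &sysfs_state */
read_attribute(state);
```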
Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 47ac8d5ab562..97733c766948 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -211,6 +211,7 @@ BCH_PERSISTENT_COUNTERS() #undef x rw_attribute(discard); +read_attribute(state); rw_attribute(label); read_attribute(copy_gc_wait); @@ -235,11 +236,6 @@ write_attribute(perf_test); BCH_TIME_STATS() #undef x -static struct attribute sysfs_state_rw = { - .name = "state", - .mode = 0444, -}; - static size_t bch2_btree_cache_size(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; @@ -774,7 +770,7 @@ SHOW(bch2_dev) prt_char(out, '\n'); } - if (attr == &sysfs_state_rw) { + if (attr == &sysfs_state) { prt_string_option(out, bch2_member_states, ca->mi.state); prt_char(out, '\n'); } @@ -854,7 +850,7 @@ struct attribute *bch2_dev_files[] = { /* settings: */ &sysfs_discard, - &sysfs_state_rw, + &sysfs_state, &sysfs_label, &sysfs_has_data, -- cgit v1.2.3 From cc944fbe06d8e7b1098d42b9b824272dad5cea44 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2024 03:35:41 -0400 Subject: bcachefs: Add assert for use of journal replay keys for updates The journal replay keys mechanism can only be used for updates in early recovery, when still single threaded. Add some asserts to make sure we never accidentally use it elsewhere. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 6 ++++++ fs/bcachefs/btree_trans_commit.c | 2 ++ fs/bcachefs/super.c | 5 +++++ 3 files changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e1ab67c533f0..c59a58b93a92 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -743,6 +743,12 @@ struct bch_fs { #else struct percpu_ref writes; #endif + /* + * Certain operations are only allowed in single threaded mode, during + * recovery, and we want to assert that this is the case: + */ + struct task_struct *recovery_task; + /* * Analagous to c->writes, for asynchronous ops that don't necessarily * need fs to be read-write diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index b47f11881fe4..529a5a19ab8a 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -999,6 +999,8 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) { struct bch_fs *c = trans->c; + BUG_ON(current != c->recovery_task); + trans_for_each_update(trans, i) { int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); if (ret) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7e2431de3a94..7e0ff17a6dbb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -441,6 +441,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) { int ret; + BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); + if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); return -BCH_ERR_erofs_unfixed_errors; @@ -1031,9 +1033,12 @@ int bch2_fs_start(struct bch_fs *c) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + c->recovery_task = current; ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ? 
bch2_fs_recovery(c) : bch2_fs_initialize(c); + c->recovery_task = NULL; + if (ret) goto err; -- cgit v1.2.3 From a34b026482125b8170dae3d059120c0575ff6893 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2024 03:39:32 -0400 Subject: bcachefs: Kill BCH_TRANS_COMMIT_lazy_rw We unconditionally go read-write, if we're going to do so, before journal replay: lazy_rw is obsolete. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_trans_commit.c | 31 +++++-------------------------- fs/bcachefs/btree_update.c | 3 +-- fs/bcachefs/btree_update.h | 1 - fs/bcachefs/lru.c | 2 +- fs/bcachefs/rebalance.c | 3 +-- fs/bcachefs/snapshot.c | 8 ++++++-- fs/bcachefs/subvolume.c | 2 +- fs/bcachefs/super.h | 10 ---------- 9 files changed, 16 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 3c4e66da1ca4..833d743dee0c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -908,7 +908,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c) POS(ca->dev_idx, ca->mi.first_bucket), POS(ca->dev_idx, ca->mi.nbuckets - 1), BTREE_ITER_slots|BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_alloc_write_key(trans, &iter, ca, k))); if (ret) { bch2_dev_put(ca); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 529a5a19ab8a..3aca746d08f6 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -971,24 +971,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, return ret; } -static noinline int -bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) -{ - struct bch_fs *c = trans->c; - int ret; - - if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) || - test_bit(BCH_FS_started, &c->flags)) - return -BCH_ERR_erofs_trans_commit; - - ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); - if (ret) - return ret; - - bch2_write_ref_get(c, BCH_WRITE_REF_trans); - return 0; -} - /* * This is for updates done in the early part of fsck - btree_gc - before we've * gone RW. 
we only add the new key to the list of keys for journal replay to @@ -1037,16 +1019,13 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (ret) goto out_reset; - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { - ret = do_bch2_trans_commit_to_journal_replay(trans); - goto out_reset; - } - if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { - ret = bch2_trans_commit_get_rw_cold(trans, flags); - if (ret) - goto out_reset; + if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) + ret = do_bch2_trans_commit_to_journal_replay(trans); + else + ret = -BCH_ERR_erofs_trans_commit; + goto out_reset; } EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 79a274dcd17b..a9a29fba4902 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -865,8 +865,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, memcpy(l->d, buf.buf, buf.pos); c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { - ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_TRANS_COMMIT_lazy_rw|commit_flags, + ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, __bch2_trans_log_msg(trans, &buf, u64s)); } err: diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 7e71c4d1111d..3bc57d43aa83 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -24,7 +24,6 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, #define BCH_TRANS_COMMIT_FLAGS() \ x(no_enospc, "don't check for enospc") \ x(no_check_rw, "don't attempt to take a ref on c->writes") \ - x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \ x(no_journal_res, "don't take a journal reservation, instead " \ "pin journal entry referred to by trans->journal_res.seq") \ x(journal_reclaim, "operation required for journal reclaim; may return error" \ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 10857eccdeaf..c18242748ca3 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -192,7 +192,7 @@ int bch2_check_lrus(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_lru_key(trans, &iter, k, &last_flushed))); bch2_bkey_buf_exit(&last_flushed, c); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index d1b580e76ba4..4adc74cd3f70 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -257,8 +257,7 @@ err: int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) { int ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_lazy_rw, + BCH_TRANS_COMMIT_no_enospc, bch2_set_rebalance_needs_scan_trans(trans, inum)); rebalance_wakeup(c); return ret; diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 34e01bd8127f..6a52090485dc 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1733,8 +1733,12 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) void bch2_delete_dead_snapshots_async(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && - !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots)) + return; + + BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); + + if 
(!queue_work(c->write_ref_wq, &c->snapshot_delete_work)) bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 80e5efaff524..cb45ef769c54 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -675,7 +675,7 @@ err: /* set bi_subvol on root inode */ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) { - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_fs_upgrade_for_subvolumes(trans)); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index dada09331d2e..fa6d52216510 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -34,16 +34,6 @@ void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); -/* - * Only for use in the recovery/fsck path: - */ -static inline void bch2_fs_lazy_rw(struct bch_fs *c) -{ - if (!test_bit(BCH_FS_rw, &c->flags) && - !test_bit(BCH_FS_was_rw, &c->flags)) - bch2_fs_read_write_early(c); -} - void __bch2_fs_stop(struct bch_fs *); void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); -- cgit v1.2.3 From b09b34499c43197a2578f179ffbcfdd5a5bc85b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2024 00:25:36 -0400 Subject: bcachefs: Improved check_topology() assert On interior btree node updates, we always verify that we're not introducing topology errors: child nodes should exactly span the range of the parent node. single_device.ktest small_nodes has been popping this assert: change it to give us more information. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d62de3f79b29..865c4724d550 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1418,15 +1418,26 @@ bch2_btree_insert_keys_interior(struct btree_update *as, (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) ; - while (!bch2_keylist_empty(keys)) { - insert = bch2_keylist_front(keys); + for (; + insert != keys->top && bpos_le(insert->k.p, b->key.k.p); + insert = bkey_next(insert)) + bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); - if (bpos_gt(insert->k.p, b->key.k.p)) - break; + if (bch2_btree_node_check_topology(trans, b)) { + struct printbuf buf = PRINTBUF; - bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); - bch2_keylist_pop_front(keys); + for (struct bkey_i *k = keys->keys; + k != insert; + k = bkey_next(k)) { + bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); + prt_newline(&buf); + } + + panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf); } + + memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data); + keys->top_p -= insert->_data - keys->keys_p; } static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) @@ -1575,8 +1586,6 @@ static void btree_split_insert_keys(struct btree_update *as, bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); - - BUG_ON(bch2_btree_node_check_topology(trans, b)); } } @@ -1827,8 +1836,6 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t 
btree_update_updated_node(as, b); bch2_btree_node_unlock_write(trans, path, b); - - BUG_ON(bch2_btree_node_check_topology(trans, b)); return 0; split: /* -- cgit v1.2.3 From 69785001c669e6e8681efdc3e49afee9f6a38559 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Nov 2024 22:00:05 -0500 Subject: bcachefs: Fix unhandled transaction restart in evacuate_bucket() Generally, releasing a transaction within a transaction restart means an unhandled transaction restart: but this can happen legitimately within the move code, e.g. when bch2_move_ratelimit() tells us to exit before we've retried. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index d6e68265e039..a6b503278519 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -197,6 +197,13 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) list_del(&ctxt->list); mutex_unlock(&c->moving_context_lock); + /* + * Generally, releasing a transaction within a transaction restart means + * an unhandled transaction restart: but this can happen legitimately + * within the move code, e.g. when bch2_move_ratelimit() tells us to + * exit before we've retried + */ + bch2_trans_begin(ctxt->trans); bch2_trans_put(ctxt->trans); memset(ctxt, 0, sizeof(*ctxt)); } -- cgit v1.2.3 From 2434fc38ef62731f1d9b8684625a71385112805f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Oct 2024 23:33:57 -0400 Subject: bcachefs: Assert we're not in a restart in bch2_trans_put() This always indicates a transaction restart handling bug Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 07bce85dafaf..98375c66021a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3261,6 +3261,9 @@ void bch2_trans_put(struct btree_trans *trans) { struct bch_fs *c = trans->c; + if (trans->restarted) + bch2_trans_in_restart_error(trans); + bch2_trans_unlock(trans); trans_for_each_update(trans, i) -- cgit v1.2.3 From a71a1fac904d89fe6a5a1d407a85de0b078f1dee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Oct 2024 23:52:51 -0400 Subject: bcachefs: Better in_restart error We're ramping up on checking transaction restart handling correctness - so, in debug mode we now save a backtrace for where the restart was emitted, which makes it much easier to track down the incorrect handling. 
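
The contract these assertions enforce shows up most clearly in the standard retry pattern: a transaction restart must be consumed by bch2_trans_begin(), which clears trans->restarted, before the transaction is used again or dropped. A minimal illustrative sketch, where do_update() is a placeholder for any transactional btree operation:

```
/* Illustrative retry loop; do_update() is a placeholder, not a real helper */
static int update_with_retries(struct btree_trans *trans)
{
	int ret;

	do {
		bch2_trans_begin(trans);	/* consumes/clears trans->restarted */
		ret = do_update(trans);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

	/*
	 * Only once no restart is pending is it valid to commit further
	 * updates or call bch2_trans_put(); otherwise the new asserts fire
	 * and (in debug builds) print the saved backtrace.
	 */
	return ret;
}
```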
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 12 ++++++++++++ fs/bcachefs/btree_iter.h | 4 ++++ fs/bcachefs/btree_types.h | 3 +++ 3 files changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 98375c66021a..acf70aaf2fd2 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1427,9 +1427,17 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) { +#ifdef CONFIG_BCACHEFS_DEBUG + struct printbuf buf = PRINTBUF; + bch2_prt_backtrace(&buf, &trans->last_restarted_trace); + panic("in transaction restart: %s, last restarted by\n%s", + bch2_err_str(trans->restarted), + buf.buf); +#else panic("in transaction restart: %s, last restarted by %pS\n", bch2_err_str(trans->restarted), (void *) trans->last_restarted_ip); +#endif } void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans) @@ -3287,6 +3295,10 @@ void bch2_trans_put(struct btree_trans *trans) closure_return_sync(&trans->ref); trans->locking_wait.task = NULL; +#ifdef CONFIG_BCACHEFS_DEBUG + darray_exit(&trans->last_restarted_trace); +#endif + unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; trans->paths = NULL; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index dda07a320488..36899c6b134e 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -350,6 +350,10 @@ static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned l trans->restarted = err; trans->last_restarted_ip = ip; +#ifdef CONFIG_BCACHEFS_DEBUG + darray_exit(&trans->last_restarted_trace); + bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); +#endif return -err; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4568a41fefaf..baab5288ecc9 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -513,6 +513,9 @@ struct btree_trans { u64 last_begin_time; unsigned long last_begin_ip; unsigned long last_restarted_ip; +#ifdef CONFIG_BCACHEFS_DEBUG + bch_stacktrace last_restarted_trace; +#endif unsigned long last_unlock_ip; unsigned long srcu_lock_time; -- cgit v1.2.3 From b318882022a8ab67e9b1682bed52366072592fa7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 27 Oct 2024 19:32:40 -0400 Subject: bcachefs: bch2_trans_verify_not_unlocked_or_in_restart() Fold two asserts into one. 
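
A note on the assert flavors appearing throughout these diffs: BUG_ON() is always compiled in, while EBUG_ON() only exists in debug builds, so hot-path sanity checks stay free in production kernels. Roughly, as a simplified sketch (the real definition is in fs/bcachefs/bcachefs.h and may differ):

```
/* Sketch of the debug-only assert convention (simplified, not exact source) */
#ifdef CONFIG_BCACHEFS_DEBUG
#define EBUG_ON(cond)		BUG_ON(cond)
#else
#define EBUG_ON(cond)		do {} while (0)
#endif
```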
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 36 ++++++++++++++++++++---------------- fs/bcachefs/btree_iter.h | 20 ++++++-------------- fs/bcachefs/btree_locking.h | 2 +- fs/bcachefs/btree_trans_commit.c | 9 +++------ fs/bcachefs/btree_update_interior.c | 3 +-- fs/bcachefs/btree_update_interior.h | 2 +- 6 files changed, 32 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index acf70aaf2fd2..1efc77fc9abf 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -327,7 +327,7 @@ out: void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos) { - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); struct btree_path *path; struct trans_for_each_path_inorder_iter iter; @@ -1265,7 +1265,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, { int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); EBUG_ON(!trans->paths[path_idx].ref); trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos); @@ -1425,7 +1425,7 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_ (void *) trans->last_begin_ip); } -void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) +static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) { #ifdef CONFIG_BCACHEFS_DEBUG struct printbuf buf = PRINTBUF; @@ -1440,10 +1440,16 @@ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) #endif } -void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans) +void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans) { - panic("trans should be locked, unlocked by %pS\n", - (void *) trans->last_unlock_ip); + if (trans->restarted) + bch2_trans_in_restart_error(trans); + + if (!trans->locked) + panic("trans should be locked, unlocked by %pS\n", + (void *) trans->last_unlock_ip); + + BUG(); } noinline __cold @@ -1724,8 +1730,7 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, struct trans_for_each_path_inorder_iter iter; btree_path_idx_t path_pos = 0, path_idx; - bch2_trans_verify_not_unlocked(trans); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_trans_verify_locks(trans); btree_trans_sort_paths(trans); @@ -1877,7 +1882,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) struct btree_trans *trans = iter->trans; int ret; - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); iter->path = bch2_btree_path_set_pos(trans, iter->path, btree_iter_search_key(iter), @@ -1952,7 +1957,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) int ret; EBUG_ON(trans->paths[iter->path].cached); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_btree_iter_verify(iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2161,8 +2166,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos struct bkey_s_c k; int ret; - bch2_trans_verify_not_in_restart(trans); - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if ((iter->flags & BTREE_ITER_key_cache_fill) && bpos_eq(iter->pos, pos)) @@ -2302,7 +2306,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e struct bpos iter_pos; 
int ret; - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); if (iter->update_path) { @@ -2475,7 +2479,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) btree_path_idx_t saved_path = 0; int ret; - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); EBUG_ON(btree_iter_path(trans, iter)->cached || btree_iter_path(trans, iter)->level); @@ -2614,7 +2618,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); @@ -3136,7 +3140,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->notrace_relock_fail = false; } - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); return trans->restart_count; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 36899c6b134e..6b1c46e95432 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -236,12 +236,12 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *, btree_path_idx_t, unsigned, unsigned long); -static inline void bch2_trans_verify_not_unlocked(struct btree_trans *); +static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *); static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) { - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) return 0; @@ -326,20 +326,12 @@ static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, bch2_trans_restart_error(trans, restart_count); } -void __noreturn bch2_trans_in_restart_error(struct btree_trans *); +void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *); -static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) +static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans) { - if (trans->restarted) - bch2_trans_in_restart_error(trans); -} - -void __noreturn bch2_trans_unlocked_error(struct btree_trans *); - -static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans) -{ - if (!trans->locked) - bch2_trans_unlocked_error(trans); + if (trans->restarted || !trans->locked) + bch2_trans_unlocked_or_in_restart_error(trans); } __always_inline diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 7c07f9fa9add..ca4aeefd631e 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -282,7 +282,7 @@ static inline int btree_node_lock(struct btree_trans *trans, int ret = 0; EBUG_ON(level >= BTREE_MAX_DEPTH); - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 3aca746d08f6..cf313477567a 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -619,8 +619,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, 
unsigned flags, unsigned u64s = 0; int ret = 0; - bch2_trans_verify_not_unlocked(trans); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (race_fault()) { trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); @@ -1008,8 +1007,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; int ret = 0; - bch2_trans_verify_not_unlocked(trans); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (!trans->nr_updates && !trans->journal_entries_u64s) @@ -1070,8 +1068,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) } retry: errored_at = NULL; - bch2_trans_verify_not_unlocked(trans); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) memset(&trans->journal_res, 0, sizeof(trans->journal_res)); memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 865c4724d550..c11babe31f54 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1960,8 +1960,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; - bch2_trans_verify_not_in_restart(trans); - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 10f400957f21..1c6cf3e2e6a9 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -159,7 +159,7 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, unsigned level, unsigned flags) { - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, btree_prev_sib) ?: -- cgit v1.2.3 From 65b14fa3d83588d55441384ea1c0b3eacfac0a1b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Oct 2024 22:31:20 -0400 Subject: bcachefs: Assert that we're not violating key cache coherency rules We're not allowed to have a dirty key in the key cache if the key doesn't exist at all in the btree - creation has to bypass the key cache, so that iteration over the btree can check if the key is present in the key cache. Things break in subtle ways if cache coherency is broken, so this needs an assert. 
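
Restated as code, the coherency rule is: every dirty key in the key cache must be backed by a live key at the same position in the btree, which is why key creation bypasses the cache entirely. A hedged sketch of what such a check looks like; the helper name and structure here are illustrative, not the exact code added below:

```
/*
 * Illustrative coherency check: a dirty cached key must have a live
 * backing key in the btree.  Simplified sketch, not the exact hunk
 * added in this commit (see the diff below for that).
 */
static void assert_key_cache_coherent(struct btree_trans *trans,
				      struct bkey_cached *ck)
{
	struct btree_iter iter;

	bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
			     BTREE_ITER_slots);

	struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);

	BUG_ON(bkey_err(k));		/* the lookup itself must succeed */
	BUG_ON(bkey_deleted(k.k));	/* dirty cached key with no backing key */

	bch2_trans_iter_exit(trans, &iter);
}
```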
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 244610b1d0b5..3bd40ea0fa3d 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -424,8 +424,15 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, !test_bit(JOURNAL_space_low, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - ret = bch2_btree_iter_traverse(&b_iter) ?: - bch2_trans_update(trans, &b_iter, ck->k, + struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); + ret = bkey_err(btree_k); + if (ret) + goto err; + + /* * Check that we're not violating cache coherency rules: */ + BUG_ON(bkey_deleted(btree_k.k)); + + ret = bch2_trans_update(trans, &b_iter, ck->k, BTREE_UPDATE_key_cache_reclaim| BTREE_UPDATE_internal_snapshot_node| BTREE_TRIGGER_norun) ?: @@ -433,7 +440,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| commit_flags); - +err: bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && -- cgit v1.2.3 From 000fe8d573c4f287ed0aceb328f0a1a7698790c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2024 18:39:59 -0400 Subject: bcachefs: Rename btree_iter_peek_upto() -> btree_iter_peek_max() We'll be introducing btree_iter_peek_prev_min(), so rename for consistency. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 6 +++--- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_iter.c | 10 +++++----- fs/bcachefs/btree_iter.h | 36 ++++++++++++++++++------------------ fs/bcachefs/btree_journal_iter.c | 4 ++-- fs/bcachefs/btree_journal_iter.h | 2 +- fs/bcachefs/btree_update.c | 6 +++--- fs/bcachefs/dirent.c | 4 ++-- fs/bcachefs/ec.c | 2 +- fs/bcachefs/extent_update.c | 2 +- fs/bcachefs/fs-io-pagecache.c | 2 +- fs/bcachefs/fs-io.c | 8 ++++---- fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 8 ++++---- fs/bcachefs/inode.c | 6 +++--- fs/bcachefs/io_misc.c | 6 +++--- fs/bcachefs/io_write.c | 4 ++-- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/reflink.c | 2 +- fs/bcachefs/str_hash.h | 6 +++--- fs/bcachefs/subvolume.h | 12 ++++++------ fs/bcachefs/tests.c | 26 +++++++++++++------------- fs/bcachefs/xattr.c | 2 +- 23 files changed, 80 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c84a91572a1d..af791f4dab99 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1045,7 +1045,7 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos * btree node min/max is a closed interval, upto takes a half * open interval: */ - k = bch2_btree_iter_peek_upto(&iter2, end); + k = bch2_btree_iter_peek_max(&iter2, end); next = iter2.pos; bch2_trans_iter_exit(iter->trans, &iter2); @@ -1886,7 +1886,7 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, + for_each_btree_key_max(trans, iter, BTREE_ID_need_discard, POS(ca->dev_idx, 0), POS(ca->dev_idx, U64_MAX), 0, k, @@ -2101,7 +2101,7 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter { struct bkey_s_c k; again: - k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + k = 
bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); if (!k.k && !*wrapped) { bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); *wrapped = true; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 833d743dee0c..e45cf32a6403 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -904,7 +904,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c) for_each_member_device(c, ca) { ret = bch2_trans_run(c, - for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc, + for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, ca->mi.first_bucket), POS(ca->dev_idx, ca->mi.nbuckets - 1), BTREE_ITER_slots|BTREE_ITER_prefetch, k, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1efc77fc9abf..21cadc98bdae 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2113,7 +2113,7 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, { struct btree_path *path = btree_iter_path(trans, iter); - return bch2_journal_keys_peek_upto(trans->c, iter->btree_id, + return bch2_journal_keys_peek_max(trans->c, iter->btree_id, path->level, path->pos, end_pos, @@ -2291,14 +2291,14 @@ out: } /** - * bch2_btree_iter_peek_upto() - returns first key greater than or equal to + * bch2_btree_iter_peek_max() - returns first key greater than or equal to * iterator's current position * @iter: iterator to peek from * @end: search limit: returns keys less than or equal to @end * * Returns: key if found, or an error extractable with bkey_err(). */ -struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end) { struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); @@ -2682,7 +2682,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct btree_iter iter2; bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek_upto(&iter2, end); + k = bch2_btree_iter_peek_max(&iter2, end); if (k.k && !bkey_err(k)) { swap(iter->key_cache_path, iter2.key_cache_path); @@ -2693,7 +2693,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } else { struct bpos pos = iter->pos; - k = bch2_btree_iter_peek_upto(iter, end); + k = bch2_btree_iter_peek_max(iter, end); if (unlikely(bkey_err(k))) bch2_btree_iter_set_pos(iter, pos); else diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 6b1c46e95432..cd9022ce15a5 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -381,12 +381,12 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *); struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { - return bch2_btree_iter_peek_upto(iter, SPOS_MAX); + return bch2_btree_iter_peek_max(iter, SPOS_MAX); } struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); @@ -672,12 +672,12 @@ static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, bch2_btree_iter_peek(iter); } -static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, +static inline struct bkey_s_c 
bch2_btree_iter_peek_max_type(struct btree_iter *iter, struct bpos end, unsigned flags) { if (!(flags & BTREE_ITER_slots)) - return bch2_btree_iter_peek_upto(iter, end); + return bch2_btree_iter_peek_max(iter, end); if (bkey_gt(iter->pos, end)) return bkey_s_c_null; @@ -741,7 +741,7 @@ transaction_restart: \ _ret2 ?: trans_was_restarted(_trans, _restart_count); \ }) -#define for_each_btree_key_upto_continue(_trans, _iter, \ +#define for_each_btree_key_max_continue(_trans, _iter, \ _end, _flags, _k, _do) \ ({ \ struct bkey_s_c _k; \ @@ -749,7 +749,7 @@ transaction_restart: \ \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), \ _end, (_flags)); \ if (!(_k).k) \ break; \ @@ -763,9 +763,9 @@ transaction_restart: \ }) #define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ - for_each_btree_key_upto_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) + for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) -#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ +#define for_each_btree_key_max(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _do) \ ({ \ bch2_trans_begin(trans); \ @@ -774,12 +774,12 @@ transaction_restart: \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - for_each_btree_key_upto_continue(_trans, _iter, _end, _flags, _k, _do);\ + for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\ }) #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ - for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \ + for_each_btree_key_max(_trans, _iter, _btree_id, _start, \ SPOS_MAX, _flags, _k, _do) #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ @@ -823,33 +823,33 @@ transaction_restart: \ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) -#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ +#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \ _start, _end, _iter_flags, _k, \ _disk_res, _journal_seq, _commit_flags,\ _do) \ - for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ + for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); -#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ +#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) -#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ +#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\ for (; \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) #define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ - for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ + for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ 
SPOS_MAX, _flags, _k, _ret) #define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ @@ -861,7 +861,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); bch2_btree_iter_rewind(&(_iter))) #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ - for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) + for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) /* * This should not be used in a fastpath, without first trying _do in diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 924b5e3a4390..c9dee4b4627a 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -61,7 +61,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys, } /* Returns first non-overwritten key >= search key: */ -struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, +struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bpos pos, struct bpos end_pos, size_t *idx) { @@ -112,7 +112,7 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree { size_t idx = 0; - return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); + return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx); } static void journal_iter_verify(struct journal_iter *iter) diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 1653de9d609b..754939f604d5 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -43,7 +43,7 @@ static inline int journal_key_cmp(const struct journal_key *l, const struct jour return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); } -struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, +struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index a9a29fba4902..6afd77c68411 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -296,7 +296,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, BTREE_ITER_intent| BTREE_ITER_with_updates| BTREE_ITER_not_extents); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -323,7 +323,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, goto out; next: bch2_btree_iter_advance(&iter); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -721,7 +721,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, int ret = 0; bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); - while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { + while ((k = bch2_btree_iter_peek_max(&iter, end)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index faffc98d5605..4c22f78b0484 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -500,7 +500,7 @@ int bch2_empty_dir_snapshot(struct btree_trans 
*trans, u64 dir, u32 subvol, u32 struct bkey_s_c k; int ret; - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, SPOS(dir, 0, snapshot), POS(dir, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { @@ -549,7 +549,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) bch2_bkey_buf_init(&sk); int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents, POS(inum.inum, ctx->pos), POS(inum.inum, U64_MAX), inum.subvol, 0, k, ({ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index b46bf00c4a67..5c404f24bddc 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -2299,7 +2299,7 @@ err: int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) { return bch2_trans_run(c, - for_each_btree_key_upto_commit(trans, iter, + for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), BTREE_ITER_intent, k, NULL, NULL, 0, ({ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 5f4fecb358da..45c87c019f6b 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -128,7 +128,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, bch2_trans_copy_iter(©, iter); - for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) { + for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) { unsigned offset = 0; if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 1d4910ea0f1d..51a499c5a7b6 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -199,7 +199,7 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum, unsigned folio_idx = 0; return bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inum.inum, offset), POS(inum.inum, U64_MAX), inum.subvol, BTREE_ITER_slots, k, ({ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 2456c41b215e..0021db191480 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -222,7 +222,7 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, struct bpos end) { return bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, subvol, 0, k, ({ bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); }))); @@ -806,7 +806,7 @@ static int quota_reserve_range(struct bch_inode_info *inode, u64 sectors = end - start; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, start), POS(inode->v.i_ino, end - 1), @@ -922,7 +922,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return -ENXIO; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), POS(inode->v.i_ino, U64_MAX), inum.subvol, 0, k, ({ @@ -958,7 +958,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) return -ENXIO; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + 
for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), POS(inode->v.i_ino, U64_MAX), inum.subvol, BTREE_ITER_slots, k, ({ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index e0ffe4648bb8..91fce04272a1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1294,7 +1294,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_btree_iter_set_snapshot(&iter, snapshot); - k = bch2_btree_iter_peek_upto(&iter, end); + k = bch2_btree_iter_peek_max(&iter, end); ret = bkey_err(k); if (ret) continue; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c96025b8b65d..2229f0dcc860 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -73,7 +73,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, { u64 sectors = 0; - int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(inum, 0, snapshot), POS(inum, U64_MAX), 0, k, ({ @@ -90,7 +90,7 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, { u64 subdirs = 0; - int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents, SPOS(inum, 0, snapshot), POS(inum, U64_MAX), 0, k, ({ @@ -1751,7 +1751,7 @@ static int overlapping_extents_found(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter1, btree, pos1, BTREE_ITER_all_snapshots| BTREE_ITER_not_extents); - k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); + k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX)); ret = bkey_err(k1); if (ret) goto err; @@ -1776,7 +1776,7 @@ static int overlapping_extents_found(struct btree_trans *trans, while (1) { bch2_btree_iter_advance(&iter2); - k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); + k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX)); ret = bkey_err(k2); if (ret) goto err; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 5dd9d3edae77..5c603ab66be0 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -618,7 +618,7 @@ bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter struct bkey_s_c k; int ret = 0; - for_each_btree_key_upto_norestart(trans, *iter, btree, + for_each_btree_key_max_norestart(trans, *iter, btree, bpos_successor(pos), SPOS(pos.inode, pos.offset, U32_MAX), flags|BTREE_ITER_all_snapshots, k, ret) @@ -653,7 +653,7 @@ int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) struct bkey_s_c k; int ret = 0; - for_each_btree_key_upto_norestart(trans, iter, + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), BTREE_ITER_all_snapshots| BTREE_ITER_with_updates, k, ret) @@ -967,7 +967,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bch2_btree_iter_set_snapshot(&iter, snapshot); - k = bch2_btree_iter_peek_upto(&iter, end); + k = bch2_btree_iter_peek_max(&iter, end); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index e2acf21ac9b0..ff661a072000 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -164,9 +164,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_btree_iter_set_snapshot(iter, snapshot); /* - * peek_upto() doesn't have ideal semantics for extents: + * peek_max() doesn't have ideal semantics for extents: */ - k = bch2_btree_iter_peek_upto(iter, end_pos); + k = 
bch2_btree_iter_peek_max(iter, end_pos); if (!k.k) break; @@ -427,7 +427,7 @@ case LOGGED_OP_FINSERT_shift_extents: k = insert ? bch2_btree_iter_peek_prev(&iter) - : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); + : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); if ((ret = bkey_err(k))) goto btree_err; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index f2f69e5e0910..f11e11279f01 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -164,7 +164,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, bch2_trans_copy_iter(&iter, extent_iter); - for_each_btree_key_upto_continue_norestart(iter, + for_each_btree_key_max_continue_norestart(iter, new->k.p, BTREE_ITER_slots, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), @@ -1165,7 +1165,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) struct btree_trans *trans = bch2_trans_get(c); for_each_keylist_key(&op->insert_keys, orig) { - int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 725292d69fd6..85c361e78ba5 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -167,7 +167,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, bch2_trans_begin(trans); - ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, + ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 8a36ebd9dd9c..96cf50f4705d 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -409,7 +409,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) struct bkey_s_c k; int ret; - for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) { + for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) { if (bkey_extent_is_unwritten(k)) continue; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index ec2b1feea520..00c785055d22 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -160,7 +160,7 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, struct bkey_s_c k; int ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), BTREE_ITER_slots|flags, k, ret) { @@ -210,7 +210,7 @@ bch2_hash_hole(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), BTREE_ITER_slots|BTREE_ITER_intent, k, ret) @@ -265,7 +265,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, bool found = false; int ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, SPOS(insert->k.p.inode, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index f897d106e142..07b23dc08614 100644 --- a/fs/bcachefs/subvolume.h +++ 
b/fs/bcachefs/subvolume.h @@ -34,7 +34,7 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32); int bch2_subvol_is_ro(struct bch_fs *, u32); static inline struct bkey_s_c -bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos end, +bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end, u32 subvolid, unsigned flags) { u32 snapshot; @@ -43,10 +43,10 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos return bkey_s_c_err(ret); bch2_btree_iter_set_snapshot(iter, snapshot); - return bch2_btree_iter_peek_upto_type(iter, end, flags); + return bch2_btree_iter_peek_max_type(iter, end, flags); } -#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ +#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ _end, _subvolid, _flags, _k, _do) \ ({ \ struct bkey_s_c _k; \ @@ -54,7 +54,7 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \ _end, _subvolid, (_flags)); \ if (!(_k).k) \ break; \ @@ -67,14 +67,14 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos _ret3; \ }) -#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id, \ +#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \ _start, _end, _subvolid, _flags, _k, _do) \ ({ \ struct btree_iter _iter; \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ + for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ _end, _subvolid, _flags, _k, _do); \ }) diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index fb5c1543e52f..6c6469814637 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -131,7 +131,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i++); @@ -186,7 +186,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i); @@ -242,7 +242,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); @@ -259,7 +259,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_slots, k, ({ if (i >= nr * 2) @@ -302,7 +302,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i + 8); @@ -320,7 +320,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; ret = 
bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_slots, k, ({ if (i == nr) @@ -349,10 +349,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(trans, &iter); @@ -369,10 +369,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(trans, &iter); @@ -488,7 +488,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) trans = bch2_trans_get(c); bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k->p.snapshot != U32_MAX); @@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, BTREE_ITER_intent); - k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)); + k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)); ret = bkey_err(k); if (ret) goto err; @@ -726,7 +726,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) static int seq_lookup(struct bch_fs *c, u64 nr) { return bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, 0)); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index ed418a747cdd..820c1791545a 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -309,7 +309,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) u64 offset = 0, inum = inode->ei_inode.bi_inum; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs, POS(inum, offset), POS(inum, U64_MAX), inode->ei_inum.subvol, 0, k, ({ -- cgit v1.2.3 From db6e584b8514556894ba64b5afddbb2d2217ce62 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Oct 2024 01:48:26 -0400 Subject: bcachefs: Simplify btree_iter_peek() filter_snapshots Collapse all the BTREE_ITER_filter_snapshots handling down into a single block; btree iteration is much simpler in the !filter_snapshots case. 
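
The simplification is easiest to see as control flow: all snapshot filtering now lives under a single test of BTREE_ITER_filter_snapshots, so iterators that don't filter snapshots fall straight through the loop. A condensed sketch of the resulting shape (not the exact code; see the diff below):

```
/* Condensed shape of the peek loop after this change (illustrative) */
while (1) {
	k = __bch2_btree_iter_peek(iter, search_key);
	if (!k.k || bkey_err(k))
		break;

	if (iter->flags & BTREE_ITER_filter_snapshots) {
		/* end-of-range check and update_path handling, then: */

		if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
					       k.k->p.snapshot)) {
			search_key = bpos_successor(k.k->p);
			continue;	/* key belongs to an unrelated snapshot */
		}

		if (bkey_whiteout(k.k)) {
			search_key = bkey_successor(iter, k.k->p);
			continue;	/* whiteout: deleted in this snapshot */
		}
	}

	break;	/* visible key found (extent/end handling omitted) */
}
```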
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 129 +++++++++++++++++++++++------------------------ 1 file changed, 62 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 21cadc98bdae..580fee86a965 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1855,7 +1855,6 @@ hole: return (struct bkey_s_c) { u, NULL }; } - void bch2_set_btree_iter_dontneed(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; @@ -2212,8 +2211,6 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp bch2_btree_iter_verify(iter); while (1) { - struct btree_path_level *l; - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); @@ -2227,7 +2224,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp } struct btree_path *path = btree_iter_path(trans, iter); - l = path_l(path); + struct btree_path_level *l = path_l(path); if (unlikely(!l->b)) { /* No btree nodes at requested level: */ @@ -2303,10 +2300,11 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); struct bkey_s_c k; - struct bpos iter_pos; + struct bpos iter_pos = iter->pos; int ret; bch2_trans_verify_not_unlocked_or_in_restart(trans); + bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); if (iter->update_path) { @@ -2315,8 +2313,6 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en iter->update_path = 0; } - bch2_btree_iter_verify_entry_exit(iter); - while (1) { k = __bch2_btree_iter_peek(iter, search_key); if (unlikely(!k.k)) @@ -2324,75 +2320,74 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en if (unlikely(bkey_err(k))) goto out_no_locked; - /* - * We need to check against @end before FILTER_SNAPSHOTS because - * if we get to a different inode that requested we might be - * seeing keys for a different snapshot tree that will all be - * filtered out. - * - * But we can't do the full check here, because bkey_start_pos() - * isn't monotonically increasing before FILTER_SNAPSHOTS, and - * that's what we check against in extents mode: - */ - if (unlikely(!(iter->flags & BTREE_ITER_is_extents) - ? bkey_gt(k.k->p, end) - : k.k->p.inode > end.inode)) - goto end; + if (iter->flags & BTREE_ITER_filter_snapshots) { + /* + * We need to check against @end before FILTER_SNAPSHOTS because + * if we get to a different inode that requested we might be + * seeing keys for a different snapshot tree that will all be + * filtered out. + * + * But we can't do the full check here, because bkey_start_pos() + * isn't monotonically increasing before FILTER_SNAPSHOTS, and + * that's what we check against in extents mode: + */ + if (unlikely(!(iter->flags & BTREE_ITER_is_extents) + ? 
bkey_gt(k.k->p, end) + : k.k->p.inode > end.inode)) + goto end; + + if (iter->update_path && + !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_intent); + iter->update_path = 0; + } - if (iter->update_path && - !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { - bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); - iter->update_path = 0; - } + if ((iter->flags & BTREE_ITER_intent) && + !(iter->flags & BTREE_ITER_is_extents) && + !iter->update_path) { + struct bpos pos = k.k->p; - if ((iter->flags & BTREE_ITER_filter_snapshots) && - (iter->flags & BTREE_ITER_intent) && - !(iter->flags & BTREE_ITER_is_extents) && - !iter->update_path) { - struct bpos pos = k.k->p; + if (pos.snapshot < iter->snapshot) { + search_key = bpos_successor(k.k->p); + continue; + } - if (pos.snapshot < iter->snapshot) { - search_key = bpos_successor(k.k->p); - continue; - } + pos.snapshot = iter->snapshot; - pos.snapshot = iter->snapshot; + /* + * advance, same as on exit for iter->path, but only up + * to snapshot + */ + __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); + iter->update_path = iter->path; + + iter->update_path = bch2_btree_path_set_pos(trans, + iter->update_path, pos, + iter->flags & BTREE_ITER_intent, + _THIS_IP_); + ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + } /* - * advance, same as on exit for iter->path, but only up - * to snapshot + * We can never have a key in a leaf node at POS_MAX, so + * we don't have to check these successor() calls: */ - __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); - iter->update_path = iter->path; - - iter->update_path = bch2_btree_path_set_pos(trans, - iter->update_path, pos, - iter->flags & BTREE_ITER_intent, - _THIS_IP_); - ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); - if (unlikely(ret)) { - k = bkey_s_c_err(ret); - goto out_no_locked; + if (!bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)) { + search_key = bpos_successor(k.k->p); + continue; } - } - /* - * We can never have a key in a leaf node at POS_MAX, so - * we don't have to check these successor() calls: - */ - if ((iter->flags & BTREE_ITER_filter_snapshots) && - !bch2_snapshot_is_ancestor(trans->c, - iter->snapshot, - k.k->p.snapshot)) { - search_key = bpos_successor(k.k->p); - continue; - } - - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_all_snapshots)) { - search_key = bkey_successor(iter, k.k->p); - continue; + if (bkey_whiteout(k.k)) { + search_key = bkey_successor(iter, k.k->p); + continue; + } } /* -- cgit v1.2.3 From e69df6adf8e5d8f0b4a1a3ee6ba66d826c2e3094 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Oct 2024 22:16:19 -0400 Subject: bcachefs: Kill unnecessary iter_rewind() in bkey_get_empty_slot() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 6afd77c68411..f3d7ca3d92b9 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -588,12 +588,9 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree, struct bpos end) { - 
struct bkey_s_c k; - int ret = 0; - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent); - k = bch2_btree_iter_prev(iter); - ret = bkey_err(k); + struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); + int ret = bkey_err(k); if (ret) goto err; -- cgit v1.2.3 From e5ea05293a2ba181cdd04cd075b0483f11868f8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Nov 2024 13:13:25 -0500 Subject: bcachefs: Move fsck ioctl code to fsck.c chardev.c and fs-ioctl.c are not organized by subject; let's try to fix this. Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 219 +------------------------------------------------- fs/bcachefs/fsck.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/fsck.h | 3 + 3 files changed, 222 insertions(+), 218 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 2182b555c112..46e9e32105a9 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -6,11 +6,11 @@ #include "buckets.h" #include "chardev.h" #include "disk_accounting.h" +#include "fsck.h" #include "journal.h" #include "move.h" #include "recovery_passes.h" #include "replicas.h" -#include "super.h" #include "super-io.h" #include "thread_with_file.h" @@ -127,130 +127,6 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg } #endif -struct fsck_thread { - struct thread_with_stdio thr; - struct bch_fs *c; - struct bch_opts opts; -}; - -static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) -{ - struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); - kfree(thr); -} - -static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) -{ - struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = thr->c; - - int ret = PTR_ERR_OR_ZERO(c); - if (ret) - return ret; - - ret = bch2_fs_start(thr->c); - if (ret) - goto err; - - if (test_bit(BCH_FS_errors_fixed, &c->flags)) { - bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); - ret |= 1; - } - if (test_bit(BCH_FS_error, &c->flags)) { - bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); - ret |= 4; - } -err: - bch2_fs_stop(c); - return ret; -} - -static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { - .exit = bch2_fsck_thread_exit, - .fn = bch2_fsck_offline_thread_fn, -}; - -static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) -{ - struct bch_ioctl_fsck_offline arg; - struct fsck_thread *thr = NULL; - darray_str(devs) = {}; - long ret = 0; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - for (size_t i = 0; i < arg.nr_devs; i++) { - u64 dev_u64; - ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); - if (ret) - goto err; - - char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); - ret = PTR_ERR_OR_ZERO(dev_str); - if (ret) - goto err; - - ret = darray_push(&devs, dev_str); - if (ret) { - kfree(dev_str); - goto err; - } - } - - thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) { - ret = -ENOMEM; - goto err; - } - - thr->opts = bch2_opts_empty(); - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); - if (!IS_ERR(optstr)) - kfree(optstr); - - if (ret) - goto err; - } - - 
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - opt_set(thr->opts, read_only, 1); - opt_set(thr->opts, ratelimit_errors, 0); - - /* We need request_key() to be called before we punt to kthread: */ - opt_set(thr->opts, nostart, true); - - bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); - - thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); - - if (!IS_ERR(thr->c) && - thr->c->opts.errors == BCH_ON_ERROR_panic) - thr->c->opts.errors = BCH_ON_ERROR_ro; - - ret = __bch2_run_thread_with_stdio(&thr->thr); -out: - darray_for_each(devs, i) - kfree(*i); - darray_exit(&devs); - return ret; -err: - if (thr) - bch2_fsck_thread_exit(&thr->thr); - pr_err("ret %s", bch2_err_str(ret)); - goto out; -} - static long bch2_global_ioctl(unsigned cmd, void __user *arg) { long ret; @@ -775,99 +651,6 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } -static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) -{ - struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = thr->c; - - c->stdio_filter = current; - c->stdio = &thr->thr.stdio; - - /* - * XXX: can we figure out a way to do this without mucking with c->opts? - */ - unsigned old_fix_errors = c->opts.fix_errors; - if (opt_defined(thr->opts, fix_errors)) - c->opts.fix_errors = thr->opts.fix_errors; - else - c->opts.fix_errors = FSCK_FIX_ask; - - c->opts.fsck = true; - set_bit(BCH_FS_fsck_running, &c->flags); - - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; - int ret = bch2_run_online_recovery_passes(c); - - clear_bit(BCH_FS_fsck_running, &c->flags); - bch_err_fn(c, ret); - - c->stdio = NULL; - c->stdio_filter = NULL; - c->opts.fix_errors = old_fix_errors; - - up(&c->online_fsck_mutex); - bch2_ro_ref_put(c); - return ret; -} - -static const struct thread_with_stdio_ops bch2_online_fsck_ops = { - .exit = bch2_fsck_thread_exit, - .fn = bch2_fsck_online_thread_fn, -}; - -static long bch2_ioctl_fsck_online(struct bch_fs *c, - struct bch_ioctl_fsck_online arg) -{ - struct fsck_thread *thr = NULL; - long ret = 0; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!bch2_ro_ref_tryget(c)) - return -EROFS; - - if (down_trylock(&c->online_fsck_mutex)) { - bch2_ro_ref_put(c); - return -EAGAIN; - } - - thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) { - ret = -ENOMEM; - goto err; - } - - thr->c = c; - thr->opts = bch2_opts_empty(); - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - - ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); - if (!IS_ERR(optstr)) - kfree(optstr); - - if (ret) - goto err; - } - - ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); -err: - if (ret < 0) { - bch_err_fn(c, ret); - if (thr) - bch2_fsck_thread_exit(&thr->thr); - up(&c->online_fsck_mutex); - bch2_ro_ref_put(c); - } - return ret; -} - #define BCH_IOCTL(_name, _argtype) \ do { \ _argtype i; \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 2229f0dcc860..e0335265de3d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bcachefs_ioctl.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_update.h" @@ -16,6 +17,7 @@ #include "recovery_passes.h" #include "snapshot.h" #include "super.h" +#include "thread_with_file.h" #include "xattr.h" #include @@ -3192,3 +3194,219 @@ int bch2_fix_reflink_p(struct 
bch_fs *c) bch_err_fn(c, ret); return ret; } + +struct fsck_thread { + struct thread_with_stdio thr; + struct bch_fs *c; + struct bch_opts opts; +}; + +static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) +{ + struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); + kfree(thr); +} + +static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) +{ + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); + struct bch_fs *c = thr->c; + + int ret = PTR_ERR_OR_ZERO(c); + if (ret) + return ret; + + ret = bch2_fs_start(thr->c); + if (ret) + goto err; + + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); + ret |= 1; + } + if (test_bit(BCH_FS_error, &c->flags)) { + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); + ret |= 4; + } +err: + bch2_fs_stop(c); + return ret; +} + +static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_offline_thread_fn, +}; + +long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) +{ + struct bch_ioctl_fsck_offline arg; + struct fsck_thread *thr = NULL; + darray_str(devs) = {}; + long ret = 0; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + for (size_t i = 0; i < arg.nr_devs; i++) { + u64 dev_u64; + ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); + if (ret) + goto err; + + char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); + ret = PTR_ERR_OR_ZERO(dev_str); + if (ret) + goto err; + + ret = darray_push(&devs, dev_str); + if (ret) { + kfree(dev_str); + goto err; + } + } + + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; + } + + thr->opts = bch2_opts_empty(); + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); + if (!IS_ERR(optstr)) + kfree(optstr); + + if (ret) + goto err; + } + + opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); + opt_set(thr->opts, read_only, 1); + opt_set(thr->opts, ratelimit_errors, 0); + + /* We need request_key() to be called before we punt to kthread: */ + opt_set(thr->opts, nostart, true); + + bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); + + thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + + if (!IS_ERR(thr->c) && + thr->c->opts.errors == BCH_ON_ERROR_panic) + thr->c->opts.errors = BCH_ON_ERROR_ro; + + ret = __bch2_run_thread_with_stdio(&thr->thr); +out: + darray_for_each(devs, i) + kfree(*i); + darray_exit(&devs); + return ret; +err: + if (thr) + bch2_fsck_thread_exit(&thr->thr); + pr_err("ret %s", bch2_err_str(ret)); + goto out; +} + +static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) +{ + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); + struct bch_fs *c = thr->c; + + c->stdio_filter = current; + c->stdio = &thr->thr.stdio; + + /* + * XXX: can we figure out a way to do this without mucking with c->opts? 
+ */ + unsigned old_fix_errors = c->opts.fix_errors; + if (opt_defined(thr->opts, fix_errors)) + c->opts.fix_errors = thr->opts.fix_errors; + else + c->opts.fix_errors = FSCK_FIX_ask; + + c->opts.fsck = true; + set_bit(BCH_FS_fsck_running, &c->flags); + + c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + int ret = bch2_run_online_recovery_passes(c); + + clear_bit(BCH_FS_fsck_running, &c->flags); + bch_err_fn(c, ret); + + c->stdio = NULL; + c->stdio_filter = NULL; + c->opts.fix_errors = old_fix_errors; + + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + return ret; +} + +static const struct thread_with_stdio_ops bch2_online_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_online_thread_fn, +}; + +long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) +{ + struct fsck_thread *thr = NULL; + long ret = 0; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!bch2_ro_ref_tryget(c)) + return -EROFS; + + if (down_trylock(&c->online_fsck_mutex)) { + bch2_ro_ref_put(c); + return -EAGAIN; + } + + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; + } + + thr->c = c; + thr->opts = bch2_opts_empty(); + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); + if (!IS_ERR(optstr)) + kfree(optstr); + + if (ret) + goto err; + } + + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); +err: + if (ret < 0) { + bch_err_fn(c, ret); + if (thr) + bch2_fsck_thread_exit(&thr->thr); + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + } + return ret; +} diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 1cca31011530..4481b40a881d 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -14,4 +14,7 @@ int bch2_check_directory_structure(struct bch_fs *); int bch2_check_nlinks(struct bch_fs *); int bch2_fix_reflink_p(struct bch_fs *); +long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *); +long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online); + #endif /* _BCACHEFS_FSCK_H */ -- cgit v1.2.3 From 394033dcc976d1f83f0fc6e7d4dd041ce376d245 Mon Sep 17 00:00:00 2001 From: Integral Date: Wed, 23 Oct 2024 18:00:33 +0800 Subject: bcachefs: add support for true/false & yes/no in bool-type options Here is the patch which uses existing constant table: Currently, when using bcachefs-tools to set options, bool-type options can only accept 1 or 0. Add support for accepting true/false and yes/no for these options. 
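A minimal standalone sketch of the table-driven lookup this builds on; the in-kernel version is lookup_constant() with the bool_names[] table exported from fs/fs_parser.c, so the copies below are illustrative only:

```c
#include <stdio.h>
#include <string.h>

struct constant_table { const char *name; int value; };

/* userspace copy of the accepted spellings; the kernel table lives in fs/fs_parser.c */
static const struct constant_table bool_names[] = {
	{ "0", 0 }, { "1", 1 },
	{ "false", 0 }, { "true", 1 },
	{ "no", 0 }, { "yes", 1 },
	{ NULL, 0 },
};

/* same contract as the kernel helper: return the value, or not_found */
static int lookup_constant(const struct constant_table *tbl,
			   const char *name, int not_found)
{
	for (; tbl->name; tbl++)
		if (!strcmp(tbl->name, name))
			return tbl->value;
	return not_found;
}

int main(void)
{
	const char *vals[] = { "yes", "true", "0", "maybe" };

	for (int i = 0; i < 4; i++)
		printf("%-5s -> %d\n", vals[i],
		       lookup_constant(bool_names, vals[i], -1));
	return 0;
}
```

In the patch itself the not_found value is -BCH_ERR_option_not_bool, which bch2_opt_parse() then reports as "must be bool".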
Signed-off-by: Integral Signed-off-by: Kent Overstreet Acked-by: David Howells --- fs/bcachefs/opts.c | 16 +++++++++------- fs/fs_parser.c | 3 ++- include/linux/fs_parser.h | 2 ++ 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 49c59aec6954..0ba58d74c21f 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include "bcachefs.h" #include "compress.h" @@ -334,17 +335,18 @@ int bch2_opt_parse(struct bch_fs *c, switch (opt->type) { case BCH_OPT_BOOL: if (val) { - ret = kstrtou64(val, 10, res); + ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); + if (ret != -BCH_ERR_option_not_bool) { + *res = ret; + } else { + if (err) + prt_printf(err, "%s: must be bool", opt->attr.name); + return ret; + } } else { - ret = 0; *res = 1; } - if (ret < 0 || (*res != 0 && *res != 1)) { - if (err) - prt_printf(err, "%s: must be bool", opt->attr.name); - return ret < 0 ? ret : -BCH_ERR_option_not_bool; - } break; case BCH_OPT_UINT: if (!val) { diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 16fa61ef56bf..e635a81e17d9 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -13,7 +13,7 @@ #include #include "internal.h" -static const struct constant_table bool_names[] = { +const struct constant_table bool_names[] = { { "0", false }, { "1", true }, { "false", false }, @@ -22,6 +22,7 @@ static const struct constant_table bool_names[] = { { "yes", true }, { }, }; +EXPORT_SYMBOL(bool_names); static const struct constant_table * __lookup_constant(const struct constant_table *tbl, const char *name) diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 3cef566088fc..53e566efd5fd 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -84,6 +84,8 @@ extern int fs_lookup_param(struct fs_context *fc, extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found); +extern const struct constant_table bool_names[]; + #ifdef CONFIG_VALIDATE_FS_PARSER extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, int low, int high, int special); -- cgit v1.2.3 From 4fa5d8e166e1e64c4cebedda53c523f0b80ed69b Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Wed, 16 Oct 2024 09:50:26 +0800 Subject: bcachefs: Correct the description of the '--bucket=size' options Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 39cdc185fa73..6b29339ea725 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -501,7 +501,7 @@ enum fsck_err_opts { OPT_DEVICE, \ OPT_UINT(0, S64_MAX), \ BCH2_NO_SB_OPT, 0, \ - "size", "Size of filesystem on device") \ + "size", "Specifies the bucket size; must be greater than the btree node size")\ x(durability, u8, \ OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \ OPT_UINT(0, BCH_REPLICAS_MAX), \ -- cgit v1.2.3 From 4f1a6b0ab4ef9ccd0b5940efcba32b34a5c2da08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Jul 2024 09:11:33 +0800 Subject: bcachefs: Add support for FS_IOC_GETFSUUID Use super_set_uuid() to set `sb->s_uuid_len` to avoid returning `-ENOTTY` with sb->s_uuid_len being 0. 
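A quick userspace check in the same spirit as the FS_IOC_GETFSSYSFSPATH test further below; a rough sketch, assuming kernel headers new enough to provide struct fsuuid2 and FS_IOC_GETFSUUID in <linux/fs.h>:

```c
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char *argv[])
{
	struct fsuuid2 uuid = {};
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
		exit(EXIT_FAILURE);
	}

	fd = open(argv[1], O_RDONLY);
	if (fd == -1) {
		perror("open");
		exit(EXIT_FAILURE);
	}

	/* fails with ENOTTY if the filesystem never set sb->s_uuid_len */
	if (ioctl(fd, FS_IOC_GETFSUUID, &uuid) == -1) {
		perror("ioctl FS_IOC_GETFSUUID");
		close(fd);
		exit(EXIT_FAILURE);
	}

	for (int i = 0; i < uuid.len; i++)
		printf("%02x", uuid.uuid[i]);
	printf("\n");

	close(fd);
	return 0;
}
```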
Original patch link: [1]: https://lore.kernel.org/all/20240207025624.1019754-2-kent.overstreet@linux.dev/ Signed-off-by: Kent Overstreet Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 91fce04272a1..396a8f677621 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2216,7 +2216,7 @@ got_sb: sb->s_time_gran = c->sb.nsec_per_time_unit; sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); - sb->s_uuid = c->sb.user_uuid; + super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); -- cgit v1.2.3 From dc003efbc7769f22919e1d7d924bf8fc4d2ff841 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Jul 2024 09:11:34 +0800 Subject: bcachefs: Add support for FS_IOC_GETFSSYSFSPATH [TEST]: ``` $ cat ioctl_getsysfspath.c #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <unistd.h> #include <sys/ioctl.h> #include <linux/fs.h> int main(int argc, char *argv[]) { int fd; struct fs_sysfs_path sysfs_path = {}; if (argc != 2) { fprintf(stderr, "Usage: %s \n", argv[0]); exit(EXIT_FAILURE); } fd = open(argv[1], O_RDONLY); if (fd == -1) { perror("open"); exit(EXIT_FAILURE); } if (ioctl(fd, FS_IOC_GETFSSYSFSPATH, &sysfs_path) == -1) { perror("ioctl FS_IOC_GETFSSYSFSPATH"); close(fd); exit(EXIT_FAILURE); } printf("FS_IOC_GETFSSYSFSPATH: %s\n", sysfs_path.name); close(fd); return 0; } $ gcc ioctl_getsysfspath.c $ sudo bcachefs format /dev/sda $ sudo mount.bcachefs /dev/sda /mnt $ sudo ./a.out /mnt FS_IOC_GETFSSYSFSPATH: bcachefs/c380b4ab-fbb6-41d2-b805-7a89cae9cadb ``` Original patch link: [1]: https://lore.kernel.org/all/20240207025624.1019754-8-kent.overstreet@linux.dev/ Signed-off-by: Kent Overstreet Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 396a8f677621..7a269dbcf44b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2217,6 +2217,7 @@ got_sb: sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); + super_set_sysfs_name_uuid(sb); sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); -- cgit v1.2.3 From 5abd7ac19ddac6dd7e9ef872e92b76e200d99531 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Tue, 24 Sep 2024 10:53:50 +0800 Subject: bcachefs: Removes NULL pointer checks for __filemap_get_folio return values The __filemap_get_folio() return value cannot be NULL, so the unnecessary checks are removed.
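The resulting call pattern, as a rough fragment rather than a complete function (the error code substituted on failure varies by caller, as the hunks below show):

```c
folio = __filemap_get_folio(mapping, index, FGP_LOCK|FGP_CREAT, GFP_KERNEL);
if (IS_ERR(folio)) {		/* an ERR_PTR, never NULL */
	ret = PTR_ERR(folio);	/* some callers substitute e.g. -ENOMEM */
	goto err;
}
```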
Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 2 +- fs/bcachefs/fs-io-pagecache.c | 2 +- fs/bcachefs/fs-io.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 0923f38a2fcd..b853cecd3c1b 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -686,7 +686,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN | fgf_set_order(len), mapping_gfp_mask(mapping)); - if (IS_ERR_OR_NULL(folio)) + if (IS_ERR(folio)) goto err_unlock; offset = pos - folio_pos(folio); diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 51a499c5a7b6..e072900e6a5b 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -29,7 +29,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, break; f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); - if (IS_ERR_OR_NULL(f)) + if (IS_ERR(f)) break; BUG_ON(fs->nr && folio_pos(f) != pos); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 0021db191480..c6fdfec51082 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -256,7 +256,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, folio = __filemap_get_folio(mapping, index, FGP_LOCK|FGP_CREAT, GFP_KERNEL); - if (IS_ERR_OR_NULL(folio)) { + if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } -- cgit v1.2.3 From 924e81c530ccbe986fd01381b495d410f8a63805 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Fri, 27 Sep 2024 16:40:42 +0800 Subject: bcachefs: Remove redundant initialization in bch2_vfs_inode_init() `inode->v.i_ino` has been initialized to `inum.inum`. If `inum.inum` and `bi->bi_inum` are not equal, BUG_ON() is triggered in bch2_inode_update_after_write(). Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 7a269dbcf44b..f852dbf30aa2 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1752,7 +1752,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, bch2_inode_update_after_write(trans, inode, bi, ~0); inode->v.i_blocks = bi->bi_sectors; - inode->v.i_ino = bi->bi_inum; inode->v.i_rdev = bi->bi_dev; inode->v.i_generation = bi->bi_generation; inode->v.i_size = bi->bi_size; -- cgit v1.2.3 From 385d1a3c81fee16202b1c5a980653807a8c8dc95 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Wed, 16 Oct 2024 09:49:11 +0800 Subject: bcachefs: Simplify code in bch2_dev_alloc() - Remove unnecessary variable 'ret'. - Remove unnecessary bch2_dev_free() operations. 
Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7e0ff17a6dbb..ab678231afd4 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1369,7 +1369,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) { struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); struct bch_dev *ca = NULL; - int ret = 0; if (bch2_fs_init_fault("dev_alloc")) goto err; @@ -1381,10 +1380,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->fs = c; bch2_dev_attach(c, ca, dev_idx); - return ret; + return 0; err: - if (ca) - bch2_dev_free(ca); return -BCH_ERR_ENOMEM_dev_alloc; } -- cgit v1.2.3 From 5c3911ac94229893b820315d0fde175c07fa28a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Nov 2024 19:15:38 -0500 Subject: bcachefs: Don't use page allocator for sb_read_scratch Kill another unnecessary dependency on PAGE_SIZE Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 7 ++++--- fs/bcachefs/super-io.h | 2 ++ fs/bcachefs/super.c | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index c83bd3dedb1b..4c29f8215d54 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -892,14 +892,15 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) struct bch_sb *sb = ca->disk_sb.sb; struct bio *bio = ca->disk_sb.bio; + memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); bio->bi_end_io = write_super_endio; bio->bi_private = ca; - bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); + bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE); - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], - bio_sectors(bio)); + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); percpu_ref_get(&ca->io_ref); closure_bio_submit(bio, &c->sb_write); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index fadd364e2802..90e7b176cdd0 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -10,6 +10,8 @@ #include +#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096 + static inline bool bch2_version_compatible(u16 version) { return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ab678231afd4..6ab93db52eca 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1201,7 +1201,7 @@ static void bch2_dev_free(struct bch_dev *ca) free_percpu(ca->io_done); bch2_dev_buckets_free(ca); - free_page((unsigned long) ca->sb_read_scratch); + kfree(ca->sb_read_scratch); bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); @@ -1340,7 +1340,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || - !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || + !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || !(ca->io_done = alloc_percpu(*ca->io_done))) goto err; -- cgit v1.2.3 From d985e63dba24bcb0ede1e8975dc67b9c6a2b2c3b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Nov 2024 21:50:00 -0500 Subject: 
bcachefs: Fix shutdown message Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6ab93db52eca..37eee352fa21 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -290,7 +290,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_fs_journal_stop(&c->journal); - bch_info(c, "%sshutdown complete, journal seq %llu", + bch_info(c, "%sclean shutdown complete, journal seq %llu", test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", c->journal.seq_ondisk); -- cgit v1.2.3 From 1f282f1ee0f80e40e6fb2ecd2dabbfb24a6d9e1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Nov 2024 03:53:30 -0500 Subject: bcachefs: delete dead code Signed-off-by: Kent Overstreet --- fs/bcachefs/error.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 6551ada926b6..81af0b8ddb52 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -54,26 +54,6 @@ int bch2_topology_error(struct bch_fs *); _ret; \ }) -/* - * Later we might want to mark only the particular device inconsistent, not the - * entire filesystem: - */ - -#define bch2_dev_inconsistent(ca, ...) \ -do { \ - bch_err(ca, __VA_ARGS__); \ - bch2_inconsistent_error((ca)->fs); \ -} while (0) - -#define bch2_dev_inconsistent_on(cond, ca, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - \ - if (_ret) \ - bch2_dev_inconsistent(ca, __VA_ARGS__); \ - _ret; \ -}) - /* * When a transaction update discovers or is causing a fs inconsistency, it's * helpful to also dump the pending updates: -- cgit v1.2.3 From 7d1918b0d86dcf049fac0d548509459e0e0300da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Oct 2024 23:35:03 -0400 Subject: bcachefs: bch2_btree_bit_mod_iter() factor out a new helper, make it handle extents bitset btrees (freespace). Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 57 ++++++++---------------------------------- fs/bcachefs/btree_update.c | 37 +++++++++++++-------------- fs/bcachefs/btree_update.h | 3 +-- 3 files changed, 29 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index af791f4dab99..a1bd75a44d79 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -671,44 +671,31 @@ static int bch2_bucket_do_index(struct btree_trans *trans, bool set) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c old; - struct bkey_i *k; enum btree_id btree; + struct bpos pos; enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; - enum bch_bkey_type new_type = set ? 
KEY_TYPE_set : KEY_TYPE_deleted; struct printbuf buf = PRINTBUF; - int ret; if (a->data_type != BCH_DATA_free && a->data_type != BCH_DATA_need_discard) return 0; - k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.type = new_type; - switch (a->data_type) { case BCH_DATA_free: btree = BTREE_ID_freespace; - k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); - bch2_key_resize(&k->k, 1); + pos = alloc_freespace_pos(alloc_k.k->p, *a); break; case BCH_DATA_need_discard: btree = BTREE_ID_need_discard; - k->k.p = alloc_k.k->p; + pos = alloc_k.k->p; break; default: return 0; } - old = bch2_bkey_get_iter(trans, &iter, btree, - bkey_start_pos(&k->k), - BTREE_ITER_intent); - ret = bkey_err(old); + struct btree_iter iter; + struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent); + int ret = bkey_err(old); if (ret) return ret; @@ -728,7 +715,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans, goto err; } - ret = bch2_trans_update(trans, &iter, k, 0); + ret = bch2_btree_bit_mod_iter(trans, &iter, set); err: bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); @@ -1163,18 +1150,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, bch2_bkey_types[k.k->type], bch2_bkey_types[discard_key_type], (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i *update = - bch2_trans_kmalloc(trans, sizeof(*update)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_init(&update->k); - update->k.type = discard_key_type; - update->k.p = discard_iter->pos; - - ret = bch2_trans_update(trans, discard_iter, update, 0); + ret = bch2_btree_bit_mod_iter(trans, discard_iter, !!discard_key_type); if (ret) goto err; } @@ -1194,19 +1170,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, bch2_bkey_types[freespace_key_type], (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i *update = - bch2_trans_kmalloc(trans, sizeof(*update)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_init(&update->k); - update->k.type = freespace_key_type; - update->k.p = freespace_iter->pos; - bch2_key_resize(&update->k, 1); - - ret = bch2_trans_update(trans, freespace_iter, update, 0); + ret = bch2_btree_bit_mod_iter(trans, freespace_iter, !!freespace_key_type); if (ret) goto err; } @@ -1420,8 +1384,7 @@ fsck_err: printbuf_exit(&buf); return ret; delete: - ret = bch2_btree_delete_extent_at(trans, iter, - iter->btree_id == BTREE_ID_freespace ? 
1 : 0, 0) ?: + ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); goto out; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index f3d7ca3d92b9..06fd5aa62296 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -669,27 +669,19 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, bch2_btree_insert_trans(trans, id, k, iter_flags)); } -int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, - unsigned len, unsigned update_flags) +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned update_flags) { - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + int ret = PTR_ERR_OR_ZERO(k); + if (ret) + return ret; bkey_init(&k->k); k->k.p = iter->pos; - bch2_key_resize(&k->k, len); return bch2_trans_update(trans, iter, k, update_flags); } -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned update_flags) -{ - return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); -} - int bch2_btree_delete(struct btree_trans *trans, enum btree_id btree, struct bpos pos, unsigned update_flags) @@ -791,8 +783,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, return ret; } -int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) +int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set) { struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); int ret = PTR_ERR_OR_ZERO(k); @@ -801,13 +792,21 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, bkey_init(&k->k); k->k.type = set ? 
KEY_TYPE_set : KEY_TYPE_deleted; - k->k.p = pos; + k->k.p = iter->pos; + if (iter->flags & BTREE_ITER_is_extents) + bch2_key_resize(&k->k, 1); + + return bch2_trans_update(trans, iter, k, 0); +} +int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) +{ struct btree_iter iter; bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, 0); + int ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_bit_mod_iter(trans, &iter, set); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 3bc57d43aa83..58df20194306 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -46,8 +46,6 @@ enum bch_trans_commit_flags { void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); -int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, - unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); @@ -65,6 +63,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); +int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool); int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); -- cgit v1.2.3 From b6269cd0ec0a55f256d6f79a6e70dd7177050150 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Oct 2024 22:52:06 -0400 Subject: bcachefs: Delete dead code from bch2_discard_one_bucket() alloc key validation ensures that if a bucket is in need_discard state the sector counts are all zero - we don't have to check for that. The NEED_INC_GEN check appears to be dead code, as well: we only see buckets in the need_discard btree, and it's an error if they aren't in the need_discard state. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a1bd75a44d79..38df36f8e70a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1756,22 +1756,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (ret) goto out; - if (bch2_bucket_sectors_total(a->v)) { - if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, - trans, "attempting to discard bucket with dirty data\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = -EIO; - goto out; - } - if (a->v.data_type != BCH_DATA_need_discard) { - if (data_type_is_empty(a->v.data_type) && - BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { - a->v.gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); - goto write; - } - if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, trans, "bucket incorrectly set in need_discard btree\n" "%s", @@ -1814,7 +1799,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); -write: alloc_data_type_set(&a->v, a->v.data_type); ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: -- cgit v1.2.3 From 79c5e3c7934cb1358ae403d8a9e4bf84c2195581 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 29 Oct 2024 01:17:08 -0400 Subject: bcachefs: lru errors are expected when reconstructing alloc Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 431698189090..7086a7226989 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -113,6 +113,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); -- cgit v1.2.3 From eb73e7773fd665c6b2f26e8b5057197ad6794f50 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Oct 2024 21:27:23 -0400 Subject: bcachefs: Kill FSCK_NEED_FSCK If we find an error that indicates that we need to run fsck, we can specify that directly with run_explicit_recovery_pass(). These are now log_fsck_err() calls: we're just logging in the superblock that an error occurred - and possibly doing an emergency shutdown, depending on policy. 
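The diff below applies one conversion pattern throughout; schematically (identifiers as they appear in the hunks, not a compilable unit):

```c
/* before: mark the error and rely on FSCK_NEED_FSCK */
bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
	      ptr_too_stale, "...");

/* after: request the recovery pass explicitly, then just log */
bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
log_fsck_err(trans, ptr_too_stale, "...");
```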
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 18 ++++++------------ fs/bcachefs/buckets.c | 29 +++++++++++++++++------------ fs/bcachefs/error.c | 21 +++++++++++++-------- fs/bcachefs/error.h | 12 ++++++------ fs/bcachefs/sb-errors_format.h | 5 ++--- 5 files changed, 44 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c11babe31f54..faa2816e02a0 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -62,7 +62,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (!bpos_eq(b->data->min_key, POS_MIN)) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->min_key); - need_fsck_err(trans, btree_root_bad_min_key, + log_fsck_err(trans, btree_root_bad_min_key, "btree root with incorrect min_key: %s", buf.buf); goto topology_repair; } @@ -70,7 +70,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (!bpos_eq(b->data->max_key, SPOS_MAX)) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->max_key); - need_fsck_err(trans, btree_root_bad_max_key, + log_fsck_err(trans, btree_root_bad_max_key, "btree root with incorrect max_key: %s", buf.buf); goto topology_repair; } @@ -106,7 +106,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) prt_str(&buf, "\n next "); bch2_bkey_val_to_text(&buf, c, k); - need_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); + log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); goto topology_repair; } @@ -123,7 +123,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); + log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); goto topology_repair; } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { bch2_topology_error(c); @@ -136,7 +136,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) prt_str(&buf, "\n last key "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - need_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); + log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); goto topology_repair; } out: @@ -146,13 +146,7 @@ fsck_err: printbuf_exit(&buf); return ret; topology_repair: - if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) { - bch2_inconsistent_error(c); - ret = -BCH_ERR_btree_need_topology_repair; - } else { - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - } + ret = bch2_topology_error(c); goto out; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c4123fa4f250..5b42f0a7b0cb 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -20,6 +20,7 @@ #include "movinggc.h" #include "rebalance.h" #include "recovery.h" +#include "recovery_passes.h" #include "reflink.h" #include "replicas.h" #include "subvolume.h" @@ -402,8 +403,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, BUG_ON(!sectors); if (gen_after(ptr->gen, b_gen)) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - ptr_gen_newer_than_bucket_gen, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, ptr_gen_newer_than_bucket_gen, 
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -416,8 +417,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - ptr_too_stale, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, ptr_too_stale, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -436,8 +437,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (b_gen != ptr->gen) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - stale_dirty_ptr, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, stale_dirty_ptr, "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -452,8 +453,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - ptr_bucket_data_type_mismatch, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -467,8 +468,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if ((u64) *bucket_sectors + sectors > U32_MAX) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - bucket_sector_count_overflow, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, bucket_sector_count_overflow, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -486,7 +487,9 @@ out: printbuf_exit(&buf); return ret; err: +fsck_err: bch2_dump_trans_updates(trans); + bch2_inconsistent_error(c); ret = -BCH_ERR_bucket_ref_update; goto out; } @@ -952,6 +955,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, enum bch_data_type type, unsigned sectors) { + struct bch_fs *c = trans->c; struct btree_iter iter; int ret = 0; @@ -961,8 +965,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return PTR_ERR(a); if (a->v.data_type && type && a->v.data_type != type) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - bucket_metadata_type_mismatch, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, bucket_metadata_type_mismatch, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", iter.pos.inode, iter.pos.offset, a->v.gen, @@ -980,6 +984,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ret = bch2_trans_update(trans, &iter, &a->k_i, 0); } err: +fsck_err: bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index b679def8fb98..22b0fa405a39 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -385,9 +385,7 @@ int __bch2_fsck_err(struct bch_fs *c, prt_str(out, ", not "); prt_actioning(out, action); } - } else if (flags & FSCK_NEED_FSCK) { - prt_str(out, " (run fsck to correct)"); - } else { + } else if (!(flags & FSCK_CAN_IGNORE)) { prt_str(out, " (repair unimplemented)"); } @@ -424,11 +422,18 
@@ int __bch2_fsck_err(struct bch_fs *c, if (inconsistent) bch2_inconsistent_error(c); - if (ret == -BCH_ERR_fsck_fix) { - set_bit(BCH_FS_errors_fixed, &c->flags); - } else { - set_bit(BCH_FS_errors_not_fixed, &c->flags); - set_bit(BCH_FS_error, &c->flags); + /* + * We don't yet track whether the filesystem currently has errors, for + * log_fsck_err()s: that would require us to track for every error type + * which recovery pass corrects it, to get the fsck exit status correct: + */ + if (flags & FSCK_CAN_FIX) { + if (ret == -BCH_ERR_fsck_fix) { + set_bit(BCH_FS_errors_fixed, &c->flags); + } else { + set_bit(BCH_FS_errors_not_fixed, &c->flags); + set_bit(BCH_FS_error, &c->flags); + } } err: if (action != action_orig) diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 81af0b8ddb52..24c41a9994df 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -129,12 +129,6 @@ void bch2_flush_fsck_errs(struct bch_fs *); (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\ }) -#define need_fsck_err_on(cond, c, _err_type, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) - -#define need_fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) - #define mustfix_fsck_err(c, _err_type, ...) \ __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) @@ -147,6 +141,12 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define fsck_err_on(cond, c, _err_type, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) +#define log_fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) + +#define log_fsck_err_on(cond, c, _err_type, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) + enum bch_validate_flags; __printf(5, 6) int __bch2_bkey_fsck_err(struct bch_fs *, diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 9feb6739f77a..f2b38493356d 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -5,9 +5,8 @@ enum bch_fsck_flags { FSCK_CAN_FIX = 1 << 0, FSCK_CAN_IGNORE = 1 << 1, - FSCK_NEED_FSCK = 1 << 2, - FSCK_NO_RATELIMIT = 1 << 3, - FSCK_AUTOFIX = 1 << 4, + FSCK_NO_RATELIMIT = 1 << 2, + FSCK_AUTOFIX = 1 << 3, }; #define BCH_SB_ERRS() \ -- cgit v1.2.3 From 61f854da4cba9708dd54f45b0e19ded6ffd10d01 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Oct 2024 23:43:16 -0400 Subject: bcachefs: Reserve 8 bits in bch_reflink_p Better repair for reflink pointers, as well as propagating new inode options to indirect extents, are going to require a few extra bits bch_reflink_p: so claim a few from the high end of the destination index. Also add some missing bounds checking. 
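A standalone illustration of the resulting layout: the low 56 bits of idx_flags carry the destination index, leaving the top 8 bits free for future flags. The kernel encodes this with LE64_BITMASK() on a __le64 field; this plain uint64_t sketch ignores endianness and is illustrative only:

```c
#include <stdio.h>
#include <stdint.h>

#define REFLINK_P_IDX_BITS	56
#define REFLINK_P_IDX_MASK	((1ULL << REFLINK_P_IDX_BITS) - 1)

static uint64_t reflink_p_idx(uint64_t idx_flags)
{
	return idx_flags & REFLINK_P_IDX_MASK;
}

static uint64_t set_reflink_p_idx(uint64_t idx_flags, uint64_t idx)
{
	return (idx_flags & ~REFLINK_P_IDX_MASK) | (idx & REFLINK_P_IDX_MASK);
}

int main(void)
{
	uint64_t v = 0xff00000000000000ULL;	/* pretend some flag bits are set */

	v = set_reflink_p_idx(v, 123456789);
	printf("idx %llu, flag byte 0x%02llx\n",
	       (unsigned long long)reflink_p_idx(v),
	       (unsigned long long)(v >> REFLINK_P_IDX_BITS));
	return 0;
}
```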
Signed-off-by: Kent Overstreet --- fs/bcachefs/extent_update.c | 2 +- fs/bcachefs/extents.c | 2 +- fs/bcachefs/io_read.c | 14 +++++--------- fs/bcachefs/reflink.c | 45 ++++++++++++++++++++++++++++++-------------- fs/bcachefs/reflink_format.h | 4 +++- 5 files changed, 41 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 45c87c019f6b..6aac579a692a 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -64,7 +64,7 @@ static int count_iters_for_insert(struct btree_trans *trans, break; case KEY_TYPE_reflink_p: { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx = le64_to_cpu(p.v->idx); + u64 idx = REFLINK_P_IDX(p.v); unsigned sectors = bpos_min(*end, p.k->p).offset - bkey_start_offset(p.k); struct btree_iter iter; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index bc7cfdb66687..98bb680b3860 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1495,7 +1495,7 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) case KEY_TYPE_reflink_p: { struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); - le64_add_cpu(&p.v->idx, sub); + SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub); break; } case KEY_TYPE_inline_data: diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index cbc3cc1f6d03..c700a95df89e 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -754,17 +754,13 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, unsigned *offset_into_extent, struct bkey_buf *orig_k) { - struct btree_iter iter; - struct bkey_s_c k; - u64 reflink_offset; - int ret; + struct bkey_i_reflink_p *p = bkey_i_to_reflink_p(orig_k->k); + u64 reflink_offset = REFLINK_P_IDX(&p->v) + *offset_into_extent; - reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + - *offset_into_extent; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, POS(0, reflink_offset), 0); - ret = bkey_err(k); + int ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 96cf50f4705d..addaf5f74624 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -35,10 +35,10 @@ int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; - bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad), + bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad), c, reflink_p_front_pad_bad, "idx < front_pad (%llu < %u)", - le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); + REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad)); fsck_err: return ret; } @@ -49,7 +49,7 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); prt_printf(out, "idx %llu front_pad %u back_pad %u", - le64_to_cpu(p.v->idx), + REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad), le32_to_cpu(p.v->back_pad)); } @@ -65,7 +65,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r */ return false; - if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) + if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v)) return false; bch2_key_resize(l.k, l.k->size + r.k->size); @@ -115,12 +115,12 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, u64 pad; pad = max_t(s64, le32_to_cpu(v->front_pad), - le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); 
+ REFLINK_P_IDX(v) - bkey_start_offset(&k->k)); BUG_ON(pad > U32_MAX); v->front_pad = cpu_to_le32(pad); pad = max_t(s64, le32_to_cpu(v->back_pad), - k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); + k->k.p.offset - p.k->size - REFLINK_P_IDX(v)); BUG_ON(pad > U32_MAX); v->back_pad = cpu_to_le32(pad); } @@ -147,8 +147,8 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, struct bch_fs *c = trans->c; struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1; - u64 start = le64_to_cpu(p.v->idx); - u64 end = le64_to_cpu(p.v->idx) + p.k->size; + u64 start = REFLINK_P_IDX(p.v); + u64 end = start + p.k->size; u64 next_idx = end + le32_to_cpu(p.v->back_pad); s64 ret = 0; struct printbuf buf = PRINTBUF; @@ -210,8 +210,8 @@ static int __trigger_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; - u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); - u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); + u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); + u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); if (flags & BTREE_TRIGGER_transactional) { while (idx < end && !ret) @@ -258,7 +258,16 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { - return bch2_bkey_ptrs_validate(c, k, flags); + int ret = 0; + + bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)), + c, reflink_v_pos_bad, + "indirect extent above maximum position 0:%llu", + REFLINK_P_IDX_MAX); + + ret = bch2_bkey_ptrs_validate(c, k, flags); +fsck_err: + return ret; } void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, @@ -358,6 +367,14 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, if (ret) goto err; + /* + * XXX: we're assuming that 56 bits will be enough for the life of the + * filesystem: we need to implement wraparound, with a cursor in the + * logged ops btree: + */ + if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size))) + return -ENOSPC; + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); if (ret) @@ -394,7 +411,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, memset(&r_p->v, 0, sizeof(r_p->v)); #endif - r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k)); ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, BTREE_UPDATE_internal_snapshot_node); @@ -533,11 +550,11 @@ s64 bch2_remap_range(struct bch_fs *c, struct bkey_i_reflink_p *dst_p = bkey_reflink_p_init(new_dst.k); - u64 offset = le64_to_cpu(src_p.v->idx) + + u64 offset = REFLINK_P_IDX(src_p.v) + (src_want.offset - bkey_start_offset(src_k.k)); - dst_p->v.idx = cpu_to_le64(offset); + SET_REFLINK_P_IDX(&dst_p->v, offset); } else { BUG(); } diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h index 6772eebb1fc6..0d8de13b9ddf 100644 --- a/fs/bcachefs/reflink_format.h +++ b/fs/bcachefs/reflink_format.h @@ -4,7 +4,7 @@ struct bch_reflink_p { struct bch_val v; - __le64 idx; + __le64 idx_flags; /* * A reflink pointer might point to an indirect extent which is then * later split (by copygc or rebalance). 
If we only pointed to part of @@ -17,6 +17,8 @@ struct bch_reflink_p { __le32 back_pad; } __packed __aligned(8); +LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56); + struct bch_reflink_v { struct bch_val v; __le64 refcount; -- cgit v1.2.3 From 3d338378d76391bb98a18e3cfb154a539866d0cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2024 01:25:09 -0400 Subject: bcachefs: Reorganize reflink.c a bit Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 92 --------------------- fs/bcachefs/reflink.c | 214 ++++++++++++++++++++++++++++++++++++------------- fs/bcachefs/reflink.h | 3 + 3 files changed, 160 insertions(+), 149 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e45cf32a6403..2e8cfc4d3265 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -937,98 +937,6 @@ static int bch2_gc_alloc_start(struct bch_fs *c) return ret; } -static int bch2_gc_write_reflink_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - size_t *idx) -{ - struct bch_fs *c = trans->c; - const __le64 *refcount = bkey_refcount_c(k); - struct printbuf buf = PRINTBUF; - struct reflink_gc *r; - int ret = 0; - - if (!refcount) - return 0; - - while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && - r->offset < k.k->p.offset) - ++*idx; - - if (!r || - r->offset != k.k->p.offset || - r->size != k.k->size) { - bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); - return -EINVAL; - } - - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), - trans, reflink_v_refcount_wrong, - "reflink key has wrong refcount:\n" - " %s\n" - " should be %u", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf), - r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - if (!r->refcount) - new->k.type = KEY_TYPE_deleted; - else - *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); - ret = bch2_trans_update(trans, iter, new, 0); - } -out: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int bch2_gc_reflink_done(struct bch_fs *c) -{ - size_t idx = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_gc_write_reflink_key(trans, &iter, k, &idx))); - c->reflink_gc_nr = 0; - return ret; -} - -static int bch2_gc_reflink_start(struct bch_fs *c) -{ - c->reflink_gc_nr = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch, k, ({ - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - continue; - - struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, - c->reflink_gc_nr++, GFP_KERNEL); - if (!r) { - ret = -BCH_ERR_ENOMEM_gc_reflink_start; - break; - } - - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - 0; - }))); - - bch_err_fn(c, ret); - return ret; -} - static int bch2_gc_write_stripes_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index addaf5f74624..36fb1e9473ff 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -72,6 +72,66 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r return true; } +/* indirect extents */ + +int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) +{ + int ret = 0; + + 
bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)), + c, reflink_v_pos_bad, + "indirect extent above maximum position 0:%llu", + REFLINK_P_IDX_MAX); + + ret = bch2_bkey_ptrs_validate(c, k, flags); +fsck_err: + return ret; +} + +void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); + + bch2_bkey_ptrs_to_text(out, c, k); +} + +#if 0 +Currently disabled, needs to be debugged: + +bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); + + return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); +} +#endif + +/* indirect inline data */ + +int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) +{ + return 0; +} + +void bch2_indirect_inline_data_to_text(struct printbuf *out, + struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); + + prt_printf(out, "refcount %llu datalen %u: %*phN", + le64_to_cpu(d.v->refcount), datalen, + min(datalen, 32U), d.v->data); +} + +/* reflink pointer trigger */ + static int trans_trigger_reflink_p_segment(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 *idx, enum btree_iter_update_trigger_flags flags) @@ -253,44 +313,7 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags); } -/* indirect extents */ - -int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)), - c, reflink_v_pos_bad, - "indirect extent above maximum position 0:%llu", - REFLINK_P_IDX_MAX); - - ret = bch2_bkey_ptrs_validate(c, k, flags); -fsck_err: - return ret; -} - -void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); - - bch2_bkey_ptrs_to_text(out, c, k); -} - -#if 0 -Currently disabled, needs to be debugged: - -bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -{ - struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); - - return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); -} -#endif +/* indirect extent trigger */ static inline void check_indirect_extent_deleting(struct bkey_s new, @@ -316,25 +339,6 @@ int bch2_trigger_reflink_v(struct btree_trans *trans, return bch2_trigger_extent(trans, btree_id, level, old, new, flags); } -/* indirect inline data */ - -int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) -{ - return 0; -} - -void bch2_indirect_inline_data_to_text(struct printbuf *out, - struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); - unsigned datalen = bkey_inline_data_bytes(k.k); - - prt_printf(out, "refcount %llu datalen %u: %*phN", - le64_to_cpu(d.v->refcount), datalen, - min(datalen, 32U), d.v->data); -} - int bch2_trigger_indirect_inline_data(struct btree_trans *trans, 
enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, @@ -345,6 +349,8 @@ int bch2_trigger_indirect_inline_data(struct btree_trans *trans, return 0; } +/* create */ + static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i *orig) @@ -608,3 +614,97 @@ err: return dst_done ?: ret ?: ret2; } + +/* fsck */ + +static int bch2_gc_write_reflink_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + size_t *idx) +{ + struct bch_fs *c = trans->c; + const __le64 *refcount = bkey_refcount_c(k); + struct printbuf buf = PRINTBUF; + struct reflink_gc *r; + int ret = 0; + + if (!refcount) + return 0; + + while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && + r->offset < k.k->p.offset) + ++*idx; + + if (!r || + r->offset != k.k->p.offset || + r->size != k.k->size) { + bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); + return -EINVAL; + } + + if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), + trans, reflink_v_refcount_wrong, + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + r->refcount)) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + if (!r->refcount) + new->k.type = KEY_TYPE_deleted; + else + *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); + ret = bch2_trans_update(trans, iter, new, 0); + } +out: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_gc_reflink_done(struct bch_fs *c) +{ + size_t idx = 0; + + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_gc_write_reflink_key(trans, &iter, k, &idx))); + c->reflink_gc_nr = 0; + return ret; +} + +int bch2_gc_reflink_start(struct bch_fs *c) +{ + c->reflink_gc_nr = 0; + + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch, k, ({ + const __le64 *refcount = bkey_refcount_c(k); + + if (!refcount) + continue; + + struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, + c->reflink_gc_nr++, GFP_KERNEL); + if (!r) { + ret = -BCH_ERR_ENOMEM_gc_reflink_start; + break; + } + + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; + 0; + }))); + + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 51afe11d8ed6..6ec3a9ea6bb4 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -76,4 +76,7 @@ static inline __le64 *bkey_refcount(struct bkey_s k) s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, subvol_inum, u64, u64, u64, s64 *); +int bch2_gc_reflink_done(struct bch_fs *); +int bch2_gc_reflink_start(struct bch_fs *); + #endif /* _BCACHEFS_REFLINK_H */ -- cgit v1.2.3 From 7579c85d9cc3b25c7969474e9662f2d1c1130ab8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 20:27:44 -0400 Subject: bcachefs: Don't delete reflink pointers to missing indirect extents To avoid tragic loss in the event of transient errors (i.e., a btree node topology error that was later corrected by btree node scan), we can't delete reflink pointers to correct errors. This adds a new error bit to bch_reflink_p, indicating that it is known to point to a missing indirect extent, and the error has already been reported. 
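(As a sketch of how the new bit is consumed: the flag shares the little-endian idx_flags word with the 56-bit index, and REFLINK_P_ERROR()/SET_REFLINK_P_ERROR() are the LE64_BITMASK accessors this patch adds for it. The helper names below come from the patch itself; the surrounding logic is only illustrative, not the exact code added further down.)

    u64 idx = REFLINK_P_IDX(p.v);        /* low 56 bits: index into the reflink btree */

    if (REFLINK_P_ERROR(p.v)) {
            /* bit 56: this pointer is already known to reference a
             * missing indirect extent, and the error was reported */
    }

    /* cleared again on a successful lookup, via a mutable copy of the key: */
    SET_REFLINK_P_ERROR(&new->v, false);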
Indirect extent lookups now use bch2_lookup_indirect_extent(), which on error reports it as a fsck_err() and sets the error bit, and clears it if necessary on succesful lookup. This also gets rid of the bch2_inconsistent_error() call in __bch2_read_indirect_extent, and in the reflink_p trigger: part of the online self healing project. An on disk format change isn't necessary here: setting the error bit will be interpreted by older versions as pointing to a different index, which will also be missing - which is fine. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 5 +- fs/bcachefs/fs.c | 8 +- fs/bcachefs/io_read.c | 45 +------- fs/bcachefs/io_read.h | 28 +++-- fs/bcachefs/reflink.c | 241 +++++++++++++++++++++++++++++++++---------- fs/bcachefs/reflink.h | 4 + fs/bcachefs/reflink_format.h | 1 + 7 files changed, 222 insertions(+), 110 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index b853cecd3c1b..d55e215e8aa6 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -164,7 +164,8 @@ static void bchfs_read(struct btree_trans *trans, BTREE_ITER_slots); while (1) { struct bkey_s_c k; - unsigned bytes, sectors, offset_into_extent; + unsigned bytes, sectors; + s64 offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -197,7 +198,7 @@ static void bchfs_read(struct btree_trans *trans, k = bkey_i_to_s_c(sk.k); - sectors = min(sectors, k.k->size - offset_into_extent); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); if (readpages_iter) { ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f852dbf30aa2..50d323fca001 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1261,7 +1261,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; - unsigned offset_into_extent, sectors; bool have_extent = false; int ret = 0; @@ -1308,9 +1307,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, continue; } - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; + s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + unsigned sectors = k.k->size - offset_into_extent; bch2_bkey_buf_reassemble(&cur, c, k); @@ -1322,7 +1320,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, k = bkey_i_to_s_c(cur.k); bch2_bkey_buf_realloc(&prev, c, k.k->u64s); - sectors = min(sectors, k.k->size - offset_into_extent); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); bch2_cut_front(POS(k.k->p.inode, bkey_start_offset(k.k) + diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index c700a95df89e..eb8d12fd6398 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -21,6 +21,7 @@ #include "io_read.h" #include "io_misc.h" #include "io_write.h" +#include "reflink.h" #include "subvolume.h" #include "trace.h" @@ -750,41 +751,6 @@ static void bch2_read_endio(struct bio *bio) bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } -int __bch2_read_indirect_extent(struct btree_trans *trans, - unsigned *offset_into_extent, - struct bkey_buf *orig_k) -{ - struct bkey_i_reflink_p *p = bkey_i_to_reflink_p(orig_k->k); - u64 reflink_offset = REFLINK_P_IDX(&p->v) + *offset_into_extent; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, 
&iter, BTREE_ID_reflink, - POS(0, reflink_offset), 0); - int ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_reflink_v && - k.k->type != KEY_TYPE_indirect_inline_data) { - bch_err_inum_offset_ratelimited(trans->c, - orig_k->k->k.p.inode, - orig_k->k->k.p.offset << 9, - "%llu len %u points to nonexistent indirect extent %llu", - orig_k->k->k.p.offset, - orig_k->k->k.size, - reflink_offset); - bch2_inconsistent_error(trans->c); - ret = -BCH_ERR_missing_indirect_extent; - goto err; - } - - *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - bch2_bkey_buf_reassemble(orig_k, trans->c, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, @@ -1160,7 +1126,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, BTREE_ITER_slots); while (1) { - unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -1180,9 +1145,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, if (ret) goto err; - offset_into_extent = iter.pos.offset - + s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; + unsigned sectors = k.k->size - offset_into_extent; bch2_bkey_buf_reassemble(&sk, c, k); @@ -1197,9 +1162,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: */ - sectors = min(sectors, k.k->size - offset_into_extent); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); if (bvec_iter.bi_size == bytes) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index d9c18bb7d403..a82e8a94ccb6 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -3,6 +3,7 @@ #define _BCACHEFS_IO_READ_H #include "bkey_buf.h" +#include "reflink.h" struct bch_read_bio { struct bch_fs *c; @@ -79,19 +80,32 @@ struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; -int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, - struct bkey_buf *); - static inline int bch2_read_indirect_extent(struct btree_trans *trans, enum btree_id *data_btree, - unsigned *offset_into_extent, - struct bkey_buf *k) + s64 *offset_into_extent, + struct bkey_buf *extent) { - if (k->k->k.type != KEY_TYPE_reflink_p) + if (extent->k->k.type != KEY_TYPE_reflink_p) return 0; *data_btree = BTREE_ID_reflink; - return __bch2_read_indirect_extent(trans, offset_into_extent, k); + struct btree_iter iter; + struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, + offset_into_extent, + bkey_i_to_s_c_reflink_p(extent->k), + true, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + if (bkey_deleted(k.k)) { + bch2_trans_iter_exit(trans, &iter); + return -BCH_ERR_missing_indirect_extent; + } + + bch2_bkey_buf_reassemble(extent, trans->c, k); + bch2_trans_iter_exit(trans, &iter); + return 0; } enum bch_read_flags { diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 36fb1e9473ff..38db5a011702 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -15,6 +15,17 @@ #include +static inline bool bkey_extent_is_reflink_data(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_reflink_v: + case 
KEY_TYPE_indirect_inline_data: + return true; + default: + return false; + } +} + static inline unsigned bkey_type_to_indirect(const struct bkey *k) { switch (k->type) { @@ -68,6 +79,9 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v)) return false; + if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v)) + return false; + bch2_key_resize(l.k, l.k->size + r.k->size); return true; } @@ -130,6 +144,144 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, min(datalen, 32U), d.v->data); } +/* lookup */ + +static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p, + bool should_commit) +{ + struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + SET_REFLINK_P_ERROR(&new->v, false); + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); + if (ret) + return ret; + + if (!should_commit) + return 0; + + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; +} + +static int bch2_indirect_extent_missing_error(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 missing_start, u64 missing_end, + bool should_commit) +{ + if (REFLINK_P_ERROR(p.v)) + return -BCH_ERR_missing_indirect_extent; + + struct bch_fs *c = trans->c; + u64 live_start = REFLINK_P_IDX(p.v); + u64 live_end = REFLINK_P_IDX(p.v) + p.k->size; + u64 refd_start = live_start - le32_to_cpu(p.v->front_pad); + u64 refd_end = live_end + le32_to_cpu(p.v->back_pad); + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(missing_start < refd_start); + BUG_ON(missing_end > refd_end); + + if (fsck_err(trans, reflink_p_to_missing_reflink_v, + "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), + missing_start, missing_end)) { + struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + /* + * Is the missing range not actually needed? + * + * p.v->idx refers to the data that we actually want, but if the + * indirect extent we point to was bigger, front_pad and back_pad + * indicate the range we took a reference on. 
+ */ + + if (missing_end <= live_start) { + new->v.front_pad = cpu_to_le32(live_start - missing_end); + } else if (missing_start >= live_end) { + new->v.back_pad = cpu_to_le32(missing_start - live_end); + } else { + struct bpos new_start = bkey_start_pos(&new->k); + struct bpos new_end = new->k.p; + + if (missing_start > live_start) + new_start.offset += missing_start - live_start; + if (missing_end < live_end) + new_end.offset -= live_end - missing_end; + + bch2_cut_front(new_start, &new->k_i); + bch2_cut_back(new_end, &new->k_i); + + SET_REFLINK_P_ERROR(&new->v, true); + } + + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); + if (ret) + goto err; + + if (should_commit) + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * This is used from the read path, which doesn't expect to have to do a + * transaction commit, and from triggers, which should not be doing a commit: + */ +struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, + struct btree_iter *iter, + s64 *offset_into_extent, + struct bkey_s_c_reflink_p p, + bool should_commit, + unsigned iter_flags) +{ + BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad))); + BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad)); + + u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent; + + struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink, + POS(0, reflink_offset), iter_flags); + if (bkey_err(k)) + return k; + + if (unlikely(!bkey_extent_is_reflink_data(k.k))) { + bch2_trans_iter_exit(trans, iter); + + unsigned size = min((u64) k.k->size, + REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - + reflink_offset); + bch2_key_resize(&iter->k, size); + + int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, + k.k->p.offset, should_commit); + if (ret) + return bkey_s_c_err(ret); + } else if (unlikely(REFLINK_P_ERROR(p.v))) { + bch2_trans_iter_exit(trans, iter); + + int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); + if (ret) + return bkey_s_c_err(ret); + } + + *offset_into_extent = reflink_offset - bkey_start_offset(k.k); + return k; +} + /* reflink pointer trigger */ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, @@ -137,37 +289,37 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i *k; - __le64 *refcount; - int add = !(flags & BTREE_TRIGGER_overwrite) ? 
1 : -1; struct printbuf buf = PRINTBUF; - int ret; - k = bch2_bkey_get_mut_noupdate(trans, &iter, - BTREE_ID_reflink, POS(0, *idx), - BTREE_ITER_with_updates); - ret = PTR_ERR_OR_ZERO(k); + s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v); + struct btree_iter iter; + struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false, + BTREE_ITER_intent| + BTREE_ITER_with_updates); + int ret = bkey_err(k); if (ret) - goto err; + return ret; - refcount = bkey_refcount(bkey_i_to_s(k)); - if (!refcount) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "nonexistent indirect extent at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; + if (bkey_deleted(k.k)) { + if (!(flags & BTREE_TRIGGER_overwrite)) + ret = -BCH_ERR_missing_indirect_extent; + goto next; } + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + __le64 *refcount = bkey_refcount(bkey_i_to_s(new)); if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "indirect extent refcount underflow at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + log_fsck_err(trans, reflink_refcount_underflow, + "indirect extent refcount underflow while marking\n %s", + buf.buf); + goto next; } if (flags & BTREE_TRIGGER_insert) { @@ -175,25 +327,26 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, u64 pad; pad = max_t(s64, le32_to_cpu(v->front_pad), - REFLINK_P_IDX(v) - bkey_start_offset(&k->k)); + REFLINK_P_IDX(v) - bkey_start_offset(&new->k)); BUG_ON(pad > U32_MAX); v->front_pad = cpu_to_le32(pad); pad = max_t(s64, le32_to_cpu(v->back_pad), - k->k.p.offset - p.k->size - REFLINK_P_IDX(v)); + new->k.p.offset - p.k->size - REFLINK_P_IDX(v)); BUG_ON(pad > U32_MAX); v->back_pad = cpu_to_le32(pad); } - le64_add_cpu(refcount, add); + le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1); bch2_btree_iter_set_pos_to_extent_start(&iter); - ret = bch2_trans_update(trans, &iter, k, 0); + ret = bch2_trans_update(trans, &iter, new, 0); if (ret) goto err; - - *idx = k->k.p.offset; +next: + *idx = k.k->p.offset; err: +fsck_err: bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; @@ -207,9 +360,7 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, struct bch_fs *c = trans->c; struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_overwrite) ? 
1 : -1; - u64 start = REFLINK_P_IDX(p.v); - u64 end = start + p.k->size; - u64 next_idx = end + le32_to_cpu(p.v->back_pad); + u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); s64 ret = 0; struct printbuf buf = PRINTBUF; @@ -228,36 +379,14 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, *idx = r->offset; return 0; not_found: - BUG_ON(!(flags & BTREE_TRIGGER_check_repair)); - - if (fsck_err(trans, reflink_p_to_missing_reflink_v, - "pointer to missing indirect extent\n" - " %s\n" - " missing range %llu-%llu", - (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), - *idx, next_idx)) { - struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c); - ret = PTR_ERR_OR_ZERO(update); + if (flags & BTREE_TRIGGER_check_repair) { + ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); if (ret) goto err; - - if (next_idx <= start) { - bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx); - } else if (*idx >= end) { - bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end); - } else { - bkey_error_init(update); - update->k.p = p.k->p; - update->k.size = p.k->size; - set_bkey_val_u64s(&update->k, 0); - } - - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_norun); } *idx = next_idx; err: -fsck_err: printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 6ec3a9ea6bb4..b61a4bdd8e82 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -73,6 +73,10 @@ static inline __le64 *bkey_refcount(struct bkey_s k) } } +struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *, + s64 *, struct bkey_s_c_reflink_p, + bool, unsigned); + s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, subvol_inum, u64, u64, u64, s64 *); diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h index 0d8de13b9ddf..53502627b2c5 100644 --- a/fs/bcachefs/reflink_format.h +++ b/fs/bcachefs/reflink_format.h @@ -18,6 +18,7 @@ struct bch_reflink_p { } __packed __aligned(8); LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56); +LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57); struct bch_reflink_v { struct bch_val v; -- cgit v1.2.3 From 724e49c6778c3bf899a49e5d24f9c0bb67451a71 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 27 Oct 2024 00:05:54 -0400 Subject: bcachefs: kill inconsistent err in invalidate_one_bucket() Change it to a normal fsck_err() - meaning it'll get repaired at runtime when that's flipped on. 
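(In essence the conversion follows the fsck_err() idiom: fsck_err() logs the inconsistency, evaluates to true when the caller is allowed to apply a repair, and jumps to the fsck_err: label on a fatal error. A minimal sketch using the same calls as the hunk below; the repair comment is only annotation:)

    if (fsck_err(trans, lru_entry_to_invalid_bucket,
                 "lru key points to nonexistent device:bucket %llu:%llu",
                 bucket.inode, bucket.offset))
            /* repair: drop the stale lru entry */
            return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru,
                                               lru_iter->pos, false);
    ...
fsck_err:
    printbuf_exit(&buf);
    return ret;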
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 38df36f8e70a..72ba7354adac 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1977,8 +1977,11 @@ static int invalidate_one_bucket(struct btree_trans *trans, return 1; if (!bch2_dev_bucket_exists(c, bucket)) { - prt_str(&buf, "lru entry points to invalid bucket"); - goto err; + if (fsck_err(trans, lru_entry_to_invalid_bucket, + "lru key points to nonexistent device:bucket %llu:%llu", + bucket.inode, bucket.offset)) + return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); + goto out; } if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) @@ -2019,28 +2022,9 @@ static int invalidate_one_bucket(struct btree_trans *trans, trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); --*nr_to_invalidate; out: +fsck_err: printbuf_exit(&buf); return ret; -err: - prt_str(&buf, "\n lru key: "); - bch2_bkey_val_to_text(&buf, c, lru_k); - - prt_str(&buf, "\n lru entry: "); - bch2_lru_pos_to_text(&buf, lru_iter->pos); - - prt_str(&buf, "\n alloc key: "); - if (!a) - bch2_bpos_to_text(&buf, bucket); - else - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); - - bch_err(c, "%s", buf.buf); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { - bch2_inconsistent_error(c); - ret = -EINVAL; - } - - goto out; } static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, -- cgit v1.2.3 From 731d06e138b07e1e54ac84a22054ca83ab5b1082 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 27 Oct 2024 20:47:03 -0400 Subject: bcachefs: rework bch2_bucket_alloc_freelist() freelist iteration Prep work for converting try_alloc_bucket() to use bch2_check_discard_freespace_key(). 
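(For reference, a freespace btree position packs the bucket number and the generation bits into a single 64-bit offset; the reworked iteration below decodes them straight from the iterator position rather than from a separately passed free_entry. A sketch using the same masks as the patch:)

    u64 bucket       = freespace_iter->pos.offset & ~(~0ULL << 56); /* low 56 bits  */
    unsigned genbits = freespace_iter->pos.offset >> 56;            /* high 8 bits  */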
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 59 +++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 372178c8d416..645d8a269142 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -276,9 +276,9 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * } static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, - enum bch_watermark watermark, u64 free_entry, + enum bch_watermark watermark, struct bucket_alloc_state *s, - struct bkey_s_c freespace_k, + struct btree_iter *freespace_iter, struct closure *cl) { struct bch_fs *c = trans->c; @@ -287,8 +287,8 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc struct open_bucket *ob; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; - u64 b = free_entry & ~(~0ULL << 56); - unsigned genbits = free_entry >> 56; + u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); + unsigned genbits = freespace_iter->pos.offset >> 56; struct printbuf buf = PRINTBUF; int ret; @@ -296,7 +296,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" " freespace key ", ca->mi.first_bucket, ca->mi.nbuckets); - bch2_bkey_val_to_text(&buf, c, freespace_k); + bch2_bkey_to_text(&buf, &freespace_iter->k); bch2_trans_inconsistent(trans, "%s", buf.buf); ob = ERR_PTR(-EIO); goto err; @@ -321,7 +321,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc prt_printf(&buf, "non free bucket in freespace btree\n" " freespace key "); - bch2_bkey_val_to_text(&buf, c, freespace_k); + bch2_bkey_to_text(&buf, &freespace_iter->k); prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); bch2_trans_inconsistent(trans, "%s", buf.buf); @@ -334,7 +334,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" " freespace key ", genbits, alloc_freespace_genbits(*a) >> 56); - bch2_bkey_val_to_text(&buf, c, freespace_k); + bch2_bkey_to_text(&buf, &freespace_iter->k); prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); bch2_trans_inconsistent(trans, "%s", buf.buf); @@ -492,17 +492,20 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, BUG_ON(ca->new_fs_bucket_idx); again: - for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, - POS(ca->dev_idx, alloc_cursor), 0, k, ret) { - if (k.k->p.inode != ca->dev_idx) - break; + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace, + POS(ca->dev_idx, alloc_cursor), + POS(ca->dev_idx, U64_MAX), + 0, k, ret) { + /* + * peek normally dosen't trim extents - they can span iter.pos, + * which is not what we want here: + */ + iter.k.size = iter.k.p.offset - iter.pos.offset; - for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); - alloc_cursor < k.k->p.offset; - alloc_cursor++) { + while (iter.k.size) { s->buckets_seen++; - u64 bucket = alloc_cursor & ~(~0ULL << 56); + u64 bucket = iter.pos.offset & ~(~0ULL << 56); if (s->btree_bitmap != BTREE_BITMAP_ANY && s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { @@ -511,32 +514,36 @@ again: goto fail; bucket = sector_to_bucket(ca, - 
round_up(bucket_to_sector(ca, bucket) + 1, + round_up(bucket_to_sector(ca, bucket + 1), 1ULL << ca->mi.btree_bitmap_shift)); - u64 genbits = alloc_cursor >> 56; - alloc_cursor = bucket | (genbits << 56); + alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); - if (alloc_cursor > k.k->p.offset) - bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); s->skipped_mi_btree_bitmap++; - continue; + goto next; } - ob = try_alloc_bucket(trans, ca, watermark, - alloc_cursor, s, k, cl); + ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl); if (ob) { + if (!IS_ERR(ob)) + *dev_alloc_cursor = iter.pos.offset; bch2_set_btree_iter_dontneed(&iter); break; } - } + iter.k.size--; + iter.pos.offset++; + } +next: if (ob || ret) break; } fail: bch2_trans_iter_exit(trans, &iter); - if (!ob && ret) + BUG_ON(ob && ret); + + if (ret) ob = ERR_PTR(ret); if (!ob && alloc_start > ca->mi.first_bucket) { @@ -544,8 +551,6 @@ fail: goto again; } - *dev_alloc_cursor = alloc_cursor; - return ob; } -- cgit v1.2.3 From c97118f1dc5081171625f66f082fe12980a4bcbe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 27 Oct 2024 00:40:43 -0400 Subject: bcachefs: try_alloc_bucket() now uses bch2_check_discard_freespace_key() check_discard_freespace_key() was doing all the same checks as try_alloc_bucket(), but with repair. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 80 ++++++++++++++++++++---------------- fs/bcachefs/alloc_background.h | 2 + fs/bcachefs/alloc_foreground.c | 93 +++++++----------------------------------- 3 files changed, 62 insertions(+), 113 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 72ba7354adac..1f42dd208957 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1332,51 +1332,53 @@ fsck_err: return ret; } -static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans, - struct btree_iter *iter) +int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen) { struct bch_fs *c = trans->c; - struct btree_iter alloc_iter; - struct bkey_s_c alloc_k; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - u64 genbits; - struct bpos pos; enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard ? 
BCH_DATA_need_discard : BCH_DATA_free; struct printbuf buf = PRINTBUF; - int ret; - pos = iter->pos; - pos.offset &= ~(~0ULL << 56); - genbits = iter->pos.offset & (~0ULL << 56); + struct bpos bucket = iter->pos; + bucket.offset &= ~(~0ULL << 56); + u64 genbits = iter->pos.offset & (~0ULL << 56); - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); - ret = bkey_err(alloc_k); + struct btree_iter alloc_iter; + struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, BTREE_ITER_cached); + int ret = bkey_err(alloc_k); if (ret) return ret; - if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), - trans, need_discard_freespace_key_to_invalid_dev_bucket, - "entry in %s btree for nonexistant dev:bucket %llu:%llu", - bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) - goto delete; + if (!bch2_dev_bucket_exists(c, bucket)) { + if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket, + "entry in %s btree for nonexistant dev:bucket %llu:%llu", + bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) + goto delete; + ret = 1; + goto out; + } - a = bch2_alloc_to_v4(alloc_k, &a_convert); + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); + + if (a->data_type != state || + (state == BCH_DATA_free && + genbits != alloc_freespace_genbits(*a))) { + if (fsck_err(trans, need_discard_freespace_key_bad, + "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), + bch2_btree_id_str(iter->btree_id), + iter->pos.inode, + iter->pos.offset, + a->data_type == state, + genbits >> 56, alloc_freespace_genbits(*a) >> 56)) + goto delete; + ret = 1; + goto out; + } - if (fsck_err_on(a->data_type != state || - (state == BCH_DATA_free && - genbits != alloc_freespace_genbits(*a)), - trans, need_discard_freespace_key_bad, - "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), - bch2_btree_id_str(iter->btree_id), - iter->pos.inode, - iter->pos.offset, - a->data_type == state, - genbits >> 56, alloc_freespace_genbits(*a) >> 56)) - goto delete; + *gen = a->gen; out: fsck_err: bch2_set_btree_iter_dontneed(&alloc_iter); @@ -1386,10 +1388,18 @@ fsck_err: delete: ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); + BCH_TRANS_COMMIT_no_enospc) ?: + 1; goto out; } +static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter) +{ + u8 gen; + int ret = bch2_check_discard_freespace_key(trans, iter, &gen); + return ret < 0 ? 
ret : 0; +} + /* * We've already checked that generation numbers in the bucket_gens btree are * valid for buckets that exist; this just checks for keys for nonexistent @@ -1544,7 +1554,7 @@ bkey_err: ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_prefetch, k, - bch2_check_discard_freespace_key(trans, &iter)); + bch2_check_discard_freespace_key_fsck(trans, &iter)); if (ret) goto err; @@ -1557,7 +1567,7 @@ bkey_err: break; ret = bkey_err(k) ?: - bch2_check_discard_freespace_key(trans, &iter); + bch2_check_discard_freespace_key_fsck(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 163a67b97a40..57723a37abb8 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -307,6 +307,8 @@ int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *, int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); + +int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_dev_do_discards(struct bch_dev *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 645d8a269142..955ea6ae868f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -207,9 +207,8 @@ static inline unsigned open_buckets_reserved(enum bch_watermark watermark) } static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - u64 bucket, + u64 bucket, u8 gen, enum bch_watermark watermark, - const struct bch_alloc_v4 *a, struct bucket_alloc_state *s, struct closure *cl) { @@ -261,7 +260,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->dev = ca->dev_idx; - ob->gen = a->gen; + ob->gen = gen; ob->bucket = bucket; spin_unlock(&ob->lock); @@ -282,98 +281,36 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc struct closure *cl) { struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; - struct bkey_s_c k; - struct open_bucket *ob; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - unsigned genbits = freespace_iter->pos.offset >> 56; - struct printbuf buf = PRINTBUF; - int ret; - - if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { - prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" - " freespace key ", - ca->mi.first_bucket, ca->mi.nbuckets); - bch2_bkey_to_text(&buf, &freespace_iter->k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ob = ERR_PTR(-EIO); - goto err; - } - - k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_alloc, POS(ca->dev_idx, b), - BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) { - ob = ERR_PTR(ret); - goto err; - } - - a = bch2_alloc_to_v4(k, &a_convert); - - if (a->data_type != BCH_DATA_free) { - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { - ob = NULL; - goto err; - } + u8 gen; - prt_printf(&buf, "non free bucket in freespace btree\n" - " freespace key "); - bch2_bkey_to_text(&buf, &freespace_iter->k); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ob = ERR_PTR(-EIO); - goto err; - } - - if (genbits != 
(alloc_freespace_genbits(*a) >> 56) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" - " freespace key ", - genbits, alloc_freespace_genbits(*a) >> 56); - bch2_bkey_to_text(&buf, &freespace_iter->k); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ob = ERR_PTR(-EIO); - goto err; - } + int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen); + if (ret < 0) + return ERR_PTR(ret); + if (ret) + return NULL; - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) { + if (unlikely(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers)) { struct bch_backpointer bp; struct bpos bp_pos = POS_MIN; ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1, &bp_pos, &bp, BTREE_ITER_nopreserve); - if (ret) { - ob = ERR_PTR(ret); - goto err; - } + if (ret) + return ERR_PTR(ret); if (!bkey_eq(bp_pos, POS_MAX)) { /* * Bucket may have data in it - we don't call - * bc2h_trans_inconnsistent() because fsck hasn't + * bch2_trans_inconsistent() because fsck hasn't * finished yet */ - ob = NULL; - goto err; + return NULL; } } - ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); - if (!ob) - bch2_set_btree_iter_dontneed(&iter); -err: - if (iter.path) - bch2_set_btree_iter_dontneed(&iter); - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ob; + return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); } /* @@ -452,7 +389,7 @@ again: s->buckets_seen++; - ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); + ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, a->gen, watermark, s, cl); next: bch2_set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); -- cgit v1.2.3 From c8e588135ce2bdb76b2edc640130e8b1aacd1810 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Oct 2024 22:21:20 -0400 Subject: bcachefs: bch2_bucket_do_index(): inconsistent_err -> fsck_err Factor out a common helper, need_discard_or_freespace_err(), which is now used by both fsck and the runtime checks, and can repair. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 83 ++++++++++++++++++++++-------------------- fs/bcachefs/error.c | 7 ++-- fs/bcachefs/error.h | 6 ++- 3 files changed, 51 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1f42dd208957..0c044201787f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -664,17 +664,44 @@ int bch2_alloc_read(struct bch_fs *c) /* Free space/discard btree: */ +static int __need_discard_or_freespace_err(struct btree_trans *trans, + struct bkey_s_c alloc_k, + bool set, bool discard, bool repair) +{ + struct bch_fs *c = trans->c; + enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0); + enum bch_sb_error_id err_id = discard + ? BCH_FSCK_ERR_need_discard_key_wrong + : BCH_FSCK_ERR_freespace_key_wrong; + enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, alloc_k); + + int ret = __bch2_fsck_err(NULL, trans, flags, err_id, + "bucket incorrectly %sset in %s btree\n" + " %s", + set ? "" : "un", + bch2_btree_id_str(btree), + buf.buf); + printbuf_exit(&buf); + return ret; +} + +#define need_discard_or_freespace_err(...) 
\ + fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__)) + +#define need_discard_or_freespace_err_on(cond, ...) \ + (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false) + static int bch2_bucket_do_index(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c alloc_k, const struct bch_alloc_v4 *a, bool set) { - struct bch_fs *c = trans->c; enum btree_id btree; struct bpos pos; - enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; - struct printbuf buf = PRINTBUF; if (a->data_type != BCH_DATA_free && a->data_type != BCH_DATA_need_discard) @@ -699,26 +726,14 @@ static int bch2_bucket_do_index(struct btree_trans *trans, if (ret) return ret; - if (ca->mi.freespace_initialized && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && - bch2_trans_inconsistent_on(old.k->type != old_type, trans, - "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" - " for %s", - set ? "setting" : "clearing", - bch2_btree_id_str(btree), - iter.pos.inode, - iter.pos.offset, - bch2_bkey_types[old.k->type], - bch2_bkey_types[old_type], - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = -EIO; - goto err; - } + need_discard_or_freespace_err_on(ca->mi.freespace_initialized && + !old.k->type != set, + trans, alloc_k, set, + btree == BTREE_ID_need_discard, false); ret = bch2_btree_bit_mod_iter(trans, &iter, set); -err: +fsck_err: bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); return ret; } @@ -1116,7 +1131,6 @@ int bch2_check_alloc_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; - unsigned discard_key_type, freespace_key_type; unsigned gens_offset; struct bkey_s_c k; struct printbuf buf = PRINTBUF; @@ -1136,41 +1150,30 @@ int bch2_check_alloc_key(struct btree_trans *trans, a = bch2_alloc_to_v4(alloc_k, &a_convert); - discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); k = bch2_btree_iter_peek_slot(discard_iter); ret = bkey_err(k); if (ret) goto err; - if (fsck_err_on(k.k->type != discard_key_type, - trans, need_discard_key_wrong, - "incorrect key in need_discard btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[discard_key_type], - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = bch2_btree_bit_mod_iter(trans, discard_iter, !!discard_key_type); + bool is_discarded = a->data_type == BCH_DATA_need_discard; + if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded, + trans, alloc_k, !is_discarded, true, true)) { + ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded); if (ret) goto err; } - freespace_key_type = a->data_type == BCH_DATA_free ? 
KEY_TYPE_set : 0; bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); k = bch2_btree_iter_peek_slot(freespace_iter); ret = bkey_err(k); if (ret) goto err; - if (fsck_err_on(k.k->type != freespace_key_type, - trans, freespace_key_wrong, - "incorrect key in freespace btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[freespace_key_type], - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = bch2_btree_bit_mod_iter(trans, freespace_iter, !!freespace_key_type); + bool is_free = a->data_type == BCH_DATA_free; + if (need_discard_or_freespace_err_on(!!k.k->type != is_free, + trans, alloc_k, !is_free, false, true)) { + ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free); if (ret) goto err; } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 22b0fa405a39..2960baa023f6 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -256,9 +256,10 @@ int __bch2_fsck_err(struct bch_fs *c, !trans && bch2_current_has_btree_trans(c)); - if ((flags & FSCK_CAN_FIX) && - test_bit(err, c->sb.errors_silent)) - return -BCH_ERR_fsck_fix; + if (test_bit(err, c->sb.errors_silent)) + return flags & FSCK_CAN_FIX + ? -BCH_ERR_fsck_fix + : -BCH_ERR_fsck_ignore; bch2_sb_error_count(c, err); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 24c41a9994df..8327a3461535 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -103,9 +103,9 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, void bch2_flush_fsck_errs(struct bch_fs *); -#define __fsck_err(c, _flags, _err_type, ...) \ +#define fsck_err_wrap(_do) \ ({ \ - int _ret = bch2_fsck_err(c, _flags, _err_type, __VA_ARGS__); \ + int _ret = _do; \ if (_ret != -BCH_ERR_fsck_fix && \ _ret != -BCH_ERR_fsck_ignore) { \ ret = _ret; \ @@ -115,6 +115,8 @@ void bch2_flush_fsck_errs(struct bch_fs *); _ret == -BCH_ERR_fsck_fix; \ }) +#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) + /* These macros return true if error should be fixed: */ /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ -- cgit v1.2.3 From acd1fc7b1fb7585780b55edc79b4ee3bfd5ee1ce Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Oct 2024 23:25:17 -0400 Subject: bcachefs: discard_one_bucket() now uses need_discard_or_freespace_err() More conversion of inconsistent errors to fsck errors. 
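(A note on the helper's arguments as used in the hunk below: the three trailing booleans are (set, discard, repair), i.e. whether the entry is incorrectly set rather than incorrectly missing, whether it lives in the need_discard rather than the freespace btree, and whether runtime repair is permitted. Sketch, matching the call the patch adds:)

    /* "bucket incorrectly set in need_discard btree", repairable at runtime: */
    if (need_discard_or_freespace_err(trans, k, true, true, true)) {
            ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false);
            if (ret)
                    goto out;
            goto commit;
    }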
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 0c044201787f..e90561b6def6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1770,11 +1770,13 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; if (a->v.data_type != BCH_DATA_need_discard) { - if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, - trans, "bucket incorrectly set in need_discard btree\n" - "%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = -EIO; + if (need_discard_or_freespace_err(trans, k, true, true, true)) { + ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false); + if (ret) + goto out; + goto commit; + } + goto out; } @@ -1814,16 +1816,20 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); alloc_data_type_set(&a->v, a->v.data_type); - ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + if (ret) + goto out; +commit: + ret = bch2_trans_commit(trans, NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; count_event(c, bucket_discard); s->discarded++; out: +fsck_err: if (discard_locked) discard_in_flight_remove(ca, iter.pos.offset); s->seen++; -- cgit v1.2.3 From 7e5b8e00e2631ee1fa72edeb420e7393ad078ab3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2024 22:12:37 -0400 Subject: bcachefs: Implement bch2_btree_iter_prev_min() A user contributed a filessytem dump, where the dump was actually corrupted (due to being taken while the filesystem was online), but which exposed an interesting bug in fsck - reconstruct_inode(). When itearting in BTREE_ITER_filter_snapshots mode, it's required to give an end position for the iteration and it can't span inode numbers; continuing into the next inode might mean we start seeing keys from a different snapshot tree, that the is_ancestor() checks always filter, thus we're never able to return a key and stop iterating. Backwards iteration never implemented the end position because nothing else needed it - except for reconstuct_inode(). Additionally, backwards iteration is now able to overlay keys from the journal, which will be useful if we ever decide to start doing journal replay in the background. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 256 +++++++++++++++++++++++++++------------ fs/bcachefs/btree_iter.h | 8 +- fs/bcachefs/btree_journal_iter.c | 46 +++++++ fs/bcachefs/btree_journal_iter.h | 2 + fs/bcachefs/errcode.h | 1 - fs/bcachefs/fsck.c | 4 +- fs/bcachefs/io_misc.c | 2 +- 7 files changed, 234 insertions(+), 85 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 580fee86a965..d66d773a37b4 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -270,8 +270,10 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) && iter->pos.snapshot != iter->snapshot); - BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || - bkey_gt(iter->pos, iter->k.p)); + BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) : + !(iter->flags & BTREE_ITER_is_extents) ? 
!bkey_eq(iter->pos, iter->k.p) : + (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || + bkey_gt(iter->pos, iter->k.p))); } static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) @@ -2152,6 +2154,37 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, return k; } +static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end_pos) +{ + struct btree_path *path = btree_iter_path(trans, iter); + + return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id, + path->level, + path->pos, + end_pos, + &iter->journal_idx); +} + +static noinline +struct bkey_s_c btree_trans_peek_prev_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct btree_path *path = btree_iter_path(trans, iter); + struct bkey_i *next_journal = + bch2_btree_journal_peek_prev(trans, iter, + k.k ? k.k->p : path_l(path)->b->key.k.p); + + if (next_journal) { + iter->k = next_journal->k; + k = bkey_i_to_s_c(next_journal); + } + + return k; +} + /* * Checks btree key cache for key at iter->pos and returns it if present, or * bkey_s_c_null: @@ -2457,127 +2490,187 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bch2_btree_iter_peek(iter); } +static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key) +{ + struct btree_trans *trans = iter->trans; + struct bkey_s_c k, k2; + + bch2_btree_iter_verify(iter); + + while (1) { + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_intent, + btree_iter_ip_allocated(iter)); + + int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + break; + } + + struct btree_path *path = btree_iter_path(trans, iter); + struct btree_path_level *l = path_l(path); + + if (unlikely(!l->b)) { + /* No btree nodes at requested level: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + break; + } + + btree_path_set_should_be_locked(trans, path); + + k = btree_path_level_peek_all(trans->c, l, &iter->k); + if (!k.k || bpos_gt(k.k->p, search_key)) { + k = btree_path_level_prev(trans, path, l, &iter->k); + + BUG_ON(k.k && bpos_gt(k.k->p, search_key)); + } + + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && + k.k && + (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + k = k2; + if (bkey_err(k2)) { + bch2_btree_iter_set_pos(iter, iter->pos); + break; + } + } + + if (unlikely(iter->flags & BTREE_ITER_with_journal)) + k = btree_trans_peek_prev_journal(trans, iter, k); + + if (unlikely((iter->flags & BTREE_ITER_with_updates) && + trans->nr_updates)) + bch2_btree_trans_peek_prev_updates(trans, iter, &k); + + if (likely(k.k && !bkey_deleted(k.k))) { + break; + } else if (k.k) { + search_key = bpos_predecessor(k.k->p); + } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { + /* Advance to previous leaf node: */ + search_key = bpos_predecessor(path->l[0].b->data->min_key); + } else { + /* Start of btree: */ + bch2_btree_iter_set_pos(iter, POS_MIN); + k = bkey_s_c_null; + break; + } + } + + bch2_btree_iter_verify(iter); + return k; +} + /** - * bch2_btree_iter_peek_prev() - returns first key less than or equal to + * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to * iterator's current position * @iter: iterator to peek from + * @end: search limit: 
returns keys greater than or equal to @end * * Returns: key if found, or an error extractable with bkey_err(). */ -struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) { struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; struct bkey_s_c k; - struct bkey saved_k; - const struct bch_val *saved_v; btree_path_idx_t saved_path = 0; - int ret; bch2_trans_verify_not_unlocked_or_in_restart(trans); - EBUG_ON(btree_iter_path(trans, iter)->cached || - btree_iter_path(trans, iter)->level); - - if (iter->flags & BTREE_ITER_with_journal) - return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); - - bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); if (iter->flags & BTREE_ITER_filter_snapshots) search_key.snapshot = U32_MAX; while (1) { - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(iter, iter->pos); - k = bkey_s_c_err(ret); + k = __bch2_btree_iter_peek_prev(iter, search_key); + if (unlikely(!k.k)) + goto end; + if (unlikely(bkey_err(k))) goto out_no_locked; - } - - struct btree_path *path = btree_iter_path(trans, iter); - k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); - if (!k.k || - ((iter->flags & BTREE_ITER_is_extents) - ? bpos_ge(bkey_start_pos(k.k), search_key) - : bpos_gt(k.k->p, search_key))) - k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); + if (iter->flags & BTREE_ITER_filter_snapshots) { + struct btree_path *s = saved_path ? trans->paths + saved_path : NULL; + if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) { + /* + * If we have a saved candidate, and we're past + * the last possible snapshot overwrite, return + * it: + */ + bch2_path_put_nokeep(trans, iter->path, + iter->flags & BTREE_ITER_intent); + iter->path = saved_path; + saved_path = 0; + k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); + break; + } - if (unlikely((iter->flags & BTREE_ITER_with_updates) && - trans->nr_updates)) - bch2_btree_trans_peek_prev_updates(trans, iter, &k); + /* + * We need to check against @end before FILTER_SNAPSHOTS because + * if we get to a different inode that requested we might be + * seeing keys for a different snapshot tree that will all be + * filtered out. + */ + if (unlikely(bkey_lt(k.k->p, end))) + goto end; - if (likely(k.k)) { - if (iter->flags & BTREE_ITER_filter_snapshots) { - if (k.k->p.snapshot == iter->snapshot) - goto got_key; + if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) { + search_key = bpos_predecessor(k.k->p); + continue; + } + if (k.k->p.snapshot != iter->snapshot) { /* - * If we have a saved candidate, and we're no - * longer at the same _key_ (not pos), return - * that candidate + * Have a key visible in iter->snapshot, but + * might have overwrites: - save it and keep + * searching. 
Unless it's a whiteout - then drop + * our previous saved candidate: */ - if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { - bch2_path_put_nokeep(trans, iter->path, - iter->flags & BTREE_ITER_intent); - iter->path = saved_path; + if (saved_path) { + bch2_path_put_nokeep(trans, saved_path, + iter->flags & BTREE_ITER_intent); saved_path = 0; - iter->k = saved_k; - k.v = saved_v; - goto got_key; } - if (bch2_snapshot_is_ancestor(trans->c, - iter->snapshot, - k.k->p.snapshot)) { - if (saved_path) - bch2_path_put_nokeep(trans, saved_path, - iter->flags & BTREE_ITER_intent); + if (!bkey_whiteout(k.k)) { saved_path = btree_path_clone(trans, iter->path, iter->flags & BTREE_ITER_intent, _THIS_IP_); - path = btree_iter_path(trans, iter); - trace_btree_path_save_pos(trans, path, trans->paths + saved_path); - saved_k = *k.k; - saved_v = k.v; + trace_btree_path_save_pos(trans, + trans->paths + iter->path, + trans->paths + saved_path); } search_key = bpos_predecessor(k.k->p); continue; } -got_key: - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_all_snapshots)) { + + if (bkey_whiteout(k.k)) { search_key = bkey_predecessor(iter, k.k->p); - if (iter->flags & BTREE_ITER_filter_snapshots) - search_key.snapshot = U32_MAX; + search_key.snapshot = U32_MAX; continue; } - - btree_path_set_should_be_locked(trans, path); - break; - } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { - /* Advance to previous leaf node: */ - search_key = bpos_predecessor(path->l[0].b->data->min_key); - } else { - /* Start of btree: */ - bch2_btree_iter_set_pos(iter, POS_MIN); - k = bkey_s_c_null; - goto out_no_locked; } - } - EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); + EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) : + iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) : + bkey_gt(k.k->p, iter->pos)); + + if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) : + iter->flags & BTREE_ITER_is_extents ? 
bkey_le(k.k->p, end) : + bkey_lt(k.k->p, end))) + goto end; + + break; + } /* Extents can straddle iter->pos: */ - if (bkey_lt(k.k->p, iter->pos)) - iter->pos = k.k->p; + iter->pos = bpos_min(iter->pos, k.k->p);; if (iter->flags & BTREE_ITER_filter_snapshots) iter->pos.snapshot = iter->snapshot; @@ -2587,8 +2680,11 @@ out_no_locked: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - return k; +end: + bch2_btree_iter_set_pos(iter, end); + k = bkey_s_c_null; + goto out_no_locked; } /** diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index cd9022ce15a5..3477fc8c0396 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -389,7 +389,13 @@ static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) return bch2_btree_iter_peek_max(iter, SPOS_MAX); } -struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos); + +static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +{ + return bch2_btree_iter_peek_prev_min(iter, POS_MIN); +} + struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index c9dee4b4627a..c44889ef9817 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -107,6 +107,52 @@ search: return NULL; } +struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) +{ + struct journal_keys *keys = &c->journal_keys; + unsigned iters = 0; + struct journal_key *k; + + BUG_ON(*idx > keys->nr); +search: + if (!*idx) + *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + + while (*idx && + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { + (*idx)++; + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + while ((k = *idx < keys->nr ? 
idx_to_key(keys, *idx) : NULL)) { + if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) + return NULL; + + if (k->overwritten) { + --(*idx); + continue; + } + + if (__journal_key_cmp(btree_id, level, pos, k) >= 0) + return k->k; + + --(*idx); + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + return NULL; +} + struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bpos pos) { diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 754939f604d5..fa8c4f82c9c7 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -45,6 +45,8 @@ static inline int journal_key_cmp(const struct journal_key *l, const struct jour struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); +struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id, + unsigned, struct bpos, struct bpos, size_t *); struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 40bf1e5775a9..18c995d41203 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -193,7 +193,6 @@ x(EINVAL, opt_parse_error) \ x(EINVAL, remove_with_metadata_missing_unimplemented)\ x(EINVAL, remove_would_lose_data) \ - x(EINVAL, btree_iter_with_journal_not_supported) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e0335265de3d..e10abd2e6c69 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -620,7 +620,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 struct btree_iter iter = {}; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); - struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); + struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0)); bch2_trans_iter_exit(trans, &iter); int ret = bkey_err(k); if (ret) @@ -1649,7 +1649,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal if (i->count != count2) { bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", w->last_pos.inode, i->snapshot, i->count, count2); - return -BCH_ERR_internal_fsck_err; + i->count = count2; } if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index ff661a072000..524e31e7411b 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -426,7 +426,7 @@ case LOGGED_OP_FINSERT_shift_extents: bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); k = insert - ? bch2_btree_iter_peek_prev(&iter) + ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0)) : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); if ((ret = bkey_err(k))) goto btree_err; -- cgit v1.2.3 From ac745efb429226434d9a8c5a1496dc0373efa359 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Oct 2024 20:41:06 -0400 Subject: bcachefs: peek_prev_min(): Search forwards for extents, snapshots With extents and snapshots, for slightly different reasons, we may have to search forwards to find a key that compares equal to iter->pos (i.e. a key that peek_prev() should return, as it returns keys <= iter->pos). peek_slot() does this, and is an easy way to fix this case. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d66d773a37b4..ed74f0655d98 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2575,6 +2575,26 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru */ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) { + if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && + !bkey_eq(iter->pos, POS_MAX)) { + /* + * bkey_start_pos(), for extents, is not monotonically + * increasing until after filtering for snapshots: + * + * Thus, for extents we need to search forward until we find a + * real visible extents - easiest to just use peek_slot() (which + * internally uses peek() for extents) + */ + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) + return k; + + if (!bkey_deleted(k.k) && + (!(iter->flags & BTREE_ITER_is_extents) || + bkey_lt(bkey_start_pos(k.k), iter->pos))) + return k; + } + struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; struct bkey_s_c k; @@ -2584,9 +2604,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); - if (iter->flags & BTREE_ITER_filter_snapshots) - search_key.snapshot = U32_MAX; - while (1) { k = __bch2_btree_iter_peek_prev(iter, search_key); if (unlikely(!k.k)) -- cgit v1.2.3 From 7815809fca37b0e6287de349f88b2d5d6eadd82d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 21:28:40 -0500 Subject: bcachefs: Delete backpointers check in try_alloc_bucket() try_alloc_bucket() has a "safety" check, which avoids allocating a bucket if there's any backpointers present. But backpointers are not the source of truth for live data in a bucket, the bucket sector counts are; this check was fairly useless, and we're also deferring backpointers checks from fsck to runtime in the near future. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 955ea6ae868f..6d665b720f72 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -290,26 +290,6 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc if (ret) return NULL; - if (unlikely(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers)) { - struct bch_backpointer bp; - struct bpos bp_pos = POS_MIN; - - ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1, - &bp_pos, &bp, - BTREE_ITER_nopreserve); - if (ret) - return ERR_PTR(ret); - - if (!bkey_eq(bp_pos, POS_MAX)) { - /* - * Bucket may have data in it - we don't call - * bch2_trans_inconsistent() because fsck hasn't - * finished yet - */ - return NULL; - } - } - return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); } -- cgit v1.2.3 From 9e92d6e9efb02c60c700a97e981b8cb97ed9451b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 21:53:38 -0500 Subject: bcachefs: Kill bch2_get_next_backpointer() Since for quite some time backpointers have only been stored in the backpointers btree, not alloc keys (an aborted experiment, support for which has been removed) - we can replace get_next_backpointer() with simple btree iteration. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 125 +++++++++++++-------------------------------- fs/bcachefs/backpointers.h | 11 ++-- fs/bcachefs/ec.c | 41 ++++++--------- fs/bcachefs/move.c | 41 +++++++-------- 4 files changed, 75 insertions(+), 143 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index f323ce4b0b33..a9ffbea277bd 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -215,59 +215,9 @@ err: return ret; } -/* - * Find the next backpointer >= *bp_offset: - */ -int bch2_get_next_backpointer(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, int gen, - struct bpos *bp_pos, - struct bch_backpointer *bp, - unsigned iter_flags) -{ - struct bpos bp_end_pos = bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0); - struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; - struct bkey_s_c k; - int ret = 0; - - if (bpos_ge(*bp_pos, bp_end_pos)) - goto done; - - if (gen >= 0) { - k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, - bucket, BTREE_ITER_cached|iter_flags); - ret = bkey_err(k); - if (ret) - goto out; - - if (k.k->type != KEY_TYPE_alloc_v4 || - bkey_s_c_to_alloc_v4(k).v->gen != gen) - goto done; - } - - *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(ca, bucket, 0)); - - for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, - *bp_pos, iter_flags, k, ret) { - if (bpos_ge(k.k->p, bp_end_pos)) - break; - - *bp_pos = k.k->p; - *bp = *bkey_s_c_to_backpointer(k).v; - goto out; - } -done: - *bp_pos = SPOS_MAX; -out: - bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - return ret; -} - -static void backpointer_not_found(struct btree_trans *trans, - struct bpos bp_pos, - struct bch_backpointer bp, - struct bkey_s_c k) +static void backpointer_target_not_found(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct bkey_s_c target_k) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; @@ -281,22 +231,22 @@ static void backpointer_not_found(struct btree_trans *trans, return; struct bpos 
bucket; - if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + if (!bp_pos_to_bucket_nodev(c, bp.k->p, &bucket)) return; prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", - bp.level ? "btree node" : "extent"); + bp.v->level ? "btree node" : "extent"); prt_printf(&buf, "bucket: "); bch2_bpos_to_text(&buf, bucket); prt_printf(&buf, "\n "); prt_printf(&buf, "backpointer pos: "); - bch2_bpos_to_text(&buf, bp_pos); + bch2_bpos_to_text(&buf, bp.k->p); prt_printf(&buf, "\n "); - bch2_backpointer_to_text(&buf, &bp); + bch2_backpointer_to_text(&buf, bp.v); prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); + bch2_bkey_val_to_text(&buf, c, target_k); if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) bch_err_ratelimited(c, "%s", buf.buf); else @@ -306,21 +256,20 @@ static void backpointer_not_found(struct btree_trans *trans, } struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, struct btree_iter *iter, - struct bpos bp_pos, - struct bch_backpointer bp, unsigned iter_flags) { - if (likely(!bp.level)) { + if (likely(!bp.v->level)) { struct bch_fs *c = trans->c; struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + if (!bp_pos_to_bucket_nodev(c, bp.k->p, &bucket)) return bkey_s_c_err(-EIO); bch2_trans_node_iter_init(trans, iter, - bp.btree_id, - bp.pos, + bp.v->btree_id, + bp.v->pos, 0, 0, iter_flags); struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); @@ -329,14 +278,15 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, return k; } - if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + if (k.k && + extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bucket, *bp.v)) return k; bch2_trans_iter_exit(trans, iter); - backpointer_not_found(trans, bp_pos, bp, k); + backpointer_target_not_found(trans, bp, k); return bkey_s_c_null; } else { - struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); + struct btree *b = bch2_backpointer_get_node(trans, bp, iter); if (IS_ERR_OR_NULL(b)) { bch2_trans_iter_exit(trans, iter); @@ -347,39 +297,38 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, } struct btree *bch2_backpointer_get_node(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos bp_pos, - struct bch_backpointer bp) + struct bkey_s_c_backpointer bp, + struct btree_iter *iter) { struct bch_fs *c = trans->c; - BUG_ON(!bp.level); + BUG_ON(!bp.v->level); struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + if (!bp_pos_to_bucket_nodev(c, bp.k->p, &bucket)) return ERR_PTR(-EIO); bch2_trans_node_iter_init(trans, iter, - bp.btree_id, - bp.pos, + bp.v->btree_id, + bp.v->pos, 0, - bp.level - 1, + bp.v->level - 1, 0); struct btree *b = bch2_btree_iter_peek_node(iter); if (IS_ERR_OR_NULL(b)) goto err; - BUG_ON(b->c.level != bp.level - 1); + BUG_ON(b->c.level != bp.v->level - 1); - if (extent_matches_bp(c, bp.btree_id, bp.level, + if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, bkey_i_to_s_c(&b->key), - bucket, bp)) + bucket, *bp.v)) return b; if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { - backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); + backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key)); b = NULL; } err: @@ -581,10 +530,10 @@ check_existing_bp: if (bp_k.k->type != KEY_TYPE_backpointer) goto missing; - struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v; + struct 
bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); struct bkey_s_c other_extent = - bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0); + bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0); ret = bkey_err(other_extent); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) ret = 0; @@ -603,7 +552,7 @@ check_existing_bp: bch_err(c, "%s", buf.buf); if (other_extent.k->size <= orig_k.k->size) { - ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode); + ret = drop_dev_and_update(trans, other_bp.v->btree_id, other_extent, bucket.inode); if (ret) goto err; goto out; @@ -615,7 +564,7 @@ check_existing_bp: } } - ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); + ret = check_extent_checksum(trans, other_bp.v->btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); if (ret < 0) goto err; if (ret) { @@ -623,7 +572,7 @@ check_existing_bp: goto missing; } - ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode); + ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.v->btree_id, other_extent, bucket.inode); if (ret < 0) goto err; if (ret) { @@ -964,18 +913,16 @@ static int check_one_backpointer(struct btree_trans *trans, struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); struct bch_fs *c = trans->c; - struct btree_iter iter; struct bbpos pos = bp_to_bbpos(*bp.v); - struct bkey_s_c k; struct printbuf buf = PRINTBUF; - int ret; if (bbpos_cmp(pos, start) < 0 || bbpos_cmp(pos, end) > 0) return 0; - k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0); + int ret = bkey_err(k); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) return 0; if (ret) diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 3b29fdf519dd..74c96aee713e 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -165,13 +165,10 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, __bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors); } -int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int, - struct bpos *, struct bch_backpointer *, unsigned); -struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, - struct bpos, struct bch_backpointer, - unsigned); -struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, - struct bpos, struct bch_backpointer); +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, + struct btree_iter *, unsigned); +struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, + struct btree_iter *); int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 5c404f24bddc..146e03ffe8b7 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1267,11 +1267,10 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct bch_dev *ca, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, - struct bpos *bp_pos) + struct bkey_s_c_backpointer bp) { struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_fs *c = trans->c; - struct bch_backpointer bp; struct btree_iter iter; struct bkey_s_c k; const struct 
bch_extent_ptr *ptr_c; @@ -1280,33 +1279,26 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct bkey_i *n; int ret, dev, block; - ret = bch2_get_next_backpointer(trans, ca, bucket, gen, - bp_pos, &bp, BTREE_ITER_cached); - if (ret) - return ret; - if (bpos_eq(*bp_pos, SPOS_MAX)) - return 0; - - if (bp.level) { + if (bp.v->level) { struct printbuf buf = PRINTBUF; struct btree_iter node_iter; struct btree *b; - b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); + b = bch2_backpointer_get_node(trans, bp, &node_iter); bch2_trans_iter_exit(trans, &node_iter); if (!b) return 0; prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); - bch2_backpointer_to_text(&buf, &bp); + bch2_backpointer_to_text(&buf, bp.v); bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); return -EIO; } - k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent); + k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent); ret = bkey_err(k); if (ret) return ret; @@ -1365,7 +1357,6 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_fs *c = trans->c; struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_extent_ptr ptr = v->ptrs[block]; - struct bpos bp_pos = POS_MIN; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); @@ -1374,18 +1365,20 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); - while (1) { - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc, - ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos)); - if (ret) - break; - if (bkey_eq(bp_pos, POS_MAX)) + ret = for_each_btree_key_commit(trans, bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(ca, bucket_pos, 0), 0, bp_k, + NULL, NULL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, ({ + if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0))) break; - bp_pos = bpos_nosnap_successor(bp_pos); - } + if (bp_k.k->type != KEY_TYPE_backpointer) + continue; + + ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, + bkey_s_c_to_backpointer(bp_k)); + })); bch2_dev_put(ca); return ret; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a6b503278519..88ab9d7e1a1b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -670,16 +670,12 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bch_fs *c = trans->c; bool is_kthread = current->flags & PF_KTHREAD; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct btree_iter iter; + struct btree_iter iter = {}, bp_iter = {}; struct bkey_buf sk; - struct bch_backpointer bp; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; struct bkey_s_c k; struct data_update_opts data_opts; unsigned dirty_sectors, bucket_size; u64 fragmentation; - struct bpos bp_pos = POS_MIN; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); @@ -695,21 +691,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, */ bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - bucket, BTREE_ITER_cached); - ret = lockrestart_do(trans, - bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(ca, bucket, 0), 0); bch_err_msg(c, ret, "looking up alloc key"); if (ret) goto err; - a = bch2_alloc_to_v4(k, &a_convert); - dirty_sectors 
= bch2_bucket_sectors_dirty(*a); - bucket_size = ca->mi.bucket_size; - fragmentation = alloc_lru_idx_fragmentation(*a, ca); - ret = bch2_btree_write_buffer_tryflush(trans); bch_err_msg(c, ret, "flushing btree write buffer"); if (ret) @@ -721,18 +709,24 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); - ret = bch2_get_next_backpointer(trans, ca, bucket, gen, - &bp_pos, &bp, - BTREE_ITER_cached); + k = bch2_btree_iter_peek(&bp_iter); + ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) goto err; - if (bkey_eq(bp_pos, POS_MAX)) + + if (!k.k || + bkey_ge(k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0))) break; - if (!bp.level) { - k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); + if (k.k->type != KEY_TYPE_backpointer) + goto next; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + + if (!bp.v->level) { + k = bch2_backpointer_get_key(trans, bp, &iter, 0); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -785,7 +779,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, } else { struct btree *b; - b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp); + b = bch2_backpointer_get_node(trans, bp, &iter); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) continue; @@ -814,11 +808,12 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, } } next: - bp_pos = bpos_nosnap_successor(bp_pos); + bch2_btree_iter_advance(&bp_iter); } trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); err: + bch2_trans_iter_exit(trans, &bp_iter); bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); return ret; -- cgit v1.2.3 From cec51e0a5d6d48eeeef6bfcfb8c5e91147dcdddb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 03:31:01 -0500 Subject: bcachefs: add missing BTREE_ITER_intent this fixes excessive transaction restarts due to trans_commit having to upgrade Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index f11e11279f01..f97ebb30f6c0 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -216,6 +216,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, SPOS(0, extent_iter->pos.inode, extent_iter->snapshot), + BTREE_ITER_intent| BTREE_ITER_cached); int ret = bkey_err(k); if (unlikely(ret)) -- cgit v1.2.3 From 6a4ce7a92fcc12762154c7f695c969787780c7c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Nov 2024 21:03:53 -0500 Subject: bcachefs: compression workspaces should be indexed by opt, not type type includes lz4 and lz4_old, which do not get different compression workspaces, and incompressible, a fake type - BCH_COMPRESSION_OPTS() is the correct enum to use. 
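The distinction being fixed: the on-disk compression *type* enum carries format variants (lz4_old) and the synthetic incompressible marker, while the user-visible *option* enum is just none/lz4/gzip/zstd - and a workspace only needs to exist per option. A follow-up patch in this series adds a real mapping helper, bch2_compression_type_to_opt(); the snippet below is an illustrative, self-contained version of the same idea, with made-up enum names and values rather than the actual bcachefs definitions.

    #include <stdio.h>

    enum compression_type {         /* what gets recorded on disk */
            TYPE_none,
            TYPE_lz4_old,
            TYPE_gzip,
            TYPE_lz4,
            TYPE_zstd,
            TYPE_incompressible,    /* synthetic: data that would not compress */
            TYPE_NR,
    };

    enum compression_opt {          /* what the user can actually ask for */
            OPT_none,
            OPT_lz4,
            OPT_gzip,
            OPT_zstd,
            OPT_NR,
    };

    /* several types share one option, so workspaces are sized per option */
    static enum compression_opt type_to_opt(enum compression_type t)
    {
            switch (t) {
            case TYPE_lz4_old:
            case TYPE_lz4:
                    return OPT_lz4;
            case TYPE_gzip:
                    return OPT_gzip;
            case TYPE_zstd:
                    return OPT_zstd;
            default:                /* none and incompressible need no workspace */
                    return OPT_none;
            }
    }

    int main(void)
    {
            printf("%d workspace pools instead of %d\n", OPT_NR, TYPE_NR);
            return 0;
    }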
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/compress.c | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c59a58b93a92..60ad547c52a8 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -982,7 +982,7 @@ struct bch_fs { struct rhashtable promote_table; mempool_t compression_bounce[2]; - mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; + mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; mempool_t decompress_workspace; size_t zstd_workspace_size; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 1410365a8891..4f541a195c84 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -394,8 +394,11 @@ static unsigned __bio_compress(struct bch_fs *c, unsigned pad; int ret = 0; - BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); - BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); + /* bch2_compression_decode catches unknown compression types: */ + BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR); + + mempool_t *workspace_pool = &c->compress_workspace[compression.type]; + BUG_ON(!mempool_initialized(workspace_pool)); /* If it's only one block, don't bother trying to compress: */ if (src->bi_iter.bi_size <= c->opts.block_size) @@ -404,7 +407,7 @@ static unsigned __bio_compress(struct bch_fs *c, dst_data = bio_map_or_bounce(c, dst, WRITE); src_data = bio_map_or_bounce(c, src, READ); - workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS); + workspace = mempool_alloc(workspace_pool, GFP_NOFS); *src_len = src->bi_iter.bi_size; *dst_len = dst->bi_iter.bi_size; @@ -447,7 +450,7 @@ static unsigned __bio_compress(struct bch_fs *c, *src_len = round_down(*src_len, block_bytes(c)); } - mempool_free(workspace, &c->compress_workspace[compression_type]); + mempool_free(workspace, workspace_pool); if (ret) goto err; @@ -576,17 +579,17 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) struct { unsigned feature; - enum bch_compression_type type; + enum bch_compression_opts type; size_t compress_workspace; size_t decompress_workspace; } compression_types[] = { - { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, + { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4, max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS), 0 }, - { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, + { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip, zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_inflate_workspacesize(), }, - { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, + { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd, c->zstd_workspace_size, zstd_dctx_workspace_bound() }, }, *i; -- cgit v1.2.3 From e1702b989151649b27a73b776fc5bc494d03f725 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Nov 2024 00:52:20 -0500 Subject: bcachefs: Don't use a shared decompress workspace mempool gzip and zstd require different decompress workspace sizes, and if we start with one and then start using the other at runtime we may not get the correct size Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/compress.c | 52 ++++++++++++++++++++++++++++++-------------------- fs/bcachefs/errcode.h | 1 - 3 files changed, 31 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 60ad547c52a8..7a947d43d504 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -983,7 +983,6 @@ struct bch_fs { mempool_t 
compression_bounce[2]; mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; - mempool_t decompress_workspace; size_t zstd_workspace_size; struct crypto_shash *sha256; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 4f541a195c84..2813e4556f0d 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -9,6 +9,24 @@ #include #include +static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type) +{ + switch (type) { + case BCH_COMPRESSION_TYPE_none: + case BCH_COMPRESSION_TYPE_incompressible: + return BCH_COMPRESSION_OPT_none; + case BCH_COMPRESSION_TYPE_lz4_old: + case BCH_COMPRESSION_TYPE_lz4: + return BCH_COMPRESSION_OPT_lz4; + case BCH_COMPRESSION_TYPE_gzip: + return BCH_COMPRESSION_OPT_gzip; + case BCH_COMPRESSION_TYPE_zstd: + return BCH_COMPRESSION_OPT_zstd; + default: + BUG(); + } +} + /* Bounce buffer: */ struct bbuf { void *b; @@ -158,6 +176,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, void *workspace; int ret; + enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); + mempool_t *workspace_pool = &c->compress_workspace[opt]; + BUG_ON(!mempool_initialized(workspace_pool)); + src_data = bio_map_or_bounce(c, src, READ); switch (crc.compression_type) { @@ -176,13 +198,13 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, .avail_out = dst_len, }; - workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); + workspace = mempool_alloc(workspace_pool, GFP_NOFS); zlib_set_workspace(&strm, workspace); zlib_inflateInit2(&strm, -MAX_WBITS); ret = zlib_inflate(&strm, Z_FINISH); - mempool_free(workspace, &c->decompress_workspace); + mempool_free(workspace, workspace_pool); if (ret != Z_STREAM_END) goto err; @@ -195,14 +217,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, if (real_src_len > src_len - 4) goto err; - workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); + workspace = mempool_alloc(workspace_pool, GFP_NOFS); ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); ret = zstd_decompress_dctx(ctx, dst_data, dst_len, src_data.b + 4, real_src_len); - mempool_free(workspace, &c->decompress_workspace); + mempool_free(workspace, workspace_pool); if (ret != dst_len) goto err; @@ -562,7 +584,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) { unsigned i; - mempool_exit(&c->decompress_workspace); for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) mempool_exit(&c->compress_workspace[i]); mempool_exit(&c->compression_bounce[WRITE]); @@ -571,7 +592,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { - size_t decompress_workspace_size = 0; ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); @@ -581,17 +601,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) unsigned feature; enum bch_compression_opts type; size_t compress_workspace; - size_t decompress_workspace; } compression_types[] = { { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4, - max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS), - 0 }, + max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip, - zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), - zlib_inflate_workspacesize(), }, + max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), + zlib_inflate_workspacesize()) }, { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd, - c->zstd_workspace_size, - zstd_dctx_workspace_bound() }, + 
max(c->zstd_workspace_size, + zstd_dctx_workspace_bound()) }, }, *i; bool have_compressed = false; @@ -616,9 +634,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); i++) { - decompress_workspace_size = - max(decompress_workspace_size, i->decompress_workspace); - if (!(features & (1 << i->feature))) continue; @@ -631,11 +646,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) return -BCH_ERR_ENOMEM_compression_workspace_init; } - if (!mempool_initialized(&c->decompress_workspace) && - mempool_init_kvmalloc_pool(&c->decompress_workspace, - 1, decompress_workspace_size)) - return -BCH_ERR_ENOMEM_decompression_workspace_init; - return 0; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 18c995d41203..3affdafc2c04 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -54,7 +54,6 @@ x(ENOMEM, ENOMEM_compression_bounce_read_init) \ x(ENOMEM, ENOMEM_compression_bounce_write_init) \ x(ENOMEM, ENOMEM_compression_workspace_init) \ - x(ENOMEM, ENOMEM_decompression_workspace_init) \ x(ENOMEM, ENOMEM_bucket_gens) \ x(ENOMEM, ENOMEM_buckets_nouse) \ x(ENOMEM, ENOMEM_usage_init) \ -- cgit v1.2.3 From 3d0b3b51c5abaf27e35d9eeca880eed44c8690b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 23:03:40 -0500 Subject: bcachefs: Don't BUG_ON() when superblock feature wasn't set for compressed data We don't allocate the mempools for compression/decompression unless we need them - but that means there's an inconsistency to check for. Reported-by: syzbot+cb3fbcfb417448cfd278@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 29 +++++++++++++++++++++++++++-- fs/bcachefs/errcode.h | 1 + fs/bcachefs/opts.c | 2 +- fs/bcachefs/opts.h | 1 + fs/bcachefs/sb-errors_format.h | 4 +++- 5 files changed, 33 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 2813e4556f0d..f99ff1819597 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -2,7 +2,9 @@ #include "bcachefs.h" #include "checksum.h" #include "compress.h" +#include "error.h" #include "extents.h" +#include "opts.h" #include "super-io.h" #include @@ -178,7 +180,16 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); mempool_t *workspace_pool = &c->compress_workspace[opt]; - BUG_ON(!mempool_initialized(workspace_pool)); + if (unlikely(!mempool_initialized(workspace_pool))) { + if (fsck_err(c, compression_type_not_marked_in_sb, + "compression type %s set but not marked in superblock", + __bch2_compression_types[crc.compression_type])) + ret = bch2_check_set_has_compressed_data(c, opt); + else + ret = -BCH_ERR_compression_workspace_not_initialized; + if (ret) + goto out; + } src_data = bio_map_or_bounce(c, src, READ); @@ -234,6 +245,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, BUG(); } ret = 0; +fsck_err: out: bio_unmap_or_unbounce(c, src_data); return ret; @@ -420,7 +432,17 @@ static unsigned __bio_compress(struct bch_fs *c, BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR); mempool_t *workspace_pool = &c->compress_workspace[compression.type]; - BUG_ON(!mempool_initialized(workspace_pool)); + if (unlikely(!mempool_initialized(workspace_pool))) { + if (fsck_err(c, compression_opt_not_marked_in_sb, + "compression opt %s set but not marked in superblock", + 
bch2_compression_opts[compression.type])) { + ret = bch2_check_set_has_compressed_data(c, compression.type); + if (ret) /* memory allocation failure, don't compress */ + return 0; + } else { + return 0; + } + } /* If it's only one block, don't bother trying to compress: */ if (src->bi_iter.bi_size <= c->opts.block_size) @@ -502,6 +524,9 @@ out: err: ret = BCH_COMPRESSION_TYPE_incompressible; goto out; +fsck_err: + ret = 0; + goto out; } unsigned bch2_bio_compress(struct bch_fs *c, diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 3affdafc2c04..2dda7f962e5b 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -54,6 +54,7 @@ x(ENOMEM, ENOMEM_compression_bounce_read_init) \ x(ENOMEM, ENOMEM_compression_bounce_write_init) \ x(ENOMEM, ENOMEM_compression_workspace_init) \ + x(EIO, compression_workspace_not_initialized) \ x(ENOMEM, ENOMEM_bucket_gens) \ x(ENOMEM, ENOMEM_buckets_nouse) \ x(ENOMEM, ENOMEM_usage_init) \ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 0ba58d74c21f..6772faf385a5 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -54,7 +54,7 @@ const char * const __bch2_csum_opts[] = { NULL }; -static const char * const __bch2_compression_types[] = { +const char * const __bch2_compression_types[] = { BCH_COMPRESSION_TYPES() NULL }; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 6b29339ea725..ea69099e681d 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -17,6 +17,7 @@ extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; extern const char * const __bch2_csum_opts[]; +extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const __bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index f2b38493356d..d5b18ff1645c 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -305,7 +305,9 @@ enum bch_fsck_flags { x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ - x(MAX, 295, 0) + x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ + x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ + x(MAX, 297, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, -- cgit v1.2.3 From 57026c41c9c5009f0d363cb11b15763c5289abb9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Nov 2024 23:54:19 -0500 Subject: bcachefs: kill bch2_journal_entries_free() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 17 ++++++----------- fs/bcachefs/btree_journal_iter.h | 2 -- 2 files changed, 6 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index c44889ef9817..39898baa8854 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -527,16 +527,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, /* sort and dedup all keys in the journal: */ -void bch2_journal_entries_free(struct bch_fs *c) -{ - struct journal_replay **i; - struct genradix_iter iter; - - genradix_for_each(&c->journal_entries, iter, i) - kvfree(*i); - genradix_free(&c->journal_entries); -} - /* * When keys compare equal, oldest compares first: */ @@ -569,7 +559,12 @@ void bch2_journal_keys_put(struct bch_fs *c) keys->data = NULL; keys->nr = keys->gap = keys->size = 0; - bch2_journal_entries_free(c); + struct journal_replay **i; + struct genradix_iter iter; + + genradix_for_each(&c->journal_entries, iter, i) + kvfree(*i); + genradix_free(&c->journal_entries); } static void __journal_keys_sort(struct journal_keys *keys) diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index fa8c4f82c9c7..5ddbb7571770 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -81,8 +81,6 @@ static inline void bch2_journal_keys_put_initial(struct bch_fs *c) c->journal_keys.initial_ref_held = false; } -void bch2_journal_entries_free(struct bch_fs *); - int bch2_journal_keys_sort(struct bch_fs *); void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, -- cgit v1.2.3 From 06d7a56fe0bfc4f8c17bff6ca12d8bdd33e9f7e8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 14:20:35 -0500 Subject: bcachefs: journal keys: sort keys for interior nodes first There's an unavoidable issue with btree lookups when we're overlaying journal keys and the journal has many deletions for keys present in the btree - peek operations will have to iterate over all those deletions to find the next live key to return. This is mainly a problem for lookups in interior nodes, if we have to traverse to a leaf. Looking up an insert position in a leaf (for journal replay) doesn't have to find the next live key, but walking down the btree does. So to ameloriate this, change journal key sort ordering so that we replay keys from roots and interior nodes first. 
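The ordering change itself is small: journal keys now compare by level in descending order first (so root and interior-node keys sort, and therefore replay, before leaf keys), then by btree id, with the key position as the final tiebreak in __journal_key_cmp(). Below is a minimal user-space sketch of that comparator over a toy key struct; the field names and the qsort() harness are illustrative, not the bcachefs code.

    #include <stdio.h>
    #include <stdlib.h>

    struct key {                    /* toy stand-in for struct journal_key */
            unsigned btree_id;
            unsigned level;         /* 0 = leaf, higher = interior node */
            unsigned long long pos;
    };

    static int cmp_int(unsigned long long l, unsigned long long r)
    {
            return (l > r) - (l < r);
    }

    /* level descends first, so interior-node keys replay before leaf keys */
    static int key_cmp(const void *_l, const void *_r)
    {
            const struct key *l = _l, *r = _r;

            int c = -cmp_int(l->level, r->level);
            if (c)
                    return c;
            c = cmp_int(l->btree_id, r->btree_id);
            if (c)
                    return c;
            return cmp_int(l->pos, r->pos);
    }

    int main(void)
    {
            struct key keys[] = {
                    { .btree_id = 0, .level = 0, .pos = 10 },
                    { .btree_id = 1, .level = 2, .pos = 5  },
                    { .btree_id = 0, .level = 1, .pos = 7  },
            };

            qsort(keys, 3, sizeof(keys[0]), key_cmp);
            for (int i = 0; i < 3; i++)
                    printf("level %u btree %u pos %llu\n",
                           keys[i].level, keys[i].btree_id, keys[i].pos);
            return 0;
    }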
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 10 ++++------ fs/bcachefs/btree_journal_iter.h | 13 ++++++++++--- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 39898baa8854..dbc9bc233cca 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -172,9 +172,8 @@ static void journal_iter_verify(struct journal_iter *iter) if (iter->idx < keys->size) { struct journal_key *k = keys->data + iter->idx; - int cmp = cmp_int(k->btree_id, iter->btree_id) ?: - cmp_int(k->level, iter->level); - BUG_ON(cmp < 0); + int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); + BUG_ON(cmp > 0); } } @@ -365,9 +364,8 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) while (iter->idx < iter->keys->size) { struct journal_key *k = iter->keys->data + iter->idx; - int cmp = cmp_int(k->btree_id, iter->btree_id) ?: - cmp_int(k->level, iter->level); - if (cmp > 0) + int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); + if (cmp < 0) break; BUG_ON(cmp); diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 5ddbb7571770..118ada4cdd1b 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -28,14 +28,21 @@ struct btree_and_journal_iter { bool prefetch; }; +static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, + unsigned l_level, + const struct journal_key *r) +{ + return -cmp_int(l_level, r->level) ?: + cmp_int(l_btree_id, r->btree_id); +} + static inline int __journal_key_cmp(enum btree_id l_btree_id, unsigned l_level, struct bpos l_pos, const struct journal_key *r) { - return (cmp_int(l_btree_id, r->btree_id) ?: - cmp_int(l_level, r->level) ?: - bpos_cmp(l_pos, r->k->k.p)); + return __journal_key_btree_cmp(l_btree_id, l_level, r) ?: + bpos_cmp(l_pos, r->k->k.p); } static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -- cgit v1.2.3 From 854724d116cbd1145cd3888239a5cc7ea44d8cdc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 14:39:46 -0500 Subject: bcachefs: btree_and_journal_iter: don't iterate over too many whiteouts when prefetching To help ameloriate issues with peek operations having to skip over deletions in the journal - just bail out if all we're doing is prefetching btree nodes. Since btree node prefetching runs every time we iterate to a new node, and has to sequentially scan ahead, this avoids another O(n^2). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 ++ fs/bcachefs/btree_journal_iter.c | 7 +++++++ fs/bcachefs/btree_journal_iter.h | 1 + 3 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ed74f0655d98..89f9665ce70d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -825,6 +825,8 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p bch2_bkey_buf_init(&tmp); + jiter->fail_if_too_many_whiteouts = true; + while (nr-- && !ret) { if (!bch2_btree_node_relock(trans, path, path->level)) break; diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index dbc9bc233cca..cc7f5fad90c6 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -426,6 +426,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter : (level > 1 ? 
1 : 16); iter.prefetch = false; + iter.fail_if_too_many_whiteouts = true; bch2_bkey_buf_init(&tmp); while (nr--) { @@ -444,6 +445,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; + size_t iters = 0; if (iter->prefetch && iter->journal.level) btree_and_journal_iter_prefetch(iter); @@ -451,6 +453,11 @@ again: if (iter->at_end) return bkey_s_c_null; + iters++; + + if (iters > 20 && iter->fail_if_too_many_whiteouts) + return bkey_s_c_null; + while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && bpos_lt(btree_k.k->p, iter->pos)) bch2_journal_iter_advance_btree(iter); diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 118ada4cdd1b..9e8f8ab1c6ff 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -26,6 +26,7 @@ struct btree_and_journal_iter { struct bpos pos; bool at_end; bool prefetch; + bool fail_if_too_many_whiteouts; }; static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, -- cgit v1.2.3 From eae6c4a6255b03161c5c2c3e3a9cbfb4e22fa025 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 02:23:24 -0500 Subject: bcachefs: fix O(n^2) issue with whiteouts in journal keys The journal_keys array can't be substantially modified after we go RW, because lookups need to be able to check it locklessly - thus we're limited on what we can do when a key in the journal has been overwritten. This is a problem when there's many overwrites to skip over for peek() operations. To fix this, add tracking of ranges of overwrites: we create a range entry when there's more than one contiguous whiteout. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 23 +---- fs/bcachefs/btree_journal_iter.c | 156 +++++++++++++++++++++++++++++---- fs/bcachefs/btree_journal_iter.h | 2 + fs/bcachefs/btree_journal_iter_types.h | 36 ++++++++ fs/bcachefs/super.c | 3 +- 5 files changed, 179 insertions(+), 41 deletions(-) create mode 100644 fs/bcachefs/btree_journal_iter_types.h (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7a947d43d504..11f9ed42a9da 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -205,6 +205,7 @@ #include #include "bcachefs_format.h" +#include "btree_journal_iter_types.h" #include "disk_accounting_types.h" #include "errcode.h" #include "fifo.h" @@ -658,28 +659,6 @@ struct journal_seq_blacklist_table { } entries[]; }; -struct journal_keys { - /* must match layout in darray_types.h */ - size_t nr, size; - struct journal_key { - u64 journal_seq; - u32 journal_offset; - enum btree_id btree_id:8; - unsigned level:8; - bool allocated; - bool overwritten; - struct bkey_i *k; - } *data; - /* - * Gap buffer: instead of all the empty space in the array being at the - * end of the buffer - from @nr to @size - the empty space is at @gap. - * This means that sequential insertions are O(n) instead of O(n^2). 
- */ - size_t gap; - atomic_t ref; - bool initial_ref_held; -}; - struct btree_trans_buf { struct btree_trans *trans; }; diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index cc7f5fad90c6..de3db161d6ab 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -16,6 +16,17 @@ * operations for the regular btree iter code to use: */ +static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos) +{ + size_t gap_size = keys->size - keys->nr; + + BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size); + + if (pos >= keys->gap) + pos -= gap_size; + return pos; +} + static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) { size_t gap_size = keys->size - keys->nr; @@ -84,27 +95,37 @@ search: } } + struct bkey_i *ret = NULL; + rcu_read_lock(); /* for overwritten_ranges */ + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) - return NULL; + break; if (k->overwritten) { - (*idx)++; + if (k->overwritten_range) + *idx = rcu_dereference(k->overwritten_range)->end; + else + *idx += 1; continue; } - if (__journal_key_cmp(btree_id, level, pos, k) <= 0) - return k->k; + if (__journal_key_cmp(btree_id, level, pos, k) <= 0) { + ret = k->k; + break; + } (*idx)++; iters++; if (iters == 10) { *idx = 0; + rcu_read_unlock(); goto search; } } - return NULL; + rcu_read_unlock(); + return ret; } struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, @@ -130,17 +151,25 @@ search: } } + struct bkey_i *ret = NULL; + rcu_read_lock(); /* for overwritten_ranges */ + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) - return NULL; + break; if (k->overwritten) { - --(*idx); + if (k->overwritten_range) + *idx = rcu_dereference(k->overwritten_range)->start - 1; + else + *idx -= 1; continue; } - if (__journal_key_cmp(btree_id, level, pos, k) >= 0) - return k->k; + if (__journal_key_cmp(btree_id, level, pos, k) >= 0) { + ret = k->k; + break; + } --(*idx); iters++; @@ -150,7 +179,8 @@ search: } } - return NULL; + rcu_read_unlock(); + return ret; } struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, @@ -163,6 +193,7 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree static void journal_iter_verify(struct journal_iter *iter) { +#ifdef CONFIG_BCACHEFS_DEBUG struct journal_keys *keys = iter->keys; size_t gap_size = keys->size - keys->nr; @@ -175,6 +206,7 @@ static void journal_iter_verify(struct journal_iter *iter) int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); BUG_ON(cmp > 0); } +#endif } static void journal_iters_fix(struct bch_fs *c) @@ -335,6 +367,68 @@ bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, bkey_deleted(&keys->data[idx].k->k)); } +static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos) +{ + struct journal_key *k = keys->data + pos; + size_t idx = pos_to_idx(keys, pos); + + k->overwritten = true; + + struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL; + struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL; + + bool prev_overwritten = prev && prev->overwritten; + bool next_overwritten = next && next->overwritten; + + struct journal_key_range_overwritten *prev_range = + prev_overwritten ? 
prev->overwritten_range : NULL; + struct journal_key_range_overwritten *next_range = + next_overwritten ? next->overwritten_range : NULL; + + BUG_ON(prev_range && prev_range->end != idx); + BUG_ON(next_range && next_range->start != idx + 1); + + if (prev_range && next_range) { + prev_range->end = next_range->end; + + keys->data[pos].overwritten_range = prev_range; + for (size_t i = next_range->start; i < next_range->end; i++) { + struct journal_key *ip = keys->data + idx_to_pos(keys, i); + BUG_ON(ip->overwritten_range != next_range); + ip->overwritten_range = prev_range; + } + + kfree_rcu_mightsleep(next_range); + } else if (prev_range) { + prev_range->end++; + k->overwritten_range = prev_range; + if (next_overwritten) { + prev_range->end++; + next->overwritten_range = prev_range; + } + } else if (next_range) { + next_range->start--; + k->overwritten_range = next_range; + if (prev_overwritten) { + next_range->start--; + prev->overwritten_range = next_range; + } + } else if (prev_overwritten || next_overwritten) { + struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return; + + r->start = idx - (size_t) prev_overwritten; + r->end = idx + 1 + (size_t) next_overwritten; + + rcu_assign_pointer(k->overwritten_range, r); + if (prev_overwritten) + prev->overwritten_range = r; + if (next_overwritten) + next->overwritten_range = r; + } +} + void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos pos) { @@ -344,8 +438,12 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, if (idx < keys->size && keys->data[idx].btree_id == btree && keys->data[idx].level == level && - bpos_eq(keys->data[idx].k->k.p, pos)) - keys->data[idx].overwritten = true; + bpos_eq(keys->data[idx].k->k.p, pos) && + !keys->data[idx].overwritten) { + mutex_lock(&keys->overwrite_lock); + __bch2_journal_key_overwritten(keys, idx); + mutex_unlock(&keys->overwrite_lock); + } } static void bch2_journal_iter_advance(struct journal_iter *iter) @@ -359,8 +457,11 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { + struct bkey_s_c ret = bkey_s_c_null; + journal_iter_verify(iter); + rcu_read_lock(); while (iter->idx < iter->keys->size) { struct journal_key *k = iter->keys->data + iter->idx; @@ -369,13 +470,19 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) break; BUG_ON(cmp); - if (!k->overwritten) - return bkey_i_to_s_c(k->k); + if (!k->overwritten) { + ret = bkey_i_to_s_c(k->k); + break; + } - bch2_journal_iter_advance(iter); + if (k->overwritten_range) + iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); + else + bch2_journal_iter_advance(iter); } + rcu_read_unlock(); - return bkey_s_c_null; + return ret; } static void bch2_journal_iter_exit(struct journal_iter *iter) @@ -556,9 +663,15 @@ void bch2_journal_keys_put(struct bch_fs *c) move_gap(keys, keys->nr); - darray_for_each(*keys, i) + darray_for_each(*keys, i) { + if (i->overwritten_range && + (i == &darray_last(*keys) || + i->overwritten_range != i[1].overwritten_range)) + kfree(i->overwritten_range); + if (i->allocated) kfree(i->k); + } kvfree(keys->data); keys->data = NULL; @@ -682,3 +795,12 @@ void bch2_journal_keys_dump(struct bch_fs *c) } printbuf_exit(&buf); } + +void bch2_fs_journal_keys_init(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + + atomic_set(&keys->ref, 1); + keys->initial_ref_held = true; + 
mutex_init(&keys->overwrite_lock); +} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 9e8f8ab1c6ff..2a3082919b8d 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -97,4 +97,6 @@ void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, void bch2_journal_keys_dump(struct bch_fs *); +void bch2_fs_journal_keys_init(struct bch_fs *); + #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h new file mode 100644 index 000000000000..8b773823704f --- /dev/null +++ b/fs/bcachefs/btree_journal_iter_types.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H +#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H + +struct journal_key_range_overwritten { + size_t start, end; +}; + +struct journal_key { + u64 journal_seq; + u32 journal_offset; + enum btree_id btree_id:8; + unsigned level:8; + bool allocated; + bool overwritten; + struct journal_key_range_overwritten __rcu * + overwritten_range; + struct bkey_i *k; +}; + +struct journal_keys { + /* must match layout in darray_types.h */ + size_t nr, size; + struct journal_key *data; + /* + * Gap buffer: instead of all the empty space in the array being at the + * end of the buffer - from @nr to @size - the empty space is at @gap. + * This means that sequential insertions are O(n) instead of O(n^2). + */ + size_t gap; + atomic_t ref; + bool initial_ref_held; + struct mutex overwrite_lock; +}; + +#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 37eee352fa21..08170a3d524f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -773,8 +773,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); - atomic_set(&c->journal_keys.ref, 1); - c->journal_keys.initial_ref_held = true; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); @@ -784,6 +782,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_btree_iter_init_early(c); bch2_fs_btree_interior_update_init_early(c); + bch2_fs_journal_keys_init(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); -- cgit v1.2.3 From 4de2c24aa9453564ceb285039e2816bd22fa4ac6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Dec 2024 06:18:49 -0500 Subject: bcachefs: Fix evacuate_bucket tracepoint 86a494c8eef9 ("bcachefs: Kill bch2_get_next_backpointer()") dropped some things the tracepoint emitted because bch2_evacuate_bucket() no longer looks at the alloc key - but we did want at least some of that. We still no longer look at the alloc key so we can't report on the fragmentation number, but that's a direct function of dirty_sectors and a copygc concern anyways - copygc should get its own tracepoint that includes information from the fragmentation LRU. But we can report on the number of sectors we moved and the bucket size. 
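In rough outline (a condensed sketch of the diff below, with the iteration and error handling elided), the evacuate path now totals the sectors it actually queues for a move and reports that, together with the bucket size from the member info, in place of the fragmentation figure that used to come from the alloc key:

  unsigned sectors_moved = 0;

  /* per data extent moved out of the bucket: the backpointer records its length */
  unsigned sectors = bp.v->bucket_len;
  if (ctxt->stats)
          atomic64_add(sectors, &ctxt->stats->sectors_seen);
  sectors_moved += sectors;

  /* per btree node rewritten out of the bucket: a whole node's worth */
  sectors_moved += btree_sectors(c);

  /* once the bucket has been walked: */
  trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret);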
Co-developed-by: Piotr Zalewski Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 21 +++++++++++++-------- fs/bcachefs/trace.h | 10 ++++------ 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 88ab9d7e1a1b..74839268d6ab 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -674,8 +674,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bkey_buf sk; struct bkey_s_c k; struct data_update_opts data_opts; - unsigned dirty_sectors, bucket_size; - u64 fragmentation; + unsigned sectors_moved = 0; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); @@ -748,14 +747,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, data_opts.target = io_opts.background_target; data_opts.rewrite_ptrs = 0; + unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ unsigned i = 0; - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (ptr->dev == bucket.inode) { - data_opts.rewrite_ptrs |= 1U << i; - if (ptr->cached) { + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { + if (p.ptr.dev == bucket.inode) { + if (p.ptr.cached) { bch2_trans_iter_exit(trans, &iter); goto next; } + data_opts.rewrite_ptrs |= 1U << i; + break; } i++; } @@ -775,7 +778,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, goto err; if (ctxt->stats) - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + atomic64_add(sectors, &ctxt->stats->sectors_seen); + sectors_moved += sectors; } else { struct btree *b; @@ -806,12 +810,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, atomic64_add(sectors, &ctxt->stats->sectors_seen); atomic64_add(sectors, &ctxt->stats->sectors_moved); } + sectors_moved += btree_sectors(c); } next: bch2_btree_iter_advance(&bp_iter); } - trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); + trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); err: bch2_trans_iter_exit(trans, &bp_iter); bch2_dev_put(ca); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 5597b9d6297f..2d5932d2881e 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -848,8 +848,8 @@ TRACE_EVENT(move_data, TRACE_EVENT(evacuate_bucket, TP_PROTO(struct bch_fs *c, struct bpos *bucket, unsigned sectors, unsigned bucket_size, - u64 fragmentation, int ret), - TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret), + int ret), + TP_ARGS(c, bucket, sectors, bucket_size, ret), TP_STRUCT__entry( __field(dev_t, dev ) @@ -857,7 +857,6 @@ TRACE_EVENT(evacuate_bucket, __field(u64, bucket ) __field(u32, sectors ) __field(u32, bucket_size ) - __field(u64, fragmentation ) __field(int, ret ) ), @@ -867,15 +866,14 @@ TRACE_EVENT(evacuate_bucket, __entry->bucket = bucket->offset; __entry->sectors = sectors; __entry->bucket_size = bucket_size; - __entry->fragmentation = fragmentation; __entry->ret = ret; ), - TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i", + TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->member, __entry->bucket, __entry->sectors, __entry->bucket_size, - __entry->fragmentation, __entry->ret) + __entry->ret) ); TRACE_EVENT(copygc, -- cgit v1.2.3 From 3b6ebc94a011f3c0f057f984560265705b020cd9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 22:49:40 -0500 Subject: bcachefs: fix bp_pos_to_bucket_nodev_noerror _noerror means don't produce inconsistent errors, so 
it should be using bch2_dev_rcu_noerror(). Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 74c96aee713e..eda3a78a5e2b 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -46,7 +46,7 @@ static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) { rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, bp_pos.inode); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); if (ca) *bucket = bp_pos_to_bucket(ca, bp_pos); rcu_read_unlock(); -- cgit v1.2.3 From debe6965ac47c3d8b0ebe87aa67ae6504baa8072 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Nov 2024 00:16:52 -0500 Subject: bcachefs: check for backpointers to invalid device Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 4 ++++ fs/bcachefs/sb-errors_format.h | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index a9ffbea277bd..1c7ddaed6c1c 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -59,6 +59,10 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, "backpointer level bad: %u >= %u", bp.v->level, BTREE_MAX_DEPTH); + bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, + c, backpointer_dev_bad, + "backpointer for BCH_SB_MEMBER_INVALID"); + rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); if (!ca) { diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d5b18ff1645c..9e3425f533bc 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -138,6 +138,7 @@ enum bch_fsck_flags { x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ x(backpointer_bucket_offset_wrong, 125, 0) \ x(backpointer_level_bad, 294, 0) \ + x(backpointer_dev_bad, 297, 0) \ x(backpointer_to_missing_device, 126, 0) \ x(backpointer_to_missing_alloc, 127, 0) \ x(backpointer_to_missing_ptr, 128, 0) \ @@ -307,7 +308,7 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 297, 0) + x(MAX, 298, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, -- cgit v1.2.3 From 5b5a7ae8fa827572c1f9c55fd1494468ef3ae988 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Nov 2024 16:30:30 -0500 Subject: bcachefs: bucket_pos_to_bp_end() Better helpers for iterating over backpointers within a specific bucket Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.h | 10 ++++++++++ fs/bcachefs/ec.c | 5 +++-- fs/bcachefs/move.c | 5 ++--- 3 files changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index eda3a78a5e2b..595db7960939 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -80,6 +80,16 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca, return ret; } +static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket) +{ + return bucket_pos_to_bp(ca, bucket, 0); +} + +static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket) +{ + return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0)); +} + int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *, struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 146e03ffe8b7..8197ceb506e9 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1365,8 +1365,9 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); - ret = for_each_btree_key_commit(trans, bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(ca, bucket_pos, 0), 0, bp_k, + ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, bucket_pos), + bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, NULL, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc, ({ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 74839268d6ab..460175464762 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -691,7 +691,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(ca, bucket, 0), 0); + bucket_pos_to_bp_start(ca, bucket), 0); bch_err_msg(c, ret, "looking up alloc key"); if (ret) @@ -715,8 +715,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (ret) goto err; - if (!k.k || - bkey_ge(k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0))) + if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) break; if (k.k->type != KEY_TYPE_backpointer) -- cgit v1.2.3 From abff9b149dcc0a584cbadc869167e5a7199cd1ce Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Nov 2024 17:45:44 -0500 Subject: bcachefs: Drop swab code for backpointers in alloc keys Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e90561b6def6..ae9fdb5ad758 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -322,7 +322,6 @@ fsck_err: void bch2_alloc_v4_swab(struct bkey_s k) { struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; - struct bch_backpointer *bp, *bps; a->journal_seq = swab64(a->journal_seq); a->flags = swab32(a->flags); @@ -333,13 +332,6 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->stripe = swab32(a->stripe); a->nr_external_backpointers = swab32(a->nr_external_backpointers); 
a->stripe_sectors = swab32(a->stripe_sectors); - - bps = alloc_v4_backpointers(a); - for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { - bp->bucket_offset = swab40(bp->bucket_offset); - bp->bucket_len = swab32(bp->bucket_len); - bch2_bpos_swab(&bp->pos); - } } void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -- cgit v1.2.3 From eb25733aba20544035d56bd53fee5c8ecf87bc2e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Nov 2024 17:36:09 -0500 Subject: bcachefs: bch_backpointer -> bkey_i_backpointer Since we no longer store backpointers in alloc keys, there's no reason not to pass around bkey_i_backpointers; this means we don't have to pass the bucket pos separately. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 215 ++++++++++++++++++--------------------------- fs/bcachefs/backpointers.h | 51 +++++------ fs/bcachefs/buckets.c | 8 +- fs/bcachefs/ec.c | 2 +- 4 files changed, 111 insertions(+), 165 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 1c7ddaed6c1c..24804f0bb2fd 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -14,40 +14,6 @@ #include -static bool extent_matches_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, - struct bpos bucket, - struct bch_backpointer bp) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - rcu_read_lock(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket2; - struct bch_backpointer bp2; - - if (p.ptr.cached) - continue; - - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); - if (!ca) - continue; - - bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2); - if (bpos_eq(bucket, bucket2) && - !memcmp(&bp, &bp2, sizeof(bp))) { - rcu_read_unlock(); - return true; - } - } - rcu_read_unlock(); - - return false; -} - int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { @@ -78,23 +44,15 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || !bpos_eq(bp.k->p, bp_pos), c, backpointer_bucket_offset_wrong, - "backpointer bucket_offset wrong"); + "backpointer bucket_offset wrong (%llu)", (u64) bp.v->bucket_offset); fsck_err: return ret; } -void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) +void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - bch2_btree_id_level_to_text(out, bp->btree_id, bp->level); - prt_printf(out, " offset=%llu:%u len=%u pos=", - (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), - (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), - bp->bucket_len); - bch2_bpos_to_text(out, bp->pos); -} + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); -void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); if (ca) { @@ -107,7 +65,12 @@ void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct b rcu_read_unlock(); } - bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); + bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); + prt_printf(out, " offset=%llu:%u len=%u pos=", + (u64) (bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), + 
(u32) bp.v->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + bp.v->bucket_len); + bch2_bpos_to_text(out, bp.v->pos); } void bch2_backpointer_swab(struct bkey_s k) @@ -119,10 +82,43 @@ void bch2_backpointer_swab(struct bkey_s k) bch2_bpos_swab(&bp.v->pos); } +static bool extent_matches_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, + struct bkey_s_c_backpointer bp) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + rcu_read_lock(); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket2; + struct bkey_i_backpointer bp2; + + if (p.ptr.cached) + continue; + + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + if (!ca) + continue; + + bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2); + if (bpos_eq(bp.k->p, bp2.k.p) && + !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + + return false; +} + static noinline int backpointer_mod_err(struct btree_trans *trans, - struct bch_backpointer bp, - struct bkey_s_c bp_k, struct bkey_s_c orig_k, + struct bkey_i_backpointer *new_bp, + struct bkey_s_c found_bp, bool insert) { struct bch_fs *c = trans->c; @@ -130,12 +126,12 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, if (insert) { prt_printf(&buf, "existing backpointer found when inserting "); - bch2_backpointer_to_text(&buf, &bp); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); prt_newline(&buf); printbuf_indent_add(&buf, 2); prt_printf(&buf, "found "); - bch2_bkey_val_to_text(&buf, c, bp_k); + bch2_bkey_val_to_text(&buf, c, found_bp); prt_newline(&buf); prt_printf(&buf, "for "); @@ -147,11 +143,11 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, printbuf_indent_add(&buf, 2); prt_printf(&buf, "searching for "); - bch2_backpointer_to_text(&buf, &bp); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); prt_newline(&buf); prt_printf(&buf, "got "); - bch2_bkey_val_to_text(&buf, c, bp_k); + bch2_bkey_val_to_text(&buf, c, found_bp); prt_newline(&buf); prt_printf(&buf, "for "); @@ -170,50 +166,35 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, - struct bch_backpointer bp, struct bkey_s_c orig_k, + struct bkey_i_backpointer *bp, bool insert) { struct btree_iter bp_iter; - struct bkey_s_c k; - struct bkey_i_backpointer *bp_k; - int ret; - - bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); - ret = PTR_ERR_OR_ZERO(bp_k); - if (ret) - return ret; - - bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); - bp_k->v = bp; - - if (!insert) { - bp_k->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp_k->k, 0); - } - - k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bp_k->k.p, + struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, + bp->k.p, BTREE_ITER_intent| BTREE_ITER_slots| BTREE_ITER_with_updates); - ret = bkey_err(k); + int ret = bkey_err(k); if (ret) - goto err; + return ret; if (insert ? 
k.k->type : (k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) { - ret = backpointer_mod_err(trans, bp, k, orig_k, insert); + memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { + ret = backpointer_mod_err(trans, orig_k, bp, k, insert); if (ret) goto err; } - ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); + if (!insert) { + bp->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp->k, 0); + } + + ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); err: bch2_trans_iter_exit(trans, &bp_iter); return ret; @@ -234,22 +215,11 @@ static void backpointer_target_not_found(struct btree_trans *trans, if (likely(!bch2_backpointers_no_use_write_buffer)) return; - struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp.k->p, &bucket)) - return; - prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", bp.v->level ? "btree node" : "extent"); - prt_printf(&buf, "bucket: "); - bch2_bpos_to_text(&buf, bucket); - prt_printf(&buf, "\n "); - - prt_printf(&buf, "backpointer pos: "); - bch2_bpos_to_text(&buf, bp.k->p); + bch2_bkey_val_to_text(&buf, c, bp.s_c); prt_printf(&buf, "\n "); - bch2_backpointer_to_text(&buf, bp.v); - prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, target_k); if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) bch_err_ratelimited(c, "%s", buf.buf); @@ -267,10 +237,6 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, if (likely(!bp.v->level)) { struct bch_fs *c = trans->c; - struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp.k->p, &bucket)) - return bkey_s_c_err(-EIO); - bch2_trans_node_iter_init(trans, iter, bp.v->btree_id, bp.v->pos, @@ -283,7 +249,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, } if (k.k && - extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bucket, *bp.v)) + extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) return k; bch2_trans_iter_exit(trans, iter); @@ -308,10 +274,6 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, BUG_ON(!bp.v->level); - struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp.k->p, &bucket)) - return ERR_PTR(-EIO); - bch2_trans_node_iter_init(trans, iter, bp.v->btree_id, bp.v->pos, @@ -325,8 +287,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, BUG_ON(b->c.level != bp.v->level - 1); if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, - bkey_i_to_s_c(&b->key), - bucket, *bp.v)) + bkey_i_to_s_c(&b->key), bp)) return b; if (btree_node_will_make_reachable(b)) { @@ -480,8 +441,7 @@ err: static int check_bp_exists(struct btree_trans *trans, struct extents_to_bp_state *s, - struct bpos bucket, - struct bch_backpointer bp, + struct bkey_i_backpointer *bp, struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; @@ -491,30 +451,28 @@ static int check_bp_exists(struct btree_trans *trans, struct bkey_s_c bp_k; int ret = 0; - struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket); + struct bch_dev *ca = bch2_dev_tryget_noerror(c, bp->k.p.inode); if (!ca) { - prt_str(&buf, "extent for nonexistent device:bucket "); - bch2_bpos_to_text(&buf, bucket); - prt_str(&buf, "\n "); + prt_printf(&buf, "extent for nonexistent device %llu\n", bp->k.p.inode); bch2_bkey_val_to_text(&buf, c, orig_k); bch_err(c, "%s", buf.buf); ret = -BCH_ERR_fsck_repair_unimplemented; goto err; } + struct bpos bucket = bp_pos_to_bucket(ca, bp->k.p); + if (bpos_lt(bucket, s->bucket_start) || bpos_gt(bucket, s->bucket_end)) goto out; - bp_k = bch2_bkey_get_iter(trans, &bp_iter, 
BTREE_ID_backpointers, - bucket_pos_to_bp(ca, bucket, bp.bucket_offset), - 0); + bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); ret = bkey_err(bp_k); if (ret) goto err; if (bp_k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { + memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); if (ret) goto err; @@ -561,14 +519,17 @@ check_existing_bp: goto err; goto out; } else { - ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode); + ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bucket.inode); if (ret) goto err; goto missing; } } - ret = check_extent_checksum(trans, other_bp.v->btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); + ret = check_extent_checksum(trans, + other_bp.v->btree_id, other_extent, + bp->v.btree_id, orig_k, + bucket.inode); if (ret < 0) goto err; if (ret) { @@ -576,7 +537,8 @@ check_existing_bp: goto missing; } - ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.v->btree_id, other_extent, bucket.inode); + ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, + other_bp.v->btree_id, other_extent, bucket.inode); if (ret < 0) goto err; if (ret) { @@ -594,22 +556,15 @@ check_existing_bp: goto err; missing: printbuf_reset(&buf); - prt_str(&buf, "missing backpointer for btree="); - bch2_btree_id_to_text(&buf, bp.btree_id); - prt_printf(&buf, " l=%u ", bp.level); + prt_str(&buf, "missing backpointer "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, orig_k); prt_printf(&buf, "\n got: "); bch2_bkey_val_to_text(&buf, c, bp_k); - struct bkey_i_backpointer n_bp_k; - bkey_backpointer_init(&n_bp_k.k_i); - n_bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); - n_bp_k.v = bp; - prt_printf(&buf, "\n want: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); - if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) - ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true); + ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true); goto out; } @@ -627,8 +582,8 @@ static int check_extent_to_backpointers(struct btree_trans *trans, ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos = POS_MIN; - struct bch_backpointer bp; + struct bpos bucket_pos; + struct bkey_i_backpointer bp; if (p.ptr.cached) continue; @@ -642,7 +597,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (!ca) continue; - ret = check_bp_exists(trans, s, bucket_pos, bp, k); + ret = check_bp_exists(trans, s, &bp, k); if (ret) return ret; } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 595db7960939..5f34a25b599a 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -19,13 +19,12 @@ static inline u64 swab40(u64 x) } int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags); -void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); -void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); #define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ .key_validate = bch2_backpointer_validate, \ - .val_to_text = bch2_backpointer_k_to_text, \ + .val_to_text = bch2_backpointer_to_text, 
\ .swab = bch2_backpointer_swab, \ .min_val_size = 32, \ }) @@ -53,12 +52,6 @@ static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos return ca != NULL; } -static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) -{ - return !bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket), - c, "backpointer for missing device %llu", bp_pos.inode); -} - static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, struct bpos bucket, u64 bucket_offset) @@ -90,31 +83,25 @@ static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0)); } -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *, - struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, + struct bkey_s_c, + struct bkey_i_backpointer *, + bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, - struct bch_backpointer bp, struct bkey_s_c orig_k, + struct bkey_i_backpointer *bp, bool insert) { if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert); - - struct bkey_i_backpointer bp_k; - - bkey_backpointer_init(&bp_k.k_i); - bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); - bp_k.v = bp; + return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); if (!insert) { - bp_k.k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp_k.k, 0); + bp->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp->k, 0); } - return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); + return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i); } static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, @@ -148,17 +135,21 @@ static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - struct bpos *bucket_pos, struct bch_backpointer *bp, + struct bpos *bucket, struct bkey_i_backpointer *bp, u64 sectors) { u32 bucket_offset; - *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); - *bp = (struct bch_backpointer) { + *bucket = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); + + u64 bp_bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset; + + bkey_backpointer_init(&bp->k_i); + bp->k.p = bucket_pos_to_bp(ca, *bucket, bp_bucket_offset); + bp->v = (struct bch_backpointer) { .btree_id = btree_id, .level = level, .data_type = bch2_bkey_ptr_data_type(k, p, entry), - .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + - p.crc.offset, + .bucket_offset = bp_bucket_offset, .bucket_len = sectors, .pos = k.k->p, }; @@ -168,7 +159,7 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - struct bpos *bucket_pos, struct bch_backpointer *bp) + struct bpos *bucket_pos, struct bkey_i_backpointer *bp) { u64 sectors = ptr_disk_sectors(level ? 
btree_sectors(c) : k.k->size, p); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5b42f0a7b0cb..1547141ba2a0 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -585,18 +585,18 @@ static int bch2_trigger_pointer(struct btree_trans *trans, } struct bpos bucket; - struct bch_backpointer bp; + struct bkey_i_backpointer bp; __bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp, abs_sectors); if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: - __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v); + __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v); if (ret) goto err; if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert); + ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); if (ret) goto err; } @@ -614,7 +614,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new); + ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new); alloc_to_bucket(g, new); bucket_unlock(g); err_unlock: diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 8197ceb506e9..cd377ce6cf64 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1291,7 +1291,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, return 0; prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); - bch2_backpointer_to_text(&buf, bp.v); + bch2_bkey_val_to_text(&buf, c, bp.s_c); bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); -- cgit v1.2.3 From e48fda6cdc2799ef4a63692f0f11240613067b38 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Nov 2024 00:32:57 -0500 Subject: bcachefs: Fix check_backpointers_to_extents range limiting bch2_get_btree_in_memory_pos() will return positions that refer directly to the btree it's checking will fit in memory - i.e. backpointer positions, not buckets. This also means check_bp_exists() no longer has to refer to the device, and we can delete some code. 
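Put concretely (taken from the diff below), the per-extent check now clamps directly on the backpointer key position - s->bp_start/s->bp_end are the renamed range bounds, formerly bucket_start/bucket_end - with no device lookup or bp_pos_to_bucket() translation in between:

  if (bpos_lt(bp->k.p, s->bp_start) ||
      bpos_gt(bp->k.p, s->bp_end))
          return 0;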
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 63 ++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 24804f0bb2fd..5963217cd90c 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -352,8 +352,8 @@ int bch2_check_btree_backpointers(struct bch_fs *c) } struct extents_to_bp_state { - struct bpos bucket_start; - struct bpos bucket_end; + struct bpos bp_start; + struct bpos bp_end; struct bkey_buf last_flushed; }; @@ -445,29 +445,16 @@ static int check_bp_exists(struct btree_trans *trans, struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; - struct btree_iter bp_iter = {}; struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; - struct bkey_s_c bp_k; - int ret = 0; - struct bch_dev *ca = bch2_dev_tryget_noerror(c, bp->k.p.inode); - if (!ca) { - prt_printf(&buf, "extent for nonexistent device %llu\n", bp->k.p.inode); - bch2_bkey_val_to_text(&buf, c, orig_k); - bch_err(c, "%s", buf.buf); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto err; - } - - struct bpos bucket = bp_pos_to_bucket(ca, bp->k.p); - - if (bpos_lt(bucket, s->bucket_start) || - bpos_gt(bucket, s->bucket_end)) - goto out; + if (bpos_lt(bp->k.p, s->bp_start) || + bpos_gt(bp->k.p, s->bp_end)) + return 0; - bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); - ret = bkey_err(bp_k); + struct btree_iter bp_iter; + struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); + int ret = bkey_err(bp_k); if (ret) goto err; @@ -484,7 +471,6 @@ err: fsck_err: bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); - bch2_dev_put(ca); printbuf_exit(&buf); return ret; check_existing_bp: @@ -514,12 +500,13 @@ check_existing_bp: bch_err(c, "%s", buf.buf); if (other_extent.k->size <= orig_k.k->size) { - ret = drop_dev_and_update(trans, other_bp.v->btree_id, other_extent, bucket.inode); + ret = drop_dev_and_update(trans, other_bp.v->btree_id, + other_extent, bp->k.p.inode); if (ret) goto err; goto out; } else { - ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bucket.inode); + ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode); if (ret) goto err; goto missing; @@ -529,7 +516,7 @@ check_existing_bp: ret = check_extent_checksum(trans, other_bp.v->btree_id, other_extent, bp->v.btree_id, orig_k, - bucket.inode); + bp->k.p.inode); if (ret < 0) goto err; if (ret) { @@ -538,7 +525,7 @@ check_existing_bp: } ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, - other_bp.v->btree_id, other_extent, bucket.inode); + other_bp.v->btree_id, other_extent, bp->k.p.inode); if (ret < 0) goto err; if (ret) { @@ -547,7 +534,7 @@ check_existing_bp: } printbuf_reset(&buf); - prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode); + prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode); bch2_bkey_val_to_text(&buf, c, orig_k); prt_str(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, other_extent); @@ -811,7 +798,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct extents_to_bp_state s = { .bucket_start = POS_MIN }; + struct extents_to_bp_state s = { .bp_start = POS_MIN }; int ret; bch2_bkey_buf_init(&s.last_flushed); @@ -822,35 
+809,35 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) ret = bch2_get_btree_in_memory_pos(trans, BIT_ULL(BTREE_ID_backpointers), BIT_ULL(BTREE_ID_backpointers), - BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); + BBPOS(BTREE_ID_backpointers, s.bp_start), &end); if (ret) break; - s.bucket_end = end.pos; + s.bp_end = end.pos; - if ( bpos_eq(s.bucket_start, POS_MIN) && - !bpos_eq(s.bucket_end, SPOS_MAX)) + if ( bpos_eq(s.bp_start, POS_MIN) && + !bpos_eq(s.bp_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", __func__, btree_nodes_fit_in_ram(c)); - if (!bpos_eq(s.bucket_start, POS_MIN) || - !bpos_eq(s.bucket_end, SPOS_MAX)) { + if (!bpos_eq(s.bp_start, POS_MIN) || + !bpos_eq(s.bp_end, SPOS_MAX)) { struct printbuf buf = PRINTBUF; prt_str(&buf, "check_extents_to_backpointers(): "); - bch2_bpos_to_text(&buf, s.bucket_start); + bch2_bpos_to_text(&buf, s.bp_start); prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, s.bucket_end); + bch2_bpos_to_text(&buf, s.bp_end); bch_verbose(c, "%s", buf.buf); printbuf_exit(&buf); } ret = bch2_check_extents_to_backpointers_pass(trans, &s); - if (ret || bpos_eq(s.bucket_end, SPOS_MAX)) + if (ret || bpos_eq(s.bp_end, SPOS_MAX)) break; - s.bucket_start = bpos_successor(s.bucket_end); + s.bp_start = bpos_successor(s.bp_end); } bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); -- cgit v1.2.3 From 1ab00b6cddc53d47e6f45002e69034941d40ae03 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 18:26:54 -0500 Subject: bcachefs: kill bch_backpointer.bucket_offset usage bch_backpointer.bucket_offset is going away - it's no longer needed since we no longer store backpointers in alloc keys, the same information is in the key position itself. And we'll be reclaiming the space in bch_backpointer for the bucket generation number. 
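To make "the same information is in the key position itself" concrete, here is a toy userspace round-trip of the encoding. The constants are illustrative stand-ins, not the kernel's values (the real shift is MAX_EXTENT_COMPRESS_RATIO_SHIFT and the bucket size comes from the member info), but the shift/divide structure mirrors bp_pos_to_bucket_and_offset() added in the diff below:

  #include <stdio.h>
  #include <stdint.h>

  #define SHIFT        4      /* stand-in for MAX_EXTENT_COMPRESS_RATIO_SHIFT */
  #define BUCKET_SIZE  128    /* stand-in: sectors per bucket */

  int main(void)
  {
          /* encode: an extent 37 sectors into bucket 9, sub-offset 3 */
          uint64_t bucket = 9, offset_in_bucket = 37, suboffset = 3;
          uint64_t key_offset =
                  ((bucket * BUCKET_SIZE + offset_in_bucket) << SHIFT) + suboffset;

          /* decode: drop the sub-offset bits to get the device sector, then
           * split it into bucket number and offset within the bucket */
          uint64_t bucket_sector = key_offset >> SHIFT;

          printf("bucket=%llu offset=%llu suboffset=%llu\n",
                 (unsigned long long) (bucket_sector / BUCKET_SIZE),
                 (unsigned long long) (bucket_sector % BUCKET_SIZE),
                 (unsigned long long) (key_offset & ((1ULL << SHIFT) - 1)));
          return 0;
  }

which prints bucket=9 offset=37 suboffset=3, i.e. everything bucket_offset used to carry is recoverable from the key position alone.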
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 15 +++++++-------- fs/bcachefs/backpointers.h | 8 ++++++++ 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 5963217cd90c..620fa67db7a6 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -54,21 +54,20 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); if (ca) { - struct bpos bucket = bp_pos_to_bucket(ca, k.k->p); + u32 bucket_offset; + struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); rcu_read_unlock(); - prt_str(out, "bucket="); - bch2_bpos_to_text(out, bucket); - prt_str(out, " "); + prt_printf(out, "bucket=%llu:%llu:%u", bucket.inode, bucket.offset, bucket_offset); } else { rcu_read_unlock(); + prt_printf(out, "sector=%llu:%llu", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); - prt_printf(out, " offset=%llu:%u len=%u pos=", - (u64) (bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), - (u32) bp.v->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + prt_printf(out, " suboffset=%u len=%u pos=", + (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), bp.v->bucket_len); bch2_bpos_to_text(out, bp.v->pos); } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 5f34a25b599a..d8a15f5fa767 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -42,6 +42,14 @@ static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); } +static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos, + u32 *bucket_offset) +{ + u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset)); +} + static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) { rcu_read_lock(); -- cgit v1.2.3 From c80f33b752688392adf2aa71190636b8e1ff204b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 21:34:43 -0500 Subject: bcachefs: New backpointers helpers - bch2_backpointer_del() - bch2_backpointer_maybe_flush() Kill a bit of open coding and make sure we're properly handling the btree write buffer. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 58 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 620fa67db7a6..cfd9b9ead473 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -199,6 +199,22 @@ err: return ret; } +static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) +{ + return likely(!bch2_backpointers_no_use_write_buffer) + ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) + : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0); +} + +static int bch2_backpointers_maybe_flush(struct btree_trans *trans, + struct bkey_s_c visiting_k, + struct bkey_buf *last_flushed) +{ + return likely(!bch2_backpointers_no_use_write_buffer) + ? 
bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) + : 0; +} + static void backpointer_target_not_found(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct bkey_s_c target_k) @@ -300,9 +316,12 @@ err: return b; } -static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, - struct bkey_s_c k) +static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, + struct bkey_buf *last_flushed) { + if (k.k->type != KEY_TYPE_backpointer) + return 0; + struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; struct bkey_s_c alloc_k; @@ -311,10 +330,14 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ struct bpos bucket; if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { + ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); + if (ret) + goto out; + if (fsck_err(trans, backpointer_to_missing_device, "backpointer for missing device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, bp_iter, 0); + ret = bch2_backpointer_del(trans, k.k->p); goto out; } @@ -323,13 +346,16 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ if (ret) goto out; - if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, - trans, backpointer_to_missing_alloc, - "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", - alloc_iter.pos.inode, alloc_iter.pos.offset, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, bp_iter, 0); - goto out; + if (alloc_k.k->type != KEY_TYPE_alloc_v4) { + ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); + if (ret) + goto out; + + if (fsck_err(trans, backpointer_to_missing_alloc, + "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", + alloc_iter.pos.inode, alloc_iter.pos.offset, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_backpointer_del(trans, k.k->p); } out: fsck_err: @@ -341,11 +367,17 @@ fsck_err: /* verify that every backpointer has a corresponding alloc key */ int bch2_check_btree_backpointers(struct bch_fs *c) { + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_btree_backpointer(trans, &iter, k))); + bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; } @@ -874,7 +906,7 @@ static int check_one_backpointer(struct btree_trans *trans, return ret; if (!k.k) { - ret = bch2_btree_write_buffer_maybe_flush(trans, bp.s_c, last_flushed); + ret = bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed); if (ret) goto out; @@ -882,7 +914,7 @@ static int check_one_backpointer(struct btree_trans *trans, "backpointer for missing %s\n %s", bp.v->level ? 
"btree node" : "extent", (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { - ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); + ret = bch2_backpointer_del(trans, bp.k->p); goto out; } } -- cgit v1.2.3 From c601e5d7daea102481bd7512b013018fdbf57034 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2024 16:27:47 -0500 Subject: bcachefs: Can now block journal activity without closing cur entry Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/journal.h | 3 ++- fs/bcachefs/journal_types.h | 2 ++ 3 files changed, 46 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2cf8f24d50cc..bfbb1ac60c3d 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -217,6 +217,12 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq) if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); bch2_journal_do_writes(j); + + /* + * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an + * open journal entry + */ + wake_up(&j->wait); } /* @@ -251,6 +257,9 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t if (!__journal_entry_is_open(old)) return; + if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) + old.cur_entry_offset = j->cur_entry_offset_if_blocked; + /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); @@ -868,16 +877,44 @@ int bch2_journal_meta(struct journal *j) void bch2_journal_unblock(struct journal *j) { spin_lock(&j->lock); - j->blocked--; + if (!--j->blocked && + j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && + j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { + union journal_res_state old, new; + + old.v = atomic64_read(&j->reservations.counter); + do { + new.v = old.v; + new.cur_entry_offset = j->cur_entry_offset_if_blocked; + } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + } spin_unlock(&j->lock); journal_wake(j); } +static void __bch2_journal_block(struct journal *j) +{ + if (!j->blocked++) { + union journal_res_state old, new; + + old.v = atomic64_read(&j->reservations.counter); + do { + j->cur_entry_offset_if_blocked = old.cur_entry_offset; + + if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL) + break; + + new.v = old.v; + new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; + } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + } +} + void bch2_journal_block(struct journal *j) { spin_lock(&j->lock); - j->blocked++; + __bch2_journal_block(j); spin_unlock(&j->lock); journal_quiesce(j); @@ -1481,6 +1518,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) case JOURNAL_ENTRY_CLOSED_VAL: prt_printf(out, "closed\n"); break; + case JOURNAL_ENTRY_BLOCKED_VAL: + prt_printf(out, "blocked\n"); + break; default: prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 2762be6f9814..6d3c839bbbef 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -285,7 +285,8 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq spin_lock(&j->lock); bch2_journal_buf_put_final(j, seq); spin_unlock(&j->lock); - } + } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)) + wake_up(&j->wait); } /* diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 19183fcf7ad7..425d1abb257e 
100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -112,6 +112,7 @@ union journal_res_state { */ #define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) +#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) #define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) @@ -193,6 +194,7 @@ struct journal { * insufficient devices: */ enum journal_errors cur_entry_error; + unsigned cur_entry_offset_if_blocked; unsigned buf_size_want; /* -- cgit v1.2.3 From f4d67f6d5a4c042a28fa1d67f756e3d24ba60888 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2024 18:21:12 -0500 Subject: bcachefs: trivial btree write buffer refactoring Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 64 +++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 1639c60dffa0..1bd26221f156 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -19,8 +19,6 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *, struct journal_entry_pin *, u64); -static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); - static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) { return (cmp_int(l->hi, r->hi) ?: @@ -481,13 +479,38 @@ err: return ret; } -static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) +static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) +{ + struct journal_keys_to_wb dst; + int ret = 0; + + bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); + + for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { + jset_entry_for_each_key(entry, k) { + ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); + if (ret) + goto out; + } + + entry->type = BCH_JSET_ENTRY_btree_keys; + } + + spin_lock(&c->journal.lock); + buf->need_flush_to_write_buffer = false; + spin_unlock(&c->journal.lock); +out: + ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; + return ret; +} + +static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) { struct journal *j = &c->journal; struct journal_buf *buf; int ret = 0; - while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq))) { ret = bch2_journal_keys_to_write_buffer(c, buf); mutex_unlock(&j->buf_lock); } @@ -495,7 +518,7 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) return ret; } -static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, +static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq, bool *did_work) { struct bch_fs *c = trans->c; @@ -505,7 +528,7 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, do { bch2_trans_unlock(trans); - fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); + fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq); *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr; @@ -518,8 +541,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, mutex_unlock(&wb->flushing.lock); } while (!ret && (fetch_from_journal_err || - (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || - (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); + (wb->inc.pin.seq && wb->inc.pin.seq <= 
max_seq) || + (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq))); return ret; } @@ -771,31 +794,6 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ return ret; } -static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) -{ - struct journal_keys_to_wb dst; - int ret = 0; - - bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); - - for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { - jset_entry_for_each_key(entry, k) { - ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); - if (ret) - goto out; - } - - entry->type = BCH_JSET_ENTRY_btree_keys; - } - - spin_lock(&c->journal.lock); - buf->need_flush_to_write_buffer = false; - spin_unlock(&c->journal.lock); -out: - ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; - return ret; -} - static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) { if (wb->keys.size >= new_size) -- cgit v1.2.3 From e1cb4f56dc4cfcf1cb1b90479cb35c5a304ff527 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2024 16:47:10 -0500 Subject: bcachefs: Bias reads more in favor of faster device Per reports of performance issues on mixed multi device filesystems where we're issuing too much IO to the spinning rust - tweak this algorithm. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 98bb680b3860..83aeceb68847 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -89,6 +89,14 @@ static inline bool ptr_better(struct bch_fs *c, u64 l1 = dev_latency(c, p1.ptr.dev); u64 l2 = dev_latency(c, p2.ptr.dev); + /* + * Square the latencies, to bias more in favor of the faster + * device - we never want to stop issuing reads to the slower + * device altogether, so that we can update our latency numbers: + */ + l1 *= l1; + l2 *= l2; + /* Pick at random, biased in favor of the faster device: */ return bch2_rand_range(l1 + l2) > l1; -- cgit v1.2.3 From bb61afebca3b5608842e2ac59bbe4eedf137050f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Nov 2024 20:09:45 -0500 Subject: bcachefs: discard fastpath now uses bch2_discard_one_bucket() The discard bucket fastpath previously was using its own code for discarding buckets and clearing them in the need_discard btree, which didn't have any of the consistency checks of the main discard path. 
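Sketched out (condensed from the diff below, with the fsck error logging elided), the fastpath now funnels through the same bch2_discard_one_bucket() as the background worker, with a flag saying the caller has already done the in-flight tracking:

  static int bch2_do_discards_fast_one(struct btree_trans *trans,
                                       struct bch_dev *ca, u64 bucket,
                                       struct bpos *discard_pos_done,
                                       struct discard_buckets_state *s)
  {
          struct btree_iter need_discard_iter;
          struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter,
                                  BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0);
          int ret = bkey_err(discard_k);
          if (ret)
                  return ret;

          /* the slow path's consistency check now applies here too:
           * the bucket must really be present in the need_discard btree */
          if (discard_k.k->type == KEY_TYPE_set)
                  ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter,
                                                discard_pos_done, s, true /* fastpath */);

          bch2_trans_iter_exit(trans, &need_discard_iter);
          return ret;
  }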
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 75 +++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ae9fdb5ad758..1e9f53db4bb8 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1725,7 +1725,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, - struct discard_buckets_state *s) + struct discard_buckets_state *s, + bool fastpath) { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; @@ -1782,10 +1783,12 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (discard_in_flight_add(ca, iter.pos.offset, true)) - goto out; + if (!fastpath) { + if (discard_in_flight_add(ca, iter.pos.offset, true)) + goto out; - discard_locked = true; + discard_locked = true; + } if (!bkey_eq(*discard_pos_done, iter.pos) && ca->mi.discard && !c->opts.nochanges) { @@ -1799,6 +1802,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, ca->mi.bucket_size, GFP_KERNEL); *discard_pos_done = iter.pos; + s->discarded++; ret = bch2_trans_relock_notrace(trans); if (ret) @@ -1819,12 +1823,12 @@ commit: goto out; count_event(c, bucket_discard); - s->discarded++; out: fsck_err: if (discard_locked) discard_in_flight_remove(ca, iter.pos.offset); - s->seen++; + if (!ret) + s->seen++; bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; @@ -1848,7 +1852,7 @@ static void bch2_do_discards_work(struct work_struct *work) BTREE_ID_need_discard, POS(ca->dev_idx, 0), POS(ca->dev_idx, U64_MAX), 0, k, - bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); + bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false))); trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); @@ -1881,27 +1885,31 @@ void bch2_do_discards(struct bch_fs *c) bch2_dev_do_discards(ca); } -static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) +static int bch2_do_discards_fast_one(struct btree_trans *trans, + struct bch_dev *ca, + u64 bucket, + struct bpos *discard_pos_done, + struct discard_buckets_state *s) { - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent); - struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); - int ret = bkey_err(k); - if (ret) - goto err; - - struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a); + struct btree_iter need_discard_iter; + struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter, + BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); + int ret = bkey_err(discard_k); if (ret) - goto err; + return ret; - BUG_ON(a->v.dirty_sectors); - SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - alloc_data_type_set(&a->v, a->v.data_type); + if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set, + trans, discarding_bucket_not_in_need_discard_btree, + "attempting to discard bucket %u:%llu not in need_discard btree", + ca->dev_idx, bucket)) { + /* log it in the superblock and continue: */ + goto out; + } - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); + ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); +out: +fsck_err: + bch2_trans_iter_exit(trans, &need_discard_iter); return ret; } @@ 
-1909,6 +1917,10 @@ static void bch2_do_discards_fast_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); struct bch_fs *c = ca->fs; + struct discard_buckets_state s = {}; + struct bpos discard_pos_done = POS_MAX; + struct btree_trans *trans = bch2_trans_get(c); + int ret = 0; while (1) { bool got_bucket = false; @@ -1929,16 +1941,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) if (!got_bucket) break; - if (ca->mi.discard && !c->opts.nochanges) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, - GFP_KERNEL); - - int ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc, - bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); + ret = lockrestart_do(trans, + bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s)); bch_err_fn(c, ret); discard_in_flight_remove(ca, bucket); @@ -1947,6 +1951,9 @@ static void bch2_do_discards_fast_work(struct work_struct *work) break; } + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + + bch2_trans_put(trans); percpu_ref_put(&ca->io_ref); bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } -- cgit v1.2.3 From 0eaac0b44fa93a5bf8bd5d79c557ca5e04d2dc16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Apr 2024 02:18:18 -0400 Subject: bcachefs: btree_write_buffer_flush_seq() no longer closes journal Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 19 ++++++++++++++----- fs/bcachefs/journal.c | 27 ++++++++++++++++++++------- fs/bcachefs/journal.h | 2 +- 3 files changed, 35 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 1bd26221f156..49ce2d1e5c02 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -495,10 +495,6 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu entry->type = BCH_JSET_ENTRY_btree_keys; } - - spin_lock(&c->journal.lock); - buf->need_flush_to_write_buffer = false; - spin_unlock(&c->journal.lock); out: ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; return ret; @@ -508,11 +504,24 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) { struct journal *j = &c->journal; struct journal_buf *buf; + bool blocked; int ret = 0; - while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq))) { + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) { ret = bch2_journal_keys_to_write_buffer(c, buf); + + if (!blocked && !ret) { + spin_lock(&j->lock); + buf->need_flush_to_write_buffer = false; + spin_unlock(&j->lock); + } + mutex_unlock(&j->buf_lock); + + if (blocked) { + bch2_journal_unblock(j); + break; + } } return ret; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bfbb1ac60c3d..699db0d0749a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -908,6 +908,8 @@ static void __bch2_journal_block(struct journal *j) new.v = old.v; new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + + journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); } } @@ -920,7 +922,8 @@ void bch2_journal_block(struct journal *j) journal_quiesce(j); } -static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +static struct 
journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, + u64 max_seq, bool *blocked) { struct journal_buf *ret = NULL; @@ -937,13 +940,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou struct journal_buf *buf = j->buf + idx; if (buf->need_flush_to_write_buffer) { - if (seq == journal_cur_seq(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - union journal_res_state s; s.v = atomic64_read_acquire(&j->reservations.counter); - ret = journal_state_count(s, idx) + unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s); + + if (open && !*blocked) { + __bch2_journal_block(j); + *blocked = true; + } + + ret = journal_state_count(s, idx) > open ? ERR_PTR(-EAGAIN) : buf; break; @@ -956,11 +963,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou return ret; } -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, + u64 max_seq, bool *blocked) { struct journal_buf *ret; + *blocked = false; + + wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, + max_seq, blocked)) != ERR_PTR(-EAGAIN)); + if (IS_ERR_OR_NULL(ret) && *blocked) + bch2_journal_unblock(j); - wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); return ret; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 6d3c839bbbef..71a50846967f 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -425,7 +425,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -- cgit v1.2.3 From 375d21b76d9a20ff4bddf81aadadd6e57f38dee8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2024 22:12:58 -0500 Subject: bcachefs: BCH_ERR_btree_node_read_error_cached Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 6 +++--- fs/bcachefs/errcode.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index a0a406b0c7bc..36dfa6a48aa6 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1131,7 +1131,7 @@ retry: if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_error); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1221,7 +1221,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_error); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1303,7 +1303,7 @@ lock_node: if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - b = ERR_PTR(-BCH_ERR_btree_node_read_error); + b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); goto out; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 2dda7f962e5b..131b9bef21a0 100644 --- 
a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -242,6 +242,7 @@ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ x(EIO, btree_node_write_all_failed) \ x(EIO, btree_node_read_error) \ -- cgit v1.2.3 From 525be09e63cc9ff4bee0b0f480551a4dc2e7b276 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 20:15:30 -0500 Subject: bcachefs: Use separate rhltable for bch2_inode_or_descendents_is_open() Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/fs.c | 39 ++++++++++++++++++++++++++++++--------- fs/bcachefs/fs.h | 1 + 3 files changed, 32 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 11f9ed42a9da..f1d8c821d27a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1020,6 +1020,7 @@ struct bch_fs { struct list_head vfs_inodes_list; struct mutex vfs_inodes_lock; struct rhashtable vfs_inodes_table; + struct rhltable vfs_inodes_by_inum_table; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 50d323fca001..c6e7df7c67fa 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -176,8 +177,9 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) { const subvol_inum *inum = data; + siphash_key_t k = { .key[0] = seed }; - return jhash(&inum->inum, sizeof(inum->inum), seed); + return siphash_2u64(inum->subvol, inum->inum, &k); } static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) @@ -206,11 +208,18 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { .automatic_shrinking = true, }; +static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { + .head_offset = offsetof(struct bch_inode_info, by_inum_hash), + .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), + .key_len = sizeof(u64), + .automatic_shrinking = true, +}; + int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { struct bch_fs *c = trans->c; - struct rhashtable *ht = &c->vfs_inodes_table; - subvol_inum inum = (subvol_inum) { .inum = p.offset }; + struct rhltable *ht = &c->vfs_inodes_by_inum_table; + u64 inum = p.offset; DARRAY(u32) subvols; int ret = 0; @@ -235,15 +244,15 @@ restart_from_top: struct rhash_lock_head __rcu *const *bkt; struct rhash_head *he; unsigned int hash; - struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); + struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); restart: - hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params); + hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); bkt = rht_bucket(tbl, hash); do { struct bch_inode_info *inode; rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { - if (inode->ei_inum.inum == inum.inum) { + if (inode->ei_inum.inum == inum) { ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, GFP_NOWAIT|__GFP_NOWARN); if (ret) { @@ -264,7 +273,7 @@ restart: /* Ensure we see any new tables. 
*/ smp_rmb(); - tbl = rht_dereference_rcu(tbl->future_tbl, ht); + tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); if (unlikely(tbl)) goto restart; rcu_read_unlock(); @@ -343,7 +352,11 @@ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inod spin_unlock(&inode->v.i_lock); if (remove) { - int ret = rhashtable_remove_fast(&c->vfs_inodes_table, + int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, + &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); + BUG_ON(ret); + + ret = rhashtable_remove_fast(&c->vfs_inodes_table, &inode->hash, bch2_vfs_inodes_params); BUG_ON(ret); inode->v.i_hash.pprev = NULL; @@ -388,6 +401,11 @@ retry: discard_new_inode(&inode->v); return old; } else { + int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, + &inode->by_inum_hash, + bch2_vfs_inodes_by_inum_params); + BUG_ON(ret); + inode_fake_hash(&inode->v); inode_sb_list_add(&inode->v); @@ -2359,13 +2377,16 @@ static int bch2_init_fs_context(struct fs_context *fc) void bch2_fs_vfs_exit(struct bch_fs *c) { + if (c->vfs_inodes_by_inum_table.ht.tbl) + rhltable_destroy(&c->vfs_inodes_by_inum_table); if (c->vfs_inodes_table.tbl) rhashtable_destroy(&c->vfs_inodes_table); } int bch2_fs_vfs_init(struct bch_fs *c) { - return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); + return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?: + rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params); } static struct file_system_type bcache_fs_type = { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 59f9f7ae728d..dd2198541455 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -14,6 +14,7 @@ struct bch_inode_info { struct inode v; struct rhash_head hash; + struct rhlist_head by_inum_hash; subvol_inum ei_inum; struct list_head ei_vfs_inode_list; -- cgit v1.2.3 From 6534a404d4924820bd1c06fb4412dd4444234f79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 21:49:08 -0500 Subject: bcachefs: errcode cleanup: journal errors Instead of throwing standard error codes, we should be throwing dedicated private error codes, this greatly improves debugability. 
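(A minimal sketch of the dedicated-errcode idea, using invented names rather than the real errcode.h macros: private codes live well above the standard errno range, and each one remembers which errno it collapses to at the boundary, so internal logs can name the exact failure path while userspace still just sees EIO.)

#include <errno.h>

enum {
	ERR_START = 2048,		/* above any standard errno */
	ERR_journal_shutdown,		/* collapses to EIO */
	ERR_journal_flush_err,		/* collapses to EIO */
};

static int err_class(int err)
{
	switch (-err) {
	case ERR_journal_shutdown:
	case ERR_journal_flush_err:
		return -EIO;
	default:
		return err;		/* already a plain errno */
	}
}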
Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 2 ++ fs/bcachefs/journal.c | 4 ++-- fs/bcachefs/journal.h | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 131b9bef21a0..c989ce4f715f 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -241,6 +241,8 @@ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ + x(EIO, journal_shutdown) \ + x(EIO, journal_flush_err) \ x(EIO, btree_node_read_err) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 699db0d0749a..bbdd0b17ae69 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -673,7 +673,7 @@ out: * @seq: seq to flush * @parent: closure object to wait with * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, - * -EIO if @seq will never be flushed + * -BCH_ERR_journal_flush_err if @seq will never be flushed * * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary @@ -696,7 +696,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { - ret = -EIO; + ret = -BCH_ERR_journal_flush_err; goto out; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 71a50846967f..a6a2e888c59b 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -412,7 +412,7 @@ void bch2_journal_halt(struct journal *); static inline int bch2_journal_error(struct journal *j) { return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL - ? -EIO : 0; + ? -BCH_ERR_journal_shutdown : 0; } struct bch_dev; -- cgit v1.2.3 From a7ecd5f2ccabb97174f8264b965c9597ea854e8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:23:41 -0500 Subject: bcachefs: disk_accounting: bch2_dev_rcu -> bch2_dev_rcu_noerror Accounting keys that reference invalid devices are corrected by fsck, they shouldn't cause an emergency shutdown. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 4 ++-- fs/bcachefs/disk_accounting.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 55a00018dc8b..fa821d278c45 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -805,7 +805,7 @@ int bch2_accounting_read(struct bch_fs *c) break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); if (ca) { struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; percpu_u64_set(&d->buckets, v[0]); @@ -911,7 +911,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) break; case BCH_DISK_ACCOUNTING_dev_data_type: { rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (!ca) { rcu_read_unlock(); continue; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 6639535dc91c..8b2b2f83e6a4 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -142,7 +142,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (ca) { this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); -- cgit v1.2.3 From b71d89bd7b1fbe0a4569d072a0069110f60f9ec9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:28:41 -0500 Subject: bcachefs: Fix accounting_read when we rewind If we rewind recovery to run topology repair, that causes accounting_read to run twice. This fixes accounting being double counted. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index fa821d278c45..bb5dbbf71d04 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -700,6 +700,21 @@ int bch2_accounting_read(struct bch_fs *c) struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF; + /* + * We might run more than once if we rewind to start topology repair or + * btree node scan - and those might cause us to get different results, + * so we can't just skip if we've already run. 
+ * + * Instead, zero out any accounting we have: + */ + percpu_down_write(&c->mark_lock); + darray_for_each(acc->k, e) + percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); + for_each_member_device(c, ca) + percpu_memset(ca->usage, 0, sizeof(*ca->usage)); + percpu_memset(c->usage, 0, sizeof(*c->usage)); + percpu_up_write(&c->mark_lock); + int ret = for_each_btree_key(trans, iter, BTREE_ID_accounting, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ -- cgit v1.2.3 From 427db7ffe9a91bcad00a7bac4edcd74d75007a11 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:45:25 -0500 Subject: bcachefs: backpointer_to_missing_ptr is now autofix Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 9e3425f533bc..d45d0789f1b1 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -141,7 +141,7 @@ enum bch_fsck_flags { x(backpointer_dev_bad, 297, 0) \ x(backpointer_to_missing_device, 126, 0) \ x(backpointer_to_missing_alloc, 127, 0) \ - x(backpointer_to_missing_ptr, 128, 0) \ + x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \ x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \ x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \ x(lru_entry_bad, 131, FSCK_AUTOFIX) \ -- cgit v1.2.3 From abf23afa36eb425fb75d47c26fc665dbab2a9ae1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:57:01 -0500 Subject: bcachefs: Fix btree node scan when unknown btree IDs are present btree_root entries for unknown btree IDs are created during recovery, before reading those btree roots. But btree_node_scan may find btree nodes with unknown btree IDs when we haven't seen roots for those btrees. 
Reported-by: syzbot+1f202d4da221ec6ebf8e@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 11 ++++++++--- fs/bcachefs/btree_cache.h | 9 +++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 36dfa6a48aa6..1f06e24e53fc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1406,9 +1406,14 @@ void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsi void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { bch2_btree_id_to_text(out, b->c.btree_id); - prt_printf(out, " level %u/%u\n ", - b->c.level, - bch2_btree_id_root(c, b->c.btree_id)->level); + prt_printf(out, " level %u/", b->c.level); + struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); + if (r) + prt_printf(out, "%u", r->level); + else + prt_printf(out, "(unknown)"); + prt_printf(out, "\n "); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 6cfacacb6769..dcc34fe4996d 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -128,14 +128,19 @@ static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned i } else { unsigned idx = id - BTREE_ID_NR; - EBUG_ON(idx >= c->btree_roots_extra.nr); + /* This can happen when we're called from btree_node_scan */ + if (idx >= c->btree_roots_extra.nr) + return NULL; + return &c->btree_roots_extra.data[idx]; } } static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) { - return bch2_btree_id_root(c, b->c.btree_id)->b; + struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); + + return r ? r->b : NULL; } const char *bch2_btree_id_str(enum btree_id); /* avoid */ -- cgit v1.2.3 From 828552ca74a45877dbf139b34c47d0f600a1e852 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 23:28:21 -0500 Subject: bcachefs: Kill bch2_bucket_alloc_new_fs() The early-early allocation path, bch2_bucket_alloc_new_fs(), is no longer needed - and inconsistencies around new_fs_bucket_idx have been a frequent source of bugs. 
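(Sketch of the range check the allocator leans on once the new-fs special case is gone — names are illustrative, not the bcachefs helpers. A candidate bucket spanning sectors [b_start, b_end) must be skipped if it overlaps any superblock copy [sb_start, sb_end); two half-open ranges overlap exactly when neither lies wholly on one side of the other.)

#include <stdbool.h>
#include <stdint.h>

static bool ranges_overlap(uint64_t a_start, uint64_t a_end,
			   uint64_t b_start, uint64_t b_end)
{
	/* no overlap iff a ends before b starts, or starts after b ends */
	return !(a_start >= b_end || a_end <= b_start);
}

For example, a bucket covering sectors [4096, 6144) overlaps a superblock copy at [4608, 5120) but not one starting exactly at sector 6144.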
Reported-by: syzbot+592425844580a6598410@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 40 ++++++++++++++++------------------------ fs/bcachefs/alloc_foreground.h | 2 -- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/buckets.c | 25 +++++++++++++++++++++++++ fs/bcachefs/buckets.h | 21 +-------------------- fs/bcachefs/journal.c | 34 +++++++++++++++------------------- fs/bcachefs/journal_reclaim.c | 3 +++ fs/bcachefs/recovery.c | 5 +---- fs/bcachefs/super.c | 12 +++++------- 9 files changed, 66 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 6d665b720f72..4d1ff7f1f302 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -156,6 +156,14 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) return ob; } +static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) +{ + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) + return false; + + return bch2_is_superblock_bucket(ca, b); +} + static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) { BUG_ON(c->open_buckets_partial_nr >= @@ -175,20 +183,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) closure_wake_up(&c->freelist_wait); } -/* _only_ for allocating the journal on a new device: */ -long bch2_bucket_alloc_new_fs(struct bch_dev *ca) -{ - while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { - u64 b = ca->new_fs_bucket_idx++; - - if (!is_superblock_bucket(ca, b) && - (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) - return b; - } - - return -1; -} - static inline unsigned open_buckets_reserved(enum bch_watermark watermark) { switch (watermark) { @@ -214,6 +208,9 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * { struct open_bucket *ob; + if (unlikely(is_superblock_bucket(c, ca, bucket))) + return NULL; + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { s->skipped_nouse++; return NULL; @@ -295,9 +292,6 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc /* * This path is for before the freespace btree is initialized: - * - * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & - * journal buckets - journal buckets will be < ca->new_fs_bucket_idx */ static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, @@ -309,7 +303,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct btree_iter iter, citer; struct bkey_s_c k, ck; struct open_bucket *ob = NULL; - u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 first_bucket = ca->mi.first_bucket; u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; @@ -332,10 +326,6 @@ again: if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; - if (ca->new_fs_bucket_idx && - is_superblock_bucket(ca, k.k->p.offset)) - continue; - if (s->btree_bitmap != BTREE_BITMAP_ANY && s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { @@ -406,8 +396,6 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; - - BUG_ON(ca->new_fs_bucket_idx); again: 
for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace, POS(ca->dev_idx, alloc_cursor), @@ -551,6 +539,10 @@ again: bch2_dev_do_invalidates(ca); if (!avail) { + if (watermark > BCH_WATERMARK_normal && + c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + goto alloc; + if (cl && !waiting) { closure_wait(&c->freelist_wait, cl); waiting = true; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 1a16fd5bd4f8..4f87745df97e 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -28,8 +28,6 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, struct bch_devs_mask *); void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); -long bch2_bucket_alloc_new_fs(struct bch_dev *); - static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) { return bch2_dev_have_ref(c, ob->dev); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f1d8c821d27a..a85b3bcc6383 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -560,7 +560,6 @@ struct bch_dev { struct bch_dev_usage __percpu *usage; /* Allocator: */ - u64 new_fs_bucket_idx; u64 alloc_cursor[3]; unsigned nr_open_buckets; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 1547141ba2a0..afd35c93fcfb 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1161,6 +1161,31 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c) return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); } +bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 b_offset = bucket_to_sector(ca, b); + u64 b_end = bucket_to_sector(ca, b + 1); + unsigned i; + + if (!b) + return true; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + u64 end = offset + (1 << layout->sb_max_size_bits); + + if (!(offset >= b_end || end <= b_offset)) + return true; + } + + for (i = 0; i < ca->journal.nr; i++) + if (b == ca->journal.buckets[i]) + return true; + + return false; +} + /* Disk reservations: */ #define SECTORS_CACHE 1024 diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index ccc78bfe2fd4..3bebc4c3044f 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -308,26 +308,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, enum btree_iter_update_trigger_flags); int bch2_trans_mark_dev_sbs(struct bch_fs *); -static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - u64 b_offset = bucket_to_sector(ca, b); - u64 b_end = bucket_to_sector(ca, b + 1); - unsigned i; - - if (!b) - return true; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - u64 end = offset + (1 << layout->sb_max_size_bits); - - if (!(offset >= b_end || end <= b_offset)) - return true; - } - - return false; -} +bool bch2_is_superblock_bucket(struct bch_dev *, u64); static inline const char *bch2_data_type_str(enum bch_data_type type) { diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bbdd0b17ae69..95cccda3b22c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1002,19 +1002,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } for (nr_got = 0; nr_got < nr_want; nr_got++) { - if (new_fs) { - bu[nr_got] = bch2_bucket_alloc_new_fs(ca); - if (bu[nr_got] < 0) { - ret = -BCH_ERR_ENOSPC_bucket_alloc; - break; - } - } else { - 
ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, - BCH_DATA_journal, cl); - ret = PTR_ERR_OR_ZERO(ob[nr_got]); - if (ret) - break; + enum bch_watermark watermark = new_fs + ? BCH_WATERMARK_btree + : BCH_WATERMARK_normal; + ob[nr_got] = bch2_bucket_alloc(c, ca, watermark, + BCH_DATA_journal, cl); + ret = PTR_ERR_OR_ZERO(ob[nr_got]); + if (ret) + break; + + if (!new_fs) { ret = bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, @@ -1024,9 +1022,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bch_err_msg(c, ret, "marking new journal buckets"); break; } - - bu[nr_got] = ob[nr_got]->bucket; } + + bu[nr_got] = ob[nr_got]->bucket; } if (!nr_got) @@ -1066,8 +1064,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (ret) goto err_unblock; - if (!new_fs) - bch2_write_super(c); + bch2_write_super(c); /* Commit: */ if (c) @@ -1101,9 +1098,8 @@ err_unblock: bu[i], BCH_DATA_free, 0, BTREE_TRIGGER_transactional)); err_free: - if (!new_fs) - for (i = 0; i < nr_got; i++) - bch2_open_bucket_put(c, ob[i]); + for (i = 0; i < nr_got; i++) + bch2_open_bucket_put(c, ob[i]); kfree(new_bucket_seq); kfree(new_buckets); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3d8fc2642425..1aabbbe328d9 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -38,6 +38,9 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, struct journal_device *ja, enum journal_space_from from) { + if (!ja->nr) + return 0; + unsigned available = (journal_space_from(ja, from) - ja->cur_idx - 1 + ja->nr) % ja->nr; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7086a7226989..547c78a323f7 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1070,7 +1070,6 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; set_bit(BCH_FS_btree_running, &c->flags); set_bit(BCH_FS_may_go_rw, &c->flags); @@ -1111,9 +1110,6 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - for_each_online_member(c, ca) - ca->new_fs_bucket_idx = 0; - ret = bch2_fs_freespace_init(c); if (ret) goto err; @@ -1172,6 +1168,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); + c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; return 0; err: bch_err_fn(c, ret); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 08170a3d524f..14157820705d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1750,11 +1750,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - ret = bch2_dev_journal_alloc(ca, true); - bch_err_msg(c, ret, "allocating journal"); - if (ret) - goto err; - down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1805,11 +1800,14 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err_late; - ca->new_fs_bucket_idx = 0; - if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); + ret = bch2_dev_journal_alloc(ca, false); + bch_err_msg(c, ret, "allocating journal"); + if (ret) + goto err_late; + up_write(&c->state_lock); return 0; -- cgit v1.2.3 From 14152654805256d760315ec24e414363bfa19a06 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Nov 2024 00:21:27 -0500 Subject: bcachefs: Bad btree roots are now autofix Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
(limited to 'fs') diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d45d0789f1b1..89d9dc2c859b 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -68,8 +68,8 @@ enum bch_fsck_flags { x(btree_node_bkey_bad_format, 55, 0) \ x(btree_node_bad_bkey, 56, 0) \ x(btree_node_bkey_out_of_order, 57, 0) \ - x(btree_root_bkey_invalid, 58, 0) \ - x(btree_root_read_error, 59, 0) \ + x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \ + x(btree_root_read_error, 59, FSCK_AUTOFIX) \ x(btree_root_bad_min_key, 60, 0) \ x(btree_root_bad_max_key, 61, 0) \ x(btree_node_read_error, 62, 0) \ -- cgit v1.2.3 From 75eabea6988e4ef587c8d90425918fdc58a8f1c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Nov 2024 01:26:56 -0500 Subject: bcachefs: Fix dup/misordered check in btree node read We were checking for out of order keys, but not duplicate keys. Reported-by: syzbot+dedbd67513939979f84f@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 89a42ee81e5c..2b5da566fbac 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -857,6 +857,14 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent); } +static inline int btree_node_read_bkey_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) +{ + return bch2_bkey_cmp_packed(b, l, r) + ?: (int) bkey_deleted(r) - (int) bkey_deleted(l); +} + static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bset *i, int write, bool have_retry, bool *saw_error) @@ -917,7 +925,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, BSET_BIG_ENDIAN(i), write, &b->format, k); - if (prev && bkey_iter_cmp(b, prev, k) > 0) { + if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) { struct bkey up = bkey_unpack_key(b, prev); printbuf_reset(&buf); -- cgit v1.2.3 From dba8243f3b466dd39ef05ff63890d5acab30c852 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Nov 2024 02:05:02 -0500 Subject: bcachefs: Don't try to en/decrypt when encryption not available If a btree node says it's encrypted, but the superblock never had an encryptino key - whoops, that needs to be handled. 
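(Reduced sketch of the guard, with simplified types and a stand-in predicate: if the checksum type implies encryption but no key was ever loaded, refuse the operation up front instead of dereferencing a missing cipher state. The series below returns a dedicated BCH_ERR_no_encryption_key; plain EIO stands in here.)

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>

struct fs_crypto {
	void *chacha20;			/* NULL: filesystem has no encryption key */
};

static bool csum_type_is_encryption(unsigned type)
{
	return type >= 8;		/* stand-in predicate for the sketch */
}

static int check_can_decrypt(const struct fs_crypto *cr, unsigned csum_type)
{
	if (csum_type_is_encryption(csum_type) && !cr->chacha20)
		return -EIO;		/* BCH_ERR_no_encryption_key in the real tree */
	return 0;
}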
Reported-by: syzbot+026f1857b12f5eb3f9e9@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 117 +++++++++++++++++++++--------------------- fs/bcachefs/btree_node_scan.c | 3 ++ fs/bcachefs/checksum.c | 10 +++- fs/bcachefs/errcode.h | 1 + fs/bcachefs/io_read.c | 14 ++++- 5 files changed, 84 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2b5da566fbac..3bb6db9bd4a4 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1045,39 +1045,51 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, while (b->written < (ptr_written ?: btree_sectors(c))) { unsigned sectors; - struct nonce nonce; bool first = !b->written; - bool csum_bad; - if (!b->written) { + if (first) { + bne = NULL; i = &b->data->keys; + } else { + bne = write_block(b); + i = &bne->keys; - btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_unknown_csum, - "unknown checksum type %llu", BSET_CSUM_TYPE(i)); - - nonce = btree_nonce(i, b->written << 9); + if (i->seq != b->data->keys.seq) + break; + } - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); - csum_bad = bch2_crc_cmp(b->data->csum, csum); - if (csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + struct nonce nonce = btree_nonce(i, b->written << 9); + bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; + btree_err_on(!good_csum_type, + bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) + ? 
-BCH_ERR_btree_node_read_err_must_retry + : -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_unknown_csum, + "unknown checksum type %llu", BSET_CSUM_TYPE(i)); + + if (first) { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + bool csum_bad = bch2_crc_cmp(b->data->csum, csum); + if (csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_bad_csum, + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), + buf.buf)); + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "decrypting btree node: %s", bch2_err_str(ret))) + goto fsck_err; + } btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), @@ -1088,37 +1100,26 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sectors = vstruct_sectors(b->data, c->block_bits); } else { - bne = write_block(b); - i = &bne->keys; - - if (i->seq != b->data->keys.seq) - break; - - btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_unknown_csum, - "unknown checksum type %llu", BSET_CSUM_TYPE(i)); - - nonce = btree_nonce(i, b->written << 9); - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - csum_bad = bch2_crc_cmp(bne->csum, csum); - if (ca && csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + bool csum_bad = bch2_crc_cmp(bne->csum, csum); + if (ca && csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_bad_csum, + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), + buf.buf)); + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "decrypting btree node: %s", bch2_err_str(ret))) + goto fsck_err; + } sectors = vstruct_sectors(bne, c->block_bits); } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 4b4df31d4b95..327f1a1859b9 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -159,6 +159,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, return; if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { + if (!c->chacha20) + return; + struct nonce nonce = btree_nonce(&bn->keys, 0); unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index ce8fc677bef9..23a383577d4c 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "checksum.h" #include "errcode.h" +#include "error.h" #include "super.h" #include "super-io.h" @@ -252,6 +253,10 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, if (!bch2_csum_type_is_encryption(type)) return 0; + if (bch2_fs_inconsistent_on(!c->chacha20, + c, "attempting 
to encrypt without encryption key")) + return -BCH_ERR_no_encryption_key; + return do_encrypt(c->chacha20, nonce, data, len); } @@ -337,8 +342,9 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, size_t sgl_len = 0; int ret = 0; - if (!bch2_csum_type_is_encryption(type)) - return 0; + if (bch2_fs_inconsistent_on(!c->chacha20, + c, "attempting to encrypt without encryption key")) + return -BCH_ERR_no_encryption_key; darray_init(&sgl); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index c989ce4f715f..a12050e9c191 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -260,6 +260,7 @@ x(EIO, no_device_to_read_from) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ + x(EIO, no_encryption_key) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index eb8d12fd6398..4b6b6d25725b 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -830,7 +830,7 @@ retry_pick: if (!pick_ret) goto hole; - if (pick_ret < 0) { + if (unlikely(pick_ret < 0)) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); @@ -843,6 +843,18 @@ retry_pick: goto err; } + if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + + bch_err_inum_offset_ratelimited(c, + read_pos.inode, read_pos.offset << 9, + "attempting to read encrypted data without encryption key\n %s", + buf.buf); + printbuf_exit(&buf); + goto err; + } + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); /* -- cgit v1.2.3 From 686d2ebec683903fe9ca41fd3cef9b4fb924aae4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Nov 2024 17:03:13 -0500 Subject: bcachefs: Change "disk accounting version 0" check to commit only 6.11 had a bug where we'd sometimes create disk accounting keys with version 0, which causes issues for journal replay - but we don't need to delete existing accounting keys with version 0. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index bb5dbbf71d04..c5e61265b709 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -134,7 +134,8 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, void *end = &acc_k + 1; int ret = 0; - bkey_fsck_err_on(bversion_zero(k.k->bversion), + bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && + bversion_zero(k.k->bversion), c, accounting_key_version_0, "accounting key with version=0"); -- cgit v1.2.3 From d94159763649831024255a4ec3b4bd1e1bf3f234 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Nov 2024 15:16:57 -0500 Subject: bcachefs: Fix bch2_btree_node_update_key_early() Fix an assertion pop from the recent btree cache freelist fixes. 
Fixes: baefd3f849ed ("bcachefs: btree_cache.freeable list fixes") Reported-by: Tyler Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 1f06e24e53fc..1117be901cf0 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -326,7 +326,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans, if (!IS_ERR_OR_NULL(b)) { mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); + __bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, new); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); -- cgit v1.2.3 From db0667a4ed82b67779855674682956685fc71f15 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Nov 2024 21:27:16 -0500 Subject: bcachefs: Go RW earlier, for normal rw mount Previously, when mounting read-write after a clean shutdown, we wouldn't go read-write until after all the recovery passes completed. Now, go RW early in recovery, the same as any other situation we'll need to go read-write. This fixes a bug where we discover unlinked inodes after a clean shutdown: repair fails because we're read only. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 1240c5c19fea..f6d3a99cb63e 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -46,7 +46,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) + if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) return bch2_fs_read_write_early(c); return 0; } -- cgit v1.2.3 From 90f3683e8f7c9eba516b65c47865fa3a5c08c6fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Nov 2024 22:59:27 -0500 Subject: bcachefs: Fix null ptr deref in btree_path_lock_root() Historically, we required that all btree node roots point to a valid (possibly fake) node, but we're improving our ability to continue in the presence of errors. 
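(The shape of the fix, in miniature and with invented names: a per-btree root slot may now legitimately be empty, and when it is, lookups return the error recorded in the slot rather than assuming a node — possibly a fake one — is always present.)

#include <stddef.h>

struct btree_node;

struct root_slot {
	struct btree_node *b;	/* NULL if this root never came up */
	int error;		/* always set when b is NULL */
};

static int lock_root(struct root_slot *r, struct btree_node **out)
{
	struct btree_node *b = r->b;	/* the real code uses READ_ONCE() */

	if (!b)
		return r->error;

	*out = b;
	return 0;
}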
Reported-by: syzbot+e22007d6acb9c87c2362@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 89f9665ce70d..80c3b55ce763 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -722,7 +722,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; + struct btree_root *r = bch2_btree_id_root(c, path->btree_id); enum six_lock_type lock_type; unsigned i; int ret; @@ -730,7 +730,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans, EBUG_ON(path->nodes_locked); while (1) { - b = READ_ONCE(*rootp); + struct btree *b = READ_ONCE(r->b); + if (unlikely(!b)) { + BUG_ON(!r->error); + return r->error; + } + path->level = READ_ONCE(b->c.level); if (unlikely(path->level < depth_want)) { @@ -755,7 +760,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, BUG(); } - if (likely(b == READ_ONCE(*rootp) && + if (likely(b == READ_ONCE(r->b) && b->c.level == path->level && !race_fault())) { for (i = 0; i < path->level; i++) -- cgit v1.2.3 From c7e78f7b01786d0d06bde8548e88822ff57c4b4f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Nov 2024 01:03:41 -0500 Subject: bcachefs: Ignore empty btree root journal entries There's no reason to treat them as errors: just ignore them, and go with a previous btree root if we had one. Reported-by: syzbot+e22007d6acb9c87c2362@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 547c78a323f7..727e894762f5 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -442,7 +442,9 @@ static int journal_replay_entry_early(struct bch_fs *c, switch (entry->type) { case BCH_JSET_ENTRY_btree_root: { - struct btree_root *r; + + if (unlikely(!entry->u64s)) + return 0; if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, c, invalid_btree_id, @@ -456,15 +458,11 @@ static int journal_replay_entry_early(struct bch_fs *c, return ret; } - r = bch2_btree_id_root(c, entry->btree_id); + struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); - if (entry->u64s) { - r->level = entry->level; - bkey_copy(&r->key, (struct bkey_i *) entry->start); - r->error = 0; - } else { - r->error = -BCH_ERR_btree_node_read_error; - } + r->level = entry->level; + bkey_copy(&r->key, (struct bkey_i *) entry->start); + r->error = 0; r->alive = true; break; } -- cgit v1.2.3 From a6f4794fcd8627638153614193b3b81f37a28175 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Nov 2024 00:29:52 -0500 Subject: bcachefs: struct bkey_validate_context Add a new parameter to bkey validate functions, and use it to improve invalid bkey error messages: we can now print the btree and depth it came from, or if it came from the journal, or is a btree root. 
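(Miniature of the new plumbing, with simplified field names: callers bundle where a key came from into a small context struct, and the error path prints that context, so an invalid-bkey report can say "journal" or "btree X level Y" instead of only dumping the key itself.)

#include <stdio.h>

enum validate_from { FROM_unknown, FROM_commit, FROM_journal, FROM_btree_node };

struct validate_ctx {
	enum validate_from	from;
	unsigned		btree;
	unsigned		level;
};

static void report_invalid_key(const struct validate_ctx *ctx, const char *msg)
{
	static const char * const from_str[] = {
		"unknown", "commit", "journal", "btree node",
	};

	fprintf(stderr, "invalid bkey in %s (btree %u level %u): %s\n",
		from_str[ctx->from], ctx->btree, ctx->level, msg);
}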
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 10 ++++----- fs/bcachefs/alloc_background.h | 16 ++++++++------ fs/bcachefs/backpointers.c | 2 +- fs/bcachefs/backpointers.h | 3 ++- fs/bcachefs/bkey.h | 7 ------ fs/bcachefs/bkey_methods.c | 29 ++++++++++++------------ fs/bcachefs/bkey_methods.h | 15 +++++++------ fs/bcachefs/bkey_types.h | 26 ++++++++++++++++++++++ fs/bcachefs/btree_io.c | 44 +++++++++++++++++++++++++++++++------ fs/bcachefs/btree_node_scan.c | 7 +++++- fs/bcachefs/btree_trans_commit.c | 7 +++++- fs/bcachefs/btree_update_interior.c | 11 +++++++--- fs/bcachefs/data_update.c | 7 ++++-- fs/bcachefs/dirent.c | 4 ++-- fs/bcachefs/dirent.h | 4 ++-- fs/bcachefs/disk_accounting.c | 4 ++-- fs/bcachefs/disk_accounting.h | 3 ++- fs/bcachefs/ec.c | 4 ++-- fs/bcachefs/ec.h | 5 ++--- fs/bcachefs/error.c | 20 +++++++++++++---- fs/bcachefs/error.h | 4 ++-- fs/bcachefs/extents.c | 20 ++++++++--------- fs/bcachefs/extents.h | 9 ++++---- fs/bcachefs/inode.c | 16 +++++++------- fs/bcachefs/inode.h | 9 ++++---- fs/bcachefs/journal_io.c | 35 +++++++++++++++++------------ fs/bcachefs/lru.c | 2 +- fs/bcachefs/lru.h | 2 +- fs/bcachefs/quota.c | 2 +- fs/bcachefs/quota.h | 4 ++-- fs/bcachefs/recovery.c | 1 + fs/bcachefs/reflink.c | 8 +++---- fs/bcachefs/reflink.h | 10 ++++----- fs/bcachefs/snapshot.c | 4 ++-- fs/bcachefs/snapshot.h | 7 +++--- fs/bcachefs/subvolume.c | 2 +- fs/bcachefs/subvolume.h | 5 ++--- fs/bcachefs/xattr.c | 2 +- fs/bcachefs/xattr.h | 3 ++- 39 files changed, 231 insertions(+), 142 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1e9f53db4bb8..8846daaa1162 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -198,7 +198,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) } int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); int ret = 0; @@ -213,7 +213,7 @@ fsck_err: } int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_alloc_unpacked u; int ret = 0; @@ -226,7 +226,7 @@ fsck_err: } int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_alloc_unpacked u; int ret = 0; @@ -239,7 +239,7 @@ fsck_err: } int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bch_alloc_v4 a; int ret = 0; @@ -509,7 +509,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) } int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 57723a37abb8..8cacddd188f4 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -8,8 +8,6 @@ #include "debug.h" #include "super.h" -enum bch_validate_flags; - /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U @@ -245,10 +243,14 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -int 
bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -282,7 +284,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); }) int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index cfd9b9ead473..ff08afd667a0 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -15,7 +15,7 @@ #include int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); int ret = 0; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index d8a15f5fa767..95caeabb8978 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -18,7 +18,8 @@ static inline u64 swab40(u64 x) ((x & 0xff00000000ULL) >> 32)); } -int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags); +int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, + struct bkey_validate_context); void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 41df24a53d97..054e2d5e8448 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -9,13 +9,6 @@ #include "util.h" #include "vstructs.h" -enum bch_validate_flags { - BCH_VALIDATE_write = BIT(0), - BCH_VALIDATE_commit = BIT(1), - BCH_VALIDATE_journal = BIT(2), - BCH_VALIDATE_silent = BIT(3), -}; - #if 0 /* diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index e7ac227ba7e8..15c93576b5c2 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -28,7 +28,7 @@ const char * const bch2_bkey_types[] = { }; static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } @@ -42,7 +42,7 @@ static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, }) static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -59,7 +59,7 @@ fsck_err: }) static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } @@ -83,7 +83,7 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, }) static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context 
from) { return 0; } @@ -124,7 +124,7 @@ const struct bkey_ops bch2_bkey_null_ops = { }; int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; @@ -140,7 +140,7 @@ int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, if (!ops->key_validate) return 0; - ret = ops->key_validate(c, k, flags); + ret = ops->key_validate(c, k, from); fsck_err: return ret; } @@ -161,9 +161,10 @@ const char *bch2_btree_node_type_str(enum btree_node_type type) } int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type, - enum bch_validate_flags flags) + struct bkey_validate_context from) { + enum btree_node_type type = __btree_node_type(from.level, from.btree); + if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; @@ -177,7 +178,7 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, return 0; bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && - (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) && + (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", @@ -228,15 +229,15 @@ fsck_err: } int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type, - enum bch_validate_flags flags) + struct bkey_validate_context from) { - return __bch2_bkey_validate(c, k, type, flags) ?: - bch2_bkey_val_validate(c, k, flags); + return __bch2_bkey_validate(c, k, from) ?: + bch2_bkey_val_validate(c, k, from); } int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, enum bch_validate_flags flags) + struct bkey_s_c k, + struct bkey_validate_context from) { int ret = 0; diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 018fb72e32d3..bf34111cdf00 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -22,7 +22,7 @@ extern const struct bkey_ops bch2_bkey_null_ops; */ struct bkey_ops { int (*key_validate)(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags); + struct bkey_validate_context from); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); @@ -48,13 +48,14 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) : &bch2_bkey_null_ops; } -int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags); -int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags); +int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context from); void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h index c9ae9e42b385..2af6279b02a9 100644 --- a/fs/bcachefs/bkey_types.h +++ b/fs/bcachefs/bkey_types.h @@ -210,4 +210,30 @@ static inline struct bkey_i_##name 
*bkey_##name##_init(struct bkey_i *_k)\ BCH_BKEY_TYPES(); #undef x +enum bch_validate_flags { + BCH_VALIDATE_write = BIT(0), + BCH_VALIDATE_commit = BIT(1), + BCH_VALIDATE_journal = BIT(2), + BCH_VALIDATE_silent = BIT(3), +}; + +#define BKEY_VALIDATE_CONTEXTS() \ + x(unknown) \ + x(commit) \ + x(journal) \ + x(btree_root) \ + x(btree_node) + +struct bkey_validate_context { + enum { +#define x(n) BKEY_VALIDATE_##n, + BKEY_VALIDATE_CONTEXTS() +#undef x + } from:8; + u8 level; + enum btree_id btree; + bool root:1; + enum bch_validate_flags flags:8; +}; + #endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 3bb6db9bd4a4..eedcb2445b99 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -831,13 +831,32 @@ fsck_err: return ret; } +static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + enum bch_validate_flags flags) +{ + return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = flags + }); +} + static int bset_key_validate(struct bch_fs *c, struct btree *b, struct bkey_s_c k, - bool updated_range, int rw) + bool updated_range, + enum bch_validate_flags flags) { - return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?: - (!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?: - (rw == WRITE ? bch2_bkey_val_validate(c, k, 0) : 0); + struct bkey_validate_context from = (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = flags, + }; + return __bch2_bkey_validate(c, k, from) ?: + (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?: + (flags & BCH_VALIDATE_write ? 
btree_node_bkey_val_validate(c, b, k, flags) : 0); } static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, @@ -854,7 +873,13 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent); + return !__bch2_bkey_validate(c, u.s_c, + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = BCH_VALIDATE_silent + }); } static inline int btree_node_read_bkey_cmp(const struct btree *b, @@ -1224,7 +1249,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - ret = bch2_bkey_val_validate(c, u.s_c, READ); + ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); if (ret == -BCH_ERR_fsck_delete_bkey || (bch2_inject_invalid_keys && !bversion_cmp(u.k->bversion, MAX_VERSION))) { @@ -1943,7 +1968,12 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, bool saw_error; int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), - BKEY_TYPE_btree, WRITE); + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level + 1, + .btree = b->c.btree_id, + .flags = BCH_VALIDATE_write, + }); if (ret) { bch2_fs_inconsistent(c, "invalid btree node key before write"); return ret; diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 327f1a1859b9..eeafb5e7354e 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -538,7 +538,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); printbuf_exit(&buf); - BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0)); + BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = level + 1, + .btree = btree, + })); ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); if (ret) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index cf313477567a..78d72c26083d 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -726,7 +726,12 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags); + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_commit, + .level = i->level, + .btree = i->btree_id, + .flags = invalid_flags, + }); if (unlikely(ret)){ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", trans->fn, (void *) i->ip_allocated); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index faa2816e02a0..56a70e95ef9a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1360,9 +1360,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), - btree_node_type(b), BCH_VALIDATE_write) ?: - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), BCH_VALIDATE_write)) { + struct bkey_validate_context from = (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = 
b->c.level, + .btree = b->c.btree_id, + .flags = BCH_VALIDATE_commit, + }; + if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?: + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) { bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__); dump_stack(); } diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index e4af2ccdf4c8..31b2aeb0c6e6 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -318,8 +318,11 @@ restart_drop_extra_replicas: * it's been hard to reproduce, so this should give us some more * information when it does occur: */ - int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), - BCH_VALIDATE_commit); + int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), + (struct bkey_validate_context) { + .btree = m->btree_id, + .flags = BCH_VALIDATE_commit, + }); if (invalid) { struct printbuf buf = PRINTBUF; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 4c22f78b0484..41813f9ce831 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -101,7 +101,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { }; int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); @@ -120,7 +120,7 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, * Check new keys don't exceed the max length * (older keys may be larger.) */ - bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, + bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, dirent_name_too_long, "dirent name too big (%u > %u)", d_name.len, BCH_NAME_MAX); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 53ad99666022..362b3b2f2f2e 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -4,10 +4,10 @@ #include "str_hash.h" -enum bch_validate_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index c5e61265b709..71c49a7ee2fe 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -127,14 +127,14 @@ static inline bool is_zero(char *start, char *end) #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); void *end = &acc_k + 1; int ret = 0; - bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && + bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && bversion_zero(k.k->bversion), c, accounting_key_version_0, "accounting key with version=0"); diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 8b2b2f83e6a4..566aa2a8539d 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -83,7 +83,8 @@ int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); 
-int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_accounting_swab(struct bkey_s); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index cd377ce6cf64..eaca4c39d703 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -110,7 +110,7 @@ struct ec_bio { /* Stripes btree keys: */ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; int ret = 0; @@ -130,7 +130,7 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, "invalid csum granularity (%u >= 64)", s->csum_granularity_bits); - ret = bch2_bkey_ptrs_validate(c, k, flags); + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 43326370b410..583ca6a226da 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -6,9 +6,8 @@ #include "buckets_types.h" #include "extents_types.h" -enum bch_validate_flags; - -int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 2960baa023f6..9a695322b33c 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" #include "btree_iter.h" #include "error.h" #include "journal.h" @@ -443,23 +444,34 @@ err: return ret; } +static const char * const bch2_bkey_validate_contexts[] = { +#define x(n) #n, + BKEY_VALIDATE_CONTEXTS() +#undef x + NULL +}; + int __bch2_bkey_fsck_err(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags validate_flags, + struct bkey_validate_context from, enum bch_sb_error_id err, const char *fmt, ...) { - if (validate_flags & BCH_VALIDATE_silent) + if (from.flags & BCH_VALIDATE_silent) return -BCH_ERR_fsck_delete_bkey; unsigned fsck_flags = 0; - if (!(validate_flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) + if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; struct printbuf buf = PRINTBUF; va_list args; - prt_str(&buf, "invalid bkey "); + prt_printf(&buf, "invalid bkey in %s btree=", + bch2_bkey_validate_contexts[from.from]); + bch2_btree_id_to_text(&buf, from.btree); + prt_printf(&buf, " level=%u: ", from.level); + bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, "\n "); va_start(args, fmt); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 8327a3461535..3b278f28e56b 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -153,7 +153,7 @@ enum bch_validate_flags; __printf(5, 6) int __bch2_bkey_fsck_err(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, + struct bkey_validate_context from, enum bch_sb_error_id, const char *, ...); @@ -163,7 +163,7 @@ int __bch2_bkey_fsck_err(struct bch_fs *, */ #define bkey_fsck_err(c, _err_type, _err_msg, ...) 
\ do { \ - int _ret = __bch2_bkey_fsck_err(c, k, flags, \ + int _ret = __bch2_bkey_fsck_err(c, k, from, \ BCH_FSCK_ERR_##_err_type, \ _err_msg, ##__VA_ARGS__); \ if (_ret != -BCH_ERR_fsck_fix && \ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 83aeceb68847..aa3b88291814 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -178,7 +178,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -186,7 +186,7 @@ int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, c, btree_ptr_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - ret = bch2_bkey_ptrs_validate(c, k, flags); + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } @@ -198,7 +198,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0; @@ -212,13 +212,13 @@ int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, c, btree_ptr_v2_min_key_bad, "min_key > key"); - if ((flags & BCH_VALIDATE_write) && + if ((from.flags & BCH_VALIDATE_write) && c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written) bkey_fsck_err_on(!bp.v->sectors_written, c, btree_ptr_v2_written_0, "sectors_written == 0"); - ret = bch2_bkey_ptrs_validate(c, k, flags); + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } @@ -405,7 +405,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); int ret = 0; @@ -1231,7 +1231,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, static int extent_ptr_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, + struct bkey_validate_context from, const struct bch_extent_ptr *ptr, unsigned size_ondisk, bool metadata) @@ -1274,7 +1274,7 @@ fsck_err: } int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1301,7 +1301,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - ret = extent_ptr_validate(c, k, flags, &entry->ptr, size_ondisk, false); + ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false); if (ret) return ret; @@ -1348,7 +1348,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(crc_is_encoded(crc) && (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), + (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, ptr_crc_uncompressed_size_too_big, "too large encoded extent"); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index ba33788fee36..620b284aa34f 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -8,7 +8,6 @@ struct bch_fs; struct btree_trans; -enum bch_validate_flags; /* extent entries: */ @@ 
-410,12 +409,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -452,7 +451,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -696,7 +695,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, struct bch_extent_ptr ptr2) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 5c603ab66be0..8818e41883f2 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -429,7 +429,7 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) } static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bch_inode_unpacked unpacked; int ret = 0; @@ -469,7 +469,7 @@ fsck_err: } int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); int ret = 0; @@ -479,13 +479,13 @@ int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, "invalid str hash type (%llu >= %u)", INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_validate(c, k, flags); + ret = __bch2_inode_validate(c, k, from); fsck_err: return ret; } int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); int ret = 0; @@ -495,13 +495,13 @@ int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, "invalid str hash type (%llu >= %u)", INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_validate(c, k, flags); + ret = __bch2_inode_validate(c, k, from); fsck_err: return ret; } int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); int ret = 0; @@ -519,7 +519,7 @@ int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, "invalid str hash type (%llu >= %u)", INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_validate(c, k, flags); + ret = __bch2_inode_validate(c, k, from); fsck_err: return ret; } @@ -780,7 +780,7 @@ int bch2_trigger_inode(struct btree_trans *trans, } int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { 
int ret = 0; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index f52336cb298f..927c875976da 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -7,15 +7,14 @@ #include "opts.h" #include "snapshot.h" -enum bch_validate_flags; extern const char * const bch2_inode_opts[]; int bch2_inode_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); @@ -60,7 +59,7 @@ static inline bool bkey_is_inode(const struct bkey *k) } int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 768a3b950997..1627f3e16517 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -327,11 +327,11 @@ static void journal_entry_err_msg(struct printbuf *out, static int journal_validate_key(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, - unsigned level, enum btree_id btree_id, struct bkey_i *k, - unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from, + unsigned version, int big_endian) { + enum bch_validate_flags flags = from.flags; int write = flags & BCH_VALIDATE_write; void *next = vstruct_next(entry); int ret = 0; @@ -366,11 +366,10 @@ static int journal_validate_key(struct bch_fs *c, } if (!write) - bch2_bkey_compat(level, btree_id, version, big_endian, + bch2_bkey_compat(from.level, from.btree, version, big_endian, write, NULL, bkey_to_packed(k)); - ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), write); + ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); if (ret == -BCH_ERR_fsck_delete_bkey) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -381,7 +380,7 @@ static int journal_validate_key(struct bch_fs *c, goto fsck_err; if (write) - bch2_bkey_compat(level, btree_id, version, big_endian, + bch2_bkey_compat(from.level, from.btree, version, big_endian, write, NULL, bkey_to_packed(k)); fsck_err: return ret; @@ -394,13 +393,15 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, enum bch_validate_flags flags) { struct bkey_i *k = entry->start; + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .level = entry->level, + .btree = entry->btree_id, + .flags = flags|BCH_VALIDATE_journal, + }; while (k != vstruct_last(entry)) { - int ret = journal_validate_key(c, jset, entry, - entry->level, - entry->btree_id, - k, version, big_endian, - flags|BCH_VALIDATE_journal); + int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); if (ret == FSCK_DELETED_KEY) continue; else if (ret) @@ -455,8 +456,14 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, return 0; } - ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, - version, big_endian, flags); + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .level = 
entry->level + 1, + .btree = entry->btree_id, + .root = true, + .flags = flags, + }; + ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); if (ret == FSCK_DELETED_KEY) ret = 0; fsck_err: diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index c18242748ca3..ce794d55818f 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -12,7 +12,7 @@ /* KEY_TYPE_lru is obsolete: */ int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index e6a7d8241bb8..f31a6cf1514c 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -33,7 +33,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) return BCH_LRU_read; } -int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_lru_pos_to_text(struct printbuf *, struct bpos); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 74f45a8162ad..8b857fc33244 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -60,7 +60,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { }; int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index a62abcc5332a..1551800ff44c 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -5,10 +5,10 @@ #include "inode.h" #include "quota_types.h" -enum bch_validate_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 727e894762f5..e361057ffad4 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -569,6 +569,7 @@ static int read_btree_roots(struct bch_fs *c) r->error = 0; ret = bch2_btree_lost_data(c, i); + BUG_ON(ret); } } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 38db5a011702..e1911b9beb61 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -41,7 +41,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; @@ -89,7 +89,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r /* indirect extents */ int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -98,7 +98,7 @@ int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, "indirect extent above maximum position 0:%llu", REFLINK_P_IDX_MAX); - ret = bch2_bkey_ptrs_validate(c, k, flags); + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } @@ -128,7 +128,7 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r /* indirect inline data */ int bch2_indirect_inline_data_validate(struct bch_fs *c, struct 
bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index b61a4bdd8e82..f119316adc81 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -2,9 +2,8 @@ #ifndef _BCACHEFS_REFLINK_H #define _BCACHEFS_REFLINK_H -enum bch_validate_flags; - -int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, @@ -19,7 +18,8 @@ int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, .min_val_size = 16, \ }) -int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, @@ -34,7 +34,7 @@ int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, }) int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_indirect_inline_data(struct btree_trans *, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 6a52090485dc..f368270d6d9b 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -32,7 +32,7 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -225,7 +225,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_snapshot s; u32 i, id; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 29c94716293e..ae23d45fad66 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -2,11 +2,9 @@ #ifndef _BCACHEFS_SNAPSHOT_H #define _BCACHEFS_SNAPSHOT_H -enum bch_validate_flags; - void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ .key_validate = bch2_snapshot_tree_validate, \ @@ -19,7 +17,8 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 
cb45ef769c54..5e5ae405cb28 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -207,7 +207,7 @@ int bch2_check_subvol_children(struct bch_fs *c) /* Subvolumes: */ int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); int ret = 0; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 07b23dc08614..d53d292c22d7 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -5,12 +5,11 @@ #include "darray.h" #include "subvolume_types.h" -enum bch_validate_flags; - int bch2_check_subvols(struct bch_fs *); int bch2_check_subvol_children(struct bch_fs *); -int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 820c1791545a..aed7c6984173 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -71,7 +71,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { }; int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 2c96de051f3e..132fbbd15a66 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -6,7 +6,8 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ -- cgit v1.2.3 From 124e108185dccf7cc1bb159501ea7fea59350218 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 21:28:07 -0500 Subject: bcachefs: Make topology errors autofix These repair paths are well tested, so we can repair them without explicit user intervention. This also tweaks bch2_topology_error() so that we run topology repair if we're in recovery, not just fsck.
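In practical terms, tagging an error FSCK_AUTOFIX means it is repaired without prompting whenever errors=continue or errors=fix_safe is in effect. A condensed sketch of that decision, simplified from the __bch2_fsck_err() changes later in this series (not the verbatim kernel code):

	/* Simplified sketch of how an FSCK_AUTOFIX-tagged error is resolved;
	 * the real logic lives in __bch2_fsck_err() (see the error.c hunks below). */
	if ((flags & FSCK_AUTOFIX) &&
	    (c->opts.errors == BCH_ON_ERROR_continue ||
	     c->opts.errors == BCH_ON_ERROR_fix_safe)) {
		if (flags & FSCK_CAN_FIX)
			ret = -BCH_ERR_fsck_fix;	/* repair and keep going */
		else
			ret = -BCH_ERR_fsck_ignore;	/* log it and continue */
	}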
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/recovery.c | 31 +++++++++++++++++++++++++------ fs/bcachefs/sb-errors_format.h | 12 ++++++------ 3 files changed, 32 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2e8cfc4d3265..19db4d8aca88 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -348,7 +348,7 @@ again: bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), - trans, btree_node_unreadable, + trans, btree_node_read_error, "Topology repair: unreadable btree node at\n" " %s", buf.buf)) { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e361057ffad4..64bb330eac86 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -40,19 +40,42 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) int ret = 0; mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); if (!(c->sb.btrees_lost_data & b)) { struct printbuf buf = PRINTBUF; bch2_btree_id_to_text(&buf, btree); bch_err(c, "flagging btree %s lost data", buf.buf); printbuf_exit(&buf); - bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); + ext->btrees_lost_data |= cpu_to_le64(b); } + /* Once we have runtime self healing for topology errors we won't need this: */ + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; + + /* Btree node accounting will be off: */ + __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + +#ifdef CONFIG_BCACHEFS_DEBUG + /* + * These are much more minor, and don't need to be corrected right away, + * but in debug mode we want the next fsck run to be clean: + */ + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; +#endif + switch (btree) { case BTREE_ID_alloc: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); goto out; case BTREE_ID_backpointers: ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; @@ -75,7 +98,6 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) goto out; default: ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; goto out; } out: @@ -748,9 +770,6 @@ int bch2_fs_recovery(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - c->opts.recovery_passes |= 
BIT_ULL(BCH_RECOVERY_PASS_check_topology); - if (c->opts.fsck) set_bit(BCH_FS_fsck_running, &c->flags); if (c->sb.clean) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 89d9dc2c859b..917ef6aa4a23 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -72,12 +72,12 @@ enum bch_fsck_flags { x(btree_root_read_error, 59, FSCK_AUTOFIX) \ x(btree_root_bad_min_key, 60, 0) \ x(btree_root_bad_max_key, 61, 0) \ - x(btree_node_read_error, 62, 0) \ - x(btree_node_topology_bad_min_key, 63, 0) \ - x(btree_node_topology_bad_max_key, 64, 0) \ - x(btree_node_topology_overwritten_by_prev_node, 65, 0) \ - x(btree_node_topology_overwritten_by_next_node, 66, 0) \ - x(btree_node_topology_interior_node_empty, 67, 0) \ + x(btree_node_read_error, 62, FSCK_AUTOFIX) \ + x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \ + x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \ + x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \ + x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \ + x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \ x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ -- cgit v1.2.3 From 9963a14da1091c114764bec8723cf81677ca691e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Nov 2024 03:00:54 -0500 Subject: bcachefs: BCH_FS_recovery_running If we're autofixing topology errors, we shouldn't shutdown if we're still in recovery. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/error.c | 2 +- fs/bcachefs/recovery.c | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a85b3bcc6383..d88129503bc5 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -614,6 +614,7 @@ struct bch_dev { x(going_ro) \ x(write_disable_complete) \ x(clean_shutdown) \ + x(recovery_running) \ x(fsck_running) \ x(initial_gc_unfixed) \ x(need_delete_dead_snapshots) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 9a695322b33c..5b67361b0cf1 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -34,7 +34,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) int bch2_topology_error(struct bch_fs *c) { set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_fsck_running, &c->flags)) { + if (!test_bit(BCH_FS_recovery_running, &c->flags)) { bch2_inconsistent_error(c); return -BCH_ERR_btree_need_topology_repair; } else { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 64bb330eac86..c50dede64785 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -774,6 +774,7 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_fsck_running, &c->flags); if (c->sb.clean) set_bit(BCH_FS_clean_recovery, &c->flags); + set_bit(BCH_FS_recovery_running, &c->flags); ret = bch2_blacklist_table_initialize(c); if (ret) { @@ -925,6 +926,7 @@ use_clean: */ set_bit(BCH_FS_may_go_rw, &c->flags); clear_bit(BCH_FS_fsck_running, &c->flags); + clear_bit(BCH_FS_recovery_running, &c->flags); /* in case we don't run journal replay, i.e. 
norecovery mode */ set_bit(BCH_FS_accounting_replay_done, &c->flags); -- cgit v1.2.3 From b3d82c2f27611c897c3877a51de8f8df755165af Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Nov 2024 21:58:43 -0500 Subject: bcachefs: Guard against journal seq overflow Wraparound is impractical to handle since in various places we use 0 as a sentinel value - but 64 bits (or 56, because the btree write buffer steals a few bits) is enough for all practical purposes. Reported-by: syzbot+73ed43fbe826227bd4e0@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 9 +++++++++ fs/bcachefs/journal_types.h | 3 +++ 2 files changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 95cccda3b22c..dc66521964b7 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -382,6 +382,10 @@ static int journal_entry_open(struct journal *j) if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) return JOURNAL_ERR_max_in_flight; + if (bch2_fs_fatal_err_on(journal_cur_seq(j) >= JOURNAL_SEQ_MAX, + c, "cannot start: journal seq overflow")) + return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + BUG_ON(!j->cur_entry_sectors); buf->expires = @@ -1270,6 +1274,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) bool had_entries = false; u64 last_seq = cur_seq, nr, seq; + if (cur_seq >= JOURNAL_SEQ_MAX) { + bch_err(c, "cannot start: journal seq overflow"); + return -EINVAL; + } + genradix_for_each_reverse(&c->journal_entries, iter, _i) { i = *_i; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 425d1abb257e..e9bd716fbb71 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -9,6 +9,9 @@ #include "super_types.h" #include "fifo.h" +/* btree write buffer steals 8 bits for its own purposes: */ +#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) + #define JOURNAL_BUF_BITS 2 #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) -- cgit v1.2.3 From f9e0a9be70c8f790fe721ee2796995b06a314b7f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Nov 2024 22:09:29 -0500 Subject: bcachefs: Issue a transaction restart after commit in repair Transaction commits invalidate pointers to btree values, and they also downgrade intent locks. This breaks the interior btree update path, which takes intent locks and then calls into the allocator. This isn't an ideal solution: we can't unconditionally issue a restart after a transaction commit, because that would break other codepaths.
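The repair path touched below opts into that restart explicitly: after a mid-repair commit it returns the new transaction_restart_commit error, so the caller re-traverses with fresh pointers and freshly taken intent locks. A condensed view of the alloc_background.c hunk that follows (no new behaviour):

	/* Pointers into btree nodes and intent locks are stale after the commit;
	 * tell the caller to restart the transaction rather than continuing. */
	ret = bch2_btree_bit_mod_iter(trans, iter, false) ?:
	      bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
	      -BCH_ERR_transaction_restart_commit;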
Reported-by: syzbot+78d82470c16a49702682@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/errcode.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 8846daaa1162..79af226ca609 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1384,7 +1384,7 @@ delete: ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - 1; + -BCH_ERR_transaction_restart_commit; goto out; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a12050e9c191..a0cfc0f286f4 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -148,6 +148,7 @@ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ x(BCH_ERR_transaction_restart, transaction_restart_nested) \ + x(BCH_ERR_transaction_restart, transaction_restart_commit) \ x(0, no_btree_node) \ x(BCH_ERR_no_btree_node, no_btree_node_relock) \ x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ -- cgit v1.2.3 From f11ca2ab18e369a662c2f60d53b8bea46c54e312 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Nov 2024 22:29:54 -0500 Subject: bcachefs: Guard against backpointers to unknown btrees Reported-by: syzbot+997f0573004dcb964555@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 7 +++++-- fs/bcachefs/sb-errors_format.h | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index ff08afd667a0..702bf62d7fa7 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -249,9 +249,12 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct btree_iter *iter, unsigned iter_flags) { - if (likely(!bp.v->level)) { - struct bch_fs *c = trans->c; + struct bch_fs *c = trans->c; + if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) + return bkey_s_c_null; + + if (likely(!bp.v->level)) { bch2_trans_node_iter_init(trans, iter, bp.v->btree_id, bp.v->pos, diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 917ef6aa4a23..e73d1c60198e 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -67,7 +67,7 @@ enum bch_fsck_flags { x(btree_node_bkey_past_bset_end, 54, 0) \ x(btree_node_bkey_bad_format, 55, 0) \ x(btree_node_bad_bkey, 56, 0) \ - x(btree_node_bkey_out_of_order, 57, 0) \ + x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \ x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \ x(btree_root_read_error, 59, FSCK_AUTOFIX) \ x(btree_root_bad_min_key, 60, 0) \ -- cgit v1.2.3 From 0184dfa3b82111f147455bdbd82801843d306eb1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 15:10:24 -0500 Subject: bcachefs: Fix journal_iter list corruption Fix exiting an iterator that wasn't initialized. 
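The fix that follows moves the iterator setup ahead of the early-exit checks so the common teardown path never operates on uninitialized state. A minimal standalone illustration of the same pattern, with hypothetical names (iter_init/iter_exit/walk are not bcachefs functions):

	#include <stdio.h>
	#include <stdlib.h>

	struct iter { int *buf; };

	static void iter_init(struct iter *it) { it->buf = calloc(16, sizeof(*it->buf)); }
	static void iter_exit(struct iter *it) { free(it->buf); it->buf = NULL; }

	static int walk(int level)
	{
		struct iter it;
		int ret = 0;

		iter_init(&it);		/* initialize before any path that can reach "out" */

		if (level < 0) {
			ret = -1;
			goto out;	/* error paths used to reach teardown before setup */
		}

		for (int i = 0; level && i < 16; i++)
			it.buf[i] = i * level;
	out:
		iter_exit(&it);		/* teardown now always sees an initialized iterator */
		return ret;
	}

	int main(void)
	{
		printf("%d %d\n", walk(-1), walk(3));
		return 0;
	}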
Reported-by: syzbot+2f7c2225ed8a5cb24af1@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 56a70e95ef9a..5eabd532e388 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -58,6 +58,10 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, b->data->min_key)); + bch2_bkey_buf_init(&prev); + bkey_init(&prev.k->k); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + if (b == btree_node_root(c, b)) { if (!bpos_eq(b->data->min_key, POS_MIN)) { printbuf_reset(&buf); @@ -77,11 +81,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) } if (!b->c.level) - return 0; - - bch2_bkey_buf_init(&prev); - bkey_init(&prev.k->k); - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + goto out; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { if (k.k->type != KEY_TYPE_btree_ptr_v2) -- cgit v1.2.3 From 914381013bc759370b5b28b679bfbe5df6009f31 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 16:09:04 -0500 Subject: bcachefs: add missing printbuf_reset() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 19db4d8aca88..e59924cfe2bc 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -521,6 +521,7 @@ int bch2_check_topology(struct bch_fs *c) struct btree_root *r = bch2_btree_id_root(c, i); bool reconstructed_root = false; + printbuf_reset(&buf); bch2_btree_id_to_text(&buf, i); if (r->error) { -- cgit v1.2.3 From cfba90aba9fb858488ad035ce041f2fbda5e20f9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 16:09:15 -0500 Subject: bcachefs: mark more errors AUTOFIX mark errors as autofix where syzbot has hit the repair paths Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index e73d1c60198e..382fcafa815a 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -124,9 +124,9 @@ enum bch_fsck_flags { x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ x(bucket_sector_count_overflow, 112, 0) \ x(bucket_metadata_type_mismatch, 113, 0) \ - x(need_discard_key_wrong, 114, 0) \ - x(freespace_key_wrong, 115, 0) \ - x(freespace_hole_missing, 116, 0) \ + x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \ + x(freespace_key_wrong, 115, FSCK_AUTOFIX) \ + x(freespace_hole_missing, 116, FSCK_AUTOFIX) \ x(bucket_gens_val_size_bad, 117, 0) \ x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \ x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \ @@ -288,7 +288,7 @@ enum bch_fsck_flags { x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ x(snapshot_node_missing, 264, 0) \ x(dup_backpointer_to_bad_csum_extent, 265, 0) \ - x(btree_bitmap_not_marked, 266, 0) \ + x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ x(sb_clean_entry_overrun, 267, 0) \ x(btree_ptr_v2_written_0, 268, 0) \ x(subvol_snapshot_bad, 269, 0) \ -- cgit v1.2.3 From 052210c3fa1f8a964942cd71cb479b339537c69b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 16:14:06 -0500 Subject: 
bcachefs: Don't error out when logging fsck error Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 +++++--- fs/bcachefs/error.c | 29 +++++++++++++++++------------ fs/bcachefs/error.h | 14 +++++++++----- 3 files changed, 31 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 79af226ca609..6de0387ede99 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -676,6 +676,10 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans, set ? "" : "un", bch2_btree_id_str(btree), buf.buf); + if (ret == -BCH_ERR_fsck_ignore || + ret == -BCH_ERR_fsck_errors_not_fixed) + ret = 0; + printbuf_exit(&buf); return ret; } @@ -1901,10 +1905,8 @@ static int bch2_do_discards_fast_one(struct btree_trans *trans, if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set, trans, discarding_bucket_not_in_need_discard_btree, "attempting to discard bucket %u:%llu not in need_discard btree", - ca->dev_idx, bucket)) { - /* log it in the superblock and continue: */ + ca->dev_idx, bucket)) goto out; - } ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); out: diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 5b67361b0cf1..23b9ecbcf3cf 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -227,7 +227,7 @@ int __bch2_fsck_err(struct bch_fs *c, { struct fsck_err_state *s = NULL; va_list args; - bool print = true, suppressing = false, inconsistent = false; + bool print = true, suppressing = false, inconsistent = false, exiting = false; struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; const char *action_orig = "fix?", *action = action_orig; @@ -320,13 +320,19 @@ int __bch2_fsck_err(struct bch_fs *c, prt_printf(out, bch2_log_msg(c, "")); #endif - if ((flags & FSCK_CAN_FIX) && - (flags & FSCK_AUTOFIX) && + if ((flags & FSCK_AUTOFIX) && (c->opts.errors == BCH_ON_ERROR_continue || c->opts.errors == BCH_ON_ERROR_fix_safe)) { prt_str(out, ", "); - prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + if (flags & FSCK_CAN_FIX) { + prt_actioning(out, action); + ret = -BCH_ERR_fsck_fix; + } else { + prt_str(out, ", continuing"); + ret = -BCH_ERR_fsck_ignore; + } + + goto print; } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { @@ -396,14 +402,13 @@ int __bch2_fsck_err(struct bch_fs *c, !(flags & FSCK_CAN_IGNORE))) ret = -BCH_ERR_fsck_errors_not_fixed; - bool exiting = - test_bit(BCH_FS_fsck_running, &c->flags) && - (ret != -BCH_ERR_fsck_fix && - ret != -BCH_ERR_fsck_ignore); - - if (exiting) + if (test_bit(BCH_FS_fsck_running, &c->flags) && + (ret != -BCH_ERR_fsck_fix && + ret != -BCH_ERR_fsck_ignore)) { + exiting = true; print = true; - + } +print: if (print) { if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s\n", out->buf); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 3b278f28e56b..12ca5287e20a 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -45,12 +45,11 @@ int bch2_topology_error(struct bch_fs *); bch2_inconsistent_error(c); \ }) -#define bch2_fs_inconsistent_on(cond, c, ...) \ +#define bch2_fs_inconsistent_on(cond, ...) \ ({ \ bool _ret = unlikely(!!(cond)); \ - \ if (_ret) \ - bch2_fs_inconsistent(c, __VA_ARGS__); \ + bch2_fs_inconsistent(__VA_ARGS__); \ _ret; \ }) @@ -146,8 +145,13 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define log_fsck_err(c, _err_type, ...) 
\ __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) -#define log_fsck_err_on(cond, c, _err_type, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) +#define log_fsck_err_on(cond, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + if (_ret) \ + log_fsck_err(__VA_ARGS__); \ + _ret; \ +}) enum bch_validate_flags; __printf(5, 6) -- cgit v1.2.3 From 8b105909182fed845bdf1eb4313896c6a5d04a10 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 16:25:41 -0500 Subject: bcachefs: do_fsck_ask_yn() __bch2_fsck_err() is huge, and badly needs more refactoring Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 59 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 23b9ecbcf3cf..0517782ca57a 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -219,6 +219,30 @@ static const u8 fsck_flags_extra[] = { #undef x }; +static int do_fsck_ask_yn(struct bch_fs *c, + struct btree_trans *trans, + struct printbuf *question, + const char *action) +{ + prt_str(question, ", "); + prt_str(question, action); + + if (bch2_fs_stdio_redirect(c)) + bch2_print(c, "%s", question->buf); + else + bch2_print_string_as_lines(KERN_ERR, question->buf); + + int ask = bch2_fsck_ask_yn(c, trans); + + if (trans) { + int ret = bch2_trans_relock(trans); + if (ret) + return ret; + } + + return ask; +} + int __bch2_fsck_err(struct bch_fs *c, struct btree_trans *trans, enum bch_fsck_flags flags, @@ -291,16 +315,14 @@ int __bch2_fsck_err(struct bch_fs *c, */ if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { ret = s->ret; - mutex_unlock(&c->fsck_error_msgs_lock); - goto err; + goto err_unlock; } kfree(s->last_msg); s->last_msg = kstrdup(buf.buf, GFP_KERNEL); if (!s->last_msg) { - mutex_unlock(&c->fsck_error_msgs_lock); ret = -ENOMEM; - goto err; + goto err_unlock; } if (c->opts.ratelimit_errors && @@ -356,31 +378,18 @@ int __bch2_fsck_err(struct bch_fs *c, : c->opts.fix_errors; if (fix == FSCK_FIX_ask) { - prt_str(out, ", "); - prt_str(out, action); - - if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s", out->buf); - else - bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - int ask = bch2_fsck_ask_yn(c, trans); - - if (trans) { - ret = bch2_trans_relock(trans); - if (ret) { - mutex_unlock(&c->fsck_error_msgs_lock); - goto err; - } - } + ret = do_fsck_ask_yn(c, trans, out, action); + if (ret < 0) + goto err_unlock; - if (ask >= YN_ALLNO && s) - s->fix = ask == YN_ALLNO + if (ret >= YN_ALLNO && s) + s->fix = ret == YN_ALLNO ? FSCK_FIX_no : FSCK_FIX_yes; - ret = ask & 1 + ret = ret & 1 ? -BCH_ERR_fsck_fix : -BCH_ERR_fsck_ignore; } else if (fix == FSCK_FIX_yes || @@ -424,8 +433,6 @@ print: if (s) s->ret = ret; - mutex_unlock(&c->fsck_error_msgs_lock); - if (inconsistent) bch2_inconsistent_error(c); @@ -442,6 +449,8 @@ print: set_bit(BCH_FS_error, &c->flags); } } +err_unlock: + mutex_unlock(&c->fsck_error_msgs_lock); err: if (action != action_orig) kfree(action); -- cgit v1.2.3 From 0eafe758ac440fd285eb8de854c4b673886e9eea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 16:59:40 -0500 Subject: bcachefs: Check for bucket journal seq in the future This fixes an assertion pop in bch2_journal_noflush_seq() - log the error to the superblock and continue instead. 
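The rewritten trigger below keys everything off whether the bucket's emptiness changed; the sign convention is easy to misread, so it is spelled out here from the hunk that follows (no new logic):

	/* is_empty_delta = (int) data_type_is_empty(new) - (int) data_type_is_empty(old)
	 *   > 0   bucket went nonempty -> empty: clear journal_seq if it was already
	 *         flushed, otherwise mark the bucket as needing a journal commit
	 *   < 0   bucket went empty -> nonempty: record the journal seq of the transition
	 *   == 0  emptiness unchanged: nothing to do here
	 */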
Reported-by: syzbot+85700120f75fc10d4e18@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 63 +++++++++++++++++++++++------------------- fs/bcachefs/sb-errors_format.h | 3 +- 2 files changed, 37 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 6de0387ede99..e8c246e5803c 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -926,37 +926,43 @@ int bch2_trigger_alloc(struct btree_trans *trans, } if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { - u64 journal_seq = trans->journal_res.seq; - u64 bucket_journal_seq = new_a->journal_seq; + u64 transaction_seq = trans->journal_res.seq; - if ((flags & BTREE_TRIGGER_insert) && - data_type_is_empty(old_a->data_type) != - data_type_is_empty(new_a->data_type) && - new.k->type == KEY_TYPE_alloc_v4) { - struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v; + if (log_fsck_err_on(transaction_seq && new_a->journal_seq > transaction_seq, + trans, alloc_key_journal_seq_in_future, + "bucket journal seq in future (currently at %llu)\n%s", + journal_cur_seq(&c->journal), + (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf))) + new_a->journal_seq = transaction_seq; - /* - * If the btree updates referring to a bucket weren't flushed - * before the bucket became empty again, then the we don't have - * to wait on a journal flush before we can reuse the bucket: - */ - v->journal_seq = bucket_journal_seq = - data_type_is_empty(new_a->data_type) && - (journal_seq == v->journal_seq || - bch2_journal_noflush_seq(&c->journal, v->journal_seq)) - ? 0 : journal_seq; - } + int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - + (int) data_type_is_empty(old_a->data_type); - if (!data_type_is_empty(old_a->data_type) && - data_type_is_empty(new_a->data_type) && - bucket_journal_seq) { - ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - new.k->p.inode, new.k->p.offset, - bucket_journal_seq); - if (bch2_fs_fatal_err_on(ret, c, - "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) - goto err; + /* Record journal sequence number of empty -> nonempty transition: */ + if (is_empty_delta < 0) + new_a->journal_seq = max(new_a->journal_seq, transaction_seq); + + /* + * Bucket becomes empty: mark it as waiting for a journal flush, + * unless updates since empty -> nonempty transition were never + * flushed - we may need to ask the journal not to flush + * intermediate sequence numbers: + */ + if (is_empty_delta > 0) { + if (new_a->journal_seq == transaction_seq || + bch2_journal_noflush_seq(&c->journal, new_a->journal_seq)) + new_a->journal_seq = 0; + else { + new_a->journal_seq = transaction_seq; + + ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + new.k->p.inode, new.k->p.offset, + transaction_seq); + if (bch2_fs_fatal_err_on(ret, c, + "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) + goto err; + } } if (new_a->gen != old_a->gen) { @@ -1004,6 +1010,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, rcu_read_unlock(); } err: +fsck_err: printbuf_exit(&buf); bch2_dev_put(ca); return ret; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 382fcafa815a..8e3a6c5da10d 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -122,6 +122,7 @@ enum bch_fsck_flags { x(alloc_key_cached_sectors_wrong, 109, 
FSCK_AUTOFIX) \ x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ + x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \ x(bucket_sector_count_overflow, 112, 0) \ x(bucket_metadata_type_mismatch, 113, 0) \ x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \ @@ -308,7 +309,7 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 298, 0) + x(MAX, 299, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, -- cgit v1.2.3 From 4746ee182a76bbe4dd847180b0c8b575756b0d0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 17:48:20 -0500 Subject: bcachefs: Check for inode journal seq in the future More check and repair code: this fixes a warning in bch2_journal_flush_seq_async() Reported-by: syzbot+d119b445ec739e7f3068@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 35 ++++++++++++++++++++++++++++++++--- fs/bcachefs/fsck.c | 13 ++++++++++++- fs/bcachefs/sb-errors_format.h | 3 ++- 3 files changed, 46 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c6fdfec51082..33d0e7080bf6 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -167,6 +167,34 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* fsync: */ +static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum, + u64 *seq) +{ + struct printbuf buf = PRINTBUF; + struct bch_inode_unpacked u; + struct btree_iter iter; + int ret = bch2_inode_peek(trans, &iter, &u, inum, 0); + if (ret) + return ret; + + u64 cur_seq = journal_cur_seq(&trans->c->journal); + *seq = min(cur_seq, u.bi_journal_seq); + + if (fsck_err_on(u.bi_journal_seq > cur_seq, + trans, inode_journal_seq_in_future, + "inode journal seq in future (currently at %llu)\n%s", + cur_seq, + (bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { + u.bi_journal_seq = cur_seq; + ret = bch2_inode_write(trans, &iter, &u); + } +fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + /* * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an * insert trigger: look up the btree inode instead @@ -180,9 +208,10 @@ static int bch2_flush_inode(struct bch_fs *c, if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) return -EROFS; - struct bch_inode_unpacked u; - int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: - bch2_journal_flush_seq(&c->journal, u.bi_journal_seq, TASK_INTERRUPTIBLE) ?: + u64 seq; + int ret = bch2_trans_commit_do(c, NULL, NULL, 0, + bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: + bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: bch2_inode_flush_nocow_writes(c, inode); bch2_write_ref_put(c, BCH_WRITE_REF_fsync); return ret; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e10abd2e6c69..f2174528ee5f 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1392,7 +1392,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(!ret, trans, inode_unlinked_and_not_open, - "inode %llu%u unlinked and not open", + "inode %llu:%u unlinked and not open", u.bi_inum, u.bi_snapshot)) { ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck deleting inode"); @@ -1441,6 +1441,17 @@ static int check_inode(struct btree_trans *trans, do_update = true; } } + + if 
(fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal), + trans, inode_journal_seq_in_future, + "inode journal seq in future (currently at %llu)\n%s", + journal_cur_seq(&c->journal), + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { + u.bi_journal_seq = journal_cur_seq(&c->journal); + do_update = true; + } do_update: if (do_update) { ret = __bch2_fsck_write_inode(trans, &u); diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 8e3a6c5da10d..342eda8ab69f 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -233,6 +233,7 @@ enum bch_fsck_flags { x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ x(inode_has_child_snapshots_wrong, 287, 0) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \ + x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ @@ -309,7 +310,7 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 299, 0) + x(MAX, 300, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, -- cgit v1.2.3 From f7f196170dcd7c629126ee9d37be5dbdb6d4f941 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 17:57:55 -0500 Subject: bcachefs: cryptographic MACs on superblock are not (yet?) supported We should add support for cryptographic macs on the superblock - and it won't be hard, but it'll need an incompatible feature bit (and we have a new incompatible feature versioning scheme coming). For now, just add a guard to avoid a null ptr deref in gen_poly_key(). Reported-by: syzbot+dd3d9835055dacb66f35@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 4c29f8215d54..6a086c1c4b14 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -677,7 +677,8 @@ reread: } enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb); - if (csum_type >= BCH_CSUM_NR) { + if (csum_type >= BCH_CSUM_NR || + bch2_csum_type_is_encryption(csum_type)) { prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); return -BCH_ERR_invalid_sb_csum_type; } -- cgit v1.2.3 From ff1dd05f82338cc0be15285035b55b517b4c64a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 18:05:06 -0500 Subject: bcachefs: bch2_trans_relock() is trylock for lockdep fix some spurious lockdep splats Reported-by: syzbot+e088be3c2d5c05aaac35@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 ++++---- fs/bcachefs/btree_locking.c | 2 +- fs/bcachefs/btree_locking.h | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 80c3b55ce763..9c54891c737a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1007,7 +1007,7 @@ retry_all: bch2_trans_unlock(trans); cond_resched(); - trans_set_locked(trans); + trans_set_locked(trans, false); if (unlikely(trans->memory_allocation_failure)) { struct closure cl; @@ -3248,7 +3248,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->last_begin_ip = _RET_IP_; - trans_set_locked(trans); + trans_set_locked(trans, false); if (trans->restarted) { bch2_btree_path_traverse_all(trans); @@
-3354,7 +3354,7 @@ got_trans: trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; - trans_set_locked(trans); + trans_set_locked(trans, false); closure_init_stack_release(&trans->ref); return trans; @@ -3622,7 +3622,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) #ifdef CONFIG_LOCKDEP fs_reclaim_acquire(GFP_KERNEL); struct btree_trans *trans = bch2_trans_get(c); - trans_set_locked(trans); + trans_set_locked(trans, false); bch2_trans_put(trans); fs_reclaim_release(GFP_KERNEL); #endif diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index efe2a007b482..d343df9f0ad2 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -782,7 +782,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) return bch2_trans_relock_fail(trans, path, &f, trace); } - trans_set_locked(trans); + trans_set_locked(trans, true); out: bch2_trans_verify_locks(trans); return 0; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index ca4aeefd631e..7474ab6ce019 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -188,10 +188,10 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); /* lock: */ -static inline void trans_set_locked(struct btree_trans *trans) +static inline void trans_set_locked(struct btree_trans *trans, bool try) { if (!trans->locked) { - lock_acquire_exclusive(&trans->dep_map, 0, 0, NULL, _THIS_IP_); + lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_); trans->locked = true; trans->last_unlock_ip = 0; -- cgit v1.2.3 From 9bdb3b73e73203546107eb11a4d8bb3ad3f48851 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 19:02:18 -0500 Subject: bcachefs: Check for extent crc uncompressed/compressed size mismatch When not compressed, these must be equal - this fixes an assertion pop in bch2_rechecksum_bio(). 
Reported-by: syzbot+50d3544c9b8db9c99fd2@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 22 +++++++++++++--------- fs/bcachefs/sb-errors_format.h | 5 +++-- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index aa3b88291814..2fc9ace5533c 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1323,9 +1323,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, - c, ptr_crc_uncompressed_size_too_small, - "checksum offset + key size > uncompressed size"); bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, ptr_crc_csum_type_unknown, "invalid checksum type"); @@ -1333,6 +1330,19 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, c, ptr_crc_compression_type_unknown, "invalid compression type"); + bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, + c, ptr_crc_uncompressed_size_too_small, + "checksum offset + key size > uncompressed size"); + bkey_fsck_err_on(crc_is_encoded(crc) && + (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && + (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), + c, ptr_crc_uncompressed_size_too_big, + "too large encoded extent"); + bkey_fsck_err_on(!crc_is_compressed(crc) && + crc.compressed_size != crc.uncompressed_size, + c, ptr_crc_uncompressed_size_mismatch, + "not compressed but compressed != uncompressed size"); + if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; @@ -1346,12 +1356,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, "redundant crc entry"); crc_since_last_ptr = true; - bkey_fsck_err_on(crc_is_encoded(crc) && - (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), - c, ptr_crc_uncompressed_size_too_big, - "too large encoded extent"); - size_ondisk = crc.compressed_size; break; case BCH_EXTENT_ENTRY_stripe_ptr: diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 342eda8ab69f..3bbda181f314 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -172,10 +172,11 @@ enum bch_fsck_flags { x(ptr_bucket_data_type_mismatch, 155, 0) \ x(ptr_cached_and_erasure_coded, 156, 0) \ x(ptr_crc_uncompressed_size_too_small, 157, 0) \ + x(ptr_crc_uncompressed_size_too_big, 161, 0) \ + x(ptr_crc_uncompressed_size_mismatch, 300, 0) \ x(ptr_crc_csum_type_unknown, 158, 0) \ x(ptr_crc_compression_type_unknown, 159, 0) \ x(ptr_crc_redundant, 160, 0) \ - x(ptr_crc_uncompressed_size_too_big, 161, 0) \ x(ptr_crc_nonce_mismatch, 162, 0) \ x(ptr_stripe_redundant, 163, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ @@ -310,7 +311,7 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 300, 0) + x(MAX, 301, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, -- cgit v1.2.3 From 2cd85fea49d850629404d4668e104466114598e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 19:30:23 -0500 Subject: bcachefs: Don't recurse in check_discard_freespace_key When calling check_discard_freespace_key from the allocator, we can't repair without recursing - run it asynchronously instead. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 72 +++++++++++++++++++++++++++++++++++++----- fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/bcachefs.h | 1 + 4 files changed, 67 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e8c246e5803c..b2d570453351 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1338,7 +1338,40 @@ fsck_err: return ret; } -int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen) +struct check_discard_freespace_key_async { + struct work_struct work; + struct bch_fs *c; + struct bbpos pos; +}; + +static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + u8 gen; + ret = k.k->type != KEY_TYPE_set + ? bch2_check_discard_freespace_key(trans, &iter, &gen, false) + : 0; + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static void check_discard_freespace_key_work(struct work_struct *work) +{ + struct check_discard_freespace_key_async *w = + container_of(work, struct check_discard_freespace_key_async, work); + + bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); + bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key); + kfree(w); +} + +int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, + bool async_repair) { struct bch_fs *c = trans->c; enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard @@ -1351,7 +1384,8 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite u64 genbits = iter->pos.offset & (~0ULL << 56); struct btree_iter alloc_iter; - struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, BTREE_ITER_cached); + struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, + BTREE_ID_alloc, bucket, BTREE_ITER_cached); int ret = bkey_err(alloc_k); if (ret) return ret; @@ -1392,17 +1426,39 @@ fsck_err: printbuf_exit(&buf); return ret; delete: - ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_commit; - goto out; + if (!async_repair) { + ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_commit; + goto out; + } else { + /* + * We can't repair here when called from the allocator path: the + * commit will recurse back into the allocator + */ + struct check_discard_freespace_key_async *w = + kzalloc(sizeof(*w), GFP_KERNEL); + if (!w) + goto out; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) { + kfree(w); + goto out; + } + + INIT_WORK(&w->work, check_discard_freespace_key_work); + w->c = c; + w->pos = BBPOS(iter->btree_id, iter->pos); + queue_work(c->write_ref_wq, &w->work); + goto out; + } } static int
bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter) { u8 gen; - int ret = bch2_check_discard_freespace_key(trans, iter, &gen); + int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false); return ret < 0 ? ret : 0; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 8cacddd188f4..de25ba4ee94b 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -310,7 +310,7 @@ int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); -int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *); +int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_dev_do_discards(struct bch_dev *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 4d1ff7f1f302..c40a76df76b8 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -281,7 +281,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); u8 gen; - int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen); + int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); if (ret < 0) return ERR_PTR(ret); if (ret) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d88129503bc5..c16937e54734 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -680,6 +680,7 @@ struct btree_trans_buf { x(dio_write) \ x(discard) \ x(discard_fast) \ + x(check_discard_freespace_key) \ x(invalidate) \ x(delete_dead_snapshots) \ x(gc_gens) \ -- cgit v1.2.3 From 2f8d5edf556a7499a188c8b38a650bed5aebcdbf Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 29 Nov 2024 14:38:27 +0800 Subject: bcachefs: Add missing parameter description to bch2_bucket_alloc_trans() The function bch2_bucket_alloc_trans() lacked a description for the nowait parameter in its documentation comment block. This patch adds the missing description to ensure all parameters are properly documented. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=12179 Signed-off-by: Yang Li Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index c40a76df76b8..095bfe7c53bd 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -505,6 +505,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, * @watermark: how important is this allocation? * @data_type: BCH_DATA_journal, btree, user... * @cl: if not NULL, closure to be used to wait if buckets not available + * @nowait: if true, do not wait for buckets to become available * @usage: for secondarily also returning the current device usage * * Returns: an open_bucket on success, or an ERR_PTR() on failure. 
-- cgit v1.2.3 From c9b9afe78c436bb6d64ac0d3fa255c9cc232f2a9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2024 21:12:47 -0500 Subject: bcachefs: Fix fsck.c build in userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f2174528ee5f..cc15ff135cd6 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3206,6 +3206,8 @@ int bch2_fix_reflink_p(struct bch_fs *c) return ret; } +#ifndef NO_BCACHEFS_CHARDEV + struct fsck_thread { struct thread_with_stdio thr; struct bch_fs *c; @@ -3421,3 +3423,5 @@ err: } return ret; } + +#endif /* NO_BCACHEFS_CHARDEV */ -- cgit v1.2.3 From f7727a6767277a9d939f9ab3720eb327973a262e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Sep 2024 15:40:49 -0400 Subject: bcachefs: bch2_inum_to_path() Add a function for walking backpointers to find a path from a given inode number, and convert various error messages to use it. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/error.c | 34 ++++++++ fs/bcachefs/error.h | 6 ++ fs/bcachefs/fs-common.c | 81 +++++++++++++++++++ fs/bcachefs/fs-common.h | 2 + fs/bcachefs/fs-io-buffered.c | 10 ++- fs/bcachefs/fsck.c | 12 ++- fs/bcachefs/io_misc.c | 12 +-- fs/bcachefs/io_read.c | 185 +++++++++++++++++++++++++++++++------------ 9 files changed, 279 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a0cfc0f286f4..47387f7d6202 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -116,6 +116,7 @@ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ + x(ENOENT, ENOENT_inode_no_backpointer) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(EEXIST, EEXIST_str_hash_set) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 0517782ca57a..abaa9570cd62 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -3,6 +3,7 @@ #include "btree_cache.h" #include "btree_iter.h" #include "error.h" +#include "fs-common.h" #include "journal.h" #include "recovery_passes.h" #include "super.h" @@ -515,3 +516,36 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_unlock(&c->fsck_error_msgs_lock); } + +int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) +{ + u32 restart_count = trans->restart_count; + int ret = 0; + + /* XXX: we don't yet attempt to print paths when we don't know the subvol */ + if (inum.subvol) + ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); + if (!inum.subvol || ret) + prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); + + return trans_was_restarted(trans, restart_count); +} + +int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + subvol_inum inum, u64 offset) +{ + int ret = bch2_inum_err_msg_trans(trans, out, inum); + prt_printf(out, " offset %llu: ", offset); + return ret; +} + +void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) +{ + bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); +} + +void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, + subvol_inum inum, u64 offset) +{ + bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 12ca5287e20a..7acf2a27ca28 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ 
-238,4 +238,10 @@ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); _ret; \ }) +int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum); +int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); + +void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); +void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); + #endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 7e10a9ddcfd9..dcaa47f68f31 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -548,3 +548,84 @@ err: bch2_trans_iter_exit(trans, &src_dir_iter); return ret; } + +static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) +{ + bch2_printbuf_make_room(out, n); + + unsigned can_print = min(n, printbuf_remaining(out)); + + b += n; + + for (unsigned i = 0; i < can_print; i++) + out->buf[out->pos++] = *((char *) --b); + + printbuf_nul_terminate(out); +} + +static inline void reverse_bytes(void *b, size_t n) +{ + char *e = b + n, *s = b; + + while (s < e) { + --e; + swap(*s, *e); + s++; + } +} + +/* XXX: we don't yet attempt to print paths when we don't know the subvol */ +int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path) +{ + unsigned orig_pos = path->pos; + int ret = 0; + + while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL && + inum.inum == BCACHEFS_ROOT_INO)) { + struct bch_inode_unpacked inode; + ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); + if (ret) + goto err; + + if (!inode.bi_dir && !inode.bi_dir_offset) { + ret = -BCH_ERR_ENOENT_inode_no_backpointer; + goto err; + } + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + struct btree_iter d_iter; + struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, + BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot), + 0, dirent); + ret = bkey_err(d.s_c); + if (ret) + goto err; + + struct qstr dirent_name = bch2_dirent_get_name(d); + prt_bytes_reversed(path, dirent_name.name, dirent_name.len); + + prt_char(path, '/'); + + if (d.v->d_type == DT_SUBVOL) + inum.subvol = le32_to_cpu(d.v->d_parent_subvol); + inum.inum = d.k->p.inode; + + bch2_trans_iter_exit(trans, &d_iter); + } + + if (orig_pos == path->pos) + prt_char(path, '/'); + + ret = path->allocation_failure ? 
-ENOMEM : 0; + if (ret) + goto err; + + reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); + return 0; +err: + return ret; +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index c934e807b380..2b59210bb5e8 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -42,4 +42,6 @@ int bch2_rename_trans(struct btree_trans *, bool bch2_reinherit_attrs(struct bch_inode_unpacked *, struct bch_inode_unpacked *); +int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); + #endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index d55e215e8aa6..ff8b8df50bf3 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -231,10 +231,12 @@ err: bch2_trans_iter_exit(trans, &iter); if (ret) { - bch_err_inum_offset_ratelimited(c, - iter.pos.inode, - iter.pos.offset << 9, - "read error %i from btree lookup", ret); + struct printbuf buf = PRINTBUF; + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9); + prt_printf(&buf, "read error %i from btree lookup", ret); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index cc15ff135cd6..1a5a07112779 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -212,6 +212,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, { struct bch_fs *c = trans->c; struct qstr lostfound_str = QSTR("lost+found"); + struct btree_iter lostfound_iter = { NULL }; u64 inum = 0; unsigned d_type = 0; int ret; @@ -290,11 +291,16 @@ create_lostfound: * XXX: we could have a nicer log message here if we had a nice way to * walk backpointers to print a path */ - bch_notice(c, "creating lost+found in subvol %llu snapshot %u", - root_inum.subvol, le32_to_cpu(st.root_snapshot)); + struct printbuf path = PRINTBUF; + ret = bch2_inum_to_path(trans, root_inum, &path); + if (ret) + goto err; + + bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u", + path.buf, root_inum.subvol, snapshot); + printbuf_exit(&path); u64 now = bch2_current_time(c); - struct btree_iter lostfound_iter = { NULL }; u64 cpu = raw_smp_processor_id(); bch2_inode_init_early(c, lostfound); diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 524e31e7411b..5353979117b0 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -113,11 +113,13 @@ int bch2_extent_fallocate(struct btree_trans *trans, err: if (!ret && sectors_allocated) bch2_increment_clock(c, sectors_allocated, WRITE); - if (should_print_err(ret)) - bch_err_inum_offset_ratelimited(c, - inum.inum, - iter->pos.offset << 9, - "%s(): error: %s", __func__, bch2_err_str(ret)); + if (should_print_err(ret)) { + struct printbuf buf = PRINTBUF; + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9); + prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } err_noprint: bch2_open_buckets_put(c, &open_buckets); bch2_disk_reservation_put(c, &disk_res); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 4b6b6d25725b..34a3569d085a 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -322,6 +322,20 @@ nopromote: /* Read */ +static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_read_bio *rbio, struct bpos read_pos) +{ + return bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { 
rbio->subvol, read_pos.inode }, + read_pos.offset << 9); +} + +static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, + struct bch_read_bio *rbio, struct bpos read_pos) +{ + bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); +} + #define READ_RETRY_AVOID 1 #define READ_RETRY 2 #define READ_ERR 3 @@ -500,6 +514,29 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, } } +static void bch2_read_io_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bio *bio = &rbio->bio; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); + + if (ca) { + bch2_io_error(ca, BCH_MEMBER_ERROR_read); + bch_err_ratelimited(ca, "%s", buf.buf); + } else { + bch_err_ratelimited(c, "%s", buf.buf); + } + + printbuf_exit(&buf); + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); +} + static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_read_bio *rbio) { @@ -563,6 +600,73 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) __bch2_rbio_narrow_crcs(trans, rbio)); } +static void bch2_read_csum_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bio *src = &rbio->bio; + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "data "); + bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); + + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) { + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + bch_err_ratelimited(ca, "%s", buf.buf); + } else { + bch_err_ratelimited(c, "%s", buf.buf); + } + + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + printbuf_exit(&buf); +} + +static void bch2_read_decompress_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "decompression error"); + + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) + bch_err_ratelimited(ca, "%s", buf.buf); + else + bch_err_ratelimited(c, "%s", buf.buf); + + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + printbuf_exit(&buf); +} + +static void bch2_read_decrypt_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "decrypt error"); + + struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) + bch_err_ratelimited(ca, "%s", buf.buf); + else + bch_err_ratelimited(c, "%s", buf.buf); + + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + printbuf_exit(&buf); +} + /* Inner part that may run in process context */ static void __bch2_read_endio(struct work_struct *work) { @@ -669,33 +773,13 @@ csum_err: goto out; } - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_str(&buf, "data "); - bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); - - struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) { - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "data %s", buf.buf); - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - } - printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decompression_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decompression error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decrypt_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decrypt error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; } @@ -716,16 +800,8 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bio->bi_status) { - if (ca) { - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset, - "data read error: %s", - bch2_blk_status_to_str(bio->bi_status)); - bch2_io_error(ca, BCH_MEMBER_ERROR_read); - } - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + if (unlikely(bio->bi_status)) { + bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); return; } @@ -832,25 +908,22 @@ retry_pick: if (unlikely(pick_ret < 0)) { struct printbuf buf = PRINTBUF; + bch2_read_err_msg_trans(trans, &buf, orig, read_pos); + prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret)); bch2_bkey_val_to_text(&buf, c, k); - bch_err_inum_offset_ratelimited(c, - read_pos.inode, read_pos.offset << 9, - "no device to read from: %s\n %s", - bch2_err_str(pick_ret), - buf.buf); + bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); goto err; } if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { struct printbuf buf = PRINTBUF; + bch2_read_err_msg_trans(trans, &buf, orig, read_pos); + prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); bch2_bkey_val_to_text(&buf, c, k); - bch_err_inum_offset_ratelimited(c, - read_pos.inode, read_pos.offset << 9, - "attempting to read encrypted data without encryption key\n %s", - buf.buf); + bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); goto err; } @@ -1036,11 +1109,15 @@ get_bio: } if (!rbio->pick.idx) { - if (!rbio->have_ioref) { - bch_err_inum_offset_ratelimited(c, - read_pos.inode, - read_pos.offset << 9, - "no device to read from"); + if (unlikely(!rbio->have_ioref)) { + struct printbuf buf = PRINTBUF; + bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); + prt_printf(&buf, "no device to read from:\n "); + bch2_bkey_val_to_text(&buf, c, k); + + bch_err_ratelimited(c, "%s", buf.buf); + 
printbuf_exit(&buf); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } @@ -1202,16 +1279,20 @@ err: } bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - bch2_bkey_buf_exit(&sk, c); if (ret) { - bch_err_inum_offset_ratelimited(c, inum.inum, - bvec_iter.bi_sector << 9, - "read error %i from btree lookup", ret); + struct printbuf buf = PRINTBUF; + bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9); + prt_printf(&buf, "read error %i from btree lookup", ret); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + rbio->bio.bi_status = BLK_STS_IOERR; bch2_rbio_done(rbio); } + + bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); } void bch2_fs_io_read_exit(struct bch_fs *c) -- cgit v1.2.3 From 097cc9d0d60a3671fd1adcda9b7c0324908e3fd7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Nov 2024 23:08:57 -0500 Subject: bcachefs: Convert write path errors to inum_to_path() Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 91 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index f97ebb30f6c0..bae045e76055 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -396,6 +396,21 @@ static int bch2_write_index_default(struct bch_write_op *op) /* Writes */ +static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, + u64 offset) +{ + bch2_inum_offset_err_msg(op->c, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + prt_printf(out, "write error%s: ", + op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); +} + +static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) +{ + __bch2_write_op_error(out, op, op->pos.offset); +} + void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k, @@ -532,14 +547,14 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); - if (ret && !bch2_err_matches(ret, EROFS)) { + if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - bch_err_inum_offset_ratelimited(c, - insert->k.p.inode, insert->k.p.offset << 9, - "%s write error while doing btree update: %s", - op->flags & BCH_WRITE_MOVE ? "move" : "user", - bch2_err_str(ret)); + struct printbuf buf = PRINTBUF; + __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); + prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); } if (ret) @@ -1081,11 +1096,14 @@ do_write: *_dst = dst; return more; csum_err: - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)", - op->flags & BCH_WRITE_MOVE ? 
"move" : "user"); + { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } + ret = -EIO; err: if (to_wbio(dst)->bounce) @@ -1176,11 +1194,11 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) if (ret && !bch2_err_matches(ret, EROFS)) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - bch_err_inum_offset_ratelimited(c, - insert->k.p.inode, insert->k.p.offset << 9, - "%s write error while doing btree update: %s", - op->flags & BCH_WRITE_MOVE ? "move" : "user", - bch2_err_str(ret)); + struct printbuf buf = PRINTBUF; + __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); + prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); } if (ret) { @@ -1340,17 +1358,19 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + bch2_trans_put(trans); + darray_exit(&buckets); + if (ret) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, op->pos.offset << 9, - "%s: btree lookup error %s", __func__, bch2_err_str(ret)); + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); op->error = ret; op->flags |= BCH_WRITE_SUBMITTED; } - bch2_trans_put(trans); - darray_exit(&buckets); - /* fallback to cow write path? */ if (!(op->flags & BCH_WRITE_SUBMITTED)) { closure_sync(&op->cl); @@ -1463,14 +1483,14 @@ err: if (ret <= 0) { op->flags |= BCH_WRITE_SUBMITTED; - if (ret < 0) { - if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s(): %s error: %s", __func__, - op->flags & BCH_WRITE_MOVE ? "move" : "user", - bch2_err_str(ret)); + if (unlikely(ret < 0)) { + if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } op->error = ret; break; } @@ -1596,12 +1616,11 @@ CLOSURE_CALLBACK(bch2_write) bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(bio)->put_bio = false; - if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s write error: misaligned write", - op->flags & BCH_WRITE_MOVE ? 
"move" : "user"); + if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "misaligned write"); + printbuf_exit(&buf); op->error = -EIO; goto err; } -- cgit v1.2.3 From 7807b5b07de1d009275e00b7fa51db31071d57a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2024 19:13:54 -0500 Subject: bcachefs: list_pop_entry() Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 6 ++---- fs/bcachefs/io_write.c | 4 +--- fs/bcachefs/util.h | 13 +++++++++++++ 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index eaca4c39d703..df9c0e453391 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -2456,11 +2456,9 @@ void bch2_fs_ec_exit(struct bch_fs *c) while (1) { mutex_lock(&c->ec_stripe_head_lock); - h = list_first_entry_or_null(&c->ec_stripe_head_list, - struct ec_stripe_head, list); - if (h) - list_del(&h->list); + h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list); mutex_unlock(&c->ec_stripe_head_lock); + if (!h) break; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index bae045e76055..3e71860f66b9 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -637,9 +637,7 @@ void bch2_write_point_do_index_updates(struct work_struct *work) while (1) { spin_lock_irq(&wp->writes_lock); - op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); - if (op) - list_del(&op->wp_list); + op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list); wp_update_state(wp, op != NULL); spin_unlock_irq(&wp->writes_lock); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index fb02c1c36004..5e4820c8fa44 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -317,6 +317,19 @@ do { \ _ptr ? 
container_of(_ptr, type, member) : NULL; \ }) +static inline struct list_head *list_pop(struct list_head *head) +{ + if (list_empty(head)) + return NULL; + + struct list_head *ret = head->next; + list_del_init(ret); + return ret; +} + +#define list_pop_entry(head, type, member) \ + container_of_or_null(list_pop(head), type, member) + /* Does linear interpolation between powers of two */ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) { -- cgit v1.2.3 From 1302eeb7c5db1b9ac9db9d29c39e6a46bda718a6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2024 18:20:42 -0500 Subject: bcachefs: bkey_fsck_err now respects errors_silent Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index abaa9570cd62..9e34374960f3 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -476,11 +476,16 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, return -BCH_ERR_fsck_delete_bkey; unsigned fsck_flags = 0; - if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) + if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { + if (test_bit(err, c->sb.errors_silent)) + return -BCH_ERR_fsck_delete_bkey; + fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; + } + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + fsck_flags |= fsck_flags_extra[err]; struct printbuf buf = PRINTBUF; - va_list args; prt_printf(&buf, "invalid bkey in %s btree=", bch2_bkey_validate_contexts[from.from]); @@ -489,9 +494,12 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, "\n "); + + va_list args; va_start(args, fmt); prt_vprintf(&buf, fmt, args); va_end(args); + prt_str(&buf, ": delete?"); int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf); -- cgit v1.2.3 From b29769c72d0b6f842ae7a1e10e9cfb9a8fcc87fa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2024 18:17:00 -0500 Subject: bcachefs: If we did repair on a btree node, make sure we rewrite it Ensure that "invalid bkey" repair gets persisted, so that it doesn't repeatedly spam the logs. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index eedcb2445b99..9df9fc1c5e2b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -997,6 +997,7 @@ drop_this_key: got_good_key: le16_add_cpu(&i->u64s, -next_good_key); memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); + set_btree_node_need_rewrite(b); } fsck_err: printbuf_exit(&buf); @@ -1259,6 +1260,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); set_btree_bset_end(b, b->set); + set_btree_node_need_rewrite(b); continue; } if (ret) @@ -1372,15 +1374,18 @@ start: rb->start_time); bio_put(&rb->bio); - if (saw_error && + if ((saw_error || + btree_node_need_rewrite(b)) && !btree_node_read_error(b) && c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", - __func__, buf.buf); + if (saw_error) { + printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", + __func__, buf.buf); + } bch2_btree_node_rewrite_async(c, b); } -- cgit v1.2.3 From c1f618f4f7cc7b8360e7362d3d18f3e244ded364 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2024 18:53:26 -0500 Subject: bcachefs: bch2_async_btree_node_rewrites_flush() Add a method to flush btree node rewrites at the end of recovery, to ensure that corrected errors are persisted. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 7 +- fs/bcachefs/btree_update_interior.c | 153 +++++++++++++++++++++--------------- fs/bcachefs/btree_update_interior.h | 1 + fs/bcachefs/recovery.c | 2 + 4 files changed, 97 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c16937e54734..b12c9c78beec 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -829,9 +829,10 @@ struct bch_fs { struct work_struct btree_interior_update_work; struct workqueue_struct *btree_node_rewrite_worker; - - struct list_head pending_node_rewrites; - struct mutex pending_node_rewrites_lock; + struct list_head btree_node_rewrites; + struct list_head btree_node_rewrites_pending; + spinlock_t btree_node_rewrites_lock; + struct closure_waitlist btree_node_rewrites_wait; /* btree_io.c: */ spinlock_t btree_write_error_lock; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 5eabd532e388..f2a1d5d3d8d5 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2206,42 +2206,50 @@ struct async_btree_rewrite { struct list_head list; enum btree_id btree_id; unsigned level; - struct bpos pos; - __le64 seq; + struct bkey_buf key; }; static int async_btree_node_rewrite_trans(struct btree_trans *trans, struct async_btree_rewrite *a) { - struct bch_fs *c = trans->c; struct btree_iter iter; - struct btree *b; - int ret; - - bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, + bch2_trans_node_iter_init(trans, &iter, + a->btree_id, a->key.k->k.p, BTREE_MAX_DEPTH, a->level, 0); - b = bch2_btree_iter_peek_node(&iter); - ret = PTR_ERR_OR_ZERO(b); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); if (ret) goto out; - if (!b || b->data->keys.seq != a->seq) { + bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k); + ret = found + ? bch2_btree_node_rewrite(trans, &iter, b, 0) + : -ENOENT; + +#if 0 + /* Tracepoint... 
*/ + if (!ret || ret == -ENOENT) { + struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - if (b) - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - else - prt_str(&buf, "(null"); - bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s", - __func__, a->seq, buf.buf); + if (!ret) { + prt_printf(&buf, "rewrite node:\n "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); + } else { + prt_printf(&buf, "node to rewrite not found:\n want: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); + prt_printf(&buf, "\n got: "); + if (b) + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + else + prt_str(&buf, "(null)"); + } + bch_info(c, "%s", buf.buf); printbuf_exit(&buf); - goto out; } - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0); +#endif out: bch2_trans_iter_exit(trans, &iter); - return ret; } @@ -2252,81 +2260,96 @@ static void async_btree_node_rewrite_work(struct work_struct *work) struct bch_fs *c = a->c; int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); - bch_err_fn_ratelimited(c, ret); + if (ret != -ENOENT) + bch_err_fn_ratelimited(c, ret); + + spin_lock(&c->btree_node_rewrites_lock); + list_del(&a->list); + spin_unlock(&c->btree_node_rewrites_lock); + + closure_wake_up(&c->btree_node_rewrites_wait); + + bch2_bkey_buf_exit(&a->key, c); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { - struct async_btree_rewrite *a; - int ret; - - a = kmalloc(sizeof(*a), GFP_NOFS); - if (!a) { - bch_err(c, "%s: error allocating memory", __func__); + struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS); + if (!a) return; - } a->c = c; a->btree_id = b->c.btree_id; a->level = b->c.level; - a->pos = b->key.k.p; - a->seq = b->data->keys.seq; INIT_WORK(&a->work, async_btree_node_rewrite_work); - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { - mutex_lock(&c->pending_node_rewrites_lock); - list_add(&a->list, &c->pending_node_rewrites); - mutex_unlock(&c->pending_node_rewrites_lock); - return; - } + bch2_bkey_buf_init(&a->key); + bch2_bkey_buf_copy(&a->key, c, &b->key); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { - if (test_bit(BCH_FS_started, &c->flags)) { - bch_err(c, "%s: error getting c->writes ref", __func__); - kfree(a); - return; - } + bool now = false, pending = false; - ret = bch2_fs_read_write_early(c); - bch_err_msg(c, ret, "going read-write"); - if (ret) { - kfree(a); - return; - } + spin_lock(&c->btree_node_rewrites_lock); + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + list_add(&a->list, &c->btree_node_rewrites); + now = true; + } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { + list_add(&a->list, &c->btree_node_rewrites_pending); + pending = true; + } + spin_unlock(&c->btree_node_rewrites_lock); - bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + if (now) { + queue_work(c->btree_node_rewrite_worker, &a->work); + } else if (pending) { + /* bch2_do_pending_node_rewrites will execute */ + } else { + bch2_bkey_buf_exit(&a->key, c); + kfree(a); } +} - queue_work(c->btree_node_rewrite_worker, &a->work); +void bch2_async_btree_node_rewrites_flush(struct bch_fs *c) +{ + closure_wait_event(&c->btree_node_rewrites_wait, + list_empty(&c->btree_node_rewrites)); } void bch2_do_pending_node_rewrites(struct bch_fs *c) { - struct async_btree_rewrite *a, *n; - - mutex_lock(&c->pending_node_rewrites_lock); - list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { - 
list_del(&a->list); + while (1) { + spin_lock(&c->btree_node_rewrites_lock); + struct async_btree_rewrite *a = + list_pop_entry(&c->btree_node_rewrites_pending, + struct async_btree_rewrite, list); + if (a) + list_add(&a->list, &c->btree_node_rewrites); + spin_unlock(&c->btree_node_rewrites_lock); + + if (!a) + break; bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); queue_work(c->btree_node_rewrite_worker, &a->work); } - mutex_unlock(&c->pending_node_rewrites_lock); } void bch2_free_pending_node_rewrites(struct bch_fs *c) { - struct async_btree_rewrite *a, *n; + while (1) { + spin_lock(&c->btree_node_rewrites_lock); + struct async_btree_rewrite *a = + list_pop_entry(&c->btree_node_rewrites_pending, + struct async_btree_rewrite, list); + spin_unlock(&c->btree_node_rewrites_lock); - mutex_lock(&c->pending_node_rewrites_lock); - list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { - list_del(&a->list); + if (!a) + break; + bch2_bkey_buf_exit(&a->key, c); kfree(a); } - mutex_unlock(&c->pending_node_rewrites_lock); } static int __bch2_btree_node_update_key(struct btree_trans *trans, @@ -2683,6 +2706,9 @@ void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_btree_interior_update_exit(struct bch_fs *c) { + WARN_ON(!list_empty(&c->btree_node_rewrites)); + WARN_ON(!list_empty(&c->btree_node_rewrites_pending)); + if (c->btree_node_rewrite_worker) destroy_workqueue(c->btree_node_rewrite_worker); if (c->btree_interior_update_worker) @@ -2698,8 +2724,9 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) mutex_init(&c->btree_interior_update_lock); INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); - INIT_LIST_HEAD(&c->pending_node_rewrites); - mutex_init(&c->pending_node_rewrites_lock); + INIT_LIST_HEAD(&c->btree_node_rewrites); + INIT_LIST_HEAD(&c->btree_node_rewrites_pending); + spin_lock_init(&c->btree_node_rewrites_lock); } int bch2_fs_btree_interior_update_init(struct bch_fs *c) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 1c6cf3e2e6a9..7930ffea3075 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -334,6 +334,7 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, struct jset_entry *, unsigned long); +void bch2_async_btree_node_rewrites_flush(struct bch_fs *); void bch2_do_pending_node_rewrites(struct bch_fs *); void bch2_free_pending_node_rewrites(struct bch_fs *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c50dede64785..a342744fd275 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -931,6 +931,8 @@ use_clean: /* in case we don't run journal replay, i.e. 
norecovery mode */ set_bit(BCH_FS_accounting_replay_done, &c->flags); + bch2_async_btree_node_rewrites_flush(c); + /* fsync if we fixed errors */ if (test_bit(BCH_FS_errors_fixed, &c->flags)) { bch2_journal_flush_all_pins(&c->journal); -- cgit v1.2.3 From 511ddcdb2d5e0bdb73c7968e4215268f4572a984 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Nov 2024 23:27:45 -0500 Subject: bcachefs: fix bch2_journal_key_insert_take() seq Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index de3db161d6ab..6d25e3f85ce8 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -259,7 +259,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, * Ensure these keys are done last by journal replay, to unblock * journal reclaim: */ - .journal_seq = U32_MAX, + .journal_seq = U64_MAX, }; struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); -- cgit v1.2.3 From 5cdaec193a85e32235e7dccb95c085acc50b8dbd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Dec 2024 16:39:54 -0500 Subject: bcachefs: Improve "unable to allocate journal write" message Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1627f3e16517..bb69d80886b5 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -2036,8 +2036,9 @@ CLOSURE_CALLBACK(bch2_journal_write) struct printbuf buf = PRINTBUF; buf.atomic++; - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"), + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), le64_to_cpu(w->data->seq), + vstruct_sectors(w->data, c->block_bits), bch2_err_str(ret)); __bch2_journal_debug_to_text(&buf, j); spin_unlock(&j->lock); -- cgit v1.2.3 From 9c22dd02ae8b80bf662ab409091731cfb9a09348 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 2 Dec 2024 23:36:38 -0500 Subject: bcachefs: Fix allocating too big journal entry The "journal space available" calculations didn't take into account mismatched bucket sizes; we need to take the minimum space available out of our devices. 
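In rough terms the change is a clamp (a condensed sketch of the hunk below, using the same names as journal_reclaim.c): remember the smallest journal bucket across the candidate devices, and never plan a journal entry larger than that, since no single device could hold it.

	unsigned min_bucket_size = U32_MAX;

	for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
		if (!ca->journal.nr)
			continue;
		/* smallest bucket any journal device can offer */
		min_bucket_size = min(min_bucket_size, ca->mi.bucket_size);
		...
	}

	/* smallest of the @nr_devs_want largest devices, as before ... */
	space = dev_space[nr_devs_want - 1];
	/* ... but the next entry may not exceed the smallest bucket */
	space.next_entry = min(space.next_entry, min_bucket_size);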
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 1aabbbe328d9..b7936ad3ae7f 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -140,6 +140,7 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne struct bch_fs *c = container_of(j, struct bch_fs, journal); unsigned pos, nr_devs = 0; struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; + unsigned min_bucket_size = U32_MAX; BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); @@ -148,6 +149,8 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne if (!ca->journal.nr) continue; + min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); + space = journal_dev_space_available(j, ca, from); if (!space.next_entry) continue; @@ -167,7 +170,9 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne * We sorted largest to smallest, and we want the smallest out of the * @nr_devs_want largest devices: */ - return dev_space[nr_devs_want - 1]; + space = dev_space[nr_devs_want - 1]; + space.next_entry = min(space.next_entry, min_bucket_size); + return space; } void bch2_journal_space_available(struct journal *j) -- cgit v1.2.3 From d36b3e74b65f4ec68a38bdb717d94b32a81a355f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 3 Dec 2024 17:40:10 +0100 Subject: bcachefs: BCACHEFS_PATH_TRACEPOINTS should depend on TRACING When tracing is disabled, there is no point in asking the user about enabling extra btree_path tracepoints in bcachefs. Fixes: 32ed4a620c5405be ("bcachefs: Btree path tracepoints") Signed-off-by: Geert Uytterhoeven Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index ab6c95b895b3..464b927e4fff 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -90,7 +90,7 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN config BCACHEFS_PATH_TRACEPOINTS bool "Extra btree_path tracepoints" - depends on BCACHEFS_FS + depends on BCACHEFS_FS && TRACING help Enable extra tracepoints for debugging btree_path operations; we don't normally want these enabled because they happen at very high rates. -- cgit v1.2.3 From ad0b2544ec827e03b75143bed83338bda7f6fe21 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 3 Dec 2024 21:22:26 -0500 Subject: bcachefs: rcu_pending now works in userspace Introduce a typedef to handle the difference between unsigned long/struct urcu_gp_poll_state. 
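Condensed, the shim is one typedef plus one comparison helper (the same shape as the hunk below): in the kernel a grace-period poll cookie is an unsigned long and can be compared with ==, while userspace liburcu returns a struct that needs a field compare.

	#ifdef __KERNEL__
	typedef unsigned long rcu_gp_poll_state_t;

	static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
	{
		return l == r;		/* plain integer cookie */
	}
	#else
	typedef struct urcu_gp_poll_state rcu_gp_poll_state_t;

	static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
	{
		return l.grace_period_id == r.grace_period_id;
	}
	#endif

Callers then compare cookies through the helper, e.g. rcu_gp_poll_cookie_eq(objs->seq, seq) rather than objs->seq == seq.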
Signed-off-by: Kent Overstreet --- fs/bcachefs/rcu_pending.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c index 67522aa344a7..bef2aa1b8bcd 100644 --- a/fs/bcachefs/rcu_pending.c +++ b/fs/bcachefs/rcu_pending.c @@ -25,21 +25,37 @@ enum rcu_pending_special { #define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE) #define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU) -static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp) +#ifdef __KERNEL__ +typedef unsigned long rcu_gp_poll_state_t; + +static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) +{ + return l == r; +} +#else +typedef struct urcu_gp_poll_state rcu_gp_poll_state_t; + +static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) +{ + return l.grace_period_id == r.grace_period_id; +} +#endif + +static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp) { return ssp ? get_state_synchronize_srcu(ssp) : get_state_synchronize_rcu(); } -static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp) +static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp) { return ssp ? start_poll_synchronize_srcu(ssp) : start_poll_synchronize_rcu(); } -static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie) +static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie) { return ssp ? poll_state_synchronize_srcu(ssp, cookie) @@ -71,13 +87,13 @@ struct rcu_pending_seq { GENRADIX(struct rcu_head *) objs; size_t nr; struct rcu_head **cursor; - unsigned long seq; + rcu_gp_poll_state_t seq; }; struct rcu_pending_list { struct rcu_head *head; struct rcu_head *tail; - unsigned long seq; + rcu_gp_poll_state_t seq; }; struct rcu_pending_pcpu { @@ -316,10 +332,10 @@ static void rcu_pending_rcu_cb(struct rcu_head *rcu) } static __always_inline struct rcu_pending_seq * -get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) +get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq) { darray_for_each_reverse(p->objs, objs) - if (objs->seq == seq) + if (rcu_gp_poll_cookie_eq(objs->seq, seq)) return objs; if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC)) @@ -329,7 +345,7 @@ get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) } static noinline bool -rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, +rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq, struct rcu_head *head, void *ptr, unsigned long *flags) { @@ -364,7 +380,7 @@ rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, again: for (struct rcu_pending_list *i = p->lists; i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { - if (i->seq == seq) { + if (rcu_gp_poll_cookie_eq(i->seq, seq)) { rcu_pending_list_add(i, head); return false; } @@ -408,7 +424,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, struct rcu_pending_pcpu *p; struct rcu_pending_seq *objs; struct genradix_node *new_node = NULL; - unsigned long seq, flags; + unsigned long flags; bool start_gp = false; BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); @@ -416,7 +432,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head 
*head, local_irq_save(flags); p = this_cpu_ptr(pending->p); spin_lock(&p->lock); - seq = __get_state_synchronize_rcu(pending->srcu); + rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); restart: if (may_sleep && unlikely(process_finished_items(pending, p, flags))) @@ -478,9 +494,7 @@ start_gp: */ if (!p->cb_armed) { p->cb_armed = true; - spin_unlock_irqrestore(&p->lock, flags); __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb); - goto free_node; } else { __start_poll_synchronize_rcu(pending->srcu); } -- cgit v1.2.3 From f78760dede23affb50a6fe62b1230849e1a5d15f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Dec 2024 21:35:11 -0500 Subject: bcachefs: logged ops only use inum 0 of logged ops btree we wish to use the logged ops btree for other items that aren't strictly logged ops: cursors for inode allocation There's no reason to create another cached btree for inode allocator cursors - so reserve different parts of the keyspace for different purposes. Older versions will ignore or delete the cursors. Signed-off-by: Kent Overstreet --- fs/bcachefs/logged_ops.c | 10 +++++----- fs/bcachefs/logged_ops_format.h | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 60e00702d1a4..1ac51af16299 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -63,8 +63,9 @@ fsck_err: int bch2_resume_logged_ops(struct bch_fs *c) { int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_logged_ops, POS_MIN, + for_each_btree_key_max(trans, iter, + BTREE_ID_logged_ops, + POS(LOGGED_OPS_INUM, 0), POS(LOGGED_OPS_INUM, U64_MAX), BTREE_ITER_prefetch, k, resume_logged_op(trans, &iter, k))); bch_err_fn(c, ret); @@ -74,9 +75,8 @@ int bch2_resume_logged_ops(struct bch_fs *c) static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) { struct btree_iter iter; - int ret; - - ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX); + int ret = bch2_bkey_get_empty_slot(trans, &iter, + BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM, U64_MAX)); if (ret) return ret; diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h index 6a4bf7129dba..0b370a963ac6 100644 --- a/fs/bcachefs/logged_ops_format.h +++ b/fs/bcachefs/logged_ops_format.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H #define _BCACHEFS_LOGGED_OPS_FORMAT_H +#define LOGGED_OPS_INUM 0 + struct bch_logged_op_truncate { struct bch_val v; __le32 subvol; -- cgit v1.2.3 From 8dabb19ff4b802131ebfc1024de132b601c3c23d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 3 Dec 2024 22:03:18 -0500 Subject: bcachefs: Simplify disk accounting validate late The validate late path was iterating over accounting entries in eytzinger order, which is unnecessarily tricky when we may have to remove entries. 
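The replacement is the usual pattern for deleting while walking a flat array (a sketch; entry_is_zeroed() stands in for the real zeroed-counters check, the other helpers are the ones in the diff below): iterate the backing darray in reverse so a removal only shifts entries already visited, then rebuild the eytzinger ordering once at the end instead of after every removal.

	darray_for_each_reverse(acc->k, i)
		if (entry_is_zeroed(i))		/* placeholder for the real check */
			darray_remove_item(&acc->k, i);

	/* one re-sort at the end, not one per removal */
	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
			accounting_pos_cmp, NULL);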
Signed-off-by: Kent Overstreet --- fs/bcachefs/darray.h | 2 +- fs/bcachefs/disk_accounting.c | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 8f4c3f0665c4..c6151495985f 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -83,7 +83,7 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each_reverse(_d, _i) \ - for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) + for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) #define darray_init(_d) \ do { \ diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 71c49a7ee2fe..a915d9dc8de4 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -765,15 +765,16 @@ int bch2_accounting_read(struct bch_fs *c) keys->gap = keys->nr = dst - keys->data; percpu_down_write(&c->mark_lock); - unsigned i = 0; - while (i < acc->k.nr) { - unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); + darray_for_each_reverse(acc->k, i) { struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); + bpos_to_disk_accounting_pos(&acc_k, i->pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); + memset(v, 0, sizeof(v)); + + for (unsigned j = 0; j < i->nr_counters; j++) + v[j] = percpu_u64_get(i->v[0] + j); /* * If the entry counters are zeroed, it should be treated as @@ -782,26 +783,25 @@ int bch2_accounting_read(struct bch_fs *c) * Remove it, so that if it's re-added it gets re-marked in the * superblock: */ - ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) + ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, acc_k, - v, acc->k.data[idx].nr_counters); + : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); if (ret == -BCH_ERR_remove_disk_accounting_entry) { - free_percpu(acc->k.data[idx].v[0]); - free_percpu(acc->k.data[idx].v[1]); - darray_remove_item(&acc->k, &acc->k.data[idx]); - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); + free_percpu(i->v[0]); + free_percpu(i->v[1]); + darray_remove_item(&acc->k, i); ret = 0; continue; } if (ret) goto fsck_err; - i++; } + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + preempt_disable(); struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); -- cgit v1.2.3 From e3474394eb1a0e4ebf4a5e0e2531671fa96add16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 01:19:28 -0500 Subject: bcachefs: Advance to next bp on BCH_ERR_backpointer_to_overwritten_btree_node Don't spin. 
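"Don't spin" is the whole point: the loop's error path used 'continue', which retried the same backpointer without advancing, so a persistent -BCH_ERR_backpointer_to_overwritten_btree_node livelocked. A deliberately generic illustration of the bug class (not the move.c code; process() is invented for the example):

	extern int process(int item);	/* invented for the example */

	static int walk(int *cur, int end)
	{
		while (*cur < end) {
			int ret = process(*cur);
			if (ret == -EAGAIN)
				goto next;	/* 'continue' here would retry *cur forever */
			if (ret)
				return ret;
	next:
			(*cur)++;	/* always make forward progress */
		}
		return 0;
	}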
Fixes: de95cc201a97 ("bcachefs: Kill bch2_get_next_backpointer()") Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 460175464762..6f21e36d89f7 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -785,7 +785,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, b = bch2_backpointer_get_node(trans, bp, &iter); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - continue; + goto next; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) -- cgit v1.2.3 From 400af9a398186851103e27d848ef42be8870072b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 17:44:25 -0500 Subject: bcachefs: trace_accounting_mem_insert Add a tracepoint for inserting new accounting entries: we're seeing odd spinning behaviour in accounting read. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 8 ++++++++ fs/bcachefs/trace.h | 24 ++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index a915d9dc8de4..a0061bcf9159 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -324,6 +324,14 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); + + if (trace_accounting_mem_insert_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_accounting_to_text(&buf, c, a.s_c); + trace_accounting_mem_insert(c, buf.buf); + printbuf_exit(&buf); + } return 0; err: free_percpu(n.v[1]); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 2d5932d2881e..7baf66beee22 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -199,6 +199,30 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* disk_accounting.c */ + +TRACE_EVENT(accounting_mem_insert, + TP_PROTO(struct bch_fs *c, const char *acc), + TP_ARGS(c, acc), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(unsigned, new_nr ) + __string(acc, acc ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->new_nr = c->accounting.k.nr; + __assign_str(acc); + ), + + TP_printk("%d,%d entries %u added %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->new_nr, + __get_str(acc)) +); + /* fs.c: */ TRACE_EVENT(bch2_sync_fs, TP_PROTO(struct super_block *sb, int wait), -- cgit v1.2.3 From 3f1cf04ff9877bf043795d05bb6704d0a85bcd80 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 17:48:06 -0500 Subject: bcachefs: Silence "unable to allocate journal write" if we're already RO Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index bb69d80886b5..e7a43400a587 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -2032,7 +2032,7 @@ CLOSURE_CALLBACK(bch2_journal_write) bch2_journal_do_discards(j); } - if (ret) { + if (ret && !bch2_journal_error(j)) { struct printbuf buf = PRINTBUF; buf.atomic++; @@ -2044,8 +2044,9 @@ CLOSURE_CALLBACK(bch2_journal_write) spin_unlock(&j->lock); bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); - goto err; } + if (ret) + goto err; /* * write is allocated, no longer need to account for it in -- cgit v1.2.3 From 
6728f8f829cf68ae25cc664d3b1ba7034bc81fd4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 17:53:38 -0500 Subject: bcachefs: BCH_ERR_insufficient_journal_devices kill another standard error code use Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/journal_io.c | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 47387f7d6202..5e4dd85ac669 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -263,6 +263,7 @@ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ + x(EIO, insufficient_journal_devices) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e7a43400a587..e5fce5e497f2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1503,8 +1503,7 @@ retry: devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); - __journal_write_alloc(j, w, &devs_sorted, - sectors, &replicas, replicas_want); + __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); if (replicas >= replicas_want) goto done; @@ -1544,7 +1543,7 @@ done: BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= replicas_need ? 0 : -EROFS; + return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -- cgit v1.2.3 From 49833ce27ed2eed91915a4c25690d82aae5b6a0b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 18:16:25 -0500 Subject: bcachefs: Fix failure to allocate journal write on discard retry When allocating a journal write fails, then retries after doing discards, we were failing to count already allocated replicas. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e5fce5e497f2..d7dfea5f0181 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1498,6 +1498,15 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) READ_ONCE(c->opts.metadata_replicas_required)); rcu_read_lock(); + + /* We might run more than once if we have to stop and do discards: */ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); + bkey_for_each_ptr(ptrs, p) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); + if (ca) + replicas += ca->mi.durability; + } + retry: devs = target_rw_devs(c, BCH_DATA_journal, target); -- cgit v1.2.3 From 47d6ee766f8033563aff333f326378cd4b36a170 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 19:21:22 -0500 Subject: bcachefs: dev_alloc_list.devs -> dev_alloc_list.data This lets us use darray macros on dev_alloc_list (and it will become a darray eventually, when we increase the maximum number of devices). 
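Why a rename is enough (a sketch; the struct and loop are the post-patch shapes from alloc_foreground.c, and the macro body is the one visible in the darray.h hunk a few commits back): the darray iteration macros are purely structural, touching only .data and .nr, so any struct exposing those two members can use them whether or not it is a real darray yet.

	struct dev_alloc_list {
		unsigned	nr;
		u8		data[BCH_SB_MEMBERS_MAX];
	};

	/*
	 * darray_for_each(_d, _i) expands to roughly:
	 *	for (u8 *_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
	 */
	darray_for_each(devs_sorted, i) {
		struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i);
		if (!ca)
			continue;
		/* ... allocate from this device ... */
		bch2_dev_put(ca);
	}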
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 60 ++++++++++++++++++------------------------ fs/bcachefs/alloc_foreground.h | 2 +- fs/bcachefs/journal_io.c | 21 ++++++--------- 3 files changed, 34 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 095bfe7c53bd..49c9275465f9 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -626,9 +626,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, unsigned i; for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) - ret.devs[ret.nr++] = i; + ret.data[ret.nr++] = i; - bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); + bubble_sort(ret.data, ret.nr, dev_stripe_cmp); return ret; } @@ -700,18 +700,13 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct closure *cl) { struct bch_fs *c = trans->c; - struct dev_alloc_list devs_sorted = - bch2_dev_alloc_list(c, stripe, devs_may_alloc); int ret = -BCH_ERR_insufficient_devices; BUG_ON(*nr_effective >= nr_replicas); - for (unsigned i = 0; i < devs_sorted.nr; i++) { - struct bch_dev_usage usage; - struct open_bucket *ob; - - unsigned dev = devs_sorted.devs[i]; - struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + darray_for_each(devs_sorted, i) { + struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); if (!ca) continue; @@ -720,8 +715,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); + struct bch_dev_usage usage; + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, + cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); bch2_dev_put(ca); @@ -765,10 +761,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct closure *cl) { struct bch_fs *c = trans->c; - struct dev_alloc_list devs_sorted; - struct ec_stripe_head *h; - struct open_bucket *ob; - unsigned i, ec_idx; int ret = 0; if (nr_replicas < 2) @@ -777,34 +769,32 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, if (ec_open_bucket(c, ptrs)) return 0; - h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + struct ec_stripe_head *h = + bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) return 0; - devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); - - for (i = 0; i < devs_sorted.nr; i++) - for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + darray_for_each(devs_sorted, i) + for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) continue; - ob = c->open_buckets + h->s->blocks[ec_idx]; - if (ob->dev == devs_sorted.devs[i] && - !test_and_set_bit(ec_idx, h->s->blocks_allocated)) - goto got_bucket; + struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx]; + if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) { + ob->ec_idx = ec_idx; + ob->ec = h->s; + ec_stripe_new_get(h->s, STRIPE_REF_io); + + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, ob); + goto out; + } } - goto out_put_head; -got_bucket: - ob->ec_idx = ec_idx; - ob->ec = h->s; - ec_stripe_new_get(h->s, 
STRIPE_REF_io); - - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); -out_put_head: +out: bch2_ec_stripe_head_put(c, h); return ret; } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 4f87745df97e..f25481a0d1a0 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -20,7 +20,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *); struct dev_alloc_list { unsigned nr; - u8 devs[BCH_SB_MEMBERS_MAX]; + u8 data[BCH_SB_MEMBERS_MAX]; }; struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index d7dfea5f0181..9a1647297d11 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1422,25 +1422,22 @@ fsck_err: static void __journal_write_alloc(struct journal *j, struct journal_buf *w, - struct dev_alloc_list *devs_sorted, + struct dev_alloc_list *devs, unsigned sectors, unsigned *replicas, unsigned replicas_want) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_device *ja; - struct bch_dev *ca; - unsigned i; if (*replicas >= replicas_want) return; - for (i = 0; i < devs_sorted->nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); + darray_for_each(*devs, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) continue; - ja = &ca->journal; + struct journal_device *ja = &ca->journal; /* * Check that we can use this device, and aren't already using @@ -1486,13 +1483,11 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; - struct journal_device *ja; - struct bch_dev *ca; struct dev_alloc_list devs_sorted; unsigned sectors = vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: c->opts.foreground_target; - unsigned i, replicas = 0, replicas_want = + unsigned replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); @@ -1517,12 +1512,12 @@ retry: if (replicas >= replicas_want) goto done; - for (i = 0; i < devs_sorted.nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + darray_for_each(devs_sorted, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) continue; - ja = &ca->journal; + struct journal_device *ja = &ca->journal; if (sectors > ja->sectors_free && sectors <= ca->mi.bucket_size && -- cgit v1.2.3 From ff7e7c5367250454ed10a6113695d2e01ccc0cfc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 18:14:14 -0500 Subject: bcachefs: Journal write path refactoring, debug improvements Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 6 +++++ fs/bcachefs/journal_io.c | 70 +++++++++++++++++++++++++++--------------------- 2 files changed, 45 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index dc66521964b7..04a9ccf76d75 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1564,6 +1564,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_indent_sub(out, 2); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { + if (!ca->mi.durability) + continue; + struct journal_device *ja = &ca->journal; if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) @@ -1573,6 +1576,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal 
*j) continue; prt_printf(out, "dev %u:\n", ca->dev_idx); + prt_printf(out, "durability %u:\n", ca->mi.durability); printbuf_indent_add(out, 2); prt_printf(out, "nr\t%u\n", ja->nr); prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); @@ -1584,6 +1588,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_indent_sub(out, 2); } + prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); + rcu_read_unlock(); --out->atomic; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9a1647297d11..2f4daa8bd498 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1420,6 +1420,35 @@ fsck_err: /* journal write: */ +static void journal_advance_devs_to_next_bucket(struct journal *j, + struct dev_alloc_list *devs, + unsigned sectors, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + darray_for_each(*devs, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); + if (!ca) + continue; + + struct journal_device *ja = &ca->journal; + + if (sectors > ja->sectors_free && + sectors <= ca->mi.bucket_size && + bch2_journal_dev_buckets_available(j, ja, + journal_space_discarded)) { + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->sectors_free = ca->mi.bucket_size; + + /* + * ja->bucket_seq[ja->cur_idx] must always have + * something sensible: + */ + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); + } + } +} + static void __journal_write_alloc(struct journal *j, struct journal_buf *w, struct dev_alloc_list *devs, @@ -1429,9 +1458,6 @@ static void __journal_write_alloc(struct journal *j, { struct bch_fs *c = container_of(j, struct bch_fs, journal); - if (*replicas >= replicas_want) - return; - darray_for_each(*devs, i) { struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) @@ -1491,6 +1517,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); + bool advance_done = false; rcu_read_lock(); @@ -1502,45 +1529,26 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) replicas += ca->mi.durability; } -retry: +retry_target: devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); - +retry_alloc: __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); - if (replicas >= replicas_want) + if (likely(replicas >= replicas_want)) goto done; - darray_for_each(devs_sorted, i) { - struct bch_dev *ca = rcu_dereference(c->devs[*i]); - if (!ca) - continue; - - struct journal_device *ja = &ca->journal; - - if (sectors > ja->sectors_free && - sectors <= ca->mi.bucket_size && - bch2_journal_dev_buckets_available(j, ja, - journal_space_discarded)) { - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = ca->mi.bucket_size; - - /* - * ja->bucket_seq[ja->cur_idx] must always have - * something sensible: - */ - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - } + if (!advance_done) { + journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); + advance_done = true; + goto retry_alloc; } - __journal_write_alloc(j, w, &devs_sorted, - sectors, &replicas, replicas_want); - if (replicas < replicas_want && target) { /* Retry from all devices: */ target = 0; - goto retry; + advance_done = false; + goto retry_target; } done: rcu_read_unlock(); -- cgit v1.2.3 From 
90c6daa6ac90a7f83efa566350fbe2404f848ef0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 19:41:38 -0500 Subject: bcachefs: Call bch2_btree_lost_data() on btree read error Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e59924cfe2bc..24f2f3bdf704 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -29,6 +29,7 @@ #include "move.h" #include "recovery_passes.h" #include "reflink.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -359,11 +360,9 @@ again: if (ret) break; - if (!btree_id_is_alloc(b->c.btree_id)) { - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); - if (ret) - break; - } + ret = bch2_btree_lost_data(c, b->c.btree_id); + if (ret) + break; continue; } @@ -525,7 +524,7 @@ int bch2_check_topology(struct bch_fs *c) bch2_btree_id_to_text(&buf, i); if (r->error) { - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + ret = bch2_btree_lost_data(c, i); if (ret) break; reconstruct_root: @@ -741,7 +740,7 @@ static int bch2_gc_btrees(struct bch_fs *c) (printbuf_reset(&buf), bch2_btree_id_to_text(&buf, btree), buf.buf))) - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + ret = bch2_btree_lost_data(c, btree); } fsck_err: printbuf_exit(&buf); -- cgit v1.2.3 From c67fab0774cee93b6aac9adc3601bcf0a4ea6ab4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 19:46:35 -0500 Subject: bcachefs: Make sure __bch2_run_explicit_recovery_pass() signals to rewind We should always signal to rewind if the requested pass hasn't been run, even if called multiple times. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/recovery_passes.c | 52 +++++++++++++++++++++---------------------- 2 files changed, 27 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b12c9c78beec..e6cd93e1ed0f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1044,6 +1044,7 @@ struct bch_fs { * for signaling to the toplevel code which pass we want to run now. 
*/ enum bch_recovery_pass curr_recovery_pass; + enum bch_recovery_pass next_recovery_pass; /* bitmask of recovery passes that we actually ran */ u64 recovery_passes_complete; /* never rewinds version of curr_recovery_pass */ diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index f6d3a99cb63e..0b3c951c32da 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -103,27 +103,31 @@ u64 bch2_recovery_passes_from_stable(u64 v) static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - if (c->opts.recovery_passes & BIT_ULL(pass)) - return 0; - if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) return -BCH_ERR_not_in_recovery; + if (c->recovery_passes_complete & BIT_ULL(pass)) + return 0; + + bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); + if (pass < BCH_RECOVERY_PASS_set_may_go_rw && c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { - bch_info(c, "need recovery pass %s (%u), but already rw", - bch2_recovery_passes[pass], pass); + if (print) + bch_info(c, "need recovery pass %s (%u), but already rw", + bch2_recovery_passes[pass], pass); return -BCH_ERR_cannot_rewind_recovery; } - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + if (print) + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); c->opts.recovery_passes |= BIT_ULL(pass); - if (c->curr_recovery_pass >= pass) { - c->curr_recovery_pass = pass; + if (c->curr_recovery_pass > pass) { + c->next_recovery_pass = pass; c->recovery_passes_complete &= (1ULL << pass) >> 1; return -BCH_ERR_restart_recovery; } else { @@ -264,7 +268,9 @@ int bch2_run_recovery_passes(struct bch_fs *c) */ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { + c->next_recovery_pass = c->curr_recovery_pass + 1; + spin_lock_irq(&c->recovery_pass_lock); unsigned pass = c->curr_recovery_pass; @@ -285,31 +291,25 @@ int bch2_run_recovery_passes(struct bch_fs *c) ret = bch2_run_recovery_pass(c, pass) ?: bch2_journal_flush(&c->journal); + if (!ret && !test_bit(BCH_FS_error, &c->flags)) + bch2_clear_recovery_pass_required(c, pass); + spin_lock_irq(&c->recovery_pass_lock); - if (c->curr_recovery_pass < pass) { + if (c->next_recovery_pass < c->curr_recovery_pass) { /* * bch2_run_explicit_recovery_pass() was called: we * can't always catch -BCH_ERR_restart_recovery because * it may have been called from another thread (btree * node read completion) */ - spin_unlock_irq(&c->recovery_pass_lock); - continue; - } else if (c->curr_recovery_pass == pass) { - c->curr_recovery_pass++; + ret = 0; + c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); } else { - BUG(); + c->recovery_passes_complete |= BIT_ULL(pass); + c->recovery_pass_done = max(c->recovery_pass_done, pass); } + c->curr_recovery_pass = c->next_recovery_pass; spin_unlock_irq(&c->recovery_pass_lock); - - if (ret) - break; - - c->recovery_passes_complete |= BIT_ULL(pass); - c->recovery_pass_done = max(c->recovery_pass_done, pass); - - if (!test_bit(BCH_FS_error, &c->flags)) - bch2_clear_recovery_pass_required(c, pass); } return ret; -- cgit v1.2.3 From 
23f88c1d165563c9432314f92a9e7b8b6e17c7a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 20:43:01 -0500 Subject: bcachefs: Don't call bch2_btree_interior_update_will_free_node() until after update succeeds Originally, btree splits always succeeded once we got to the point of recursing to the btree_insert_node() call. But that changed when we switched to not taking intent locks all the way up to the root, and that introduced a bug, because bch2_btree_interior_update_will_free_node() cancels pending writes and reparents a node that's going to be made visible on disk by another btree update to the current btree update. This was discovered in recent backpointers work, because bch2_btree_interior_update_will_free_node() also clears the will_make_reachable flag, causing backpointer target lookup to spuriously think it had found a dangling backpointer (when the backpointer just hadn't been created yet by btree_update_nodes_written()). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index f2a1d5d3d8d5..7d9dab95bdcf 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1607,8 +1607,6 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (ret) return ret; - bch2_btree_interior_update_will_free_node(as, b); - if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { struct btree *n[2]; @@ -1707,6 +1705,8 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (ret) goto err; + bch2_btree_interior_update_will_free_node(as, b); + if (n3) { bch2_btree_update_get_open_buckets(as, n3); bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); @@ -2063,9 +2063,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, trace_and_count(c, btree_node_merge, trans, b); - bch2_btree_interior_update_will_free_node(as, b); - bch2_btree_interior_update_will_free_node(as, m); - n = bch2_btree_node_alloc(as, trans, b->c.level); SET_BTREE_NODE_SEQ(n->data, @@ -2101,6 +2098,9 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err_free_update; + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_interior_update_will_free_node(as, m); + bch2_trans_verify_paths(trans); bch2_btree_update_get_open_buckets(as, n); @@ -2155,8 +2155,6 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (ret) goto out; - bch2_btree_interior_update_will_free_node(as, b); - n = bch2_btree_node_alloc_replacement(as, trans, b); bch2_btree_build_aux_trees(n); @@ -2180,6 +2178,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (ret) goto err; + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); -- cgit v1.2.3 From ce70157112482c775430568d9cc62e0abeb386ea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 23:40:26 -0500 Subject: bcachefs: kill flags param to bch2_subvolume_get() Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 2 +- fs/bcachefs/fs-common.c | 4 +--- fs/bcachefs/fs.c | 7 +++---- fs/bcachefs/fsck.c | 7 +++---- fs/bcachefs/snapshot.c | 5 ++--- fs/bcachefs/subvolume.c | 14 ++++++-------- fs/bcachefs/subvolume.h | 2 +- 7 files changed, 17 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 
41813f9ce831..600eee936f13 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -266,7 +266,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } else { target->subvol = le32_to_cpu(d.v->d_child_subvol); - ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s); + ret = bch2_subvolume_get(trans, target->subvol, true, &s); target->inum = le64_to_cpu(s.inode); } diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index dcaa47f68f31..f8d27244e1d6 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -69,9 +69,7 @@ int bch2_create_trans(struct btree_trans *trans, if (!snapshot_src.inum) { /* Inode wasn't specified, just snapshot: */ struct bch_subvolume s; - - ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, - BTREE_ITER_cached, &s); + ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s); if (ret) goto err; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index c6e7df7c67fa..3f83f131d0e8 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -499,7 +499,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) struct bch_inode_unpacked inode_u; struct bch_subvolume subvol; int ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); bch2_trans_put(trans); @@ -569,8 +569,7 @@ retry: inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; inum.inum = inode_u.bi_inum; - ret = bch2_subvolume_get(trans, inum.subvol, true, - BTREE_ITER_with_updates, &subvol) ?: + ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, @@ -651,7 +650,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, struct bch_subvolume subvol; struct bch_inode_unpacked inode_u; - ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 1a5a07112779..1e00b2694db7 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -109,7 +109,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, u32 *snapshot, u64 *inum) { struct bch_subvolume s; - int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + int ret = bch2_subvolume_get(trans, subvol, false, &s); *snapshot = le32_to_cpu(s.snapshot); *inum = le64_to_cpu(s.inode); @@ -226,8 +226,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; struct bch_subvolume subvol; - ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), - false, 0, &subvol); + ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), false, &subvol); bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", le32_to_cpu(st.master_subvol), snapshot); if (ret) @@ -1421,7 +1420,7 @@ static int check_inode(struct btree_trans *trans, if (u.bi_subvol) { struct bch_subvolume s; - ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s); + ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); if (ret && 
!bch2_err_matches(ret, ENOENT)) goto err; diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index f368270d6d9b..99f045518312 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -570,8 +570,7 @@ static int check_snapshot_tree(struct btree_trans *trans, goto err; } - ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), - false, 0, &subvol); + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -811,7 +810,7 @@ static int check_snapshot(struct btree_trans *trans, if (should_have_subvol) { id = le32_to_cpu(s.subvol); - ret = bch2_subvolume_get(trans, id, 0, false, &subvol); + ret = bch2_subvolume_get(trans, id, false, &subvol); if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot points to nonexistent subvolume:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 5e5ae405cb28..0e756e35c3d9 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -286,11 +286,11 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) static __always_inline int bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, - int iter_flags, struct bch_subvolume *s) { int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), - iter_flags, subvolume, s); + BTREE_ITER_cached| + BTREE_ITER_with_updates, subvolume, s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found, trans->c, "missing subvolume %u", subvol); @@ -299,16 +299,15 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, - int iter_flags, struct bch_subvolume *s) { - return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s); + return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s); } int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) { struct bch_subvolume s; - int ret = bch2_subvolume_get_inlined(trans, subvol, true, 0, &s); + int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s); if (ret) return ret; @@ -328,7 +327,7 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, struct bch_snapshot snap; return bch2_snapshot_lookup(trans, snapshot, &snap) ?: - bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); + bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol); } int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, @@ -396,8 +395,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d struct bch_subvolume s; return lockrestart_do(trans, - bch2_subvolume_get(trans, subvolid_to_delete, true, - BTREE_ITER_cached, &s)) ?: + bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?: for_each_btree_key_commit(trans, iter, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index d53d292c22d7..910f6196700e 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -24,7 +24,7 @@ int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, int bch2_subvol_has_children(struct btree_trans *, u32); int bch2_subvolume_get(struct btree_trans *, unsigned, - bool, int, struct bch_subvolume *); + bool, struct bch_subvolume *); int 
__bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *, bool); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); -- cgit v1.2.3 From d4c9fc000bd1bc03cfa0b7650e59060f623cf46d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 23:36:33 -0500 Subject: bcachefs: factor out str_hash.c Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/fsck.c | 214 ++----------------------------------------------- fs/bcachefs/fsck.h | 8 ++ fs/bcachefs/str_hash.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/str_hash.h | 7 ++ 5 files changed, 232 insertions(+), 207 deletions(-) create mode 100644 fs/bcachefs/str_hash.c (limited to 'fs') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 56d20e219f59..d2689388d5e8 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -82,6 +82,7 @@ bcachefs-y := \ siphash.o \ six.o \ snapshot.o \ + str_hash.o \ subvolume.o \ super.o \ super-io.o \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 1e00b2694db7..22a33b9ba30d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -941,69 +941,16 @@ static int get_visible_inodes(struct btree_trans *trans, return ret; } -static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) -{ - if (d.v->d_type == DT_SUBVOL) { - u32 snap; - u64 inum; - int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), &snap, &inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - return !ret; - } else { - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); - int ret = bkey_err(k); - if (ret) - return ret; - - ret = bkey_is_inode(k.k); - bch2_trans_iter_exit(trans, &iter); - return ret; - } -} - /* * Prefer to delete the first one, since that will be the one at the wrong * offset: * return value: 0 -> delete k1, 1 -> delete k2 */ -static int hash_pick_winner(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c k1, - struct bkey_s_c k2) -{ - if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && - !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) - return 0; - - switch (desc.btree_id) { - case BTREE_ID_dirents: { - int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1)); - if (ret < 0) - return ret; - if (!ret) - return 0; - - ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2)); - if (ret < 0) - return ret; - if (!ret) - return 1; - return 2; - } - default: - return 0; - } -} - -static int fsck_update_backpointers(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_i *new) +int bch2_fsck_update_backpointers(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_i *new) { if (new->k.type != KEY_TYPE_dirent) return 0; @@ -1031,153 +978,6 @@ err: return ret; } -static int fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old) -{ - struct qstr old_name = bch2_dirent_get_name(old); - struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bkey_dirent_init(&new->k_i); - dirent_copy_target(new, old); - new->k.p = old.k->p; - - for (unsigned i = 0; i < 1000; i++) { - unsigned 
len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", - old_name.len, old_name.name, i); - unsigned u64s = BKEY_U64s + dirent_val_u64s(len); - - if (u64s > U8_MAX) - return -EINVAL; - - new->k.u64s = u64s; - - ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - (subvol_inum) { 0, old.k->p.inode }, - old.k->p.snapshot, &new->k_i, - BTREE_UPDATE_internal_snapshot_node); - if (!bch2_err_matches(ret, EEXIST)) - break; - } - - if (ret) - return ret; - - return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); -} - -static int hash_check_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; - struct printbuf buf = PRINTBUF; - struct bkey_s_c k; - u64 hash; - int ret = 0; - - if (hash_k.k->type != desc.key_type) - return 0; - - hash = desc.hash_bkey(hash_info, hash_k); - - if (likely(hash == hash_k.k->p.offset)) - return 0; - - if (hash_k.k->p.offset < hash) - goto bad_hash; - - for_each_btree_key_norestart(trans, iter, desc.btree_id, - SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_slots, k, ret) { - if (bkey_eq(k.k->p, hash_k.k->p)) - break; - - if (k.k->type == desc.key_type && - !desc.cmp_bkey(k, hash_k)) - goto duplicate_entries; - - if (bkey_deleted(k.k)) { - bch2_trans_iter_exit(trans, &iter); - goto bad_hash; - } - } -out: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -bad_hash: - if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", - bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); - if (IS_ERR(new)) - return PTR_ERR(new); - - k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, - (subvol_inum) { 0, hash_k.k->p.inode }, - hash_k.k->p.snapshot, new, - STR_HASH_must_create| - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node); - ret = bkey_err(k); - if (ret) - goto out; - if (k.k) - goto duplicate_entries; - - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, - BTREE_UPDATE_internal_snapshot_node) ?: - fsck_update_backpointers(trans, s, desc, hash_info, new) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } -fsck_err: - goto out; -duplicate_entries: - ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); - if (ret < 0) - goto out; - - if (!fsck_err(trans, hash_table_key_duplicate, - "duplicate hash table keys%s:\n%s", - ret != 2 ? 
"" : ", both point to valid inodes", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) - goto out; - - switch (ret) { - case 0: - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); - break; - case 1: - ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); - break; - case 2: - ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: - bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); - goto out; - } - - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; - goto out; -} - static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, @@ -2496,7 +2296,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; - ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) goto err; if (ret) { @@ -2610,7 +2410,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); inode->first_this_inode = false; - ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 4481b40a881d..574948278cd4 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -2,6 +2,14 @@ #ifndef _BCACHEFS_FSCK_H #define _BCACHEFS_FSCK_H +#include "str_hash.h" + +int bch2_fsck_update_backpointers(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc, + struct bch_hash_info *, + struct bkey_i *); + int bch2_check_inodes(struct bch_fs *); int bch2_check_extents(struct bch_fs *); int bch2_check_indirect_extents(struct bch_fs *); diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c new file mode 100644 index 000000000000..c3276a7e7324 --- /dev/null +++ b/fs/bcachefs/str_hash.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "dirent.h" +#include "fsck.h" +#include "str_hash.h" +#include "subvolume.h" + +static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) +{ + if (d.v->d_type == DT_SUBVOL) { + struct bch_subvolume subvol; + int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol), + false, &subvol); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + return !ret; + } else { + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); + int ret = bkey_err(k); + if (ret) + return ret; + + ret = bkey_is_inode(k.k); + bch2_trans_iter_exit(trans, &iter); + return ret; + } +} + +static int fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old) +{ + struct qstr old_name = bch2_dirent_get_name(old); + struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bkey_dirent_init(&new->k_i); + dirent_copy_target(new, old); + new->k.p = old.k->p; + + for 
(unsigned i = 0; i < 1000; i++) { + unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", + old_name.len, old_name.name, i); + unsigned u64s = BKEY_U64s + dirent_val_u64s(len); + + if (u64s > U8_MAX) + return -EINVAL; + + new->k.u64s = u64s; + + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + (subvol_inum) { 0, old.k->p.inode }, + old.k->p.snapshot, &new->k_i, + BTREE_UPDATE_internal_snapshot_node); + if (!bch2_err_matches(ret, EEXIST)) + break; + } + + if (ret) + return ret; + + return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); +} + +static int hash_pick_winner(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c k1, + struct bkey_s_c k2) +{ + if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && + !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) + return 0; + + switch (desc.btree_id) { + case BTREE_ID_dirents: { + int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1)); + if (ret < 0) + return ret; + if (!ret) + return 0; + + ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2)); + if (ret < 0) + return ret; + if (!ret) + return 1; + return 2; + } + default: + return 0; + } +} + +int bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct printbuf buf = PRINTBUF; + struct bkey_s_c k; + u64 hash; + int ret = 0; + + if (hash_k.k->type != desc.key_type) + return 0; + + hash = desc.hash_bkey(hash_info, hash_k); + + if (likely(hash == hash_k.k->p.offset)) + return 0; + + if (hash_k.k->p.offset < hash) + goto bad_hash; + + for_each_btree_key_norestart(trans, iter, desc.btree_id, + SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), + BTREE_ITER_slots, k, ret) { + if (bkey_eq(k.k->p, hash_k.k->p)) + break; + + if (k.k->type == desc.key_type && + !desc.cmp_bkey(k, hash_k)) + goto duplicate_entries; + + if (bkey_deleted(k.k)) { + bch2_trans_iter_exit(trans, &iter); + goto bad_hash; + } + } +out: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +bad_hash: + if (fsck_err(trans, hash_table_key_wrong_offset, + "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", + bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); + if (IS_ERR(new)) + return PTR_ERR(new); + + k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, + (subvol_inum) { 0, hash_k.k->p.inode }, + hash_k.k->p.snapshot, new, + STR_HASH_must_create| + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node); + ret = bkey_err(k); + if (ret) + goto out; + if (k.k) + goto duplicate_entries; + + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, + BTREE_UPDATE_internal_snapshot_node) ?: + bch2_fsck_update_backpointers(trans, s, desc, hash_info, new) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + goto out; + } +fsck_err: + goto out; +duplicate_entries: + ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); + if (ret < 0) + goto out; + + if (!fsck_err(trans, hash_table_key_duplicate, + "duplicate hash table keys%s:\n%s", + ret != 2 ? 
"" : ", both point to valid inodes", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) + goto out; + + switch (ret) { + case 0: + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + break; + case 1: + ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); + break; + case 2: + ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: + bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + goto out; + } + + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; +} diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 00c785055d22..0c20f3af03f8 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -393,4 +393,11 @@ int bch2_hash_delete(struct btree_trans *trans, return ret; } +struct snapshots_seen; +int bch2_str_hash_check_key(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc, + struct bch_hash_info *, + struct btree_iter *, struct bkey_s_c); + #endif /* _BCACHEFS_STR_HASH_H */ -- cgit v1.2.3 From 58117dbdd6ef9ae2b61aedd15a005e0ef19957d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Dec 2024 12:35:17 -0500 Subject: bcachefs: Journal space calculations should skip durability=0 devices Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index b7936ad3ae7f..3c8242606da7 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -146,7 +146,8 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - if (!ca->journal.nr) + if (!ca->journal.nr || + !ca->mi.durability) continue; min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); -- cgit v1.2.3 From 821ddebbc2c43e43210b1bb9f2dab1f90944d8a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Dec 2024 20:11:16 -0500 Subject: bcachefs: fix bch2_btree_node_header_to_text() format string Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 9df9fc1c5e2b..d99f8a78d286 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -26,7 +26,7 @@ static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) { bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); - prt_printf(out, " seq %llux\n", bn->keys.seq); + prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn)); prt_str(out, "min: "); bch2_bpos_to_text(out, bn->min_key); prt_newline(out); -- cgit v1.2.3 From f65645d80451e2bc675d539dde8ce951b6a0640e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Dec 2024 19:49:46 -0500 Subject: bcachefs: Mark more errors autofix tested repairing from a bug uncovered by the merge_torture_flakey test Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 3bbda181f314..0bc4cec2926c 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -58,7 +58,7 @@ enum bch_fsck_flags { x(bset_empty, 45, 0) \ x(bset_bad_seq, 
46, 0) \ x(bset_blacklisted_journal_seq, 47, 0) \ - x(first_bset_blacklisted_journal_seq, 48, 0) \ + x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ x(btree_node_bad_btree, 49, 0) \ x(btree_node_bad_level, 50, 0) \ x(btree_node_bad_min_key, 51, 0) \ @@ -168,7 +168,7 @@ enum bch_fsck_flags { x(ptr_to_incorrect_stripe, 151, 0) \ x(ptr_gen_newer_than_bucket_gen, 152, 0) \ x(ptr_too_stale, 153, 0) \ - x(stale_dirty_ptr, 154, 0) \ + x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ x(ptr_bucket_data_type_mismatch, 155, 0) \ x(ptr_cached_and_erasure_coded, 156, 0) \ x(ptr_crc_uncompressed_size_too_small, 157, 0) \ -- cgit v1.2.3 From fbd152bf9469873b5d2a9aba2373e624eaa22a9f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Dec 2024 22:37:42 -0500 Subject: bcachefs: Minor bucket alloc optimization Check open buckets and buckets waiting for journal commit before doing other expensive lookups. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 55 +++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 49c9275465f9..57d5f14c93d0 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -200,14 +200,35 @@ static inline unsigned open_buckets_reserved(enum bch_watermark watermark) } } +static inline bool may_alloc_bucket(struct bch_fs *c, + struct bpos bucket, + struct bucket_alloc_state *s) +{ + if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { + s->skipped_open++; + return false; + } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) { + s->skipped_need_journal_commit++; + return false; + } + + if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { + s->skipped_nocow++; + return false; + } + + return true; +} + static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, u64 bucket, u8 gen, enum bch_watermark watermark, struct bucket_alloc_state *s, struct closure *cl) { - struct open_bucket *ob; - if (unlikely(is_superblock_bucket(c, ca, bucket))) return NULL; @@ -216,22 +237,6 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return NULL; } - if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { - s->skipped_open++; - return NULL; - } - - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { - s->skipped_need_journal_commit++; - return NULL; - } - - if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { - s->skipped_nocow++; - return NULL; - } - spin_lock(&c->freelist_lock); if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { @@ -250,10 +255,9 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return NULL; } - ob = bch2_open_bucket_alloc(c); + struct open_bucket *ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); - ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->dev = ca->dev_idx; @@ -279,8 +283,11 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc { struct bch_fs *c = trans->c; u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - u8 gen; + if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) + return NULL; + + u8 gen; int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); if (ret < 0) return ERR_PTR(ret); @@ -300,6 +307,7 @@ 
bch2_bucket_alloc_early(struct btree_trans *trans, struct bucket_alloc_state *s, struct closure *cl) { + struct bch_fs *c = trans->c; struct btree_iter iter, citer; struct bkey_s_c k, ck; struct open_bucket *ob = NULL; @@ -359,7 +367,10 @@ again: s->buckets_seen++; - ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, a->gen, watermark, s, cl); + ob = may_alloc_bucket(c, k.k->p, s) + ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, + watermark, s, cl) + : NULL; next: bch2_set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); -- cgit v1.2.3 From bbe36bd0993df2167b883d4af0b849a309350c38 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Dec 2024 19:23:22 -0500 Subject: bcachefs: Use a heap for handling overwrites in btree node scan Fix an O(n^2) issue when we find many overlapping (overwritten) btree nodes - especially when one node overwrites many smaller nodes. This was discovered to be an issue with the bcachefs merge_torture_flakey test - if we had a large btree that was then emptied, the number of difficult overwrites can be unbounded. Cc: Kuan-Wei Chiu Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_node_scan.c | 133 +++++++++++++++++++++++------------- fs/bcachefs/btree_node_scan_types.h | 1 - 2 files changed, 86 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index eeafb5e7354e..a7f06deee13c 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -12,6 +12,7 @@ #include "recovery_passes.h" #include +#include #include struct find_btree_nodes_worker { @@ -31,8 +32,6 @@ static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, con if (n->range_updated) prt_str(out, " range updated"); - if (n->overwritten) - prt_str(out, " overwritten"); for (unsigned i = 0; i < n->nr_ptrs; i++) { prt_char(out, ' '); @@ -140,6 +139,24 @@ static int found_btree_node_cmp_pos(const void *_l, const void *_r) -found_btree_node_cmp_time(l, r); } +static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg) +{ + return found_btree_node_cmp_pos(l, r) < 0; +} + +static inline void found_btree_node_swap(void *_l, void *_r, void *arg) +{ + struct found_btree_node *l = _l; + struct found_btree_node *r = _r; + + swap(*l, *r); +} + +static const struct min_heap_callbacks found_btree_node_heap_cbs = { + .less = found_btree_node_cmp_pos_less, + .swp = found_btree_node_swap, +}; + static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, struct bio *bio, struct btree_node *bn, u64 offset) { @@ -295,55 +312,48 @@ err: return f->ret ?: ret; } -static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) +static bool nodes_overlap(const struct found_btree_node *l, + const struct found_btree_node *r) { - while (n + 1 < end && - found_btree_node_cmp_pos(n, n + 1) > 0) { - swap(n[0], n[1]); - n++; - } + return (l->btree_id == r->btree_id && + l->level == r->level && + bpos_gt(l->max_key, r->min_key)); } static int handle_overwrites(struct bch_fs *c, - struct found_btree_node *start, - struct found_btree_node *end) + struct found_btree_node *l, + found_btree_nodes *nodes_heap) { - struct found_btree_node *n; -again: - for (n = start + 1; - n < end && - n->btree_id == start->btree_id && - n->level == start->level && - bpos_lt(n->min_key, start->max_key); - n++) { - int cmp = found_btree_node_cmp_time(start, n); + struct found_btree_node *r; + + while ((r = min_heap_peek(nodes_heap)) && + nodes_overlap(l, r)) 
{ + int cmp = found_btree_node_cmp_time(l, r); if (cmp > 0) { - if (bpos_cmp(start->max_key, n->max_key) >= 0) - n->overwritten = true; + if (bpos_cmp(l->max_key, r->max_key) >= 0) + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); else { - n->range_updated = true; - n->min_key = bpos_successor(start->max_key); - n->range_updated = true; - bubble_up(n, end); - goto again; + r->range_updated = true; + r->min_key = bpos_successor(l->max_key); + r->range_updated = true; + min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); } } else if (cmp < 0) { - BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); + BUG_ON(bpos_eq(l->min_key, r->min_key)); - start->max_key = bpos_predecessor(n->min_key); - start->range_updated = true; - } else if (n->level) { - n->overwritten = true; + l->max_key = bpos_predecessor(r->min_key); + l->range_updated = true; + } else if (r->level) { + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); } else { - if (bpos_cmp(start->max_key, n->max_key) >= 0) - n->overwritten = true; + if (bpos_cmp(l->max_key, r->max_key) >= 0) + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); else { - n->range_updated = true; - n->min_key = bpos_successor(start->max_key); - n->range_updated = true; - bubble_up(n, end); - goto again; + r->range_updated = true; + r->min_key = bpos_successor(l->max_key); + r->range_updated = true; + min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); } } } @@ -355,6 +365,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) { struct find_btree_nodes *f = &c->found_btree_nodes; struct printbuf buf = PRINTBUF; + found_btree_nodes nodes_heap = {}; size_t dst; int ret = 0; @@ -409,29 +420,57 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) bch2_print_string_as_lines(KERN_INFO, buf.buf); } - dst = 0; - darray_for_each(f->nodes, i) { - if (i->overwritten) - continue; + swap(nodes_heap, f->nodes); + + { + /* darray must have same layout as a heap */ + min_heap_char real_heap; + BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr)); + BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size)); + BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr)); + BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size)); + } - ret = handle_overwrites(c, i, &darray_top(f->nodes)); + min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL); + + if (nodes_heap.nr) { + ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); if (ret) goto err; - BUG_ON(i->overwritten); - f->nodes.data[dst++] = *i; + min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); } - f->nodes.nr = dst; - if (c->opts.verbose) { + while (true) { + ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap); + if (ret) + goto err; + + if (!nodes_heap.nr) + break; + + ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); + if (ret) + goto err; + + min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); + } + + for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++) + BUG_ON(nodes_overlap(n, n + 1)); + + if (0 && c->opts.verbose) { printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); bch2_print_string_as_lines(KERN_INFO, buf.buf); + } else { + bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); } eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); err: + darray_exit(&nodes_heap); 
printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h index b6c36c45d0be..2811b6857c97 100644 --- a/fs/bcachefs/btree_node_scan_types.h +++ b/fs/bcachefs/btree_node_scan_types.h @@ -6,7 +6,6 @@ struct found_btree_node { bool range_updated:1; - bool overwritten:1; u8 btree_id; u8 level; unsigned sectors_written; -- cgit v1.2.3 From 60558d55f7e26c8aa2242718461642792fa200a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Dec 2024 21:36:15 -0500 Subject: bcachefs: Plumb bkey_validate_context to journal_entry_validate This lets us print the exact location in the journal if it was found in the journal, or correctly print if it was found in the superblock. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_types.h | 12 ++--- fs/bcachefs/btree_trans_commit.c | 44 ++++++++----------- fs/bcachefs/error.c | 9 +++- fs/bcachefs/extents.c | 13 +++--- fs/bcachefs/journal_io.c | 95 ++++++++++++++++++++++------------------ fs/bcachefs/journal_io.h | 2 +- fs/bcachefs/sb-clean.c | 6 ++- 7 files changed, 97 insertions(+), 84 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h index 2af6279b02a9..b4f328f9853c 100644 --- a/fs/bcachefs/bkey_types.h +++ b/fs/bcachefs/bkey_types.h @@ -213,16 +213,16 @@ BCH_BKEY_TYPES(); enum bch_validate_flags { BCH_VALIDATE_write = BIT(0), BCH_VALIDATE_commit = BIT(1), - BCH_VALIDATE_journal = BIT(2), - BCH_VALIDATE_silent = BIT(3), + BCH_VALIDATE_silent = BIT(2), }; #define BKEY_VALIDATE_CONTEXTS() \ x(unknown) \ - x(commit) \ + x(superblock) \ x(journal) \ x(btree_root) \ - x(btree_node) + x(btree_node) \ + x(commit) struct bkey_validate_context { enum { @@ -230,10 +230,12 @@ struct bkey_validate_context { BKEY_VALIDATE_CONTEXTS() #undef x } from:8; + enum bch_validate_flags flags:8; u8 level; enum btree_id btree; bool root:1; - enum bch_validate_flags flags:8; + unsigned journal_offset; + u64 journal_seq; }; #endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 78d72c26083d..9011cc3f7190 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -719,38 +719,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto fatal_err; } - trans_for_each_update(trans, i) { - enum bch_validate_flags invalid_flags = 0; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - - ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), - (struct bkey_validate_context) { - .from = BKEY_VALIDATE_commit, - .level = i->level, - .btree = i->btree_id, - .flags = invalid_flags, - }); - if (unlikely(ret)){ - bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", - trans->fn, (void *) i->ip_allocated); - goto fatal_err; - } - btree_insert_entry_checks(trans, i); - } + struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit }; + + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; for (struct jset_entry *i = trans->journal_entries; i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); i = vstruct_next(i)) { - enum bch_validate_flags invalid_flags = 0; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - ret = bch2_journal_entry_validate(c, NULL, i, bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, invalid_flags); + 
CPU_BIG_ENDIAN, validate_context); if (unlikely(ret)) { bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", trans->fn); @@ -758,6 +737,19 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } } + trans_for_each_update(trans, i) { + validate_context.level = i->level; + validate_context.btree = i->btree_id; + + ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context); + if (unlikely(ret)){ + bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", + trans->fn, (void *) i->ip_allocated); + goto fatal_err; + } + btree_insert_entry_checks(trans, i); + } + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { struct journal *j = &c->journal; struct jset_entry *entry; diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 9e34374960f3..038da6a61f6b 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -486,9 +486,14 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, fsck_flags |= fsck_flags_extra[err]; struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "invalid bkey in %s btree=", + prt_printf(&buf, "invalid bkey in %s", bch2_bkey_validate_contexts[from.from]); + + if (from.from == BKEY_VALIDATE_journal) + prt_printf(&buf, " journal seq=%llu offset=%u", + from.journal_seq, from.journal_offset); + + prt_str(&buf, " btree="); bch2_btree_id_to_text(&buf, from.btree); prt_printf(&buf, " level=%u: ", from.level); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 2fc9ace5533c..05d5f71a7ca9 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1238,6 +1238,12 @@ static int extent_ptr_validate(struct bch_fs *c, { int ret = 0; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr2) + bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, + c, ptr_to_duplicate_device, + "multiple pointers to same device (%u)", ptr->dev); + /* bad pointers are repaired by check_fix_ptrs(): */ rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); @@ -1252,13 +1258,6 @@ static int extent_ptr_validate(struct bch_fs *c, unsigned bucket_size = ca->mi.bucket_size; rcu_read_unlock(); - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr2) - bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, - c, ptr_to_duplicate_device, - "multiple pointers to same device (%u)", ptr->dev); - - bkey_fsck_err_on(bucket >= nbuckets, c, ptr_after_last_bucket, "pointer past last bucket (%llu > %llu)", bucket, nbuckets); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 2f4daa8bd498..7f2efe85a805 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -301,7 +301,7 @@ static void journal_entry_err_msg(struct printbuf *out, journal_entry_err_msg(&_buf, version, jset, entry); \ prt_printf(&_buf, msg, ##__VA_ARGS__); \ \ - switch (flags & BCH_VALIDATE_write) { \ + switch (from.flags & BCH_VALIDATE_write) { \ case READ: \ mustfix_fsck_err(c, _err, "%s", _buf.buf); \ break; \ @@ -390,15 +390,12 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_i *k = entry->start; - struct bkey_validate_context from = { - .from = BKEY_VALIDATE_journal, - .level = entry->level, - .btree = entry->btree_id, - .flags = flags|BCH_VALIDATE_journal, - }; + + from.level = entry->level; + from.btree = entry->btree_id; while (k != vstruct_last(entry)) { int ret = 
journal_validate_key(c, jset, entry, k, from, version, big_endian); @@ -435,11 +432,15 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_i *k = entry->start; int ret = 0; + from.root = true; + from.level = entry->level + 1; + from.btree = entry->btree_id; + if (journal_entry_err_on(!entry->u64s || le16_to_cpu(entry->u64s) != k->k.u64s, c, version, jset, entry, @@ -456,13 +457,6 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, return 0; } - struct bkey_validate_context from = { - .from = BKEY_VALIDATE_journal, - .level = entry->level + 1, - .btree = entry->btree_id, - .root = true, - .flags = flags, - }; ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); if (ret == FSCK_DELETED_KEY) ret = 0; @@ -480,7 +474,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { /* obsolete, don't care: */ return 0; @@ -495,7 +489,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -522,7 +516,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; @@ -564,7 +558,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -598,7 +592,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -642,7 +636,7 @@ static int journal_entry_clock_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); @@ -682,7 +676,7 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); @@ -739,7 +733,7 @@ static int journal_entry_log_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } @@ -756,10 +750,11 @@ static int journal_entry_overwrite_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { + from.flags = 0; return journal_entry_btree_keys_validate(c, jset, entry, - 
version, big_endian, READ); + version, big_endian, from); } static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, @@ -772,10 +767,10 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, READ); + version, big_endian, from); } static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, @@ -788,7 +783,7 @@ static int journal_entry_datetime_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { unsigned bytes = vstruct_bytes(entry); unsigned expected = 16; @@ -818,7 +813,7 @@ static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs * struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bch_validate_flags); + struct bkey_validate_context); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -836,11 +831,11 @@ int bch2_journal_entry_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return entry->type < BCH_JSET_ENTRY_NR ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, - version, big_endian, flags) + version, big_endian, from) : 0; } @@ -858,10 +853,18 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, static int jset_validate_entries(struct bch_fs *c, struct jset *jset, enum bch_validate_flags flags) { + struct bkey_validate_context from = { + .flags = flags, + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; + unsigned version = le32_to_cpu(jset->version); int ret = 0; vstruct_for_each(jset, entry) { + from.journal_offset = (u64 *) entry - jset->_data; + if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), c, version, jset, entry, journal_entry_past_jset_end, @@ -870,8 +873,8 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset, break; } - ret = bch2_journal_entry_validate(c, jset, entry, - version, JSET_BIG_ENDIAN(jset), flags); + ret = bch2_journal_entry_validate(c, jset, entry, version, + JSET_BIG_ENDIAN(jset), from); if (ret) break; } @@ -884,13 +887,17 @@ static int jset_validate(struct bch_fs *c, struct jset *jset, u64 sector, enum bch_validate_flags flags) { - unsigned version; + struct bkey_validate_context from = { + .flags = flags, + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - version = le32_to_cpu(jset->version); + unsigned version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), c, version, jset, NULL, jset_unsupported_version, @@ -935,15 +942,16 @@ static int jset_validate_early(struct bch_fs *c, unsigned bucket_sectors_left, unsigned sectors_read) { - size_t bytes = vstruct_bytes(jset); - unsigned version; - enum bch_validate_flags flags = BCH_VALIDATE_journal; + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; 
- version = le32_to_cpu(jset->version); + unsigned version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), c, version, jset, NULL, jset_unsupported_version, @@ -956,6 +964,7 @@ static int jset_validate_early(struct bch_fs *c, return -EINVAL; } + size_t bytes = vstruct_bytes(jset); if (bytes > (sectors_read << 9) && sectors_read < bucket_sectors_left) return JOURNAL_ENTRY_REREAD; @@ -1240,8 +1249,6 @@ int bch2_journal_read(struct bch_fs *c, * those entries will be blacklisted: */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - enum bch_validate_flags flags = BCH_VALIDATE_journal; - i = *_i; if (journal_replay_ignore(i)) @@ -1261,6 +1268,10 @@ int bch2_journal_read(struct bch_fs *c, continue; } + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(i->j.seq), + }; if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), c, le32_to_cpu(i->j.version), &i->j, NULL, jset_last_seq_newer_than_seq, diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 2ca9cde30ea8..12b39fcb4424 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -63,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, int bch2_journal_entry_validate(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 005275281804..59c8770e4a0e 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -23,6 +23,10 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) { + struct bkey_validate_context from = { + .flags = write, + .from = BKEY_VALIDATE_superblock, + }; struct jset_entry *entry; int ret; @@ -40,7 +44,7 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle ret = bch2_journal_entry_validate(c, NULL, entry, le16_to_cpu(c->disk_sb.sb->version), BCH_SB_BIG_ENDIAN(c->disk_sb.sb), - write); + from); if (ret) return ret; } -- cgit v1.2.3 From 44a43cf9fdccc3576b1f2a96dc3e0dc87796bedb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Dec 2024 12:35:43 -0500 Subject: bcachefs: Don't add unknown accounting types to eytzinger tree Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 16 ++++++++++++++++ fs/bcachefs/disk_accounting.h | 8 +++++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index a0061bcf9159..b18cbe80936b 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -729,6 +729,16 @@ int bch2_accounting_read(struct bch_fs *c) BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); + + if (k.k->type != KEY_TYPE_accounting) + continue; + + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + if (!bch2_accounting_is_mem(acc_k)) + continue; + accounting_read_key(trans, k); })); if (ret) @@ -740,6 +750,12 @@ int bch2_accounting_read(struct bch_fs *c) darray_for_each(*keys, i) { if (i->k->k.type == KEY_TYPE_accounting) { + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); + + if (!bch2_accounting_is_mem(acc_k)) + continue; + struct 
bkey_s_c k = bkey_i_to_s_c(i->k); unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 566aa2a8539d..0eeaca12c589 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -114,6 +114,12 @@ enum bch_accounting_mode { int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); void bch2_accounting_mem_gc(struct bch_fs *); +static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) +{ + return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && + acc.type != BCH_DISK_ACCOUNTING_inum; +} + /* * Update in memory counters so they match the btree update we're doing; called * from transaction commit path @@ -130,7 +136,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, EBUG_ON(gc && !acc->gc_running); - if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + if (!bch2_accounting_is_mem(acc_k)) return 0; if (mode == BCH_ACCOUNTING_normal) { -- cgit v1.2.3 From be565740ee84798b5b2d5ab88070d887fd77c1a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Dec 2024 20:43:07 -0500 Subject: bcachefs: Set bucket needs discard, inc gen on empty -> nonempty transition Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b2d570453351..62069231c63b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -856,7 +856,10 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (flags & BTREE_TRIGGER_transactional) { alloc_data_type_set(new_a, new_a->data_type); - if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { + int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - + (int) data_type_is_empty(old_a->data_type); + + if (is_empty_delta < 0) { new_a->io_time[READ] = bch2_current_io_time(c, READ); new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); -- cgit v1.2.3 From 89e74eccab9248d37bf329c66af0b7d4e23eac12 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 00:28:16 -0500 Subject: bcachefs: bch2_journal_noflush_seq() now takes [start, end) Harder to screw up if we're explicit about the range, and more correct as journal reservations can be outstanding on multiple journal entries simultaneously. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 +++- fs/bcachefs/journal.c | 11 ++++++----- fs/bcachefs/journal.h | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 62069231c63b..9ae567402b03 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -953,7 +953,9 @@ int bch2_trigger_alloc(struct btree_trans *trans, */ if (is_empty_delta > 0) { if (new_a->journal_seq == transaction_seq || - bch2_journal_noflush_seq(&c->journal, new_a->journal_seq)) + bch2_journal_noflush_seq(&c->journal, + new_a->journal_seq, + transaction_seq)) new_a->journal_seq = 0; else { new_a->journal_seq = transaction_seq; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 04a9ccf76d75..2cd20114b74b 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -807,10 +807,11 @@ int bch2_journal_flush(struct journal *j) } /* - * bch2_journal_noflush_seq - tell the journal not to issue any flushes before + * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the + * range [start, end) * @seq */ -bool bch2_journal_noflush_seq(struct journal *j, u64 seq) +bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) { struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 unwritten_seq; @@ -819,15 +820,15 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) return false; - if (seq <= c->journal.flushed_seq_ondisk) + if (c->journal.flushed_seq_ondisk >= start) return false; spin_lock(&j->lock); - if (seq <= c->journal.flushed_seq_ondisk) + if (c->journal.flushed_seq_ondisk >= start) goto out; for (unwritten_seq = journal_last_unwritten_seq(j); - unwritten_seq < seq; + unwritten_seq < end; unwritten_seq++) { struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index a6a2e888c59b..cb0df0663946 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -404,7 +404,7 @@ void bch2_journal_flush_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64, unsigned); int bch2_journal_flush(struct journal *); -bool bch2_journal_noflush_seq(struct journal *, u64); +bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); -- cgit v1.2.3 From 9e779f3f24fbca1594bcd70996426f3b84873bc8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Dec 2024 23:15:05 -0500 Subject: bcachefs: Fix reuse of bucket before journal flush on multiple empty -> nonempty transition For each bucket we track when the bucket became nonempty and when it became empty again: if we can ensure that there will be no journal flushes in the range [nonempty, empty) (possibly because they occured at the same journal sequence number), then it's safe to reuse the bucket without waiting for a journal commit. This is a major performance optimization for erasure coding, where writes are initially replicated, but the extra replicas are quickly dropped: if those buckets are reused and overwritten without issuing a cache flush to the underlying device, then they only cost bus bandwidth. But there's a tricky corner case when there's multiple empty -> nonempty -> empty transitions in quick succession, i.e. when data is getting overwritten immediately as it's being written. 
If this happens and the previous empty transition hasn't been flushed, we need to continue tracking the previous nonempty transition - not start a new one. Fixing this means we now need to track both the nonempty and empty transitions in bch_alloc_v4. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 78 ++++++++++++++++++----------------- fs/bcachefs/alloc_background_format.h | 4 +- 2 files changed, 42 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9ae567402b03..94e7bc889cb1 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -323,7 +323,8 @@ void bch2_alloc_v4_swab(struct bkey_s k) { struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; - a->journal_seq = swab64(a->journal_seq); + a->journal_seq_nonempty = swab64(a->journal_seq_nonempty); + a->journal_seq_empty = swab64(a->journal_seq_empty); a->flags = swab32(a->flags); a->dirty_sectors = swab32(a->dirty_sectors); a->cached_sectors = swab32(a->cached_sectors); @@ -346,16 +347,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); bch2_prt_data_type(out, a->data_type); prt_newline(out); - prt_printf(out, "journal_seq %llu\n", a->journal_seq); - prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); - prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); - prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); - prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); - prt_printf(out, "cached_sectors %u\n", a->cached_sectors); - prt_printf(out, "stripe %u\n", a->stripe); - prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); - prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); - prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); + prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty); + prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); + prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); + prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); + prt_printf(out, "cached_sectors %u\n", a->cached_sectors); + prt_printf(out, "stripe %u\n", a->stripe); + prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); + prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); + prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); if (ca) prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca)); @@ -384,7 +386,7 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); *out = (struct bch_alloc_v4) { - .journal_seq = u.journal_seq, + .journal_seq_nonempty = u.journal_seq, .flags = u.need_discard, .gen = u.gen, .oldest_gen = u.oldest_gen, @@ -930,20 +932,29 @@ int bch2_trigger_alloc(struct btree_trans *trans, if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { u64 transaction_seq = trans->journal_res.seq; + BUG_ON(!transaction_seq); - if (log_fsck_err_on(transaction_seq && new_a->journal_seq > transaction_seq, + if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq, trans, alloc_key_journal_seq_in_future, "bucket journal seq in future (currently at %llu)\n%s", journal_cur_seq(&c->journal), (bch2_bkey_val_to_text(&buf, 
c, new.s_c), buf.buf))) - new_a->journal_seq = transaction_seq; + new_a->journal_seq_nonempty = transaction_seq; int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - (int) data_type_is_empty(old_a->data_type); - /* Record journal sequence number of empty -> nonempty transition: */ - if (is_empty_delta < 0) - new_a->journal_seq = max(new_a->journal_seq, transaction_seq); + /* + * Record journal sequence number of empty -> nonempty transition: + * Note that there may be multiple empty -> nonempty + * transitions, data in a bucket may be overwritten while we're + * still writing to it - so be careful to only record the first: + * */ + if (is_empty_delta < 0 && + new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) { + new_a->journal_seq_nonempty = transaction_seq; + new_a->journal_seq_empty = 0; + } /* * Bucket becomes empty: mark it as waiting for a journal flush, @@ -952,20 +963,21 @@ int bch2_trigger_alloc(struct btree_trans *trans, * intermediate sequence numbers: */ if (is_empty_delta > 0) { - if (new_a->journal_seq == transaction_seq || + if (new_a->journal_seq_nonempty == transaction_seq || bch2_journal_noflush_seq(&c->journal, - new_a->journal_seq, - transaction_seq)) - new_a->journal_seq = 0; - else { - new_a->journal_seq = transaction_seq; + new_a->journal_seq_nonempty, + transaction_seq)) { + new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0; + } else { + new_a->journal_seq_empty = transaction_seq; ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - new.k->p.inode, new.k->p.offset, - transaction_seq); + c->journal.flushed_seq_ondisk, + new.k->p.inode, new.k->p.offset, + transaction_seq); if (bch2_fs_fatal_err_on(ret, c, - "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) + "setting bucket_needs_journal_commit: %s", + bch2_err_str(ret))) goto err; } } @@ -983,7 +995,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) -#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) +#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk) if (statechange(a->data_type == BCH_DATA_free) && bucket_flushed(new_a)) @@ -1845,16 +1857,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { - if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, - trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", - a->v.journal_seq, - c->journal.flushed_seq_ondisk, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = -EIO; - goto out; - } - if (!fastpath) { if (discard_in_flight_add(ca, iter.pos.offset, true)) goto out; diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h index befdaa95c515..740238369a5a 100644 --- a/fs/bcachefs/alloc_background_format.h +++ b/fs/bcachefs/alloc_background_format.h @@ -58,7 +58,7 @@ LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) struct bch_alloc_v4 { struct bch_val v; - __u64 journal_seq; + __u64 journal_seq_nonempty; __u32 flags; __u8 gen; __u8 oldest_gen; @@ -70,7 +70,7 @@ struct bch_alloc_v4 { __u32 stripe; __u32 nr_external_backpointers; /* end of fields in original version of alloc_v4 */ - __u64 _fragmentation_lru; /* obsolete */ + __u64 journal_seq_empty; 
__u32 stripe_sectors; __u32 pad; } __packed __aligned(8); -- cgit v1.2.3 From 54dacdada6de8d97e7ca7e51eadc96c61032bdb4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Dec 2024 06:00:33 -0500 Subject: bcachefs: Don't start rewriting btree nodes until after journal replay This fixes a deadlock during journal replay when btree node read errors kick off a ton of rewrites: we don't want them competing with journal replay. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7d9dab95bdcf..03a6eba7403d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2291,7 +2291,8 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bool now = false, pending = false; spin_lock(&c->btree_node_rewrites_lock); - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && + bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { list_add(&a->list, &c->btree_node_rewrites); now = true; } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { -- cgit v1.2.3 From 49f2d182638a54ecda9cb35ede5224f8d9f5f2e6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 04:11:21 -0500 Subject: bcachefs: Kill unnecessary mark_lock usage We can't hold mark_lock while calling fsck_err() - that's a deadlock, mark_lock is meant to be a leaf node lock. It's also unnecessary for gc_bucket() and bucket_gen(); rcu suffices since the bucket_gens array describes its size, and we can't race with device removal or resize during gc/fsck since that takes state lock. Reported-by: syzbot+38641fcbda1aaffefdd4@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 4 ---- fs/bcachefs/bcachefs.h | 6 ++---- fs/bcachefs/btree_gc.c | 7 ------- fs/bcachefs/buckets.c | 40 +++++++++++----------------------------- fs/bcachefs/buckets.h | 9 ++++----- fs/bcachefs/ec.c | 6 ++---- fs/bcachefs/errcode.h | 1 + fs/bcachefs/super.c | 2 -- 8 files changed, 20 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 57d5f14c93d0..6df41c331a52 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -107,14 +107,10 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) return; } - percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - ob->valid = false; ob->data_type = 0; - spin_unlock(&ob->lock); - percpu_up_read(&c->mark_lock); spin_lock(&c->freelist_lock); bch2_open_bucket_hash_remove(c, ob); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e6cd93e1ed0f..3a3cb79d8518 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -547,15 +547,13 @@ struct bch_dev { /* * Buckets: - * Per-bucket arrays are protected by c->mark_lock, bucket_lock and - * gc_gens_lock, for device resize - holding any is sufficient for - * access: Or rcu_read_lock(), but only for dev_ptr_stale(): + * Per-bucket arrays are protected by either rcu_read_lock or + * state_lock, for device resize. 
*/ GENRADIX(struct bucket) buckets_gc; struct bucket_gens __rcu *bucket_gens; u8 *oldest_gen; unsigned long *buckets_nouse; - struct rw_semaphore bucket_lock; struct bch_dev_usage __percpu *usage; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 24f2f3bdf704..e5ba7d1429b9 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -811,7 +811,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, old = bch2_alloc_to_v4(k, &old_convert); gc = new = *old; - percpu_down_read(&c->mark_lock); __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset)); old_gc = gc; @@ -822,7 +821,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, gc.data_type = old->data_type; gc.dirty_sectors = old->dirty_sectors; } - percpu_up_read(&c->mark_lock); /* * gc.data_type doesn't yet include need_discard & need_gc_gen states - @@ -840,11 +838,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans, * safe w.r.t. transaction restarts, so fixup the gc_bucket so * we don't run it twice: */ - percpu_down_read(&c->mark_lock); struct bucket *gc_m = gc_bucket(ca, iter->pos.offset); gc_m->data_type = gc.data_type; gc_m->dirty_sectors = gc.dirty_sectors; - percpu_up_read(&c->mark_lock); } if (fsck_err_on(new.data_type != gc.data_type, @@ -1088,7 +1084,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) return -EROFS; - percpu_down_read(&c->mark_lock); rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); @@ -1097,7 +1092,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, if (dev_ptr_stale(ca, ptr) > 16) { rcu_read_unlock(); - percpu_up_read(&c->mark_lock); goto update; } } @@ -1112,7 +1106,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, *gen = ptr->gen; } rcu_read_unlock(); - percpu_up_read(&c->mark_lock); return 0; update: u = bch2_bkey_make_mut(trans, iter, &k, 0); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index afd35c93fcfb..eb2ed4edbbbc 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -262,8 +262,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - percpu_down_read(&c->mark_lock); - bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); if (ret) @@ -364,7 +362,6 @@ found: bch_info(c, "new key %s", buf.buf); } - percpu_up_read(&c->mark_lock); struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, BTREE_ITER_intent|BTREE_ITER_all_snapshots); @@ -373,8 +370,6 @@ found: BTREE_UPDATE_internal_snapshot_node| BTREE_TRIGGER_norun); bch2_trans_iter_exit(trans, &iter); - percpu_down_read(&c->mark_lock); - if (ret) goto err; @@ -382,7 +377,6 @@ found: bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); } err: - percpu_up_read(&c->mark_lock); printbuf_exit(&buf); return ret; } @@ -603,13 +597,12 @@ static int bch2_trigger_pointer(struct btree_trans *trans, } if (flags & BTREE_TRIGGER_gc) { - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", p.ptr.dev, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = -BCH_ERR_trigger_pointer; - goto err_unlock; + goto err; } bucket_lock(g); @@ -617,8 +610,6 @@ static int bch2_trigger_pointer(struct btree_trans *trans, ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new); alloc_to_bucket(g, new); 
bucket_unlock(g); -err_unlock: - percpu_up_read(&c->mark_lock); if (!ret) ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); @@ -996,11 +987,10 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * struct bch_fs *c = trans->c; int ret = 0; - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, b); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", ca->dev_idx, bch2_data_type_str(data_type))) - goto err_unlock; + goto err; bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g); @@ -1010,26 +1000,24 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * "different types of data in same bucket: %s, %s", bch2_data_type_str(g->data_type), bch2_data_type_str(data_type))) - goto err; + goto err_unlock; if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", ca->dev_idx, b, g->gen, bch2_data_type_str(g->data_type ?: data_type), g->dirty_sectors, sectors)) - goto err; + goto err_unlock; g->data_type = data_type; g->dirty_sectors += sectors; struct bch_alloc_v4 new = bucket_m_to_alloc(*g); bucket_unlock(g); - percpu_up_read(&c->mark_lock); ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); return ret; -err: - bucket_unlock(g); err_unlock: - percpu_up_read(&c->mark_lock); + bucket_unlock(g); +err: return -BCH_ERR_metadata_bucket_inconsistency; } @@ -1295,7 +1283,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bool resize = ca->bucket_gens != NULL; int ret; - BUG_ON(resize && ca->buckets_nouse); + if (resize) + lockdep_assert_held(&c->state_lock); + + if (resize && ca->buckets_nouse) + return -BCH_ERR_no_resize_with_buckets_nouse; bucket_gens = kvmalloc(struct_size(bucket_gens, b, nbuckets), GFP_KERNEL|__GFP_ZERO); @@ -1309,11 +1301,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bucket_gens->nbuckets_minus_first = bucket_gens->nbuckets - bucket_gens->first_bucket; - if (resize) { - down_write(&ca->bucket_lock); - percpu_down_write(&c->mark_lock); - } - old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { @@ -1331,11 +1318,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) nbuckets = ca->mi.nbuckets; - if (resize) { - percpu_up_write(&c->mark_lock); - up_write(&ca->bucket_lock); - } - ret = 0; err: if (bucket_gens) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 3bebc4c3044f..a9acdd6c0c86 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -82,16 +82,15 @@ static inline void bucket_lock(struct bucket *b) static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { - return genradix_ptr(&ca->buckets_gc, b); + return bucket_valid(ca, b) + ? 
genradix_ptr(&ca->buckets_gc, b) + : NULL; } static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) { return rcu_dereference_check(ca->bucket_gens, - !ca->fs || - percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->state_lock) || - lockdep_is_held(&ca->bucket_lock)); + lockdep_is_held(&ca->fs->state_lock)); } static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index df9c0e453391..3e7e41cd8380 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -305,13 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans, } if (flags & BTREE_TRIGGER_gc) { - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", ptr->dev, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -BCH_ERR_mark_stripe; - goto err_unlock; + goto err; } bucket_lock(g); @@ -319,8 +318,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); alloc_to_bucket(g, new); bucket_unlock(g); -err_unlock: - percpu_up_read(&c->mark_lock); + if (!ret) ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 5e4dd85ac669..a6a9561a890d 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -195,6 +195,7 @@ x(EINVAL, opt_parse_error) \ x(EINVAL, remove_with_metadata_missing_unimplemented)\ x(EINVAL, remove_would_lose_data) \ + x(EINVAL, no_resize_with_buckets_nouse) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 14157820705d..2b2e0835c8fe 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1311,8 +1311,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, init_completion(&ca->ref_completion); init_completion(&ca->io_ref_completion); - init_rwsem(&ca->bucket_lock); - INIT_WORK(&ca->io_error_work, bch2_io_error_work); bch2_time_stats_quantiles_init(&ca->io_latency[READ]); -- cgit v1.2.3 From cd150cf9240225958a72d258d8596055459208a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 20:55:03 -0500 Subject: bcachefs: kill sysfs internal/accounting Since we added per-inode counters there's now far too many counters to show in one shot - if we want this in the future, it'll have to be in debugfs. 
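For illustration only, a minimal sketch of the debugfs route the message above points at - this is not code from this series, "c->fs_debug_dir" and the counter-walking loop are assumptions, and only the seq_file/debugfs plumbing is standard kernel API:

#include <linux/debugfs.h>
#include <linux/seq_file.h>

static int accounting_show(struct seq_file *m, void *unused)
{
	struct bch_fs *c = m->private;

	/* walk the in-memory accounting counters and seq_printf() one line per key */
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(accounting);

/* at fs init time, assuming a per-fs debugfs directory exists: */
debugfs_create_file("accounting", 0444, c->fs_debug_dir, c, &accounting_fops);

Unlike a sysfs attribute, a seq_file-backed debugfs file has no practical size limit, which is why an unbounded per-inode counter dump would belong there.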
Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 26 -------------------------- fs/bcachefs/disk_accounting.h | 1 - fs/bcachefs/sysfs.c | 5 ----- 3 files changed, 32 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index b18cbe80936b..22a7db63e50c 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -471,32 +471,6 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc return ret; } -void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - - percpu_down_read(&c->mark_lock); - out->atomic++; - - eytzinger0_for_each(i, acc->k.nr) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos); - - bch2_accounting_key_to_text(out, &acc_k); - - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); - - prt_str(out, ":"); - for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) - prt_printf(out, " %llu", v[j]); - prt_newline(out); - } - - --out->atomic; - percpu_up_read(&c->mark_lock); -} - static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) { darray_for_each(acc->k, e) { diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 0eeaca12c589..2560de10b09d 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -251,7 +251,6 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); -void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); int bch2_gc_accounting_start(struct bch_fs *); int bch2_gc_accounting_done(struct bch_fs *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 97733c766948..48bc6ad03f09 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -203,7 +203,6 @@ read_attribute(disk_groups); read_attribute(has_data); read_attribute(alloc_debug); -read_attribute(accounting); read_attribute(usage_base); #define x(t, n, ...) 
read_attribute(t); @@ -397,9 +396,6 @@ SHOW(bch2_fs) if (attr == &sysfs_alloc_debug) bch2_fs_alloc_debug_to_text(out, c); - if (attr == &sysfs_accounting) - bch2_fs_accounting_to_text(out, c); - if (attr == &sysfs_usage_base) bch2_fs_usage_base_to_text(out, c); @@ -595,7 +591,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_disk_groups, &sysfs_alloc_debug, - &sysfs_accounting, &sysfs_usage_base, NULL }; -- cgit v1.2.3 From 7b11260456ed49ac14be623400b5ebbb847b71de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 21:42:49 -0500 Subject: bcachefs: Use proper errcodes for inode unpack errors Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 3 +++ fs/bcachefs/inode.c | 12 ++++++------ fs/bcachefs/varint.c | 5 +++-- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a6a9561a890d..5d17ceb1e83a 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -196,6 +196,8 @@ x(EINVAL, remove_with_metadata_missing_unimplemented)\ x(EINVAL, remove_would_lose_data) \ x(EINVAL, no_resize_with_buckets_nouse) \ + x(EINVAL, inode_unpack_error) \ + x(EINVAL, varint_decode_error) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -313,6 +315,7 @@ static inline long bch2_err_class(long err) #define BLK_STS_REMOVED ((__force blk_status_t)128) +#include const char *bch2_blk_status_to_str(blk_status_t); #endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 8818e41883f2..f6245b78eb78 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -48,10 +48,10 @@ static int inode_decode_field(const u8 *in, const u8 *end, u8 *p; if (in >= end) - return -1; + return -BCH_ERR_inode_unpack_error; if (!*in) - return -1; + return -BCH_ERR_inode_unpack_error; /* * position of highest set bit indicates number of bytes: @@ -61,7 +61,7 @@ static int inode_decode_field(const u8 *in, const u8 *end, bytes = byte_table[shift - 1]; if (in + bytes > end) - return -1; + return -BCH_ERR_inode_unpack_error; p = (u8 *) be + 16 - bytes; memcpy(p, in, bytes); @@ -177,7 +177,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, return ret; \ \ if (field_bits > sizeof(unpacked->_name) * 8) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ \ unpacked->_name = field[1]; \ in += ret; @@ -218,7 +218,7 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, \ unpacked->_name = v[0]; \ if (v[1] || v[0] != unpacked->_name) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ fieldnr++; BCH_INODE_FIELDS_v2() @@ -269,7 +269,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k, \ unpacked->_name = v[0]; \ if (v[1] || v[0] != unpacked->_name) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ fieldnr++; BCH_INODE_FIELDS_v3() diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c index 6a78553d9b0c..6620ecae26af 100644 --- a/fs/bcachefs/varint.c +++ b/fs/bcachefs/varint.c @@ -9,6 +9,7 @@ #include #endif +#include "errcode.h" #include "varint.h" /** @@ -53,7 +54,7 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) u64 v; if (unlikely(in + bytes > end)) - return -1; + return -BCH_ERR_varint_decode_error; if (likely(bytes < 9)) { __le64 v_le = 0; @@ -115,7 +116,7 @@ int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) unsigned bytes = ffz(*in) + 1; if (unlikely(in + bytes > end)) - return -1; + return -BCH_ERR_varint_decode_error; if (likely(bytes < 9)) { v >>= 
bytes; -- cgit v1.2.3 From 644457ed8315c8018a5a8c16fdee9acce5cfef27 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 22:00:36 -0500 Subject: bcachefs: Don't BUG_ON() inode unpack error Bkey validation checks that inodes are well-formed and unpack successfully, so an unpack error should always indicate memory corruption or some other kind of hardware bug - but these are still errors we can recover from. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 37 +++++++++++++++++++++++++------------ fs/bcachefs/move.c | 4 +++- 2 files changed, 28 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 22a33b9ba30d..1b887f332b74 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -458,7 +458,9 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * continue; struct bch_inode_unpacked child_inode; - bch2_inode_unpack(k, &child_inode); + ret = bch2_inode_unpack(k, &child_inode); + if (ret) + break; if (!inode_should_reattach(&child_inode)) { ret = maybe_delete_dirent(trans, @@ -809,9 +811,8 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, { struct bch_inode_unpacked u; - BUG_ON(bch2_inode_unpack(inode, &u)); - - return darray_push(&w->inodes, ((struct inode_walker_entry) { + return bch2_inode_unpack(inode, &u) ?: + darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, .snapshot = inode.k->p.snapshot, })); @@ -1065,7 +1066,7 @@ static int get_snapshot_root_inode(struct btree_trans *trans, goto err; BUG(); found_root: - BUG_ON(bch2_inode_unpack(k, root)); + ret = bch2_inode_unpack(k, root); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1096,7 +1097,9 @@ static int check_inode(struct btree_trans *trans, if (!bkey_is_inode(k.k)) return 0; - BUG_ON(bch2_inode_unpack(k, &u)); + ret = bch2_inode_unpack(k, &u); + if (ret) + goto err; if (snapshot_root->bi_inum != u.bi_inum) { ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); @@ -1318,7 +1321,9 @@ static int find_oldest_inode_needs_reattach(struct btree_trans *trans, break; struct bch_inode_unpacked parent_inode; - bch2_inode_unpack(k, &parent_inode); + ret = bch2_inode_unpack(k, &parent_inode); + if (ret) + break; if (!inode_should_reattach(&parent_inode)) break; @@ -1341,7 +1346,9 @@ static int check_unreachable_inode(struct btree_trans *trans, return 0; struct bch_inode_unpacked inode; - BUG_ON(bch2_inode_unpack(k, &inode)); + ret = bch2_inode_unpack(k, &inode); + if (ret) + return ret; if (!inode_should_reattach(&inode)) return 0; @@ -2603,14 +2610,16 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino { struct bch_fs *c = trans->c; struct btree_iter inode_iter = {}; - struct bch_inode_unpacked inode; struct printbuf buf = PRINTBUF; u32 snapshot = inode_k.k->p.snapshot; int ret = 0; p->nr = 0; - BUG_ON(bch2_inode_unpack(inode_k, &inode)); + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(inode_k, &inode); + if (ret) + return ret; if (!S_ISDIR(inode.bi_mode)) return 0; @@ -2810,7 +2819,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, /* Should never fail, checked by bch2_inode_invalid: */ struct bch_inode_unpacked u; - BUG_ON(bch2_inode_unpack(k, &u)); + _ret3 = bch2_inode_unpack(k, &u); + if (_ret3) + break; /* * Backpointer and directory structure checks are sufficient for @@ -2888,7 +2899,9 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite if (!bkey_is_inode(k.k)) return 0; - 
BUG_ON(bch2_inode_unpack(k, &u)); + ret = bch2_inode_unpack(k, &u); + if (ret) + return ret; if (S_ISDIR(u.bi_mode)) return 0; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 6f21e36d89f7..6d38afcaaaab 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -412,7 +412,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, continue; struct bch_inode_unpacked inode; - BUG_ON(bch2_inode_unpack(k, &inode)); + _ret3 = bch2_inode_unpack(k, &inode); + if (_ret3) + break; struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; bch2_inode_opts_get(&e.io_opts, trans->c, &inode); -- cgit v1.2.3 From 6ea607ca61475f9281191dab9d5f8a61baee1c4f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 21:47:34 -0500 Subject: bcachefs: bch2_str_hash_check_key() now checks inode hash info Versions of the same inode in different snapshots must have the same hash info; this is critical for lookups to work correctly. We're going to be running the str_hash checks online, at readdir or xattr list time, so we now need str_hash_check_key() to check for inode hash seed mismatches, since it won't be run right after check_inodes(). Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 6 +-- fs/bcachefs/str_hash.c | 127 +++++++++++++++++++++++++++++++++++++++---------- fs/bcachefs/str_hash.h | 25 ++++++++-- 3 files changed, 125 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 1b887f332b74..b8ced64cce2c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1110,7 +1110,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), trans, inode_snapshot_mismatch, - "inodes in different snapshots don't match")) { + "inode hash info in different snapshots don't match")) { u.bi_hash_seed = snapshot_root->bi_hash_seed; SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); do_update = true; @@ -2303,7 +2303,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) goto err; if (ret) { @@ -2417,7 +2417,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); inode->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index c3276a7e7324..ed3c852fc0be 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -101,38 +101,108 @@ static int hash_pick_winner(struct btree_trans *trans, } } -int bch2_str_hash_check_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) +static int repair_inode_hash_info(struct btree_trans *trans, + struct bch_inode_unpacked *snapshot_root) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 
1), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != snapshot_root->bi_inum) + break; + if (!bkey_is_inode(k.k)) + continue; + + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(k, &inode); + if (ret) + break; + + if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), + trans, inode_snapshot_mismatch, + "inode hash info in different snapshots don't match")) { + inode.bi_hash_seed = snapshot_root->bi_hash_seed; + SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); + ret = __bch2_fsck_write_inode(trans, &inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + break; + } + } +fsck_err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * All versions of the same inode in different snapshots must have the same hash + * seed/type: verify that the hash info we're using matches the root + */ +static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, + struct bch_hash_info *hash_info) { struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; - struct printbuf buf = PRINTBUF; + struct btree_iter iter; struct bkey_s_c k; - u64 hash; int ret = 0; - if (hash_k.k->type != desc.key_type) - return 0; + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) + goto found; + } + bch_err(c, "%s(): inum %llu not found", __func__, inum); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; +found: + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(k, &inode); + if (ret) + goto err; - hash = desc.hash_bkey(hash_info, hash_k); + struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); + if (memcmp(hash_info, &hash2, sizeof(hash2))) { + ret = repair_inode_hash_info(trans, &inode); + if (!ret) { + bch_err(c, "inode hash info mismatch with root, but mismatch not found"); + ret = -BCH_ERR_fsck_repair_unimplemented; + } + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} - if (likely(hash == hash_k.k->p.offset)) - return 0; +int __bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct printbuf buf = PRINTBUF; + struct bkey_s_c k; + int ret = 0; + u64 hash = desc->hash_bkey(hash_info, hash_k); if (hash_k.k->p.offset < hash) goto bad_hash; - for_each_btree_key_norestart(trans, iter, desc.btree_id, + for_each_btree_key_norestart(trans, iter, desc->btree_id, SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), BTREE_ITER_slots, k, ret) { if (bkey_eq(k.k->p, hash_k.k->p)) break; - if (k.k->type == desc.key_type && - !desc.cmp_bkey(k, hash_k)) + if (k.k->type == desc->key_type && + !desc->cmp_bkey(k, hash_k)) goto duplicate_entries; if (bkey_deleted(k.k)) { @@ -145,16 +215,23 @@ out: printbuf_exit(&buf); return ret; bad_hash: + /* + * Before doing any repair, check hash_info itself: + */ + ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info); + if (ret) + goto out; + if (fsck_err(trans, hash_table_key_wrong_offset, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", - bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, + 
bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); if (IS_ERR(new)) return PTR_ERR(new); - k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, + k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info, (subvol_inum) { 0, hash_k.k->p.inode }, hash_k.k->p.snapshot, new, STR_HASH_must_create| @@ -166,9 +243,9 @@ bad_hash: if (k.k) goto duplicate_entries; - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_fsck_update_backpointers(trans, s, desc, hash_info, new) ?: + bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: -BCH_ERR_transaction_restart_nested; goto out; @@ -176,7 +253,7 @@ bad_hash: fsck_err: goto out; duplicate_entries: - ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); + ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k); if (ret < 0) goto out; @@ -192,14 +269,14 @@ duplicate_entries: switch (ret) { case 0: - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); break; case 1: - ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); + ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0); break; case 2: - ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: - bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: + bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); goto out; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 0c20f3af03f8..55a4ac7bf220 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -394,10 +394,25 @@ int bch2_hash_delete(struct btree_trans *trans, } struct snapshots_seen; -int bch2_str_hash_check_key(struct btree_trans *, - struct snapshots_seen *, - const struct bch_hash_desc, - struct bch_hash_info *, - struct btree_iter *, struct bkey_s_c); +int __bch2_str_hash_check_key(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc *, + struct bch_hash_info *, + struct btree_iter *, struct bkey_s_c); + +static inline int bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) +{ + if (hash_k.k->type != desc->key_type) + return 0; + + if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) + return 0; + + return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k); +} #endif /* _BCACHEFS_STR_HASH_H */ -- cgit v1.2.3 From be203120dc084c21fa8208d12fa394d50e843b62 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 22:30:19 -0500 Subject: bcachefs: bch2_check_key_has_snapshot() prints btree id Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 99f045518312..f65f7b191d31 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey_buf.h" +#include "btree_cache.h" #include "btree_key_cache.h" #include 
"btree_update.h" #include "buckets.h" @@ -1097,7 +1098,9 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), trans, bkey_in_missing_snapshot, "key in missing snapshot %s, delete?", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node) ?: 1; fsck_err: -- cgit v1.2.3 From 9f95fc3c12e05d3796fcb261a6b0594e51e63ba5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Dec 2024 01:31:43 -0500 Subject: bcachefs: bch2_snapshot_exists() bch2_snapshot_equiv() is going away; convert users that just wanted to know if the snapshot exists to something better Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/snapshot.c | 7 ++++--- fs/bcachefs/snapshot.h | 15 +++++++++++++++ fs/bcachefs/subvolume_types.h | 1 + 4 files changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 31b2aeb0c6e6..585214931e05 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -620,7 +620,7 @@ int bch2_data_update_init(struct btree_trans *trans, * and we have to check for this because we go rw before repairing the * snapshots table - just skip it, we can move it later. */ - if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) + if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) return -BCH_ERR_data_update_done; if (!bkey_get_dev_refs(c, k)) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index f65f7b191d31..ac664888847f 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -318,6 +318,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + t->live = true; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); t->children[1] = le32_to_cpu(s.v->children[1]); @@ -914,7 +915,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; - if (bch2_snapshot_equiv(c, id)) + if (bch2_snapshot_exists(c, id)) return 0; /* Do we need to reconstruct the snapshot_tree entry as well? */ @@ -1062,7 +1063,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) snapshot_id_list_to_text(&buf, t); darray_for_each(*t, id) { - if (fsck_err_on(!bch2_snapshot_equiv(c, *id), + if (fsck_err_on(!bch2_snapshot_exists(c, *id), trans, snapshot_node_missing, "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { @@ -1095,7 +1096,7 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), + if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), trans, bkey_in_missing_snapshot, "key in missing snapshot %s, delete?", (bch2_btree_id_to_text(&buf, iter->btree_id), diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index ae23d45fad66..3ff0ffa774f5 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -119,6 +119,21 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) return id; } +static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s = snapshot_t(c, id); + return s ? 
s->live : 0; +} + +static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + bool ret = __bch2_snapshot_exists(c, id); + rcu_read_unlock(); + + return ret; +} + static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) { const struct snapshot_t *s = snapshot_t(c, id); diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index f2ec4277c2a5..8a7f7e87c381 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -9,6 +9,7 @@ typedef DARRAY(u32) snapshot_id_list; #define IS_ANCESTOR_BITMAP 128 struct snapshot_t { + bool live; u32 parent; u32 skip[3]; u32 depth; -- cgit v1.2.3 From 3f57171d8ddd792a0367e05395a6266e5fc011e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Dec 2024 10:29:12 -0500 Subject: bcachefs: trace_write_buffer_maybe_flush Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- fs/bcachefs/btree_write_buffer.c | 8 ++++++++ fs/bcachefs/trace.h | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 702bf62d7fa7..0e3b7b5d626e 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -206,7 +206,7 @@ static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0); } -static int bch2_backpointers_maybe_flush(struct btree_trans *trans, +static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, struct bkey_s_c visiting_k, struct bkey_buf *last_flushed) { diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 49ce2d1e5c02..746db6d5a0fb 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -632,6 +632,14 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, bch2_bkey_buf_init(&tmp); if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { + if (trace_write_buffer_maybe_flush_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, referring_k); + trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); + } + bch2_bkey_buf_reassemble(&tmp, c, referring_k); if (bkey_is_btree_ptr(referring_k.k)) { diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 7baf66beee22..11e6547f91d6 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1436,6 +1436,24 @@ TRACE_EVENT(write_buffer_flush_slowpath, TP_printk("%zu/%zu", __entry->slowpath, __entry->total) ); +TRACE_EVENT(write_buffer_maybe_flush, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key), + TP_ARGS(trans, caller_ip, key), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __string(key, key ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __assign_str(key); + ), + + TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key)) +); + DEFINE_EVENT(fs_str, rebalance_extent, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) -- cgit v1.2.3 From 64833d396584676cdc9f841c056514bf5b8f5a4a Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 10 Dec 2024 11:12:07 -0700 Subject: bcachefs: Add empty statement between label and declaration in check_inode_hash_info_matches_root() Clang 18 and newer warns (or errors with CONFIG_WERROR=y): fs/bcachefs/str_hash.c:164:2: error: label followed by a declaration is a C23 
extension [-Werror,-Wc23-extensions] 164 | struct bch_inode_unpacked inode; | ^ In Clang 17 and prior, this is an unconditional hard error: fs/bcachefs/str_hash.c:164:2: error: expected expression 164 | struct bch_inode_unpacked inode; | ^ fs/bcachefs/str_hash.c:165:30: error: use of undeclared identifier 'inode' 165 | ret = bch2_inode_unpack(k, &inode); | ^ fs/bcachefs/str_hash.c:169:55: error: use of undeclared identifier 'inode' 169 | struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); | ^ fs/bcachefs/str_hash.c:171:40: error: use of undeclared identifier 'inode' 171 | ret = repair_inode_hash_info(trans, &inode); | ^ Add an empty statement between the label and the declaration to fix the warning/error without disturbing the code too much. Fixes: 2519d3b0d656 ("bcachefs: bch2_str_hash_check_key() now checks inode hash info") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412092339.QB7hffGC-lkp@intel.com/ Signed-off-by: Nathan Chancellor Signed-off-by: Kent Overstreet --- fs/bcachefs/str_hash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index ed3c852fc0be..f5977c5c6743 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -160,7 +160,7 @@ static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inu bch_err(c, "%s(): inum %llu not found", __func__, inum); ret = -BCH_ERR_fsck_repair_unimplemented; goto err; -found: +found:; struct bch_inode_unpacked inode; ret = bch2_inode_unpack(k, &inode); if (ret) -- cgit v1.2.3 From 00fa283a41fedc209ee7eda343828275b8303a61 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Dec 2024 13:23:47 -0500 Subject: bcachefs: Refactor c->opts.reconstruct_alloc Now handled in one place. 
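Concretely, the consolidation replaces five explicit bch2_shoot_down_journal_keys() calls with one filtered loop over the alive btrees. A condensed sketch of the resulting shape, using the helper introduced in the patch below (names taken from the patch, not invented):

static void kill_btree(struct bch_fs *c, enum btree_id btree)
{
	bch2_btree_id_root(c, btree)->alive = false;
	bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
}

/* in bch2_reconstruct_alloc(): */
for (unsigned i = 0; i < btree_id_nr_alive(c); i++)
	if (btree_id_is_alloc(i))
		kill_btree(c, i);

Any btree added to the alloc group later is then picked up automatically instead of needing another hand-written call.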
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a342744fd275..fbef6579d884 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -107,6 +107,12 @@ out: return ret; } +static void kill_btree(struct bch_fs *c, enum btree_id btree) +{ + bch2_btree_id_root(c, btree)->alive = false; + bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); +} + /* for -o reconstruct_alloc: */ static void bch2_reconstruct_alloc(struct bch_fs *c) { @@ -157,16 +163,9 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_freespace, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) + if (btree_id_is_alloc(i)) + kill_btree(c, i); } /* @@ -573,9 +572,6 @@ static int read_btree_roots(struct bch_fs *c) if (!r->alive) continue; - if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) - continue; - printbuf_reset(&buf); bch2_btree_id_level_to_text(&buf, i, r->level); @@ -863,15 +859,15 @@ use_clean: c->journal_replay_seq_start = last_seq; c->journal_replay_seq_end = blacklist_seq - 1; - if (c->opts.reconstruct_alloc) - bch2_reconstruct_alloc(c); - zero_out_btree_mem_ptr(&c->journal_keys); ret = journal_replay_early(c, clean); if (ret) goto err; + if (c->opts.reconstruct_alloc) + bch2_reconstruct_alloc(c); + /* * After an unclean shutdown, skip then next few journal sequence * numbers as they may have been referenced by btree writes that -- cgit v1.2.3 From 25a3123a67d98d4560ec9ee2286f4f7cffe4c22c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 21:10:27 -0500 Subject: bcachefs: check_indirect_extents can run online Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 94dc20ca2065..2b3ef3980fc3 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -43,7 +43,7 @@ x(fs_upgrade_for_subvolumes, 22, 0) \ x(check_inodes, 24, PASS_FSCK) \ x(check_extents, 25, PASS_FSCK) \ - x(check_indirect_extents, 26, PASS_FSCK) \ + x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ -- cgit v1.2.3 From 68eb4fdd8c1c729aa7fbb1eb8d7af6c39917e117 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 03:38:14 -0500 Subject: bcachefs: tidy up __bch2_btree_iter_peek() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 9c54891c737a..368ebcaf05fd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2260,7 +2260,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct 
btree_iter *iter, struct bp /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto out; + break; } struct btree_path *path = btree_iter_path(trans, iter); @@ -2270,7 +2270,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* No btree nodes at requested level: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; - goto out; + break; } btree_path_set_should_be_locked(trans, path); @@ -2281,10 +2281,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp k.k && (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { k = k2; - ret = bkey_err(k); - if (ret) { + if (bkey_err(k)) { bch2_btree_iter_set_pos(iter, iter->pos); - goto out; + break; } } @@ -2318,12 +2317,11 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* End of btree: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; - goto out; + break; } } -out: - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(iter); return k; } -- cgit v1.2.3 From c50341be4eb65d7f3b85ac740c0121e36fd69bb4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Dec 2024 06:02:24 -0500 Subject: bcachefs: tidy btree_trans_peek_journal() Change to match bch2_btree_trans_peek_updates() calling convention. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 368ebcaf05fd..51ebce9d5b5c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2144,21 +2144,18 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, } static noinline -struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +void btree_trans_peek_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = bch2_btree_journal_peek(trans, iter, - k.k ? k.k->p : path_l(path)->b->key.k.p); - + k->k ? k->k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; - k = bkey_i_to_s_c(next_journal); + *k = bkey_i_to_s_c(next_journal); } - - return k; } static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, @@ -2175,21 +2172,19 @@ static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, } static noinline -struct bkey_s_c btree_trans_peek_prev_journal(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +void btree_trans_peek_prev_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = bch2_btree_journal_peek_prev(trans, iter, - k.k ? k.k->p : path_l(path)->b->key.k.p); + k->k ? 
k->k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; - k = bkey_i_to_s_c(next_journal); + *k = bkey_i_to_s_c(next_journal); } - - return k; } /* @@ -2288,7 +2283,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp } if (unlikely(iter->flags & BTREE_ITER_with_journal)) - k = btree_trans_peek_journal(trans, iter, k); + btree_trans_peek_journal(trans, iter, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) @@ -2545,7 +2540,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru } if (unlikely(iter->flags & BTREE_ITER_with_journal)) - k = btree_trans_peek_prev_journal(trans, iter, k); + btree_trans_peek_prev_journal(trans, iter, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) -- cgit v1.2.3 From 7e320a4063a81508e171012e0f75ec4a111850d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 02:26:15 -0500 Subject: bcachefs: Fix btree_trans_peek_key_cache() BTREE_ITER_all_snapshots In BTREE_ITER_all_snapshots mode, we're required to only return keys where the snapshot field matches the iterator position - BTREE_ITER_filter_snapshots requires pulling keys into the key cache from ancestor snapshots, so we have to check for that. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 51ebce9d5b5c..e370fa327769 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2230,6 +2230,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); if (k.k && !bkey_err(k)) { + if ((iter->flags & BTREE_ITER_all_snapshots) && + !bpos_eq(pos, k.k->p)) + return bkey_s_c_null; + iter->k = u; k.k = &iter->k; } -- cgit v1.2.3 From b9a37144da8d5c1f900d7d7782bbd9842a40b806 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Dec 2024 05:29:27 -0500 Subject: bcachefs: Fix key cache + BTREE_ITER_all_snapshots Normally, whitouts (KEY_TYPE_whitout) are filtered from btree lookups, since they exist only to represent deletions of keys in ancestor snapshots - except, they should not be filtered in BTREE_ITER_all_snapshots mode, so that e.g. snapshot deletion can clean them up. This means that that the key cache has to store whiteouts, and key cache fills cannot filter them. 
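To make the rule above concrete, a minimal sketch of the filtering predicate, paraphrasing the bch2_btree_iter_peek_slot() hunk in the patch below (flag and key-type names are bcachefs internals):

/*
 * A whiteout is hidden from the caller (reported as a deleted key) only
 * for an ordinary snapshot-filtered lookup; key cache fills - and hence
 * BTREE_ITER_all_snapshots users of the key cache - must see it.
 */
static bool hide_whiteout(const struct bkey *k, unsigned iter_flags)
{
	return k->type == KEY_TYPE_whiteout &&
	       (iter_flags & BTREE_ITER_filter_snapshots) &&
	       !(iter_flags & BTREE_ITER_key_cache_fill);
}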
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e370fa327769..b27944b62087 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1854,7 +1854,7 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * !bkey_eq(path->pos, ck->key.pos)); *u = ck->k->k; - k = bkey_i_to_s_c(ck->k); + k = (struct bkey_s_c) { u, &ck->k->v }; } return k; @@ -2421,7 +2421,8 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en continue; } - if (bkey_whiteout(k.k)) { + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_key_cache_fill)) { search_key = bkey_successor(iter, k.k->p); continue; } @@ -2781,6 +2782,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); if (unlikely(!k.k)) goto out_no_locked; + + if (unlikely(k.k->type == KEY_TYPE_whiteout && + (iter->flags & BTREE_ITER_filter_snapshots) && + !(iter->flags & BTREE_ITER_key_cache_fill))) + iter->k.type = KEY_TYPE_deleted; } else { struct bpos next; struct bpos end = iter->pos; -- cgit v1.2.3 From f859bc945ebb1ed8e915cfd31bbe14ce3bb242e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 02:32:32 -0500 Subject: bcachefs: alloc_data_type_set() happens in alloc trigger Originally, we ran insert triggers before overwrite so that if an extent was being moved (by fallocate insert/collapse range), the bucket sector count wouldn't hit 0 partway through, and so we don't trigger state changes caused by that too soon. But this is better solved by just moving the data type change to the alloc trigger itself, where it's already called. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 1 - fs/bcachefs/buckets.c | 11 ++++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e5ba7d1429b9..5aa11ca08c94 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1134,7 +1134,6 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev return ret; a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; - alloc_data_type_set(&a_mut->v, a_mut->v.data_type); return bch2_trans_update(trans, iter, &a_mut->k_i, 0); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index eb2ed4edbbbc..bbd37b1ed5d2 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -541,7 +541,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, const struct extent_ptr_decoded *p, s64 sectors, enum bch_data_type ptr_data_type, - struct bch_alloc_v4 *a) + struct bch_alloc_v4 *a, + bool insert) { u32 *dst_sectors = p->has_ec ? &a->stripe_sectors : !p->ptr.cached ? 
&a->dirty_sectors : @@ -551,8 +552,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, if (ret) return ret; - - alloc_data_type_set(a, ptr_data_type); + if (insert) + alloc_data_type_set(a, ptr_data_type); return 0; } @@ -585,7 +586,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: - __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v); + __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert); if (ret) goto err; @@ -607,7 +608,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new); + ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); alloc_to_bucket(g, new); bucket_unlock(g); -- cgit v1.2.3 From 92e31d425179c0f5b14d27ad1ad4a7b716c8db7e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 02:27:52 -0500 Subject: bcachefs: Don't run overwrite triggers before insert This breaks when the trigger is inserting updates for the same btree, as the inode trigger now does. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 81 ++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 9011cc3f7190..c3a3bfd11e8c 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -479,8 +479,7 @@ static int run_one_mem_trigger(struct btree_trans *trans, old, flags); } -static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, - bool overwrite) +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i) { verify_update_old_key(trans, i); @@ -507,10 +506,10 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), BTREE_TRIGGER_insert| BTREE_TRIGGER_overwrite|flags) ?: 1; - } else if (overwrite && !i->overwrite_trigger_run) { + } else if (!i->overwrite_trigger_run) { i->overwrite_trigger_run = true; return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; - } else if (!overwrite && !i->insert_trigger_run) { + } else if (!i->insert_trigger_run) { i->insert_trigger_run = true; return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; } else { @@ -519,39 +518,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - unsigned btree_id_start) + unsigned *btree_id_updates_start) { - for (int overwrite = 1; overwrite >= 0; --overwrite) { - bool trans_trigger_run; + bool trans_trigger_run; - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; - - for (unsigned i = btree_id_start; - i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; - i++) { - if (trans->updates[i].btree_id != btree_id) - continue; + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; - int ret = run_one_trans_trigger(trans, trans->updates + i, 
overwrite); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; + for (unsigned i = *btree_id_updates_start; + i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; + i++) { + if (trans->updates[i].btree_id < btree_id) { + *btree_id_updates_start = i; + continue; } - } while (trans_trigger_run); - } + + int ret = run_one_trans_trigger(trans, trans->updates + i); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); + + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && + i->btree_id == btree_id && + btree_node_type_has_trans_triggers(i->bkey_type) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); return 0; } static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - unsigned btree_id = 0, btree_id_start = 0; + unsigned btree_id = 0, btree_id_updates_start = 0; int ret = 0; /* @@ -565,27 +570,15 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) if (btree_id == BTREE_ID_alloc) continue; - while (btree_id_start < trans->nr_updates && - trans->updates[btree_id_start].btree_id < btree_id) - btree_id_start++; - - ret = run_btree_triggers(trans, btree_id, btree_id_start); + ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start); if (ret) return ret; } - for (unsigned idx = 0; idx < trans->nr_updates; idx++) { - struct btree_insert_entry *i = trans->updates + idx; - - if (i->btree_id > BTREE_ID_alloc) - break; - if (i->btree_id == BTREE_ID_alloc) { - ret = run_btree_triggers(trans, BTREE_ID_alloc, idx); - if (ret) - return ret; - break; - } - } + btree_id_updates_start = 0; + ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start); + if (ret) + return ret; #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) -- cgit v1.2.3 From 85c060f62da4d039952895177f0c58b0167e320b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 02:41:37 -0500 Subject: bcachefs: Kill equiv_seen arg to delete_dead_snapshots_process_key() When deleting dead snapshots, we move keys from redundant interior snapshot nodes to child nodes - unless there's already a key, in which case the ancestor key is deleted. Previously, we tracked via equiv_seen whether the child snapshot had a key, but this was tricky w.r.t. transaction restarts, and not transactionally safe w.r.t. updates in the child snapshot. 
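The transactionally safe replacement described above boils down to probing the destination position directly instead of remembering which children have been seen. A condensed sketch of the new logic, mirroring the hunk in the patch below (all helpers are existing bcachefs functions):

/*
 * Move the key to the surviving child snapshot only if nothing is there
 * yet; in either case delete the copy in the snapshot being removed.
 */
struct btree_iter dst_iter;
struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, iter->btree_id, new->k.p,
					   BTREE_ITER_all_snapshots|BTREE_ITER_intent);
int ret = bkey_err(dst_k) ?:
	(bkey_deleted(dst_k.k)
	 ? bch2_trans_update(trans, &dst_iter, new, BTREE_UPDATE_internal_snapshot_node)
	 : 0) ?:
	bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &dst_iter);

Because the check and the update happen in the same btree transaction, a restart simply reruns the whole decision, which is what the old equiv_seen bookkeeping could not guarantee.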
Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 51 +++++++++++++++++--------------------------------- 1 file changed, 17 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index ac664888847f..ca7e4e975a60 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1421,9 +1421,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, static int delete_dead_snapshots_process_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - snapshot_id_list *deleted, - snapshot_id_list *equiv_seen, - struct bpos *last_pos) + snapshot_id_list *deleted) { int ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret) @@ -1434,24 +1432,10 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ return 0; - if (!bkey_eq(k.k->p, *last_pos)) - equiv_seen->nr = 0; - if (snapshot_list_has_id(deleted, k.k->p.snapshot)) return bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - if (!bpos_eq(*last_pos, k.k->p) && - snapshot_list_has_id(equiv_seen, equiv)) - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - - *last_pos = k.k->p; - - ret = snapshot_list_add_nodup(c, equiv_seen, equiv); - if (ret) - return ret; - /* * When we have a linear chain of snapshot nodes, we consider * those to form an equivalence class: we're going to collapse @@ -1473,20 +1457,23 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, new->k.p.snapshot = equiv; - struct btree_iter new_iter; - bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, - BTREE_ITER_all_snapshots| - BTREE_ITER_cached| - BTREE_ITER_intent); - - ret = bch2_btree_iter_traverse(&new_iter) ?: - bch2_trans_update(trans, &new_iter, new, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &new_iter); + struct btree_iter dst_iter; + struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, + iter->btree_id, new->k.p, + BTREE_ITER_all_snapshots| + BTREE_ITER_intent); + ret = bkey_err(dst_k); if (ret) return ret; + + ret = (bkey_deleted(dst_k.k) + ? 
bch2_trans_update(trans, &dst_iter, new, + BTREE_UPDATE_internal_snapshot_node) + : 0) ?: + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); + bch2_trans_iter_exit(trans, &dst_iter); + return ret; } return 0; @@ -1648,8 +1635,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { - struct bpos last_pos = POS_MIN; - snapshot_id_list equiv_seen = { 0 }; struct disk_reservation res = { 0 }; if (!btree_type_has_snapshots(btree)) @@ -1659,11 +1644,9 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) btree, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, - delete_dead_snapshots_process_key(trans, &iter, k, &deleted, - &equiv_seen, &last_pos)); + delete_dead_snapshots_process_key(trans, &iter, k, &deleted)); bch2_disk_reservation_put(c, &res); - darray_exit(&equiv_seen); bch_err_msg(c, ret, "deleting keys from dying snapshots"); if (ret) -- cgit v1.2.3 From 35c5609abf512302e4f3119d08fd1729e392d339 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 03:03:58 -0500 Subject: bcachefs: Snapshot deletion no longer uses snapshot_t->equiv Switch to generating a private list of interior nodes to delete, instead of using the equivalence class in the global data structure. This eliminates possible races with snapshot creation, and is much cleaner - it'll let us delete a lot of janky code for calculating and maintaining the equivalence classes. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 268 ++++++++++++++++++++++++------------------------- 1 file changed, 133 insertions(+), 135 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index ca7e4e975a60..0d60251946f1 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1418,44 +1418,74 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -static int delete_dead_snapshots_process_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - snapshot_id_list *deleted) +struct snapshot_interior_delete { + u32 id; + u32 live_child; +}; +typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; + +static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) { - int ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret) - return ret < 0 ? ret : 0; + darray_for_each(*l, i) + if (i->id == id) + return i->live_child; + return 0; +} - struct bch_fs *c = trans->c; - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); - if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ +static unsigned __live_child(struct snapshot_table *t, u32 id, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) +{ + struct snapshot_t *s = __snapshot_t(t, id); + if (!s) return 0; - if (snapshot_list_has_id(deleted, k.k->p.snapshot)) + for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) + if (s->children[i] && + !snapshot_list_has_id(delete_leaves, s->children[i]) && + !interior_delete_has_id(delete_interior, s->children[i])) + return s->children[i]; + + for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) { + u32 live_child = s->children[i] + ? 
__live_child(t, s->children[i], delete_leaves, delete_interior) + : 0; + if (live_child) + return live_child; + } + + return 0; +} + +static unsigned live_child(struct bch_fs *c, u32 id, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) +{ + rcu_read_lock(); + u32 ret = __live_child(rcu_dereference(c->snapshots), id, + delete_leaves, delete_interior); + rcu_read_unlock(); + return ret; +} + +static int delete_dead_snapshots_process_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) +{ + if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) return bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - /* - * When we have a linear chain of snapshot nodes, we consider - * those to form an equivalence class: we're going to collapse - * them all down to a single node, and keep the leaf-most node - - * which has the same id as the equivalence class id. - * - * If there are multiple keys in different snapshots at the same - * position, we're only going to keep the one in the newest - * snapshot (we delete the others above) - the rest have been - * overwritten and are redundant, and for the key we're going to keep we - * need to move it to the equivalance class ID if it's not there - * already. - */ - if (equiv != k.k->p.snapshot) { + u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); + if (live_child) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; - new->k.p.snapshot = equiv; + new->k.p.snapshot = live_child; struct btree_iter dst_iter; struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, @@ -1479,55 +1509,62 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, return 0; } -static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k) +/* + * For a given snapshot, if it doesn't have a subvolume that points to it, and + * it doesn't have child snapshot nodes - it's now redundant and we can mark it + * as deleted. + */ +static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) { - struct bkey_s_c_snapshot snap; - u32 children[2]; - int ret; - if (k.k->type != KEY_TYPE_snapshot) return 0; - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || - BCH_SNAPSHOT_SUBVOL(snap.v)) + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + unsigned live_children = 0; + + if (BCH_SNAPSHOT_SUBVOL(s.v)) return 0; - children[0] = le32_to_cpu(snap.v->children[0]); - children[1] = le32_to_cpu(snap.v->children[1]); + for (unsigned i = 0; i < 2; i++) { + u32 child = le32_to_cpu(s.v->children[i]); - ret = bch2_snapshot_live(trans, children[0]) ?: - bch2_snapshot_live(trans, children[1]); - if (ret < 0) - return ret; - return !ret; -} + live_children += child && + !snapshot_list_has_id(delete_leaves, child); + } -/* - * For a given snapshot, if it doesn't have a subvolume that points to it, and - * it doesn't have child snapshot nodes - it's now redundant and we can mark it - * as deleted. 
- */ -static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k) -{ - int ret = bch2_snapshot_needs_delete(trans, k); + if (live_children == 0) { + return snapshot_list_add(c, delete_leaves, s.k->p.offset); + } else if (live_children == 1) { + struct snapshot_interior_delete d = { + .id = s.k->p.offset, + .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), + }; + + if (!d.live_child) { + bch_err(c, "error finding live child of snapshot %u", d.id); + return -EINVAL; + } - return ret <= 0 - ? ret - : bch2_snapshot_node_set_deleted(trans, k.k->p.offset); + return darray_push(delete_interior, d); + } else { + return 0; + } } static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, - snapshot_id_list *skip) + interior_delete_list *skip) { rcu_read_lock(); - while (snapshot_list_has_id(skip, id)) + while (interior_delete_has_id(skip, id)) id = __bch2_snapshot_parent(c, id); while (n--) { do { id = __bch2_snapshot_parent(c, id); - } while (snapshot_list_has_id(skip, id)); + } while (interior_delete_has_id(skip, id)); } rcu_read_unlock(); @@ -1536,7 +1573,7 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - snapshot_id_list *deleted) + interior_delete_list *deleted) { struct bch_fs *c = trans->c; u32 nr_deleted_ancestors = 0; @@ -1546,7 +1583,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, if (k.k->type != KEY_TYPE_snapshot) return 0; - if (snapshot_list_has_id(deleted, k.k->p.offset)) + if (interior_delete_has_id(deleted, k.k->p.offset)) return 0; s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); @@ -1555,7 +1592,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, return ret; darray_for_each(*deleted, i) - nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); + nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id); if (!nr_deleted_ancestors) return 0; @@ -1573,7 +1610,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { u32 id = le32_to_cpu(s->v.skip[j]); - if (snapshot_list_has_id(deleted, id)) { + if (interior_delete_has_id(deleted, id)) { id = bch2_snapshot_nth_parent_skip(c, parent, depth > 1 @@ -1592,46 +1629,25 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, int bch2_delete_dead_snapshots(struct bch_fs *c) { - struct btree_trans *trans; - snapshot_id_list deleted = { 0 }; - snapshot_id_list deleted_interior = { 0 }; - int ret = 0; - if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - trans = bch2_trans_get(c); + struct btree_trans *trans = bch2_trans_get(c); + snapshot_id_list delete_leaves = {}; + interior_delete_list delete_interior = {}; + int ret = 0; /* * For every snapshot node: If we have no live children and it's not * pointed to by a subvolume, delete it: */ - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - NULL, NULL, 0, - bch2_delete_redundant_snapshot(trans, k)); - bch_err_msg(c, ret, "deleting redundant snapshots"); - if (ret) - goto err; - - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_snapshot_set_equiv(trans, k)); - bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, 
k, + check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); + bch_err_msg(c, ret, "walking snapshots"); if (ret) goto err; - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ({ - if (k.k->type != KEY_TYPE_snapshot) - continue; - - BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v) - ? snapshot_list_add(c, &deleted, k.k->p.offset) - : 0; - })); - bch_err_msg(c, ret, "walking snapshots"); - if (ret) + if (!delete_leaves.nr && !delete_interior.nr) goto err; for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { @@ -1644,7 +1660,9 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) btree, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, - delete_dead_snapshots_process_key(trans, &iter, k, &deleted)); + delete_dead_snapshots_process_key(trans, &iter, k, + &delete_leaves, + &delete_interior)); bch2_disk_reservation_put(c, &res); @@ -1653,22 +1671,13 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } - bch2_trans_unlock(trans); - down_write(&c->snapshot_create_lock); - - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ({ - u32 snapshot = k.k->p.offset; - u32 equiv = bch2_snapshot_equiv(c, snapshot); - - equiv != snapshot - ? snapshot_list_add(c, &deleted_interior, snapshot) - : 0; - })); - - bch_err_msg(c, ret, "walking snapshots"); - if (ret) - goto err_create_lock; + darray_for_each(delete_leaves, i) { + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); + bch_err_msg(c, ret, "deleting snapshot %u", *i); + if (ret) + goto err; + } /* * Fixing children of deleted snapshots can't be done completely @@ -1678,30 +1687,20 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior)); if (ret) - goto err_create_lock; - - darray_for_each(deleted, i) { - ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - bch_err_msg(c, ret, "deleting snapshot %u", *i); - if (ret) - goto err_create_lock; - } + goto err; - darray_for_each(deleted_interior, i) { + darray_for_each(delete_interior, i) { ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - bch_err_msg(c, ret, "deleting snapshot %u", *i); + bch2_snapshot_node_delete(trans, i->id)); + bch_err_msg(c, ret, "deleting snapshot %u", i->id); if (ret) - goto err_create_lock; + goto err; } -err_create_lock: - up_write(&c->snapshot_create_lock); err: - darray_exit(&deleted_interior); - darray_exit(&deleted); + darray_exit(&delete_interior); + darray_exit(&delete_leaves); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; @@ -1754,24 +1753,23 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return ret; } -static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) +static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap) { - struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot snap; - int ret = 0; + /* If there's one child, it's redundant and keys will be moved to the child */ + return !!snap.v->children[0] + !!snap.v->children[1] == 1; +} +static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) +{ if (k.k->type != KEY_TYPE_snapshot) return 0; - snap = 
bkey_s_c_to_snapshot(k); + struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); if (BCH_SNAPSHOT_DELETED(snap.v) || - bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || - (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { - set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - return 0; - } + interior_snapshot_needs_delete(snap)) + set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); - return ret; + return 0; } int bch2_snapshots_read(struct bch_fs *c) -- cgit v1.2.3 From ef4144ac2dec35d47de666f35cd873eb1be4172e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 19 Dec 2024 18:01:32 +0100 Subject: pidfs: allow bind-mounts Allow bind-mounting pidfds. Similar to nsfs let's allow bind-mounts for pidfds. This allows pidfds to be safely recovered and checked for process recycling. Link: https://lore.kernel.org/r/20241219-work-pidfs-mount-v1-1-dbc56198b839@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 10 ++++++++-- fs/pidfs.c | 2 +- include/linux/pidfs.h | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 23e81c2a1e3f..7baffa2ea582 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include "pnode.h" @@ -2732,8 +2733,13 @@ static struct mount *__do_loopback(struct path *old_path, int recurse) if (IS_MNT_UNBINDABLE(old)) return mnt; - if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations) - return mnt; + if (!check_mnt(old)) { + const struct dentry_operations *d_op = old_path->dentry->d_op; + + if (d_op != &ns_dentry_operations && + d_op != &pidfs_dentry_operations) + return mnt; + } if (!recurse && has_locked_children(old, old_path->dentry)) return mnt; diff --git a/fs/pidfs.c b/fs/pidfs.c index c5a51c69acc8..049352f973de 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -510,7 +510,7 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); } -static const struct dentry_operations pidfs_dentry_operations = { +const struct dentry_operations pidfs_dentry_operations = { .d_delete = always_delete_dentry, .d_dname = pidfs_dname, .d_prune = stashed_dentry_prune, diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index df574d6708d4..7c830d0dec9a 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -6,5 +6,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); +extern const struct dentry_operations pidfs_dentry_operations; #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From ea382199071931d19aac5f688b543e07360e2b64 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Nov 2024 12:20:34 +0100 Subject: vfs: support caching symlink lengths in inodes When utilized it dodges strlen() in vfs_readlink(), giving about 1.5% speed up when issuing readlink on /initrd.img on ext4. Filesystems opt in by calling inode_set_cached_link() when creating an inode. The size is stored in a new union utilizing the same space as i_devices, thus avoiding growing the struct or taking up any more space. Churn-wise the current readlink_copy() helper is patched to accept the size instead of calculating it. 
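For illustration, a filesystem whose symlink target already lives in the in-core inode could opt in along these lines (a minimal sketch; only inode_set_cached_link() comes from this patch, the foo_*/fi->i_data names are placeholders):

	/* fast symlink: target is NUL-terminated and stored in the in-core inode */
	struct foo_inode_info *fi = FOO_I(inode);

	inode->i_op = &foo_fast_symlink_inode_operations;
	inode_set_cached_link(inode, (char *)fi->i_data, inode->i_size);

With IOP_CACHED_LINK set, vfs_readlink() copies i_linklen bytes straight from inode->i_link instead of calling strlen() on every readlink(); the ext4 conversion in the follow-up patch does exactly this.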
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241120112037.822078-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/namei.c | 34 +++++++++++++++++++--------------- fs/proc/namespaces.c | 2 +- include/linux/fs.h | 15 +++++++++++++-- security/apparmor/apparmorfs.c | 2 +- 4 files changed, 34 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 9d30c7aa9aa6..e56c29a22d26 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5272,19 +5272,16 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna getname(newname), 0); } -int readlink_copy(char __user *buffer, int buflen, const char *link) +int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen) { - int len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; + int copylen; - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; -out: - return len; + copylen = linklen; + if (unlikely(copylen > (unsigned) buflen)) + copylen = buflen; + if (copy_to_user(buffer, link, copylen)) + copylen = -EFAULT; + return copylen; } /** @@ -5304,6 +5301,9 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) const char *link; int res; + if (inode->i_opflags & IOP_CACHED_LINK) + return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen); + if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); @@ -5322,7 +5322,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) if (IS_ERR(link)) return PTR_ERR(link); } - res = readlink_copy(buffer, buflen, link); + res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } @@ -5391,10 +5391,14 @@ EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { + const char *link; + int res; + DEFINE_DELAYED_CALL(done); - int res = readlink_copy(buffer, buflen, - page_get_link(dentry, d_inode(dentry), - &done)); + link = page_get_link(dentry, d_inode(dentry), &done); + res = PTR_ERR(link); + if (!IS_ERR(link)) + res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 8e159fc78c0a..c610224faf10 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) - res = readlink_copy(buffer, buflen, name); + res = readlink_copy(buffer, buflen, name, strlen(name)); } put_task_struct(task); return res; diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..2cc98de5af43 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -626,6 +626,7 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 #define IOP_MGTIME 0x0020 +#define IOP_CACHED_LINK 0x0040 /* * Keep mostly read-only and often accessed (especially for @@ -723,7 +724,10 @@ struct inode { }; struct file_lock_context *i_flctx; struct address_space i_data; - struct list_head i_devices; + union { + struct list_head i_devices; + int i_linklen; + }; union { struct pipe_inode_info *i_pipe; struct cdev *i_cdev; @@ -749,6 +753,13 @@ struct inode { void *i_private; /* fs or device private pointer */ } 
__randomize_layout; +static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) +{ + inode->i_link = link; + inode->i_linklen = linklen; + inode->i_opflags |= IOP_CACHED_LINK; +} + /* * Get bit address from inode->i_state to use with wait_var_event() * infrastructre. @@ -3351,7 +3362,7 @@ extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -extern int readlink_copy(char __user *, int, const char *); +extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 2c0185ebc900..c07d150685d7 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -2612,7 +2612,7 @@ static int policy_readlink(struct dentry *dentry, char __user *buffer, res = snprintf(name, sizeof(name), "%s:[%lu]", AAFS_NAME, d_inode(dentry)->i_ino); if (res > 0 && res < sizeof(name)) - res = readlink_copy(buffer, buflen, name); + res = readlink_copy(buffer, buflen, name, strlen(name)); else res = -ENOENT; -- cgit v1.2.3 From bae80473f7b0b25772619e7692019b1549d4a82c Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Nov 2024 12:20:35 +0100 Subject: ext4: use inode_set_cached_link() Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241120112037.822078-3-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/ext4/inode.c | 3 ++- fs/ext4/namei.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 89aade6f45f6..7c54ae5fcbd4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5006,10 +5006,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { - inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); + inode_set_cached_link(inode, (char *)ei->i_data, + inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index bcf2737078b8..536d56d15072 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3418,7 +3418,6 @@ retry: inode->i_op = &ext4_symlink_inode_operations; } else { inode->i_op = &ext4_fast_symlink_inode_operations; - inode->i_link = (char *)&EXT4_I(inode)->i_data; } } @@ -3434,6 +3433,9 @@ retry: disk_link.len); inode->i_size = disk_link.len - 1; EXT4_I(inode)->i_disksize = inode->i_size; + if (!IS_ENCRYPTED(inode)) + inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data, + inode->i_size); } err = ext4_add_nondir(handle, dentry, &inode); if (handle) -- cgit v1.2.3 From d727935cad9f6f52c8d184968f9720fdc966c669 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Sun, 24 Nov 2024 11:46:36 +0800 Subject: fs: fix proc_handler for sysctl_nr_open Use proc_douintvec_minmax() instead of proc_dointvec_minmax() to handle sysctl_nr_open, because its data type is unsigned int, not int. 
Fixes: 9b80a184eaad ("fs/file: more unsigned file descriptors") Signed-off-by: Jinliang Zheng Link: https://lore.kernel.org/r/20241124034636.325337-1-alexjlzheng@tencent.com Signed-off-by: Christian Brauner --- fs/file_table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index 976736be47cb..502b81f614d9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -128,7 +128,7 @@ static struct ctl_table fs_stat_sysctls[] = { .data = &sysctl_nr_open, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_douintvec_minmax, .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, -- cgit v1.2.3 From 9b7da575f85962c44abe7dc245b0a58179ad2c45 Mon Sep 17 00:00:00 2001 From: shao mingyin Date: Wed, 23 Oct 2024 13:58:50 +0800 Subject: file: flush delayed work in delayed fput() The fput() of file rcS might not have completed causing issues when executing the file. rcS is opened in do_populate_rootfs before executed. At the end of do_populate_rootfs() flush_delayed_fput() is called. Now do_populate_rootfs() assumes that all fput()s caused by do_populate_rootfs() have completed. But flush_delayed_fput() can only ensure that fput() on the current delayed_fput_list has finished. Any file that has been removed from delayed_fput_list asynchronously in the meantime might not have completed causing the exec to fail. do_populate_rootfs delayed_fput_list delayed_fput execve fput() a fput() a->b fput() a->b->rcS __fput(a) fput() c fput() c->d __fput(b) flush_delayed_fput __fput(c) __fput(d) __fput(b) __fput(b) execve(rcS) Ensure that all delayed work is done by calling flush_delayed_work() in flush_delayed_fput() explicitly. Signed-off-by: Chen Lin Signed-off-by: Shao Mingyin Link: https://lore.kernel.org/r/20241023135850067m3w2R0UXESiVCYz_wdAoT@zte.com.cn Cc: Yang Yang Cc: Yang Tao Cc: Xu Xin [brauner: rewrite commit message] Signed-off-by: Christian Brauner --- fs/file_table.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index 502b81f614d9..a32171d2b83f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -478,6 +478,8 @@ static void ____fput(struct callback_head *work) __fput(container_of(work, struct file, f_task_work)); } +static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); + /* * If kernel thread really needs to have the final fput() it has done * to complete, call this. The only user right now is the boot - we @@ -491,11 +493,10 @@ static void ____fput(struct callback_head *work) void flush_delayed_fput(void) { delayed_fput(NULL); + flush_delayed_work(&delayed_fput_work); } EXPORT_SYMBOL_GPL(flush_delayed_fput); -static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); - void fput(struct file *file) { if (file_ref_put(&file->f_ref)) { -- cgit v1.2.3 From 4db9f52fa9b81addc412330957bb7a657d2f1ffb Mon Sep 17 00:00:00 2001 From: Guo Weikang Date: Mon, 2 Dec 2024 16:11:45 +0800 Subject: fs: fc_log replace magic number 7 with ARRAY_SIZE() Replace the hardcoded value `7` in `put_fc_log()` with `ARRAY_SIZE`. This improves maintainability by ensuring the loop adapts to changes in the buffer size. 
Signed-off-by: Guo Weikang Link: https://lore.kernel.org/r/20241202081146.1031780-1-guoweikang.kernel@gmail.com Signed-off-by: Christian Brauner --- fs/fs_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs_context.c b/fs/fs_context.c index 98589aae5208..582d33e81117 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -493,7 +493,7 @@ static void put_fc_log(struct fs_context *fc) if (log) { if (refcount_dec_and_test(&log->usage)) { fc->log.log = NULL; - for (i = 0; i <= 7; i++) + for (i = 0; i < ARRAY_SIZE(log->buffer) ; i++) if (log->need_free & (1 << i)) kfree(log->buffer[i]); kfree(log); -- cgit v1.2.3 From 175c6a216dda4c88f7050b67e75a6cf331086c75 Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Wed, 4 Dec 2024 00:12:18 -0800 Subject: fs: Fix grammar and spelling in propagate_umount() Fix grammar and spelling in the propagate_umount() function. Signed-off-by: Zhu Jun Link: https://lore.kernel.org/r/20241204081218.12141-1-zhujun2@cmss.chinamobile.com Signed-off-by: Christian Brauner --- fs/pnode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/pnode.c b/fs/pnode.c index a799e0315cc9..ef048f008bdd 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -611,10 +611,10 @@ int propagate_umount(struct list_head *list) continue; } else if (child->mnt.mnt_flags & MNT_UMOUNT) { /* - * We have come accross an partially unmounted - * mount in list that has not been visited yet. - * Remember it has been visited and continue - * about our merry way. + * We have come across a partially unmounted + * mount in a list that has not been visited + * yet. Remember it has been visited and + * continue about our merry way. */ list_add_tail(&child->mnt_umounting, &visited); continue; -- cgit v1.2.3 From ec052fae814d467d6aa7e591b4b24531b87e65ec Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 5 Dec 2024 16:47:43 +0100 Subject: fs: sort out a stale comment about races between fd alloc and dup2 It claims the issue is only relevant for shared descriptor tables which is of no concern for POSIX (but then is POSIX of concern to anyone today?), which I presume predates standarized threading. The comment also mentions the following systems: - OpenBSD installing a larval file -- they moved away from it, file is installed late and EBUSY is returned on conflict - FreeBSD returning EBADF -- reworked to install the file early like OpenBSD used to do - NetBSD "deadlocks in amusing ways" -- their solution looks Solaris-inspired (not a compliment) and I would not be particularly surprised if it indeed deadlocked, in amusing ways or otherwise I don't believe mentioning any of these adds anything and the statement about the issue not being POSIX-relevant is outdated. dup2 description in POSIX still does not mention the problem. Just shorten the comment and be done with it. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241205154743.1586584-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/file.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 019fb9acf91b..d498715ef415 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1230,17 +1230,9 @@ __releases(&files->file_lock) /* * We need to detect attempts to do dup2() over allocated but still - * not finished descriptor. 
NB: OpenBSD avoids that at the price of - * extra work in their equivalent of fget() - they insert struct - * file immediately after grabbing descriptor, mark it larval if - * more work (e.g. actual opening) is needed and make sure that - * fget() treats larval files as absent. Potentially interesting, - * but while extra work in fget() is trivial, locking implications - * and amount of surgery on open()-related paths in VFS are not. - * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" - * deadlocks in rather amusing ways, AFAICS. All of that is out of - * scope of POSIX or SUS, since neither considers shared descriptor - * tables and this condition does not arise without those. + * not finished descriptor. + * + * POSIX is silent on the issue, we return -EBUSY. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); -- cgit v1.2.3 From b8ea3b1ff544b47c1d64a22860f33b755638164e Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Fri, 13 Dec 2024 22:50:21 +0530 Subject: smb: enable reuse of deferred file handles for write operations Previously, deferred file handles were reused only for read operations, this commit extends to reusing deferred handles for write operations. By reusing these handles we can reduce the need for open/close operations over the wire. Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index a58a3333ecc3..3b2d33291a7e 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -990,7 +990,11 @@ int cifs_open(struct inode *inode, struct file *file) } /* Get the cached handle as SMB2 close is deferred */ - rc = cifs_get_readable_path(tcon, full_path, &cfile); + if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) { + rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile); + } else { + rc = cifs_get_readable_path(tcon, full_path, &cfile); + } if (rc == 0) { if (file->f_flags == cfile->f_flags) { file->private_data = cfile; -- cgit v1.2.3 From f17224c2a7bdc11a17c96d9d8cb2d829f54d40bb Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 20 Dec 2024 21:59:37 +0000 Subject: cifs: Remove unused is_server_using_iface() The last use of is_server_using_iface() was removed in 2022 by commit aa45dadd34e4 ("cifs: change iface_list from array to sorted linked list") Remove it. Signed-off-by: Dr. 
David Alan Gilbert Signed-off-by: Steve French --- fs/smb/client/cifsproto.h | 2 -- fs/smb/client/sess.c | 25 ------------------------- 2 files changed, 27 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 754417cb3294..d26f9bbb5382 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -614,8 +614,6 @@ int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); int cifs_try_adding_channels(struct cifs_ses *ses); -bool is_server_using_iface(struct TCP_Server_Info *server, - struct cifs_server_iface *iface); bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface); void cifs_ses_mark_for_reconnect(struct cifs_ses *ses); diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 3306fb655136..91d4d409cb1d 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -27,31 +27,6 @@ static int cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface); -bool -is_server_using_iface(struct TCP_Server_Info *server, - struct cifs_server_iface *iface) -{ - struct sockaddr_in *i4 = (struct sockaddr_in *)&iface->sockaddr; - struct sockaddr_in6 *i6 = (struct sockaddr_in6 *)&iface->sockaddr; - struct sockaddr_in *s4 = (struct sockaddr_in *)&server->dstaddr; - struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)&server->dstaddr; - - if (server->dstaddr.ss_family != iface->sockaddr.ss_family) - return false; - if (server->dstaddr.ss_family == AF_INET) { - if (s4->sin_addr.s_addr != i4->sin_addr.s_addr) - return false; - } else if (server->dstaddr.ss_family == AF_INET6) { - if (memcmp(&s6->sin6_addr, &i6->sin6_addr, - sizeof(i6->sin6_addr)) != 0) - return false; - } else { - /* unknown family.. */ - return false; - } - return true; -} - bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) { int i; -- cgit v1.2.3 From 19206d3f5ef7f051056d2fb49203a347e4844e6e Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 2 Dec 2024 11:19:57 +0000 Subject: block: Delete bio_set_prio() Since commit 43b62ce3ff0a ("block: move bio io prio to a new field"), macro bio_set_prio() does nothing but set bio->bi_ioprio. All other places just set bio->bi_ioprio directly, so replace bio_set_prio() remaining callsites with setting bio->bi_ioprio directly and delete that macro. Signed-off-by: John Garry Acked-by: Jack Wang Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20241202111957.2311683-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 2 +- drivers/md/bcache/movinggc.c | 2 +- drivers/md/bcache/writeback.c | 2 +- fs/bcachefs/move.c | 6 +++--- include/linux/bio.h | 2 -- 5 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 08ce6d96d04c..2ee6e9bd4e28 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -167,7 +167,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR || usrlen < sizeof(*msg) ? 
0 : le16_to_cpu(msg->prio); - bio_set_prio(bio, prio); + bio->bi_ioprio = prio; submit_bio(bio); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index ef6abf33f926..45ca134cbf02 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -82,7 +82,7 @@ static void moving_init(struct moving_io *io) bio_init(bio, NULL, bio->bi_inline_vecs, DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); bio_get(bio); - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9; bio->bi_private = &io->cl; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index c1d28e365910..453efbbdc8ee 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -334,7 +334,7 @@ static void dirty_init(struct keybuf_key *w) bio_init(bio, NULL, bio->bi_inline_vecs, DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; bio->bi_private = w; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 0ef4a86850bb..67fb651f4af4 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -292,8 +292,8 @@ int bch2_move_extent(struct moving_context *ctxt, io->write_sectors = k.k->size; bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); - bio_set_prio(&io->write.op.wbio.bio, - IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + io->write.op.wbio.bio.bi_ioprio = + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, GFP_KERNEL)) @@ -303,7 +303,7 @@ int bch2_move_extent(struct moving_context *ctxt, io->rbio.opts = io_opts; bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); io->rbio.bio.bi_vcnt = pages; - bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); io->rbio.bio.bi_iter.bi_size = sectors << 9; io->rbio.bio.bi_opf = REQ_OP_READ; diff --git a/include/linux/bio.h b/include/linux/bio.h index 99676916f3db..1eec59699100 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -19,8 +19,6 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs) return min(nr_segs, BIO_MAX_VECS); } -#define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) - #define bio_iter_iovec(bio, iter) \ bvec_iter_bvec((bio)->bi_io_vec, (iter)) -- cgit v1.2.3 From d29662695ed7c015521e5fc9387df25aab192a2e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Nov 2024 18:16:49 +0100 Subject: btrfs: fix use-after-free waiting for encoded read endios Fix a use-after-free in the I/O completion path for encoded reads by using a completion instead of a wait_queue for synchronizing the destruction of 'struct btrfs_encoded_read_private'. 
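In outline, the synchronization after this change looks as follows (a condensed sketch of the hunks below; the io_uring path and error handling are omitted):

	struct btrfs_encoded_read_private {
		struct completion done;
		refcount_t pending_refs;
		blk_status_t status;
	};

	/* submitter */
	init_completion(&priv->done);
	refcount_set(&priv->pending_refs, 1);
	/* take one extra reference per submitted bio, then drop the initial one */
	if (!refcount_dec_and_test(&priv->pending_refs))
		wait_for_completion_io(&priv->done);
	ret = blk_status_to_errno(READ_ONCE(priv->status));
	kfree(priv);

	/* btrfs_encoded_read_endio() */
	if (refcount_dec_and_test(&priv->pending_refs))
		complete(&priv->done);

The submitter frees the structure only after it has either dropped the last reference itself or waited for the completion signalled by the final endio, instead of sleeping on a wait queue embedded in the structure being freed.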
Fixes: 1881fba89bd5 ("btrfs: add BTRFS_IOC_ENCODED_READ ioctl") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 94c8809e8170..6baa0269a85b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9078,9 +9078,9 @@ out: } struct btrfs_encoded_read_private { - wait_queue_head_t wait; + struct completion done; void *uring_ctx; - atomic_t pending; + refcount_t pending_refs; blk_status_t status; }; @@ -9099,14 +9099,14 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (atomic_dec_and_test(&priv->pending)) { + if (refcount_dec_and_test(&priv->pending_refs)) { int err = blk_status_to_errno(READ_ONCE(priv->status)); if (priv->uring_ctx) { btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - wake_up(&priv->wait); + complete(&priv->done); } } bio_put(&bbio->bio); @@ -9126,8 +9126,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!priv) return -ENOMEM; - init_waitqueue_head(&priv->wait); - atomic_set(&priv->pending, 1); + init_completion(&priv->done); + refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9140,7 +9140,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, @@ -9155,11 +9155,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); if (uring_ctx) { - if (atomic_dec_return(&priv->pending) == 0) { + if (refcount_dec_and_test(&priv->pending_refs)) { ret = blk_status_to_errno(READ_ONCE(priv->status)); btrfs_uring_read_extent_endio(uring_ctx, ret); kfree(priv); @@ -9168,8 +9168,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { - if (atomic_dec_return(&priv->pending) != 0) - io_wait_event(priv->wait, !atomic_read(&priv->pending)); + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&priv->done); /* See btrfs_encoded_read_endio() for ordering. */ ret = blk_status_to_errno(READ_ONCE(priv->status)); kfree(priv); -- cgit v1.2.3 From 44f52bbe96dfdbe4aca3818a2534520082a07040 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 11 Dec 2024 16:08:07 +0000 Subject: btrfs: fix use-after-free when COWing tree bock and tracing is enabled When a COWing a tree block, at btrfs_cow_block(), and we have the tracepoint trace_btrfs_cow_block() enabled and preemption is also enabled (CONFIG_PREEMPT=y), we can trigger a use-after-free in the COWed extent buffer while inside the tracepoint code. This is because in some paths that call btrfs_cow_block(), such as btrfs_search_slot(), we are holding the last reference on the extent buffer @buf so btrfs_force_cow_block() drops the last reference on the @buf extent buffer when it calls free_extent_buffer_stale(buf), which schedules the release of the extent buffer with RCU. 
This means that if we are on a kernel with preemption, the current task may be preempted before calling trace_btrfs_cow_block() and the extent buffer already released by the time trace_btrfs_cow_block() is called, resulting in a use-after-free. Fix this by moving the trace_btrfs_cow_block() from btrfs_cow_block() to btrfs_force_cow_block() before the COWed extent buffer is freed. This also has a side effect of invoking the tracepoint in the tree defrag code, at defrag.c:btrfs_realloc_node(), since btrfs_force_cow_block() is called there, but this is fine and it was actually missing there. Reported-by: syzbot+8517da8635307182c8a5@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/6759a9b9.050a0220.1ac542.000d.GAE@google.com/ CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 693dc27ffb89..185985a337b3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -654,6 +654,8 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, goto error_unlock_cow; } } + + trace_btrfs_cow_block(root, buf, cow); if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); @@ -710,7 +712,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; - int ret; if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { btrfs_abort_transaction(trans, -EUCLEAN); @@ -751,12 +752,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, * Also We don't care about the error, as it's handled internally. */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); - ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, - cow_ret, search_start, 0, nest); - - trace_btrfs_cow_block(root, buf, *cow_ret); - - return ret; + return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); } ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); -- cgit v1.2.3 From 3e74859ee35edc33a022c3f3971df066ea0ca6b9 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 13 Dec 2024 12:22:32 -0800 Subject: btrfs: check folio mapping after unlock in relocate_one_folio() When we call btrfs_read_folio() to bring a folio uptodate, we unlock the folio. The result of that is that a different thread can modify the mapping (like remove it with invalidate) before we call folio_lock(). This results in an invalid page and we need to try again. 
In particular, if we are relocating concurrently with aborting a transaction, this can result in a crash like the following: BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] SMP CPU: 76 PID: 1411631 Comm: kworker/u322:5 Workqueue: events_unbound btrfs_reclaim_bgs_work RIP: 0010:set_page_extent_mapped+0x20/0xb0 RSP: 0018:ffffc900516a7be8 EFLAGS: 00010246 RAX: ffffea009e851d08 RBX: ffffea009e0b1880 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffc900516a7b90 RDI: ffffea009e0b1880 RBP: 0000000003573000 R08: 0000000000000001 R09: ffff88c07fd2f3f0 R10: 0000000000000000 R11: 0000194754b575be R12: 0000000003572000 R13: 0000000003572fff R14: 0000000000100cca R15: 0000000005582fff FS: 0000000000000000(0000) GS:ffff88c07fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000407d00f002 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __die+0x78/0xc0 ? page_fault_oops+0x2a8/0x3a0 ? __switch_to+0x133/0x530 ? wq_worker_running+0xa/0x40 ? exc_page_fault+0x63/0x130 ? asm_exc_page_fault+0x22/0x30 ? set_page_extent_mapped+0x20/0xb0 relocate_file_extent_cluster+0x1a7/0x940 relocate_data_extent+0xaf/0x120 relocate_block_group+0x20f/0x480 btrfs_relocate_block_group+0x152/0x320 btrfs_relocate_chunk+0x3d/0x120 btrfs_reclaim_bgs_work+0x2ae/0x4e0 process_scheduled_works+0x184/0x370 worker_thread+0xc6/0x3e0 ? blk_add_timer+0xb0/0xb0 kthread+0xae/0xe0 ? flush_tlb_kernel_range+0x90/0x90 ret_from_fork+0x2f/0x40 ? flush_tlb_kernel_range+0x90/0x90 ret_from_fork_asm+0x11/0x20 This occurs because cleanup_one_transaction() calls destroy_delalloc_inodes() which calls invalidate_inode_pages2() which takes the folio_lock before setting mapping to NULL. We fail to check this, and subsequently call set_extent_mapping(), which assumes that mapping != NULL (in fact it asserts that in debug mode) Note that the "fixes" patch here is not the one that introduced the race (the very first iteration of this code from 2009) but a more recent change that made this particular crash happen in practice. Fixes: e7f1326cc24e ("btrfs: set page extent mapped after read_folio in relocate_one_page") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index bf267bdfa8f8..db8b42f674b7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2902,6 +2902,7 @@ static int relocate_one_folio(struct reloc_control *rc, const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); ASSERT(index <= last_index); +again: folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) { @@ -2937,6 +2938,11 @@ static int relocate_one_folio(struct reloc_control *rc, ret = -EIO; goto release_folio; } + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } /* -- cgit v1.2.3 From 0fba7be1ca6df2881e68386e5575fe096f33c4ca Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 13 Dec 2024 12:33:22 -0800 Subject: btrfs: check folio mapping after unlock in put_file_data() When we call btrfs_read_folio() we get an unlocked folio, so it is possible for a different thread to concurrently modify folio->mapping. 
We must check that this hasn't happened once we do have the lock. CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/send.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7254279c3cc9..498c84323253 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5280,6 +5280,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); +again: folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { page_cache_sync_readahead(mapping, @@ -5312,6 +5313,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) ret = -EIO; break; } + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, -- cgit v1.2.3 From 0525064bb82e50d59543b62b9d41a606198a4a44 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 29 Nov 2024 12:25:30 +0000 Subject: btrfs: fix race with memory mapped writes when activating swap file When activating the swap file we flush all delalloc and wait for ordered extent completion, so that we don't miss any delalloc and extents before we check that the file's extent layout is usable for a swap file and activate the swap file. We are called with the inode's VFS lock acquired, so we won't race with buffered and direct IO writes, however we can still race with memory mapped writes since they don't acquire the inode's VFS lock. The race window is between flushing all delalloc and locking the whole file's extent range, since memory mapped writes lock an extent range with the length of a page. Fix this by acquiring the inode's mmap lock before we flush delalloc. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6baa0269a85b..b2abc0aa5300 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9809,6 +9809,15 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, u64 isize; u64 start; + /* + * Acquire the inode's mmap lock to prevent races with memory mapped + * writes, as they could happen after we flush delalloc below and before + * we lock the extent range further below. The inode was already locked + * up in the call chain. + */ + btrfs_assert_inode_locked(BTRFS_I(inode)); + down_write(&BTRFS_I(inode)->i_mmap_lock); + /* * If the swap file was just created, make sure delalloc is done. If the * file changes again after this, the user is doing something stupid and @@ -9816,22 +9825,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - return ret; + goto out_unlock_mmap; /* * The inode is locked, so these flags won't change after we check them. 
*/ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* @@ -9846,7 +9858,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; + ret = -EBUSY; + goto out_unlock_mmap; } /* @@ -9860,7 +9873,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* * Snapshots can create extents which require COW even if NODATACOW is @@ -9881,7 +9895,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", btrfs_root_id(root)); - return -EPERM; + ret = -EPERM; + goto out_unlock_mmap; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); @@ -10036,6 +10051,8 @@ out: btrfs_exclop_finish(fs_info); +out_unlock_mmap: + up_write(&BTRFS_I(inode)->i_mmap_lock); if (ret) return ret; -- cgit v1.2.3 From 03018e5d8508254534511d40fb57bc150e6a87f2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 12:54:14 +0000 Subject: btrfs: fix swap file activation failure due to extents that used to be shared When activating a swap file, to determine if an extent is shared we use can_nocow_extent(), which ends up at btrfs_cross_ref_exist(). That helper is meant to be quick because it's used in the NOCOW write path, when flushing delalloc and when doing a direct IO write, however it does return some false positives, meaning it may indicate that an extent is shared even if it's no longer the case. For the write path this is fine, we just do a unnecessary COW operation instead of doing a more rigorous check which would be too heavy (calling btrfs_is_data_extent_shared()). However when activating a swap file, the false positives simply result in a failure, which is confusing for users/applications. One particular case where this happens is when a data extent only has 1 reference but that reference is not inlined in the extent item located in the extent tree - this happens when we create more than 33 references for an extent and then delete those 33 references plus every other non-inline reference except one. The function check_committed_ref() assumes that if the size of an extent item doesn't match the size of struct btrfs_extent_item plus the size of an inline reference (plus an owner reference in case simple quotas are enabled), then the extent is shared - that is not the case however, we can have a single reference but it's not inlined - the reason we do this is to be fast and avoid inspecting non-inline references which may be located in another leaf of the extent tree, slowing down write paths. 
The following test script reproduces the bug: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi NUM_CLONES=50 umount $DEV &> /dev/null run_test() { local sync_after_add_reflinks=$1 local sync_after_remove_reflinks=$2 mkfs.btrfs -f $DEV > /dev/null #mkfs.xfs -f $DEV > /dev/null mount $DEV $MNT touch $MNT/foo chmod 0600 $MNT/foo # On btrfs the file must be NOCOW. chattr +C $MNT/foo &> /dev/null xfs_io -s -c "pwrite -b 1M 0 1M" $MNT/foo mkswap $MNT/foo for ((i = 1; i <= $NUM_CLONES; i++)); do touch $MNT/foo_clone_$i chmod 0600 $MNT/foo_clone_$i # On btrfs the file must be NOCOW. chattr +C $MNT/foo_clone_$i &> /dev/null cp --reflink=always $MNT/foo $MNT/foo_clone_$i done if [ $sync_after_add_reflinks -ne 0 ]; then # Flush delayed refs and commit current transaction. sync -f $MNT fi # Remove the original file and all clones except the last. rm -f $MNT/foo for ((i = 1; i < $NUM_CLONES; i++)); do rm -f $MNT/foo_clone_$i done if [ $sync_after_remove_reflinks -ne 0 ]; then # Flush delayed refs and commit current transaction. sync -f $MNT fi # Now use the last clone as a swap file. It should work since # its extent are not shared anymore. swapon $MNT/foo_clone_${NUM_CLONES} swapoff $MNT/foo_clone_${NUM_CLONES} umount $MNT } echo -e "\nTest without sync after creating and removing clones" run_test 0 0 echo -e "\nTest with sync after creating clones" run_test 1 0 echo -e "\nTest with sync after removing clones" run_test 0 1 echo -e "\nTest with sync after creating and removing clones" run_test 1 1 Running the test: $ ./test.sh Test without sync after creating and removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0017 sec (556.793 MiB/sec and 556.7929 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=a6b9c29e-5ef4-4689-a8ac-bc199c750f02 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Test with sync after creating clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0036 sec (271.739 MiB/sec and 271.7391 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=5e9008d6-1f7a-4948-a1b4-3f30aba20a33 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Test with sync after removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0103 sec (96.665 MiB/sec and 96.6651 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=916c2740-fa9f-4385-9f06-29c3f89e4764 Test with sync after creating and removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0031 sec (314.268 MiB/sec and 314.2678 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=06aab1dd-4d90-49c0-bd9f-3a8db4e2f912 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Fix this by reworking btrfs_swap_activate() to instead of using extent maps and checking for shared extents with can_nocow_extent(), iterate over the inode's file extent items and use the accurate btrfs_is_data_extent_shared(). 
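In outline, the reworked loop in btrfs_swap_activate() does roughly the following for each file extent item (heavily condensed from the diff below; error paths and the logical-to-physical mapping are left out):

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = prev_extent_end;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret > 0)	/* implicit hole (NO_HOLES) */
		return -EINVAL;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	/* reject inline, compressed and hole extents, then check sharedness: */
	ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
					  extent_gen, backref_ctx);
	if (ret > 0)	/* extent really is shared */
		return -EINVAL;	/* "swapfile must not be copy-on-write" */

Unlike the can_nocow_extent()/btrfs_cross_ref_exist() path, btrfs_is_data_extent_shared() performs the full backref check and so does not return the false positives that made activation fail in the reproducer above.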
CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 96 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 69 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2abc0aa5300..b87f19630b00 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9799,15 +9799,16 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; + struct btrfs_path *path = NULL; int ret = 0; u64 isize; - u64 start; + u64 prev_extent_end = 0; /* * Acquire the inode's mmap lock to prevent races with memory mapped @@ -9846,6 +9847,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, goto out_unlock_mmap; } + path = btrfs_alloc_path(); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + if (!path || !backref_ctx) { + ret = -ENOMEM; + goto out_unlock_mmap; + } + /* * Balance or device remove/replace/resize can move stuff around from * under us. The exclop protection makes sure they aren't running/won't @@ -9904,24 +9912,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); lock_extent(io_tree, 0, isize - 1, &cached_state); - start = 0; - while (start < isize) { - u64 logical_block_start, physical_block_start; + while (prev_extent_end < isize) { + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; struct btrfs_block_group *bg; - u64 len = isize - start; + u64 logical_block_start; + u64 physical_block_start; + u64 extent_gen; + u64 disk_bytenr; + u64 len; - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = prev_extent_end; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) goto out; - } - if (em->disk_bytenr == EXTENT_MAP_HOLE) { + /* + * If key not found it means we have an implicit hole (NO_HOLES + * is enabled). 
+ */ + if (ret > 0) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } - if (em->disk_bytenr == EXTENT_MAP_INLINE) { + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be @@ -9933,23 +9956,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (extent_map_is_compressed(em)) { + + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } - logical_block_start = extent_map_block_start(em) + (start - em->start); - len = min(len, em->len - (start - em->start)); - free_extent_map(em); - em = NULL; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (disk_bytenr == 0) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + prev_extent_end = btrfs_file_extent_end(path); + + if (prev_extent_end > isize) + len = isize - key.offset; + else + len = btrfs_file_extent_num_bytes(leaf, ei); - ret = can_nocow_extent(inode, start, &len, NULL, false, true); + backref_ctx->curr_leaf_bytenr = leaf->start; + + /* + * Don't need the path anymore, release to avoid deadlocks when + * calling btrfs_is_data_extent_shared() because when joining a + * transaction it can block waiting for the current one's commit + * which in turn may be trying to lock the same leaf to flush + * delayed items for example. + */ + btrfs_release_path(path); + + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, + extent_gen, backref_ctx); if (ret < 0) { goto out; - } else if (ret) { - ret = 0; - } else { + } else if (ret > 0) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; @@ -9984,7 +10029,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, physical_block_start = (map->stripes[0].physical + (logical_block_start - map->start)); - len = min(len, map->chunk_len - (logical_block_start - map->start)); btrfs_free_chunk_map(map); map = NULL; @@ -10025,20 +10069,16 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) goto out; } - bsi.start = start; + bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; } - - start += len; } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: - if (!IS_ERR_OR_NULL(em)) - free_extent_map(em); if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); @@ -10053,6 +10093,8 @@ out: out_unlock_mmap: up_write(&BTRFS_I(inode)->i_mmap_lock); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); if (ret) return ret; -- cgit v1.2.3 From 9a45022a0efadd99bcc58f7f1cc2b6fb3b808c40 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 16:31:41 +0000 Subject: btrfs: allow swap activation to be interruptible During swap activation we iterate over the extents of a file, then do several checks for each extent, some of which may take some significant time such as checking if an extent is shared. Since a file can have many thousands of extents, this can be a very slow operation and it's currently not interruptible. 
I had a bug during development of a previous patch that resulted in an infinite loop when iterating the extents, so a core was busy looping and I couldn't cancel the operation, which is very annoying and requires a reboot. So make the loop interruptible by checking for fatal signals at the end of each iteration and stopping immediately if there is one. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b87f19630b00..c4675f4345fd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10073,6 +10073,11 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, bsi.block_start = physical_block_start; bsi.block_len = len; } + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } } if (bsi.block_len) -- cgit v1.2.3 From 2c8507c63f5498d4ee4af404a8e44ceae4345056 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 16:43:44 +0000 Subject: btrfs: avoid monopolizing a core when activating a swap file During swap activation we iterate over the extents of a file and we can have many thousands of them, so we can end up in a busy loop monopolizing a core. Avoid this by doing a voluntary reschedule after processing each extent. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c4675f4345fd..623d9d7ab480 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10078,6 +10078,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINTR; goto out; } + + cond_resched(); } if (bsi.block_len) -- cgit v1.2.3 From f2363e6fcc7938c5f0f6ac066fad0dd247598b51 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Wed, 11 Dec 2024 19:13:15 +0800 Subject: btrfs: fix transaction atomicity bug when enabling simple quotas Set squota incompat bit before committing the transaction that enables the feature. With the config CONFIG_BTRFS_ASSERT enabled, an assertion failure occurs regarding the simple quota feature. [5.596534] assertion failed: btrfs_fs_incompat(fs_info, SIMPLE_QUOTA), in fs/btrfs/qgroup.c:365 [5.597098] ------------[ cut here ]------------ [5.597371] kernel BUG at fs/btrfs/qgroup.c:365! [5.597946] CPU: 1 UID: 0 PID: 268 Comm: mount Not tainted 6.13.0-rc2-00031-gf92f4749861b #146 [5.598450] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [5.599008] RIP: 0010:btrfs_read_qgroup_config+0x74d/0x7a0 [5.604303] [5.605230] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.605538] ? exc_invalid_op+0x56/0x70 [5.605775] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.606066] ? asm_exc_invalid_op+0x1f/0x30 [5.606441] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.606741] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.607038] ? 
try_to_wake_up+0x317/0x760 [5.607286] open_ctree+0xd9c/0x1710 [5.607509] btrfs_get_tree+0x58a/0x7e0 [5.608002] vfs_get_tree+0x2e/0x100 [5.608224] fc_mount+0x16/0x60 [5.608420] btrfs_get_tree+0x2f8/0x7e0 [5.608897] vfs_get_tree+0x2e/0x100 [5.609121] path_mount+0x4c8/0xbc0 [5.609538] __x64_sys_mount+0x10d/0x150 The issue can be easily reproduced using the following reproducer: root@q:linux# cat repro.sh set -e mkfs.btrfs -q -f /dev/sdb mount /dev/sdb /mnt/btrfs btrfs quota enable -s /mnt/btrfs umount /mnt/btrfs mount /dev/sdb /mnt/btrfs The issue is that when enabling quotas, at btrfs_quota_enable(), we set BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE at fs_info->qgroup_flags and persist it in the quota root in the item with the key BTRFS_QGROUP_STATUS_KEY, but we only set the incompat bit BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA after we commit the transaction used to enable simple quotas. This means that if after that transaction commit we unmount the filesystem without starting and committing any other transaction, or we have a power failure, the next time we mount the filesystem we will find the flag BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE set in the item with the key BTRFS_QGROUP_STATUS_KEY but we will not find the incompat bit BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA set in the superblock, triggering an assertion failure at: btrfs_read_qgroup_config() -> qgroup_read_enable_gen() To fix this issue, set the BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA flag immediately after setting the BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE. This ensures that both flags are flushed to disk within the same transaction. Fixes: 182940f4f4db ("btrfs: qgroup: add new quota mode for simple quotas") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Filipe Manana Signed-off-by: Julian Sun Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a6f92836c9b1..f9b214992212 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1121,6 +1121,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; if (simple) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); } else { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -1254,8 +1255,6 @@ out_add_root: spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - if (simple) - btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); /* Skip rescan for simple qgroups. */ -- cgit v1.2.3 From 4b8d867ca6e2fc6d152f629fdaf027053b81765a Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Thu, 12 Dec 2024 14:37:56 -0800 Subject: xfs: don't over-report free space or inodes in statvfs Emmanual Florac reports a strange occurrence when project quota limits are enabled, free space is lower than the remaining quota, and someone runs statvfs: # mkfs.xfs -f /dev/sda # mount /dev/sda /mnt -o prjquota # xfs_quota -x -c 'limit -p bhard=2G 55' /mnt # mkdir /mnt/dir # xfs_io -c 'chproj 55' -c 'chattr +P' -c 'stat -vvvv' /mnt/dir # fallocate -l 19g /mnt/a # df /mnt /mnt/dir Filesystem Size Used Avail Use% Mounted on /dev/sda 20G 20G 345M 99% /mnt /dev/sda 2.0G 0 2.0G 0% /mnt I think the bug here is that xfs_fill_statvfs_from_dquot unconditionally assigns to f_bfree without checking that the filesystem has enough free space to fill the remaining project quota. However, this is a longstanding behavior of xfs so it's unclear what to do here. Cc: # v2.6.18 Fixes: 932f2c323196c2 ("[XFS] statvfs component of directory/project quota support, code originally by Glen.") Reported-by: Emmanuel Florac Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_qm_bhv.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 847ba29630e9..db5b8afd9d1b 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -32,21 +32,28 @@ xfs_fill_statvfs_from_dquot( limit = blkres->softlimit ? blkres->softlimit : blkres->hardlimit; - if (limit && statp->f_blocks > limit) { - statp->f_blocks = limit; - statp->f_bfree = statp->f_bavail = - (statp->f_blocks > blkres->reserved) ? - (statp->f_blocks - blkres->reserved) : 0; + if (limit) { + uint64_t remaining = 0; + + if (limit > blkres->reserved) + remaining = limit - blkres->reserved; + + statp->f_blocks = min(statp->f_blocks, limit); + statp->f_bfree = min(statp->f_bfree, remaining); + statp->f_bavail = min(statp->f_bavail, remaining); } limit = dqp->q_ino.softlimit ? dqp->q_ino.softlimit : dqp->q_ino.hardlimit; - if (limit && statp->f_files > limit) { - statp->f_files = limit; - statp->f_ffree = - (statp->f_files > dqp->q_ino.reserved) ? - (statp->f_files - dqp->q_ino.reserved) : 0; + if (limit) { + uint64_t remaining = 0; + + if (limit > dqp->q_ino.reserved) + remaining = limit - dqp->q_ino.reserved; + + statp->f_files = min(statp->f_files, limit); + statp->f_ffree = min(statp->f_ffree, remaining); } } -- cgit v1.2.3 From 4f13f0a3fc6ad193e4d144a5e001b7b8f1fc4b7f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:12 -0800 Subject: xfs: tidy up xfs_iroot_realloc Tidy up this function a bit before we start refactoring the memory handling and move the function to the bmbt code. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_fork.c | 83 ++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 1158ca48626b..7f865479c415 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -382,33 +382,32 @@ xfs_iformat_attr_fork( */ void xfs_iroot_realloc( - xfs_inode_t *ip, + struct xfs_inode *ip, int rec_diff, int whichfork) { struct xfs_mount *mp = ip->i_mount; - int cur_max; - struct xfs_ifork *ifp; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *new_broot; - int new_max; - size_t new_size; char *np; char *op; + size_t new_size; + short old_size = ifp->if_broot_bytes; + int cur_max; + int new_max; /* * Handle the degenerate case quietly. */ - if (rec_diff == 0) { + if (rec_diff == 0) return; - } - ifp = xfs_ifork_ptr(ip, whichfork); if (rec_diff > 0) { /* * If there wasn't any memory allocated before, just * allocate it now and get out. */ - if (ifp->if_broot_bytes == 0) { + if (old_size == 0) { new_size = xfs_bmap_broot_space_calc(mp, rec_diff); ifp->if_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); @@ -422,13 +421,13 @@ xfs_iroot_realloc( * location. The records don't change location because * they are kept butted up against the btree block header. */ - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false); + cur_max = xfs_bmbt_maxrecs(mp, old_size, false); new_max = cur_max + rec_diff; new_size = xfs_bmap_broot_space_calc(mp, new_max); ifp->if_broot = krealloc(ifp->if_broot, new_size, GFP_KERNEL | __GFP_NOFAIL); op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); + old_size); np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, (int)new_size); ifp->if_broot_bytes = (int)new_size; @@ -443,52 +442,50 @@ xfs_iroot_realloc( * if_broot buffer. It must already exist. If we go to zero * records, just get rid of the root and clear the status bit. */ - ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false); + ASSERT(ifp->if_broot != NULL && old_size > 0); + cur_max = xfs_bmbt_maxrecs(mp, old_size, false); new_max = cur_max + rec_diff; ASSERT(new_max >= 0); if (new_max > 0) new_size = xfs_bmap_broot_space_calc(mp, new_max); else new_size = 0; - if (new_size > 0) { - new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); - /* - * First copy over the btree block header. - */ - memcpy(new_broot, ifp->if_broot, - xfs_bmbt_block_len(ip->i_mount)); - } else { - new_broot = NULL; + if (new_size == 0) { + ifp->if_broot = NULL; + ifp->if_broot_bytes = 0; + return; } /* - * Only copy the keys and pointers if there are any. + * Shrink the btree root by allocating a smaller object and copying the + * fields from the old object to the new object. krealloc does nothing + * if we realloc downwards. */ - if (new_max > 0) { - /* - * First copy the keys. - */ - op = (char *)xfs_bmbt_key_addr(mp, ifp->if_broot, 1); - np = (char *)xfs_bmbt_key_addr(mp, new_broot, 1); - memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t)); + new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); + /* + * First copy over the btree block header. + */ + memcpy(new_broot, ifp->if_broot, xfs_bmbt_block_len(ip->i_mount)); + + /* + * First copy the keys. 
+ */ + op = (char *)xfs_bmbt_key_addr(mp, ifp->if_broot, 1); + np = (char *)xfs_bmbt_key_addr(mp, new_broot, 1); + memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t)); + + /* + * Then copy the pointers. + */ + op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, old_size); + np = (char *)xfs_bmap_broot_ptr_addr(mp, new_broot, 1, (int)new_size); + memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); - /* - * Then copy the pointers. - */ - op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)xfs_bmap_broot_ptr_addr(mp, new_broot, 1, - (int)new_size); - memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); - } kfree(ifp->if_broot); ifp->if_broot = new_broot; ifp->if_broot_bytes = (int)new_size; - if (ifp->if_broot) - ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= - xfs_inode_fork_size(ip, whichfork)); - return; + ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= + xfs_inode_fork_size(ip, whichfork)); } -- cgit v1.2.3 From 1aacd3fac248902ea1f7607f2d12b93929a4833b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 17 Dec 2024 15:00:49 -0800 Subject: xfs: release the dquot buf outside of qli_lock Lai Yi reported a lockdep complaint about circular locking: Chain exists of: &lp->qli_lock --> &bch->bc_lock --> &l->lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&l->lock); lock(&bch->bc_lock); lock(&l->lock); lock(&lp->qli_lock); I /think/ the problem here is that xfs_dquot_attach_buf during quotacheck will release the buffer while it's holding the qli_lock. Because this is a cached buffer, xfs_buf_rele_cached takes b_lock before decrementing b_hold. Other threads have taught lockdep that a locking dependency chain is bp->b_lock -> bch->bc_lock -> l(ru)->lock; and that another chain is l(ru)->lock -> lp->qli_lock. Hence we do not want to take b_lock while holding qli_lock. Reported-by: syzbot+3126ab3db03db42e7a31@syzkaller.appspotmail.com Cc: # v6.13-rc3 Fixes: ca378189fdfa89 ("xfs: convert quotacheck to attach dquot buffers") Tested-by: syzbot+3126ab3db03db42e7a31@syzkaller.appspotmail.com Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index f11d475898f2..576b7755b1f1 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1315,7 +1315,8 @@ out_abort: /* * Attach a dquot buffer to this dquot to avoid allocating a buffer during a - * dqflush, since dqflush can be called from reclaim context. + * dqflush, since dqflush can be called from reclaim context. Caller must hold + * the dqlock. */ int xfs_dquot_attach_buf( @@ -1336,13 +1337,16 @@ xfs_dquot_attach_buf( return error; /* - * Attach the dquot to the buffer so that the AIL does not have - * to read the dquot buffer to push this item. + * Hold the dquot buffer so that we retain our ref to it after + * detaching it from the transaction, then give that ref to the + * dquot log item so that the AIL does not have to read the + * dquot buffer to push this item. */ xfs_buf_hold(bp); + xfs_trans_brelse(tp, bp); + spin_lock(&qlip->qli_lock); lip->li_buf = bp; - xfs_trans_brelse(tp, bp); } qlip->qli_dirty = true; spin_unlock(&qlip->qli_lock); -- cgit v1.2.3 From 6c1c55ac3c0512262817a088e805d99aad4c0867 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Wed, 20 Nov 2024 16:20:13 -0800 Subject: xfs: refactor the inode fork memory allocation functions Hoist the code that allocates, frees, and reallocates if_broot into a single xfs_iroot_krealloc function. Eventually we're going to push xfs_iroot_realloc into the btree ops structure to handle multiple inode-rooted btrees, but first let's separate out the bits that should stay in xfs_inode_fork.c. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_fork.c | 116 +++++++++++++++++++++++++++-------------- fs/xfs/libxfs/xfs_inode_fork.h | 5 ++ 2 files changed, 82 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 7f865479c415..294c3c555683 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -178,7 +178,7 @@ xfs_iformat_btree( struct xfs_mount *mp = ip->i_mount; xfs_bmdr_block_t *dfp; struct xfs_ifork *ifp; - /* REFERENCED */ + struct xfs_btree_block *broot; int nrecs; int size; int level; @@ -211,16 +211,13 @@ xfs_iformat_btree( return -EFSCORRUPTED; } - ifp->if_broot_bytes = size; - ifp->if_broot = kmalloc(size, - GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); - ASSERT(ifp->if_broot != NULL); + broot = xfs_broot_alloc(ifp, size); /* * Copy and convert from the on-disk structure * to the in-memory structure. */ xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), - ifp->if_broot, size); + broot, size); ifp->if_bytes = 0; ifp->if_data = NULL; @@ -362,6 +359,69 @@ xfs_iformat_attr_fork( return error; } +/* + * Allocate the if_broot component of an inode fork so that it is @new_size + * bytes in size, using __GFP_NOLOCKDEP like all the other code that + * initializes a broot during inode load. Returns if_broot. + */ +struct xfs_btree_block * +xfs_broot_alloc( + struct xfs_ifork *ifp, + size_t new_size) +{ + ASSERT(ifp->if_broot == NULL); + + ifp->if_broot = kmalloc(new_size, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); + ifp->if_broot_bytes = new_size; + return ifp->if_broot; +} + +/* + * Reallocate the if_broot component of an inode fork so that it is @new_size + * bytes in size. Returns if_broot. + */ +struct xfs_btree_block * +xfs_broot_realloc( + struct xfs_ifork *ifp, + size_t new_size) +{ + /* No size change? No action needed. */ + if (new_size == ifp->if_broot_bytes) + return ifp->if_broot; + + /* New size is zero, free it. */ + if (new_size == 0) { + ifp->if_broot_bytes = 0; + kfree(ifp->if_broot); + ifp->if_broot = NULL; + return NULL; + } + + /* + * Shrinking the iroot means we allocate a new smaller object and copy + * it. We don't trust krealloc not to nop on realloc-down. + */ + if (ifp->if_broot_bytes > 0 && ifp->if_broot_bytes > new_size) { + struct xfs_btree_block *old_broot = ifp->if_broot; + + ifp->if_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); + ifp->if_broot_bytes = new_size; + memcpy(ifp->if_broot, old_broot, new_size); + kfree(old_broot); + return ifp->if_broot; + } + + /* + * Growing the iroot means we can krealloc. This may get us the same + * object. + */ + ifp->if_broot = krealloc(ifp->if_broot, new_size, + GFP_KERNEL | __GFP_NOFAIL); + ifp->if_broot_bytes = new_size; + return ifp->if_broot; +} + /* * Reallocate the space for if_broot based on the number of records * being added or deleted as indicated in rec_diff. 
Move the records @@ -388,7 +448,6 @@ xfs_iroot_realloc( { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - struct xfs_btree_block *new_broot; char *np; char *op; size_t new_size; @@ -409,9 +468,7 @@ xfs_iroot_realloc( */ if (old_size == 0) { new_size = xfs_bmap_broot_space_calc(mp, rec_diff); - ifp->if_broot = kmalloc(new_size, - GFP_KERNEL | __GFP_NOFAIL); - ifp->if_broot_bytes = (int)new_size; + xfs_broot_realloc(ifp, new_size); return; } @@ -424,13 +481,12 @@ xfs_iroot_realloc( cur_max = xfs_bmbt_maxrecs(mp, old_size, false); new_max = cur_max + rec_diff; new_size = xfs_bmap_broot_space_calc(mp, new_max); - ifp->if_broot = krealloc(ifp->if_broot, new_size, - GFP_KERNEL | __GFP_NOFAIL); + + xfs_broot_realloc(ifp, new_size); op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, old_size); np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, (int)new_size); - ifp->if_broot_bytes = (int)new_size; ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= xfs_inode_fork_size(ip, whichfork)); memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t)); @@ -451,39 +507,21 @@ xfs_iroot_realloc( else new_size = 0; if (new_size == 0) { - ifp->if_broot = NULL; - ifp->if_broot_bytes = 0; + xfs_broot_realloc(ifp, 0); return; } /* - * Shrink the btree root by allocating a smaller object and copying the - * fields from the old object to the new object. krealloc does nothing - * if we realloc downwards. - */ - new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); - /* - * First copy over the btree block header. - */ - memcpy(new_broot, ifp->if_broot, xfs_bmbt_block_len(ip->i_mount)); - - /* - * First copy the keys. - */ - op = (char *)xfs_bmbt_key_addr(mp, ifp->if_broot, 1); - np = (char *)xfs_bmbt_key_addr(mp, new_broot, 1); - memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t)); - - /* - * Then copy the pointers. + * Shrink the btree root by moving the bmbt pointers, since they are + * not butted up against the btree block header, then reallocating + * broot. */ op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, old_size); - np = (char *)xfs_bmap_broot_ptr_addr(mp, new_broot, 1, (int)new_size); - memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); + np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, + (int)new_size); + memmove(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); - kfree(ifp->if_broot); - ifp->if_broot = new_broot; - ifp->if_broot_bytes = (int)new_size; + xfs_broot_realloc(ifp, new_size); ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= xfs_inode_fork_size(ip, whichfork)); } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 2373d12fd474..e3c5c9121044 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -170,6 +170,11 @@ void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, void xfs_idestroy_fork(struct xfs_ifork *ifp); void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, int whichfork); +struct xfs_btree_block *xfs_broot_alloc(struct xfs_ifork *ifp, + size_t new_size); +struct xfs_btree_block *xfs_broot_realloc(struct xfs_ifork *ifp, + size_t new_size); + void xfs_iroot_realloc(struct xfs_inode *, int, int); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, -- cgit v1.2.3 From 6a92924275ecdd768c8105f8975b971300c5ba7d Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Wed, 20 Nov 2024 16:20:14 -0800 Subject: xfs: make xfs_iroot_realloc take the new numrecs instead of deltas Change the calling signature of xfs_iroot_realloc to take the ifork and the new number of records in the btree block, not a diff against the current number. This will make the callsites easier to understand. Note that this function is misnamed because it is very specific to the single type of inode-rooted btree supported. This will be addressed in a subsequent patch. Return the new btree root to reduce the amount of code clutter. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 7 ++-- fs/xfs/libxfs/xfs_btree.c | 25 +++++-------- fs/xfs/libxfs/xfs_inode_fork.c | 83 +++++++++++++++++++----------------------- fs/xfs/libxfs/xfs_inode_fork.h | 3 +- 4 files changed, 51 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 5255f93bae31..8ab38f07cb78 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -615,7 +615,7 @@ xfs_bmap_btree_to_extents( xfs_trans_binval(tp, cbp); if (cur->bc_levels[0].bp == cbp) cur->bc_levels[0].bp = NULL; - xfs_iroot_realloc(ip, -1, whichfork); + xfs_iroot_realloc(ip, whichfork, 0); ASSERT(ifp->if_broot == NULL); ifp->if_format = XFS_DINODE_FMT_EXTENTS; *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); @@ -659,12 +659,11 @@ xfs_bmap_extents_to_btree( * Make space in the inode incore. This needs to be undone if we fail * to expand the root. */ - xfs_iroot_realloc(ip, 1, whichfork); + block = xfs_iroot_realloc(ip, whichfork, 1); /* * Fill in the root. */ - block = ifp->if_broot; xfs_bmbt_init_block(ip, block, NULL, 1, 1); /* * Need a cursor. Can't allocate until bb_level is filled in. @@ -746,7 +745,7 @@ xfs_bmap_extents_to_btree( out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); out_root_realloc: - xfs_iroot_realloc(ip, -1, whichfork); + xfs_iroot_realloc(ip, whichfork, 0); ifp->if_format = XFS_DINODE_FMT_EXTENTS; ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 68ee1c299c25..5714bec26c20 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -3161,9 +3161,7 @@ xfs_btree_new_iroot( xfs_btree_copy_ptrs(cur, pp, &nptr, 1); - xfs_iroot_realloc(cur->bc_ino.ip, - 1 - xfs_btree_get_numrecs(cblock), - cur->bc_ino.whichfork); + xfs_iroot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork, 1); xfs_btree_setbuf(cur, level, cbp); @@ -3347,7 +3345,8 @@ xfs_btree_make_block_unfull( if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. 
*/ - xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); + xfs_iroot_realloc(ip, cur->bc_ino.whichfork, + numrecs + 1); *stat = 1; } else { /* A root block that needs replacing */ @@ -3705,9 +3704,7 @@ STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { - int whichfork = cur->bc_ino.whichfork; struct xfs_inode *ip = cur->bc_ino.ip; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; union xfs_btree_key *kp; @@ -3716,7 +3713,6 @@ xfs_btree_kill_iroot( union xfs_btree_ptr *cpp; struct xfs_buf *cbp; int level; - int index; int numrecs; int error; #ifdef DEBUG @@ -3762,14 +3758,10 @@ xfs_btree_kill_iroot( ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); #endif - index = numrecs - cur->bc_ops->get_maxrecs(cur, level); - if (index) { - xfs_iroot_realloc(cur->bc_ino.ip, index, - cur->bc_ino.whichfork); - block = ifp->if_broot; - } + block = xfs_iroot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork, + numrecs); - be16_add_cpu(&block->bb_numrecs, index); + block->bb_numrecs = be16_to_cpu(numrecs); ASSERT(block->bb_numrecs == cblock->bb_numrecs); kp = xfs_btree_key_addr(cur, 1, block); @@ -3949,10 +3941,11 @@ xfs_btree_delrec( /* * We're at the root level. First, shrink the root block in-memory. * Try to get rid of the next level down. If we can't then there's - * nothing left to do. + * nothing left to do. numrecs was decremented above. */ if (xfs_btree_at_iroot(cur, level)) { - xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork); + xfs_iroot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork, + numrecs); error = xfs_btree_kill_iroot(cur); if (error) diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 294c3c555683..53bfdf422ad8 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -423,12 +423,10 @@ xfs_broot_realloc( } /* - * Reallocate the space for if_broot based on the number of records - * being added or deleted as indicated in rec_diff. Move the records - * and pointers in if_broot to fit the new size. When shrinking this - * will eliminate holes between the records and pointers created by - * the caller. When growing this will create holes to be filled in - * by the caller. + * Reallocate the space for if_broot based on the number of records. Move the + * records and pointers in if_broot to fit the new size. When shrinking this + * will eliminate holes between the records and pointers created by the caller. + * When growing this will create holes to be filled in by the caller. * * The caller must not request to add more records than would fit in * the on-disk inode root. If the if_broot is currently NULL, then @@ -437,40 +435,47 @@ xfs_broot_realloc( * it can go to zero. * * ip -- the inode whose if_broot area is changing - * ext_diff -- the change in the number of records, positive or negative, - * requested for the if_broot array. + * whichfork -- which inode fork to change + * new_numrecs -- the new number of records requested for the if_broot array + * + * Returns the incore btree root block. */ -void +struct xfs_btree_block * xfs_iroot_realloc( struct xfs_inode *ip, - int rec_diff, - int whichfork) + int whichfork, + unsigned int new_numrecs) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); char *np; char *op; - size_t new_size; - short old_size = ifp->if_broot_bytes; - int cur_max; - int new_max; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; /* - * Handle the degenerate case quietly. 
+ * Block mapping btrees do not support storing zero records; if this + * happens, the fork is being changed to FMT_EXTENTS. Free the broot + * and get out. */ - if (rec_diff == 0) - return; + if (new_numrecs == 0) + return xfs_broot_realloc(ifp, 0); + + new_size = xfs_bmap_broot_space_calc(mp, new_numrecs); + + /* Handle the nop case quietly. */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; - if (rec_diff > 0) { /* * If there wasn't any memory allocated before, just * allocate it now and get out. */ - if (old_size == 0) { - new_size = xfs_bmap_broot_space_calc(mp, rec_diff); - xfs_broot_realloc(ifp, new_size); - return; - } + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); /* * If there is already an existing if_broot, then we need @@ -478,10 +483,7 @@ xfs_iroot_realloc( * location. The records don't change location because * they are kept butted up against the btree block header. */ - cur_max = xfs_bmbt_maxrecs(mp, old_size, false); - new_max = cur_max + rec_diff; - new_size = xfs_bmap_broot_space_calc(mp, new_max); - + old_numrecs = xfs_bmbt_maxrecs(mp, old_size, false); xfs_broot_realloc(ifp, new_size); op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, old_size); @@ -489,27 +491,15 @@ xfs_iroot_realloc( (int)new_size); ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= xfs_inode_fork_size(ip, whichfork)); - memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t)); - return; + memmove(np, op, old_numrecs * (uint)sizeof(xfs_fsblock_t)); + return ifp->if_broot; } /* - * rec_diff is less than 0. In this case, we are shrinking the - * if_broot buffer. It must already exist. If we go to zero - * records, just get rid of the root and clear the status bit. + * We're reducing, but not totally eliminating, numrecs. In this case, + * we are shrinking the if_broot buffer, so it must already exist. 
*/ - ASSERT(ifp->if_broot != NULL && old_size > 0); - cur_max = xfs_bmbt_maxrecs(mp, old_size, false); - new_max = cur_max + rec_diff; - ASSERT(new_max >= 0); - if (new_max > 0) - new_size = xfs_bmap_broot_space_calc(mp, new_max); - else - new_size = 0; - if (new_size == 0) { - xfs_broot_realloc(ifp, 0); - return; - } + ASSERT(ifp->if_broot != NULL && old_size > 0 && new_size > 0); /* * Shrink the btree root by moving the bmbt pointers, since they are @@ -519,11 +509,12 @@ xfs_iroot_realloc( op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, old_size); np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, (int)new_size); - memmove(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); + memmove(np, op, new_numrecs * (uint)sizeof(xfs_fsblock_t)); xfs_broot_realloc(ifp, new_size); ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= xfs_inode_fork_size(ip, whichfork)); + return ifp->if_broot; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index e3c5c9121044..d05eb0bad864 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -175,7 +175,8 @@ struct xfs_btree_block *xfs_broot_alloc(struct xfs_ifork *ifp, struct xfs_btree_block *xfs_broot_realloc(struct xfs_ifork *ifp, size_t new_size); -void xfs_iroot_realloc(struct xfs_inode *, int, int); +struct xfs_btree_block *xfs_iroot_realloc(struct xfs_inode *ip, int whichfork, + unsigned int new_numrecs); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, int); -- cgit v1.2.3 From eb9bff22311ca47ef4848bbdcf24dae06ae3f243 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:15 -0800 Subject: xfs: make xfs_iroot_realloc a bmap btree function Move the inode fork btree root reallocation function part of the btree ops because it's now mostly bmbt-specific code. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 6 +-- fs/xfs/libxfs/xfs_bmap_btree.c | 104 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_bmap_btree.h | 3 ++ fs/xfs/libxfs/xfs_btree.c | 11 ++--- fs/xfs/libxfs/xfs_btree.h | 16 +++++++ fs/xfs/libxfs/xfs_inode_fork.c | 96 ------------------------------------- fs/xfs/libxfs/xfs_inode_fork.h | 2 - 7 files changed, 130 insertions(+), 108 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8ab38f07cb78..0842577755f7 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -615,7 +615,7 @@ xfs_bmap_btree_to_extents( xfs_trans_binval(tp, cbp); if (cur->bc_levels[0].bp == cbp) cur->bc_levels[0].bp = NULL; - xfs_iroot_realloc(ip, whichfork, 0); + xfs_bmap_broot_realloc(ip, whichfork, 0); ASSERT(ifp->if_broot == NULL); ifp->if_format = XFS_DINODE_FMT_EXTENTS; *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); @@ -659,7 +659,7 @@ xfs_bmap_extents_to_btree( * Make space in the inode incore. This needs to be undone if we fail * to expand the root. */ - block = xfs_iroot_realloc(ip, whichfork, 1); + block = xfs_bmap_broot_realloc(ip, whichfork, 1); /* * Fill in the root. 
@@ -745,7 +745,7 @@ xfs_bmap_extents_to_btree( out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); out_root_realloc: - xfs_iroot_realloc(ip, whichfork, 0); + xfs_bmap_broot_realloc(ip, whichfork, 0); ifp->if_format = XFS_DINODE_FMT_EXTENTS; ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 3464be771f95..22cf2059d54d 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -516,6 +516,109 @@ xfs_bmbt_keys_contiguous( be64_to_cpu(key2->bmbt.br_startoff)); } +/* + * Reallocate the space for if_broot based on the number of records. Move the + * records and pointers in if_broot to fit the new size. When shrinking this + * will eliminate holes between the records and pointers created by the caller. + * When growing this will create holes to be filled in by the caller. + * + * The caller must not request to add more records than would fit in the + * on-disk inode root. If the if_broot is currently NULL, then if we are + * adding records, one will be allocated. The caller must also not request + * that the number of records go below zero, although it can go to zero. + * + * ip -- the inode whose if_broot area is changing + * whichfork -- which inode fork to change + * new_numrecs -- the new number of records requested for the if_broot array + * + * Returns the incore btree root block. + */ +struct xfs_btree_block * +xfs_bmap_broot_realloc( + struct xfs_inode *ip, + int whichfork, + unsigned int new_numrecs) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + char *np; + char *op; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; + + /* + * Block mapping btrees do not support storing zero records; if this + * happens, the fork is being changed to FMT_EXTENTS. Free the broot + * and get out. + */ + if (new_numrecs == 0) + return xfs_broot_realloc(ifp, 0); + + new_size = xfs_bmap_broot_space_calc(mp, new_numrecs); + + /* Handle the nop case quietly. */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; + + /* + * If there wasn't any memory allocated before, just + * allocate it now and get out. + */ + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); + + /* + * If there is already an existing if_broot, then we need + * to realloc() it and shift the pointers to their new + * location. The records don't change location because + * they are kept butted up against the btree block header. + */ + old_numrecs = xfs_bmbt_maxrecs(mp, old_size, false); + xfs_broot_realloc(ifp, new_size); + op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, + old_size); + np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, + (int)new_size); + ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= + xfs_inode_fork_size(ip, whichfork)); + memmove(np, op, old_numrecs * (uint)sizeof(xfs_fsblock_t)); + return ifp->if_broot; + } + + /* + * We're reducing, but not totally eliminating, numrecs. In this case, + * we are shrinking the if_broot buffer, so it must already exist. + */ + ASSERT(ifp->if_broot != NULL && old_size > 0 && new_size > 0); + + /* + * Shrink the btree root by moving the bmbt pointers, since they are + * not butted up against the btree block header, then reallocating + * broot. 
+ */ + op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, old_size); + np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, + (int)new_size); + memmove(np, op, new_numrecs * (uint)sizeof(xfs_fsblock_t)); + + xfs_broot_realloc(ifp, new_size); + ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= + xfs_inode_fork_size(ip, whichfork)); + return ifp->if_broot; +} + +static struct xfs_btree_block * +xfs_bmbt_broot_realloc( + struct xfs_btree_cur *cur, + unsigned int new_numrecs) +{ + return xfs_bmap_broot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork, + new_numrecs); +} + const struct xfs_btree_ops xfs_bmbt_ops = { .name = "bmap", .type = XFS_BTREE_TYPE_INODE, @@ -543,6 +646,7 @@ const struct xfs_btree_ops xfs_bmbt_ops = { .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, .keys_contiguous = xfs_bmbt_keys_contiguous, + .broot_realloc = xfs_bmbt_broot_realloc, }; /* diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 49a3bae3f6ec..b238d559ab03 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -198,4 +198,7 @@ xfs_bmap_bmdr_space(struct xfs_btree_block *bb) return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs)); } +struct xfs_btree_block *xfs_bmap_broot_realloc(struct xfs_inode *ip, + int whichfork, unsigned int new_numrecs); + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5714bec26c20..672746f7217c 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -3161,7 +3161,7 @@ xfs_btree_new_iroot( xfs_btree_copy_ptrs(cur, pp, &nptr, 1); - xfs_iroot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork, 1); + cur->bc_ops->broot_realloc(cur, 1); xfs_btree_setbuf(cur, level, cbp); @@ -3345,8 +3345,7 @@ xfs_btree_make_block_unfull( if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ - xfs_iroot_realloc(ip, cur->bc_ino.whichfork, - numrecs + 1); + cur->bc_ops->broot_realloc(cur, numrecs + 1); *stat = 1; } else { /* A root block that needs replacing */ @@ -3758,8 +3757,7 @@ xfs_btree_kill_iroot( ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); #endif - block = xfs_iroot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork, - numrecs); + block = cur->bc_ops->broot_realloc(cur, numrecs); block->bb_numrecs = be16_to_cpu(numrecs); ASSERT(block->bb_numrecs == cblock->bb_numrecs); @@ -3944,8 +3942,7 @@ xfs_btree_delrec( * nothing left to do. numrecs was decremented above. */ if (xfs_btree_at_iroot(cur, level)) { - xfs_iroot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork, - numrecs); + cur->bc_ops->broot_realloc(cur, numrecs); error = xfs_btree_kill_iroot(cur); if (error) diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index c5bff273cae2..8380ae0a64dd 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -213,6 +213,22 @@ struct xfs_btree_ops { const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask); + + /* + * Reallocate the space for if_broot to fit the number of records. + * Move the records and pointers in if_broot to fit the new size. When + * shrinking this will eliminate holes between the records and pointers + * created by the caller. When growing this will create holes to be + * filled in by the caller. + * + * The caller must not request to add more records than would fit in + * the on-disk inode root. If the if_broot is currently NULL, then if + * we are adding records, one will be allocated. 
The caller must also + * not request that the number of records go below zero, although it + * can go to zero. + */ + struct xfs_btree_block *(*broot_realloc)(struct xfs_btree_cur *cur, + unsigned int new_numrecs); }; /* btree geometry flags */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 53bfdf422ad8..60853bac289a 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -422,102 +422,6 @@ xfs_broot_realloc( return ifp->if_broot; } -/* - * Reallocate the space for if_broot based on the number of records. Move the - * records and pointers in if_broot to fit the new size. When shrinking this - * will eliminate holes between the records and pointers created by the caller. - * When growing this will create holes to be filled in by the caller. - * - * The caller must not request to add more records than would fit in - * the on-disk inode root. If the if_broot is currently NULL, then - * if we are adding records, one will be allocated. The caller must also - * not request that the number of records go below zero, although - * it can go to zero. - * - * ip -- the inode whose if_broot area is changing - * whichfork -- which inode fork to change - * new_numrecs -- the new number of records requested for the if_broot array - * - * Returns the incore btree root block. - */ -struct xfs_btree_block * -xfs_iroot_realloc( - struct xfs_inode *ip, - int whichfork, - unsigned int new_numrecs) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - char *np; - char *op; - unsigned int new_size; - unsigned int old_size = ifp->if_broot_bytes; - - /* - * Block mapping btrees do not support storing zero records; if this - * happens, the fork is being changed to FMT_EXTENTS. Free the broot - * and get out. - */ - if (new_numrecs == 0) - return xfs_broot_realloc(ifp, 0); - - new_size = xfs_bmap_broot_space_calc(mp, new_numrecs); - - /* Handle the nop case quietly. */ - if (new_size == old_size) - return ifp->if_broot; - - if (new_size > old_size) { - unsigned int old_numrecs; - - /* - * If there wasn't any memory allocated before, just - * allocate it now and get out. - */ - if (old_size == 0) - return xfs_broot_realloc(ifp, new_size); - - /* - * If there is already an existing if_broot, then we need - * to realloc() it and shift the pointers to their new - * location. The records don't change location because - * they are kept butted up against the btree block header. - */ - old_numrecs = xfs_bmbt_maxrecs(mp, old_size, false); - xfs_broot_realloc(ifp, new_size); - op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, - old_size); - np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, - (int)new_size); - ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= - xfs_inode_fork_size(ip, whichfork)); - memmove(np, op, old_numrecs * (uint)sizeof(xfs_fsblock_t)); - return ifp->if_broot; - } - - /* - * We're reducing, but not totally eliminating, numrecs. In this case, - * we are shrinking the if_broot buffer, so it must already exist. - */ - ASSERT(ifp->if_broot != NULL && old_size > 0 && new_size > 0); - - /* - * Shrink the btree root by moving the bmbt pointers, since they are - * not butted up against the btree block header, then reallocating - * broot. 
- */ - op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, old_size); - np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, - (int)new_size); - memmove(np, op, new_numrecs * (uint)sizeof(xfs_fsblock_t)); - - xfs_broot_realloc(ifp, new_size); - ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= - xfs_inode_fork_size(ip, whichfork)); - return ifp->if_broot; -} - - /* * This is called when the amount of space needed for if_data * is increased or decreased. The change in size is indicated by diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index d05eb0bad864..69ed0919d60b 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -175,8 +175,6 @@ struct xfs_btree_block *xfs_broot_alloc(struct xfs_ifork *ifp, struct xfs_btree_block *xfs_broot_realloc(struct xfs_ifork *ifp, size_t new_size); -struct xfs_btree_block *xfs_iroot_realloc(struct xfs_inode *ip, int whichfork, - unsigned int new_numrecs); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, int); -- cgit v1.2.3 From c914081775e2e39e4afa9b4bb9e5c98202110f51 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:15 -0800 Subject: xfs: tidy up xfs_bmap_broot_realloc a bit Hoist out the code that migrates broot pointers during a resize operation to avoid code duplication and streamline the caller. Also use the correct bmbt pointer type for the sizeof operation. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap_btree.c | 43 ++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 22cf2059d54d..908d7b050e9c 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -516,6 +516,22 @@ xfs_bmbt_keys_contiguous( be64_to_cpu(key2->bmbt.br_startoff)); } +static inline void +xfs_bmbt_move_ptrs( + struct xfs_mount *mp, + struct xfs_btree_block *broot, + short old_size, + size_t new_size, + unsigned int numrecs) +{ + void *dptr; + void *sptr; + + sptr = xfs_bmap_broot_ptr_addr(mp, broot, 1, old_size); + dptr = xfs_bmap_broot_ptr_addr(mp, broot, 1, new_size); + memmove(dptr, sptr, numrecs * sizeof(xfs_bmbt_ptr_t)); +} + /* * Reallocate the space for if_broot based on the number of records. Move the * records and pointers in if_broot to fit the new size. When shrinking this @@ -541,8 +557,7 @@ xfs_bmap_broot_realloc( { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - char *np; - char *op; + struct xfs_btree_block *broot; unsigned int new_size; unsigned int old_size = ifp->if_broot_bytes; @@ -577,15 +592,11 @@ xfs_bmap_broot_realloc( * they are kept butted up against the btree block header. 
*/ old_numrecs = xfs_bmbt_maxrecs(mp, old_size, false); - xfs_broot_realloc(ifp, new_size); - op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, - old_size); - np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, - (int)new_size); - ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= + broot = xfs_broot_realloc(ifp, new_size); + ASSERT(xfs_bmap_bmdr_space(broot) <= xfs_inode_fork_size(ip, whichfork)); - memmove(np, op, old_numrecs * (uint)sizeof(xfs_fsblock_t)); - return ifp->if_broot; + xfs_bmbt_move_ptrs(mp, broot, old_size, new_size, old_numrecs); + return broot; } /* @@ -599,15 +610,11 @@ xfs_bmap_broot_realloc( * not butted up against the btree block header, then reallocating * broot. */ - op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, old_size); - np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, - (int)new_size); - memmove(np, op, new_numrecs * (uint)sizeof(xfs_fsblock_t)); - - xfs_broot_realloc(ifp, new_size); - ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= + xfs_bmbt_move_ptrs(mp, ifp->if_broot, old_size, new_size, new_numrecs); + broot = xfs_broot_realloc(ifp, new_size); + ASSERT(xfs_bmap_bmdr_space(broot) <= xfs_inode_fork_size(ip, whichfork)); - return ifp->if_broot; + return broot; } static struct xfs_btree_block * -- cgit v1.2.3 From 7708951ae52132d3c4e05aee2e57d35f0d89bd49 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:16 -0800 Subject: xfs: hoist the node iroot update code out of xfs_btree_new_iroot In preparation for allowing records in an inode btree root, hoist the code that copies keyptrs from an existing node root into a child block to a separate function. Note that the new function explicitly computes the keys of the new child block and stores that in the root block; while the bmap btree could rely on leaving the key alone, realtime rmap needs to set the new high key. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 117 ++++++++++++++++++++++++++++++---------------- 1 file changed, 76 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 672746f7217c..ed09eeee9161 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -3078,6 +3078,78 @@ xfs_btree_split( #define xfs_btree_split __xfs_btree_split #endif /* __KERNEL__ */ +/* + * Move the keys and pointers from a root block to a separate block. + * + * Since the keyptr size does not change, all we have to do is increase the + * tree height, copy the keyptrs to the new internal node (cblock), shrink + * the root, and copy the pointers there. + */ +STATIC int +xfs_btree_promote_node_iroot( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *cbp, + union xfs_btree_ptr *cptr, + struct xfs_btree_block *cblock) +{ + union xfs_btree_key *ckp; + union xfs_btree_key *kp; + union xfs_btree_ptr *cpp; + union xfs_btree_ptr *pp; + int i; + int error; + int numrecs = xfs_btree_get_numrecs(block); + + /* + * Increase tree height, adjusting the root block level to match. + * We cannot change the root btree node size until we've copied the + * block contents to the new child block. + */ + be16_add_cpu(&block->bb_level, 1); + cur->bc_nlevels++; + cur->bc_levels[level + 1].ptr = 1; + + /* + * Adjust the root btree record count, then copy the keys from the old + * root to the new child block. 
+ */ + xfs_btree_set_numrecs(block, 1); + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, ckp, kp, numrecs); + + /* Check the pointers and copy them to the new child block. */ + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); + for (i = 0; i < numrecs; i++) { + error = xfs_btree_debug_check_ptr(cur, pp, i, level); + if (error) + return error; + } + xfs_btree_copy_ptrs(cur, cpp, pp, numrecs); + + /* + * Set the first keyptr to point to the new child block, then shrink + * the memory buffer for the root block. + */ + error = xfs_btree_debug_check_ptr(cur, cptr, 0, level); + if (error) + return error; + xfs_btree_copy_ptrs(cur, pp, cptr, 1); + xfs_btree_get_keys(cur, cblock, kp); + + cur->bc_ops->broot_realloc(cur, 1); + + /* Attach the new block to the cursor and log it. */ + xfs_btree_setbuf(cur, level, cbp); + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); + xfs_btree_log_keys(cur, cbp, 1, numrecs); + xfs_btree_log_ptrs(cur, cbp, 1, numrecs); + return 0; +} + /* * Copy the old inode root contents into a real block and make the * broot point to it. @@ -3091,14 +3163,10 @@ xfs_btree_new_iroot( struct xfs_buf *cbp; /* buffer for cblock */ struct xfs_btree_block *block; /* btree block */ struct xfs_btree_block *cblock; /* child btree block */ - union xfs_btree_key *ckp; /* child key pointer */ - union xfs_btree_ptr *cpp; /* child ptr pointer */ - union xfs_btree_key *kp; /* pointer to btree key */ - union xfs_btree_ptr *pp; /* pointer to block addr */ + union xfs_btree_ptr *pp; union xfs_btree_ptr nptr; /* new block addr */ int level; /* btree level */ int error; /* error return code */ - int i; /* loop counter */ XFS_BTREE_STATS_INC(cur, newroot); @@ -3136,45 +3204,12 @@ xfs_btree_new_iroot( cblock->bb_u.s.bb_blkno = bno; } - be16_add_cpu(&block->bb_level, 1); - xfs_btree_set_numrecs(block, 1); - cur->bc_nlevels++; - ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); - cur->bc_levels[level + 1].ptr = 1; - - kp = xfs_btree_key_addr(cur, 1, block); - ckp = xfs_btree_key_addr(cur, 1, cblock); - xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); - - cpp = xfs_btree_ptr_addr(cur, 1, cblock); - for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { - error = xfs_btree_debug_check_ptr(cur, pp, i, level); - if (error) - goto error0; - } - - xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); - - error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level); + error = xfs_btree_promote_node_iroot(cur, block, level, cbp, &nptr, + cblock); if (error) goto error0; - xfs_btree_copy_ptrs(cur, pp, &nptr, 1); - - cur->bc_ops->broot_realloc(cur, 1); - - xfs_btree_setbuf(cur, level, cbp); - - /* - * Do all this logging at the end so that - * the root is at the right level. - */ - xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); - xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); - xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); - - *logflags |= - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); + *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); *stat = 1; return 0; error0: -- cgit v1.2.3 From 505248719fcbf2c76594fe2ef293680d97fe426c Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Wed, 20 Nov 2024 16:20:17 -0800 Subject: xfs: hoist the node iroot update code out of xfs_btree_kill_iroot In preparation for allowing records in an inode btree root, hoist the code that copies keyptrs from an existing node child into the root block to a separate function. Remove some unnecessary conditionals and clean up a few function calls in the new function. Note that this change reorders the ->free_block call with respect to the change in bc_nlevels to make it easier to support inode root leaf blocks in the next patch. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 84 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index ed09eeee9161..e83a8de5fb87 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -3726,6 +3726,60 @@ error0: return error; } +/* + * Move the keyptrs from a child node block to the root block. + * + * Since the keyptr size does not change, all we have to do is increase the + * tree height, copy the keyptrs to the new internal node (cblock), shrink + * the root, and copy the pointers there. + */ +STATIC int +xfs_btree_demote_node_child( + struct xfs_btree_cur *cur, + struct xfs_btree_block *cblock, + int level, + int numrecs) +{ + struct xfs_btree_block *block; + union xfs_btree_key *ckp; + union xfs_btree_key *kp; + union xfs_btree_ptr *cpp; + union xfs_btree_ptr *pp; + int i; + int error; + + /* + * Adjust the root btree node size and the record count to match the + * doomed child so that we can copy the keyptrs ahead of changing the + * tree shape. + */ + block = cur->bc_ops->broot_realloc(cur, numrecs); + + xfs_btree_set_numrecs(block, numrecs); + ASSERT(block->bb_numrecs == cblock->bb_numrecs); + + /* Copy keys from the doomed block. */ + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, kp, ckp, numrecs); + + /* Copy pointers from the doomed block. */ + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); + for (i = 0; i < numrecs; i++) { + error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); + if (error) + return error; + } + xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + + /* Decrease tree height, adjusting the root block level to match. */ + cur->bc_levels[level - 1].bp = NULL; + be16_add_cpu(&block->bb_level, -1); + cur->bc_nlevels--; + return 0; +} + /* * Try to merge a non-leaf block back into the inode root. 
* @@ -3741,10 +3795,6 @@ xfs_btree_kill_iroot( struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_btree_block *block; struct xfs_btree_block *cblock; - union xfs_btree_key *kp; - union xfs_btree_key *ckp; - union xfs_btree_ptr *pp; - union xfs_btree_ptr *cpp; struct xfs_buf *cbp; int level; int numrecs; @@ -3752,7 +3802,6 @@ xfs_btree_kill_iroot( #ifdef DEBUG union xfs_btree_ptr ptr; #endif - int i; ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); ASSERT(cur->bc_nlevels > 1); @@ -3792,35 +3841,16 @@ xfs_btree_kill_iroot( ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); #endif - block = cur->bc_ops->broot_realloc(cur, numrecs); - - block->bb_numrecs = be16_to_cpu(numrecs); - ASSERT(block->bb_numrecs == cblock->bb_numrecs); - - kp = xfs_btree_key_addr(cur, 1, block); - ckp = xfs_btree_key_addr(cur, 1, cblock); - xfs_btree_copy_keys(cur, kp, ckp, numrecs); - - pp = xfs_btree_ptr_addr(cur, 1, block); - cpp = xfs_btree_ptr_addr(cur, 1, cblock); - - for (i = 0; i < numrecs; i++) { - error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); - if (error) - return error; - } - - xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + error = xfs_btree_demote_node_child(cur, cblock, level, numrecs); + if (error) + return error; error = xfs_btree_free_block(cur, cbp); if (error) return error; - cur->bc_levels[level - 1].bp = NULL; - be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); - cur->bc_nlevels--; out0: return 0; } -- cgit v1.2.3 From af32541081ed6b6ad49b1ea38b5128cb319841b0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 22 Nov 2024 12:33:17 -0800 Subject: xfs: add some rtgroup inode helpers Create some simple helpers to reduce the amount of typing whenever we access rtgroup inodes. Conversion was done with this spatch and some minor reformatting: @@ expression rtg; @@ - rtg->rtg_inodes[XFS_RTGI_BITMAP] + rtg_bitmap(rtg) @@ expression rtg; @@ - rtg->rtg_inodes[XFS_RTGI_SUMMARY] + rtg_summary(rtg) and the CLI command: $ spatch --sp-file /tmp/moo.cocci --dir fs/xfs/ --use-gitgrep --in-place Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtbitmap.c | 2 +- fs/xfs/libxfs/xfs_rtgroup.c | 18 ++++++++---------- fs/xfs/libxfs/xfs_rtgroup.h | 10 ++++++++++ fs/xfs/scrub/rtbitmap.c | 7 +++---- fs/xfs/scrub/rtsummary.c | 12 +++++------- fs/xfs/xfs_qm.c | 8 ++++---- fs/xfs/xfs_rtalloc.c | 10 +++++----- 7 files changed, 36 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 4ddfb7e395b3..770adf60dd73 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1055,7 +1055,7 @@ xfs_rtfree_extent( xfs_rtxlen_t len) /* length of extent freed */ { struct xfs_mount *mp = tp->t_mountp; - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; + struct xfs_inode *rbmip = rtg_bitmap(rtg); struct xfs_rtalloc_args args = { .mp = mp, .tp = tp, diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index 4f3bfc884aff..a79b734e7044 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -197,10 +197,10 @@ xfs_rtgroup_lock( * Lock both realtime free space metadata inodes for a freespace * update. 
*/ - xfs_ilock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_EXCL); - xfs_ilock(rtg->rtg_inodes[XFS_RTGI_SUMMARY], XFS_ILOCK_EXCL); + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { - xfs_ilock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_SHARED); + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); } } @@ -215,10 +215,10 @@ xfs_rtgroup_unlock( !(rtglock_flags & XFS_RTGLOCK_BITMAP)); if (rtglock_flags & XFS_RTGLOCK_BITMAP) { - xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_SUMMARY], XFS_ILOCK_EXCL); - xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_EXCL); + xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { - xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_SHARED); + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); } } @@ -236,10 +236,8 @@ xfs_rtgroup_trans_join( ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)); if (rtglock_flags & XFS_RTGLOCK_BITMAP) { - xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_BITMAP], - XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_SUMMARY], - XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL); } } diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 7e7e491ff06f..19f8d302b9aa 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -64,6 +64,16 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg) return rtg->rtg_group.xg_gno; } +static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_BITMAP]; +} + +static inline struct xfs_inode *rtg_summary(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_SUMMARY]; +} + /* Passive rtgroup references */ static inline struct xfs_rtgroup * xfs_rtgroup_get( diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 376a36fd9a9c..fb4970c877ab 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -49,8 +49,7 @@ xchk_setup_rtbitmap( if (error) return error; - error = xchk_install_live_inode(sc, - sc->sr.rtg->rtg_inodes[XFS_RTGI_BITMAP]); + error = xchk_install_live_inode(sc, rtg_bitmap(sc->sr.rtg)); if (error) return error; @@ -146,7 +145,7 @@ xchk_rtbitmap( { struct xfs_mount *mp = sc->mp; struct xfs_rtgroup *rtg = sc->sr.rtg; - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; + struct xfs_inode *rbmip = rtg_bitmap(rtg); struct xchk_rtbitmap *rtb = sc->buf; int error; @@ -215,7 +214,7 @@ xchk_xref_is_used_rt_space( xfs_extlen_t len) { struct xfs_rtgroup *rtg = sc->sr.rtg; - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; + struct xfs_inode *rbmip = rtg_bitmap(rtg); xfs_rtxnum_t startext; xfs_rtxnum_t endext; bool is_free; diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 49fc6250bafc..f1af5431b388 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -81,8 +81,7 @@ xchk_setup_rtsummary( if (error) return error; - error = xchk_install_live_inode(sc, - sc->sr.rtg->rtg_inodes[XFS_RTGI_SUMMARY]); + error = xchk_install_live_inode(sc, rtg_summary(sc->sr.rtg)); if (error) return error; @@ -191,8 +190,7 @@ xchk_rtsum_record_free( rtlen = xfs_rtxlen_to_extlen(mp, rec->ar_extcount); if (!xfs_verify_rtbext(mp, rtbno, rtlen)) { - xchk_ino_xref_set_corrupt(sc, - rtg->rtg_inodes[XFS_RTGI_BITMAP]->i_ino); + xchk_ino_xref_set_corrupt(sc, 
rtg_bitmap(rtg)->i_ino); return -EFSCORRUPTED; } @@ -218,7 +216,7 @@ xchk_rtsum_compute( /* If the bitmap size doesn't match the computed size, bail. */ if (XFS_FSB_TO_B(mp, xfs_rtbitmap_blockcount(mp)) != - rtg->rtg_inodes[XFS_RTGI_BITMAP]->i_disk_size) + rtg_bitmap(rtg)->i_disk_size) return -EFSCORRUPTED; return xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtsum_record_free, sc); @@ -310,8 +308,8 @@ xchk_rtsummary( { struct xfs_mount *mp = sc->mp; struct xfs_rtgroup *rtg = sc->sr.rtg; - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; - struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY]; + struct xfs_inode *rbmip = rtg_bitmap(rtg); + struct xfs_inode *rsumip = rtg_summary(rtg); struct xchk_rtsummary *rts = sc->buf; int error; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 3abab5fb593e..e1ba5af6250f 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -230,10 +230,10 @@ xfs_qm_unmount_rt( if (!rtg) return; - if (rtg->rtg_inodes[XFS_RTGI_BITMAP]) - xfs_qm_dqdetach(rtg->rtg_inodes[XFS_RTGI_BITMAP]); - if (rtg->rtg_inodes[XFS_RTGI_SUMMARY]) - xfs_qm_dqdetach(rtg->rtg_inodes[XFS_RTGI_SUMMARY]); + if (rtg_bitmap(rtg)) + xfs_qm_dqdetach(rtg_bitmap(rtg)); + if (rtg_summary(rtg)) + xfs_qm_dqdetach(rtg_summary(rtg)); xfs_rtgroup_rele(rtg); } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 5128c5ad72f5..4cd2f32aa70a 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -856,8 +856,8 @@ xfs_growfs_rt_bmblock( xfs_fileoff_t bmbno) { struct xfs_mount *mp = rtg_mount(rtg); - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; - struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY]; + struct xfs_inode *rbmip = rtg_bitmap(rtg); + struct xfs_inode *rsumip = rtg_summary(rtg); struct xfs_rtalloc_args args = { .mp = mp, .rtg = rtg, @@ -1041,8 +1041,8 @@ xfs_growfs_rt_alloc_blocks( xfs_extlen_t *nrbmblocks) { struct xfs_mount *mp = rtg_mount(rtg); - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; - struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY]; + struct xfs_inode *rbmip = rtg_bitmap(rtg); + struct xfs_inode *rsumip = rtg_summary(rtg); xfs_extlen_t orbmblocks = 0; xfs_extlen_t orsumblocks = 0; struct xfs_mount *nmp; @@ -1622,7 +1622,7 @@ xfs_rtpick_extent( xfs_rtxlen_t len) /* allocation length (rtextents) */ { struct xfs_mount *mp = rtg_mount(rtg); - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; + struct xfs_inode *rbmip = rtg_bitmap(rtg); xfs_rtxnum_t b = 0; /* result rtext */ int log2; /* log of sequence number */ uint64_t resid; /* residual after log removed */ -- cgit v1.2.3 From d415fb34b4c2d1c11b0536eebc61962d3f9bf286 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:20 -0800 Subject: xfs: prepare rmap btree cursor tracepoints for realtime Rework the rmap btree cursor tracepoints in preparation to handle the realtime rmap btree cursor. Mostly this involves renaming the field to "gbno" and extracting the group number from the cursor. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_rmap_item.c | 4 +-- fs/xfs/xfs_trace.h | 82 ++++++++++++++++++++++++++------------------------ 2 files changed, 44 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 76b3c0ed3b4f..ac2913a73358 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -351,10 +351,10 @@ xfs_rmap_defer_add( { struct xfs_mount *mp = tp->t_mountp; - trace_xfs_rmap_defer(mp, ri); - ri->ri_group = xfs_group_intent_get(mp, ri->ri_bmap.br_startblock, XG_TYPE_AG); + + trace_xfs_rmap_defer(mp, ri); xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index cbda663fe6e8..8b7bb1f5ae3c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -14,11 +14,15 @@ * ino: filesystem inode number * * agbno: per-AG block number in fs blocks + * rgbno: per-rtgroup block number in fs blocks * startblock: physical block number for file mappings. This is either a * segmented fsblock for data device mappings, or a rfsblock * for realtime device mappings * fsbcount: number of blocks in an extent, in fs blocks * + * gbno: generic allocation group block number. This is an agbno for + * space in a per-AG or a rgbno for space in a realtime group. + * * daddr: physical block number in 512b blocks * bbcount: number of blocks in a physical extent, in 512b blocks * @@ -2918,13 +2922,14 @@ DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item); /* rmap tracepoints */ DECLARE_EVENT_CLASS(xfs_rmap_class, TP_PROTO(struct xfs_btree_cur *cur, - xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, + xfs_agblock_t gbno, xfs_extlen_t len, bool unwritten, const struct xfs_owner_info *oinfo), - TP_ARGS(cur, agbno, len, unwritten, oinfo), + TP_ARGS(cur, gbno, len, unwritten, oinfo), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) __field(uint64_t, owner) __field(uint64_t, offset) @@ -2932,8 +2937,9 @@ DECLARE_EVENT_CLASS(xfs_rmap_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; - __entry->agbno = agbno; + __entry->gbno = gbno; __entry->len = len; __entry->owner = oinfo->oi_owner; __entry->offset = oinfo->oi_offset; @@ -2941,10 +2947,11 @@ DECLARE_EVENT_CLASS(xfs_rmap_class, if (unwritten) __entry->flags |= XFS_RMAP_UNWRITTEN; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx", + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __entry->len, __entry->owner, __entry->offset, @@ -2953,9 +2960,9 @@ DECLARE_EVENT_CLASS(xfs_rmap_class, #define DEFINE_RMAP_EVENT(name) \ DEFINE_EVENT(xfs_rmap_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, \ - xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \ + xfs_agblock_t gbno, xfs_extlen_t len, bool unwritten, \ const struct xfs_owner_info *oinfo), \ - TP_ARGS(cur, agbno, len, unwritten, oinfo)) + TP_ARGS(cur, gbno, len, unwritten, oinfo)) /* btree cursor error/%ip tracepoint class */ DECLARE_EVENT_CLASS(xfs_btree_error_class, @@ -3018,47 +3025,36 @@ TRACE_EVENT(xfs_rmap_convert_state, TP_ARGS(cur, state, caller_ip), TP_STRUCT__entry( 
__field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_ino_t, ino) __field(int, state) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - switch (cur->bc_ops->type) { - case XFS_BTREE_TYPE_INODE: - __entry->agno = 0; - __entry->ino = cur->bc_ino.ip->i_ino; - break; - case XFS_BTREE_TYPE_AG: - __entry->agno = cur->bc_group->xg_gno; - __entry->ino = 0; - break; - case XFS_BTREE_TYPE_MEM: - __entry->agno = 0; - __entry->ino = 0; - break; - } + __entry->type = cur->bc_group->xg_type; + __entry->agno = cur->bc_group->xg_gno; __entry->state = state; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno 0x%x ino 0x%llx state %d caller %pS", + TP_printk("dev %d:%d %sno 0x%x state %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->ino, __entry->state, (char *)__entry->caller_ip) ); DECLARE_EVENT_CLASS(xfs_rmapbt_class, TP_PROTO(struct xfs_btree_cur *cur, - xfs_agblock_t agbno, xfs_extlen_t len, + xfs_agblock_t gbno, xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags), - TP_ARGS(cur, agbno, len, owner, offset, flags), + TP_ARGS(cur, gbno, len, owner, offset, flags), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) __field(uint64_t, owner) __field(uint64_t, offset) @@ -3066,17 +3062,19 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; - __entry->agbno = agbno; + __entry->gbno = gbno; __entry->len = len; __entry->owner = owner; __entry->offset = offset; __entry->flags = flags; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __entry->len, __entry->owner, __entry->offset, @@ -3085,9 +3083,9 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class, #define DEFINE_RMAPBT_EVENT(name) \ DEFINE_EVENT(xfs_rmapbt_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, \ - xfs_agblock_t agbno, xfs_extlen_t len, \ + xfs_agblock_t gbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ - TP_ARGS(cur, agbno, len, owner, offset, flags)) + TP_ARGS(cur, gbno, len, owner, offset, flags)) TRACE_DEFINE_ENUM(XFS_RMAP_MAP); TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED); @@ -3104,8 +3102,9 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long long, owner) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(int, whichfork) __field(xfs_fileoff_t, l_loff) __field(xfs_filblks_t, l_len) @@ -3114,9 +3113,11 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class, ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __entry->agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock); - __entry->agbno = XFS_FSB_TO_AGBNO(mp, - ri->ri_bmap.br_startblock); + __entry->type = ri->ri_group->xg_type; + __entry->agno = ri->ri_group->xg_gno; + __entry->gbno = xfs_fsb_to_gbno(mp, + ri->ri_bmap.br_startblock, + ri->ri_group->xg_type); __entry->owner = 
ri->ri_owner; __entry->whichfork = ri->ri_whichfork; __entry->l_loff = ri->ri_bmap.br_startoff; @@ -3124,11 +3125,12 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class, __entry->l_state = ri->ri_bmap.br_state; __entry->op = ri->ri_type; ), - TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", + TP_printk("dev %d:%d op %s %sno 0x%x gbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->op, XFS_RMAP_INTENT_STRINGS), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __entry->owner, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->l_loff, @@ -3993,7 +3995,7 @@ TRACE_EVENT(xfs_fsmap_mapping, __entry->offset = frec->offset; __entry->flags = frec->rm_flags; ), - TP_printk("dev %d:%d keydev %d:%d agno 0x%x rmapbno 0x%x start_daddr 0x%llx len_daddr 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x", + TP_printk("dev %d:%d keydev %d:%d agno 0x%x gbno 0x%x start_daddr 0x%llx len_daddr 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->agno, -- cgit v1.2.3 From 84140a96cf7a5b5b48b862a79c8322aa220ce591 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:18 -0800 Subject: xfs: prepare to reuse the dquot pointer space in struct xfs_inode Files participating in the metadata directory tree are not accounted to the quota subsystem. Therefore, the i_[ugp]dquot pointers in struct xfs_inode are never used and should always be NULL. In the next patch we want to add a u64 count of fs blocks reserved for metadata btree expansion, but we don't want every inode in the fs to pay the memory price for this feature. The intent is to union those three pointers with the u64 counter, but for that to work we must guard against all access to the dquot pointers for metadata files. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_attr.c | 4 +--- fs/xfs/libxfs/xfs_bmap.c | 4 +--- fs/xfs/scrub/tempfile.c | 1 + fs/xfs/xfs_dquot.h | 3 +++ fs/xfs/xfs_exchrange.c | 3 +++ fs/xfs/xfs_inode.h | 10 +++++++--- fs/xfs/xfs_qm.c | 2 ++ fs/xfs/xfs_quota.h | 5 ----- fs/xfs/xfs_trans.c | 4 ++++ fs/xfs/xfs_trans_dquot.c | 8 ++++---- 10 files changed, 26 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 17875ad865f5..8c04acd30d48 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1004,9 +1004,7 @@ xfs_attr_add_fork( unsigned int blks; /* space reservation */ int error; /* error return value */ - if (xfs_is_metadir_inode(ip)) - ASSERT(XFS_IS_DQDETACHED(ip)); - else + if (!xfs_is_metadir_inode(ip)) ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); blks = XFS_ADDAFORK_SPACE_RES(mp); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0842577755f7..02323936cc9b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1042,9 +1042,7 @@ xfs_bmap_add_attrfork( int error; /* error return value */ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - if (xfs_is_metadir_inode(ip)) - ASSERT(XFS_IS_DQDETACHED(ip)); - else + if (!xfs_is_metadir_inode(ip)) ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); ASSERT(!xfs_inode_has_attr_fork(ip)); diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index 2d7ca7e1bbca..4ebb5f8459e8 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -749,6 +749,7 @@ xrep_tempexch_reserve_quota( * or the two inodes have the same dquots. */ if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || + xfs_is_metadir_inode(req->ip1) || (req->ip1->i_udquot == req->ip2->i_udquot && req->ip1->i_gdquot == req->ip2->i_gdquot && req->ip1->i_pdquot == req->ip2->i_pdquot)) diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index c617bac75361..61217adf5ba5 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -160,6 +160,9 @@ static inline struct xfs_dquot *xfs_inode_dquot( struct xfs_inode *ip, xfs_dqtype_t type) { + if (xfs_is_metadir_inode(ip)) + return NULL; + switch (type) { case XFS_DQTYPE_USER: return ip->i_udquot; diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c index 265c42449893..f340a2015c4c 100644 --- a/fs/xfs/xfs_exchrange.c +++ b/fs/xfs/xfs_exchrange.c @@ -119,6 +119,9 @@ xfs_exchrange_reserve_quota( int ip1_error = 0; int error; + ASSERT(!xfs_is_metadir_inode(req->ip1)); + ASSERT(!xfs_is_metadir_inode(req->ip2)); + /* * Don't bother with a quota reservation if we're not enforcing them * or the two inodes have the same dquots. diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1648dc5a8068..1141c2e8e123 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -25,9 +25,13 @@ struct xfs_dquot; typedef struct xfs_inode { /* Inode linking and identification information. 
*/ struct xfs_mount *i_mount; /* fs mount struct ptr */ - struct xfs_dquot *i_udquot; /* user dquot */ - struct xfs_dquot *i_gdquot; /* group dquot */ - struct xfs_dquot *i_pdquot; /* project dquot */ + union { + struct { + struct xfs_dquot *i_udquot; /* user dquot */ + struct xfs_dquot *i_gdquot; /* group dquot */ + struct xfs_dquot *i_pdquot; /* project dquot */ + }; + }; /* Inode location stuff */ xfs_ino_t i_ino; /* inode number (agno/agino)*/ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index dc8b1010d4d3..3abab5fb593e 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -428,6 +428,8 @@ void xfs_qm_dqdetach( xfs_inode_t *ip) { + if (xfs_is_metadir_inode(ip)) + return; if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot)) return; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index d7565462af3d..105e6eb57620 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -29,11 +29,6 @@ struct xfs_buf; (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \ (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL)) -#define XFS_IS_DQDETACHED(ip) \ - ((ip)->i_udquot == NULL && \ - (ip)->i_gdquot == NULL && \ - (ip)->i_pdquot == NULL) - #define XFS_QM_NEED_QUOTACHECK(mp) \ ((XFS_IS_UQUOTA_ON(mp) && \ (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 4cd25717c9d1..f53f82456288 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1266,6 +1266,9 @@ retry: xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + if (xfs_is_metadir_inode(ip)) + goto out; + error = xfs_qm_dqattach_locked(ip, false); if (error) { /* Caller should have allocated the dquots! */ @@ -1334,6 +1337,7 @@ retry: goto out_cancel; } +out: *tpp = tp; return 0; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 713b6d243e56..765456bf3428 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -156,7 +156,8 @@ xfs_trans_mod_ino_dquot( unsigned int field, int64_t delta) { - ASSERT(!xfs_is_metadir_inode(ip) || XFS_IS_DQDETACHED(ip)); + if (xfs_is_metadir_inode(ip)) + return; xfs_trans_mod_dquot(tp, dqp, field, delta); @@ -246,11 +247,10 @@ xfs_trans_mod_dquot_byino( xfs_mount_t *mp = tp->t_mountp; if (!XFS_IS_QUOTA_ON(mp) || - xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) + xfs_is_quota_inode(&mp->m_sb, ip->i_ino) || + xfs_is_metadir_inode(ip)) return; - ASSERT(!xfs_is_metadir_inode(ip) || XFS_IS_DQDETACHED(ip)); - if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) xfs_trans_mod_ino_dquot(tp, ip, ip->i_udquot, field, delta); if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot) -- cgit v1.2.3 From 953f76bf7a3622351f335c77c56ed7efb793e3e7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:21 -0800 Subject: xfs: simplify the xfs_rmap_{alloc,free}_extent calling conventions Simplify the calling conventions by allowing callers to pass a fsbno (xfs_fsblock_t) directly into these functions, since we're just going to set it in a struct anyway. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 6 ++---- fs/xfs/libxfs/xfs_rmap.c | 12 +++++------- fs/xfs/libxfs/xfs_rmap.h | 8 ++++---- fs/xfs/scrub/alloc_repair.c | 5 +++-- 4 files changed, 14 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 2dbab68b4fe6..26d3d7956e06 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1831,8 +1831,7 @@ xfs_refcount_alloc_cow_extent( __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); /* Add rmap entry */ - xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), - XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); + xfs_rmap_alloc_extent(tp, fsb, len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. */ @@ -1848,8 +1847,7 @@ xfs_refcount_free_cow_extent( return; /* Remove rmap entry */ - xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), - XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); + xfs_rmap_free_extent(tp, fsb, len, XFS_RMAP_OWN_COW); __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); } diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index d0df68dc3131..57dbf99ce004 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -525,7 +525,7 @@ xfs_rmap_free_check_owner( struct xfs_btree_cur *cur, uint64_t ltoff, struct xfs_rmap_irec *rec, - xfs_filblks_t len, + xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags) @@ -2729,8 +2729,7 @@ xfs_rmap_convert_extent( void xfs_rmap_alloc_extent( struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, + xfs_fsblock_t fsbno, xfs_extlen_t len, uint64_t owner) { @@ -2739,7 +2738,7 @@ xfs_rmap_alloc_extent( if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) return; - bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); + bmap.br_startblock = fsbno; bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; @@ -2751,8 +2750,7 @@ xfs_rmap_alloc_extent( void xfs_rmap_free_extent( struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, + xfs_fsblock_t fsbno, xfs_extlen_t len, uint64_t owner) { @@ -2761,7 +2759,7 @@ xfs_rmap_free_extent( if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) return; - bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); + bmap.br_startblock = fsbno; bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 96b4321d8310..8e2657af038e 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -184,10 +184,10 @@ void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); -void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); +void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_fsblock_t fsbno, + xfs_extlen_t len, uint64_t owner); +void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_fsblock_t fsbno, + xfs_extlen_t len, uint64_t owner); int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur); diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c index 0433363a90b6..11e1e5404fc6 100644 --- 
a/fs/xfs/scrub/alloc_repair.c +++ b/fs/xfs/scrub/alloc_repair.c @@ -542,8 +542,9 @@ xrep_abt_dispose_one( /* Add a deferred rmap for each extent we used. */ if (resv->used > 0) - xfs_rmap_alloc_extent(sc->tp, pag_agno(pag), resv->agbno, - resv->used, XFS_RMAP_OWN_AG); + xfs_rmap_alloc_extent(sc->tp, + xfs_agbno_to_fsb(pag, resv->agbno), resv->used, + XFS_RMAP_OWN_AG); /* * For each reserved btree block we didn't use, add it to the free -- cgit v1.2.3 From 2f63b20b7a26c9a7c76ea5a6565ca38cd9e31282 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:18 -0800 Subject: xfs: support storing records in the inode core root Add the necessary flags and code so that we can support storing leaf records in the inode root block of a btree. This hasn't been necessary before, but the realtime rmapbt will need to be able to do this. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 138 ++++++++++++++++++++++++++++++++++---- fs/xfs/libxfs/xfs_btree.h | 2 +- fs/xfs/libxfs/xfs_btree_staging.c | 9 ++- 3 files changed, 132 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index e83a8de5fb87..5ab201ef041e 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1537,12 +1537,16 @@ xfs_btree_log_recs( int first, int last) { + if (!bp) { + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); + return; + } xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_rec_offset(cur, first), xfs_btree_rec_offset(cur, last + 1) - 1); - } /* @@ -3078,6 +3082,59 @@ xfs_btree_split( #define xfs_btree_split __xfs_btree_split #endif /* __KERNEL__ */ +/* Move the records from a root leaf block to a separate block. */ +STATIC void +xfs_btree_promote_leaf_iroot( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + struct xfs_buf *cbp, + union xfs_btree_ptr *cptr, + struct xfs_btree_block *cblock) +{ + union xfs_btree_rec *rp; + union xfs_btree_rec *crp; + union xfs_btree_key *kp; + union xfs_btree_ptr *pp; + struct xfs_btree_block *broot; + int numrecs = xfs_btree_get_numrecs(block); + + /* Copy the records from the leaf broot into the new child block. */ + rp = xfs_btree_rec_addr(cur, 1, block); + crp = xfs_btree_rec_addr(cur, 1, cblock); + xfs_btree_copy_recs(cur, crp, rp, numrecs); + + /* + * Increment the tree height. + * + * Trickery here: The amount of memory that we need per record for the + * ifork's btree root block may change when we convert the broot from a + * leaf to a node block. Free the existing leaf broot so that nobody + * thinks we need to migrate node pointers when we realloc the broot + * buffer after bumping nlevels. + */ + cur->bc_ops->broot_realloc(cur, 0); + cur->bc_nlevels++; + cur->bc_levels[1].ptr = 1; + + /* + * Allocate a new node broot and initialize it to point to the new + * child block. + */ + broot = cur->bc_ops->broot_realloc(cur, 1); + xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, + cur->bc_nlevels - 1, 1, cur->bc_ino.ip->i_ino); + + pp = xfs_btree_ptr_addr(cur, 1, broot); + kp = xfs_btree_key_addr(cur, 1, broot); + xfs_btree_copy_ptrs(cur, pp, cptr, 1); + xfs_btree_get_keys(cur, cblock, kp); + + /* Attach the new block to the cursor and log it. 
*/ + xfs_btree_setbuf(cur, 0, cbp); + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); + xfs_btree_log_recs(cur, cbp, 1, numrecs); +} + /* * Move the keys and pointers from a root block to a separate block. * @@ -3163,7 +3220,7 @@ xfs_btree_new_iroot( struct xfs_buf *cbp; /* buffer for cblock */ struct xfs_btree_block *block; /* btree block */ struct xfs_btree_block *cblock; /* child btree block */ - union xfs_btree_ptr *pp; + union xfs_btree_ptr aptr; union xfs_btree_ptr nptr; /* new block addr */ int level; /* btree level */ int error; /* error return code */ @@ -3175,10 +3232,15 @@ xfs_btree_new_iroot( level = cur->bc_nlevels - 1; block = xfs_btree_get_iroot(cur); - pp = xfs_btree_ptr_addr(cur, 1, block); + ASSERT(level > 0 || (cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)); + if (level > 0) + aptr = *xfs_btree_ptr_addr(cur, 1, block); + else + aptr.l = cpu_to_be64(XFS_INO_TO_FSB(cur->bc_mp, + cur->bc_ino.ip->i_ino)); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = xfs_btree_alloc_block(cur, pp, &nptr, stat); + error = xfs_btree_alloc_block(cur, &aptr, &nptr, stat); if (error) goto error0; if (*stat == 0) @@ -3204,10 +3266,14 @@ xfs_btree_new_iroot( cblock->bb_u.s.bb_blkno = bno; } - error = xfs_btree_promote_node_iroot(cur, block, level, cbp, &nptr, - cblock); - if (error) - goto error0; + if (level > 0) { + error = xfs_btree_promote_node_iroot(cur, block, level, cbp, + &nptr, cblock); + if (error) + goto error0; + } else { + xfs_btree_promote_leaf_iroot(cur, block, cbp, &nptr, cblock); + } *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); *stat = 1; @@ -3726,6 +3792,43 @@ error0: return error; } +/* Move the records from a child leaf block to the root block. */ +STATIC void +xfs_btree_demote_leaf_child( + struct xfs_btree_cur *cur, + struct xfs_btree_block *cblock, + int numrecs) +{ + union xfs_btree_rec *rp; + union xfs_btree_rec *crp; + struct xfs_btree_block *broot; + + /* + * Decrease the tree height. + * + * Trickery here: The amount of memory that we need per record for the + * ifork's btree root block may change when we convert the broot from a + * node to a leaf. Free the old node broot so that we can get a fresh + * leaf broot. + */ + cur->bc_ops->broot_realloc(cur, 0); + cur->bc_nlevels--; + + /* + * Allocate a new leaf broot and copy the records from the old child. + * Detach the old child from the cursor. + */ + broot = cur->bc_ops->broot_realloc(cur, numrecs); + xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, 0, numrecs, + cur->bc_ino.ip->i_ino); + + rp = xfs_btree_rec_addr(cur, 1, broot); + crp = xfs_btree_rec_addr(cur, 1, cblock); + xfs_btree_copy_recs(cur, rp, crp, numrecs); + + cur->bc_levels[0].bp = NULL; +} + /* * Move the keyptrs from a child node block to the root block. * @@ -3804,14 +3907,19 @@ xfs_btree_kill_iroot( #endif ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); - ASSERT(cur->bc_nlevels > 1); + ASSERT((cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS) || + cur->bc_nlevels > 1); /* * Don't deal with the root block needs to be a leaf case. * We're just going to turn the thing back into extents anyway. */ level = cur->bc_nlevels - 1; - if (level == 1) + if (level == 1 && !(cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) + goto out0; + + /* If we're already a leaf, jump out. 
*/ + if (level == 0) goto out0; /* @@ -3841,9 +3949,13 @@ xfs_btree_kill_iroot( ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); #endif - error = xfs_btree_demote_node_child(cur, cblock, level, numrecs); - if (error) - return error; + if (level > 1) { + error = xfs_btree_demote_node_child(cur, cblock, level, + numrecs); + if (error) + return error; + } else + xfs_btree_demote_leaf_child(cur, cblock, numrecs); error = xfs_btree_free_block(cur, cbp); if (error) diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 8380ae0a64dd..3b8c2ccad908 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -233,7 +233,7 @@ struct xfs_btree_ops { /* btree geometry flags */ #define XFS_BTGEO_OVERLAPPING (1U << 0) /* overlapping intervals */ - +#define XFS_BTGEO_IROOT_RECORDS (1U << 1) /* iroot can store records */ union xfs_btree_irec { struct xfs_alloc_rec_incore a; diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 694929703152..58c146b5c9d4 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -573,6 +573,7 @@ xfs_btree_bload_compute_geometry( struct xfs_btree_bload *bbl, uint64_t nr_records) { + const struct xfs_btree_ops *ops = cur->bc_ops; uint64_t nr_blocks = 0; uint64_t nr_this_level; @@ -599,7 +600,7 @@ xfs_btree_bload_compute_geometry( xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &level_blocks, &dontcare64); - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { + if (ops->type == XFS_BTREE_TYPE_INODE) { /* * If all the items we want to store at this level * would fit in the inode root block, then we have our @@ -607,7 +608,9 @@ xfs_btree_bload_compute_geometry( * * Note that bmap btrees forbid records in the root. */ - if (level != 0 && nr_this_level <= avg_per_block) { + if ((level != 0 || + (ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) && + nr_this_level <= avg_per_block) { nr_blocks++; break; } @@ -658,7 +661,7 @@ xfs_btree_bload_compute_geometry( return -EOVERFLOW; bbl->btree_height = cur->bc_nlevels; - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) + if (ops->type == XFS_BTREE_TYPE_INODE) bbl->nr_blocks = nr_blocks - 1; else bbl->nr_blocks = nr_blocks; -- cgit v1.2.3 From 05290bd5c6236b8ad659157edb36bd2d38f46d3e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:19 -0800 Subject: xfs: allow inode-based btrees to reserve space in the data device Create a new space reservation scheme so that btree metadata for the realtime volume can reserve space in the data device to avoid space underruns. Back when we were testing the rmap and refcount btrees for the data device, people observed occasional shutdowns when xfs_btree_split was called for either of those two btrees. This happened when certain operations (mostly writeback ioends) created new rmap or refcount records, which would expand the size of the btree. If there were no free blocks available the allocation would fail and the split would shut down the filesystem. I considered pre-reserving blocks for btree expansion at the time of a write() call, but there wasn't any good way to attach the reservations to an inode and keep them there all the way to ioend processing. Unlike delalloc reservations which have that indlen mechanism, there's no way to do that for mapped extents; and indlen blocks are given back during the delalloc -> unwritten transition. The solution was to reserve sufficient blocks for rmap/refcount btree expansion at mount time. 
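Before the code, a simplified userspace model of the accounting rules this patch introduces may help: reserve a block budget up front by hiding it from the free-space counter, satisfy allocations from that private budget before touching global space, and refill the budget from frees until it reaches the original ask. The single toy_fs counter stands in for the kernel's per-cpu fdblocks and transaction deltas, and all toy_* names are invented for illustration.

#include <stdint.h>
#include <stdio.h>

struct toy_fs   { int64_t fdblocks; };		/* global free blocks */
struct toy_meta {
	int64_t resv_remaining;			/* like i_delayed_blks */
	int64_t resv_asked;			/* like i_meta_resv_asked */
	int64_t nblocks;			/* blocks owned by the file */
};

/* Hide "ask" blocks from fdblocks so later btree expansion cannot ENOSPC. */
static int toy_resv_init(struct toy_fs *fs, struct toy_meta *mf, int64_t ask)
{
	int64_t hidden = ask > mf->nblocks ? ask - mf->nblocks : 0;

	if (fs->fdblocks < hidden)
		return -1;			/* would be -ENOSPC */
	fs->fdblocks -= hidden;
	mf->resv_remaining = hidden;
	mf->resv_asked = ask;
	return 0;
}

/* Allocate len blocks, draining the private reservation before global space. */
static void toy_resv_alloc(struct toy_fs *fs, struct toy_meta *mf, int64_t len)
{
	int64_t from_resv = len < mf->resv_remaining ? len : mf->resv_remaining;

	mf->resv_remaining -= from_resv;
	fs->fdblocks -= len - from_resv;	/* overrun hits global space */
	mf->nblocks += len;
}

/* Free len blocks, refilling the reservation up to the ask first. */
static void toy_resv_free(struct toy_fs *fs, struct toy_meta *mf, int64_t len)
{
	int64_t to_resv;

	mf->nblocks -= len;
	to_resv = mf->resv_asked - (mf->nblocks + mf->resv_remaining);
	if (to_resv > len)
		to_resv = len;
	if (to_resv < 0)
		to_resv = 0;
	mf->resv_remaining += to_resv;
	fs->fdblocks += len - to_resv;
}

int main(void)
{
	struct toy_fs fs = { .fdblocks = 1000 };
	struct toy_meta mf = { 0 };

	toy_resv_init(&fs, &mf, 100);
	toy_resv_alloc(&fs, &mf, 30);
	toy_resv_free(&fs, &mf, 10);
	printf("fdblocks=%lld resv=%lld nblocks=%lld\n",
	       (long long)fs.fdblocks, (long long)mf.resv_remaining,
	       (long long)mf.nblocks);
	return 0;
}

The xfs_metafile_resv_* helpers added below follow the same ordering, with the extra complication that the on-disk superblock counters and the transaction's block reservation have to be kept consistent as well.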
This is what the XFS_AG_RESV_* flags provide; any expansion of those two btrees can come from the pre-reserved space. This patch brings that pre-reservation ability to inode-rooted btrees so that the rt rmap and refcount btrees can also save room for future expansion. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_ag_resv.c | 3 + fs/xfs/libxfs/xfs_errortag.h | 4 +- fs/xfs/libxfs/xfs_metadir.c | 4 + fs/xfs/libxfs/xfs_metafile.c | 205 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_metafile.h | 11 +++ fs/xfs/libxfs/xfs_types.h | 7 ++ fs/xfs/xfs_error.c | 3 + fs/xfs/xfs_fsops.c | 17 ++++ fs/xfs/xfs_inode.h | 6 ++ fs/xfs/xfs_mount.c | 10 +++ fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_rtalloc.c | 21 +++++ fs/xfs/xfs_rtalloc.h | 5 ++ fs/xfs/xfs_trace.h | 45 ++++++++++ 14 files changed, 341 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index f5d853089019..fb79215a509d 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -114,6 +114,7 @@ xfs_ag_resv_needed( case XFS_AG_RESV_RMAPBT: len -= xfs_perag_resv(pag, type)->ar_reserved; break; + case XFS_AG_RESV_METAFILE: case XFS_AG_RESV_NONE: /* empty */ break; @@ -347,6 +348,7 @@ xfs_ag_resv_alloc_extent( switch (type) { case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_METAFILE: return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: @@ -389,6 +391,7 @@ xfs_ag_resv_free_extent( switch (type) { case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_METAFILE: return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 7002d7676a78..a53c5d40e084 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -64,7 +64,8 @@ #define XFS_ERRTAG_WB_DELAY_MS 42 #define XFS_ERRTAG_WRITE_DELAY_MS 43 #define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 -#define XFS_ERRTAG_MAX 45 +#define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45 +#define XFS_ERRTAG_MAX 46 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -113,5 +114,6 @@ #define XFS_RANDOM_WB_DELAY_MS 3000 #define XFS_RANDOM_WRITE_DELAY_MS 3000 #define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 +#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_metadir.c b/fs/xfs/libxfs/xfs_metadir.c index bae7377c0f22..178e89711cb7 100644 --- a/fs/xfs/libxfs/xfs_metadir.c +++ b/fs/xfs/libxfs/xfs_metadir.c @@ -29,6 +29,10 @@ #include "xfs_dir2_priv.h" #include "xfs_parent.h" #include "xfs_health.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_btree.h" +#include "xfs_alloc.h" /* * Metadata Directory Tree diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index adeb25d1a444..e151663cc9ef 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -17,6 +17,10 @@ #include "xfs_metafile.h" #include "xfs_trace.h" #include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_alloc.h" /* Set up an inode to be recognized as a metadata directory inode. */ void @@ -50,3 +54,204 @@ xfs_metafile_clear_iflag( ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } + +/* + * Is the amount of space that could be allocated towards a given metadata + * file at or beneath a certain threshold? 
+ */ +static inline bool +xfs_metafile_resv_can_cover( + struct xfs_inode *ip, + int64_t rhs) +{ + /* + * The amount of space that can be allocated to this metadata file is + * the remaining reservation for the particular metadata file + the + * global free block count. Take care of the first case to avoid + * touching the per-cpu counter. + */ + if (ip->i_delayed_blks >= rhs) + return true; + + /* + * There aren't enough blocks left in the inode's reservation, but it + * isn't critical unless there also isn't enough free space. + */ + return __percpu_counter_compare(&ip->i_mount->m_fdblocks, + rhs - ip->i_delayed_blks, 2048) >= 0; +} + +/* + * Is this metadata file critically low on blocks? For now we'll define that + * as the number of blocks we can get our hands on being less than 10% of what + * we reserved or less than some arbitrary number (maximum btree height). + */ +bool +xfs_metafile_resv_critical( + struct xfs_inode *ip) +{ + uint64_t asked_low_water; + + if (!ip) + return false; + + ASSERT(xfs_is_metadir_inode(ip)); + trace_xfs_metafile_resv_critical(ip, 0); + + if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels)) + return true; + + asked_low_water = div_u64(ip->i_meta_resv_asked, 10); + if (!xfs_metafile_resv_can_cover(ip, asked_low_water)) + return true; + + return XFS_TEST_ERROR(false, ip->i_mount, + XFS_ERRTAG_METAFILE_RESV_CRITICAL); +} + +/* Allocate a block from the metadata file's reservation. */ +void +xfs_metafile_resv_alloc_space( + struct xfs_inode *ip, + struct xfs_alloc_arg *args) +{ + int64_t len = args->len; + + ASSERT(xfs_is_metadir_inode(ip)); + ASSERT(args->resv == XFS_AG_RESV_METAFILE); + + trace_xfs_metafile_resv_alloc_space(ip, args->len); + + /* + * Allocate the blocks from the metadata inode's block reservation + * and update the ondisk sb counter. + */ + if (ip->i_delayed_blks > 0) { + int64_t from_resv; + + from_resv = min_t(int64_t, len, ip->i_delayed_blks); + ip->i_delayed_blks -= from_resv; + xfs_mod_delalloc(ip, 0, -from_resv); + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, + -from_resv); + len -= from_resv; + } + + /* + * Any allocation in excess of the reservation requires in-core and + * on-disk fdblocks updates. If we can grab @len blocks from the + * in-core fdblocks then all we need to do is update the on-disk + * superblock; if not, then try to steal some from the transaction's + * block reservation. Overruns are only expected for rmap btrees. + */ + if (len) { + unsigned int field; + int error; + + error = xfs_dec_fdblocks(ip->i_mount, len, true); + if (error) + field = XFS_TRANS_SB_FDBLOCKS; + else + field = XFS_TRANS_SB_RES_FDBLOCKS; + + xfs_trans_mod_sb(args->tp, field, -len); + } + + ip->i_nblocks += args->len; + xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE); +} + +/* Free a block to the metadata file's reservation. */ +void +xfs_metafile_resv_free_space( + struct xfs_inode *ip, + struct xfs_trans *tp, + xfs_filblks_t len) +{ + int64_t to_resv; + + ASSERT(xfs_is_metadir_inode(ip)); + trace_xfs_metafile_resv_free_space(ip, len); + + ip->i_nblocks -= len; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + /* + * Add the freed blocks back into the inode's delalloc reservation + * until it reaches the maximum size. Update the ondisk fdblocks only. 
+ */ + to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks); + if (to_resv > 0) { + to_resv = min_t(int64_t, to_resv, len); + ip->i_delayed_blks += to_resv; + xfs_mod_delalloc(ip, 0, to_resv); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv); + len -= to_resv; + } + + /* + * Everything else goes back to the filesystem, so update the in-core + * and on-disk counters. + */ + if (len) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len); +} + +/* Release a metadata file's space reservation. */ +void +xfs_metafile_resv_free( + struct xfs_inode *ip) +{ + /* Non-btree metadata inodes don't need space reservations. */ + if (!ip || !ip->i_meta_resv_asked) + return; + + ASSERT(xfs_is_metadir_inode(ip)); + trace_xfs_metafile_resv_free(ip, 0); + + if (ip->i_delayed_blks) { + xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks); + xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks); + ip->i_delayed_blks = 0; + } + ip->i_meta_resv_asked = 0; +} + +/* Set up a metadata file's space reservation. */ +int +xfs_metafile_resv_init( + struct xfs_inode *ip, + xfs_filblks_t ask) +{ + xfs_filblks_t hidden_space; + xfs_filblks_t used; + int error; + + if (!ip || ip->i_meta_resv_asked > 0) + return 0; + + ASSERT(xfs_is_metadir_inode(ip)); + + /* + * Space taken by all other metadata btrees are accounted on-disk as + * used space. We therefore only hide the space that is reserved but + * not used by the trees. + */ + used = ip->i_nblocks; + if (used > ask) + ask = used; + hidden_space = ask - used; + + error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true); + if (error) { + trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_); + return error; + } + + xfs_mod_delalloc(ip, 0, hidden_space); + ip->i_delayed_blks = hidden_space; + ip->i_meta_resv_asked = ask; + + trace_xfs_metafile_resv_init(ip, ask); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h index acec400123db..8d8f08a6071c 100644 --- a/fs/xfs/libxfs/xfs_metafile.h +++ b/fs/xfs/libxfs/xfs_metafile.h @@ -21,6 +21,17 @@ void xfs_metafile_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip, enum xfs_metafile_type metafile_type); void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip); +/* Space reservations for metadata inodes. */ +struct xfs_alloc_arg; + +bool xfs_metafile_resv_critical(struct xfs_inode *ip); +void xfs_metafile_resv_alloc_space(struct xfs_inode *ip, + struct xfs_alloc_arg *args); +void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp, + xfs_filblks_t len); +void xfs_metafile_resv_free(struct xfs_inode *ip); +int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask); + /* Code specific to kernel/userspace; must be provided externally. */ int xfs_trans_metafile_iget(struct xfs_trans *tp, xfs_ino_t ino, diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index bf33c2b1e43e..ca2401c1facd 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -202,6 +202,13 @@ enum xfs_ag_resv_type { * altering fdblocks. If you think you need this you're wrong. */ XFS_AG_RESV_IGNORE, + + /* + * This allocation activity is being done on behalf of a metadata file. + * These files maintain their own permanent space reservations and are + * required to adjust fdblocks using the xfs_metafile_resv_* helpers. + */ + XFS_AG_RESV_METAFILE, }; /* Results of scanning a btree keyspace to check occupancy. 
*/ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 78cdc5064a8c..dbd87e137694 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -63,6 +63,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_WB_DELAY_MS, XFS_RANDOM_WRITE_DELAY_MS, XFS_RANDOM_EXCHMAPS_FINISH_ONE, + XFS_RANDOM_METAFILE_RESV_CRITICAL, }; struct xfs_errortag_attr { @@ -181,6 +182,7 @@ XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE); +XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -227,6 +229,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), XFS_ERRORTAG_ATTR_LIST(write_delay_ms), XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one), + XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 28dde215c899..e1145107d8cb 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -21,6 +21,7 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_trace.h" +#include "xfs_rtalloc.h" /* * Write new AG headers to disk. Non-transactional, but need to be @@ -541,6 +542,19 @@ xfs_fs_reserve_ag_blocks( xfs_warn(mp, "Error %d reserving per-AG metadata reserve pool.", error); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + + if (xfs_has_realtime(mp)) { + err2 = xfs_rt_resv_init(mp); + if (err2 && err2 != -ENOSPC) { + xfs_warn(mp, + "Error %d reserving realtime metadata reserve pool.", err2); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } + + if (err2 && !error) + error = err2; } return error; @@ -555,6 +569,9 @@ xfs_fs_unreserve_ag_blocks( { struct xfs_perag *pag = NULL; + if (xfs_has_realtime(mp)) + xfs_rt_resv_free(mp); + while ((pag = xfs_perag_next(mp, pag))) xfs_ag_resv_free(pag); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1141c2e8e123..c08093a65352 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -31,6 +31,12 @@ typedef struct xfs_inode { struct xfs_dquot *i_gdquot; /* group dquot */ struct xfs_dquot *i_pdquot; /* project dquot */ }; + + /* + * Space that has been set aside to accomodate expansions of a + * metadata btree rooted in this file. + */ + uint64_t i_meta_resv_asked; }; /* Inode location stuff */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 5918f433dba7..97137126b16f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -650,6 +650,15 @@ xfs_agbtree_compute_maxlevels( mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels); } +/* Compute maximum possible height for realtime btree types for this fs. */ +static inline void +xfs_rtbtree_compute_maxlevels( + struct xfs_mount *mp) +{ + /* This will be filled in later. */ + mp->m_rtbtree_maxlevels = 0; +} + /* * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct @@ -721,6 +730,7 @@ xfs_mountfs( xfs_refcountbt_compute_maxlevels(mp); xfs_agbtree_compute_maxlevels(mp); + xfs_rtbtree_compute_maxlevels(mp); /* * Check if sb_agblocks is aligned at stripe boundary. 
If sb_agblocks diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index db9dade7d22a..ddb9d19a3a3d 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -165,6 +165,7 @@ typedef struct xfs_mount { uint m_rmap_maxlevels; /* max rmap btree levels */ uint m_refc_maxlevels; /* max refcount btree level */ unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */ + unsigned int m_rtbtree_maxlevels; /* max level of all rt btrees */ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ uint m_alloc_set_aside; /* space we can't use */ uint m_ag_max_usable; /* max space per AG */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index fcfa6e0eb3ad..5128c5ad72f5 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1344,6 +1344,12 @@ xfs_growfs_rt( if (!error) error = error2; + + /* Reset the rt metadata btree space reservations. */ + xfs_rt_resv_free(mp); + error2 = xfs_rt_resv_init(mp); + if (error2 && error2 != -ENOSPC) + error = error2; } out_unlock: @@ -1487,6 +1493,21 @@ xfs_rtalloc_reinit_frextents( return 0; } +/* Free space reservations for rt metadata inodes. */ +void +xfs_rt_resv_free( + struct xfs_mount *mp) +{ +} + +/* Reserve space for rt metadata inodes' space expansion. */ +int +xfs_rt_resv_init( + struct xfs_mount *mp) +{ + return 0; +} + /* * Read in the bmbt of an rt metadata inode so that we never have to load them * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 8e2a07b8174b..d87523e6a550 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -34,6 +34,9 @@ int /* error */ xfs_rtmount_inodes( struct xfs_mount *mp); /* file system mount structure */ +void xfs_rt_resv_free(struct xfs_mount *mp); +int xfs_rt_resv_init(struct xfs_mount *mp); + /* * Grow the realtime area of the filesystem. */ @@ -60,6 +63,8 @@ xfs_rtmount_init( } # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 
0 : (-ENOSYS)) # define xfs_rtunmount_inodes(m) +# define xfs_rt_resv_free(mp) ((void)0) +# define xfs_rt_resv_init(mp) (0) #endif /* CONFIG_XFS_RT */ #endif /* __XFS_RTALLOC_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 7b16cdd72e9d..cbda663fe6e8 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -5574,6 +5574,51 @@ DEFINE_EVENT(xfs_metadir_class, name, \ TP_ARGS(dp, name, ino)) DEFINE_METADIR_EVENT(xfs_metadir_lookup); +/* metadata inode space reservations */ + +DECLARE_EVENT_CLASS(xfs_metafile_resv_class, + TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), + TP_ARGS(ip, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned long long, freeblks) + __field(unsigned long long, reserved) + __field(unsigned long long, asked) + __field(unsigned long long, used) + __field(unsigned long long, len) + ), + TP_fast_assign( + struct xfs_mount *mp = ip->i_mount; + + __entry->dev = mp->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks); + __entry->reserved = ip->i_delayed_blks; + __entry->asked = ip->i_meta_resv_asked; + __entry->used = ip->i_nblocks; + __entry->len = len; + ), + TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->freeblks, + __entry->reserved, + __entry->asked, + __entry->used, + __entry->len) +) +#define DEFINE_METAFILE_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_metafile_resv_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \ + TP_ARGS(ip, len)) +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init); +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free); +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space); +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space); +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical); +DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From fc6856c6ff08642e3e8437f0416d70a5e1807010 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:21 -0800 Subject: xfs: introduce realtime rmap btree ondisk definitions Add the ondisk structure definitions for realtime rmap btrees. The realtime rmap btree will be rooted from a hidden inode so it needs to have a separate btree block magic and pointer format. Next, add everything needed to read, write and manipulate rmap btree blocks. This prepares the way for connecting the btree operations implementation, though embedding the rtrmap btree root in the inode comes later in the series. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_btree.c | 5 + fs/xfs/libxfs/xfs_format.h | 10 ++ fs/xfs/libxfs/xfs_ondisk.h | 1 + fs/xfs/libxfs/xfs_rtrmap_btree.c | 271 +++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_rtrmap_btree.h | 82 ++++++++++++ fs/xfs/libxfs/xfs_sb.c | 6 + fs/xfs/libxfs/xfs_shared.h | 7 + fs/xfs/xfs_mount.c | 5 +- fs/xfs/xfs_mount.h | 9 ++ fs/xfs/xfs_stats.c | 3 +- fs/xfs/xfs_stats.h | 1 + 12 files changed, 398 insertions(+), 3 deletions(-) create mode 100644 fs/xfs/libxfs/xfs_rtrmap_btree.c create mode 100644 fs/xfs/libxfs/xfs_rtrmap_btree.h (limited to 'fs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index ed9b0dabc1f1..ff45efb2463f 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -51,6 +51,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_rmap_btree.o \ xfs_refcount.o \ xfs_refcount_btree.o \ + xfs_rtrmap_btree.o \ xfs_sb.o \ xfs_symlink_remote.o \ xfs_trans_inode.o \ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5ab201ef041e..0e2719193747 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -30,6 +30,7 @@ #include "xfs_health.h" #include "xfs_buf_mem.h" #include "xfs_btree_mem.h" +#include "xfs_rtrmap_btree.h" /* * Btree magic numbers. @@ -5525,6 +5526,9 @@ xfs_btree_init_cur_caches(void) if (error) goto err; error = xfs_refcountbt_init_cur_cache(); + if (error) + goto err; + error = xfs_rtrmapbt_init_cur_cache(); if (error) goto err; @@ -5543,6 +5547,7 @@ xfs_btree_destroy_cur_caches(void) xfs_bmbt_destroy_cur_cache(); xfs_rmapbt_destroy_cur_cache(); xfs_refcountbt_destroy_cur_cache(); + xfs_rtrmapbt_destroy_cur_cache(); } /* Move the btree cursor before the first record. */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 4d47a3e723aa..469fc7afa591 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1725,6 +1725,16 @@ typedef __be32 xfs_rmap_ptr_t; XFS_FIBT_BLOCK(mp) + 1 : \ XFS_IBT_BLOCK(mp) + 1) +/* + * Realtime Reverse mapping btree format definitions + * + * This is a btree for reverse mapping records for realtime volumes + */ +#define XFS_RTRMAP_CRC_MAGIC 0x4d415052 /* 'MAPR' */ + +/* inode-based btree pointer type */ +typedef __be64 xfs_rtrmap_ptr_t; + /* * Reference Count Btree format definitions * diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index ad0dedf00f18..2c50877a1a2f 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -83,6 +83,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4); XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo, 48); + XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t, 8); /* * m68k has problems with struct xfs_attr_leaf_name_remote, but we pad diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c new file mode 100644 index 000000000000..d3e4c52dcaa9 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_rtgroup.h" + +static struct kmem_cache *xfs_rtrmapbt_cur_cache; + +/* + * Realtime Reverse Map btree. + * + * This is a btree used to track the owner(s) of a given extent in the realtime + * device. See the comments in xfs_rmap_btree.c for more information. + * + * This tree is basically the same as the regular rmap btree except that it + * is rooted in an inode and does not live in free space. + */ + +static struct xfs_btree_cur * +xfs_rtrmapbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_rtrmapbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); +} + +static xfs_failaddr_t +xfs_rtrmapbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + int level; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (!xfs_has_rmapbt(mp)) + return __this_address; + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + level = be16_to_cpu(block->bb_level); + if (level > mp->m_rtrmap_maxlevels) + return __this_address; + + return xfs_btree_fsblock_verify(bp, mp->m_rtrmap_mxr[level != 0]); +} + +static void +xfs_rtrmapbt_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (!xfs_btree_fsblock_verify_crc(bp)) + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_rtrmapbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } + + if (bp->b_error) + trace_xfs_btree_corrupt(bp, _RET_IP_); +} + +static void +xfs_rtrmapbt_write_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_rtrmapbt_verify(bp); + if (fa) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + xfs_btree_fsblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = { + .name = "xfs_rtrmapbt", + .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) }, + .verify_read = xfs_rtrmapbt_read_verify, + .verify_write = xfs_rtrmapbt_write_verify, + .verify_struct = xfs_rtrmapbt_verify, +}; + +const struct xfs_btree_ops xfs_rtrmapbt_ops = { + .name = "rtrmap", + .type = XFS_BTREE_TYPE_INODE, + .geom_flags = XFS_BTGEO_OVERLAPPING | + XFS_BTGEO_IROOT_RECORDS, + + .rec_len = sizeof(struct xfs_rmap_rec), + /* Overlapping btree; 2 keys per pointer. */ + .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_2), + + .dup_cursor = xfs_rtrmapbt_dup_cursor, + .buf_ops = &xfs_rtrmapbt_buf_ops, +}; + +/* Allocate a new rt rmap btree cursor. 
*/ +struct xfs_btree_cur * +xfs_rtrmapbt_init_cursor( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg) +{ + struct xfs_inode *ip = NULL; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_btree_cur *cur; + + return NULL; /* XXX */ + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_ops, + mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache); + + cur->bc_ino.ip = ip; + cur->bc_group = xfs_group_hold(rtg_group(rtg)); + cur->bc_ino.whichfork = XFS_DATA_FORK; + cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK); + + return cur; +} + +/* + * Install a new rt reverse mapping btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. + */ +void +xfs_rtrmapbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. + */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno; + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK); +} + +/* Calculate number of records in a rt reverse mapping btree block. */ +static inline unsigned int +xfs_rtrmapbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / + (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t)); +} + +/* + * Calculate number of records in an rt reverse mapping btree block. + */ +unsigned int +xfs_rtrmapbt_maxrecs( + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) +{ + blocklen -= XFS_RTRMAP_BLOCK_LEN; + return xfs_rtrmapbt_block_maxrecs(blocklen, leaf); +} + +/* Compute the max possible height for realtime reverse mapping btrees. */ +unsigned int +xfs_rtrmapbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2; + + /* We need at most one record for every block in an rt group. */ + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS); +} + +int __init +xfs_rtrmapbt_init_cur_cache(void) +{ + xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur", + xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_rtrmapbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_rtrmapbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_rtrmapbt_cur_cache); + xfs_rtrmapbt_cur_cache = NULL; +} + +/* Compute the maximum height of an rt reverse mapping btree. */ +void +xfs_rtrmapbt_compute_maxlevels( + struct xfs_mount *mp) +{ + unsigned int d_maxlevels, r_maxlevels; + + if (!xfs_has_rtrmapbt(mp)) { + mp->m_rtrmap_maxlevels = 0; + return; + } + + /* + * The realtime rmapbt lives on the data device, which means that its + * maximum height is constrained by the size of the data device and + * the height required to store one rmap record for each block in an + * rt group. 
+ */ + d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr, + mp->m_sb.sb_dblocks); + r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr, + mp->m_groups[XG_TYPE_RTG].blocks); + + /* Add one level to handle the inode root level. */ + mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1; +} diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h new file mode 100644 index 000000000000..7d1a3a49a2d6 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_RTRMAP_BTREE_H__ +#define __XFS_RTRMAP_BTREE_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; +struct xbtree_ifakeroot; +struct xfs_rtgroup; + +/* rmaps only exist on crc enabled filesystems */ +#define XFS_RTRMAP_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN + +struct xfs_btree_cur *xfs_rtrmapbt_init_cursor(struct xfs_trans *tp, + struct xfs_rtgroup *rtg); +struct xfs_btree_cur *xfs_rtrmapbt_stage_cursor(struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake); +void xfs_rtrmapbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp); +unsigned int xfs_rtrmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); +void xfs_rtrmapbt_compute_maxlevels(struct xfs_mount *mp); + +/* + * Addresses of records, keys, and pointers within an incore rtrmapbt block. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +static inline struct xfs_rmap_rec * +xfs_rtrmap_rec_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_rmap_rec *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_rmap_rec)); +} + +static inline struct xfs_rmap_key * +xfs_rtrmap_key_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_rmap_key *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); +} + +static inline struct xfs_rmap_key * +xfs_rtrmap_high_key_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_rmap_key *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + sizeof(struct xfs_rmap_key) + + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); +} + +static inline xfs_rtrmap_ptr_t * +xfs_rtrmap_ptr_addr( + struct xfs_btree_block *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrmap_ptr_t *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + maxrecs * 2 * sizeof(struct xfs_rmap_key) + + (index - 1) * sizeof(xfs_rtrmap_ptr_t)); +} + +unsigned int xfs_rtrmapbt_maxlevels_ondisk(void); + +int __init xfs_rtrmapbt_init_cur_cache(void); +void xfs_rtrmapbt_destroy_cur_cache(void); + +#endif /* __XFS_RTRMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 3b5623611eba..83fb14b4074c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -28,6 +28,7 @@ #include "xfs_rtbitmap.h" #include "xfs_exchrange.h" #include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. 
@@ -1215,6 +1216,11 @@ xfs_sb_mount_common( mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; + mp->m_rtrmap_mxr[0] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_rtrmap_mxr[1] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, false); + mp->m_rtrmap_mnr[0] = mp->m_rtrmap_mxr[0] / 2; + mp->m_rtrmap_mnr[1] = mp->m_rtrmap_mxr[1] / 2; + mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index e7efdb9ceaf3..da23dac22c3f 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -42,6 +42,7 @@ extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops; extern const struct xfs_buf_ops xfs_rtsummary_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; extern const struct xfs_buf_ops xfs_rtsb_buf_ops; +extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; @@ -55,6 +56,7 @@ extern const struct xfs_btree_ops xfs_bmbt_ops; extern const struct xfs_btree_ops xfs_refcountbt_ops; extern const struct xfs_btree_ops xfs_rmapbt_ops; extern const struct xfs_btree_ops xfs_rmapbt_mem_ops; +extern const struct xfs_btree_ops xfs_rtrmapbt_ops; static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops) { @@ -100,6 +102,11 @@ static inline bool xfs_btree_is_mem_rmap(const struct xfs_btree_ops *ops) # define xfs_btree_is_mem_rmap(...) (false) #endif +static inline bool xfs_btree_is_rtrmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rtrmapbt_ops; +} + /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 97137126b16f..7b7d21b50d54 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -37,6 +37,7 @@ #include "xfs_rtbitmap.h" #include "xfs_metafile.h" #include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" #include "scrub/stats.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -655,8 +656,7 @@ static inline void xfs_rtbtree_compute_maxlevels( struct xfs_mount *mp) { - /* This will be filled in later. 
*/ - mp->m_rtbtree_maxlevels = 0; + mp->m_rtbtree_maxlevels = mp->m_rtrmap_maxlevels; } /* @@ -727,6 +727,7 @@ xfs_mountfs( xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); xfs_mount_setup_inode_geom(mp); xfs_rmapbt_compute_maxlevels(mp); + xfs_rtrmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); xfs_agbtree_compute_maxlevels(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index ddb9d19a3a3d..1bc95fb170db 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -158,11 +158,14 @@ typedef struct xfs_mount { uint m_bmap_dmnr[2]; /* min bmap btree records */ uint m_rmap_mxr[2]; /* max rmap btree records */ uint m_rmap_mnr[2]; /* min rmap btree records */ + uint m_rtrmap_mxr[2]; /* max rtrmap btree records */ + uint m_rtrmap_mnr[2]; /* min rtrmap btree records */ uint m_refc_mxr[2]; /* max refc btree records */ uint m_refc_mnr[2]; /* min refc btree records */ uint m_alloc_maxlevels; /* max alloc btree levels */ uint m_bm_maxlevels[2]; /* max bmap btree levels */ uint m_rmap_maxlevels; /* max rmap btree levels */ + uint m_rtrmap_maxlevels; /* max rtrmap btree level */ uint m_refc_maxlevels; /* max refcount btree level */ unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */ unsigned int m_rtbtree_maxlevels; /* max level of all rt btrees */ @@ -399,6 +402,12 @@ static inline bool xfs_has_rtsb(struct xfs_mount *mp) return xfs_has_rtgroups(mp) && xfs_has_realtime(mp); } +static inline bool xfs_has_rtrmapbt(struct xfs_mount *mp) +{ + return xfs_has_rtgroups(mp) && xfs_has_realtime(mp) && + xfs_has_rmapbt(mp); +} + /* * Some features are always on for v5 file systems, allow the compiler to * eliminiate dead code when building without v4 support. diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index ffb52725c2a8..f94fb70b524f 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -52,7 +52,8 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "rmapbt", xfsstats_offset(xs_refcbt_2) }, { "refcntbt", xfsstats_offset(xs_rmap_mem_2) }, { "rmapbt_mem", xfsstats_offset(xs_rcbag_2) }, - { "rcbagbt", xfsstats_offset(xs_qm_dqreclaims)}, + { "rcbagbt", xfsstats_offset(xs_rtrmap_2) }, + { "rtrmapbt", xfsstats_offset(xs_qm_dqreclaims)}, /* we print both series of quota information together */ { "qm", xfsstats_offset(xs_xstrat_bytes)}, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index a61fb56ed2e6..05dc69c6d949 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -127,6 +127,7 @@ struct __xfsstats { uint32_t xs_refcbt_2[__XBTS_MAX]; uint32_t xs_rmap_mem_2[__XBTS_MAX]; uint32_t xs_rcbag_2[__XBTS_MAX]; + uint32_t xs_rtrmap_2[__XBTS_MAX]; uint32_t xs_qm_dqreclaims; uint32_t xs_qm_dqreclaim_misses; uint32_t xs_qm_dquot_dups; -- cgit v1.2.3 From e1c76fce50bb750dff236aa51a3b698de4f7132c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:22 -0800 Subject: xfs: realtime rmap btree transaction reservations Make sure that there's enough log reservation to handle mapping and unmapping realtime extents. We have to reserve enough space to handle a split in the rtrmapbt to add the record and a second split in the regular rmapbt to record the rtrmapbt split. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_exchmaps.c | 4 +++- fs/xfs/libxfs/xfs_trans_resv.c | 12 ++++++++++-- fs/xfs/libxfs/xfs_trans_space.h | 13 +++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c index 2021396651de..3f1d6a98c118 100644 --- a/fs/xfs/libxfs/xfs_exchmaps.c +++ b/fs/xfs/libxfs/xfs_exchmaps.c @@ -662,7 +662,9 @@ xfs_exchmaps_rmapbt_blocks( if (!xfs_has_rmapbt(mp)) return 0; if (XFS_IS_REALTIME_INODE(req->ip1)) - return 0; + return howmany_64(req->nr_exchanges, + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * + XFS_RTRMAPADD_SPACE_RES(mp); return howmany_64(req->nr_exchanges, XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index bab402340b5d..f3392eb2d7f4 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -213,7 +213,9 @@ xfs_calc_inode_chunk_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating a realtime extent. We have to be able to log as many rtbitmap * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime - * extents, as well as the realtime summary block. + * extents, as well as the realtime summary block (t1). Realtime rmap btree + * operations happen in a second transaction, so factor in a couple of rtrmapbt + * splits (t2). */ static unsigned int xfs_rtalloc_block_count( @@ -222,10 +224,16 @@ xfs_rtalloc_block_count( { unsigned int rtbmp_blocks; xfs_rtxlen_t rtxlen; + unsigned int t1, t2 = 0; rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN); rtbmp_blocks = xfs_rtbitmap_blockcount_len(mp, rtxlen); - return (rtbmp_blocks + 1) * num_ops; + t1 = (rtbmp_blocks + 1) * num_ops; + + if (xfs_has_rmapbt(mp)) + t2 = num_ops * (2 * mp->m_rtrmap_maxlevels - 1); + + return max(t1, t2); } /* diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 1155ff2d37e2..d89b570aafcc 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -14,6 +14,19 @@ #define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp) \ (((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0])) +/* Worst case number of realtime rmaps that can be held in a block. */ +#define XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) \ + (((mp)->m_rtrmap_mxr[0]) - ((mp)->m_rtrmap_mnr[0])) + +/* Adding one realtime rmap could split every level to the top of the tree. */ +#define XFS_RTRMAPADD_SPACE_RES(mp) ((mp)->m_rtrmap_maxlevels) + +/* Blocks we might need to add "b" realtime rmaps to a tree. */ +#define XFS_NRTRMAPADD_SPACE_RES(mp, b) \ + ((((b) + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * \ + XFS_RTRMAPADD_SPACE_RES(mp)) + /* Worst case number of rmaps that can be held in a block. */ #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) -- cgit v1.2.3 From d386b4024372ea2f06aaa0f2c6c380b45ba0536e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:23 -0800 Subject: xfs: add realtime rmap btree operations Implement the generic btree operations needed to manipulate rtrmap btree blocks. This is different from the regular rmapbt in that we allocate space from the filesystem at large, and are neither constrained to the free space nor any particular AG. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 68 ++++++++++ fs/xfs/libxfs/xfs_btree.h | 6 + fs/xfs/libxfs/xfs_rtrmap_btree.c | 271 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 345 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 0e2719193747..36ab06f8a3bc 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -31,6 +31,10 @@ #include "xfs_buf_mem.h" #include "xfs_btree_mem.h" #include "xfs_rtrmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_rmap.h" +#include "xfs_quota.h" +#include "xfs_metafile.h" /* * Btree magic numbers. @@ -5576,3 +5580,67 @@ xfs_btree_goto_left_edge( return 0; } + +/* Allocate a block for an inode-rooted metadata btree. */ +int +xfs_btree_alloc_metafile_block( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + struct xfs_alloc_arg args = { + .mp = cur->bc_mp, + .tp = cur->bc_tp, + .resv = XFS_AG_RESV_METAFILE, + .minlen = 1, + .maxlen = 1, + .prod = 1, + }; + struct xfs_inode *ip = cur->bc_ino.ip; + int error; + + ASSERT(xfs_is_metadir_inode(ip)); + + xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, cur->bc_ino.whichfork); + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(cur->bc_mp, ip->i_ino)); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) { + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + + xfs_metafile_resv_alloc_space(ip, &args); + + new->l = cpu_to_be64(args.fsbno); + *stat = 1; + return 0; +} + +/* Free a block from an inode-rooted metadata btree. */ +int +xfs_btree_free_metafile_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip = cur->bc_ino.ip; + struct xfs_trans *tp = cur->bc_tp; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + int error; + + ASSERT(xfs_is_metadir_inode(ip)); + + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); + error = xfs_free_extent_later(tp, fsbno, 1, &oinfo, XFS_AG_RESV_METAFILE, + 0); + if (error) + return error; + + xfs_metafile_resv_free_space(ip, tp, 1); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 3b8c2ccad908..ee82dc777d6d 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -703,4 +703,10 @@ xfs_btree_at_iroot( level == cur->bc_nlevels - 1; } +int xfs_btree_alloc_metafile_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, union xfs_btree_ptr *newp, + int *stat); +int xfs_btree_free_metafile_block(struct xfs_btree_cur *cur, + struct xfs_buf *bp); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index d3e4c52dcaa9..99d828bb5fe7 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -18,12 +18,14 @@ #include "xfs_alloc.h" #include "xfs_btree.h" #include "xfs_btree_staging.h" +#include "xfs_rmap.h" #include "xfs_rtrmap_btree.h" #include "xfs_trace.h" #include "xfs_cksum.h" #include "xfs_error.h" #include "xfs_extent_busy.h" #include "xfs_rtgroup.h" +#include "xfs_bmap.h" static struct kmem_cache *xfs_rtrmapbt_cur_cache; @@ -44,6 +46,182 @@ xfs_rtrmapbt_dup_cursor( return xfs_rtrmapbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); } +STATIC int +xfs_rtrmapbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return 
xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0) / 2; + } + + return cur->bc_mp->m_rtrmap_mnr[level != 0]; +} + +STATIC int +xfs_rtrmapbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0); + } + + return cur->bc_mp->m_rtrmap_mxr[level != 0]; +} + +/* + * Convert the ondisk record's offset field into the ondisk key's offset field. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. + */ +static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec) +{ + return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN); +} + +STATIC void +xfs_rtrmapbt_init_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + key->rmap.rm_startblock = rec->rmap.rm_startblock; + key->rmap.rm_owner = rec->rmap.rm_owner; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); +} + +STATIC void +xfs_rtrmapbt_init_high_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + uint64_t off; + int adj; + + adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; + + key->rmap.rm_startblock = rec->rmap.rm_startblock; + be32_add_cpu(&key->rmap.rm_startblock, adj); + key->rmap.rm_owner = rec->rmap.rm_owner; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); + if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || + XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) + return; + off = be64_to_cpu(key->rmap.rm_offset); + off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK); + key->rmap.rm_offset = cpu_to_be64(off); +} + +STATIC void +xfs_rtrmapbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock); + rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount); + rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner); + rec->rmap.rm_offset = cpu_to_be64( + xfs_rmap_irec_offset_pack(&cur->bc_rec.r)); +} + +STATIC void +xfs_rtrmapbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +/* + * Mask the appropriate parts of the ondisk key field for a key comparison. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. 
+ */ +static inline uint64_t offset_keymask(uint64_t offset) +{ + return offset & ~XFS_RMAP_OFF_UNWRITTEN; +} + +STATIC int64_t +xfs_rtrmapbt_key_diff( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) +{ + struct xfs_rmap_irec *rec = &cur->bc_rec.r; + const struct xfs_rmap_key *kp = &key->rmap; + __u64 x, y; + int64_t d; + + d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; + if (d) + return d; + + x = be64_to_cpu(kp->rm_owner); + y = rec->rm_owner; + if (x > y) + return 1; + else if (y > x) + return -1; + + x = offset_keymask(be64_to_cpu(kp->rm_offset)); + y = offset_keymask(xfs_rmap_irec_offset_pack(rec)); + if (x > y) + return 1; + else if (y > x) + return -1; + return 0; +} + +STATIC int64_t +xfs_rtrmapbt_diff_two_keys( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) +{ + const struct xfs_rmap_key *kp1 = &k1->rmap; + const struct xfs_rmap_key *kp2 = &k2->rmap; + int64_t d; + __u64 x, y; + + /* Doesn't make sense to mask off the physical space part */ + ASSERT(!mask || mask->rmap.rm_startblock); + + d = (int64_t)be32_to_cpu(kp1->rm_startblock) - + be32_to_cpu(kp2->rm_startblock); + if (d) + return d; + + if (!mask || mask->rmap.rm_owner) { + x = be64_to_cpu(kp1->rm_owner); + y = be64_to_cpu(kp2->rm_owner); + if (x > y) + return 1; + else if (y > x) + return -1; + } + + if (!mask || mask->rmap.rm_offset) { + /* Doesn't make sense to allow offset but not owner */ + ASSERT(!mask || mask->rmap.rm_owner); + + x = offset_keymask(be64_to_cpu(kp1->rm_offset)); + y = offset_keymask(be64_to_cpu(kp2->rm_offset)); + if (x > y) + return 1; + else if (y > x) + return -1; + } + + return 0; +} + static xfs_failaddr_t xfs_rtrmapbt_verify( struct xfs_buf *bp) @@ -110,6 +288,86 @@ const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = { .verify_struct = xfs_rtrmapbt_verify, }; +STATIC int +xfs_rtrmapbt_keys_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) +{ + uint32_t x; + uint32_t y; + uint64_t a; + uint64_t b; + + x = be32_to_cpu(k1->rmap.rm_startblock); + y = be32_to_cpu(k2->rmap.rm_startblock); + if (x < y) + return 1; + else if (x > y) + return 0; + a = be64_to_cpu(k1->rmap.rm_owner); + b = be64_to_cpu(k2->rmap.rm_owner); + if (a < b) + return 1; + else if (a > b) + return 0; + a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset)); + if (a <= b) + return 1; + return 0; +} + +STATIC int +xfs_rtrmapbt_recs_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) +{ + uint32_t x; + uint32_t y; + uint64_t a; + uint64_t b; + + x = be32_to_cpu(r1->rmap.rm_startblock); + y = be32_to_cpu(r2->rmap.rm_startblock); + if (x < y) + return 1; + else if (x > y) + return 0; + a = be64_to_cpu(r1->rmap.rm_owner); + b = be64_to_cpu(r2->rmap.rm_owner); + if (a < b) + return 1; + else if (a > b) + return 0; + a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset)); + if (a <= b) + return 1; + return 0; +} + +STATIC enum xbtree_key_contig +xfs_rtrmapbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->rmap.rm_startblock); + + /* + * We only support checking contiguity of the physical space component. 
+ * If any callers ever need more specificity than that, they'll have to + * implement it here. + */ + ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset)); + + return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock), + be32_to_cpu(key2->rmap.rm_startblock)); +} + const struct xfs_btree_ops xfs_rtrmapbt_ops = { .name = "rtrmap", .type = XFS_BTREE_TYPE_INODE, @@ -125,7 +383,20 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = { .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_2), .dup_cursor = xfs_rtrmapbt_dup_cursor, + .alloc_block = xfs_btree_alloc_metafile_block, + .free_block = xfs_btree_free_metafile_block, + .get_minrecs = xfs_rtrmapbt_get_minrecs, + .get_maxrecs = xfs_rtrmapbt_get_maxrecs, + .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_rtrmapbt_init_ptr_from_cur, + .key_diff = xfs_rtrmapbt_key_diff, .buf_ops = &xfs_rtrmapbt_buf_ops, + .diff_two_keys = xfs_rtrmapbt_diff_two_keys, + .keys_inorder = xfs_rtrmapbt_keys_inorder, + .recs_inorder = xfs_rtrmapbt_recs_inorder, + .keys_contiguous = xfs_rtrmapbt_keys_contiguous, }; /* Allocate a new rt rmap btree cursor. */ -- cgit v1.2.3 From adafb31c80e608e63adcf8cae5675db00c734149 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:23 -0800 Subject: xfs: prepare rmap functions to deal with rtrmapbt Prepare the high-level rmap functions to deal with the new realtime rmapbt and its slightly different conventions. Provide the ability to talk to either rmapbt or rtrmapbt formats from the same high level code. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap.c | 63 +++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_rmap.h | 3 +++ fs/xfs/libxfs/xfs_rtgroup.h | 26 +++++++++++++++++++ 3 files changed, 92 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 57dbf99ce004..da1b004837d3 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -25,6 +25,7 @@ #include "xfs_ag.h" #include "xfs_health.h" #include "xfs_rmap_item.h" +#include "xfs_rtgroup.h" struct kmem_cache *xfs_rmap_intent_cache; @@ -264,11 +265,69 @@ xfs_rmap_check_irec( return NULL; } +static xfs_failaddr_t +xfs_rtrmap_check_meta_irec( + struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (irec->rm_offset != 0) + return __this_address; + if (irec->rm_flags & XFS_RMAP_UNWRITTEN) + return __this_address; + + switch (irec->rm_owner) { + case XFS_RMAP_OWN_FS: + if (irec->rm_startblock != 0) + return __this_address; + if (irec->rm_blockcount != mp->m_sb.sb_rextsize) + return __this_address; + return NULL; + default: + return __this_address; + } + + return NULL; +} + +static xfs_failaddr_t +xfs_rtrmap_check_inode_irec( + struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (!xfs_verify_ino(mp, irec->rm_owner)) + return __this_address; + if (!xfs_verify_rgbext(rtg, irec->rm_startblock, irec->rm_blockcount)) + return __this_address; + if (!xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount)) + return __this_address; + return NULL; +} + +xfs_failaddr_t +xfs_rtrmap_check_irec( + struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec) +{ + if (irec->rm_blockcount == 0) + return __this_address; + if (irec->rm_flags & (XFS_RMAP_BMBT_BLOCK | XFS_RMAP_ATTR_FORK)) + 
return __this_address; + if (XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)) + return xfs_rtrmap_check_meta_irec(rtg, irec); + return xfs_rtrmap_check_inode_irec(rtg, irec); +} + static inline xfs_failaddr_t xfs_rmap_check_btrec( struct xfs_btree_cur *cur, const struct xfs_rmap_irec *irec) { + if (xfs_btree_is_rtrmap(cur->bc_ops)) + return xfs_rtrmap_check_irec(to_rtg(cur->bc_group), irec); return xfs_rmap_check_irec(to_perag(cur->bc_group), irec); } @@ -283,6 +342,10 @@ xfs_rmap_complain_bad_rec( if (xfs_btree_is_mem_rmap(cur->bc_ops)) xfs_warn(mp, "In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa); + else if (xfs_btree_is_rtrmap(cur->bc_ops)) + xfs_warn(mp, + "RT Reverse Mapping BTree record corruption in rtgroup %u detected at %pS!", + cur->bc_group->xg_gno, fa); else xfs_warn(mp, "Reverse Mapping BTree record corruption in AG %d detected at %pS!", diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 8e2657af038e..1b19f54b6504 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -7,6 +7,7 @@ #define __XFS_RMAP_H__ struct xfs_perag; +struct xfs_rtgroup; static inline void xfs_rmap_ino_bmbt_owner( @@ -206,6 +207,8 @@ xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag, const struct xfs_rmap_irec *irec); +xfs_failaddr_t xfs_rtrmap_check_irec(struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec); int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, enum xbtree_recpacking *outcome); diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 19f8d302b9aa..dc3ce660a013 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -132,6 +132,32 @@ xfs_rtgroup_next( return xfs_rtgroup_next_range(mp, rtg, 0, mp->m_sb.sb_rgcount - 1); } +static inline bool +xfs_verify_rgbno( + struct xfs_rtgroup *rtg, + xfs_rgblock_t rgbno) +{ + ASSERT(xfs_has_rtgroups(rtg_mount(rtg))); + + return xfs_verify_gbno(rtg_group(rtg), rgbno); +} + +/* + * Check that [@rgbno,@len] is a valid extent range in @rtg. + * + * Must only be used for RTG-enabled file systems. + */ +static inline bool +xfs_verify_rgbext( + struct xfs_rtgroup *rtg, + xfs_rgblock_t rgbno, + xfs_extlen_t len) +{ + ASSERT(xfs_has_rtgroups(rtg_mount(rtg))); + + return xfs_verify_gbext(rtg_group(rtg), rgbno, len); +} + static inline xfs_rtblock_t xfs_rgbno_to_rtb( struct xfs_rtgroup *rtg, -- cgit v1.2.3 From 9e823fc27419b09718fff74ae2297b25ae6fb317 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:24 -0800 Subject: xfs: add a realtime flag to the rmap update log redo items Extend the rmap update (RUI) log items to handle realtime volumes by adding a new log intent item type. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.h | 1 + fs/xfs/libxfs/xfs_log_format.h | 6 +- fs/xfs/libxfs/xfs_log_recover.h | 2 + fs/xfs/libxfs/xfs_refcount.c | 4 +- fs/xfs/libxfs/xfs_rmap.c | 17 +++- fs/xfs/libxfs/xfs_rmap.h | 5 +- fs/xfs/scrub/alloc_repair.c | 2 +- fs/xfs/xfs_log_recover.c | 2 + fs/xfs/xfs_rmap_item.c | 197 +++++++++++++++++++++++++++++++++++++--- 9 files changed, 213 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index ec51b8465e61..1e2477eaa5a8 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -69,6 +69,7 @@ struct xfs_defer_op_type { extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; +extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; extern const struct xfs_defer_op_type xfs_rtextent_free_defer_type; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 15dec19b6c32..a7e0e479454d 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -250,6 +250,8 @@ typedef struct xfs_trans_header { #define XFS_LI_XMD 0x1249 /* mapping exchange done */ #define XFS_LI_EFI_RT 0x124a /* realtime extent free intent */ #define XFS_LI_EFD_RT 0x124b /* realtime extent free done */ +#define XFS_LI_RUI_RT 0x124c /* realtime rmap update intent */ +#define XFS_LI_RUD_RT 0x124d /* realtime rmap update done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -271,7 +273,9 @@ typedef struct xfs_trans_header { { XFS_LI_XMI, "XFS_LI_XMI" }, \ { XFS_LI_XMD, "XFS_LI_XMD" }, \ { XFS_LI_EFI_RT, "XFS_LI_EFI_RT" }, \ - { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" } + { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" }, \ + { XFS_LI_RUI_RT, "XFS_LI_RUI_RT" }, \ + { XFS_LI_RUD_RT, "XFS_LI_RUD_RT" } /* * Inode Log Item Format definitions. diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 5397a8ff004d..abc705aff26d 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -79,6 +79,8 @@ extern const struct xlog_recover_item_ops xlog_xmi_item_ops; extern const struct xlog_recover_item_ops xlog_xmd_item_ops; extern const struct xlog_recover_item_ops xlog_rtefi_item_ops; extern const struct xlog_recover_item_ops xlog_rtefd_item_ops; +extern const struct xlog_recover_item_ops xlog_rtrui_item_ops; +extern const struct xlog_recover_item_ops xlog_rtrud_item_ops; /* * Macros, structures, prototypes for internal log manager use. diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 26d3d7956e06..bbb86dc9a25c 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1831,7 +1831,7 @@ xfs_refcount_alloc_cow_extent( __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); /* Add rmap entry */ - xfs_rmap_alloc_extent(tp, fsb, len, XFS_RMAP_OWN_COW); + xfs_rmap_alloc_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. 
*/ @@ -1847,7 +1847,7 @@ xfs_refcount_free_cow_extent( return; /* Remove rmap entry */ - xfs_rmap_free_extent(tp, fsb, len, XFS_RMAP_OWN_COW); + xfs_rmap_free_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW); __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); } diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index da1b004837d3..8d3cea90c7cd 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2710,6 +2710,7 @@ __xfs_rmap_add( struct xfs_trans *tp, enum xfs_rmap_intent_type type, uint64_t owner, + bool isrt, int whichfork, struct xfs_bmbt_irec *bmap) { @@ -2721,6 +2722,7 @@ __xfs_rmap_add( ri->ri_owner = owner; ri->ri_whichfork = whichfork; ri->ri_bmap = *bmap; + ri->ri_realtime = isrt; xfs_rmap_defer_add(tp, ri); } @@ -2734,6 +2736,7 @@ xfs_rmap_map_extent( struct xfs_bmbt_irec *PREV) { enum xfs_rmap_intent_type type = XFS_RMAP_MAP; + bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; @@ -2741,7 +2744,7 @@ xfs_rmap_map_extent( if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) type = XFS_RMAP_MAP_SHARED; - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); } /* Unmap an extent out of a file. */ @@ -2753,6 +2756,7 @@ xfs_rmap_unmap_extent( struct xfs_bmbt_irec *PREV) { enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP; + bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; @@ -2760,7 +2764,7 @@ xfs_rmap_unmap_extent( if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) type = XFS_RMAP_UNMAP_SHARED; - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); } /* @@ -2778,6 +2782,7 @@ xfs_rmap_convert_extent( struct xfs_bmbt_irec *PREV) { enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT; + bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (!xfs_rmap_update_is_needed(mp, whichfork)) return; @@ -2785,13 +2790,14 @@ xfs_rmap_convert_extent( if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) type = XFS_RMAP_CONVERT_SHARED; - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); } /* Schedule the creation of an rmap for non-file data. */ void xfs_rmap_alloc_extent( struct xfs_trans *tp, + bool isrt, xfs_fsblock_t fsbno, xfs_extlen_t len, uint64_t owner) @@ -2806,13 +2812,14 @@ xfs_rmap_alloc_extent( bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, isrt, XFS_DATA_FORK, &bmap); } /* Schedule the deletion of an rmap for non-file data. */ void xfs_rmap_free_extent( struct xfs_trans *tp, + bool isrt, xfs_fsblock_t fsbno, xfs_extlen_t len, uint64_t owner) @@ -2827,7 +2834,7 @@ xfs_rmap_free_extent( bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, isrt, XFS_DATA_FORK, &bmap); } /* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. 
*/ diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 1b19f54b6504..5f39f6e53cd1 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -175,6 +175,7 @@ struct xfs_rmap_intent { uint64_t ri_owner; struct xfs_bmbt_irec ri_bmap; struct xfs_group *ri_group; + bool ri_realtime; }; /* functions for updating the rmapbt based on bmbt map/unmap operations */ @@ -185,9 +186,9 @@ void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_fsblock_t fsbno, +void xfs_rmap_alloc_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno, xfs_extlen_t len, uint64_t owner); -void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_fsblock_t fsbno, +void xfs_rmap_free_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno, xfs_extlen_t len, uint64_t owner); int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c index 11e1e5404fc6..bed6a09aa791 100644 --- a/fs/xfs/scrub/alloc_repair.c +++ b/fs/xfs/scrub/alloc_repair.c @@ -542,7 +542,7 @@ xrep_abt_dispose_one( /* Add a deferred rmap for each extent we used. */ if (resv->used > 0) - xfs_rmap_alloc_extent(sc->tp, + xfs_rmap_alloc_extent(sc->tp, false, xfs_agbno_to_fsb(pag, resv->agbno), resv->used, XFS_RMAP_OWN_AG); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 0af3d477197b..5c95c97519c7 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1820,6 +1820,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { &xlog_xmd_item_ops, &xlog_rtefi_item_ops, &xlog_rtefd_item_ops, + &xlog_rtrui_item_ops, + &xlog_rtrud_item_ops, }; static const struct xlog_recover_item_ops * diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index ac2913a73358..e8caa600a95c 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -23,6 +23,7 @@ #include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_trace.h" +#include "xfs_rtgroup.h" struct kmem_cache *xfs_rui_cache; struct kmem_cache *xfs_rud_cache; @@ -94,7 +95,9 @@ xfs_rui_item_format( ASSERT(atomic_read(&ruip->rui_next_extent) == ruip->rui_format.rui_nextents); - ruip->rui_format.rui_type = XFS_LI_RUI; + ASSERT(lip->li_type == XFS_LI_RUI || lip->li_type == XFS_LI_RUI_RT); + + ruip->rui_format.rui_type = lip->li_type; ruip->rui_format.rui_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format, @@ -137,12 +140,15 @@ xfs_rui_item_release( STATIC struct xfs_rui_log_item * xfs_rui_init( struct xfs_mount *mp, + unsigned short item_type, uint nextents) { struct xfs_rui_log_item *ruip; ASSERT(nextents > 0); + ASSERT(item_type == XFS_LI_RUI || item_type == XFS_LI_RUI_RT); + if (nextents > XFS_RUI_MAX_FAST_EXTENTS) ruip = kzalloc(xfs_rui_log_item_sizeof(nextents), GFP_KERNEL | __GFP_NOFAIL); @@ -150,7 +156,7 @@ xfs_rui_init( ruip = kmem_cache_zalloc(xfs_rui_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); + xfs_log_item_init(mp, &ruip->rui_item, item_type, &xfs_rui_item_ops); ruip->rui_format.rui_nextents = nextents; ruip->rui_format.rui_id = (uintptr_t)(void *)ruip; atomic_set(&ruip->rui_next_extent, 0); @@ -189,7 +195,9 @@ xfs_rud_item_format( struct xfs_rud_log_item *rudp = RUD_ITEM(lip); struct xfs_log_iovec *vecp = NULL; - 
rudp->rud_format.rud_type = XFS_LI_RUD; + ASSERT(lip->li_type == XFS_LI_RUD || lip->li_type == XFS_LI_RUD_RT); + + rudp->rud_format.rud_type = lip->li_type; rudp->rud_format.rud_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUD_FORMAT, &rudp->rud_format, @@ -233,6 +241,14 @@ static inline struct xfs_rmap_intent *ri_entry(const struct list_head *e) return list_entry(e, struct xfs_rmap_intent, ri_list); } +static inline bool +xfs_rui_item_isrt(const struct xfs_log_item *lip) +{ + ASSERT(lip->li_type == XFS_LI_RUI || lip->li_type == XFS_LI_RUI_RT); + + return lip->li_type == XFS_LI_RUI_RT; +} + /* Sort rmap intents by AG. */ static int xfs_rmap_update_diff_items( @@ -305,18 +321,20 @@ xfs_rmap_update_log_item( } static struct xfs_log_item * -xfs_rmap_update_create_intent( +__xfs_rmap_update_create_intent( struct xfs_trans *tp, struct list_head *items, unsigned int count, - bool sort) + bool sort, + unsigned short item_type) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); + struct xfs_rui_log_item *ruip; struct xfs_rmap_intent *ri; ASSERT(count > 0); + ruip = xfs_rui_init(mp, item_type, count); if (sort) list_sort(mp, items, xfs_rmap_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -324,6 +342,23 @@ xfs_rmap_update_create_intent( return &ruip->rui_item; } +static struct xfs_log_item * +xfs_rmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_rmap_update_create_intent(tp, items, count, sort, + XFS_LI_RUI); +} + +static inline unsigned short +xfs_rud_type_from_rui(const struct xfs_rui_log_item *ruip) +{ + return xfs_rui_item_isrt(&ruip->rui_item) ? XFS_LI_RUD_RT : XFS_LI_RUD; +} + /* Get an RUD so we can process all the deferred rmap updates. */ static struct xfs_log_item * xfs_rmap_update_create_done( @@ -335,8 +370,8 @@ xfs_rmap_update_create_done( struct xfs_rud_log_item *rudp; rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, - &xfs_rud_item_ops); + xfs_log_item_init(tp->t_mountp, &rudp->rud_item, + xfs_rud_type_from_rui(ruip), &xfs_rud_item_ops); rudp->rud_ruip = ruip; rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; @@ -351,11 +386,20 @@ xfs_rmap_defer_add( { struct xfs_mount *mp = tp->t_mountp; + /* + * Deferred rmap updates for the realtime and data sections must use + * separate transactions to finish deferred work because updates to + * realtime metadata files can lock AGFs to allocate btree blocks and + * we don't want that mixing with the AGF locks taken to finish data + * section updates. + */ ri->ri_group = xfs_group_intent_get(mp, ri->ri_bmap.br_startblock, - XG_TYPE_AG); + ri->ri_realtime ? XG_TYPE_RTG : XG_TYPE_AG); trace_xfs_rmap_defer(mp, ri); - xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); + xfs_defer_add(tp, &ri->ri_list, ri->ri_realtime ? + &xfs_rtrmap_update_defer_type : + &xfs_rmap_update_defer_type); } /* Cancel a deferred rmap update. 
*/ @@ -566,10 +610,13 @@ xfs_rmap_relog_intent( struct xfs_map_extent *map; unsigned int count; + ASSERT(intent->li_type == XFS_LI_RUI || + intent->li_type == XFS_LI_RUI_RT); + count = RUI_ITEM(intent)->rui_format.rui_nextents; map = RUI_ITEM(intent)->rui_format.rui_extents; - ruip = xfs_rui_init(tp->t_mountp, count); + ruip = xfs_rui_init(tp->t_mountp, intent->li_type, count); memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); @@ -589,6 +636,47 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { .relog_intent = xfs_rmap_relog_intent, }; +#ifdef CONFIG_XFS_RT +static struct xfs_log_item * +xfs_rtrmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_rmap_update_create_intent(tp, items, count, sort, + XFS_LI_RUI_RT); +} + +/* Clean up after calling xfs_rmap_finish_one. */ +STATIC void +xfs_rtrmap_finish_one_cleanup( + struct xfs_trans *tp, + struct xfs_btree_cur *rcur, + int error) +{ + if (rcur) + xfs_btree_del_cursor(rcur, error); +} + +const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = { + .name = "rtrmap", + .max_items = XFS_RUI_MAX_FAST_EXTENTS, + .create_intent = xfs_rtrmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rtrmap_finish_one_cleanup, + .cancel_item = xfs_rmap_update_cancel_item, + .recover_work = xfs_rmap_recover_work, + .relog_intent = xfs_rmap_relog_intent, +}; +#else +const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = { + .name = "rtrmap", +}; +#endif + STATIC bool xfs_rui_item_match( struct xfs_log_item *lip, @@ -654,7 +742,7 @@ xlog_recover_rui_commit_pass2( return -EFSCORRUPTED; } - ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); + ruip = xfs_rui_init(mp, ITEM_TYPE(item), rui_formatp->rui_nextents); xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); @@ -668,6 +756,61 @@ const struct xlog_recover_item_ops xlog_rui_item_ops = { .commit_pass2 = xlog_recover_rui_commit_pass2, }; +#ifdef CONFIG_XFS_RT +STATIC int +xlog_recover_rtrui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_rui_log_item *ruip; + struct xfs_rui_log_format *rui_formatp; + size_t len; + + rui_formatp = item->ri_buf[0].i_addr; + + if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + ruip = xfs_rui_init(mp, ITEM_TYPE(item), rui_formatp->rui_nextents); + xfs_rui_copy_format(&ruip->rui_format, rui_formatp); + atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); + + xlog_recover_intent_item(log, &ruip->rui_item, lsn, + &xfs_rtrmap_update_defer_type); + return 0; +} +#else +STATIC int +xlog_recover_rtrui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + 
return -EFSCORRUPTED; +} +#endif + +const struct xlog_recover_item_ops xlog_rtrui_item_ops = { + .item_type = XFS_LI_RUI_RT, + .commit_pass2 = xlog_recover_rtrui_commit_pass2, +}; + /* * This routine is called when an RUD format structure is found in a committed * transaction in the log. Its purpose is to cancel the corresponding RUI if it @@ -699,3 +842,33 @@ const struct xlog_recover_item_ops xlog_rud_item_ops = { .item_type = XFS_LI_RUD, .commit_pass2 = xlog_recover_rud_commit_pass2, }; + +#ifdef CONFIG_XFS_RT +STATIC int +xlog_recover_rtrud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_rud_log_format *rud_formatp; + + rud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + rud_formatp, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_RUI_RT, + rud_formatp->rud_rui_id); + return 0; +} +#else +# define xlog_recover_rtrud_commit_pass2 xlog_recover_rtrui_commit_pass2 +#endif + +const struct xlog_recover_item_ops xlog_rtrud_item_ops = { + .item_type = XFS_LI_RUD_RT, + .commit_pass2 = xlog_recover_rtrud_commit_pass2, +}; -- cgit v1.2.3 From 5e0679d1c62f25f97b4f93121efe236fc4e768d2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:25 -0800 Subject: xfs: support recovering rmap intent items targeting realtime extents Now that we have rmap on the realtime device and rmap intent items that target the realtime device, log recovery has to support remapping extents on the realtime volume. Make this work. Identify rtrmapbt blocks in the log correctly so that we can validate them during log recovery. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_buf_item_recover.c | 4 ++++ fs/xfs/xfs_rmap_item.c | 15 ++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 3d0c6402cb36..4f2e4ea29e1f 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -262,6 +262,9 @@ xlog_recover_validate_buf_type( case XFS_BMAP_MAGIC: bp->b_ops = &xfs_bmbt_buf_ops; break; + case XFS_RTRMAP_CRC_MAGIC: + bp->b_ops = &xfs_rtrmapbt_buf_ops; + break; case XFS_RMAP_CRC_MAGIC: bp->b_ops = &xfs_rmapbt_buf_ops; break; @@ -855,6 +858,7 @@ xlog_recover_get_buf_lsn( uuid = &btb->bb_u.s.bb_uuid; break; } + case XFS_RTRMAP_CRC_MAGIC: case XFS_BMAP_CRC_MAGIC: case XFS_BMAP_MAGIC: { struct xfs_btree_block *btb = blk; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index e8caa600a95c..89decffe76c8 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -459,6 +459,7 @@ xfs_rmap_update_abort_intent( static inline bool xfs_rui_validate_map( struct xfs_mount *mp, + bool isrt, struct xfs_map_extent *map) { if (!xfs_has_rmapbt(mp)) @@ -488,6 +489,9 @@ xfs_rui_validate_map( if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) return false; + if (isrt) + return xfs_verify_rtbext(mp, map->me_startblock, map->me_len); + return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } @@ -495,6 +499,7 @@ static inline void xfs_rui_recover_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp, + bool isrt, const struct xfs_map_extent *map) { struct xfs_rmap_intent *ri; @@ -539,7 +544,9 @@ xfs_rui_recover_work( ri->ri_bmap.br_blockcount = map->me_len; ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - ri->ri_group = xfs_group_intent_get(mp, map->me_startblock, XG_TYPE_AG); + ri->ri_group = xfs_group_intent_get(mp, map->me_startblock, + isrt ? XG_TYPE_RTG : XG_TYPE_AG); + ri->ri_realtime = isrt; xfs_defer_add_item(dfp, &ri->ri_list); } @@ -558,6 +565,7 @@ xfs_rmap_recover_work( struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_trans *tp; struct xfs_mount *mp = lip->li_log->l_mp; + bool isrt = xfs_rui_item_isrt(lip); int i; int error = 0; @@ -567,7 +575,7 @@ xfs_rmap_recover_work( * just toss the RUI. */ for (i = 0; i < ruip->rui_format.rui_nextents; i++) { - if (!xfs_rui_validate_map(mp, + if (!xfs_rui_validate_map(mp, isrt, &ruip->rui_format.rui_extents[i])) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &ruip->rui_format, @@ -575,7 +583,8 @@ xfs_rmap_recover_work( return -EFSCORRUPTED; } - xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]); + xfs_rui_recover_work(mp, dfp, isrt, + &ruip->rui_format.rui_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); -- cgit v1.2.3 From 219ee99d3673ded7abbc13ddd4d7847e92661e2c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:26 -0800 Subject: xfs: pretty print metadata file types in error messages Create a helper function to turn a metadata file type code into a printable string, and use this to complain about lockdep problems with rtgroup inodes. We'll use this more in the next patch. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_metafile.c | 18 ++++++++++++++++++ fs/xfs/libxfs/xfs_metafile.h | 2 ++ fs/xfs/libxfs/xfs_rtgroup.c | 3 ++- 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index e151663cc9ef..2f5f554a36d4 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -22,6 +22,24 @@ #include "xfs_error.h" #include "xfs_alloc.h" +static const struct { + enum xfs_metafile_type mtype; + const char *name; +} xfs_metafile_type_strs[] = { XFS_METAFILE_TYPE_STR }; + +const char * +xfs_metafile_type_str(enum xfs_metafile_type metatype) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(xfs_metafile_type_strs); i++) { + if (xfs_metafile_type_strs[i].mtype == metatype) + return xfs_metafile_type_strs[i].name; + } + + return NULL; +} + /* Set up an inode to be recognized as a metadata directory inode. */ void xfs_metafile_set_iflag( diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h index 8d8f08a6071c..95af4b52e5a7 100644 --- a/fs/xfs/libxfs/xfs_metafile.h +++ b/fs/xfs/libxfs/xfs_metafile.h @@ -6,6 +6,8 @@ #ifndef __XFS_METAFILE_H__ #define __XFS_METAFILE_H__ +const char *xfs_metafile_type_str(enum xfs_metafile_type metatype); + /* All metadata files must have these flags set. */ #define XFS_METAFILE_DIFLAGS (XFS_DIFLAG_IMMUTABLE | \ XFS_DIFLAG_SYNC | \ diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index a79b734e7044..9e5fdc0dc55c 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -282,7 +282,8 @@ xfs_rtginode_ilock_print_fn( const struct xfs_inode *ip = container_of(m, struct xfs_inode, i_lock.dep_map); - printk(KERN_CONT " rgno=%u", ip->i_projid); + printk(KERN_CONT " rgno=%u metatype=%s", ip->i_projid, + xfs_metafile_type_str(ip->i_metatype)); } /* -- cgit v1.2.3 From 702c90f451622384d6c65897b619f647704b06a9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:27 -0800 Subject: xfs: support file data forks containing metadata btrees Create a new fork format type for metadata btrees. This fork type requires that the inode is in the metadata directory tree, and only applies to the data fork. The actual type of the metadata btree itself is determined by the di_metatype field. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_format.h | 6 ++++-- fs/xfs/libxfs/xfs_inode_buf.c | 23 ++++++++++++++++++++--- fs/xfs/libxfs/xfs_inode_fork.c | 19 +++++++++++++++++++ fs/xfs/scrub/bmap.c | 1 + fs/xfs/scrub/bmap_repair.c | 1 + fs/xfs/scrub/inode.c | 4 ++++ fs/xfs/scrub/inode_repair.c | 36 ++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/rmap_repair.c | 31 +++++++++++++++++++++---------- fs/xfs/xfs_inode.c | 19 ++++++++++++++++++- fs/xfs/xfs_inode_item.c | 2 ++ fs/xfs/xfs_inode_item_recover.c | 40 +++++++++++++++++++++++++++++++++++----- fs/xfs/xfs_trace.h | 1 + 12 files changed, 162 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 469fc7afa591..41ea4283c43c 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -997,7 +997,8 @@ enum xfs_dinode_fmt { XFS_DINODE_FMT_LOCAL, /* bulk data */ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ - XFS_DINODE_FMT_UUID /* added long ago, but never used */ + XFS_DINODE_FMT_UUID, /* added long ago, but never used */ + XFS_DINODE_FMT_META_BTREE, /* metadata btree */ }; #define XFS_INODE_FORMAT_STR \ @@ -1005,7 +1006,8 @@ enum xfs_dinode_fmt { { XFS_DINODE_FMT_LOCAL, "local" }, \ { XFS_DINODE_FMT_EXTENTS, "extent" }, \ { XFS_DINODE_FMT_BTREE, "btree" }, \ - { XFS_DINODE_FMT_UUID, "uuid" } + { XFS_DINODE_FMT_UUID, "uuid" }, \ + { XFS_DINODE_FMT_META_BTREE, "meta_btree" } /* * Max values for extnum and aextnum. diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 424861fbf1bd..1648d72d6ed9 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -441,6 +441,16 @@ xfs_dinode_verify_fork( if (di_nextents > max_extents) return __this_address; break; + case XFS_DINODE_FMT_META_BTREE: + if (!xfs_has_metadir(mp)) + return __this_address; + if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA))) + return __this_address; + switch (be16_to_cpu(dip->di_metatype)) { + default: + return __this_address; + } + break; default: return __this_address; } @@ -460,6 +470,10 @@ xfs_dinode_verify_forkoff( if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3)) return __this_address; break; + case XFS_DINODE_FMT_META_BTREE: + if (!xfs_has_metadir(mp) || !xfs_has_parent(mp)) + return __this_address; + fallthrough; case XFS_DINODE_FMT_LOCAL: /* fall through ... */ case XFS_DINODE_FMT_EXTENTS: /* fall through ... 
*/ case XFS_DINODE_FMT_BTREE: @@ -637,9 +651,6 @@ xfs_dinode_verify( if (mode && nextents + naextents > nblocks) return __this_address; - if (nextents + naextents == 0 && nblocks != 0) - return __this_address; - if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; @@ -743,6 +754,12 @@ xfs_dinode_verify( return fa; } + /* metadata inodes containing btrees always have zero extent count */ + if (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK) != XFS_DINODE_FMT_META_BTREE) { + if (nextents + naextents == 0 && nblocks != 0) + return __this_address; + } + return NULL; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 60853bac289a..1a782339396d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -267,6 +267,12 @@ xfs_iformat_data_fork( return xfs_iformat_extents(ip, dip, XFS_DATA_FORK); case XFS_DINODE_FMT_BTREE: return xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + case XFS_DINODE_FMT_META_BTREE: + switch (ip->i_metatype) { + default: + break; + } + fallthrough; default: xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); @@ -601,6 +607,19 @@ xfs_iflush_fork( } break; + case XFS_DINODE_FMT_META_BTREE: + ASSERT(whichfork == XFS_DATA_FORK); + + if (!(iip->ili_fields & brootflag[whichfork])) + break; + + switch (ip->i_metatype) { + default: + ASSERT(0); + break; + } + break; + default: ASSERT(0); break; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 7e00312225ed..0d7ad692822d 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -983,6 +983,7 @@ xchk_bmap( case XFS_DINODE_FMT_UUID: case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_META_BTREE: /* No mappings to check. */ if (whichfork == XFS_COW_FORK) xchk_fblock_set_corrupt(sc, whichfork, 0); diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index 7c4955482641..141d36f1da9a 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -731,6 +731,7 @@ xrep_bmap_check_inputs( case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_LOCAL: case XFS_DINODE_FMT_UUID: + case XFS_DINODE_FMT_META_BTREE: return -ECANCELED; case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 25ee66e7649d..2e911f38deae 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -502,6 +502,10 @@ xchk_dinode( if (!S_ISREG(mode) && !S_ISDIR(mode)) xchk_ino_set_corrupt(sc, ino); break; + case XFS_DINODE_FMT_META_BTREE: + if (!S_ISREG(mode)) + xchk_ino_set_corrupt(sc, ino); + break; case XFS_DINODE_FMT_UUID: default: xchk_ino_set_corrupt(sc, ino); diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 5a58ddd27bd2..7faa27472b91 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -888,6 +888,25 @@ xrep_dinode_bad_bmbt_fork( return false; } +/* Check a metadata-btree fork. */ +STATIC bool +xrep_dinode_bad_metabt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + if (whichfork != XFS_DATA_FORK) + return true; + + switch (be16_to_cpu(dip->di_metatype)) { + default: + return true; + } + + return false; +} + /* * Check the data fork for things that will fail the ifork verifiers or the * ifork formatters. 
@@ -968,6 +987,11 @@ xrep_dinode_check_dfork( XFS_DATA_FORK)) return true; break; + case XFS_DINODE_FMT_META_BTREE: + if (xrep_dinode_bad_metabt_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; default: return true; } @@ -1088,6 +1112,11 @@ xrep_dinode_check_afork( XFS_ATTR_FORK)) return true; break; + case XFS_DINODE_FMT_META_BTREE: + if (xrep_dinode_bad_metabt_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; default: return true; } @@ -1241,6 +1270,13 @@ xrep_dinode_ensure_forkoff( bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); dfork_min = xfs_bmap_broot_space(sc->mp, bmdr); break; + case XFS_DINODE_FMT_META_BTREE: + switch (be16_to_cpu(dip->di_metatype)) { + default: + dfork_min = 0; + break; + } + break; default: dfork_min = 0; break; diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index a0a227d183d2..2a0b9e3d0fba 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -499,6 +499,14 @@ xrep_rmap_scan_iext( return xrep_rmap_stash_accumulated(rf); } +static int +xrep_rmap_scan_meta_btree( + struct xrep_rmap_ifork *rf, + struct xfs_inode *ip) +{ + return -EFSCORRUPTED; /* XXX placeholder */ +} + /* Find all the extents from a given AG in an inode fork. */ STATIC int xrep_rmap_scan_ifork( @@ -512,14 +520,14 @@ xrep_rmap_scan_ifork( .whichfork = whichfork, }; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + bool mappings_done; int error = 0; if (!ifp) return 0; - if (ifp->if_format == XFS_DINODE_FMT_BTREE) { - bool mappings_done; - + switch (ifp->if_format) { + case XFS_DINODE_FMT_BTREE: /* * Scan the bmap btree for data device mappings. This includes * the btree blocks themselves, even if this is a realtime @@ -528,15 +536,18 @@ xrep_rmap_scan_ifork( error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done); if (error || mappings_done) return error; - } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { - return 0; + fallthrough; + case XFS_DINODE_FMT_EXTENTS: + /* Scan incore extent cache if this isn't a realtime file. */ + if (xfs_ifork_is_realtime(ip, whichfork)) + return 0; + + return xrep_rmap_scan_iext(&rf, ifp); + case XFS_DINODE_FMT_META_BTREE: + return xrep_rmap_scan_meta_btree(&rf, ip); } - /* Scan incore extent cache if this isn't a realtime file. */ - if (xfs_ifork_is_realtime(ip, whichfork)) - return 0; - - return xrep_rmap_scan_iext(&rf, ifp); + return 0; } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c8ad2606f928..c95fe1b1de4e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2382,7 +2382,16 @@ xfs_iflush( __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto flush_out; } - if (S_ISREG(VFS_I(ip)->i_mode)) { + if (ip->i_df.if_format == XFS_DINODE_FMT_META_BTREE) { + if (!S_ISREG(VFS_I(ip)->i_mode) || + !(ip->i_diflags2 & XFS_DIFLAG2_METADATA)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad %s meta btree inode %Lu, ptr "PTR_FMT, + __func__, xfs_metafile_type_str(ip->i_metatype), + ip->i_ino, ip); + goto flush_out; + } + } else if (S_ISREG(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && ip->i_df.if_format != XFS_DINODE_FMT_BTREE, @@ -2422,6 +2431,14 @@ xfs_iflush( goto flush_out; } + if (xfs_inode_has_attr_fork(ip) && + ip->i_af.if_format == XFS_DINODE_FMT_META_BTREE) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: meta btree in inode %Lu attr fork, ptr "PTR_FMT, + __func__, ip->i_ino, ip); + goto flush_out; + } + /* * Inode item log recovery for v2 inodes are dependent on the flushiter * count for correct sequencing. 
We bump the flush iteration count so diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 912f0b1bc3cb..a174f64b8bb2 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -242,6 +242,7 @@ xfs_inode_item_data_fork_size( } break; case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_META_BTREE: if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { *nbytes += ip->i_df.if_broot_bytes; @@ -362,6 +363,7 @@ xfs_inode_item_format_data_fork( } break; case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_META_BTREE: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index e70d2611456b..6e9b3bfc718c 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -266,6 +266,35 @@ xlog_dinode_verify_extent_counts( return 0; } +static inline int +xlog_recover_inode_dbroot( + struct xfs_mount *mp, + void *src, + unsigned int len, + struct xfs_dinode *dip) +{ + void *dfork = XFS_DFORK_DPTR(dip); + unsigned int dsize = XFS_DFORK_DSIZE(dip, mp); + + switch (dip->di_format) { + case XFS_DINODE_FMT_BTREE: + xfs_bmbt_to_bmdr(mp, src, len, dfork, dsize); + break; + case XFS_DINODE_FMT_META_BTREE: + switch (be16_to_cpu(dip->di_metatype)) { + default: + ASSERT(0); + return -EFSCORRUPTED; + } + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + + return 0; +} + STATIC int xlog_recover_inode_commit_pass2( struct xlog *log, @@ -393,8 +422,9 @@ xlog_recover_inode_commit_pass2( if (unlikely(S_ISREG(ldip->di_mode))) { - if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && - (ldip->di_format != XFS_DINODE_FMT_BTREE)) { + if (ldip->di_format != XFS_DINODE_FMT_EXTENTS && + ldip->di_format != XFS_DINODE_FMT_BTREE && + ldip->di_format != XFS_DINODE_FMT_META_BTREE) { XFS_CORRUPTION_ERROR( "Bad log dinode data fork format for regular file", XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); @@ -475,9 +505,9 @@ xlog_recover_inode_commit_pass2( break; case XFS_ILOG_DBROOT: - xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, - (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip), - XFS_DFORK_DSIZE(dip, mp)); + error = xlog_recover_inode_dbroot(mp, src, len, dip); + if (error) + goto out_release; break; default: diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8b7bb1f5ae3c..a098935163b7 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2299,6 +2299,7 @@ TRACE_DEFINE_ENUM(XFS_DINODE_FMT_LOCAL); TRACE_DEFINE_ENUM(XFS_DINODE_FMT_EXTENTS); TRACE_DEFINE_ENUM(XFS_DINODE_FMT_BTREE); TRACE_DEFINE_ENUM(XFS_DINODE_FMT_UUID); +TRACE_DEFINE_ENUM(XFS_DINODE_FMT_META_BTREE); DECLARE_EVENT_CLASS(xfs_swap_extent_class, TP_PROTO(struct xfs_inode *ip, int which), -- cgit v1.2.3 From 6b08901a6e8fcda555f3ad39abd73bb0dd37f231 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:27 -0800 Subject: xfs: add realtime reverse map inode to metadata directory Add a metadir path to select the realtime rmap btree inode and load it at mount time. The rtrmapbt inode will have a unique extent format code, which means that we also have to update the inode validation and flush routines to look for it. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_format.h | 4 +++- fs/xfs/libxfs/xfs_inode_buf.c | 9 +++++++++ fs/xfs/libxfs/xfs_inode_fork.c | 6 ++++++ fs/xfs/libxfs/xfs_rtgroup.c | 20 ++++++++++++++++++-- fs/xfs/libxfs/xfs_rtgroup.h | 8 ++++++++ fs/xfs/libxfs/xfs_rtrmap_btree.c | 6 +++--- 6 files changed, 47 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 41ea4283c43c..f32c9fda5a19 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -857,6 +857,7 @@ enum xfs_metafile_type { XFS_METAFILE_PRJQUOTA, /* project quota */ XFS_METAFILE_RTBITMAP, /* rt bitmap */ XFS_METAFILE_RTSUMMARY, /* rt summary */ + XFS_METAFILE_RTRMAP, /* rt rmap */ XFS_METAFILE_MAX } __packed; @@ -868,7 +869,8 @@ enum xfs_metafile_type { { XFS_METAFILE_GRPQUOTA, "grpquota" }, \ { XFS_METAFILE_PRJQUOTA, "prjquota" }, \ { XFS_METAFILE_RTBITMAP, "rtbitmap" }, \ - { XFS_METAFILE_RTSUMMARY, "rtsummary" } + { XFS_METAFILE_RTSUMMARY, "rtsummary" }, \ + { XFS_METAFILE_RTRMAP, "rtrmap" } /* * On-disk inode structure. diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 1648d72d6ed9..17cb91b89fca 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -447,6 +447,15 @@ xfs_dinode_verify_fork( if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA))) return __this_address; switch (be16_to_cpu(dip->di_metatype)) { + case XFS_METAFILE_RTRMAP: + /* + * growfs must create the rtrmap inodes before adding a + * realtime volume to the filesystem, so we cannot use + * the rtrmapbt predicate here. + */ + if (!xfs_has_rmapbt(mp)) + return __this_address; + break; default: return __this_address; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 1a782339396d..7c2b071a5838 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -269,6 +269,9 @@ xfs_iformat_data_fork( return xfs_iformat_btree(ip, dip, XFS_DATA_FORK); case XFS_DINODE_FMT_META_BTREE: switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + ASSERT(0); /* to be implemented later */ + return -EFSCORRUPTED; default: break; } @@ -614,6 +617,9 @@ xfs_iflush_fork( break; switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + ASSERT(0); /* to be implemented later */ + break; default: ASSERT(0); break; diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index 9e5fdc0dc55c..1b56c13b2827 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -315,6 +315,8 @@ struct xfs_rtginode_ops { unsigned int sick; /* rtgroup sickness flag */ + unsigned int fmt_mask; /* all valid data fork formats */ + /* Does the fs have this feature? 
*/ bool (*enabled)(struct xfs_mount *mp); @@ -330,14 +332,29 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { .name = "bitmap", .metafile_type = XFS_METAFILE_RTBITMAP, .sick = XFS_SICK_RG_BITMAP, + .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | + (1U << XFS_DINODE_FMT_BTREE), .create = xfs_rtbitmap_create, }, [XFS_RTGI_SUMMARY] = { .name = "summary", .metafile_type = XFS_METAFILE_RTSUMMARY, .sick = XFS_SICK_RG_SUMMARY, + .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | + (1U << XFS_DINODE_FMT_BTREE), .create = xfs_rtsummary_create, }, + [XFS_RTGI_RMAP] = { + .name = "rmap", + .metafile_type = XFS_METAFILE_RTRMAP, + .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, + /* + * growfs must create the rtrmap inodes before adding a + * realtime volume to the filesystem, so we cannot use the + * rtrmapbt predicate here. + */ + .enabled = xfs_has_rmapbt, + }, }; /* Return the shortname of this rtgroup inode. */ @@ -434,8 +451,7 @@ xfs_rtginode_load( return error; } - if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && - ip->i_df.if_format != XFS_DINODE_FMT_BTREE)) { + if (XFS_IS_CORRUPT(mp, !((1U << ip->i_df.if_format) & ops->fmt_mask))) { xfs_irele(ip); xfs_rtginode_mark_sick(rtg, type); return -EFSCORRUPTED; diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index dc3ce660a013..5b61291d2669 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -14,6 +14,7 @@ struct xfs_trans; enum xfs_rtg_inodes { XFS_RTGI_BITMAP, /* allocation bitmap */ XFS_RTGI_SUMMARY, /* allocation summary */ + XFS_RTGI_RMAP, /* rmap btree inode */ XFS_RTGI_MAX, }; @@ -74,6 +75,11 @@ static inline struct xfs_inode *rtg_summary(const struct xfs_rtgroup *rtg) return rtg->rtg_inodes[XFS_RTGI_SUMMARY]; } +static inline struct xfs_inode *rtg_rmap(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_RMAP]; +} + /* Passive rtgroup references */ static inline struct xfs_rtgroup * xfs_rtgroup_get( @@ -284,6 +290,8 @@ int xfs_rtginode_create(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type, bool init); void xfs_rtginode_irele(struct xfs_inode **ipp); +void xfs_rtginode_irele(struct xfs_inode **ipp); + static inline const char *xfs_rtginode_path(xfs_rgnumber_t rgno, enum xfs_rtg_inodes type) { diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index 99d828bb5fe7..22aabf326b2c 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -18,6 +18,7 @@ #include "xfs_alloc.h" #include "xfs_btree.h" #include "xfs_btree_staging.h" +#include "xfs_metafile.h" #include "xfs_rmap.h" #include "xfs_rtrmap_btree.h" #include "xfs_trace.h" @@ -405,12 +406,10 @@ xfs_rtrmapbt_init_cursor( struct xfs_trans *tp, struct xfs_rtgroup *rtg) { - struct xfs_inode *ip = NULL; + struct xfs_inode *ip = rtg_rmap(rtg); struct xfs_mount *mp = rtg_mount(rtg); struct xfs_btree_cur *cur; - return NULL; /* XXX */ - xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_ops, @@ -439,6 +438,7 @@ xfs_rtrmapbt_commit_staged_btree( int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE); /* * Free any resources hanging off the real fork, then shallow-copy the -- cgit v1.2.3 From 8491a55cfc73ff5c2c637a70ade51d4d08abb90a Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Wed, 20 Nov 2024 16:20:28 -0800 Subject: xfs: add metadata reservations for realtime rmap btrees Reserve some free blocks so that we will always have enough free blocks in the data volume to handle expansion of the realtime rmap btree. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtrmap_btree.c | 41 ++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_rtrmap_btree.h | 2 ++ fs/xfs/xfs_rtalloc.c | 23 +++++++++++++++++++++- 3 files changed, 65 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index 22aabf326b2c..066deadcaac9 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -540,3 +540,44 @@ xfs_rtrmapbt_compute_maxlevels( /* Add one level to handle the inode root level. */ mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1; } + +/* Calculate the rtrmap btree size for some records. */ +static unsigned long long +xfs_rtrmapbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_rtrmap_mnr, len); +} + +/* + * Calculate the maximum rmap btree size. + */ +static unsigned long long +xfs_rtrmapbt_max_size( + struct xfs_mount *mp, + xfs_rtblock_t rtblocks) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_rtrmap_mxr[0] == 0) + return 0; + + return xfs_rtrmapbt_calc_size(mp, rtblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +xfs_filblks_t +xfs_rtrmapbt_calc_reserves( + struct xfs_mount *mp) +{ + uint32_t blocks = mp->m_groups[XG_TYPE_RTG].blocks; + + if (!xfs_has_rtrmapbt(mp)) + return 0; + + /* Reserve 1% of the rtgroup or enough for 1 block per record. */ + return max_t(xfs_filblks_t, blocks / 100, + xfs_rtrmapbt_max_size(mp, blocks)); +} diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h index 7d1a3a49a2d6..eaa2942297e2 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.h +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h @@ -79,4 +79,6 @@ unsigned int xfs_rtrmapbt_maxlevels_ondisk(void); int __init xfs_rtrmapbt_init_cur_cache(void); void xfs_rtrmapbt_destroy_cur_cache(void); +xfs_filblks_t xfs_rtrmapbt_calc_reserves(struct xfs_mount *mp); + #endif /* __XFS_RTRMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 4cd2f32aa70a..2245f9ecaa33 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -22,6 +22,7 @@ #include "xfs_rtalloc.h" #include "xfs_sb.h" #include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" #include "xfs_quota.h" #include "xfs_log_priv.h" #include "xfs_health.h" @@ -1498,6 +1499,13 @@ void xfs_rt_resv_free( struct xfs_mount *mp) { + struct xfs_rtgroup *rtg = NULL; + unsigned int i; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + for (i = 0; i < XFS_RTGI_MAX; i++) + xfs_metafile_resv_free(rtg->rtg_inodes[i]); + } } /* Reserve space for rt metadata inodes' space expansion. */ @@ -1505,7 +1513,20 @@ int xfs_rt_resv_init( struct xfs_mount *mp) { - return 0; + struct xfs_rtgroup *rtg = NULL; + xfs_filblks_t ask; + int error = 0; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + int err2; + + ask = xfs_rtrmapbt_calc_reserves(mp); + err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask); + if (err2 && !error) + error = err2; + } + + return error; } /* -- cgit v1.2.3 From f33659e8a114e2c17108227d30a2bdf398e39bdb Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Wed, 20 Nov 2024 16:20:29 -0800 Subject: xfs: wire up a new metafile type for the realtime rmap Plumb in the pieces we need to embed the root of the realtime rmap btree in an inode's data fork, complete with new metafile type and on-disk interpretation functions. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_format.h | 8 ++ fs/xfs/libxfs/xfs_inode_fork.c | 6 +- fs/xfs/libxfs/xfs_ondisk.h | 1 + fs/xfs/libxfs/xfs_rtrmap_btree.c | 251 +++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_rtrmap_btree.h | 112 +++++++++++++++++ fs/xfs/xfs_inode_item_recover.c | 4 + 6 files changed, 379 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index f32c9fda5a19..fba4e59aded4 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1736,6 +1736,14 @@ typedef __be32 xfs_rmap_ptr_t; */ #define XFS_RTRMAP_CRC_MAGIC 0x4d415052 /* 'MAPR' */ +/* + * rtrmap root header, on-disk form only. + */ +struct xfs_rtrmap_root { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +}; + /* inode-based btree pointer type */ typedef __be64 xfs_rtrmap_ptr_t; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 7c2b071a5838..d9b3c182cb40 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -27,6 +27,7 @@ #include "xfs_errortag.h" #include "xfs_health.h" #include "xfs_symlink_remote.h" +#include "xfs_rtrmap_btree.h" struct kmem_cache *xfs_ifork_cache; @@ -270,8 +271,7 @@ xfs_iformat_data_fork( case XFS_DINODE_FMT_META_BTREE: switch (ip->i_metatype) { case XFS_METAFILE_RTRMAP: - ASSERT(0); /* to be implemented later */ - return -EFSCORRUPTED; + return xfs_iformat_rtrmap(ip, dip); default: break; } @@ -618,7 +618,7 @@ xfs_iflush_fork( switch (ip->i_metatype) { case XFS_METAFILE_RTRMAP: - ASSERT(0); /* to be implemented later */ + xfs_iflush_rtrmap(ip, dip); break; default: ASSERT(0); diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 2c50877a1a2f..07e2f5fb3a94 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -84,6 +84,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo, 48); XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root, 4); /* * m68k has problems with struct xfs_attr_leaf_name_remote, but we pad diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index 066deadcaac9..af0df0c7e61e 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -77,6 +77,39 @@ xfs_rtrmapbt_get_maxrecs( return cur->bc_mp->m_rtrmap_mxr[level != 0]; } +/* Calculate number of records in the ondisk realtime rmap btree inode root. */ +unsigned int +xfs_rtrmapbt_droot_maxrecs( + unsigned int blocklen, + bool leaf) +{ + blocklen -= sizeof(struct xfs_rtrmap_root); + + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / (2 * sizeof(struct xfs_rmap_key) + + sizeof(xfs_rtrmap_ptr_t)); +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_rtrmapbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork + * so that we can resize the in-memory buffer to match it. 
After a + * resize to the maximum size this function returns the same value + * as xfs_rtrmapbt_get_maxrecs for the root node, too. + */ +STATIC int +xfs_rtrmapbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_rtrmap_mxr[level != 0]; + return xfs_rtrmapbt_droot_maxrecs(cur->bc_ino.forksize, level == 0); +} + /* * Convert the ondisk record's offset field into the ondisk key's offset field. * Fork and bmbt are significant parts of the rmap record key, but written @@ -369,6 +402,87 @@ xfs_rtrmapbt_keys_contiguous( be32_to_cpu(key2->rmap.rm_startblock)); } +static inline void +xfs_rtrmapbt_move_ptrs( + struct xfs_mount *mp, + struct xfs_btree_block *broot, + short old_size, + size_t new_size, + unsigned int numrecs) +{ + void *dptr; + void *sptr; + + sptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, old_size); + dptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, new_size); + memmove(dptr, sptr, numrecs * sizeof(xfs_rtrmap_ptr_t)); +} + +static struct xfs_btree_block * +xfs_rtrmapbt_broot_realloc( + struct xfs_btree_cur *cur, + unsigned int new_numrecs) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + struct xfs_btree_block *broot; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; + const unsigned int level = cur->bc_nlevels - 1; + + new_size = xfs_rtrmap_broot_space_calc(mp, level, new_numrecs); + + /* Handle the nop case quietly. */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; + + /* + * If there wasn't any memory allocated before, just allocate + * it now and get out. + */ + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); + + /* + * If there is already an existing if_broot, then we need to + * realloc it and possibly move the node block pointers because + * those are not butted up against the btree block header. + */ + old_numrecs = xfs_rtrmapbt_maxrecs(mp, old_size, level == 0); + broot = xfs_broot_realloc(ifp, new_size); + if (level > 0) + xfs_rtrmapbt_move_ptrs(mp, broot, old_size, new_size, + old_numrecs); + goto out_broot; + } + + /* + * We're reducing numrecs. If we're going all the way to zero, just + * free the block. + */ + ASSERT(ifp->if_broot != NULL && old_size > 0); + if (new_size == 0) + return xfs_broot_realloc(ifp, 0); + + /* + * Shrink the btree root by possibly moving the rtrmapbt pointers, + * since they are not butted up against the btree block header. Then + * reallocate broot. 
+ */ + if (level > 0) + xfs_rtrmapbt_move_ptrs(mp, ifp->if_broot, old_size, new_size, + new_numrecs); + broot = xfs_broot_realloc(ifp, new_size); + +out_broot: + ASSERT(xfs_rtrmap_droot_space(broot) <= + xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork)); + return broot; +} + const struct xfs_btree_ops xfs_rtrmapbt_ops = { .name = "rtrmap", .type = XFS_BTREE_TYPE_INODE, @@ -388,6 +502,7 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = { .free_block = xfs_btree_free_metafile_block, .get_minrecs = xfs_rtrmapbt_get_minrecs, .get_maxrecs = xfs_rtrmapbt_get_maxrecs, + .get_dmaxrecs = xfs_rtrmapbt_get_dmaxrecs, .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec, .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, @@ -398,6 +513,7 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = { .keys_inorder = xfs_rtrmapbt_keys_inorder, .recs_inorder = xfs_rtrmapbt_recs_inorder, .keys_contiguous = xfs_rtrmapbt_keys_contiguous, + .broot_realloc = xfs_rtrmapbt_broot_realloc, }; /* Allocate a new rt rmap btree cursor. */ @@ -581,3 +697,138 @@ xfs_rtrmapbt_calc_reserves( return max_t(xfs_filblks_t, blocks / 100, xfs_rtrmapbt_max_size(mp, blocks)); } + +/* Convert on-disk form of btree root to in-memory form. */ +STATIC void +xfs_rtrmapbt_from_disk( + struct xfs_inode *ip, + struct xfs_rtrmap_root *dblock, + unsigned int dblocklen, + struct xfs_btree_block *rblock) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_rmap_key *fkp; + __be64 *fpp; + struct xfs_rmap_key *tkp; + __be64 *tpp; + struct xfs_rmap_rec *frp; + struct xfs_rmap_rec *trp; + unsigned int rblocklen = xfs_rtrmap_broot_space(mp, dblock); + unsigned int numrecs; + unsigned int maxrecs; + + xfs_btree_init_block(mp, rblock, &xfs_rtrmapbt_ops, 0, 0, ip->i_ino); + + rblock->bb_level = dblock->bb_level; + rblock->bb_numrecs = dblock->bb_numrecs; + numrecs = be16_to_cpu(dblock->bb_numrecs); + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrmap_droot_key_addr(dblock, 1); + tkp = xfs_rtrmap_key_addr(rblock, 1); + fpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs); + tpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrmap_droot_rec_addr(dblock, 1); + trp = xfs_rtrmap_rec_addr(rblock, 1); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Load a realtime reverse mapping btree root in from disk. */ +int +xfs_iformat_rtrmap( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + struct xfs_btree_block *broot; + unsigned int numrecs; + unsigned int level; + int dsize; + + /* + * growfs must create the rtrmap inodes before adding a realtime volume + * to the filesystem, so we cannot use the rtrmapbt predicate here. + */ + if (!xfs_has_rmapbt(ip->i_mount)) + return -EFSCORRUPTED; + + dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); + numrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > mp->m_rtrmap_maxlevels || + xfs_rtrmap_droot_space_calc(level, numrecs) > dsize) + return -EFSCORRUPTED; + + broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), + xfs_rtrmap_broot_space_calc(mp, level, numrecs)); + if (broot) + xfs_rtrmapbt_from_disk(ip, dfp, dsize, broot); + return 0; +} + +/* Convert in-memory form of btree root to on-disk form. 
*/ +void +xfs_rtrmapbt_to_disk( + struct xfs_mount *mp, + struct xfs_btree_block *rblock, + unsigned int rblocklen, + struct xfs_rtrmap_root *dblock, + unsigned int dblocklen) +{ + struct xfs_rmap_key *fkp; + __be64 *fpp; + struct xfs_rmap_key *tkp; + __be64 *tpp; + struct xfs_rmap_rec *frp; + struct xfs_rmap_rec *trp; + unsigned int numrecs; + unsigned int maxrecs; + + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTRMAP_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); + + dblock->bb_level = rblock->bb_level; + dblock->bb_numrecs = rblock->bb_numrecs; + numrecs = be16_to_cpu(rblock->bb_numrecs); + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrmap_key_addr(rblock, 1); + tkp = xfs_rtrmap_droot_key_addr(dblock, 1); + fpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen); + tpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrmap_rec_addr(rblock, 1); + trp = xfs_rtrmap_droot_rec_addr(dblock, 1); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Flush a realtime reverse mapping btree root out to disk. */ +void +xfs_iflush_rtrmap( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + ASSERT(ifp->if_broot != NULL); + ASSERT(ifp->if_broot_bytes > 0); + ASSERT(xfs_rtrmap_droot_space(ifp->if_broot) <= + xfs_inode_fork_size(ip, XFS_DATA_FORK)); + xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes, + dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); +} diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h index eaa2942297e2..e97695066920 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.h +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h @@ -25,6 +25,7 @@ void xfs_rtrmapbt_commit_staged_btree(struct xfs_btree_cur *cur, unsigned int xfs_rtrmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, bool leaf); void xfs_rtrmapbt_compute_maxlevels(struct xfs_mount *mp); +unsigned int xfs_rtrmapbt_droot_maxrecs(unsigned int blocklen, bool leaf); /* * Addresses of records, keys, and pointers within an incore rtrmapbt block. @@ -81,4 +82,115 @@ void xfs_rtrmapbt_destroy_cur_cache(void); xfs_filblks_t xfs_rtrmapbt_calc_reserves(struct xfs_mount *mp); +/* Addresses of key, pointers, and records within an ondisk rtrmapbt block. 
*/ + +static inline struct xfs_rmap_rec * +xfs_rtrmap_droot_rec_addr( + struct xfs_rtrmap_root *block, + unsigned int index) +{ + return (struct xfs_rmap_rec *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_rmap_rec)); +} + +static inline struct xfs_rmap_key * +xfs_rtrmap_droot_key_addr( + struct xfs_rtrmap_root *block, + unsigned int index) +{ + return (struct xfs_rmap_key *) + ((char *)(block + 1) + + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); +} + +static inline xfs_rtrmap_ptr_t * +xfs_rtrmap_droot_ptr_addr( + struct xfs_rtrmap_root *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrmap_ptr_t *) + ((char *)(block + 1) + + maxrecs * 2 * sizeof(struct xfs_rmap_key) + + (index - 1) * sizeof(xfs_rtrmap_ptr_t)); +} + +/* + * Address of pointers within the incore btree root. + * + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +static inline xfs_rtrmap_ptr_t * +xfs_rtrmap_broot_ptr_addr( + struct xfs_mount *mp, + struct xfs_btree_block *bb, + unsigned int index, + unsigned int block_size) +{ + return xfs_rtrmap_ptr_addr(bb, index, + xfs_rtrmapbt_maxrecs(mp, block_size, false)); +} + +/* + * Compute the space required for the incore btree root containing the given + * number of records. + */ +static inline size_t +xfs_rtrmap_broot_space_calc( + struct xfs_mount *mp, + unsigned int level, + unsigned int nrecs) +{ + size_t sz = XFS_RTRMAP_BLOCK_LEN; + + if (level > 0) + return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) + + sizeof(xfs_rtrmap_ptr_t)); + return sz + nrecs * sizeof(struct xfs_rmap_rec); +} + +/* + * Compute the space required for the incore btree root given the ondisk + * btree root block. + */ +static inline size_t +xfs_rtrmap_broot_space(struct xfs_mount *mp, struct xfs_rtrmap_root *bb) +{ + return xfs_rtrmap_broot_space_calc(mp, be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +/* Compute the space required for the ondisk root block. */ +static inline size_t +xfs_rtrmap_droot_space_calc( + unsigned int level, + unsigned int nrecs) +{ + size_t sz = sizeof(struct xfs_rtrmap_root); + + if (level > 0) + return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) + + sizeof(xfs_rtrmap_ptr_t)); + return sz + nrecs * sizeof(struct xfs_rmap_rec); +} + +/* + * Compute the space required for the ondisk root block given an incore root + * block. 
+ */ +static inline size_t +xfs_rtrmap_droot_space(struct xfs_btree_block *bb) +{ + return xfs_rtrmap_droot_space_calc(be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +int xfs_iformat_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip); +void xfs_rtrmapbt_to_disk(struct xfs_mount *mp, struct xfs_btree_block *rblock, + unsigned int rblocklen, struct xfs_rtrmap_root *dblock, + unsigned int dblocklen); +void xfs_iflush_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip); + #endif /* __XFS_RTRMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 6e9b3bfc718c..5de1d3563b76 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -22,6 +22,7 @@ #include "xfs_log_recover.h" #include "xfs_icache.h" #include "xfs_bmap_btree.h" +#include "xfs_rtrmap_btree.h" STATIC void xlog_recover_inode_ra_pass2( @@ -282,6 +283,9 @@ xlog_recover_inode_dbroot( break; case XFS_DINODE_FMT_META_BTREE: switch (be16_to_cpu(dip->di_metatype)) { + case XFS_METAFILE_RTRMAP: + xfs_rtrmapbt_to_disk(mp, src, len, dfork, dsize); + return 0; default: ASSERT(0); return -EFSCORRUPTED; -- cgit v1.2.3 From 609a592865c9e66a1c00eb7b8ee7436eea3c39a3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:30 -0800 Subject: xfs: wire up rmap map and unmap to the realtime rmapbt Connect the map and unmap reverse-mapping operations to the realtime rmapbt via the deferred operation callbacks. This enables us to perform rmap operations against the correct btree. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap.c | 78 ++++++++++++++++++++++++++++++--------------- fs/xfs/libxfs/xfs_rtgroup.c | 9 ++++++ fs/xfs/libxfs/xfs_rtgroup.h | 5 ++- 3 files changed, 66 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 8d3cea90c7cd..2f0688a57991 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -26,6 +26,7 @@ #include "xfs_health.h" #include "xfs_rmap_item.h" #include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" struct kmem_cache *xfs_rmap_intent_cache; @@ -2619,6 +2620,47 @@ __xfs_rmap_finish_intent( } } +static int +xfs_rmap_finish_init_cursor( + struct xfs_trans *tp, + struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur) +{ + struct xfs_perag *pag = to_perag(ri->ri_group); + struct xfs_buf *agbp = NULL; + int error; + + /* + * Refresh the freelist before we start changing the rmapbt, because a + * shape change could cause us to allocate blocks. + */ + error = xfs_free_extent_fix_freelist(tp, pag, &agbp); + if (error) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); + return error; + } + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); + return -EFSCORRUPTED; + } + *pcur = xfs_rmapbt_init_cursor(tp->t_mountp, tp, agbp, pag); + return 0; +} + +static int +xfs_rtrmap_finish_init_cursor( + struct xfs_trans *tp, + struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur) +{ + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + *pcur = xfs_rtrmapbt_init_cursor(tp, rtg); + return 0; +} + /* * Process one of the deferred rmap operations. We pass back the * btree cursor to maintain our lock on the rmapbt between calls. 
@@ -2634,8 +2676,6 @@ xfs_rmap_finish_one( { struct xfs_owner_info oinfo; struct xfs_mount *mp = tp->t_mountp; - struct xfs_btree_cur *rcur = *pcur; - struct xfs_buf *agbp = NULL; xfs_agblock_t bno; bool unwritten; int error = 0; @@ -2649,38 +2689,26 @@ xfs_rmap_finish_one( * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ - if (rcur != NULL && rcur->bc_group != ri->ri_group) { - xfs_btree_del_cursor(rcur, 0); - rcur = NULL; + if (*pcur != NULL && (*pcur)->bc_group != ri->ri_group) { + xfs_btree_del_cursor(*pcur, 0); *pcur = NULL; } - if (rcur == NULL) { - struct xfs_perag *pag = to_perag(ri->ri_group); - - /* - * Refresh the freelist before we start changing the - * rmapbt, because a shape change could cause us to - * allocate blocks. - */ - error = xfs_free_extent_fix_freelist(tp, pag, &agbp); - if (error) { - xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); + if (*pcur == NULL) { + if (ri->ri_group->xg_type == XG_TYPE_RTG) + error = xfs_rtrmap_finish_init_cursor(tp, ri, pcur); + else + error = xfs_rmap_finish_init_cursor(tp, ri, pcur); + if (error) return error; - } - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { - xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); - return -EFSCORRUPTED; - } - - *pcur = rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); } xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, ri->ri_bmap.br_startoff); unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN; - bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); - error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno, + bno = xfs_fsb_to_gbno(mp, ri->ri_bmap.br_startblock, + ri->ri_group->xg_type); + error = __xfs_rmap_finish_intent(*pcur, ri->ri_type, bno, ri->ri_bmap.br_blockcount, &oinfo, unwritten); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index 1b56c13b2827..af1716ec0691 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -202,6 +202,9 @@ xfs_rtgroup_lock( } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); } + + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) + xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_EXCL); } /* Unlock metadata inodes associated with this rt group. */ @@ -214,6 +217,9 @@ xfs_rtgroup_unlock( ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || !(rtglock_flags & XFS_RTGLOCK_BITMAP)); + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) + xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL); + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); @@ -239,6 +245,9 @@ xfs_rtgroup_trans_join( xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL); } + + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) + xfs_trans_ijoin(tp, rtg_rmap(rtg), XFS_ILOCK_EXCL); } /* Retrieve rt group geometry. 
*/ diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 5b61291d2669..733da7417c9c 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -265,9 +265,12 @@ int xfs_update_last_rtgroup_size(struct xfs_mount *mp, #define XFS_RTGLOCK_BITMAP (1U << 0) /* Lock the rt bitmap inode in shared mode */ #define XFS_RTGLOCK_BITMAP_SHARED (1U << 1) +/* Lock the rt rmap inode in exclusive mode */ +#define XFS_RTGLOCK_RMAP (1U << 2) #define XFS_RTGLOCK_ALL_FLAGS (XFS_RTGLOCK_BITMAP | \ - XFS_RTGLOCK_BITMAP_SHARED) + XFS_RTGLOCK_BITMAP_SHARED | \ + XFS_RTGLOCK_RMAP) void xfs_rtgroup_lock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); -- cgit v1.2.3 From 71b8acb42be60e11810eb43a6f470589fcf7b7dd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:30 -0800 Subject: xfs: create routine to allocate and initialize a realtime rmap btree inode Create a library routine to allocate and initialize an empty realtime rmapbt inode. We'll use this for mkfs and repair. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtgroup.c | 2 ++ fs/xfs/libxfs/xfs_rtrmap_btree.c | 54 ++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_rtrmap_btree.h | 5 ++++ fs/xfs/xfs_rtalloc.c | 12 +++++++-- 4 files changed, 71 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index af1716ec0691..5f31b6e65d5d 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -33,6 +33,7 @@ #include "xfs_rtbitmap.h" #include "xfs_metafile.h" #include "xfs_metadir.h" +#include "xfs_rtrmap_btree.h" /* Find the first usable fsblock in this rtgroup. */ static inline uint32_t @@ -363,6 +364,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { * rtrmapbt predicate here. */ .enabled = xfs_has_rmapbt, + .create = xfs_rtrmapbt_create, }, }; diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index af0df0c7e61e..4de7720d2f4e 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -832,3 +832,57 @@ xfs_iflush_rtrmap( xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); } + +/* + * Create a realtime rmap btree inode. + */ +int +xfs_rtrmapbt_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *broot; + + ifp->if_format = XFS_DINODE_FMT_META_BTREE; + ASSERT(ifp->if_broot_bytes == 0); + ASSERT(ifp->if_bytes == 0); + + /* Initialize the empty incore btree root. */ + broot = xfs_broot_realloc(ifp, xfs_rtrmap_broot_space_calc(mp, 0, 0)); + if (broot) + xfs_btree_init_block(mp, broot, &xfs_rtrmapbt_ops, 0, 0, + ip->i_ino); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT); + + return 0; +} + +/* + * Initialize an rmap for a realtime superblock using the potentially updated + * rt geometry in the provided @mp. 
+ */ +int +xfs_rtrmapbt_init_rtsb( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg, + struct xfs_trans *tp) +{ + struct xfs_rmap_irec rmap = { + .rm_blockcount = mp->m_sb.sb_rextsize, + .rm_owner = XFS_RMAP_OWN_FS, + }; + struct xfs_btree_cur *cur; + int error; + + ASSERT(xfs_has_rtsb(mp)); + ASSERT(rtg_rgno(rtg) == 0); + + cur = xfs_rtrmapbt_init_cursor(tp, rtg); + error = xfs_rmap_map_raw(cur, &rmap); + xfs_btree_del_cursor(cur, error); + return error; +} diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h index e97695066920..bf73460be274 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.h +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h @@ -193,4 +193,9 @@ void xfs_rtrmapbt_to_disk(struct xfs_mount *mp, struct xfs_btree_block *rblock, unsigned int dblocklen); void xfs_iflush_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip); +int xfs_rtrmapbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); +int xfs_rtrmapbt_init_rtsb(struct xfs_mount *mp, struct xfs_rtgroup *rtg, + struct xfs_trans *tp); + #endif /* __XFS_RTRMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 2245f9ecaa33..c7efd9264139 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -846,6 +846,13 @@ xfs_growfs_rt_init_rtsb( mp->m_rtsb_bp = rtsb_bp; error = xfs_bwrite(rtsb_bp); xfs_buf_unlock(rtsb_bp); + if (error) + return error; + + /* Initialize the rtrmap to reflect the rtsb. */ + if (rtg_rmap(args->rtg) != NULL) + error = xfs_rtrmapbt_init_rtsb(nargs->mp, args->rtg, args->tp); + return error; } @@ -894,8 +901,9 @@ xfs_growfs_rt_bmblock( goto out_free; nargs.tp = args.tp; - xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP); - xfs_rtgroup_trans_join(args.tp, args.rtg, XFS_RTGLOCK_BITMAP); + xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(args.tp, args.rtg, + XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_RMAP); /* * Update the bitmap inode's size ondisk and incore. We need to update -- cgit v1.2.3 From b3683c74bf17e9f4e33cc8e13c73d29972b19765 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:31 -0800 Subject: xfs: wire up getfsmap to the realtime reverse mapping btree Connect the getfsmap ioctl to the realtime rmapbt. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_fsmap.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 173 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 3290dd8524a6..3e3ef16f65a3 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -26,6 +26,7 @@ #include "xfs_rtbitmap.h" #include "xfs_ag.h" #include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" /* Convert an xfs_fsmap to an fsmap. */ static void @@ -832,6 +833,174 @@ xfs_getfsmap_rtdev_rtbitmap( return error; } + +/* Transform a realtime rmapbt record into a fsmap */ +STATIC int +xfs_getfsmap_rtdev_rmapbt_helper( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_fsmap_irec frec = { + .owner = rec->rm_owner, + .offset = rec->rm_offset, + .rm_flags = rec->rm_flags, + .rec_key = rec->rm_startblock, + }; + struct xfs_getfsmap_info *info = priv; + + return xfs_getfsmap_group_helper(info, cur->bc_tp, cur->bc_group, + rec->rm_startblock, rec->rm_blockcount, &frec); +} + +/* Actually query the rtrmap btree. 
*/ +STATIC int +xfs_getfsmap_rtdev_rmapbt_query( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_btree_cur **curpp) +{ + struct xfs_rtgroup *rtg = to_rtg(info->group); + + /* Query the rtrmapbt */ + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + *curpp = xfs_rtrmapbt_init_cursor(tp, rtg); + return xfs_rmap_query_range(*curpp, &info->low, &info->high, + xfs_getfsmap_rtdev_rmapbt_helper, info); +} + +/* Execute a getfsmap query against the realtime device rmapbt. */ +STATIC int +xfs_getfsmap_rtdev_rmapbt( + struct xfs_trans *tp, + const struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtgroup *rtg = NULL; + struct xfs_btree_cur *bt_cur = NULL; + xfs_rtblock_t start_rtb; + xfs_rtblock_t end_rtb; + xfs_rgnumber_t start_rg, end_rg; + uint64_t eofs; + int error = 0; + + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + if (keys[0].fmr_physical >= eofs) + return 0; + start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical); + end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); + + info->missing_owner = XFS_FMR_OWN_FREE; + + /* + * Convert the fsmap low/high keys to rtgroup based keys. Initialize + * low to the fsmap low key and max out the high key to the end + * of the rtgroup. + */ + info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); + if (error) + return error; + info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length); + xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + + /* Adjust the low key if we are continuing from where we left off. */ + if (info->low.rm_blockcount == 0) { + /* No previous record from which to continue */ + } else if (rmap_not_shareable(mp, &info->low)) { + /* Last record seen was an unshareable extent */ + info->low.rm_owner = 0; + info->low.rm_offset = 0; + + start_rtb += info->low.rm_blockcount; + if (xfs_rtb_to_daddr(mp, start_rtb) >= eofs) + return 0; + } else { + /* Last record seen was a shareable file data extent */ + info->low.rm_offset += info->low.rm_blockcount; + } + info->low.rm_startblock = xfs_rtb_to_rgbno(mp, start_rtb); + + info->high.rm_startblock = -1U; + info->high.rm_owner = ULLONG_MAX; + info->high.rm_offset = ULLONG_MAX; + info->high.rm_blockcount = 0; + info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; + + start_rg = xfs_rtb_to_rgno(mp, start_rtb); + end_rg = xfs_rtb_to_rgno(mp, end_rtb); + + while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rg, end_rg))) { + /* + * Set the rtgroup high key from the fsmap high key if this + * is the last rtgroup that we're querying. + */ + info->group = rtg_group(rtg); + if (rtg_rgno(rtg) == end_rg) { + info->high.rm_startblock = + xfs_rtb_to_rgbno(mp, end_rtb); + info->high.rm_offset = + XFS_BB_TO_FSBT(mp, keys[1].fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); + if (error) + break; + xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); + } + + if (bt_cur) { + xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group), + XFS_RTGLOCK_RMAP); + xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR); + bt_cur = NULL; + } + + trace_xfs_fsmap_low_group_key(mp, info->dev, rtg_rgno(rtg), + &info->low); + trace_xfs_fsmap_high_group_key(mp, info->dev, rtg_rgno(rtg), + &info->high); + + error = xfs_getfsmap_rtdev_rmapbt_query(tp, info, &bt_cur); + if (error) + break; + + /* + * Set the rtgroup low key to the start of the rtgroup prior to + * moving on to the next rtgroup. 
+ */ + if (rtg_rgno(rtg) == start_rg) + memset(&info->low, 0, sizeof(info->low)); + + /* + * If this is the last rtgroup, report any gap at the end of it + * before we drop the reference to the perag when the loop + * terminates. + */ + if (rtg_rgno(rtg) == end_rg) { + info->last = true; + error = xfs_getfsmap_rtdev_rmapbt_helper(bt_cur, + &info->high, info); + if (error) + break; + } + info->group = NULL; + } + + if (bt_cur) { + xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group), + XFS_RTGLOCK_RMAP); + xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR : + XFS_BTREE_NOERROR); + } + + /* loop termination case */ + if (rtg) { + info->group = NULL; + xfs_rtgroup_rele(rtg); + } + + return error; +} #endif /* CONFIG_XFS_RT */ /* Do we recognize the device? */ @@ -971,7 +1140,10 @@ xfs_getfsmap( if (mp->m_rtdev_targp) { handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); - handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; + if (use_rmap) + handlers[2].fn = xfs_getfsmap_rtdev_rmapbt; + else + handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; } #endif /* CONFIG_XFS_RT */ -- cgit v1.2.3 From 59a57acbce282d5ff0ddcc308b934549b340c713 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:32 -0800 Subject: xfs: check that the rtrmapbt maxlevels doesn't increase when growing fs The size of filesystem transaction reservations depends on the maximum height (maxlevels) of the realtime btrees. Since we don't want a grow operation to increase the reservation size enough that we'll fail the minimum log size checks on the next mount, constrain growfs operations if they would cause an increase in those maxlevels. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_fsops.c | 11 +++++++++++ fs/xfs/xfs_rtalloc.c | 25 ++++++++++++++++++------- fs/xfs/xfs_rtalloc.h | 10 ++++++++++ fs/xfs/xfs_trace.h | 21 +++++++++++++++++++++ 4 files changed, 60 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index e1145107d8cb..9df5a09c0acd 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -22,6 +22,7 @@ #include "xfs_ag_resv.h" #include "xfs_trace.h" #include "xfs_rtalloc.h" +#include "xfs_rtrmap_btree.h" /* * Write new AG headers to disk. Non-transactional, but need to be @@ -114,6 +115,12 @@ xfs_growfs_data_private( xfs_buf_relse(bp); } + /* Make sure the new fs size won't cause problems with the log. */ + error = xfs_growfs_check_rtgeom(mp, nb, mp->m_sb.sb_rblocks, + mp->m_sb.sb_rextsize); + if (error) + return error; + nb_div = nb; nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); if (nb_mod && nb_mod >= XFS_MIN_AG_BLOCKS) @@ -221,7 +228,11 @@ xfs_growfs_data_private( error = xfs_fs_reserve_ag_blocks(mp); if (error == -ENOSPC) error = 0; + + /* Compute new maxlevels for rt btrees. */ + xfs_rtrmapbt_compute_maxlevels(mp); } + return error; out_trans_cancel: diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index c7efd9264139..3c1bce5a4855 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -989,9 +989,11 @@ xfs_growfs_rt_bmblock( goto out_free; /* - * Ensure the mount RT feature flag is now set. + * Ensure the mount RT feature flag is now set, and compute new + * maxlevels for rt btrees. 
*/ mp->m_features |= XFS_FEAT_REALTIME; + xfs_rtrmapbt_compute_maxlevels(mp); kfree(nmp); return 0; @@ -1159,29 +1161,37 @@ out_rele: return error; } -static int +int xfs_growfs_check_rtgeom( const struct xfs_mount *mp, + xfs_rfsblock_t dblocks, xfs_rfsblock_t rblocks, xfs_extlen_t rextsize) { + xfs_extlen_t min_logfsbs; struct xfs_mount *nmp; - int error = 0; nmp = xfs_growfs_rt_alloc_fake_mount(mp, rblocks, rextsize); if (!nmp) return -ENOMEM; + nmp->m_sb.sb_dblocks = dblocks; + + xfs_rtrmapbt_compute_maxlevels(nmp); + xfs_trans_resv_calc(nmp, M_RES(nmp)); /* * New summary size can't be more than half the size of the log. This * prevents us from getting a log overflow, since we'll log basically * the whole summary file at once. */ - if (nmp->m_rsumblocks > (mp->m_sb.sb_logblocks >> 1)) - error = -EINVAL; + min_logfsbs = min_t(xfs_extlen_t, xfs_log_calc_minimum_size(nmp), + nmp->m_rsumblocks * 2); kfree(nmp); - return error; + + if (min_logfsbs > mp->m_sb.sb_logblocks) + return -EINVAL; + return 0; } /* @@ -1300,7 +1310,8 @@ xfs_growfs_rt( goto out_unlock; /* Make sure the new fs size won't cause problems with the log. */ - error = xfs_growfs_check_rtgeom(mp, in->newblocks, in->extsize); + error = xfs_growfs_check_rtgeom(mp, mp->m_sb.sb_dblocks, in->newblocks, + in->extsize); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index d87523e6a550..9044f7226ab6 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -46,6 +46,8 @@ xfs_growfs_rt( xfs_growfs_rt_t *in); /* user supplied growfs struct */ int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); +int xfs_growfs_check_rtgeom(const struct xfs_mount *mp, xfs_rfsblock_t dblocks, + xfs_rfsblock_t rblocks, xfs_agblock_t rextsize); #else # define xfs_growfs_rt(mp,in) (-ENOSYS) # define xfs_rtalloc_reinit_frextents(m) (0) @@ -65,6 +67,14 @@ xfs_rtmount_init( # define xfs_rtunmount_inodes(m) # define xfs_rt_resv_free(mp) ((void)0) # define xfs_rt_resv_init(mp) (0) + +static inline int +xfs_growfs_check_rtgeom(const struct xfs_mount *mp, + xfs_rfsblock_t dblocks, xfs_rfsblock_t rblocks, + xfs_extlen_t rextsize) +{ + return 0; +} #endif /* CONFIG_XFS_RT */ #endif /* __XFS_RTALLOC_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a098935163b7..84cdc145e2d9 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -5622,6 +5622,27 @@ DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical); DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error); +#ifdef CONFIG_XFS_RT +TRACE_EVENT(xfs_growfs_check_rtgeom, + TP_PROTO(const struct xfs_mount *mp, unsigned int min_logfsbs), + TP_ARGS(mp, min_logfsbs), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, logblocks) + __field(unsigned int, min_logfsbs) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->logblocks = mp->m_sb.sb_logblocks; + __entry->min_logfsbs = min_logfsbs; + ), + TP_printk("dev %d:%d logblocks %u min_logfsbs %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->logblocks, + __entry->min_logfsbs) +); +#endif /* CONFIG_XFS_RT */ + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 6d4933c221958d1e1848d5092a3e3d1c6e4a6f92 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Wed, 20 Nov 2024 16:20:33 -0800 Subject: xfs: report realtime rmap btree corruption errors to the health system Whenever we encounter corrupt realtime rmap btree blocks, we should report that to the health monitoring system for later reporting. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.h | 2 +- fs/xfs/libxfs/xfs_fs.h | 1 + fs/xfs/libxfs/xfs_health.h | 4 +++- fs/xfs/libxfs/xfs_rtgroup.c | 1 + fs/xfs/libxfs/xfs_rtrmap_btree.c | 10 ++++++++-- fs/xfs/xfs_health.c | 1 + 6 files changed, 15 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index ee82dc777d6d..dbc047b2fb2c 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -135,7 +135,7 @@ struct xfs_btree_ops { /* offset of btree stats array */ unsigned int statoff; - /* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */ + /* sick mask for health reporting (not for bmap btrees) */ unsigned int sick_mask; /* cursor operations */ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 41ce4d3d650e..7cca458ff812 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -993,6 +993,7 @@ struct xfs_rtgroup_geometry { #define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */ #define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */ #define XFS_RTGROUP_GEOM_SICK_SUMMARY (1U << 2) /* rtsummary */ +#define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */ /* * ioctl commands that are used by Linux filesystems diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index d34986ac18c3..5c8a0aff6ba6 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -70,6 +70,7 @@ struct xfs_rtgroup; #define XFS_SICK_RG_SUPER (1 << 0) /* rt group superblock */ #define XFS_SICK_RG_BITMAP (1 << 1) /* rt group bitmap */ #define XFS_SICK_RG_SUMMARY (1 << 2) /* rt groups summary */ +#define XFS_SICK_RG_RMAPBT (1 << 3) /* reverse mappings */ /* Observable health issues for AG metadata. 
*/ #define XFS_SICK_AG_SB (1 << 0) /* superblock */ @@ -115,7 +116,8 @@ struct xfs_rtgroup; #define XFS_SICK_RG_PRIMARY (XFS_SICK_RG_SUPER | \ XFS_SICK_RG_BITMAP | \ - XFS_SICK_RG_SUMMARY) + XFS_SICK_RG_SUMMARY | \ + XFS_SICK_RG_RMAPBT) #define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \ XFS_SICK_AG_AGF | \ diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index 5f31b6e65d5d..b7ed2d27d545 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -357,6 +357,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { [XFS_RTGI_RMAP] = { .name = "rmap", .metafile_type = XFS_METAFILE_RTRMAP, + .sick = XFS_SICK_RG_RMAPBT, .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, /* * growfs must create the rtrmap inodes before adding a diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index 4de7720d2f4e..19e5109c3683 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -27,6 +27,7 @@ #include "xfs_extent_busy.h" #include "xfs_rtgroup.h" #include "xfs_bmap.h" +#include "xfs_health.h" static struct kmem_cache *xfs_rtrmapbt_cur_cache; @@ -496,6 +497,7 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = { .lru_refs = XFS_RMAP_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_2), + .sick_mask = XFS_SICK_RG_RMAPBT, .dup_cursor = xfs_rtrmapbt_dup_cursor, .alloc_block = xfs_btree_alloc_metafile_block, @@ -755,16 +757,20 @@ xfs_iformat_rtrmap( * growfs must create the rtrmap inodes before adding a realtime volume * to the filesystem, so we cannot use the rtrmapbt predicate here. */ - if (!xfs_has_rmapbt(ip->i_mount)) + if (!xfs_has_rmapbt(ip->i_mount)) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; + } dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); numrecs = be16_to_cpu(dfp->bb_numrecs); level = be16_to_cpu(dfp->bb_level); if (level > mp->m_rtrmap_maxlevels || - xfs_rtrmap_droot_space_calc(level, numrecs) > dsize) + xfs_rtrmap_droot_space_calc(level, numrecs) > dsize) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; + } broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), xfs_rtrmap_broot_space_calc(mp, level, numrecs)); diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index c7c2e6561998..d438c3c001c8 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -447,6 +447,7 @@ static const struct ioctl_sick_map rtgroup_map[] = { { XFS_SICK_RG_SUPER, XFS_RTGROUP_GEOM_SICK_SUPER }, { XFS_SICK_RG_BITMAP, XFS_RTGROUP_GEOM_SICK_BITMAP }, { XFS_SICK_RG_SUMMARY, XFS_RTGROUP_GEOM_SICK_SUMMARY }, + { XFS_SICK_RG_RMAPBT, XFS_RTGROUP_GEOM_SICK_RMAPBT }, }; /* Fill out rtgroup geometry health info. */ -- cgit v1.2.3 From 428e4884656db965cb0dedc78dcf9a45013fe454 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:33 -0800 Subject: xfs: allow queued realtime intents to drain before scrubbing When a writer thread executes a chain of log intent items for the realtime volume, the ILOCKs taken during each step are for each rt metadata file, not the entire rt volume itself. Although scrub takes all rt metadata ILOCKs, this isn't sufficient to guard against scrub checking the rt volume while that writer thread is in the middle of finishing a chain because there's no higher level locking primitive guarding the realtime volume. When there's a collision, cross-referencing between data structures (e.g. rtrmapbt and rtrefcountbt) yields false corruption events; if repair is running, this results in incorrect repairs, which is catastrophic. 
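In rough terms, the scrub-side pattern this patch introduces in xchk_rtgroup_lock() is the loop sketched below: writer threads hold the group's intent counter (taken via xfs_group_intent_get() when deferred work is queued) for as long as an intent chain is in flight, so an idle counter means the rt metadata files are mutually consistent. This is only a condensed illustration of the hunk further down, not a verbatim copy of it:

	do {
		xfs_rtgroup_lock(sr->rtg, rtglock_flags);

		/* No intent chains in flight?  Cross-referencing is safe. */
		if (!xfs_group_intent_busy(rtg_group(sr->rtg)))
			break;

		/* Otherwise drop the ILOCKs and wait for writers to finish. */
		xfs_rtgroup_unlock(sr->rtg, rtglock_flags);
		error = xfs_group_intent_drain(rtg_group(sr->rtg));
		if (error)
			return error;
	} while (1);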
Fix this by adding to the mount structure the same drain that we use to protect scrub against concurrent AG updates, but this time for the realtime volume. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/bmap.c | 7 ++++- fs/xfs/scrub/common.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/common.h | 5 ++-- fs/xfs/scrub/rgsuper.c | 4 ++- fs/xfs/scrub/rtbitmap.c | 8 +++++- fs/xfs/scrub/rtsummary.c | 5 +++- fs/xfs/scrub/scrub.c | 2 +- fs/xfs/xfs_drain.c | 20 +++++++------- fs/xfs/xfs_drain.h | 7 +++-- 9 files changed, 108 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 0d7ad692822d..dd99366643f8 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -324,10 +324,15 @@ xchk_bmap_rt_iextent_xref( irec->br_startoff, &error)) return; - xchk_rtgroup_lock(&info->sc->sr, XCHK_RTGLOCK_ALL); + error = xchk_rtgroup_lock(info->sc, &info->sc->sr, XCHK_RTGLOCK_ALL); + if (!xchk_fblock_process_error(info->sc, info->whichfork, + irec->br_startoff, &error)) + goto out_free; + xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, irec->br_blockcount); +out_free: xchk_rtgroup_free(info->sc, &info->sc->sr); } diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 5cbd94b56582..613fb54e723e 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -719,13 +719,79 @@ xchk_rtgroup_init( return 0; } -void +/* Lock all the rt group metadata inode ILOCKs and wait for intents. */ +int xchk_rtgroup_lock( + struct xfs_scrub *sc, struct xchk_rt *sr, unsigned int rtglock_flags) { - xfs_rtgroup_lock(sr->rtg, rtglock_flags); + int error = 0; + + ASSERT(sr->rtg != NULL); + + /* + * If we're /only/ locking the rtbitmap in shared mode, then we're + * obviously not trying to compare records in two metadata inodes. + * There's no need to drain intents here because the caller (most + * likely the rgsuper scanner) doesn't need that level of consistency. + */ + if (rtglock_flags == XFS_RTGLOCK_BITMAP_SHARED) { + xfs_rtgroup_lock(sr->rtg, rtglock_flags); + sr->rtlock_flags = rtglock_flags; + return 0; + } + + do { + if (xchk_should_terminate(sc, &error)) + return error; + + xfs_rtgroup_lock(sr->rtg, rtglock_flags); + + /* + * If we've grabbed a non-metadata file for scrubbing, we + * assume that holding its ILOCK will suffice to coordinate + * with any rt intent chains involving this inode. + */ + if (sc->ip && !xfs_is_internal_inode(sc->ip)) + break; + + /* + * Decide if the rt group is quiet enough for all metadata to + * be consistent with each other. Regular file IO doesn't get + * to lock all the rt inodes at the same time, which means that + * there could be other threads in the middle of processing a + * chain of deferred ops. + * + * We just locked all the metadata inodes for this rt group; + * now take a look to see if there are any intents in progress. + * If there are, drop the rt group inode locks and wait for the + * intents to drain. Since we hold the rt group inode locks + * for the duration of the scrub, this is the only time we have + * to sample the intents counter; any threads increasing it + * after this point can't possibly be in the middle of a chain + * of rt metadata updates. + * + * Obviously, this should be slanted against scrub and in favor + * of runtime threads. 
+ */ + if (!xfs_group_intent_busy(rtg_group(sr->rtg))) + break; + + xfs_rtgroup_unlock(sr->rtg, rtglock_flags); + + if (!(sc->flags & XCHK_FSGATES_DRAIN)) + return -ECHRNG; + error = xfs_group_intent_drain(rtg_group(sr->rtg)); + if (error) { + if (error == -ERESTARTSYS) + error = -EINTR; + return error; + } + } while (1); + sr->rtlock_flags = rtglock_flags; + return 0; } /* @@ -1379,7 +1445,7 @@ xchk_fsgates_enable( trace_xchk_fsgates_enable(sc, scrub_fsgates); if (scrub_fsgates & XCHK_FSGATES_DRAIN) - xfs_drain_wait_enable(); + xfs_defer_drain_wait_enable(); if (scrub_fsgates & XCHK_FSGATES_QUOTA) xfs_dqtrx_hook_enable(); diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 9ff3cafd8679..e734572a8dd6 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -141,12 +141,13 @@ xchk_rtgroup_init_existing( return error == -ENOENT ? -EFSCORRUPTED : error; } -void xchk_rtgroup_lock(struct xchk_rt *sr, unsigned int rtglock_flags); +int xchk_rtgroup_lock(struct xfs_scrub *sc, struct xchk_rt *sr, + unsigned int rtglock_flags); void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr); #else # define xchk_rtgroup_init(sc, rgno, sr) (-EFSCORRUPTED) # define xchk_rtgroup_init_existing(sc, rgno, sr) (-EFSCORRUPTED) -# define xchk_rtgroup_lock(sc, lockflags) do { } while (0) +# define xchk_rtgroup_lock(sc, sr, lockflags) (-EFSCORRUPTED) # define xchk_rtgroup_free(sc, sr) do { } while (0) #endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/scrub/rgsuper.c b/fs/xfs/scrub/rgsuper.c index 463b3573bb76..e062c7d12565 100644 --- a/fs/xfs/scrub/rgsuper.c +++ b/fs/xfs/scrub/rgsuper.c @@ -61,7 +61,9 @@ xchk_rgsuperblock( if (!xchk_xref_process_error(sc, 0, 0, &error)) return error; - xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP_SHARED); + error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP_SHARED); + if (error) + return error; /* * Since we already validated the rt superblock at mount time, we don't diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index fb4970c877ab..819026ea2d74 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -30,6 +30,9 @@ xchk_setup_rtbitmap( struct xchk_rtbitmap *rtb; int error; + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS); if (!rtb) return -ENOMEM; @@ -57,12 +60,15 @@ xchk_setup_rtbitmap( if (error) return error; + error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); + if (error) + return error; + /* * Now that we've locked the rtbitmap, we can't race with growfsrt * trying to expand the bitmap or change the size of the rt volume. * Hence it is safe to compute and check the geometry values. */ - xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP); if (mp->m_sb.sb_rblocks) { rtb->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks); rtb->rextslog = xfs_compute_rextslog(rtb->rextents); diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index f1af5431b388..4ac679c1bd29 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -89,6 +89,10 @@ xchk_setup_rtsummary( if (error) return error; + error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP); + if (error) + return error; + /* * Now that we've locked the rtbitmap and rtsummary, we can't race with * growfsrt trying to expand the summary or change the size of the rt @@ -99,7 +103,6 @@ xchk_setup_rtsummary( * exclusively here. If we ever start caring about running concurrent * fsmap with scrub this could be changed. 
*/ - xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP); if (mp->m_sb.sb_rblocks) { rts->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks); rts->rbmblocks = xfs_rtbitmap_blockcount(mp); diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 950f5a58dcd9..652d347cee99 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -164,7 +164,7 @@ xchk_fsgates_disable( trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL); if (sc->flags & XCHK_FSGATES_DRAIN) - xfs_drain_wait_disable(); + xfs_defer_drain_wait_disable(); if (sc->flags & XCHK_FSGATES_QUOTA) xfs_dqtrx_hook_disable(); diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c index 5ede81fadbd8..fa5f31931efd 100644 --- a/fs/xfs/xfs_drain.c +++ b/fs/xfs/xfs_drain.c @@ -13,28 +13,28 @@ #include "xfs_trace.h" /* - * Use a static key here to reduce the overhead of xfs_drain_rele. If the - * compiler supports jump labels, the static branch will be replaced by a nop - * sled when there are no xfs_drain_wait callers. Online fsck is currently - * the only caller, so this is a reasonable tradeoff. + * Use a static key here to reduce the overhead of xfs_defer_drain_rele. If + * the compiler supports jump labels, the static branch will be replaced by a + * nop sled when there are no xfs_defer_drain_wait callers. Online fsck is + * currently the only caller, so this is a reasonable tradeoff. * * Note: Patching the kernel code requires taking the cpu hotplug lock. Other * parts of the kernel allocate memory with that lock held, which means that * XFS callers cannot hold any locks that might be used by memory reclaim or * writeback when calling the static_branch_{inc,dec} functions. */ -static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate); +static DEFINE_STATIC_KEY_FALSE(xfs_defer_drain_waiter_gate); void -xfs_drain_wait_disable(void) +xfs_defer_drain_wait_disable(void) { - static_branch_dec(&xfs_drain_waiter_gate); + static_branch_dec(&xfs_defer_drain_waiter_gate); } void -xfs_drain_wait_enable(void) +xfs_defer_drain_wait_enable(void) { - static_branch_inc(&xfs_drain_waiter_gate); + static_branch_inc(&xfs_defer_drain_waiter_gate); } void @@ -71,7 +71,7 @@ static inline bool has_waiters(struct wait_queue_head *wq_head) static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr) { if (atomic_dec_and_test(&dr->dr_count) && - static_branch_unlikely(&xfs_drain_waiter_gate) && + static_branch_unlikely(&xfs_defer_drain_waiter_gate) && has_waiters(&dr->dr_waiters)) wake_up(&dr->dr_waiters); } diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h index efcf88df9a5e..4d446dbf65e5 100644 --- a/fs/xfs/xfs_drain.h +++ b/fs/xfs/xfs_drain.h @@ -26,8 +26,8 @@ struct xfs_defer_drain { void xfs_defer_drain_init(struct xfs_defer_drain *dr); void xfs_defer_drain_free(struct xfs_defer_drain *dr); -void xfs_drain_wait_disable(void); -void xfs_drain_wait_enable(void); +void xfs_defer_drain_wait_disable(void); +void xfs_defer_drain_wait_enable(void); /* * Deferred Work Intent Drains @@ -61,6 +61,9 @@ void xfs_drain_wait_enable(void); * All functions that create work items must increment the intent counter as * soon as the item is added to the transaction and cannot drop the counter * until the item is finished or cancelled. + * + * The same principles apply to realtime groups because the rt metadata inode + * ILOCKs are not held across transaction rolls. 
*/ struct xfs_group *xfs_group_intent_get(struct xfs_mount *mp, xfs_fsblock_t fsbno, enum xfs_group_type type); -- cgit v1.2.3 From 9a6cc4f6d081fddc0d5ff96744a2507d3559f949 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:34 -0800 Subject: xfs: scrub the realtime rmapbt Check the realtime reverse mapping btree against the rtbitmap, and modify the rtbitmap scrub to check against the rtrmapbt. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_fs.h | 3 +- fs/xfs/scrub/common.c | 86 +++++++++++++++++++++ fs/xfs/scrub/common.h | 10 ++- fs/xfs/scrub/health.c | 1 + fs/xfs/scrub/inode.c | 6 +- fs/xfs/scrub/inode_repair.c | 7 +- fs/xfs/scrub/repair.c | 1 + fs/xfs/scrub/rtrmap.c | 184 ++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/scrub.c | 9 +++ fs/xfs/scrub/scrub.h | 5 ++ fs/xfs/scrub/stats.c | 1 + fs/xfs/scrub/trace.h | 4 +- 13 files changed, 307 insertions(+), 11 deletions(-) create mode 100644 fs/xfs/scrub/rtrmap.c (limited to 'fs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index ff45efb2463f..136a465e00d2 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -194,6 +194,7 @@ xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rgsuper.o \ rtbitmap.o \ + rtrmap.o \ rtsummary.o \ ) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 7cca458ff812..34fcbcd0bcd5 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -737,9 +737,10 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_DIRTREE 28 /* directory tree structure */ #define XFS_SCRUB_TYPE_METAPATH 29 /* metadata directory tree paths */ #define XFS_SCRUB_TYPE_RGSUPER 30 /* realtime superblock */ +#define XFS_SCRUB_TYPE_RTRMAPBT 31 /* rtgroup reverse mapping btree */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 31 +#define XFS_SCRUB_TYPE_NR 32 /* * This special type code only applies to the vectored scrub implementation. diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 613fb54e723e..ca43dd4f52b2 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -35,6 +35,8 @@ #include "xfs_exchmaps.h" #include "xfs_rtbitmap.h" #include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_bmap_util.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -791,9 +793,28 @@ xchk_rtgroup_lock( } while (1); sr->rtlock_flags = rtglock_flags; + + if (xfs_has_rtrmapbt(sc->mp) && (rtglock_flags & XFS_RTGLOCK_RMAP)) + sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg); + return 0; } +/* + * Free all the btree cursors and other incore data relating to the realtime + * group. This has to be done /before/ committing (or cancelling) the scrub + * transaction. + */ +void +xchk_rtgroup_btcur_free( + struct xchk_rt *sr) +{ + if (sr->rmap_cur) + xfs_btree_del_cursor(sr->rmap_cur, XFS_BTREE_ERROR); + + sr->rmap_cur = NULL; +} + /* * Unlock the realtime group. This must be done /after/ committing (or * cancelling) the scrub transaction. @@ -878,6 +899,14 @@ xchk_setup_fs( return xchk_trans_alloc(sc, resblks); } +/* Set us up with a transaction and an empty context to repair rt metadata. */ +int +xchk_setup_rt( + struct xfs_scrub *sc) +{ + return xchk_trans_alloc(sc, 0); +} + /* Set us up with AG headers and btree cursors. 
*/ int xchk_setup_ag_btree( @@ -1639,3 +1668,60 @@ xchk_inode_rootdir_inum(const struct xfs_inode *ip) return mp->m_metadirip->i_ino; return mp->m_rootip->i_ino; } + +static int +xchk_meta_btree_count_blocks( + struct xfs_scrub *sc, + xfs_extnum_t *nextents, + xfs_filblks_t *count) +{ + struct xfs_btree_cur *cur; + int error; + + if (!sc->sr.rtg) { + ASSERT(0); + return -EFSCORRUPTED; + } + + switch (sc->ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg); + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xfs_btree_count_blocks(cur, count); + xfs_btree_del_cursor(cur, error); + if (!error) { + *nextents = 0; + (*count)--; /* don't count the btree iroot */ + } + return error; +} + +/* Count the blocks used by a file, even if it's a metadata inode. */ +int +xchk_inode_count_blocks( + struct xfs_scrub *sc, + int whichfork, + xfs_extnum_t *nextents, + xfs_filblks_t *count) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); + + if (!ifp) { + *nextents = 0; + *count = 0; + return 0; + } + + if (ifp->if_format == XFS_DINODE_FMT_META_BTREE) { + ASSERT(whichfork == XFS_DATA_FORK); + return xchk_meta_btree_count_blocks(sc, nextents, count); + } + + return xfs_bmap_count_blocks(sc->tp, sc->ip, whichfork, nextents, + count); +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index e734572a8dd6..1576467f7244 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -63,6 +63,7 @@ static inline int xchk_setup_nothing(struct xfs_scrub *sc) /* Setup functions */ int xchk_setup_agheader(struct xfs_scrub *sc); int xchk_setup_fs(struct xfs_scrub *sc); +int xchk_setup_rt(struct xfs_scrub *sc); int xchk_setup_ag_allocbt(struct xfs_scrub *sc); int xchk_setup_ag_iallocbt(struct xfs_scrub *sc); int xchk_setup_ag_rmapbt(struct xfs_scrub *sc); @@ -80,10 +81,12 @@ int xchk_setup_metapath(struct xfs_scrub *sc); int xchk_setup_rtbitmap(struct xfs_scrub *sc); int xchk_setup_rtsummary(struct xfs_scrub *sc); int xchk_setup_rgsuperblock(struct xfs_scrub *sc); +int xchk_setup_rtrmapbt(struct xfs_scrub *sc); #else # define xchk_setup_rtbitmap xchk_setup_nothing # define xchk_setup_rtsummary xchk_setup_nothing # define xchk_setup_rgsuperblock xchk_setup_nothing +# define xchk_setup_rtrmapbt xchk_setup_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_ino_dqattach(struct xfs_scrub *sc); @@ -125,7 +128,8 @@ xchk_ag_init_existing( #ifdef CONFIG_XFS_RT /* All the locks we need to check an rtgroup. 
*/ -#define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP) +#define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ + XFS_RTGLOCK_RMAP) int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno, struct xchk_rt *sr); @@ -143,11 +147,13 @@ xchk_rtgroup_init_existing( int xchk_rtgroup_lock(struct xfs_scrub *sc, struct xchk_rt *sr, unsigned int rtglock_flags); +void xchk_rtgroup_btcur_free(struct xchk_rt *sr); void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr); #else # define xchk_rtgroup_init(sc, rgno, sr) (-EFSCORRUPTED) # define xchk_rtgroup_init_existing(sc, rgno, sr) (-EFSCORRUPTED) # define xchk_rtgroup_lock(sc, sr, lockflags) (-EFSCORRUPTED) +# define xchk_rtgroup_btcur_free(sr) do { } while (0) # define xchk_rtgroup_free(sc, sr) do { } while (0) #endif /* CONFIG_XFS_RT */ @@ -275,6 +281,8 @@ void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks); int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino, bool *inuse); +int xchk_inode_count_blocks(struct xfs_scrub *sc, int whichfork, + xfs_extnum_t *nextents, xfs_filblks_t *count); bool xchk_inode_is_dirtree_root(const struct xfs_inode *ip); bool xchk_inode_is_sb_rooted(const struct xfs_inode *ip); diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index ccc6ca5934ca..bcc4244e3b55 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -114,6 +114,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_DIRTREE] = { XHG_INO, XFS_SICK_INO_DIRTREE }, [XFS_SCRUB_TYPE_METAPATH] = { XHG_FS, XFS_SICK_FS_METAPATH }, [XFS_SCRUB_TYPE_RGSUPER] = { XHG_RTGROUP, XFS_SICK_RG_SUPER }, + [XFS_SCRUB_TYPE_RTRMAPBT] = { XHG_RTGROUP, XFS_SICK_RG_RMAPBT }, }; /* Return the health status mask for this scrub type. */ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 2e911f38deae..8e702121dc86 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -690,15 +690,13 @@ xchk_inode_xref_bmap( return; /* Walk all the extents to check nextents/naextents/nblocks. */ - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, - &nextents, &count); + error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count); if (!xchk_should_check_xref(sc, &error, NULL)) return; if (nextents < xfs_dfork_data_extents(dip)) xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, - &nextents, &acount); + error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents, &acount); if (!xchk_should_check_xref(sc, &error, NULL)) return; if (nextents != xfs_dfork_attr_extents(dip)) diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 7faa27472b91..a94f9df0ca78 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -1536,8 +1536,7 @@ xrep_inode_blockcounts( trace_xrep_inode_blockcounts(sc); /* Set data fork counters from the data fork mappings. */ - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, - &nextents, &count); + error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count); if (error) return error; if (xfs_is_reflink_inode(sc->ip)) { @@ -1561,8 +1560,8 @@ xrep_inode_blockcounts( /* Set attr fork counters from the attr fork mappings. 
*/ ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); if (ifp) { - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, - &nextents, &acount); + error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents, + &acount); if (error) return error; if (count >= sc->mp->m_sb.sb_dblocks) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 91c8bc055a4f..e788e3032f8e 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -62,6 +62,7 @@ xrep_attempt( trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error); xchk_ag_btcur_free(&sc->sa); + xchk_rtgroup_btcur_free(&sc->sr); /* Repair whatever's broken. */ ASSERT(sc->ops->repair); diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c new file mode 100644 index 000000000000..7b5f932bcd94 --- /dev/null +++ b/fs/xfs/scrub/rtrmap.c @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_inode.h" +#include "xfs_rtalloc.h" +#include "xfs_rtgroup.h" +#include "xfs_metafile.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* Set us up with the realtime metadata locked. */ +int +xchk_setup_rtrmapbt( + struct xfs_scrub *sc) +{ + int error; + + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); + if (error) + return error; + + error = xchk_setup_rt(sc); + if (error) + return error; + + error = xchk_install_live_inode(sc, rtg_rmap(sc->sr.rtg)); + if (error) + return error; + + return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); +} + +/* Realtime reverse mapping. */ + +struct xchk_rtrmap { + /* + * The furthest-reaching of the rmapbt records that we've already + * processed. This enables us to detect overlapping records for space + * allocations that cannot be shared. + */ + struct xfs_rmap_irec overlap_rec; + + /* + * The previous rmapbt record, so that we can check for two records + * that could be one. + */ + struct xfs_rmap_irec prev_rec; +}; + +/* Flag failures for records that overlap but cannot. */ +STATIC void +xchk_rtrmapbt_check_overlapping( + struct xchk_btree *bs, + struct xchk_rtrmap *cr, + const struct xfs_rmap_irec *irec) +{ + xfs_rtblock_t pnext, inext; + + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + /* No previous record? */ + if (cr->overlap_rec.rm_blockcount == 0) + goto set_prev; + + /* Do overlap_rec and irec overlap? */ + pnext = cr->overlap_rec.rm_startblock + cr->overlap_rec.rm_blockcount; + if (pnext <= irec->rm_startblock) + goto set_prev; + + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + /* Save whichever rmap record extends furthest. */ + inext = irec->rm_startblock + irec->rm_blockcount; + if (pnext > inext) + return; + +set_prev: + memcpy(&cr->overlap_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Decide if two reverse-mapping records can be merged. 
*/ +static inline bool +xchk_rtrmap_mergeable( + struct xchk_rtrmap *cr, + const struct xfs_rmap_irec *r2) +{ + const struct xfs_rmap_irec *r1 = &cr->prev_rec; + + /* Ignore if prev_rec is not yet initialized. */ + if (cr->prev_rec.rm_blockcount == 0) + return false; + + if (r1->rm_owner != r2->rm_owner) + return false; + if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock) + return false; + if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount > + XFS_RMAP_LEN_MAX) + return false; + if (r1->rm_flags != r2->rm_flags) + return false; + return r1->rm_offset + r1->rm_blockcount == r2->rm_offset; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_rtrmapbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_rtrmap *cr, + const struct xfs_rmap_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_rtrmap_mergeable(cr, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Scrub a realtime rmapbt record. */ +STATIC int +xchk_rtrmapbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xchk_rtrmap *cr = bs->private; + struct xfs_rmap_irec irec; + + if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || + xfs_rtrmap_check_irec(to_rtg(bs->cur->bc_group), &irec) != NULL) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + xchk_rtrmapbt_check_mergeable(bs, cr, &irec); + xchk_rtrmapbt_check_overlapping(bs, cr, &irec); + return 0; +} + +/* Scrub the realtime rmap btree. */ +int +xchk_rtrmapbt( + struct xfs_scrub *sc) +{ + struct xfs_inode *ip = rtg_rmap(sc->sr.rtg); + struct xfs_owner_info oinfo; + struct xchk_rtrmap cr = { }; + int error; + + error = xchk_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, XFS_DATA_FORK); + return xchk_btree(sc, sc->sr.rmap_cur, xchk_rtrmapbt_rec, &oinfo, &cr); +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 652d347cee99..09983899c341 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -218,6 +218,8 @@ xchk_teardown( int error) { xchk_ag_free(sc, &sc->sa); + xchk_rtgroup_btcur_free(&sc->sr); + if (sc->tp) { if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) error = xfs_trans_commit(sc->tp); @@ -458,6 +460,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .has = xfs_has_rtsb, .repair = xrep_rgsuperblock, }, + [XFS_SCRUB_TYPE_RTRMAPBT] = { /* realtime group rmapbt */ + .type = ST_RTGROUP, + .setup = xchk_setup_rtrmapbt, + .scrub = xchk_rtrmapbt, + .has = xfs_has_rtrmapbt, + .repair = xrep_notsupported, + }, }; static int diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 5dbbe93cb49b..0ad5122af486 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -126,6 +126,9 @@ struct xchk_rt { /* XFS_RTGLOCK_* lock state if locked */ unsigned int rtlock_flags; + + /* rtgroup btrees */ + struct xfs_btree_cur *rmap_cur; }; struct xfs_scrub { @@ -280,10 +283,12 @@ int xchk_metapath(struct xfs_scrub *sc); int xchk_rtbitmap(struct xfs_scrub *sc); int xchk_rtsummary(struct xfs_scrub *sc); int xchk_rgsuperblock(struct xfs_scrub *sc); +int xchk_rtrmapbt(struct xfs_scrub *sc); #else # define xchk_rtbitmap xchk_nothing # define xchk_rtsummary xchk_nothing # define xchk_rgsuperblock xchk_nothing +# define xchk_rtrmapbt xchk_nothing #endif #ifdef CONFIG_XFS_QUOTA int 
xchk_quota(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index a476c7b2ab75..eb6bb170c902 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -82,6 +82,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_DIRTREE] = "dirtree", [XFS_SCRUB_TYPE_METAPATH] = "metapath", [XFS_SCRUB_TYPE_RGSUPER] = "rgsuper", + [XFS_SCRUB_TYPE_RTRMAPBT] = "rtrmapbt", }; /* Format the scrub stats into a text buffer, similar to pcp style. */ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index d2ae7e93acb0..5afc440f22f5 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -72,6 +72,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -105,7 +106,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER); { XFS_SCRUB_TYPE_DIRTREE, "dirtree" }, \ { XFS_SCRUB_TYPE_BARRIER, "barrier" }, \ { XFS_SCRUB_TYPE_METAPATH, "metapath" }, \ - { XFS_SCRUB_TYPE_RGSUPER, "rgsuper" } + { XFS_SCRUB_TYPE_RGSUPER, "rgsuper" }, \ + { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ -- cgit v1.2.3 From 1ebecab5adba16a0e49ca30f945ab4937d8e498c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:35 -0800 Subject: xfs: cross-reference realtime bitmap to realtime rmapbt scrubber When we're checking the realtime rmap btree entries, cross-reference those entries with the realtime bitmap too. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/rtrmap.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c index 7b5f932bcd94..515c2a9b02cd 100644 --- a/fs/xfs/scrub/rtrmap.c +++ b/fs/xfs/scrub/rtrmap.c @@ -142,6 +142,20 @@ xchk_rtrmapbt_check_mergeable( memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec)); } +/* Cross-reference with other metadata. */ +STATIC void +xchk_rtrmapbt_xref( + struct xfs_scrub *sc, + struct xfs_rmap_irec *irec) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_xref_is_used_rt_space(sc, + xfs_rgbno_to_rtb(sc->sr.rtg, irec->rm_startblock), + irec->rm_blockcount); +} + /* Scrub a realtime rmapbt record. */ STATIC int xchk_rtrmapbt_rec( @@ -162,6 +176,7 @@ xchk_rtrmapbt_rec( xchk_rtrmapbt_check_mergeable(bs, cr, &irec); xchk_rtrmapbt_check_overlapping(bs, cr, &irec); + xchk_rtrmapbt_xref(bs->sc, &irec); return 0; } -- cgit v1.2.3 From 037a44d8277adfd9e9097ea9d1bf99089aa14b39 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:36 -0800 Subject: xfs: cross-reference the realtime rmapbt Teach the data fork and realtime bitmap scrubbers to cross-reference information with the realtime rmap btree. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/bmap.c | 52 +++++++++++++++++++++++++++------------ fs/xfs/scrub/rgsuper.c | 2 ++ fs/xfs/scrub/rtbitmap.c | 55 ++++++++++++++++++++++++++++++++++++++--- fs/xfs/scrub/rtbitmap.h | 5 ++++ fs/xfs/scrub/rtrmap.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/scrub.h | 9 +++++++ 6 files changed, 169 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index dd99366643f8..b7f9f3b3d81a 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -143,15 +143,22 @@ static inline bool xchk_bmap_get_rmap( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec, - xfs_agblock_t agbno, + xfs_agblock_t bno, uint64_t owner, struct xfs_rmap_irec *rmap) { + struct xfs_btree_cur **curp = &info->sc->sa.rmap_cur; xfs_fileoff_t offset; unsigned int rflags = 0; int has_rmap; int error; + if (xfs_ifork_is_realtime(info->sc->ip, info->whichfork)) + curp = &info->sc->sr.rmap_cur; + + if (*curp == NULL) + return false; + if (info->whichfork == XFS_ATTR_FORK) rflags |= XFS_RMAP_ATTR_FORK; if (irec->br_state == XFS_EXT_UNWRITTEN) @@ -172,13 +179,13 @@ xchk_bmap_get_rmap( * range rmap lookup to make sure we get the correct owner/offset. */ if (info->is_shared) { - error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno, - owner, offset, rflags, rmap, &has_rmap); + error = xfs_rmap_lookup_le_range(*curp, bno, owner, offset, + rflags, rmap, &has_rmap); } else { - error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, - owner, offset, rflags, rmap, &has_rmap); + error = xfs_rmap_lookup_le(*curp, bno, owner, offset, + rflags, rmap, &has_rmap); } - if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur)) + if (!xchk_should_check_xref(info->sc, &error, curp)) return false; if (!has_rmap) @@ -192,29 +199,29 @@ STATIC void xchk_bmap_xref_rmap( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec, - xfs_agblock_t agbno) + xfs_agblock_t bno) { struct xfs_rmap_irec rmap; unsigned long long rmap_end; uint64_t owner = info->sc->ip->i_ino; - if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm)) + if (xchk_skip_xref(info->sc->sm)) return; /* Find the rmap record for this irec. */ - if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) + if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap)) return; /* * The rmap must be an exact match for this incore file mapping record, * which may have arisen from multiple ondisk records. */ - if (rmap.rm_startblock != agbno) + if (rmap.rm_startblock != bno) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; - if (rmap_end != agbno + irec->br_blockcount) + if (rmap_end != bno + irec->br_blockcount) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -259,7 +266,7 @@ STATIC void xchk_bmap_xref_rmap_cow( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec, - xfs_agblock_t agbno) + xfs_agblock_t bno) { struct xfs_rmap_irec rmap; unsigned long long rmap_end; @@ -269,7 +276,7 @@ xchk_bmap_xref_rmap_cow( return; /* Find the rmap record for this irec. */ - if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) + if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap)) return; /* @@ -277,12 +284,12 @@ xchk_bmap_xref_rmap_cow( * can start before and end after the physical space allocated to this * mapping. There are no offsets to check. 
*/ - if (rmap.rm_startblock > agbno) + if (rmap.rm_startblock > bno) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; - if (rmap_end < agbno + irec->br_blockcount) + if (rmap_end < bno + irec->br_blockcount) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -315,6 +322,8 @@ xchk_bmap_rt_iextent_xref( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { + struct xfs_owner_info oinfo; + xfs_rgblock_t rgbno; int error; error = xchk_rtgroup_init_existing(info->sc, @@ -332,6 +341,19 @@ xchk_bmap_rt_iextent_xref( xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, irec->br_blockcount); + if (!xfs_has_rtrmapbt(info->sc->mp)) + goto out_cur; + + rgbno = xfs_rtb_to_rgbno(info->sc->mp, irec->br_startblock); + xchk_bmap_xref_rmap(info, irec, rgbno); + + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, info->whichfork, + irec->br_startoff); + xchk_xref_is_only_rt_owned_by(info->sc, rgbno, + irec->br_blockcount, &oinfo); + +out_cur: + xchk_rtgroup_btcur_free(&info->sc->sr); out_free: xchk_rtgroup_free(info->sc, &info->sc->sr); } diff --git a/fs/xfs/scrub/rgsuper.c b/fs/xfs/scrub/rgsuper.c index e062c7d12565..d189732d0e24 100644 --- a/fs/xfs/scrub/rgsuper.c +++ b/fs/xfs/scrub/rgsuper.c @@ -13,6 +13,7 @@ #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_sb.h" +#include "xfs_rmap.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" @@ -34,6 +35,7 @@ xchk_rgsuperblock_xref( return; xchk_xref_is_used_rt_space(sc, xfs_rgbno_to_rtb(sc->sr.rtg, 0), 1); + xchk_xref_is_only_rt_owned_by(sc, 0, 1, &XFS_RMAP_OINFO_FS); } int diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 819026ea2d74..675f4fdd1e67 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -9,17 +9,22 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_btree.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bit.h" +#include "xfs_rtgroup.h" #include "xfs_sb.h" +#include "xfs_rmap.h" +#include "xfs_rtrmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" #include "scrub/rtbitmap.h" +#include "scrub/btree.h" /* Set us up with the realtime metadata locked. */ int @@ -37,6 +42,7 @@ xchk_setup_rtbitmap( if (!rtb) return -ENOMEM; sc->buf = rtb; + rtb->sc = sc; error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); if (error) @@ -78,7 +84,30 @@ xchk_setup_rtbitmap( return 0; } -/* Realtime bitmap. */ +/* Per-rtgroup bitmap contents. */ + +/* Cross-reference rtbitmap entries with other metadata. */ +STATIC void +xchk_rtbitmap_xref( + struct xchk_rtbitmap *rtb, + xfs_rtblock_t startblock, + xfs_rtblock_t blockcount) +{ + struct xfs_scrub *sc = rtb->sc; + xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, startblock); + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + if (!sc->sr.rmap_cur) + return; + + xchk_xref_has_no_rt_owner(sc, rgbno, blockcount); + + if (rtb->next_free_rgbno < rgbno) + xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno, + rgbno - rtb->next_free_rgbno); + rtb->next_free_rgbno = rgbno + blockcount; +} /* Scrub a free extent record from the realtime bitmap. 
*/ STATIC int @@ -88,7 +117,8 @@ xchk_rtbitmap_rec( const struct xfs_rtalloc_rec *rec, void *priv) { - struct xfs_scrub *sc = priv; + struct xchk_rtbitmap *rtb = priv; + struct xfs_scrub *sc = rtb->sc; xfs_rtblock_t startblock; xfs_filblks_t blockcount; @@ -97,6 +127,12 @@ xchk_rtbitmap_rec( if (!xfs_verify_rtbext(rtg_mount(rtg), startblock, blockcount)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + + xchk_rtbitmap_xref(rtb, startblock, blockcount); + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -ECANCELED; + return 0; } @@ -144,7 +180,7 @@ xchk_rtbitmap_check_extents( return error; } -/* Scrub the realtime bitmap. */ +/* Scrub this group's realtime bitmap. */ int xchk_rtbitmap( struct xfs_scrub *sc) @@ -153,6 +189,7 @@ xchk_rtbitmap( struct xfs_rtgroup *rtg = sc->sr.rtg; struct xfs_inode *rbmip = rtg_bitmap(rtg); struct xchk_rtbitmap *rtb = sc->buf; + xfs_rgblock_t last_rgbno; int error; /* Is sb_rextents correct? */ @@ -205,10 +242,20 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtbitmap_rec, sc); + rtb->next_free_rgbno = 0; + error = xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtbitmap_rec, rtb); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) return error; + /* + * Check that the are rmappings for all rt extents between the end of + * the last free extent we saw and the last possible extent in the rt + * group. + */ + last_rgbno = rtg->rtg_extents * mp->m_sb.sb_rextsize - 1; + if (rtb->next_free_rgbno < last_rgbno) + xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno, + last_rgbno - rtb->next_free_rgbno); return 0; } diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h index 85304ff019e1..dd5b394d9697 100644 --- a/fs/xfs/scrub/rtbitmap.h +++ b/fs/xfs/scrub/rtbitmap.h @@ -7,10 +7,15 @@ #define __XFS_SCRUB_RTBITMAP_H__ struct xchk_rtbitmap { + struct xfs_scrub *sc; + uint64_t rextents; uint64_t rbmblocks; unsigned int rextslog; unsigned int resblks; + + /* The next free rt group block number that we expect to see. 
*/ + xfs_rgblock_t next_free_rgbno; }; #ifdef CONFIG_XFS_ONLINE_REPAIR diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c index 515c2a9b02cd..764fa2967922 100644 --- a/fs/xfs/scrub/rtrmap.c +++ b/fs/xfs/scrub/rtrmap.c @@ -197,3 +197,68 @@ xchk_rtrmapbt( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, XFS_DATA_FORK); return xchk_btree(sc, sc->sr.rmap_cur, xchk_rtrmapbt_rec, &oinfo, &cr); } + +/* xref check that the extent has no realtime reverse mapping at all */ +void +xchk_xref_has_no_rt_owner( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); +} + +/* xref check that the extent is completely mapped */ +void +xchk_xref_has_rt_owner( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + return; + if (outcome != XBTREE_RECPACKING_FULL) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); +} + +/* xref check that the extent is only owned by a given owner */ +void +xchk_xref_is_only_rt_owned_by( + struct xfs_scrub *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo) +{ + struct xfs_rmap_matches res; + int error; + + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_count_owners(sc->sr.rmap_cur, bno, len, oinfo, &res); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + return; + if (res.matches != 1) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + if (res.bad_non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + if (res.non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); +} diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 0ad5122af486..cba4e89a3a62 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -322,8 +322,17 @@ void xchk_xref_is_not_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno, #ifdef CONFIG_XFS_RT void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, xfs_extlen_t len); +void xchk_xref_has_no_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_has_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_is_only_rt_owned_by(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len, const struct xfs_owner_info *oinfo); #else # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) +# define xchk_xref_has_no_rt_owner(sc, rtbno, len) do { } while (0) +# define xchk_xref_has_rt_owner(sc, rtbno, len) do { } while (0) +# define xchk_xref_is_only_rt_owned_by(sc, bno, len, oinfo) do { } while (0) #endif #endif /* __XFS_SCRUB_SCRUB_H__ */ -- cgit v1.2.3 From a5542712f98341927f646b790b367b72c7d614c5 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Wed, 20 Nov 2024 16:20:36 -0800 Subject: xfs: scan rt rmap when we're doing an intense rmap check of bmbt mappings Teach the bmbt scrubber how to perform a comprehensive check that the rmapbt does not contain /any/ mappings that are not described by bmbt records when it's dealing with a realtime file. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/bmap.c | 48 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index b7f9f3b3d81a..f6077b0cba8a 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -21,6 +21,8 @@ #include "xfs_rmap_btree.h" #include "xfs_rtgroup.h" #include "xfs_health.h" +#include "xfs_rtalloc.h" +#include "xfs_rtrmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" @@ -641,8 +643,7 @@ xchk_bmap_check_rmap( xchk_fblock_set_corrupt(sc, sbcri->whichfork, check_rec.rm_offset); if (irec.br_startblock != - xfs_agbno_to_fsb(to_perag(cur->bc_group), - check_rec.rm_startblock)) + xfs_gbno_to_fsb(cur->bc_group, check_rec.rm_startblock)) xchk_fblock_set_corrupt(sc, sbcri->whichfork, check_rec.rm_offset); if (irec.br_blockcount > check_rec.rm_blockcount) @@ -696,6 +697,30 @@ xchk_bmap_check_ag_rmaps( return error; } +/* Make sure each rt rmap has a corresponding bmbt entry. */ +STATIC int +xchk_bmap_check_rt_rmaps( + struct xfs_scrub *sc, + struct xfs_rtgroup *rtg) +{ + struct xchk_bmap_check_rmap_info sbcri; + struct xfs_btree_cur *cur; + int error; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg); + + sbcri.sc = sc; + sbcri.whichfork = XFS_DATA_FORK; + error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); + if (error == -ECANCELED) + error = 0; + + xfs_btree_del_cursor(cur, error); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + return error; +} + /* * Decide if we want to scan the reverse mappings to determine if the attr * fork /really/ has zero space mappings. @@ -750,10 +775,6 @@ xchk_bmap_check_empty_datafork( { struct xfs_ifork *ifp = &ip->i_df; - /* Don't support realtime rmap checks yet. */ - if (XFS_IS_REALTIME_INODE(ip)) - return false; - /* * If the dinode repair found a bad data fork, it will reset the fork * to extents format with zero records and wait for the this scrubber @@ -804,6 +825,21 @@ xchk_bmap_check_rmaps( struct xfs_perag *pag = NULL; int error; + if (xfs_ifork_is_realtime(sc->ip, whichfork)) { + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) { + error = xchk_bmap_check_rt_rmaps(sc, rtg); + if (error || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + xfs_rtgroup_rele(rtg); + return error; + } + } + + return 0; + } + while ((pag = xfs_perag_next(sc->mp, pag))) { error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); if (error || -- cgit v1.2.3 From 366243cc99b7e80236a19d7391b68d0f47677f4f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:37 -0800 Subject: xfs: scrub the metadir path of rt rmap btree files Add a new XFS_SCRUB_METAPATH subtype so that we can scrub the metadata directory tree path to the rmap btree file for each rt group. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_fs.h | 3 ++- fs/xfs/scrub/metapath.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 34fcbcd0bcd5..d42d3a5617e3 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -830,9 +830,10 @@ struct xfs_scrub_vec_head { #define XFS_SCRUB_METAPATH_USRQUOTA (5) /* user quota */ #define XFS_SCRUB_METAPATH_GRPQUOTA (6) /* group quota */ #define XFS_SCRUB_METAPATH_PRJQUOTA (7) /* project quota */ +#define XFS_SCRUB_METAPATH_RTRMAPBT (8) /* realtime reverse mapping */ /* Number of metapath sm_ino values */ -#define XFS_SCRUB_METAPATH_NR (8) +#define XFS_SCRUB_METAPATH_NR (9) /* * ioctl limits diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index c678cba1ffc3..74d71373e7ed 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -21,6 +21,7 @@ #include "xfs_trans_space.h" #include "xfs_attr.h" #include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -246,6 +247,8 @@ xchk_setup_metapath( return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_GROUP); case XFS_SCRUB_METAPATH_PRJQUOTA: return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_PROJ); + case XFS_SCRUB_METAPATH_RTRMAPBT: + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_RMAP); default: return -ENOENT; } -- cgit v1.2.3 From 2e0629e17c311ad50863dac8fb18c62c94965244 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:38 -0800 Subject: xfs: walk the rt reverse mapping tree when rebuilding rmap When we're rebuilding the data device rmap, if we encounter an "rmap" format fork, we have to walk the (realtime) rmap btree inode to build the appropriate mappings. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/rmap_repair.c | 53 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index 2a0b9e3d0fba..91c17feb4976 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -31,6 +31,8 @@ #include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_ag.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtgroup.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -504,7 +506,56 @@ xrep_rmap_scan_meta_btree( struct xrep_rmap_ifork *rf, struct xfs_inode *ip) { - return -EFSCORRUPTED; /* XXX placeholder */ + struct xfs_scrub *sc = rf->rr->sc; + struct xfs_rtgroup *rtg = NULL; + struct xfs_btree_cur *cur = NULL; + enum xfs_rtg_inodes type; + int error; + + if (rf->whichfork != XFS_DATA_FORK) + return -EFSCORRUPTED; + + switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + type = XFS_RTGI_RMAP; + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + + while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) { + if (ip == rtg->rtg_inodes[type]) + goto found; + } + + /* + * We should never find an rt metadata btree inode that isn't + * associated with an rtgroup yet has ondisk blocks allocated to it. 
+ */ + if (ip->i_nblocks) { + ASSERT(0); + return -EFSCORRUPTED; + } + + return 0; + +found: + switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + goto out_rtg; + } + + error = xrep_rmap_scan_iroot_btree(rf, cur); + xfs_btree_del_cursor(cur, error); +out_rtg: + xfs_rtgroup_rele(rtg); + return error; } /* Find all the extents from a given AG in an inode fork. */ -- cgit v1.2.3 From f1a6d9b4c3177bc344528a43143e39a598752a1a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:39 -0800 Subject: xfs: online repair of realtime file bmaps Now that we have a reverse-mapping index of the realtime device, we can rebuild the data fork forward-mappings of any realtime file. Enhance the existing bmbt repair code to walk the rtrmap btrees to gather this information. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/bmap_repair.c | 128 +++++++++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/repair.c | 46 ++++++++++++++++ fs/xfs/scrub/repair.h | 2 + 3 files changed, 172 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index 141d36f1da9a..fd64bdf4e138 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -25,11 +25,13 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_rtrmap_btree.h" #include "xfs_refcount.h" #include "xfs_quota.h" #include "xfs_ialloc.h" #include "xfs_ag.h" #include "xfs_reflink.h" +#include "xfs_rtgroup.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -359,6 +361,112 @@ xrep_bmap_scan_ag( return error; } +#ifdef CONFIG_XFS_RT +/* Check for any obvious errors or conflicts in the file mapping. */ +STATIC int +xrep_bmap_check_rtfork_rmap( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec) +{ + /* xattr extents are never stored on realtime devices */ + if (rec->rm_flags & XFS_RMAP_ATTR_FORK) + return -EFSCORRUPTED; + + /* bmbt blocks are never stored on realtime devices */ + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) + return -EFSCORRUPTED; + + /* Data extents for non-rt files are never stored on the rt device. */ + if (!XFS_IS_REALTIME_INODE(sc->ip)) + return -EFSCORRUPTED; + + /* Check the file offsets and physical extents. */ + if (!xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Check that this is within the rtgroup. */ + if (!xfs_verify_rgbext(to_rtg(cur->bc_group), rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + return xrep_require_rtext_inuse(sc, rec->rm_startblock, + rec->rm_blockcount); +} + +/* Record realtime extents that belong to this inode's fork. */ +STATIC int +xrep_bmap_walk_rtrmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_bmap *rb = priv; + int error = 0; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + /* Skip extents which are not owned by this inode and fork. */ + if (rec->rm_owner != rb->sc->ip->i_ino) + return 0; + + error = xrep_bmap_check_rtfork_rmap(rb->sc, cur, rec); + if (error) + return error; + + /* + * Record all blocks allocated to this file even if the extent isn't + * for the fork we're rebuilding so that we can reset di_nblocks later. 
+ */ + rb->nblocks += rec->rm_blockcount; + + /* If this rmap isn't for the fork we want, we're done. */ + if (rb->whichfork == XFS_DATA_FORK && + (rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + if (rb->whichfork == XFS_ATTR_FORK && + !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + + return xrep_bmap_from_rmap(rb, rec->rm_offset, + xfs_rgbno_to_rtb(to_rtg(cur->bc_group), + rec->rm_startblock), + rec->rm_blockcount, + rec->rm_flags & XFS_RMAP_UNWRITTEN); +} + +/* Scan the realtime reverse mappings to build the new extent map. */ +STATIC int +xrep_bmap_scan_rtgroup( + struct xrep_bmap *rb, + struct xfs_rtgroup *rtg) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + if (!xfs_has_rtrmapbt(sc->mp)) + return 0; + + error = xrep_rtgroup_init(sc, rtg, &sc->sr, + XFS_RTGLOCK_RMAP | XFS_RTGLOCK_BITMAP_SHARED); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_bmap_walk_rtrmap, rb); + xchk_rtgroup_btcur_free(&sc->sr); + xchk_rtgroup_free(sc, &sc->sr); + return error; +} +#else +static inline int +xrep_bmap_scan_rtgroup(struct xrep_bmap *rb, struct xfs_rtgroup *rtg) +{ + return -EFSCORRUPTED; +} +#endif + /* Find the delalloc extents from the old incore extent tree. */ STATIC int xrep_bmap_find_delalloc( @@ -410,6 +518,22 @@ xrep_bmap_find_mappings( struct xfs_perag *pag = NULL; int error = 0; + /* + * Iterate the rtrmaps for extents. Metadata files never have content + * on the realtime device, so there's no need to scan them. + */ + if (!xfs_is_metadir_inode(sc->ip)) { + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) { + error = xrep_bmap_scan_rtgroup(rb, rtg); + if (error) { + xfs_rtgroup_rele(rtg); + return error; + } + } + } + /* Iterate the rmaps for extents. */ while ((pag = xfs_perag_next(sc->mp, pag))) { error = xrep_bmap_scan_ag(rb, pag); @@ -754,10 +878,6 @@ xrep_bmap_check_inputs( return -EINVAL; } - /* Don't know how to rebuild realtime data forks. */ - if (XFS_IS_REALTIME_INODE(sc->ip)) - return -EOPNOTSUPP; - return 0; } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index e788e3032f8e..18946dd46fa7 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -37,6 +37,9 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_dir2.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -955,6 +958,22 @@ xrep_ag_init( } #ifdef CONFIG_XFS_RT +/* Initialize all the btree cursors for a RT repair. */ +static void +xrep_rtgroup_btcur_init( + struct xfs_scrub *sc, + struct xchk_rt *sr) +{ + struct xfs_mount *mp = sc->mp; + + ASSERT(sr->rtg != NULL); + + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTRMAPBT && + (sr->rtlock_flags & XFS_RTGLOCK_RMAP) && + xfs_has_rtrmapbt(mp)) + sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg); +} + /* * Given a reference to a rtgroup structure, lock rtgroup btree inodes and * create btree cursors. Must only be called to repair a regular rt file. @@ -973,6 +992,33 @@ xrep_rtgroup_init( /* Grab our own passive reference from the caller's ref. */ sr->rtg = xfs_rtgroup_hold(rtg); + xrep_rtgroup_btcur_init(sc, sr); + return 0; +} + +/* Ensure that all rt blocks in the given range are not marked free. 
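+ * The query below is rounded outward to whole rt extents: for example, with
+ * sb_rextsize == 4, rgbno 10 and len 8 become rt extents 2 through 4, which
+ * cover rtgroup blocks 8 through 19.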
*/ +int +xrep_require_rtext_inuse( + struct xfs_scrub *sc, + xfs_rgblock_t rgbno, + xfs_filblks_t len) +{ + struct xfs_mount *mp = sc->mp; + xfs_rtxnum_t startrtx; + xfs_rtxnum_t endrtx; + bool is_free = false; + int error; + + startrtx = xfs_rgbno_to_rtx(mp, rgbno); + endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1); + + error = xfs_rtalloc_extent_is_free(sc->sr.rtg, sc->tp, startrtx, + endrtx - startrtx + 1, &is_free); + if (error) + return error; + if (is_free) + return -EFSCORRUPTED; + return 0; } #endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index b649da1a93eb..584135042d9a 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -110,6 +110,8 @@ int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag, #ifdef CONFIG_XFS_RT int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg, struct xchk_rt *sr, unsigned int rtglock_flags); +int xrep_require_rtext_inuse(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_filblks_t len); #else # define xrep_rtgroup_init(sc, rtg, sr, lockflags) (-ENOSYS) #endif /* CONFIG_XFS_RT */ -- cgit v1.2.3 From 1bd0843027167221421fe482d18ce03f7ff1f9f1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:39 -0800 Subject: xfs: repair inodes that have realtime extents Plumb into the inode core repair code the ability to search for extents on realtime devices. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/inode_repair.c | 58 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index a94f9df0ca78..816e81330ffc 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -38,6 +38,8 @@ #include "xfs_log_priv.h" #include "xfs_health.h" #include "xfs_symlink_remote.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -773,17 +775,71 @@ xrep_dinode_count_ag_rmaps( return error; } +/* Count extents and blocks for an inode given an rt rmap. */ +STATIC int +xrep_dinode_walk_rtrmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_inode *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + /* We only care about this inode. */ + if (rec->rm_owner != ri->sc->sm->sm_ino) + return 0; + + if (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) + return -EFSCORRUPTED; + + ri->rt_blocks += rec->rm_blockcount; + ri->rt_extents++; + return 0; +} + +/* Count extents and blocks for an inode from all realtime rmap data. */ +STATIC int +xrep_dinode_count_rtgroup_rmaps( + struct xrep_inode *ri, + struct xfs_rtgroup *rtg) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + error = xrep_rtgroup_init(sc, rtg, &sc->sr, XFS_RTGLOCK_RMAP); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_dinode_walk_rtrmap, + ri); + xchk_rtgroup_btcur_free(&sc->sr); + xchk_rtgroup_free(sc, &sc->sr); + return error; +} + /* Count extents and blocks for a given inode from all rmap data. 
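 * Realtime groups are walked first, filling in rt_extents and rt_blocks from
 * the rtrmap btrees, and then the data-device AGs are walked as before.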
*/ STATIC int xrep_dinode_count_rmaps( struct xrep_inode *ri) { struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; int error; - if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) + if (!xfs_has_rmapbt(ri->sc->mp)) return -EOPNOTSUPP; + while ((rtg = xfs_rtgroup_next(ri->sc->mp, rtg))) { + error = xrep_dinode_count_rtgroup_rmaps(ri, rtg); + if (error) { + xfs_rtgroup_rele(rtg); + return error; + } + } + while ((pag = xfs_perag_next(ri->sc->mp, pag))) { error = xrep_dinode_count_ag_rmaps(ri, pag); if (error) { -- cgit v1.2.3 From 3dd3aba6b92bf8298ff02d839ed3008df7478a8e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:40 -0800 Subject: xfs: repair rmap btree inodes Teach the inode repair code how to deal with realtime rmap btree inodes that won't load properly. This is most likely moot since the filesystem generally won't mount without the rtrmapbt inodes being usable, but we'll add this for completeness. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/inode_repair.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'fs') diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 816e81330ffc..d7e3f033b160 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -944,6 +944,34 @@ xrep_dinode_bad_bmbt_fork( return false; } +/* Return true if this rmap-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_rtrmapbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size) +{ + struct xfs_rtrmap_root *dfp; + unsigned int nrecs; + unsigned int level; + + if (dfork_size < sizeof(struct xfs_rtrmap_root)) + return true; + + dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > sc->mp->m_rtrmap_maxlevels) + return true; + if (xfs_rtrmap_droot_space_calc(level, nrecs) > dfork_size) + return true; + if (level > 0 && nrecs == 0) + return true; + + return false; +} + /* Check a metadata-btree fork. */ STATIC bool xrep_dinode_bad_metabt_fork( @@ -956,6 +984,8 @@ xrep_dinode_bad_metabt_fork( return true; switch (be16_to_cpu(dip->di_metatype)) { + case XFS_METAFILE_RTRMAP: + return xrep_dinode_bad_rtrmapbt_fork(sc, dip, dfork_size); default: return true; } @@ -1220,6 +1250,7 @@ xrep_dinode_ensure_forkoff( uint16_t mode) { struct xfs_bmdr_block *bmdr; + struct xfs_rtrmap_root *rmdr; struct xfs_scrub *sc = ri->sc; xfs_extnum_t attr_extents, data_extents; size_t bmdr_minsz = xfs_bmdr_space_calc(1); @@ -1328,6 +1359,10 @@ xrep_dinode_ensure_forkoff( break; case XFS_DINODE_FMT_META_BTREE: switch (be16_to_cpu(dip->di_metatype)) { + case XFS_METAFILE_RTRMAP: + rmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + dfork_min = xfs_rtrmap_broot_space(sc->mp, rmdr); + break; default: dfork_min = 0; break; -- cgit v1.2.3 From 8defee8dff2b202702cdf33f6d8577adf9ad3e82 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:41 -0800 Subject: xfs: online repair of realtime bitmaps for a realtime group For a given rt group, regenerate the bitmap contents from the group's realtime rmap btree. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtbitmap.h | 9 + fs/xfs/scrub/common.h | 6 + fs/xfs/scrub/repair.c | 2 +- fs/xfs/scrub/repair.h | 1 + fs/xfs/scrub/rtbitmap.c | 5 +- fs/xfs/scrub/rtbitmap.h | 50 +++++ fs/xfs/scrub/rtbitmap_repair.c | 429 +++++++++++++++++++++++++++++++++++++++- fs/xfs/scrub/rtsummary_repair.c | 3 +- fs/xfs/scrub/tempexch.h | 2 +- fs/xfs/scrub/tempfile.c | 20 +- fs/xfs/scrub/trace.c | 1 + fs/xfs/scrub/trace.h | 150 ++++++++++++++ 12 files changed, 659 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 16563a44bd13..22e5d9cd95f4 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -135,6 +135,15 @@ xfs_rtb_to_rtx( return div_u64(rtbno, mp->m_sb.sb_rextsize); } +/* Return the offset of a rtgroup block number within an rt extent. */ +static inline xfs_extlen_t +xfs_rgbno_to_rtxoff( + struct xfs_mount *mp, + xfs_rgblock_t rgbno) +{ + return rgbno % mp->m_sb.sb_rextsize; +} + /* Return the offset of an rt block number within an rt extent. */ static inline xfs_extlen_t xfs_rtb_to_rtxoff( diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 1576467f7244..e5891609af27 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -264,6 +264,12 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc); (sc)->mp->m_super->s_id, \ (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \ ##__VA_ARGS__) +#define xchk_xfile_rtgroup_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): rtgroup 0x%x " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->sa.pag ? \ + rtg_rgno((sc)->sr.rtg) : (sc)->sm->sm_agno, \ + ##__VA_ARGS__) /* * Setting up a hook to wait for intents to drain is costly -- we have to take diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 18946dd46fa7..82fe01d78cb0 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -959,7 +959,7 @@ xrep_ag_init( #ifdef CONFIG_XFS_RT /* Initialize all the btree cursors for a RT repair. 
*/ -static void +void xrep_rtgroup_btcur_init( struct xfs_scrub *sc, struct xchk_rt *sr) diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 584135042d9a..7f493752ea78 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -110,6 +110,7 @@ int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag, #ifdef CONFIG_XFS_RT int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg, struct xchk_rt *sr, unsigned int rtglock_flags); +void xrep_rtgroup_btcur_init(struct xfs_scrub *sc, struct xchk_rt *sr); int xrep_require_rtext_inuse(struct xfs_scrub *sc, xfs_rgblock_t rgbno, xfs_filblks_t len); #else diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 675f4fdd1e67..28c90a31f4c3 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -20,9 +20,11 @@ #include "xfs_sb.h" #include "xfs_rmap.h" #include "xfs_rtrmap_btree.h" +#include "xfs_exchmaps.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" +#include "scrub/tempexch.h" #include "scrub/rtbitmap.h" #include "scrub/btree.h" @@ -38,7 +40,8 @@ xchk_setup_rtbitmap( if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); - rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS); + rtb = kzalloc(struct_size(rtb, words, xchk_rtbitmap_wordcnt(sc)), + XCHK_GFP_FLAGS); if (!rtb) return -ENOMEM; sc->buf = rtb; diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h index dd5b394d9697..fe52b877253d 100644 --- a/fs/xfs/scrub/rtbitmap.h +++ b/fs/xfs/scrub/rtbitmap.h @@ -6,6 +6,20 @@ #ifndef __XFS_SCRUB_RTBITMAP_H__ #define __XFS_SCRUB_RTBITMAP_H__ +/* + * We use an xfile to construct new bitmap blocks for the portion of the + * rtbitmap file that we're replacing. Whereas the ondisk bitmap must be + * accessed through the buffer cache, the xfile bitmap supports direct + * word-level accesses. Therefore, we create a small abstraction for linear + * access. + */ +typedef unsigned long long xrep_wordoff_t; +typedef unsigned int xrep_wordcnt_t; + +/* Mask to round an rtx down to the nearest bitmap word. */ +#define XREP_RTBMP_WORDMASK ((1ULL << XFS_NBWORDLOG) - 1) + + struct xchk_rtbitmap { struct xfs_scrub *sc; @@ -16,12 +30,48 @@ struct xchk_rtbitmap { /* The next free rt group block number that we expect to see. */ xfs_rgblock_t next_free_rgbno; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* stuff for staging a new bitmap */ + struct xfs_rtalloc_args args; + struct xrep_tempexch tempexch; +#endif + + /* The next rtgroup block we expect to see during our rtrmapbt walk. */ + xfs_rgblock_t next_rgbno; + + /* rtgroup lock flags */ + unsigned int rtglock_flags; + + /* rtword position of xfile as we write buffers to disk. */ + xrep_wordoff_t prep_wordoff; + + /* In-Memory rtbitmap for repair. */ + union xfs_rtword_raw words[]; }; #ifdef CONFIG_XFS_ONLINE_REPAIR int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb); + +/* + * How big should the words[] buffer be? + * + * For repairs, we want a full fsblock worth of space so that we can memcpy a + * buffer full of 1s into the xfile bitmap. The xfile bitmap doesn't have + * rtbitmap block headers, so we don't use blockwsize. Scrub doesn't use the + * words buffer at all. 
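+ * For example, with 4 KiB blocks and the usual 32-bit rt words (XFS_WORDLOG
+ * is 2), xchk_rtbitmap_wordcnt() below returns 4096 >> 2 = 1024 words, i.e.
+ * one full block of ones to copy in at a time.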
+ */ +static inline unsigned int +xchk_rtbitmap_wordcnt( + struct xfs_scrub *sc) +{ + if (xchk_could_repair(sc)) + return sc->mp->m_sb.sb_blocksize >> XFS_WORDLOG; + return 0; +} #else # define xrep_setup_rtbitmap(sc, rtb) (0) +# define xchk_rtbitmap_wordcnt(sc) (0) #endif /* CONFIG_XFS_ONLINE_REPAIR */ #endif /* __XFS_SCRUB_RTBITMAP_H__ */ diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c index 0fef98e9f834..c6e33834c5ae 100644 --- a/fs/xfs/scrub/rtbitmap_repair.c +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -12,32 +12,65 @@ #include "xfs_btree.h" #include "xfs_log_format.h" #include "xfs_trans.h" +#include "xfs_rtalloc.h" #include "xfs_inode.h" #include "xfs_bit.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" +#include "xfs_rmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_exchmaps.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_extent_busy.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/xfile.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/reap.h" #include "scrub/rtbitmap.h" -/* Set up to repair the realtime bitmap file metadata. */ +/* rt bitmap content repairs */ + +/* Set up to repair the realtime bitmap for this group. */ int xrep_setup_rtbitmap( struct xfs_scrub *sc, struct xchk_rtbitmap *rtb) { struct xfs_mount *mp = sc->mp; - unsigned long long blocks = 0; + char *descr; + unsigned long long blocks = mp->m_sb.sb_rbmblocks; + int error; + + error = xrep_tempfile_create(sc, S_IFREG); + if (error) + return error; + + /* Create an xfile to hold our reconstructed bitmap. */ + descr = xchk_xfile_rtgroup_descr(sc, "bitmap file"); + error = xfile_create(descr, blocks * mp->m_sb.sb_blocksize, &sc->xfile); + kfree(descr); + if (error) + return error; /* - * Reserve enough blocks to write out a completely new bmbt for a - * maximally fragmented bitmap file. We do not hold the rtbitmap - * ILOCK yet, so this is entirely speculative. + * Reserve enough blocks to write out a completely new bitmap file, + * plus twice as many blocks as we would need if we can only allocate + * one block per data fork mapping. This should cover the + * preallocation of the temporary file and exchanging the extent + * mappings. + * + * We cannot use xfs_exchmaps_estimate because we have not yet + * constructed the replacement bitmap and therefore do not know how + * many extents it will use. By the time we do, we will have a dirty + * transaction (which we cannot drop because we cannot drop the + * rtbitmap ILOCK) and cannot ask for more reservation. */ - blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks); + blocks += xfs_bmbt_calc_size(mp, blocks) * 2; if (blocks > UINT_MAX) return -EOPNOTSUPP; @@ -45,6 +78,304 @@ xrep_setup_rtbitmap( return 0; } +static inline xrep_wordoff_t +rtx_to_wordoff( + struct xfs_mount *mp, + xfs_rtxnum_t rtx) +{ + return rtx >> XFS_NBWORDLOG; +} + +static inline xrep_wordcnt_t +rtxlen_to_wordcnt( + xfs_rtxlen_t rtxlen) +{ + return rtxlen >> XFS_NBWORDLOG; +} + +/* Helper functions to record rtwords in an xfile. 
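+ * xfbmp_load() and xfbmp_store() move a single word; xfbmp_copyin() and
+ * xfbmp_copyout() move runs of words. All of them address the xfile at byte
+ * offset (wordoff << XFS_WORDLOG).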
*/ + +static inline int +xfbmp_load( + struct xchk_rtbitmap *rtb, + xrep_wordoff_t wordoff, + xfs_rtword_t *word) +{ + union xfs_rtword_raw urk; + int error; + + ASSERT(xfs_has_rtgroups(rtb->sc->mp)); + + error = xfile_load(rtb->sc->xfile, &urk, + sizeof(union xfs_rtword_raw), + wordoff << XFS_WORDLOG); + if (error) + return error; + + *word = be32_to_cpu(urk.rtg); + return 0; +} + +static inline int +xfbmp_store( + struct xchk_rtbitmap *rtb, + xrep_wordoff_t wordoff, + const xfs_rtword_t word) +{ + union xfs_rtword_raw urk; + + ASSERT(xfs_has_rtgroups(rtb->sc->mp)); + + urk.rtg = cpu_to_be32(word); + return xfile_store(rtb->sc->xfile, &urk, + sizeof(union xfs_rtword_raw), + wordoff << XFS_WORDLOG); +} + +static inline int +xfbmp_copyin( + struct xchk_rtbitmap *rtb, + xrep_wordoff_t wordoff, + const union xfs_rtword_raw *word, + xrep_wordcnt_t nr_words) +{ + return xfile_store(rtb->sc->xfile, word, nr_words << XFS_WORDLOG, + wordoff << XFS_WORDLOG); +} + +static inline int +xfbmp_copyout( + struct xchk_rtbitmap *rtb, + xrep_wordoff_t wordoff, + union xfs_rtword_raw *word, + xrep_wordcnt_t nr_words) +{ + return xfile_load(rtb->sc->xfile, word, nr_words << XFS_WORDLOG, + wordoff << XFS_WORDLOG); +} + +/* Perform a logical OR operation on an rtword in the incore bitmap. */ +static int +xrep_rtbitmap_or( + struct xchk_rtbitmap *rtb, + xrep_wordoff_t wordoff, + xfs_rtword_t mask) +{ + xfs_rtword_t word; + int error; + + error = xfbmp_load(rtb, wordoff, &word); + if (error) + return error; + + trace_xrep_rtbitmap_or(rtb->sc->mp, wordoff, mask, word); + + return xfbmp_store(rtb, wordoff, word | mask); +} + +/* + * Mark as free every rt extent between the next rt block we expected to see + * in the rtrmap records and the given rt block. + */ +STATIC int +xrep_rtbitmap_mark_free( + struct xchk_rtbitmap *rtb, + xfs_rgblock_t rgbno) +{ + struct xfs_mount *mp = rtb->sc->mp; + struct xfs_rtgroup *rtg = rtb->sc->sr.rtg; + xfs_rtxnum_t startrtx; + xfs_rtxnum_t nextrtx; + xrep_wordoff_t wordoff, nextwordoff; + unsigned int bit; + unsigned int bufwsize; + xfs_extlen_t mod; + xfs_rtword_t mask; + int error; + + if (!xfs_verify_rgbext(rtg, rtb->next_rgbno, rgbno - rtb->next_rgbno)) + return -EFSCORRUPTED; + + /* + * Convert rt blocks to rt extents The block range we find must be + * aligned to an rtextent boundary on both ends. + */ + startrtx = xfs_rgbno_to_rtx(mp, rtb->next_rgbno); + mod = xfs_rgbno_to_rtxoff(mp, rtb->next_rgbno); + if (mod) + return -EFSCORRUPTED; + + nextrtx = xfs_rgbno_to_rtx(mp, rgbno - 1) + 1; + mod = xfs_rgbno_to_rtxoff(mp, rgbno - 1); + if (mod != mp->m_sb.sb_rextsize - 1) + return -EFSCORRUPTED; + + trace_xrep_rtbitmap_record_free(mp, startrtx, nextrtx - 1); + + /* Set bits as needed to round startrtx up to the nearest word. */ + bit = startrtx & XREP_RTBMP_WORDMASK; + if (bit) { + xfs_rtblock_t len = nextrtx - startrtx; + unsigned int lastbit; + + lastbit = min(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + + error = xrep_rtbitmap_or(rtb, rtx_to_wordoff(mp, startrtx), + mask); + if (error || lastbit - bit == len) + return error; + startrtx += XFS_NBWORD - bit; + } + + /* Set bits as needed to round nextrtx down to the nearest word. 
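+	 * For example, with 32-bit rt words, nextrtx == 70 gives bit = 70 & 31 = 6,
+	 * so the mask (1 << 6) - 1 marks rt extents 64 through 69 free in word 2,
+	 * and nextrtx is then rounded down to 64 for the bulk copy that follows.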
*/ + bit = nextrtx & XREP_RTBMP_WORDMASK; + if (bit) { + mask = ((xfs_rtword_t)1 << bit) - 1; + + error = xrep_rtbitmap_or(rtb, rtx_to_wordoff(mp, nextrtx), + mask); + if (error || startrtx + bit == nextrtx) + return error; + nextrtx -= bit; + } + + trace_xrep_rtbitmap_record_free_bulk(mp, startrtx, nextrtx - 1); + + /* Set all the words in between, up to a whole fs block at once. */ + wordoff = rtx_to_wordoff(mp, startrtx); + nextwordoff = rtx_to_wordoff(mp, nextrtx); + bufwsize = mp->m_sb.sb_blocksize >> XFS_WORDLOG; + + while (wordoff < nextwordoff) { + xrep_wordoff_t rem; + xrep_wordcnt_t wordcnt; + + wordcnt = min_t(xrep_wordcnt_t, nextwordoff - wordoff, + bufwsize); + + /* + * Try to keep us aligned to the rtwords buffer to reduce the + * number of xfile writes. + */ + rem = wordoff & (bufwsize - 1); + if (rem) + wordcnt = min_t(xrep_wordcnt_t, wordcnt, + bufwsize - rem); + + error = xfbmp_copyin(rtb, wordoff, rtb->words, wordcnt); + if (error) + return error; + + wordoff += wordcnt; + } + + return 0; +} + +/* Set free space in the rtbitmap based on rtrmapbt records. */ +STATIC int +xrep_rtbitmap_walk_rtrmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xchk_rtbitmap *rtb = priv; + int error = 0; + + if (xchk_should_terminate(rtb->sc, &error)) + return error; + + if (rtb->next_rgbno < rec->rm_startblock) { + error = xrep_rtbitmap_mark_free(rtb, rec->rm_startblock); + if (error) + return error; + } + + rtb->next_rgbno = max(rtb->next_rgbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* + * Walk the rtrmapbt to find all the gaps between records, and mark the gaps + * in the realtime bitmap that we're computing. + */ +STATIC int +xrep_rtbitmap_find_freespace( + struct xchk_rtbitmap *rtb) +{ + struct xfs_scrub *sc = rtb->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_rtgroup *rtg = sc->sr.rtg; + uint64_t blockcount; + int error; + + /* Prepare a buffer of ones so that we can accelerate bulk setting. */ + memset(rtb->words, 0xFF, mp->m_sb.sb_blocksize); + + xrep_rtgroup_btcur_init(sc, &sc->sr); + error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_rtbitmap_walk_rtrmap, + rtb); + if (error) + goto out; + + /* + * Mark as free every possible rt extent from the last one we saw to + * the end of the rt group. 
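+	 * rtg_extents counts rt extents, so multiplying by sb_rextsize yields the
+	 * group size in rtgroup blocks, the unit xrep_rtbitmap_mark_free() expects.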
+ */ + blockcount = rtg->rtg_extents * mp->m_sb.sb_rextsize; + if (rtb->next_rgbno < blockcount) { + error = xrep_rtbitmap_mark_free(rtb, blockcount); + if (error) + goto out; + } + +out: + xchk_rtgroup_btcur_free(&sc->sr); + return error; +} + +static int +xrep_rtbitmap_prep_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp, + void *data) +{ + struct xchk_rtbitmap *rtb = data; + struct xfs_mount *mp = sc->mp; + union xfs_rtword_raw *ondisk; + int error; + + rtb->args.mp = sc->mp; + rtb->args.tp = sc->tp; + rtb->args.rbmbp = bp; + ondisk = xfs_rbmblock_wordptr(&rtb->args, 0); + rtb->args.rbmbp = NULL; + + error = xfbmp_copyout(rtb, rtb->prep_wordoff, ondisk, + mp->m_blockwsize); + if (error) + return error; + + if (xfs_has_rtgroups(sc->mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC); + hdr->rt_owner = cpu_to_be64(sc->ip->i_ino); + hdr->rt_blkno = cpu_to_be64(xfs_buf_daddr(bp)); + hdr->rt_lsn = 0; + uuid_copy(&hdr->rt_uuid, &sc->mp->m_sb.sb_meta_uuid); + bp->b_ops = &xfs_rtbitmap_buf_ops; + } else { + bp->b_ops = &xfs_rtbuf_ops; + } + + rtb->prep_wordoff += mp->m_blockwsize; + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTBITMAP_BUF); + return 0; +} + /* * Make sure that the given range of the data fork of the realtime file is * mapped to written blocks. The caller must ensure that the inode is joined @@ -160,9 +491,18 @@ xrep_rtbitmap( { struct xchk_rtbitmap *rtb = sc->buf; struct xfs_mount *mp = sc->mp; + struct xfs_group *xg = rtg_group(sc->sr.rtg); unsigned long long blocks = 0; + unsigned int busy_gen; int error; + /* We require the realtime rmapbt to rebuild anything. */ + if (!xfs_has_rtrmapbt(sc->mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + /* Impossibly large rtbitmap means we can't touch the filesystem. */ if (rtb->rbmblocks > U32_MAX) return 0; @@ -195,6 +535,79 @@ xrep_rtbitmap( if (error) return error; - /* Fix inconsistent bitmap geometry */ - return xrep_rtbitmap_geometry(sc, rtb); + /* + * Fix inconsistent bitmap geometry. This function returns with a + * clean scrub transaction. + */ + error = xrep_rtbitmap_geometry(sc, rtb); + if (error) + return error; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. + */ + if (!xfs_extent_busy_list_empty(xg, &busy_gen)) { + error = xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0); + if (error) + return error; + } + + /* + * Generate the new rtbitmap data. We don't need the rtbmp information + * once this call is finished. + */ + error = xrep_rtbitmap_find_freespace(rtb); + if (error) + return error; + + /* + * Try to take ILOCK_EXCL of the temporary file. We had better be the + * only ones holding onto this inode, but we can't block while holding + * the rtbitmap file's ILOCK_EXCL. + */ + while (!xrep_tempfile_ilock_nowait(sc)) { + if (xchk_should_terminate(sc, &error)) + return error; + delay(1); + } + + /* + * Make sure we have space allocated for the part of the bitmap + * file that corresponds to this group. We already joined sc->ip. + */ + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + error = xrep_tempfile_prealloc(sc, 0, rtb->rbmblocks); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* Copy the bitmap file that we generated. 
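+	 * xrep_rtbitmap_prep_buf() fills each buffer with one block's worth of
+	 * words from the xfile and, on rtgroups filesystems, stamps the block
+	 * header before setting the buffer type.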
*/ + error = xrep_tempfile_copyin(sc, 0, rtb->rbmblocks, + xrep_rtbitmap_prep_buf, rtb); + if (error) + return error; + error = xrep_tempfile_set_isize(sc, + XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)); + if (error) + return error; + + /* + * Now exchange the data fork contents. We're done with the temporary + * buffer, so we can reuse it for the tempfile exchmaps information. + */ + error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, 0, + rtb->rbmblocks, &rtb->tempexch); + if (error) + return error; + + error = xrep_tempexch_contents(sc, &rtb->tempexch); + if (error) + return error; + + /* Free the old rtbitmap blocks if they're not in use. */ + return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); } diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c index 8198ea84ad70..d593977d70df 100644 --- a/fs/xfs/scrub/rtsummary_repair.c +++ b/fs/xfs/scrub/rtsummary_repair.c @@ -165,7 +165,8 @@ xrep_rtsummary( * Now exchange the contents. Nothing in repair uses the temporary * buffer, so we can reuse it for the tempfile exchrange information. */ - error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, &rts->tempexch); + error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, 0, + rts->rsumblocks, &rts->tempexch); if (error) return error; diff --git a/fs/xfs/scrub/tempexch.h b/fs/xfs/scrub/tempexch.h index 995ba187c5aa..eccda720c2ca 100644 --- a/fs/xfs/scrub/tempexch.h +++ b/fs/xfs/scrub/tempexch.h @@ -12,7 +12,7 @@ struct xrep_tempexch { }; int xrep_tempexch_trans_reserve(struct xfs_scrub *sc, int whichfork, - struct xrep_tempexch *ti); + xfs_fileoff_t off, xfs_filblks_t len, struct xrep_tempexch *ti); int xrep_tempexch_trans_alloc(struct xfs_scrub *sc, int whichfork, struct xrep_tempexch *ti); diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index 4ebb5f8459e8..cf99e0ca51b0 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -606,6 +606,8 @@ STATIC int xrep_tempexch_prep_request( struct xfs_scrub *sc, int whichfork, + xfs_fileoff_t off, + xfs_filblks_t len, struct xrep_tempexch *tx) { struct xfs_exchmaps_req *req = &tx->req; @@ -629,18 +631,19 @@ xrep_tempexch_prep_request( /* Exchange all mappings in both forks. */ req->ip1 = sc->tempip; req->ip2 = sc->ip; - req->startoff1 = 0; - req->startoff2 = 0; + req->startoff1 = off; + req->startoff2 = off; switch (whichfork) { case XFS_ATTR_FORK: req->flags |= XFS_EXCHMAPS_ATTR_FORK; break; case XFS_DATA_FORK: - /* Always exchange sizes when exchanging data fork mappings. */ - req->flags |= XFS_EXCHMAPS_SET_SIZES; + /* Exchange sizes when exchanging all data fork mappings. 
*/ + if (off == 0 && len == XFS_MAX_FILEOFF) + req->flags |= XFS_EXCHMAPS_SET_SIZES; break; } - req->blockcount = XFS_MAX_FILEOFF; + req->blockcount = len; return 0; } @@ -796,6 +799,8 @@ int xrep_tempexch_trans_reserve( struct xfs_scrub *sc, int whichfork, + xfs_fileoff_t off, + xfs_filblks_t len, struct xrep_tempexch *tx) { int error; @@ -804,7 +809,7 @@ xrep_tempexch_trans_reserve( xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL); xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL); - error = xrep_tempexch_prep_request(sc, whichfork, tx); + error = xrep_tempexch_prep_request(sc, whichfork, off, len, tx); if (error) return error; @@ -842,7 +847,8 @@ xrep_tempexch_trans_alloc( ASSERT(sc->tp == NULL); ASSERT(xfs_has_exchange_range(sc->mp)); - error = xrep_tempexch_prep_request(sc, whichfork, tx); + error = xrep_tempexch_prep_request(sc, whichfork, 0, XFS_MAX_FILEOFF, + tx); if (error) return error; diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 98f923ae664d..2450e214103f 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -21,6 +21,7 @@ #include "xfs_rmap.h" #include "xfs_parent.h" #include "xfs_metafile.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 5afc440f22f5..3b661e444345 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -17,6 +17,7 @@ #include "xfs_bit.h" #include "xfs_quota_defs.h" +struct xfs_rtgroup; struct xfs_scrub; struct xfile; struct xfarray; @@ -3607,6 +3608,155 @@ DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_try_unlink); DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_unlink); DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_link); +#ifdef CONFIG_XFS_RT +DECLARE_EVENT_CLASS(xrep_rtbitmap_class, + TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, xfs_rtxnum_t end), + TP_ARGS(mp, start, end), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_rtxnum_t, start) + __field(xfs_rtxnum_t, end) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->start = start; + __entry->end = end; + ), + TP_printk("dev %d:%d rtdev %d:%d startrtx 0x%llx endrtx 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->start, + __entry->end) +); +#define DEFINE_REPAIR_RGBITMAP_EVENT(name) \ +DEFINE_EVENT(xrep_rtbitmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, \ + xfs_rtxnum_t end), \ + TP_ARGS(mp, start, end)) +DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rtbitmap_record_free); +DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rtbitmap_record_free_bulk); + +TRACE_EVENT(xrep_rtbitmap_or, + TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff, + xfs_rtword_t mask, xfs_rtword_t word), + TP_ARGS(mp, wordoff, mask, word), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(unsigned long long, wordoff) + __field(unsigned int, mask) + __field(unsigned int, word) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->wordoff = wordoff; + __entry->mask = mask; + __entry->word = word; + ), + TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx mask 0x%x word 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->wordoff, + __entry->mask, + __entry->word) +); + +TRACE_EVENT(xrep_rtbitmap_load, + TP_PROTO(struct xfs_rtgroup *rtg, xfs_fileoff_t rbmoff, + xfs_rtxnum_t rtx, xfs_rtxnum_t len), + 
TP_ARGS(rtg, rbmoff, rtx, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_fileoff_t, rbmoff) + __field(xfs_rtxnum_t, rtx) + __field(xfs_rtxnum_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rtdev = rtg_mount(rtg)->m_rtdev_targp->bt_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->rbmoff = rbmoff; + __entry->rtx = rtx; + __entry->len = len; + ), + TP_printk("dev %d:%d rtdev %d:%d rgno 0x%x rbmoff 0x%llx rtx 0x%llx rtxcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->rgno, + __entry->rbmoff, + __entry->rtx, + __entry->len) +); + +TRACE_EVENT(xrep_rtbitmap_load_words, + TP_PROTO(struct xfs_mount *mp, xfs_fileoff_t rbmoff, + unsigned long long wordoff, unsigned int wordcnt), + TP_ARGS(mp, rbmoff, wordoff, wordcnt), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_fileoff_t, rbmoff) + __field(unsigned long long, wordoff) + __field(unsigned int, wordcnt) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->rbmoff = rbmoff; + __entry->wordoff = wordoff; + __entry->wordcnt = wordcnt; + ), + TP_printk("dev %d:%d rtdev %d:%d rbmoff 0x%llx wordoff 0x%llx wordcnt 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->rbmoff, + __entry->wordoff, + __entry->wordcnt) +); + +TRACE_EVENT(xrep_rtbitmap_load_word, + TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff, + unsigned int bit, xfs_rtword_t ondisk_word, + xfs_rtword_t xfile_word, xfs_rtword_t word_mask), + TP_ARGS(mp, wordoff, bit, ondisk_word, xfile_word, word_mask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(unsigned long long, wordoff) + __field(unsigned int, bit) + __field(xfs_rtword_t, ondisk_word) + __field(xfs_rtword_t, xfile_word) + __field(xfs_rtword_t, word_mask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->wordoff = wordoff; + __entry->bit = bit; + __entry->ondisk_word = ondisk_word; + __entry->xfile_word = xfile_word; + __entry->word_mask = word_mask; + ), + TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx bit %u ondisk 0x%x(0x%x) inmem 0x%x(0x%x) result 0x%x mask 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->wordoff, + __entry->bit, + __entry->ondisk_word, + __entry->ondisk_word & __entry->word_mask, + __entry->xfile_word, + __entry->xfile_word & ~__entry->word_mask, + (__entry->xfile_word & ~__entry->word_mask) | + (__entry->ondisk_word & __entry->word_mask), + __entry->word_mask) +); +#endif /* CONFIG_XFS_RT */ + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ -- cgit v1.2.3 From c6904f6788b79e8d2275a55cbb71b4992be29d92 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:42 -0800 Subject: xfs: support repairing metadata btrees rooted in metadir inodes Adapt the repair code so that we can stage a new btree in the data fork area of a metadir inode and reap the old blocks. We already have nearly all of the infrastructure; the only parts that were missing were the metadata inode reservation handling. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/newbt.c | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/newbt.h | 1 + fs/xfs/scrub/reap.c | 41 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/reap.h | 2 ++ 4 files changed, 86 insertions(+) (limited to 'fs') diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 70af27d98734..ac38f5843090 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -19,6 +19,8 @@ #include "xfs_rmap.h" #include "xfs_ag.h" #include "xfs_defer.h" +#include "xfs_metafile.h" +#include "xfs_quota.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -120,6 +122,43 @@ xrep_newbt_init_inode( return 0; } +/* + * Initialize accounting resources for staging a new metadata inode btree. + * If the metadata file has a space reservation, the caller must adjust that + * reservation when committing the new ondisk btree. + */ +int +xrep_newbt_init_metadir_inode( + struct xrep_newbt *xnr, + struct xfs_scrub *sc) +{ + struct xfs_owner_info oinfo; + struct xfs_ifork *ifp; + + ASSERT(xfs_is_metadir_inode(sc->ip)); + + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); + + ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); + if (!ifp) + return -ENOMEM; + + /* + * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the + * inode metadata space reservations can only account allocated space + * to the i_nblocks. We do not want to change the inode core fields + * until we're ready to commit the new tree, so we allocate the blocks + * as if they were regular file blocks. This exposes us to a higher + * risk of the repair being cancelled due to ENOSPC. + */ + xrep_newbt_init_ag(xnr, sc, &oinfo, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), + XFS_AG_RESV_NONE); + xnr->ifake.if_fork = ifp; + xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK); + return 0; +} + /* * Initialize accounting resources for staging a new btree. Callers are * expected to add their own reservations (and clean them up) manually. 
@@ -224,6 +263,7 @@ xrep_newbt_alloc_ag_blocks( int error = 0; ASSERT(sc->sa.pag != NULL); + ASSERT(xnr->resv != XFS_AG_RESV_METAFILE); while (nr_blocks > 0) { struct xfs_alloc_arg args = { @@ -297,6 +337,8 @@ xrep_newbt_alloc_file_blocks( struct xfs_mount *mp = sc->mp; int error = 0; + ASSERT(xnr->resv != XFS_AG_RESV_METAFILE); + while (nr_blocks > 0) { struct xfs_alloc_arg args = { .tp = sc->tp, diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h index 3d804d31af24..5ce785599287 100644 --- a/fs/xfs/scrub/newbt.h +++ b/fs/xfs/scrub/newbt.h @@ -63,6 +63,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc, enum xfs_ag_resv_type resv); int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc, int whichfork, const struct xfs_owner_info *oinfo); +int xrep_newbt_init_metadir_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc); int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks); int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len); diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 08230952053b..4d7f1b82dc55 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -33,6 +33,7 @@ #include "xfs_attr.h" #include "xfs_attr_remote.h" #include "xfs_defer.h" +#include "xfs_metafile.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -390,6 +391,8 @@ xreap_agextent_iter( xfs_fsblock_t fsbno; int error = 0; + ASSERT(rs->resv != XFS_AG_RESV_METAFILE); + fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno); /* @@ -675,6 +678,44 @@ xrep_reap_fsblocks( return 0; } +/* + * Dispose of every block of an old metadata btree that used to be rooted in a + * metadata directory file. + */ +int +xrep_reap_metadir_fsblocks( + struct xfs_scrub *sc, + struct xfsb_bitmap *bitmap) +{ + /* + * Reap old metadir btree blocks with XFS_AG_RESV_NONE because the old + * blocks are no longer mapped by the inode, and inode metadata space + * reservations can only account freed space to the i_nblocks. + */ + struct xfs_owner_info oinfo; + struct xreap_state rs = { + .sc = sc, + .oinfo = &oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + ASSERT(xfs_is_metadir_inode(sc->ip)); + + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); + + error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} + /* * Metadata files are not supposed to share blocks with anything else. * If blocks are shared, we remove the reverse mapping (thus reducing the diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index 3f2f1775e29d..70e5e6bbb8d3 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -14,6 +14,8 @@ int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap, const struct xfs_owner_info *oinfo); int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork); +int xrep_reap_metadir_fsblocks(struct xfs_scrub *sc, + struct xfsb_bitmap *bitmap); /* Buffer cache scan context. */ struct xrep_bufscan { -- cgit v1.2.3 From 6a849bd81b69ccbda5b766cc700f0be86194e4d1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:42 -0800 Subject: xfs: online repair of the realtime rmap btree Repair the realtime rmap btree while mounted. 
Similar to the regular rmap btree repair code, we walk the data fork mappings of every realtime file in the filesystem to collect reverse-mapping records in an xfarray. Then we sort the xfarray, and use the btree bulk loader to create a new rtrmap btree ondisk. Finally, we swap the btree roots, and reap the old blocks in the usual way. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_btree_staging.c | 1 + fs/xfs/libxfs/xfs_rtrmap_btree.c | 2 +- fs/xfs/libxfs/xfs_rtrmap_btree.h | 3 + fs/xfs/scrub/common.c | 4 +- fs/xfs/scrub/common.h | 2 + fs/xfs/scrub/repair.c | 144 ++++++++ fs/xfs/scrub/repair.h | 14 + fs/xfs/scrub/rtrmap.c | 7 + fs/xfs/scrub/rtrmap_repair.c | 733 ++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/scrub.c | 2 +- fs/xfs/scrub/trace.h | 57 +++ 12 files changed, 966 insertions(+), 4 deletions(-) create mode 100644 fs/xfs/scrub/rtrmap_repair.c (limited to 'fs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 136a465e00d2..338e10f81b7b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -234,6 +234,7 @@ xfs-y += $(addprefix scrub/, \ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtbitmap_repair.o \ + rtrmap_repair.o \ rtsummary_repair.o \ ) diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 58c146b5c9d4..5ed84f9cc877 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -134,6 +134,7 @@ xfs_btree_stage_ifakeroot( cur->bc_ino.ifake = ifake; cur->bc_nlevels = ifake->if_levels; cur->bc_ino.forksize = ifake->if_fork_size; + cur->bc_ino.whichfork = XFS_STAGING_FORK; cur->bc_flags |= XFS_BTREE_STAGING; } diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index 19e5109c3683..6bf1b253c247 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -660,7 +660,7 @@ xfs_rtrmapbt_compute_maxlevels( } /* Calculate the rtrmap btree size for some records. */ -static unsigned long long +unsigned long long xfs_rtrmapbt_calc_size( struct xfs_mount *mp, unsigned long long len) diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h index bf73460be274..ad76ac7938b6 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.h +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h @@ -198,4 +198,7 @@ int xfs_rtrmapbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, int xfs_rtrmapbt_init_rtsb(struct xfs_mount *mp, struct xfs_rtgroup *rtg, struct xfs_trans *tp); +unsigned long long xfs_rtrmapbt_calc_size(struct xfs_mount *mp, + unsigned long long len); + #endif /* __XFS_RTRMAP_BTREE_H__ */ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index ca43dd4f52b2..06cb61e63498 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -819,7 +819,7 @@ xchk_rtgroup_btcur_free( * Unlock the realtime group. This must be done /after/ committing (or * cancelling) the scrub transaction. */ -static void +void xchk_rtgroup_unlock( struct xchk_rt *sr) { @@ -904,7 +904,7 @@ int xchk_setup_rt( struct xfs_scrub *sc) { - return xchk_trans_alloc(sc, 0); + return xchk_trans_alloc(sc, xrep_calc_rtgroup_resblks(sc)); } /* Set us up with AG headers and btree cursors. 
*/ diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index e5891609af27..50ac6cca18fe 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -147,12 +147,14 @@ xchk_rtgroup_init_existing( int xchk_rtgroup_lock(struct xfs_scrub *sc, struct xchk_rt *sr, unsigned int rtglock_flags); +void xchk_rtgroup_unlock(struct xchk_rt *sr); void xchk_rtgroup_btcur_free(struct xchk_rt *sr); void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr); #else # define xchk_rtgroup_init(sc, rgno, sr) (-EFSCORRUPTED) # define xchk_rtgroup_init_existing(sc, rgno, sr) (-EFSCORRUPTED) # define xchk_rtgroup_lock(sc, sr, lockflags) (-EFSCORRUPTED) +# define xchk_rtgroup_unlock(sr) do { } while (0) # define xchk_rtgroup_btcur_free(sr) do { } while (0) # define xchk_rtgroup_free(sc, sr) do { } while (0) #endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 82fe01d78cb0..61e414c81253 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -40,6 +40,8 @@ #include "xfs_rtrmap_btree.h" #include "xfs_rtbitmap.h" #include "xfs_rtgroup.h" +#include "xfs_rtalloc.h" +#include "xfs_metafile.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -382,6 +384,41 @@ xrep_calc_ag_resblks( return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz)); } +#ifdef CONFIG_XFS_RT +/* + * Figure out how many blocks to reserve for a rtgroup repair. We calculate + * the worst case estimate for the number of blocks we'd need to rebuild one of + * any type of per-rtgroup btree. + */ +xfs_extlen_t +xrep_calc_rtgroup_resblks( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_scrub_metadata *sm = sc->sm; + uint64_t usedlen; + xfs_extlen_t rmapbt_sz = 0; + + if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + return 0; + if (!xfs_has_rtgroups(mp)) { + ASSERT(0); + return -EFSCORRUPTED; + } + + usedlen = xfs_rtbxlen_to_blen(mp, xfs_rtgroup_extents(mp, sm->sm_agno)); + ASSERT(usedlen <= XFS_MAX_RGBLOCKS); + + if (xfs_has_rmapbt(mp)) + rmapbt_sz = xfs_rtrmapbt_calc_size(mp, usedlen); + + trace_xrep_calc_rtgroup_resblks_btsize(mp, sm->sm_agno, usedlen, + rmapbt_sz); + + return rmapbt_sz; +} +#endif /* CONFIG_XFS_RT */ + /* * Reconstructing per-AG Btrees * @@ -1284,3 +1321,110 @@ xrep_buf_verify_struct( return fa == NULL; } + +/* Check the sanity of a rmap record for a metadata btree inode. */ +int +xrep_check_ino_btree_mapping( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + /* + * Metadata btree inodes never have extended attributes, and all blocks + * should have the bmbt block flag set. + */ + if ((rec->rm_flags & XFS_RMAP_ATTR_FORK) || + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + return -EFSCORRUPTED; + + /* Make sure the block is within the AG. */ + if (!xfs_verify_agbext(sc->sa.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* + * Reset the block count of the inode being repaired, and adjust the dquot + * block usage to match. The inode must not have an xattr fork. 
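+ * The dquot adjustment below is the signed difference, so shrinking the block
+ * count credits the quota just as growing it charges it; the inode core is
+ * logged either way.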
+ */ +void +xrep_inode_set_nblocks( + struct xfs_scrub *sc, + int64_t new_blocks) +{ + int64_t delta = + new_blocks - sc->ip->i_nblocks; + + sc->ip->i_nblocks = new_blocks; + + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + if (delta != 0) + xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, + delta); +} + +/* Reset the block reservation for a metadata inode. */ +int +xrep_reset_metafile_resv( + struct xfs_scrub *sc) +{ + struct xfs_inode *ip = sc->ip; + int64_t delta; + int error; + + delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked; + if (delta == 0) + return 0; + + /* + * Too many blocks have been reserved, transfer some from the incore + * reservation back to the filesystem. + */ + if (delta > 0) { + int64_t give_back; + + give_back = min_t(uint64_t, delta, ip->i_delayed_blks); + if (give_back > 0) { + xfs_mod_delalloc(ip, 0, -give_back); + xfs_add_fdblocks(ip->i_mount, give_back); + ip->i_delayed_blks -= give_back; + } + + return 0; + } + + /* + * Not enough reservation; try to take some blocks from the filesystem + * to the metadata inode. @delta is negative here, so invert the sign. + */ + delta = -delta; + error = xfs_dec_fdblocks(sc->mp, delta, true); + while (error == -ENOSPC) { + delta--; + if (delta == 0) { + xfs_warn(sc->mp, +"Insufficient free space to reset space reservation for inode 0x%llx after repair.", + ip->i_ino); + return 0; + } + error = xfs_dec_fdblocks(sc->mp, delta, true); + } + if (error) + return error; + + xfs_mod_delalloc(ip, 0, delta); + ip->i_delayed_blks += delta; + return 0; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 7f493752ea78..ac5962732d26 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -97,6 +97,7 @@ int xrep_setup_parent(struct xfs_scrub *sc); int xrep_setup_nlinks(struct xfs_scrub *sc); int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks); int xrep_setup_dirtree(struct xfs_scrub *sc); +int xrep_setup_rtrmapbt(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); @@ -113,10 +114,15 @@ int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg, void xrep_rtgroup_btcur_init(struct xfs_scrub *sc, struct xchk_rt *sr); int xrep_require_rtext_inuse(struct xfs_scrub *sc, xfs_rgblock_t rgbno, xfs_filblks_t len); +xfs_extlen_t xrep_calc_rtgroup_resblks(struct xfs_scrub *sc); #else # define xrep_rtgroup_init(sc, rtg, sr, lockflags) (-ENOSYS) +# define xrep_calc_rtgroup_resblks(sc) (0) #endif /* CONFIG_XFS_RT */ +int xrep_check_ino_btree_mapping(struct xfs_scrub *sc, + const struct xfs_rmap_irec *rec); + /* Metadata revalidators */ int xrep_revalidate_allocbt(struct xfs_scrub *sc); @@ -150,10 +156,12 @@ int xrep_metapath(struct xfs_scrub *sc); int xrep_rtbitmap(struct xfs_scrub *sc); int xrep_rtsummary(struct xfs_scrub *sc); int xrep_rgsuperblock(struct xfs_scrub *sc); +int xrep_rtrmapbt(struct xfs_scrub *sc); #else # define xrep_rtbitmap xrep_notsupported # define xrep_rtsummary xrep_notsupported # define xrep_rgsuperblock xrep_notsupported +# define xrep_rtrmapbt xrep_notsupported #endif /* CONFIG_XFS_RT */ #ifdef CONFIG_XFS_QUOTA @@ -172,6 +180,8 @@ int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep, void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp); bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +void xrep_inode_set_nblocks(struct xfs_scrub *sc, int64_t new_blocks); +int xrep_reset_metafile_resv(struct xfs_scrub *sc); #else @@ -195,6 
+205,8 @@ xrep_calc_ag_resblks( return 0; } +#define xrep_calc_rtgroup_resblks xrep_calc_ag_resblks + static inline int xrep_reset_perag_resv( struct xfs_scrub *sc) @@ -222,6 +234,7 @@ xrep_setup_nothing( #define xrep_setup_nlinks xrep_setup_nothing #define xrep_setup_dirtree xrep_setup_nothing #define xrep_setup_metapath xrep_setup_nothing +#define xrep_setup_rtrmapbt xrep_setup_nothing #define xrep_setup_inode(sc, imap) ((void)0) @@ -259,6 +272,7 @@ static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x) #define xrep_dirtree xrep_notsupported #define xrep_metapath xrep_notsupported #define xrep_rgsuperblock xrep_notsupported +#define xrep_rtrmapbt xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c index 764fa2967922..300a1e85b3d6 100644 --- a/fs/xfs/scrub/rtrmap.c +++ b/fs/xfs/scrub/rtrmap.c @@ -27,6 +27,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "scrub/repair.h" /* Set us up with the realtime metadata locked. */ int @@ -38,6 +39,12 @@ xchk_setup_rtrmapbt( if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + if (xchk_could_repair(sc)) { + error = xrep_setup_rtrmapbt(sc); + if (error) + return error; + } + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); if (error) return error; diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c new file mode 100644 index 000000000000..60e317725dea --- /dev/null +++ b/fs/xfs/scrub/rtrmap_repair.c @@ -0,0 +1,733 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_quota.h" +#include "xfs_rtalloc.h" +#include "xfs_ag.h" +#include "xfs_rtgroup.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Realtime Reverse Mapping Btree Repair + * ===================================== + * + * This isn't quite as difficult as repairing the rmap btree on the data + * device, since we only store the data fork extents of realtime files on the + * realtime device. We still have to freeze the filesystem and stop the + * background threads like we do for the rmap repair, but we only have to scan + * realtime inodes. + * + * Collecting entries for the new realtime rmap btree is easy -- all we have + * to do is generate rtrmap entries from the data fork mappings of all realtime + * files in the filesystem. We then scan the rmap btrees of the data device + * looking for extents belonging to the old btree and note them in a bitmap. 
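+ * Only data fork mappings can show up in that file scan, since xattr and bmbt
+ * blocks never live on the realtime device; the lone exception we add by hand
+ * is an XFS_RMAP_OWN_FS record for the rt superblock in group 0.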
+ * + * To rebuild the realtime rmap btree, we bulk-load the collected mappings into + * a new btree cursor and atomically swap that into the realtime inode. Then + * we can free the blocks from the old btree. + * + * We use the 'xrep_rtrmap' prefix for all the rmap functions. + */ + +/* + * Packed rmap record. The UNWRITTEN flags are hidden in the upper bits of + * offset, just like the on-disk record. + */ +struct xrep_rtrmap_extent { + xfs_rgblock_t startblock; + xfs_extlen_t blockcount; + uint64_t owner; + uint64_t offset; +} __packed; + +/* Context for collecting rmaps */ +struct xrep_rtrmap { + /* new rtrmapbt information */ + struct xrep_newbt new_btree; + + /* rmap records generated from primary metadata */ + struct xfarray *rtrmap_records; + + struct xfs_scrub *sc; + + /* bitmap of old rtrmapbt blocks */ + struct xfsb_bitmap old_rtrmapbt_blocks; + + /* inode scan cursor */ + struct xchk_iscan iscan; + + /* get_records()'s position in the free space record array. */ + xfarray_idx_t array_cur; +}; + +/* Set us up to repair rt reverse mapping btrees. */ +int +xrep_setup_rtrmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rtrmap *rr; + + rr = kzalloc(sizeof(struct xrep_rtrmap), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + + rr->sc = sc; + sc->buf = rr; + return 0; +} + +/* Make sure there's nothing funny about this mapping. */ +STATIC int +xrep_rtrmap_check_mapping( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *rec) +{ + if (xfs_rtrmap_check_irec(sc->sr.rtg, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + return xrep_require_rtext_inuse(sc, rec->rm_startblock, + rec->rm_blockcount); +} + +/* Store a reverse-mapping record. */ +static inline int +xrep_rtrmap_stash( + struct xrep_rtrmap *rr, + xfs_rgblock_t startblock, + xfs_extlen_t blockcount, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + struct xrep_rtrmap_extent rre = { + .startblock = startblock, + .blockcount = blockcount, + .owner = owner, + }; + struct xfs_rmap_irec rmap = { + .rm_startblock = startblock, + .rm_blockcount = blockcount, + .rm_owner = owner, + .rm_offset = offset, + .rm_flags = flags, + }; + struct xfs_scrub *sc = rr->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + trace_xrep_rtrmap_found(sc->mp, &rmap); + + rre.offset = xfs_rmap_irec_offset_pack(&rmap); + return xfarray_append(rr->rtrmap_records, &rre); +} + +/* Finding all file and bmbt extents. */ + +/* Context for accumulating rmaps for an inode fork. */ +struct xrep_rtrmap_ifork { + /* + * Accumulate rmap data here to turn multiple adjacent bmaps into a + * single rmap. + */ + struct xfs_rmap_irec accum; + + struct xrep_rtrmap *rr; +}; + +/* Stash an rmap that we accumulated while walking an inode fork. */ +STATIC int +xrep_rtrmap_stash_accumulated( + struct xrep_rtrmap_ifork *rf) +{ + if (rf->accum.rm_blockcount == 0) + return 0; + + return xrep_rtrmap_stash(rf->rr, rf->accum.rm_startblock, + rf->accum.rm_blockcount, rf->accum.rm_owner, + rf->accum.rm_offset, rf->accum.rm_flags); +} + +/* Accumulate a bmbt record. 
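+ * For instance, if the accumulated rmap ends at file offset 8 and rgbno 24,
+ * a bmbt record starting at offset 8, rgbno 24 with the same unwritten state
+ * simply extends the accumulated length instead of being stashed separately.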
*/ +STATIC int +xrep_rtrmap_visit_bmbt( + struct xfs_btree_cur *cur, + struct xfs_bmbt_irec *rec, + void *priv) +{ + struct xrep_rtrmap_ifork *rf = priv; + struct xfs_rmap_irec *accum = &rf->accum; + struct xfs_mount *mp = rf->rr->sc->mp; + xfs_rgblock_t rgbno; + unsigned int rmap_flags = 0; + int error; + + if (xfs_rtb_to_rgno(mp, rec->br_startblock) != + rtg_rgno(rf->rr->sc->sr.rtg)) + return 0; + + if (rec->br_state == XFS_EXT_UNWRITTEN) + rmap_flags |= XFS_RMAP_UNWRITTEN; + + /* If this bmap is adjacent to the previous one, just add it. */ + rgbno = xfs_rtb_to_rgbno(mp, rec->br_startblock); + if (accum->rm_blockcount > 0 && + rec->br_startoff == accum->rm_offset + accum->rm_blockcount && + rgbno == accum->rm_startblock + accum->rm_blockcount && + rmap_flags == accum->rm_flags) { + accum->rm_blockcount += rec->br_blockcount; + return 0; + } + + /* Otherwise stash the old rmap and start accumulating a new one. */ + error = xrep_rtrmap_stash_accumulated(rf); + if (error) + return error; + + accum->rm_startblock = rgbno; + accum->rm_blockcount = rec->br_blockcount; + accum->rm_offset = rec->br_startoff; + accum->rm_flags = rmap_flags; + return 0; +} + +/* + * Iterate the block mapping btree to collect rmap records for anything in this + * fork that maps to the rt volume. Sets @mappings_done to true if we've + * scanned the block mappings in this fork. + */ +STATIC int +xrep_rtrmap_scan_bmbt( + struct xrep_rtrmap_ifork *rf, + struct xfs_inode *ip, + bool *mappings_done) +{ + struct xrep_rtrmap *rr = rf->rr; + struct xfs_btree_cur *cur; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + int error = 0; + + *mappings_done = false; + + /* + * If the incore extent cache is already loaded, we'll just use the + * incore extent scanner to record mappings. Don't bother walking the + * ondisk extent tree. + */ + if (!xfs_need_iread_extents(ifp)) + return 0; + + /* Accumulate all the mappings in the bmap btree. */ + cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, XFS_DATA_FORK); + error = xfs_bmap_query_all(cur, xrep_rtrmap_visit_bmbt, rf); + xfs_btree_del_cursor(cur, error); + if (error) + return error; + + /* Stash any remaining accumulated rmaps and exit. */ + *mappings_done = true; + return xrep_rtrmap_stash_accumulated(rf); +} + +/* + * Iterate the in-core extent cache to collect rmap records for anything in + * this fork that matches the AG. + */ +STATIC int +xrep_rtrmap_scan_iext( + struct xrep_rtrmap_ifork *rf, + struct xfs_ifork *ifp) +{ + struct xfs_bmbt_irec rec; + struct xfs_iext_cursor icur; + int error; + + for_each_xfs_iext(ifp, &icur, &rec) { + if (isnullstartblock(rec.br_startblock)) + continue; + error = xrep_rtrmap_visit_bmbt(NULL, &rec, rf); + if (error) + return error; + } + + return xrep_rtrmap_stash_accumulated(rf); +} + +/* Find all the extents on the realtime device mapped by an inode fork. */ +STATIC int +xrep_rtrmap_scan_dfork( + struct xrep_rtrmap *rr, + struct xfs_inode *ip) +{ + struct xrep_rtrmap_ifork rf = { + .accum = { .rm_owner = ip->i_ino, }, + .rr = rr, + }; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + int error = 0; + + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + bool mappings_done; + + /* + * Scan the bmbt for mappings. If the incore extent tree is + * loaded, we want to scan the cached mappings since that's + * faster when the extent counts are very high. 
+ */ + error = xrep_rtrmap_scan_bmbt(&rf, ip, &mappings_done); + if (error || mappings_done) + return error; + } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { + /* realtime data forks should only be extents or btree */ + return -EFSCORRUPTED; + } + + /* Scan incore extent cache. */ + return xrep_rtrmap_scan_iext(&rf, ifp); +} + +/* Record reverse mappings for a file. */ +STATIC int +xrep_rtrmap_scan_inode( + struct xrep_rtrmap *rr, + struct xfs_inode *ip) +{ + unsigned int lock_mode; + int error = 0; + + /* Skip the rt rmap btree inode. */ + if (rr->sc->ip == ip) + return 0; + + lock_mode = xfs_ilock_data_map_shared(ip); + + /* Check the data fork if it's on the realtime device. */ + if (XFS_IS_REALTIME_INODE(ip)) { + error = xrep_rtrmap_scan_dfork(rr, ip); + if (error) + goto out_unlock; + } + + xchk_iscan_mark_visited(&rr->iscan, ip); +out_unlock: + xfs_iunlock(ip, lock_mode); + return error; +} + +/* Record extents that belong to the realtime rmap inode. */ +STATIC int +xrep_rtrmap_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rtrmap *rr = priv; + int error = 0; + + if (xchk_should_terminate(rr->sc, &error)) + return error; + + /* Skip extents which are not owned by this inode and fork. */ + if (rec->rm_owner != rr->sc->ip->i_ino) + return 0; + + error = xrep_check_ino_btree_mapping(rr->sc, rec); + if (error) + return error; + + return xfsb_bitmap_set(&rr->old_rtrmapbt_blocks, + xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock), + rec->rm_blockcount); +} + +/* Scan one AG for reverse mappings for the realtime rmap btree. */ +STATIC int +xrep_rtrmap_scan_ag( + struct xrep_rtrmap *rr, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rr->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrmap_walk_rmap, rr); + xchk_ag_free(sc, &sc->sa); + return error; +} + +/* Generate all the reverse-mappings for the realtime device. */ +STATIC int +xrep_rtrmap_find_rmaps( + struct xrep_rtrmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = NULL; + struct xfs_inode *ip; + int error; + + /* Generate rmaps for the realtime superblock */ + if (xfs_has_rtsb(sc->mp) && rtg_rgno(rr->sc->sr.rtg) == 0) { + error = xrep_rtrmap_stash(rr, 0, sc->mp->m_sb.sb_rextsize, + XFS_RMAP_OWN_FS, 0, 0); + if (error) + return error; + } + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Unlock the realtime metadata inodes and cancel the transaction to + * release the log grant space while we scan the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. + * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done even though + * we take the IOLOCK to quiesce the file because empty transactions + * do not take sb_internal. 
+ */ + xchk_trans_cancel(sc); + xchk_rtgroup_unlock(&sc->sr); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) { + error = xrep_rtrmap_scan_inode(rr, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&rr->iscan); + if (error) + return error; + + /* + * Switch out for a real transaction and lock the RT metadata in + * preparation for building a new tree. + */ + xchk_trans_cancel(sc); + error = xchk_setup_rt(sc); + if (error) + return error; + error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); + if (error) + return error; + + /* Scan for old rtrmap blocks. */ + while ((pag = xfs_perag_next(sc->mp, pag))) { + error = xrep_rtrmap_scan_ag(rr, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + return 0; +} + +/* Building the new rtrmap btree. */ + +/* Retrieve rtrmapbt data for bulk load. */ +STATIC int +xrep_rtrmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xrep_rtrmap_extent rec; + struct xfs_rmap_irec *irec = &cur->bc_rec.r; + struct xrep_rtrmap *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load_next(rr->rtrmap_records, &rr->array_cur, + &rec); + if (error) + return error; + + irec->rm_startblock = rec.startblock; + irec->rm_blockcount = rec.blockcount; + irec->rm_owner = rec.owner; + + if (xfs_rmap_irec_offset_unpack(rec.offset, irec) != NULL) + return -EFSCORRUPTED; + + error = xrep_rtrmap_check_mapping(rr->sc, irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_rtrmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_rtrmap *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_rtrmap_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + return xfs_rtrmap_broot_space_calc(cur->bc_mp, level, nr_this_level); +} + +/* + * Use the collected rmap information to stage a new rmap btree. If this is + * successful we'll return with the new btree root information logged to the + * repair transaction but not yet committed. This implements section (III) + * above. + */ +STATIC int +xrep_rtrmap_build_new_tree( + struct xrep_rtrmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_btree_cur *rmap_cur; + uint64_t nr_records; + int error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the realtime rmapbt inode. 
+ */ + error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc); + if (error) + return error; + + rr->new_btree.bload.get_records = xrep_rtrmap_get_records; + rr->new_btree.bload.claim_block = xrep_rtrmap_claim_block; + rr->new_btree.bload.iroot_size = xrep_rtrmap_iroot_size; + + rmap_cur = xfs_rtrmapbt_init_cursor(NULL, rtg); + xfs_btree_stage_ifakeroot(rmap_cur, &rr->new_btree.ifake); + + nr_records = xfarray_length(rr->rtrmap_records); + + /* Compute how many blocks we'll need for the rmaps collected. */ + error = xfs_btree_bload_compute_geometry(rmap_cur, + &rr->new_btree.bload, nr_records); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* + * Guess how many blocks we're going to need to rebuild an entire + * rtrmapbt from the number of extents we found, and pump up our + * transaction to have sufficient block reservation. We're allowed + * to exceed quota to repair inconsistent metadata, though this is + * unlikely. + */ + error = xfs_trans_reserve_more_inode(sc->tp, rtg_rmap(rtg), + rr->new_btree.bload.nr_blocks, 0, true); + if (error) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* Add all observed rmap records. */ + rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE; + rr->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr); + if (error) + goto err_cur; + + /* + * Install the new rtrmap btree in the inode. After this point the old + * btree is no longer accessible, the new tree is live, and we can + * delete the cursor. + */ + xfs_rtrmapbt_commit_staged_btree(rmap_cur, sc->tp); + xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks); + xfs_btree_del_cursor(rmap_cur, 0); + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_trans(sc); + +err_cur: + xfs_btree_del_cursor(rmap_cur, error); + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* Reaping the old btree. */ + +/* Reap the old rtrmapbt blocks. */ +STATIC int +xrep_rtrmap_remove_old_tree( + struct xrep_rtrmap *rr) +{ + int error; + + /* + * Free all the extents that were allocated to the former rtrmapbt and + * aren't cross-linked with something else. + */ + error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); + if (error) + return error; + + /* + * Ensure the proper reservation for the rtrmap inode so that we don't + * fail to expand the new btree. + */ + return xrep_reset_metafile_resv(rr->sc); +} + +/* Set up the filesystem scan components. */ +STATIC int +xrep_rtrmap_setup_scan( + struct xrep_rtrmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + char *descr; + int error; + + xfsb_bitmap_init(&rr->old_rtrmapbt_blocks); + + /* Set up some storage */ + descr = xchk_xfile_rtgroup_descr(sc, "reverse mapping records"); + error = xfarray_create(descr, 0, sizeof(struct xrep_rtrmap_extent), + &rr->rtrmap_records); + kfree(descr); + if (error) + goto out_bitmap; + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &rr->iscan); + return 0; + +out_bitmap: + xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks); + return error; +} + +/* Tear down scan components. 
*/ +STATIC void +xrep_rtrmap_teardown( + struct xrep_rtrmap *rr) +{ + xchk_iscan_teardown(&rr->iscan); + xfarray_destroy(rr->rtrmap_records); + xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks); +} + +/* Repair the realtime rmap btree. */ +int +xrep_rtrmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rtrmap *rr = sc->buf; + int error; + + /* Functionality is not yet complete. */ + return xrep_notsupported(sc); + + /* Make sure any problems with the fork are fixed. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + error = xrep_rtrmap_setup_scan(rr); + if (error) + return error; + + /* Collect rmaps for realtime files. */ + error = xrep_rtrmap_find_rmaps(rr); + if (error) + goto out_records; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the rtrmap information. */ + error = xrep_rtrmap_build_new_tree(rr); + if (error) + goto out_records; + + /* Kill the old tree. */ + error = xrep_rtrmap_remove_old_tree(rr); + if (error) + goto out_records; + +out_records: + xrep_rtrmap_teardown(rr); + return error; +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 09983899c341..16da054b2eb0 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -465,7 +465,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .setup = xchk_setup_rtrmapbt, .scrub = xchk_rtrmapbt, .has = xfs_has_rtrmapbt, - .repair = xrep_notsupported, + .repair = xrep_rtrmapbt, }, }; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3b661e444345..3f2a8695ef5c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2285,6 +2285,32 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize, __entry->rmapbt_sz, __entry->refcbt_sz) ) + +#ifdef CONFIG_XFS_RT +TRACE_EVENT(xrep_calc_rtgroup_resblks_btsize, + TP_PROTO(struct xfs_mount *mp, xfs_rgnumber_t rgno, + xfs_rgblock_t usedlen, xfs_rgblock_t rmapbt_sz), + TP_ARGS(mp, rgno, usedlen, rmapbt_sz), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, usedlen) + __field(xfs_rgblock_t, rmapbt_sz) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rgno = rgno; + __entry->usedlen = usedlen; + __entry->rmapbt_sz = rmapbt_sz; + ), + TP_printk("dev %d:%d rgno 0x%x usedlen %u rmapbt %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->usedlen, + __entry->rmapbt_sz) +); +#endif /* CONFIG_XFS_RT */ + TRACE_EVENT(xrep_reset_counters, TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc), TP_ARGS(mp, fsc), @@ -3755,6 +3781,37 @@ TRACE_EVENT(xrep_rtbitmap_load_word, (__entry->ondisk_word & __entry->word_mask), __entry->word_mask) ); + +TRACE_EVENT(xrep_rtrmap_found, + TP_PROTO(struct xfs_mount *mp, const struct xfs_rmap_irec *rec), + TP_ARGS(mp, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->rgbno = rec->rm_startblock; + __entry->len = rec->rm_blockcount; + __entry->owner = rec->rm_owner; + __entry->offset = rec->rm_offset; + __entry->flags = rec->rm_flags; + ), + TP_printk("dev %d:%d rtdev %d:%d rgbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->rgbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); #endif /* CONFIG_XFS_RT 
*/ #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ -- cgit v1.2.3 From 4a61f12eb11958f157e054d386466627445644cd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:43 -0800 Subject: xfs: create a shadow rmap btree during realtime rmap repair Create an in-memory btree of rmap records instead of an array. This enables us to do live record collection instead of freezing the fs. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree_mem.c | 1 + fs/xfs/libxfs/xfs_rmap.c | 3 +- fs/xfs/libxfs/xfs_rtrmap_btree.c | 117 +++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_rtrmap_btree.h | 6 ++ fs/xfs/libxfs/xfs_shared.h | 7 ++ fs/xfs/scrub/rtrmap_repair.c | 137 ++++++++++++++++++++++++++------------- fs/xfs/xfs_stats.c | 3 +- fs/xfs/xfs_stats.h | 1 + 8 files changed, 228 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c index df3d613675a1..f2f7b4305413 100644 --- a/fs/xfs/libxfs/xfs_btree_mem.c +++ b/fs/xfs/libxfs/xfs_btree_mem.c @@ -18,6 +18,7 @@ #include "xfs_ag.h" #include "xfs_buf_item.h" #include "xfs_trace.h" +#include "xfs_rtgroup.h" /* Set the root of an in-memory btree. */ void diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 2f0688a57991..f8415fd96cc2 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -327,7 +327,8 @@ xfs_rmap_check_btrec( struct xfs_btree_cur *cur, const struct xfs_rmap_irec *irec) { - if (xfs_btree_is_rtrmap(cur->bc_ops)) + if (xfs_btree_is_rtrmap(cur->bc_ops) || + xfs_btree_is_mem_rtrmap(cur->bc_ops)) return xfs_rtrmap_check_irec(to_rtg(cur->bc_group), irec); return xfs_rmap_check_irec(to_perag(cur->bc_group), irec); } diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index 6bf1b253c247..b90901e39e92 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -28,6 +28,8 @@ #include "xfs_rtgroup.h" #include "xfs_bmap.h" #include "xfs_health.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" static struct kmem_cache *xfs_rtrmapbt_cur_cache; @@ -542,6 +544,121 @@ xfs_rtrmapbt_init_cursor( return cur; } +#ifdef CONFIG_XFS_BTREE_IN_MEM +/* + * Validate an in-memory realtime rmap btree block. Callers are allowed to + * generate an in-memory btree even if the ondisk feature is not enabled. 
+ */ +static xfs_failaddr_t +xfs_rtrmapbt_mem_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + unsigned int level; + unsigned int maxrecs; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + + level = be16_to_cpu(block->bb_level); + if (xfs_has_rmapbt(mp)) { + if (level >= mp->m_rtrmap_maxlevels) + return __this_address; + } else { + if (level >= xfs_rtrmapbt_maxlevels_ondisk()) + return __this_address; + } + + maxrecs = xfs_rtrmapbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0); + return xfs_btree_memblock_verify(bp, maxrecs); +} + +static void +xfs_rtrmapbt_mem_rw_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa = xfs_rtrmapbt_mem_verify(bp); + + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +/* skip crc checks on in-memory btrees to save time */ +static const struct xfs_buf_ops xfs_rtrmapbt_mem_buf_ops = { + .name = "xfs_rtrmapbt_mem", + .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) }, + .verify_read = xfs_rtrmapbt_mem_rw_verify, + .verify_write = xfs_rtrmapbt_mem_rw_verify, + .verify_struct = xfs_rtrmapbt_mem_verify, +}; + +const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = { + .type = XFS_BTREE_TYPE_MEM, + .geom_flags = XFS_BTGEO_OVERLAPPING, + + .rec_len = sizeof(struct xfs_rmap_rec), + /* Overlapping btree; 2 keys per pointer. */ + .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_mem_2), + + .dup_cursor = xfbtree_dup_cursor, + .set_root = xfbtree_set_root, + .alloc_block = xfbtree_alloc_block, + .free_block = xfbtree_free_block, + .get_minrecs = xfbtree_get_minrecs, + .get_maxrecs = xfbtree_get_maxrecs, + .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfbtree_init_ptr_from_cur, + .key_diff = xfs_rtrmapbt_key_diff, + .buf_ops = &xfs_rtrmapbt_mem_buf_ops, + .diff_two_keys = xfs_rtrmapbt_diff_two_keys, + .keys_inorder = xfs_rtrmapbt_keys_inorder, + .recs_inorder = xfs_rtrmapbt_recs_inorder, + .keys_contiguous = xfs_rtrmapbt_keys_contiguous, +}; + +/* Create a cursor for an in-memory btree. */ +struct xfs_btree_cur * +xfs_rtrmapbt_mem_cursor( + struct xfs_rtgroup *rtg, + struct xfs_trans *tp, + struct xfbtree *xfbt) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_btree_cur *cur; + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_mem_ops, + mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache); + cur->bc_mem.xfbtree = xfbt; + cur->bc_nlevels = xfbt->nlevels; + cur->bc_group = xfs_group_hold(rtg_group(rtg)); + return cur; +} + +/* Create an in-memory realtime rmap btree. */ +int +xfs_rtrmapbt_mem_init( + struct xfs_mount *mp, + struct xfbtree *xfbt, + struct xfs_buftarg *btp, + xfs_rgnumber_t rgno) +{ + xfbt->owner = rgno; + return xfbtree_init(mp, xfbt, btp, &xfs_rtrmapbt_mem_ops); +} +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + /* * Install a new rt reverse mapping btree root. Caller is responsible for * invalidating and freeing the old btree blocks. 
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h index ad76ac7938b6..9d0915089891 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.h +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h @@ -11,6 +11,7 @@ struct xfs_btree_cur; struct xfs_mount; struct xbtree_ifakeroot; struct xfs_rtgroup; +struct xfbtree; /* rmaps only exist on crc enabled filesystems */ #define XFS_RTRMAP_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN @@ -201,4 +202,9 @@ int xfs_rtrmapbt_init_rtsb(struct xfs_mount *mp, struct xfs_rtgroup *rtg, unsigned long long xfs_rtrmapbt_calc_size(struct xfs_mount *mp, unsigned long long len); +struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg, + struct xfs_trans *tp, struct xfbtree *xfbtree); +int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, + struct xfs_buftarg *btp, xfs_rgnumber_t rgno); + #endif /* __XFS_RTRMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index da23dac22c3f..960716c387cc 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -57,6 +57,7 @@ extern const struct xfs_btree_ops xfs_refcountbt_ops; extern const struct xfs_btree_ops xfs_rmapbt_ops; extern const struct xfs_btree_ops xfs_rmapbt_mem_ops; extern const struct xfs_btree_ops xfs_rtrmapbt_ops; +extern const struct xfs_btree_ops xfs_rtrmapbt_mem_ops; static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops) { @@ -98,8 +99,14 @@ static inline bool xfs_btree_is_mem_rmap(const struct xfs_btree_ops *ops) { return ops == &xfs_rmapbt_mem_ops; } + +static inline bool xfs_btree_is_mem_rtrmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rtrmapbt_mem_ops; +} #else # define xfs_btree_is_mem_rmap(...) (false) +# define xfs_btree_is_mem_rtrmap(...) (false) #endif static inline bool xfs_btree_is_rtrmap(const struct xfs_btree_ops *ops) diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c index 60e317725dea..b376bcc8d1d2 100644 --- a/fs/xfs/scrub/rtrmap_repair.c +++ b/fs/xfs/scrub/rtrmap_repair.c @@ -12,6 +12,8 @@ #include "xfs_defer.h" #include "xfs_btree.h" #include "xfs_btree_staging.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" #include "xfs_bit.h" #include "xfs_log_format.h" #include "xfs_trans.h" @@ -64,24 +66,13 @@ * We use the 'xrep_rtrmap' prefix for all the rmap functions. */ -/* - * Packed rmap record. The UNWRITTEN flags are hidden in the upper bits of - * offset, just like the on-disk record. - */ -struct xrep_rtrmap_extent { - xfs_rgblock_t startblock; - xfs_extlen_t blockcount; - uint64_t owner; - uint64_t offset; -} __packed; - /* Context for collecting rmaps */ struct xrep_rtrmap { /* new rtrmapbt information */ struct xrep_newbt new_btree; /* rmap records generated from primary metadata */ - struct xfarray *rtrmap_records; + struct xfbtree rtrmap_btree; struct xfs_scrub *sc; @@ -91,8 +82,11 @@ struct xrep_rtrmap { /* inode scan cursor */ struct xchk_iscan iscan; - /* get_records()'s position in the free space record array. */ - xfarray_idx_t array_cur; + /* in-memory btree cursor for the ->get_blocks walk */ + struct xfs_btree_cur *mcur; + + /* Number of records we're staging in the new btree. */ + uint64_t nr_records; }; /* Set us up to repair rt reverse mapping btrees. 
*/ @@ -101,6 +95,14 @@ xrep_setup_rtrmapbt( struct xfs_scrub *sc) { struct xrep_rtrmap *rr; + char *descr; + int error; + + descr = xchk_xfile_rtgroup_descr(sc, "reverse mapping records"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + if (error) + return error; rr = kzalloc(sizeof(struct xrep_rtrmap), XCHK_GFP_FLAGS); if (!rr) @@ -135,11 +137,6 @@ xrep_rtrmap_stash( uint64_t offset, unsigned int flags) { - struct xrep_rtrmap_extent rre = { - .startblock = startblock, - .blockcount = blockcount, - .owner = owner, - }; struct xfs_rmap_irec rmap = { .rm_startblock = startblock, .rm_blockcount = blockcount, @@ -148,6 +145,7 @@ xrep_rtrmap_stash( .rm_flags = flags, }; struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *mcur; int error = 0; if (xchk_should_terminate(sc, &error)) @@ -155,8 +153,18 @@ xrep_rtrmap_stash( trace_xrep_rtrmap_found(sc->mp, &rmap); - rre.offset = xfs_rmap_irec_offset_pack(&rmap); - return xfarray_append(rr->rtrmap_records, &rre); + /* Add entry to in-memory btree. */ + mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, sc->tp, &rr->rtrmap_btree); + error = xfs_rmap_map_raw(mcur, &rmap); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_cancel; + + return xfbtree_trans_commit(&rr->rtrmap_btree, sc->tp); + +out_cancel: + xfbtree_trans_cancel(&rr->rtrmap_btree, sc->tp); + return error; } /* Finding all file and bmbt extents. */ @@ -395,6 +403,24 @@ xrep_rtrmap_scan_ag( return error; } +/* Count and check all collected records. */ +STATIC int +xrep_rtrmap_check_record( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rtrmap *rr = priv; + int error; + + error = xrep_rtrmap_check_mapping(rr->sc, rec); + if (error) + return error; + + rr->nr_records++; + return 0; +} + /* Generate all the reverse-mappings for the realtime device. */ STATIC int xrep_rtrmap_find_rmaps( @@ -403,6 +429,7 @@ xrep_rtrmap_find_rmaps( struct xfs_scrub *sc = rr->sc; struct xfs_perag *pag = NULL; struct xfs_inode *ip; + struct xfs_btree_cur *mcur; int error; /* Generate rmaps for the realtime superblock */ @@ -468,7 +495,19 @@ xrep_rtrmap_find_rmaps( } } - return 0; + /* + * Now that we have everything locked again, we need to count the + * number of rmap records stashed in the btree. This should reflect + * all actively-owned rt files in the filesystem. At the same time, + * check all our records before we start building a new btree, which + * requires the rtbitmap lock. + */ + mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, NULL, &rr->rtrmap_btree); + rr->nr_records = 0; + error = xfs_rmap_query_all(mcur, xrep_rtrmap_check_record, rr); + xfs_btree_del_cursor(mcur, error); + + return error; } /* Building the new rtrmap btree. 
*/ @@ -482,29 +521,25 @@ xrep_rtrmap_get_records( unsigned int nr_wanted, void *priv) { - struct xrep_rtrmap_extent rec; - struct xfs_rmap_irec *irec = &cur->bc_rec.r; struct xrep_rtrmap *rr = priv; union xfs_btree_rec *block_rec; unsigned int loaded; int error; for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { - error = xfarray_load_next(rr->rtrmap_records, &rr->array_cur, - &rec); + int stat = 0; + + error = xfs_btree_increment(rr->mcur, 0, &stat); if (error) return error; - - irec->rm_startblock = rec.startblock; - irec->rm_blockcount = rec.blockcount; - irec->rm_owner = rec.owner; - - if (xfs_rmap_irec_offset_unpack(rec.offset, irec) != NULL) + if (!stat) return -EFSCORRUPTED; - error = xrep_rtrmap_check_mapping(rr->sc, irec); + error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat); if (error) return error; + if (!stat) + return -EFSCORRUPTED; block_rec = xfs_btree_rec_addr(cur, idx, block); cur->bc_ops->init_rec_from_cur(cur, block_rec); @@ -549,7 +584,6 @@ xrep_rtrmap_build_new_tree( struct xfs_scrub *sc = rr->sc; struct xfs_rtgroup *rtg = sc->sr.rtg; struct xfs_btree_cur *rmap_cur; - uint64_t nr_records; int error; /* @@ -569,11 +603,9 @@ xrep_rtrmap_build_new_tree( rmap_cur = xfs_rtrmapbt_init_cursor(NULL, rtg); xfs_btree_stage_ifakeroot(rmap_cur, &rr->new_btree.ifake); - nr_records = xfarray_length(rr->rtrmap_records); - /* Compute how many blocks we'll need for the rmaps collected. */ error = xfs_btree_bload_compute_geometry(rmap_cur, - &rr->new_btree.bload, nr_records); + &rr->new_btree.bload, rr->nr_records); if (error) goto err_cur; @@ -599,12 +631,20 @@ xrep_rtrmap_build_new_tree( if (error) goto err_cur; + /* + * Create a cursor to the in-memory btree so that we can bulk load the + * new btree. + */ + rr->mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, NULL, &rr->rtrmap_btree); + error = xfs_btree_goto_left_edge(rr->mcur); + if (error) + goto err_mcur; + /* Add all observed rmap records. */ rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE; - rr->array_cur = XFARRAY_CURSOR_INIT; error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr); if (error) - goto err_cur; + goto err_mcur; /* * Install the new rtrmap btree in the inode. After this point the old @@ -614,6 +654,14 @@ xrep_rtrmap_build_new_tree( xfs_rtrmapbt_commit_staged_btree(rmap_cur, sc->tp); xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks); xfs_btree_del_cursor(rmap_cur, 0); + xfs_btree_del_cursor(rr->mcur, 0); + rr->mcur = NULL; + + /* + * Now that we've written the new btree to disk, we don't need to keep + * updating the in-memory btree. Abort the scan to stop live updates. + */ + xchk_iscan_abort(&rr->iscan); /* Dispose of any unused blocks and the accounting information. 
*/ error = xrep_newbt_commit(&rr->new_btree); @@ -622,6 +670,8 @@ xrep_rtrmap_build_new_tree( return xrep_roll_trans(sc); +err_mcur: + xfs_btree_del_cursor(rr->mcur, error); err_cur: xfs_btree_del_cursor(rmap_cur, error); xrep_newbt_cancel(&rr->new_btree); @@ -658,16 +708,13 @@ xrep_rtrmap_setup_scan( struct xrep_rtrmap *rr) { struct xfs_scrub *sc = rr->sc; - char *descr; int error; xfsb_bitmap_init(&rr->old_rtrmapbt_blocks); /* Set up some storage */ - descr = xchk_xfile_rtgroup_descr(sc, "reverse mapping records"); - error = xfarray_create(descr, 0, sizeof(struct xrep_rtrmap_extent), - &rr->rtrmap_records); - kfree(descr); + error = xfs_rtrmapbt_mem_init(sc->mp, &rr->rtrmap_btree, sc->xmbtp, + rtg_rgno(sc->sr.rtg)); if (error) goto out_bitmap; @@ -686,7 +733,7 @@ xrep_rtrmap_teardown( struct xrep_rtrmap *rr) { xchk_iscan_teardown(&rr->iscan); - xfarray_destroy(rr->rtrmap_records); + xfbtree_destroy(&rr->rtrmap_btree); xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks); } diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index f94fb70b524f..b7f2988bc03b 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -53,7 +53,8 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "refcntbt", xfsstats_offset(xs_rmap_mem_2) }, { "rmapbt_mem", xfsstats_offset(xs_rcbag_2) }, { "rcbagbt", xfsstats_offset(xs_rtrmap_2) }, - { "rtrmapbt", xfsstats_offset(xs_qm_dqreclaims)}, + { "rtrmapbt", xfsstats_offset(xs_rtrmap_mem_2)}, + { "rtrmapbt_mem", xfsstats_offset(xs_qm_dqreclaims)}, /* we print both series of quota information together */ { "qm", xfsstats_offset(xs_xstrat_bytes)}, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 05dc69c6d949..9c47de5dff2d 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -128,6 +128,7 @@ struct __xfsstats { uint32_t xs_rmap_mem_2[__XBTS_MAX]; uint32_t xs_rcbag_2[__XBTS_MAX]; uint32_t xs_rtrmap_2[__XBTS_MAX]; + uint32_t xs_rtrmap_mem_2[__XBTS_MAX]; uint32_t xs_qm_dqreclaims; uint32_t xs_qm_dqreclaim_misses; uint32_t xs_qm_dquot_dups; -- cgit v1.2.3 From 9515572be65e9f8d7b852f067c7a163bfe9520d4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:44 -0800 Subject: xfs: hook live realtime rmap operations during a repair operation Hook the regular realtime rmap code when an rtrmapbt repair operation is running so that we can unlock the AGF buffer to scan the filesystem and keep the in-memory btree up to date during the scan. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/rmap_repair.c | 2 +- fs/xfs/scrub/rtrmap_repair.c | 131 +++++++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/trace.h | 17 ++++-- 3 files changed, 140 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index 91c17feb4976..c2c7b76cc25a 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -1614,7 +1614,7 @@ xrep_rmapbt_live_update( if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo)) goto out_unlock; - trace_xrep_rmap_live_update(rr->sc->sa.pag, action, p); + trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p); error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); if (error) diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c index b376bcc8d1d2..49de8bc2dd17 100644 --- a/fs/xfs/scrub/rtrmap_repair.c +++ b/fs/xfs/scrub/rtrmap_repair.c @@ -71,6 +71,9 @@ struct xrep_rtrmap { /* new rtrmapbt information */ struct xrep_newbt new_btree; + /* lock for the xfbtree and xfile */ + struct mutex lock; + /* rmap records generated from primary metadata */ struct xfbtree rtrmap_btree; @@ -79,6 +82,9 @@ struct xrep_rtrmap { /* bitmap of old rtrmapbt blocks */ struct xfsb_bitmap old_rtrmapbt_blocks; + /* Hooks into rtrmap update code. */ + struct xfs_rmap_hook rhook; + /* inode scan cursor */ struct xchk_iscan iscan; @@ -98,6 +104,8 @@ xrep_setup_rtrmapbt( char *descr; int error; + xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP); + descr = xchk_xfile_rtgroup_descr(sc, "reverse mapping records"); error = xrep_setup_xfbtree(sc, descr); kfree(descr); @@ -151,19 +159,31 @@ xrep_rtrmap_stash( if (xchk_should_terminate(sc, &error)) return error; + if (xchk_iscan_aborted(&rr->iscan)) + return -EFSCORRUPTED; + trace_xrep_rtrmap_found(sc->mp, &rmap); /* Add entry to in-memory btree. */ + mutex_lock(&rr->lock); mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, sc->tp, &rr->rtrmap_btree); error = xfs_rmap_map_raw(mcur, &rmap); xfs_btree_del_cursor(mcur, error); if (error) goto out_cancel; - return xfbtree_trans_commit(&rr->rtrmap_btree, sc->tp); + error = xfbtree_trans_commit(&rr->rtrmap_btree, sc->tp); + if (error) + goto out_abort; + + mutex_unlock(&rr->lock); + return 0; out_cancel: xfbtree_trans_cancel(&rr->rtrmap_btree, sc->tp); +out_abort: + xchk_iscan_abort(&rr->iscan); + mutex_unlock(&rr->lock); return error; } @@ -486,6 +506,13 @@ xrep_rtrmap_find_rmaps( if (error) return error; + /* + * If a hook failed to update the in-memory btree, we lack the data to + * continue the repair. + */ + if (xchk_iscan_aborted(&rr->iscan)) + return -EFSCORRUPTED; + /* Scan for old rtrmap blocks. */ while ((pag = xfs_perag_next(sc->mp, pag))) { error = xrep_rtrmap_scan_ag(rr, pag); @@ -702,6 +729,83 @@ xrep_rtrmap_remove_old_tree( return xrep_reset_metafile_resv(rr->sc); } +static inline bool +xrep_rtrmapbt_want_live_update( + struct xchk_iscan *iscan, + const struct xfs_owner_info *oi) +{ + if (xchk_iscan_aborted(iscan)) + return false; + + /* + * We scanned the CoW staging extents before we started the iscan, so + * we need all the updates. + */ + if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner)) + return true; + + /* Ignore updates to files that the scanner hasn't visited yet. */ + return xchk_iscan_want_live_update(iscan, oi->oi_owner); +} + +/* + * Apply a rtrmapbt update from the regular filesystem into our shadow btree. 
+ * We're running from the thread that owns the rtrmap ILOCK and is generating + * the update, so we must be careful about which parts of the struct + * xrep_rtrmap that we change. + */ +static int +xrep_rtrmapbt_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_rmap_update_params *p = data; + struct xrep_rtrmap *rr; + struct xfs_mount *mp; + struct xfs_btree_cur *mcur; + struct xfs_trans *tp; + void *txcookie; + int error; + + rr = container_of(nb, struct xrep_rtrmap, rhook.rmap_hook.nb); + mp = rr->sc->mp; + + if (!xrep_rtrmapbt_want_live_update(&rr->iscan, &p->oinfo)) + goto out_unlock; + + trace_xrep_rmap_live_update(rtg_group(rr->sc->sr.rtg), action, p); + + error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); + if (error) + goto out_abort; + + mutex_lock(&rr->lock); + mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, tp, &rr->rtrmap_btree); + error = __xfs_rmap_finish_intent(mcur, action, p->startblock, + p->blockcount, &p->oinfo, p->unwritten); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_cancel; + + error = xfbtree_trans_commit(&rr->rtrmap_btree, tp); + if (error) + goto out_cancel; + + xrep_trans_cancel_hook_dummy(&txcookie, tp); + mutex_unlock(&rr->lock); + return NOTIFY_DONE; + +out_cancel: + xfbtree_trans_cancel(&rr->rtrmap_btree, tp); + xrep_trans_cancel_hook_dummy(&txcookie, tp); +out_abort: + xchk_iscan_abort(&rr->iscan); + mutex_unlock(&rr->lock); +out_unlock: + return NOTIFY_DONE; +} + /* Set up the filesystem scan components. */ STATIC int xrep_rtrmap_setup_scan( @@ -710,6 +814,7 @@ xrep_rtrmap_setup_scan( struct xfs_scrub *sc = rr->sc; int error; + mutex_init(&rr->lock); xfsb_bitmap_init(&rr->old_rtrmapbt_blocks); /* Set up some storage */ @@ -720,10 +825,26 @@ xrep_rtrmap_setup_scan( /* Retry iget every tenth of a second for up to 30 seconds. */ xchk_iscan_start(sc, 30000, 100, &rr->iscan); + + /* + * Hook into live rtrmap operations so that we can update our in-memory + * btree to reflect live changes on the filesystem. Since we drop the + * rtrmap ILOCK to scan all the inodes, we need this piece to avoid + * installing a stale btree. + */ + ASSERT(sc->flags & XCHK_FSGATES_RMAP); + xfs_rmap_hook_setup(&rr->rhook, xrep_rtrmapbt_live_update); + error = xfs_rmap_hook_add(rtg_group(sc->sr.rtg), &rr->rhook); + if (error) + goto out_iscan; return 0; +out_iscan: + xchk_iscan_teardown(&rr->iscan); + xfbtree_destroy(&rr->rtrmap_btree); out_bitmap: xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks); + mutex_destroy(&rr->lock); return error; } @@ -732,9 +853,14 @@ STATIC void xrep_rtrmap_teardown( struct xrep_rtrmap *rr) { + struct xfs_scrub *sc = rr->sc; + + xchk_iscan_abort(&rr->iscan); + xfs_rmap_hook_del(rtg_group(sc->sr.rtg), &rr->rhook); xchk_iscan_teardown(&rr->iscan); xfbtree_destroy(&rr->rtrmap_btree); xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks); + mutex_destroy(&rr->lock); } /* Repair the realtime rmap btree. */ @@ -745,9 +871,6 @@ xrep_rtrmapbt( struct xrep_rtrmap *rr = sc->buf; int error; - /* Functionality is not yet complete. */ - return xrep_notsupported(sc); - /* Make sure any problems with the fork are fixed. 
*/ error = xrep_metadata_inode_forks(sc); if (error) diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3f2a8695ef5c..fb86b746bc17 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -41,6 +41,9 @@ struct xchk_dirtree_outcomes; TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); +TRACE_DEFINE_ENUM(XG_TYPE_AG); +TRACE_DEFINE_ENUM(XG_TYPE_RTG); + TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF); @@ -2709,11 +2712,12 @@ DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode); DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode); TRACE_EVENT(xrep_rmap_live_update, - TP_PROTO(const struct xfs_perag *pag, unsigned int op, + TP_PROTO(const struct xfs_group *xg, unsigned int op, const struct xfs_rmap_update_params *p), - TP_ARGS(pag, op, p), + TP_ARGS(xg, op, p), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(unsigned int, op) __field(xfs_agblock_t, agbno) @@ -2723,8 +2727,9 @@ TRACE_EVENT(xrep_rmap_live_update, __field(unsigned int, flags) ), TP_fast_assign( - __entry->dev = pag_mount(pag)->m_super->s_dev; - __entry->agno = pag_agno(pag); + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->op = op; __entry->agbno = p->startblock; __entry->len = p->blockcount; @@ -2733,10 +2738,12 @@ TRACE_EVENT(xrep_rmap_live_update, if (p->unwritten) __entry->flags |= XFS_RMAP_UNWRITTEN; ), - TP_printk("dev %d:%d agno 0x%x op %d agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + TP_printk("dev %d:%d %sno 0x%x op %d %sbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->op, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len, __entry->owner, -- cgit v1.2.3 From f4ed93037966aea07ae6b10ab208976783d24e2e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 17 Dec 2024 13:43:06 -0800 Subject: xfs: don't shut down the filesystem for media failures beyond end of log If the filesystem has an external log device on pmem and the pmem reports a media error beyond the end of the log area, don't shut down the filesystem because we don't use that space. Cc: # v6.0 Fixes: 6f643c57d57c56 ("xfs: implement ->notify_failure() for XFS") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_notify_failure.c | 121 ++++++++++++++++++++++++++++++-------------- 1 file changed, 82 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index fa50e5308292..0b0b0f31aca2 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -153,6 +153,79 @@ xfs_dax_notify_failure_thaw( thaw_super(sb, FREEZE_HOLDER_USERSPACE); } +static int +xfs_dax_translate_range( + struct xfs_buftarg *btp, + u64 offset, + u64 len, + xfs_daddr_t *daddr, + uint64_t *bblen) +{ + u64 dev_start = btp->bt_dax_part_off; + u64 dev_len = bdev_nr_bytes(btp->bt_bdev); + u64 dev_end = dev_start + dev_len - 1; + + /* Notify failure on the whole device. 
*/ + if (offset == 0 && len == U64_MAX) { + offset = dev_start; + len = dev_len; + } + + /* Ignore the range out of filesystem area */ + if (offset + len - 1 < dev_start) + return -ENXIO; + if (offset > dev_end) + return -ENXIO; + + /* Calculate the real range when it touches the boundary */ + if (offset > dev_start) + offset -= dev_start; + else { + len -= dev_start - offset; + offset = 0; + } + if (offset + len - 1 > dev_end) + len = dev_end - offset + 1; + + *daddr = BTOBB(offset); + *bblen = BTOBB(len); + return 0; +} + +static int +xfs_dax_notify_logdev_failure( + struct xfs_mount *mp, + u64 offset, + u64 len, + int mf_flags) +{ + xfs_daddr_t daddr; + uint64_t bblen; + int error; + + /* + * Return ENXIO instead of shutting down the filesystem if the failed + * region is beyond the end of the log. + */ + error = xfs_dax_translate_range(mp->m_logdev_targp, + offset, len, &daddr, &bblen); + if (error) + return error; + + /* + * In the pre-remove case the failure notification is attempting to + * trigger a force unmount. The expectation is that the device is + * still present, but its removal is in progress and can not be + * cancelled, proceed with accessing the log device. + */ + if (mf_flags & MF_MEM_PRE_REMOVE) + return 0; + + xfs_err(mp, "ondisk log corrupt, shutting down fs!"); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; +} + static int xfs_dax_notify_ddev_failure( struct xfs_mount *mp, @@ -263,8 +336,9 @@ xfs_dax_notify_failure( int mf_flags) { struct xfs_mount *mp = dax_holder(dax_dev); - u64 ddev_start; - u64 ddev_end; + xfs_daddr_t daddr; + uint64_t bblen; + int error; if (!(mp->m_super->s_flags & SB_BORN)) { xfs_warn(mp, "filesystem is not ready for notify_failure()!"); @@ -279,17 +353,7 @@ xfs_dax_notify_failure( if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && mp->m_logdev_targp != mp->m_ddev_targp) { - /* - * In the pre-remove case the failure notification is attempting - * to trigger a force unmount. The expectation is that the - * device is still present, but its removal is in progress and - * can not be cancelled, proceed with accessing the log device. - */ - if (mf_flags & MF_MEM_PRE_REMOVE) - return 0; - xfs_err(mp, "ondisk log corrupt, shutting down fs!"); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); - return -EFSCORRUPTED; + return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags); } if (!xfs_has_rmapbt(mp)) { @@ -297,33 +361,12 @@ xfs_dax_notify_failure( return -EOPNOTSUPP; } - ddev_start = mp->m_ddev_targp->bt_dax_part_off; - ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; - - /* Notify failure on the whole device. 
*/ - if (offset == 0 && len == U64_MAX) { - offset = ddev_start; - len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev); - } - - /* Ignore the range out of filesystem area */ - if (offset + len - 1 < ddev_start) - return -ENXIO; - if (offset > ddev_end) - return -ENXIO; - - /* Calculate the real range when it touches the boundary */ - if (offset > ddev_start) - offset -= ddev_start; - else { - len -= ddev_start - offset; - offset = 0; - } - if (offset + len - 1 > ddev_end) - len = ddev_end - offset + 1; + error = xfs_dax_translate_range(mp->m_ddev_targp, offset, len, &daddr, + &bblen); + if (error) + return error; - return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len), - mf_flags); + return xfs_dax_notify_ddev_failure(mp, daddr, bblen, mf_flags); } const struct dax_holder_operations xfs_dax_holder_operations = { -- cgit v1.2.3 From 799e7e6566df79b3546a2dd7cf6bcf287f53d41c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 17 Dec 2024 13:22:44 -0800 Subject: xfs: react to fsdax failure notifications on the rt device Now that we have reverse mapping for the realtime device, use the information to kill processes that have mappings to bad pmem. This requires refactoring the existing routines to handle rtgroups or AGs; and splitting out the translation function to improve cohesion. Also make a proper header file for the dax holder ops. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_buf.c | 1 + fs/xfs/xfs_notify_failure.c | 127 +++++++++++++++++++++++++------------------- fs/xfs/xfs_notify_failure.h | 11 ++++ fs/xfs/xfs_super.h | 1 - 4 files changed, 84 insertions(+), 56 deletions(-) create mode 100644 fs/xfs/xfs_notify_failure.h (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index aa63b8efd782..6f313fbf7669 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -22,6 +22,7 @@ #include "xfs_error.h" #include "xfs_ag.h" #include "xfs_buf_mem.h" +#include "xfs_notify_failure.h" struct kmem_cache *xfs_buf_cache; diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 0b0b0f31aca2..ed8d8ed42f0a 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -19,6 +19,9 @@ #include "xfs_rtalloc.h" #include "xfs_trans.h" #include "xfs_ag.h" +#include "xfs_notify_failure.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" #include #include @@ -227,23 +230,42 @@ xfs_dax_notify_logdev_failure( } static int -xfs_dax_notify_ddev_failure( +xfs_dax_notify_dev_failure( struct xfs_mount *mp, - xfs_daddr_t daddr, - xfs_daddr_t bblen, - int mf_flags) + u64 offset, + u64 len, + int mf_flags, + enum xfs_group_type type) { struct xfs_failure_info notify = { .mf_flags = mf_flags }; struct xfs_trans *tp = NULL; struct xfs_btree_cur *cur = NULL; - struct xfs_buf *agf_bp = NULL; int error = 0; bool kernel_frozen = false; - xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); - xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, - daddr + bblen - 1); - xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); + uint32_t start_gno, end_gno; + xfs_fsblock_t start_bno, end_bno; + xfs_daddr_t daddr; + uint64_t bblen; + struct xfs_group *xg = NULL; + + if (!xfs_has_rmapbt(mp)) { + xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); + return -EOPNOTSUPP; + } + + error = xfs_dax_translate_range(type == XG_TYPE_RTG ? 
+ mp->m_rtdev_targp : mp->m_ddev_targp, + offset, len, &daddr, &bblen); + if (error) + return error; + + if (type == XG_TYPE_RTG) { + start_bno = xfs_daddr_to_rtb(mp, daddr); + end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1); + } else { + start_bno = XFS_DADDR_TO_FSB(mp, daddr); + end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1); + } if (mf_flags & MF_MEM_PRE_REMOVE) { xfs_info(mp, "Device is about to be removed!"); @@ -262,46 +284,58 @@ xfs_dax_notify_ddev_failure( if (error) goto out; - for (; agno <= end_agno; agno++) { + start_gno = xfs_fsb_to_gno(mp, start_bno, type); + end_gno = xfs_fsb_to_gno(mp, end_bno, type); + while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) { + struct xfs_buf *agf_bp = NULL; + struct xfs_rtgroup *rtg = NULL; struct xfs_rmap_irec ri_low = { }; struct xfs_rmap_irec ri_high; - struct xfs_agf *agf; - struct xfs_perag *pag; - xfs_agblock_t range_agend; - pag = xfs_perag_get(mp, agno); - error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); - if (error) { - xfs_perag_put(pag); - break; - } + if (type == XG_TYPE_AG) { + struct xfs_perag *pag = to_perag(xg); - cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); + error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); + if (error) { + xfs_perag_put(pag); + break; + } + + cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); + } else { + rtg = to_rtg(xg); + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + cur = xfs_rtrmapbt_init_cursor(tp, rtg); + } /* * Set the rmap range from ri_low to ri_high, which represents * a [start, end] where we looking for the files or metadata. */ memset(&ri_high, 0xFF, sizeof(ri_high)); - ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno); - if (agno == end_agno) - ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); + if (xg->xg_gno == start_gno) + ri_low.rm_startblock = + xfs_fsb_to_gbno(mp, start_bno, type); + if (xg->xg_gno == end_gno) + ri_high.rm_startblock = + xfs_fsb_to_gbno(mp, end_bno, type); - agf = agf_bp->b_addr; - range_agend = min(be32_to_cpu(agf->agf_length) - 1, - ri_high.rm_startblock); notify.startblock = ri_low.rm_startblock; - notify.blockcount = range_agend + 1 - ri_low.rm_startblock; + notify.blockcount = min(xg->xg_block_count, + ri_high.rm_startblock + 1) - + ri_low.rm_startblock; error = xfs_rmap_query_range(cur, &ri_low, &ri_high, xfs_dax_failure_fn, ¬ify); xfs_btree_del_cursor(cur, error); - xfs_trans_brelse(tp, agf_bp); - xfs_perag_put(pag); - if (error) + if (agf_bp) + xfs_trans_brelse(tp, agf_bp); + if (rtg) + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + if (error) { + xfs_group_put(xg); break; - - fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0); + } } xfs_trans_cancel(tp); @@ -336,37 +370,20 @@ xfs_dax_notify_failure( int mf_flags) { struct xfs_mount *mp = dax_holder(dax_dev); - xfs_daddr_t daddr; - uint64_t bblen; - int error; if (!(mp->m_super->s_flags & SB_BORN)) { xfs_warn(mp, "filesystem is not ready for notify_failure()!"); return -EIO; } - if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) { - xfs_debug(mp, - "notify_failure() not supported on realtime device!"); - return -EOPNOTSUPP; - } - - if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && - mp->m_logdev_targp != mp->m_ddev_targp) { + if (mp->m_logdev_targp != mp->m_ddev_targp && + mp->m_logdev_targp->bt_daxdev == dax_dev) { return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags); } - if (!xfs_has_rmapbt(mp)) { - xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); - return -EOPNOTSUPP; - } - - error = xfs_dax_translate_range(mp->m_ddev_targp, offset, len, 
&daddr, - &bblen); - if (error) - return error; - - return xfs_dax_notify_ddev_failure(mp, daddr, bblen, mf_flags); + return xfs_dax_notify_dev_failure(mp, offset, len, mf_flags, + (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) ? + XG_TYPE_RTG : XG_TYPE_AG); } const struct dax_holder_operations xfs_dax_holder_operations = { diff --git a/fs/xfs/xfs_notify_failure.h b/fs/xfs/xfs_notify_failure.h new file mode 100644 index 000000000000..8d08ec29dd29 --- /dev/null +++ b/fs/xfs/xfs_notify_failure.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_NOTIFY_FAILURE_H__ +#define __XFS_NOTIFY_FAILURE_H__ + +extern const struct dax_holder_operations xfs_dax_holder_operations; + +#endif /* __XFS_NOTIFY_FAILURE_H__ */ diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 302e6e5d6c7e..c0e85c1e42f2 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -92,7 +92,6 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, extern const struct export_operations xfs_export_operations; extern const struct quotactl_ops xfs_quotactl_operations; -extern const struct dax_holder_operations xfs_dax_holder_operations; extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); -- cgit v1.2.3 From c2358439af374cad47f771797875d0beb8256738 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:45 -0800 Subject: xfs: enable realtime rmap btree Permit mounting filesystems with realtime rmap btrees. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_rtalloc.c | 12 ++++++++---- fs/xfs/xfs_super.c | 6 ------ 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 3c1bce5a4855..a69967f9d88e 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1282,11 +1282,15 @@ xfs_growfs_rt( XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE) goto out_unlock; - /* Unsupported realtime features. */ + /* Check for features supported only on rtgroups filesystems. */ error = -EOPNOTSUPP; - if (xfs_has_quota(mp) && !xfs_has_rtgroups(mp)) - goto out_unlock; - if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp)) + if (!xfs_has_rtgroups(mp)) { + if (xfs_has_rmapbt(mp)) + goto out_unlock; + if (xfs_has_quota(mp)) + goto out_unlock; + } + if (xfs_has_reflink(mp)) goto out_unlock; error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 394fdf3bb535..ecd5a9f444d8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1767,12 +1767,6 @@ xfs_fs_fill_super( } } - if (xfs_has_rmapbt(mp) && mp->m_sb.sb_rblocks) { - xfs_alert(mp, - "reverse mapping btree not compatible with realtime device!"); - error = -EINVAL; - goto out_filestream_unmount; - } if (xfs_has_exchange_range(mp)) xfs_warn_experimental(mp, XFS_EXPERIMENTAL_EXCHRANGE); -- cgit v1.2.3 From 0d89af530c8c7591f75492e5814aa1ca4a0d26d7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:45 -0800 Subject: xfs: prepare refcount btree cursor tracepoints for realtime Rework the refcount btree cursor tracepoints in preparation to handle the realtime refcount btree cursor. Mostly this involves renaming the field to "refcbno" and extracting the group number from the cursor when possible. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_refcount_item.c | 4 +- fs/xfs/xfs_trace.h | 111 +++++++++++++++++++++++++++------------------ 2 files changed, 70 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index bede1c96c330..c807c4b90c44 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -328,9 +328,9 @@ xfs_refcount_defer_add( { struct xfs_mount *mp = tp->t_mountp; - trace_xfs_refcount_defer(mp, ri); - ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock, XG_TYPE_AG); + + trace_xfs_refcount_defer(mp, ri); xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 84cdc145e2d9..4fe689410eb6 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3305,56 +3305,62 @@ TRACE_EVENT(xfs_ag_resv_init_error, /* refcount tracepoint classes */ DECLARE_EVENT_CLASS(xfs_refcount_class, - TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, xfs_extlen_t len), - TP_ARGS(cur, agbno, len), + TP_ARGS(cur, gbno, len), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; - __entry->agbno = agbno; + __entry->gbno = gbno; __entry->len = len; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __entry->len) ); #define DEFINE_REFCOUNT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_class, name, \ - TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, \ + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, \ xfs_extlen_t len), \ - TP_ARGS(cur, agbno, len)) + TP_ARGS(cur, gbno, len)) TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi); TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi); TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi); TRACE_EVENT(xfs_refcount_lookup, - TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, xfs_lookup_t dir), - TP_ARGS(cur, agbno, dir), + TP_ARGS(cur, gbno, dir), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(xfs_lookup_t, dir) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; - __entry->agbno = agbno; + __entry->gbno = gbno; __entry->dir = dir; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x cmp %s(%d)", + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x cmp %s(%d)", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR), __entry->dir) ) @@ -3365,6 +3371,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, TP_ARGS(cur, irec), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) @@ -3373,14 +3380,16 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, ), TP_fast_assign( 
__entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, @@ -3396,49 +3405,53 @@ DEFINE_EVENT(xfs_refcount_extent_class, name, \ /* single-rcext and an agbno tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, - xfs_agblock_t agbno), - TP_ARGS(cur, irec, agbno), + xfs_agblock_t gbno), + TP_ARGS(cur, irec, gbno), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; - __entry->agbno = agbno; + __entry->gbno = gbno; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount, - __entry->agbno) + __entry->gbno) ) #define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_extent_at_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, \ - xfs_agblock_t agbno), \ - TP_ARGS(cur, irec, agbno)) + xfs_agblock_t gbno), \ + TP_ARGS(cur, irec, gbno)) /* double-rcext tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, - struct xfs_refcount_irec *i2), + struct xfs_refcount_irec *i2), TP_ARGS(cur, i1, i2), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) @@ -3451,6 +3464,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; @@ -3461,9 +3475,10 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " - "dom %s agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s gbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), 
MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, @@ -3484,10 +3499,11 @@ DEFINE_EVENT(xfs_refcount_double_extent_class, name, \ /* double-rcext and an agbno tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, - struct xfs_refcount_irec *i2, xfs_agblock_t agbno), - TP_ARGS(cur, i1, i2, agbno), + struct xfs_refcount_irec *i2, xfs_agblock_t gbno), + TP_ARGS(cur, i1, i2, gbno), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) @@ -3497,10 +3513,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; @@ -3510,11 +3527,12 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; - __entry->agbno = agbno; + __entry->gbno = gbno; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " - "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, @@ -3524,14 +3542,14 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount, - __entry->agbno) + __entry->gbno) ) #define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \ - struct xfs_refcount_irec *i2, xfs_agblock_t agbno), \ - TP_ARGS(cur, i1, i2, agbno)) + struct xfs_refcount_irec *i2, xfs_agblock_t gbno), \ + TP_ARGS(cur, i1, i2, gbno)) /* triple-rcext tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, @@ -3540,6 +3558,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, TP_ARGS(cur, i1, i2, i3), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) @@ -3556,6 +3575,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; @@ -3570,10 +3590,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, __entry->i3_blockcount = i3->rc_blockcount; __entry->i3_refcount = i3->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " - "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " - "dom %s agbno 0x%x fsbcount 0x%x 
refcount %u", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s gbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, @@ -3641,23 +3662,27 @@ DECLARE_EVENT_CLASS(xfs_refcount_deferred_class, TP_ARGS(mp, refc), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(int, op) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __entry->agno = XFS_FSB_TO_AGNO(mp, refc->ri_startblock); + __entry->type = refc->ri_group->xg_type; + __entry->agno = refc->ri_group->xg_gno; __entry->op = refc->ri_type; - __entry->agbno = XFS_FSB_TO_AGBNO(mp, refc->ri_startblock); + __entry->gbno = xfs_fsb_to_gbno(mp, refc->ri_startblock, + refc->ri_group->xg_type); __entry->len = refc->ri_blockcount; ), - TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d op %s %sno 0x%x gbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->op, XFS_REFCOUNT_INTENT_STRINGS), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __entry->len) ); #define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \ -- cgit v1.2.3 From 70fcf6866578e69635399e806273376f5e0b8e2b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:46 -0800 Subject: xfs: namespace the maximum length/refcount symbols Actually namespace these variables properly, so that readers can tell that this is an XFS symbol, and that it's for the refcount functionality. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_format.h | 4 ++-- fs/xfs/libxfs/xfs_refcount.c | 18 +++++++++--------- fs/xfs/scrub/refcount.c | 2 +- fs/xfs/scrub/refcount_repair.c | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index fba4e59aded4..16696bc3ff94 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1790,8 +1790,8 @@ struct xfs_refcount_key { __be32 rc_startblock; /* starting block number */ }; -#define MAXREFCOUNT ((xfs_nlink_t)~0U) -#define MAXREFCEXTLEN ((xfs_extlen_t)~0U) +#define XFS_REFC_REFCOUNT_MAX ((xfs_nlink_t)~0U) +#define XFS_REFC_LEN_MAX ((xfs_extlen_t)~0U) /* btree pointer type */ typedef __be32 xfs_refcount_ptr_t; diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index bbb86dc9a25c..faace12fe2e3 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -128,7 +128,7 @@ xfs_refcount_check_irec( struct xfs_perag *pag, const struct xfs_refcount_irec *irec) { - if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) + if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX) return __this_address; if (!xfs_refcount_check_domain(irec)) @@ -138,7 +138,7 @@ xfs_refcount_check_irec( if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount)) return __this_address; - if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) + if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX) return __this_address; return NULL; @@ -853,9 +853,9 @@ xfs_refc_merge_refcount( const struct xfs_refcount_irec *irec, enum xfs_refc_adjust_op adjust) { - /* Once a record hits MAXREFCOUNT, it is pinned there forever */ - if (irec->rc_refcount == MAXREFCOUNT) - return MAXREFCOUNT; + /* Once a record hits XFS_REFC_REFCOUNT_MAX, it is pinned forever */ + if (irec->rc_refcount == XFS_REFC_REFCOUNT_MAX) + return XFS_REFC_REFCOUNT_MAX; return irec->rc_refcount + adjust; } @@ -898,7 +898,7 @@ xfs_refc_want_merge_center( * hence we need to catch u32 addition overflows here. */ ulen += cleft->rc_blockcount + right->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; *ulenp = ulen; @@ -933,7 +933,7 @@ xfs_refc_want_merge_left( * hence we need to catch u32 addition overflows here. */ ulen += cleft->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; return true; @@ -967,7 +967,7 @@ xfs_refc_want_merge_right( * hence we need to catch u32 addition overflows here. */ ulen += cright->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; return true; @@ -1196,7 +1196,7 @@ xfs_refcount_adjust_extents( * Adjust the reference count and either update the tree * (incr) or free the blocks (decr). 
*/ - if (ext.rc_refcount == MAXREFCOUNT) + if (ext.rc_refcount == XFS_REFC_REFCOUNT_MAX) goto skip; ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur, &ext); diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 1c5e45cc6419..d46528023015 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -421,7 +421,7 @@ xchk_refcount_mergeable( if (r1->rc_refcount != r2->rc_refcount) return false; if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > - MAXREFCEXTLEN) + XFS_REFC_LEN_MAX) return false; return true; diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c index 4e572b81c986..1ee6d4aeb308 100644 --- a/fs/xfs/scrub/refcount_repair.c +++ b/fs/xfs/scrub/refcount_repair.c @@ -183,7 +183,7 @@ xrep_refc_stash( if (xchk_should_terminate(sc, &error)) return error; - irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount); + irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount); error = xrep_refc_check_ext(rr->sc, &irec); if (error) @@ -422,7 +422,7 @@ xrep_refc_find_refcounts( /* * Set up a bag to store all the rmap records that we're tracking to * generate a reference count record. If the size of the bag exceeds - * MAXREFCOUNT, we clamp rc_refcount. + * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount. */ error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); if (error) -- cgit v1.2.3 From 9abe03a0e4f978615a2b1b484b8d09ca84c16ea0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:47 -0800 Subject: xfs: introduce realtime refcount btree ondisk definitions Add the ondisk structure definitions for realtime refcount btrees. The realtime refcount btree will be rooted from a hidden inode so it needs to have a separate btree block magic and pointer format. Next, add everything needed to read, write and manipulate refcount btree blocks. This prepares the way for connecting the btree operations implementation, though the changes to actually root the rtrefcount btree in an inode come later. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_btree.c | 5 + fs/xfs/libxfs/xfs_format.h | 9 ++ fs/xfs/libxfs/xfs_ondisk.h | 1 + fs/xfs/libxfs/xfs_rtrefcount_btree.c | 273 +++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_rtrefcount_btree.h | 70 +++++++++ fs/xfs/libxfs/xfs_sb.c | 8 + fs/xfs/libxfs/xfs_shared.h | 7 + fs/xfs/xfs_mount.c | 5 +- fs/xfs/xfs_mount.h | 9 ++ fs/xfs/xfs_stats.c | 3 +- fs/xfs/xfs_stats.h | 1 + 12 files changed, 390 insertions(+), 2 deletions(-) create mode 100644 fs/xfs/libxfs/xfs_rtrefcount_btree.c create mode 100644 fs/xfs/libxfs/xfs_rtrefcount_btree.h (limited to 'fs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 338e10f81b7b..b9356d01416e 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -51,6 +51,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_rmap_btree.o \ xfs_refcount.o \ xfs_refcount_btree.o \ + xfs_rtrefcount_btree.o \ xfs_rtrmap_btree.o \ xfs_sb.o \ xfs_symlink_remote.o \ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 36ab06f8a3bc..299ce7fd11b0 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -35,6 +35,7 @@ #include "xfs_rmap.h" #include "xfs_quota.h" #include "xfs_metafile.h" +#include "xfs_rtrefcount_btree.h" /* * Btree magic numbers. 
@@ -5533,6 +5534,9 @@ xfs_btree_init_cur_caches(void) if (error) goto err; error = xfs_rtrmapbt_init_cur_cache(); + if (error) + goto err; + error = xfs_rtrefcountbt_init_cur_cache(); if (error) goto err; @@ -5552,6 +5556,7 @@ xfs_btree_destroy_cur_caches(void) xfs_rmapbt_destroy_cur_cache(); xfs_refcountbt_destroy_cur_cache(); xfs_rtrmapbt_destroy_cur_cache(); + xfs_rtrefcountbt_destroy_cur_cache(); } /* Move the btree cursor before the first record. */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 16696bc3ff94..17f7c0d1aaa4 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1796,6 +1796,15 @@ struct xfs_refcount_key { /* btree pointer type */ typedef __be32 xfs_refcount_ptr_t; +/* + * Realtime Reference Count btree format definitions + * + * This is a btree for reference count records for realtime volumes + */ +#define XFS_RTREFC_CRC_MAGIC 0x52434e54 /* 'RCNT' */ + +/* inode-rooted btree pointer type */ +typedef __be64 xfs_rtrefcount_ptr_t; /* * BMAP Btree format definitions diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 07e2f5fb3a94..efb035050c00 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -85,6 +85,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo, 48); XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root, 4); + XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t, 8); /* * m68k has problems with struct xfs_attr_leaf_name_remote, but we pad diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c new file mode 100644 index 000000000000..d07d3b1e78f7 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" + +static struct kmem_cache *xfs_rtrefcountbt_cur_cache; + +/* + * Realtime Reference Count btree. + * + * This is a btree used to track the owner(s) of a given extent in the realtime + * device. See the comments in xfs_refcount_btree.c for more information. + * + * This tree is basically the same as the regular refcount btree except that + * it's rooted in an inode. 
+ */ + +static struct xfs_btree_cur * +xfs_rtrefcountbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_rtrefcountbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); +} + +static xfs_failaddr_t +xfs_rtrefcountbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + int level; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (!xfs_has_reflink(mp)) + return __this_address; + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + level = be16_to_cpu(block->bb_level); + if (level > mp->m_rtrefc_maxlevels) + return __this_address; + + return xfs_btree_fsblock_verify(bp, mp->m_rtrefc_mxr[level != 0]); +} + +static void +xfs_rtrefcountbt_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (!xfs_btree_fsblock_verify_crc(bp)) + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_rtrefcountbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } + + if (bp->b_error) + trace_xfs_btree_corrupt(bp, _RET_IP_); +} + +static void +xfs_rtrefcountbt_write_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_rtrefcountbt_verify(bp); + if (fa) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + xfs_btree_fsblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = { + .name = "xfs_rtrefcountbt", + .magic = { 0, cpu_to_be32(XFS_RTREFC_CRC_MAGIC) }, + .verify_read = xfs_rtrefcountbt_read_verify, + .verify_write = xfs_rtrefcountbt_write_verify, + .verify_struct = xfs_rtrefcountbt_verify, +}; + +const struct xfs_btree_ops xfs_rtrefcountbt_ops = { + .name = "rtrefcount", + .type = XFS_BTREE_TYPE_INODE, + .geom_flags = XFS_BTGEO_IROOT_RECORDS, + + .rec_len = sizeof(struct xfs_refcount_rec), + .key_len = sizeof(struct xfs_refcount_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_REFC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rtrefcbt_2), + + .dup_cursor = xfs_rtrefcountbt_dup_cursor, + .buf_ops = &xfs_rtrefcountbt_buf_ops, +}; + +/* Allocate a new rt refcount btree cursor. */ +struct xfs_btree_cur * +xfs_rtrefcountbt_init_cursor( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg) +{ + struct xfs_inode *ip = NULL; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_btree_cur *cur; + + return NULL; /* XXX */ + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrefcountbt_ops, + mp->m_rtrefc_maxlevels, xfs_rtrefcountbt_cur_cache); + + cur->bc_ino.ip = ip; + cur->bc_refc.nr_ops = 0; + cur->bc_refc.shape_changes = 0; + cur->bc_group = xfs_group_hold(rtg_group(rtg)); + cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK); + cur->bc_ino.whichfork = XFS_DATA_FORK; + return cur; +} + +/* + * Install a new rt reverse mapping btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. + */ +void +xfs_rtrefcountbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. 
+ */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno; + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK); +} + +/* Calculate number of records in a realtime refcount btree block. */ +static inline unsigned int +xfs_rtrefcountbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Calculate number of records in an refcount btree block. + */ +unsigned int +xfs_rtrefcountbt_maxrecs( + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) +{ + blocklen -= XFS_RTREFCOUNT_BLOCK_LEN; + return xfs_rtrefcountbt_block_maxrecs(blocklen, leaf); +} + +/* Compute the max possible height for realtime refcount btrees. */ +unsigned int +xfs_rtrefcountbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = xfs_rtrefcountbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rtrefcountbt_block_maxrecs(blocklen, false) / 2; + + /* We need at most one record for every block in an rt group. */ + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS); +} + +int __init +xfs_rtrefcountbt_init_cur_cache(void) +{ + xfs_rtrefcountbt_cur_cache = kmem_cache_create("xfs_rtrefcountbt_cur", + xfs_btree_cur_sizeof( + xfs_rtrefcountbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_rtrefcountbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_rtrefcountbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_rtrefcountbt_cur_cache); + xfs_rtrefcountbt_cur_cache = NULL; +} + +/* Compute the maximum height of a realtime refcount btree. */ +void +xfs_rtrefcountbt_compute_maxlevels( + struct xfs_mount *mp) +{ + unsigned int d_maxlevels, r_maxlevels; + + if (!xfs_has_rtreflink(mp)) { + mp->m_rtrefc_maxlevels = 0; + return; + } + + /* + * The realtime refcountbt lives on the data device, which means that + * its maximum height is constrained by the size of the data device and + * the height required to store one refcount record for each rtextent + * in an rt group. + */ + d_maxlevels = xfs_btree_space_to_height(mp->m_rtrefc_mnr, + mp->m_sb.sb_dblocks); + r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrefc_mnr, + mp->m_sb.sb_rgextents); + + /* Add one level to handle the inode root level. */ + mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1; +} diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h new file mode 100644 index 000000000000..b713b3381880 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#ifndef __XFS_RTREFCOUNT_BTREE_H__ +#define __XFS_RTREFCOUNT_BTREE_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; +struct xbtree_ifakeroot; +struct xfs_rtgroup; + +/* refcounts only exist on crc enabled filesystems */ +#define XFS_RTREFCOUNT_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN + +struct xfs_btree_cur *xfs_rtrefcountbt_init_cursor(struct xfs_trans *tp, + struct xfs_rtgroup *rtg); +struct xfs_btree_cur *xfs_rtrefcountbt_stage_cursor(struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake); +void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp); +unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp, + unsigned int blocklen, bool leaf); +void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp); + +/* + * Addresses of records, keys, and pointers within an incore rtrefcountbt block. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +static inline struct xfs_refcount_rec * +xfs_rtrefcount_rec_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_refcount_rec *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_refcount_rec)); +} + +static inline struct xfs_refcount_key * +xfs_rtrefcount_key_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_refcount_key *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_refcount_key)); +} + +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_ptr_addr( + struct xfs_btree_block *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrefcount_ptr_t *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + maxrecs * sizeof(struct xfs_refcount_key) + + (index - 1) * sizeof(xfs_rtrefcount_ptr_t)); +} + +unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void); +int __init xfs_rtrefcountbt_init_cur_cache(void); +void xfs_rtrefcountbt_destroy_cur_cache(void); + +#endif /* __XFS_RTREFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 83fb14b4074c..3dc5f5dba162 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -29,6 +29,7 @@ #include "xfs_exchrange.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. 
@@ -1226,6 +1227,13 @@ xfs_sb_mount_common( mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; + mp->m_rtrefc_mxr[0] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, + true); + mp->m_rtrefc_mxr[1] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, + false); + mp->m_rtrefc_mnr[0] = mp->m_rtrefc_mxr[0] / 2; + mp->m_rtrefc_mnr[1] = mp->m_rtrefc_mxr[1] / 2; + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 960716c387cc..b1e0d9bc1f7d 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -42,6 +42,7 @@ extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops; extern const struct xfs_buf_ops xfs_rtsummary_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; extern const struct xfs_buf_ops xfs_rtsb_buf_ops; +extern const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops; extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; @@ -58,6 +59,7 @@ extern const struct xfs_btree_ops xfs_rmapbt_ops; extern const struct xfs_btree_ops xfs_rmapbt_mem_ops; extern const struct xfs_btree_ops xfs_rtrmapbt_ops; extern const struct xfs_btree_ops xfs_rtrmapbt_mem_ops; +extern const struct xfs_btree_ops xfs_rtrefcountbt_ops; static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops) { @@ -114,6 +116,11 @@ static inline bool xfs_btree_is_rtrmap(const struct xfs_btree_ops *ops) return ops == &xfs_rtrmapbt_ops; } +static inline bool xfs_btree_is_rtrefcount(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rtrefcountbt_ops; +} + /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 7b7d21b50d54..477c5262cf91 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -38,6 +38,7 @@ #include "xfs_metafile.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/stats.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -656,7 +657,8 @@ static inline void xfs_rtbtree_compute_maxlevels( struct xfs_mount *mp) { - mp->m_rtbtree_maxlevels = mp->m_rtrmap_maxlevels; + mp->m_rtbtree_maxlevels = max(mp->m_rtrmap_maxlevels, + mp->m_rtrefc_maxlevels); } /* @@ -729,6 +731,7 @@ xfs_mountfs( xfs_rmapbt_compute_maxlevels(mp); xfs_rtrmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); xfs_agbtree_compute_maxlevels(mp); xfs_rtbtree_compute_maxlevels(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1bc95fb170db..9a1516080e63 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -162,11 +162,14 @@ typedef struct xfs_mount { uint m_rtrmap_mnr[2]; /* min rtrmap btree records */ uint m_refc_mxr[2]; /* max refc btree records */ uint m_refc_mnr[2]; /* min refc btree records */ + uint m_rtrefc_mxr[2]; /* max rtrefc btree records */ + uint m_rtrefc_mnr[2]; /* min rtrefc btree records */ uint m_alloc_maxlevels; /* max alloc btree levels */ uint m_bm_maxlevels[2]; /* max bmap btree levels */ uint m_rmap_maxlevels; /* max rmap btree levels */ uint m_rtrmap_maxlevels; /* max rtrmap btree level */ uint m_refc_maxlevels; /* max refcount btree level */ + uint m_rtrefc_maxlevels; /* max rtrefc btree level */ unsigned int m_agbtree_maxlevels; /* max 
level of all AG btrees */ unsigned int m_rtbtree_maxlevels; /* max level of all rt btrees */ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ @@ -408,6 +411,12 @@ static inline bool xfs_has_rtrmapbt(struct xfs_mount *mp) xfs_has_rmapbt(mp); } +static inline bool xfs_has_rtreflink(struct xfs_mount *mp) +{ + return xfs_has_metadir(mp) && xfs_has_realtime(mp) && + xfs_has_reflink(mp); +} + /* * Some features are always on for v5 file systems, allow the compiler to * eliminiate dead code when building without v4 support. diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index b7f2988bc03b..35c7fb3ba324 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -54,7 +54,8 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "rmapbt_mem", xfsstats_offset(xs_rcbag_2) }, { "rcbagbt", xfsstats_offset(xs_rtrmap_2) }, { "rtrmapbt", xfsstats_offset(xs_rtrmap_mem_2)}, - { "rtrmapbt_mem", xfsstats_offset(xs_qm_dqreclaims)}, + { "rtrmapbt_mem", xfsstats_offset(xs_rtrefcbt_2) }, + { "rtrefcntbt", xfsstats_offset(xs_qm_dqreclaims)}, /* we print both series of quota information together */ { "qm", xfsstats_offset(xs_xstrat_bytes)}, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 9c47de5dff2d..15ba1abcf253 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -129,6 +129,7 @@ struct __xfsstats { uint32_t xs_rcbag_2[__XBTS_MAX]; uint32_t xs_rtrmap_2[__XBTS_MAX]; uint32_t xs_rtrmap_mem_2[__XBTS_MAX]; + uint32_t xs_rtrefcbt_2[__XBTS_MAX]; uint32_t xs_qm_dqreclaims; uint32_t xs_qm_dqreclaim_misses; uint32_t xs_qm_dquot_dups; -- cgit v1.2.3 From 2003c6a8754e307970c101a20baf8fb67d0588f2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:48 -0800 Subject: xfs: realtime refcount btree transaction reservations Make sure that there's enough log reservation to handle mapping and unmapping realtime extents. We have to reserve enough space to handle a split in the rtrefcountbt to add the record and a second split in the regular refcountbt to record the rtrefcountbt split. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_trans_resv.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index f3392eb2d7f4..13d00c7166e1 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -92,6 +92,14 @@ xfs_refcountbt_block_count( return num_ops * (2 * mp->m_refc_maxlevels - 1); } +static unsigned int +xfs_rtrefcountbt_block_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + return num_ops * (2 * mp->m_rtrefc_maxlevels - 1); +} + /* * Logging inodes is really tricksy. They are logged in memory format, * which means that what we write into the log doesn't directly translate into @@ -259,10 +267,13 @@ xfs_rtalloc_block_count( * Compute the log reservation required to handle the refcount update * transaction. Refcount updates are always done via deferred log items. 
* - * This is calculated as: + * This is calculated as the max of: * Data device refcount updates (t1): * the agfs of the ags containing the blocks: nr_ops * sector size * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + * Realtime refcount updates (t2); + * the rt refcount inode + * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size */ static unsigned int xfs_calc_refcountbt_reservation( @@ -270,12 +281,20 @@ xfs_calc_refcountbt_reservation( unsigned int nr_ops) { unsigned int blksz = XFS_FSB_TO_B(mp, 1); + unsigned int t1, t2 = 0; if (!xfs_has_reflink(mp)) return 0; - return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); + t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); + + if (xfs_has_realtime(mp)) + t2 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops), + blksz); + + return max(t1, t2); } /* -- cgit v1.2.3 From 1a6f88ea538db9b3d8aef86112894e7e6d098287 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:48 -0800 Subject: xfs: add realtime refcount btree operations Implement the generic btree operations needed to manipulate rtrefcount btree blocks. This is different from the regular refcountbt in that we allocate space from the filesystem at large, and are neither constrained to the free space nor any particular AG. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtrefcount_btree.c | 148 +++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c index d07d3b1e78f7..e30af9415816 100644 --- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c @@ -19,6 +19,7 @@ #include "xfs_btree.h" #include "xfs_btree_staging.h" #include "xfs_rtrefcount_btree.h" +#include "xfs_refcount.h" #include "xfs_trace.h" #include "xfs_cksum.h" #include "xfs_error.h" @@ -45,6 +46,106 @@ xfs_rtrefcountbt_dup_cursor( return xfs_rtrefcountbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); } +STATIC int +xfs_rtrefcountbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0) / 2; + } + + return cur->bc_mp->m_rtrefc_mnr[level != 0]; +} + +STATIC int +xfs_rtrefcountbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0); + } + + return cur->bc_mp->m_rtrefc_mxr[level != 0]; +} + +STATIC void +xfs_rtrefcountbt_init_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + key->refc.rc_startblock = rec->refc.rc_startblock; +} + +STATIC void +xfs_rtrefcountbt_init_high_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->refc.rc_startblock); + x += be32_to_cpu(rec->refc.rc_blockcount) - 1; + key->refc.rc_startblock = cpu_to_be32(x); +} + +STATIC void +xfs_rtrefcountbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = 
xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec->refc.rc_startblock = cpu_to_be32(start); + rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); + rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); +} + +STATIC void +xfs_rtrefcountbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +STATIC int64_t +xfs_rtrefcountbt_key_diff( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) +{ + const struct xfs_refcount_key *kp = &key->refc; + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + return (int64_t)be32_to_cpu(kp->rc_startblock) - start; +} + +STATIC int64_t +xfs_rtrefcountbt_diff_two_keys( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->refc.rc_startblock); + + return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - + be32_to_cpu(k2->refc.rc_startblock); +} + static xfs_failaddr_t xfs_rtrefcountbt_verify( struct xfs_buf *bp) @@ -111,6 +212,40 @@ const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = { .verify_struct = xfs_rtrefcountbt_verify, }; +STATIC int +xfs_rtrefcountbt_keys_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->refc.rc_startblock) < + be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC int +xfs_rtrefcountbt_recs_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->refc.rc_startblock) + + be32_to_cpu(r1->refc.rc_blockcount) <= + be32_to_cpu(r2->refc.rc_startblock); +} + +STATIC enum xbtree_key_contig +xfs_rtrefcountbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->refc.rc_startblock); + + return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock), + be32_to_cpu(key2->refc.rc_startblock)); +} + const struct xfs_btree_ops xfs_rtrefcountbt_ops = { .name = "rtrefcount", .type = XFS_BTREE_TYPE_INODE, @@ -124,7 +259,20 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = { .statoff = XFS_STATS_CALC_INDEX(xs_rtrefcbt_2), .dup_cursor = xfs_rtrefcountbt_dup_cursor, + .alloc_block = xfs_btree_alloc_metafile_block, + .free_block = xfs_btree_free_metafile_block, + .get_minrecs = xfs_rtrefcountbt_get_minrecs, + .get_maxrecs = xfs_rtrefcountbt_get_maxrecs, + .init_key_from_rec = xfs_rtrefcountbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur, + .key_diff = xfs_rtrefcountbt_key_diff, .buf_ops = &xfs_rtrefcountbt_buf_ops, + .diff_two_keys = xfs_rtrefcountbt_diff_two_keys, + .keys_inorder = xfs_rtrefcountbt_keys_inorder, + .recs_inorder = xfs_rtrefcountbt_recs_inorder, + .keys_contiguous = xfs_rtrefcountbt_keys_contiguous, }; /* Allocate a new rt refcount btree cursor. */ -- cgit v1.2.3 From 01cef1db246ee8b094fca6df23ea6d4335748181 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:49 -0800 Subject: xfs: prepare refcount functions to deal with rtrefcountbt Prepare the high-level refcount functions to deal with the new realtime refcountbt and its slightly different conventions. 
Provide the ability to talk to either refcountbt or rtrefcountbt formats from the same high level code. Note that we leave the _recover_cow_leftovers functions for a separate patch so that we can convert it all at once. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 53 +++++++++++++++++++++++++++++++++++++------- fs/xfs/libxfs/xfs_refcount.h | 3 +++ 2 files changed, 48 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index faace12fe2e3..9be40ac16c7d 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -25,6 +25,7 @@ #include "xfs_ag.h" #include "xfs_health.h" #include "xfs_refcount_item.h" +#include "xfs_rtgroup.h" struct kmem_cache *xfs_refcount_intent_cache; @@ -144,6 +145,37 @@ xfs_refcount_check_irec( return NULL; } +xfs_failaddr_t +xfs_rtrefcount_check_irec( + struct xfs_rtgroup *rtg, + const struct xfs_refcount_irec *irec) +{ + if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX) + return __this_address; + + if (!xfs_refcount_check_domain(irec)) + return __this_address; + + /* check for valid extent range, including overflow */ + if (!xfs_verify_rgbext(rtg, irec->rc_startblock, irec->rc_blockcount)) + return __this_address; + + if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX) + return __this_address; + + return NULL; +} + +static inline xfs_failaddr_t +xfs_refcount_check_btrec( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec) +{ + if (xfs_btree_is_rtrefcount(cur->bc_ops)) + return xfs_rtrefcount_check_irec(to_rtg(cur->bc_group), irec); + return xfs_refcount_check_irec(to_perag(cur->bc_group), irec); +} + static inline int xfs_refcount_complain_bad_rec( struct xfs_btree_cur *cur, @@ -152,9 +184,15 @@ xfs_refcount_complain_bad_rec( { struct xfs_mount *mp = cur->bc_mp; - xfs_warn(mp, + if (xfs_btree_is_rtrefcount(cur->bc_ops)) { + xfs_warn(mp, + "RT Refcount BTree record corruption in rtgroup %u detected at %pS!", + cur->bc_group->xg_gno, fa); + } else { + xfs_warn(mp, "Refcount BTree record corruption in AG %d detected at %pS!", cur->bc_group->xg_gno, fa); + } xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); @@ -180,7 +218,7 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - fa = xfs_refcount_check_irec(to_perag(cur->bc_group), irec); + fa = xfs_refcount_check_btrec(cur, irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, irec); @@ -1065,7 +1103,7 @@ xfs_refcount_still_have_space( */ overhead = xfs_allocfree_block_count(cur->bc_mp, cur->bc_refc.shape_changes); - overhead += cur->bc_mp->m_refc_maxlevels; + overhead += cur->bc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; /* @@ -1117,7 +1155,7 @@ xfs_refcount_adjust_extents( if (error) goto out_error; if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_startblock = xfs_group_max_blocks(cur->bc_group); ext.rc_blockcount = 0; ext.rc_refcount = 0; ext.rc_domain = XFS_REFC_DOMAIN_SHARED; @@ -1666,7 +1704,7 @@ xfs_refcount_adjust_cow_extents( goto out_error; } if (!found_rec) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_startblock = xfs_group_max_blocks(cur->bc_group); ext.rc_blockcount = 0; ext.rc_refcount = 0; ext.rc_domain = XFS_REFC_DOMAIN_COW; @@ -1877,8 +1915,7 @@ xfs_refcount_recover_extent( 
INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (xfs_refcount_check_irec(to_perag(cur->bc_group), &rr->rr_rrec) != - NULL || + if (xfs_refcount_check_btrec(cur, &rr->rr_rrec) != NULL || XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { xfs_btree_mark_sick(cur); @@ -2026,7 +2063,7 @@ xfs_refcount_query_range_helper( xfs_failaddr_t fa; xfs_refcount_btrec_to_irec(rec, &irec); - fa = xfs_refcount_check_irec(to_perag(cur->bc_group), &irec); + fa = xfs_refcount_check_btrec(cur, &irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 62d78afcf1f3..9cd58d48716f 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -12,6 +12,7 @@ struct xfs_perag; struct xfs_btree_cur; struct xfs_bmbt_irec; struct xfs_refcount_irec; +struct xfs_rtgroup; extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); @@ -120,6 +121,8 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag, const struct xfs_refcount_irec *irec); +xfs_failaddr_t xfs_rtrefcount_check_irec(struct xfs_rtgroup *rtg, + const struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); -- cgit v1.2.3 From fd9300679ccec20c6ee1b95458ab0bcf0db628d5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:50 -0800 Subject: xfs: add a realtime flag to the refcount update log redo items Extend the refcount update (CUI) log items with a new realtime flag that indicates that the updates apply against the realtime refcountbt. We'll wire up the actual refcount code later. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 10 +- fs/xfs/libxfs/xfs_defer.h | 1 + fs/xfs/libxfs/xfs_log_format.h | 6 +- fs/xfs/libxfs/xfs_log_recover.h | 2 + fs/xfs/libxfs/xfs_refcount.c | 64 ++++++++---- fs/xfs/libxfs/xfs_refcount.h | 17 +-- fs/xfs/scrub/cow_repair.c | 2 +- fs/xfs/scrub/reap.c | 5 +- fs/xfs/xfs_log_recover.c | 2 + fs/xfs/xfs_refcount_item.c | 224 +++++++++++++++++++++++++++++++++++++--- fs/xfs/xfs_reflink.c | 19 ++-- 11 files changed, 298 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 02323936cc9b..d63713630236 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4564,8 +4564,9 @@ xfs_bmapi_write( * the refcount btree for orphan recovery. */ if (whichfork == XFS_COW_FORK) - xfs_refcount_alloc_cow_extent(tp, bma.blkno, - bma.length); + xfs_refcount_alloc_cow_extent(tp, + XFS_IS_REALTIME_INODE(ip), + bma.blkno, bma.length); } /* Deal with the allocated space we found. 
*/ @@ -4740,7 +4741,8 @@ xfs_bmapi_convert_one_delalloc( *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) - xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); + xfs_refcount_alloc_cow_extent(tp, XFS_IS_REALTIME_INODE(ip), + bma.blkno, bma.length); error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); @@ -5388,7 +5390,7 @@ xfs_bmap_del_extent_real( bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { - xfs_refcount_decrease_extent(tp, del); + xfs_refcount_decrease_extent(tp, isrt, del); } else if (isrt && !xfs_has_rtgroups(mp)) { error = xfs_bmap_free_rtblocks(tp, del); } else { diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 1e2477eaa5a8..9effd95ddcd4 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -68,6 +68,7 @@ struct xfs_defer_op_type { extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; +extern const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a7e0e479454d..ec7157eaba5f 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -252,6 +252,8 @@ typedef struct xfs_trans_header { #define XFS_LI_EFD_RT 0x124b /* realtime extent free done */ #define XFS_LI_RUI_RT 0x124c /* realtime rmap update intent */ #define XFS_LI_RUD_RT 0x124d /* realtime rmap update done */ +#define XFS_LI_CUI_RT 0x124e /* realtime refcount update intent */ +#define XFS_LI_CUD_RT 0x124f /* realtime refcount update done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -275,7 +277,9 @@ typedef struct xfs_trans_header { { XFS_LI_EFI_RT, "XFS_LI_EFI_RT" }, \ { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" }, \ { XFS_LI_RUI_RT, "XFS_LI_RUI_RT" }, \ - { XFS_LI_RUD_RT, "XFS_LI_RUD_RT" } + { XFS_LI_RUD_RT, "XFS_LI_RUD_RT" }, \ + { XFS_LI_CUI_RT, "XFS_LI_CUI_RT" }, \ + { XFS_LI_CUD_RT, "XFS_LI_CUD_RT" } /* * Inode Log Item Format definitions. diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index abc705aff26d..66c7916fb5cd 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -81,6 +81,8 @@ extern const struct xlog_recover_item_ops xlog_rtefi_item_ops; extern const struct xlog_recover_item_ops xlog_rtefd_item_ops; extern const struct xlog_recover_item_ops xlog_rtrui_item_ops; extern const struct xlog_recover_item_ops xlog_rtrud_item_ops; +extern const struct xlog_recover_item_ops xlog_rtcui_item_ops; +extern const struct xlog_recover_item_ops xlog_rtcud_item_ops; /* * Macros, structures, prototypes for internal log manager use. diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 9be40ac16c7d..8007d1585625 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -26,6 +26,7 @@ #include "xfs_health.h" #include "xfs_refcount_item.h" #include "xfs_rtgroup.h" +#include "xfs_rtalloc.h" struct kmem_cache *xfs_refcount_intent_cache; @@ -1123,6 +1124,22 @@ xfs_refcount_still_have_space( cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } +/* Schedule an extent free. 
*/ +static int +xrefc_free_extent( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *rec) +{ + unsigned int flags = 0; + + if (xfs_btree_is_rtrefcount(cur->bc_ops)) + flags |= XFS_FREE_EXTENT_REALTIME; + + return xfs_free_extent_later(cur->bc_tp, + xfs_gbno_to_fsb(cur->bc_group, rec->rc_startblock), + rec->rc_blockcount, NULL, XFS_AG_RESV_NONE, flags); +} + /* * Adjust the refcounts of middle extents. At this point we should have * split extents that crossed the adjustment range; merged with adjacent @@ -1139,7 +1156,6 @@ xfs_refcount_adjust_extents( struct xfs_refcount_irec ext, tmp; int error; int found_rec, found_tmp; - xfs_fsblock_t fsbno; /* Merging did all the work already. */ if (*aglen == 0) @@ -1192,11 +1208,7 @@ xfs_refcount_adjust_extents( goto out_error; } } else { - fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group), - tmp.rc_startblock); - error = xfs_free_extent_later(cur->bc_tp, fsbno, - tmp.rc_blockcount, NULL, - XFS_AG_RESV_NONE, 0); + error = xrefc_free_extent(cur, &tmp); if (error) goto out_error; } @@ -1254,11 +1266,7 @@ xfs_refcount_adjust_extents( } goto advloop; } else { - fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group), - ext.rc_startblock); - error = xfs_free_extent_later(cur->bc_tp, fsbno, - ext.rc_blockcount, NULL, - XFS_AG_RESV_NONE, 0); + error = xrefc_free_extent(cur, &ext); if (error) goto out_error; } @@ -1454,6 +1462,20 @@ xfs_refcount_finish_one( return error; } +/* + * Process one of the deferred realtime refcount operations. We pass back the + * btree cursor to maintain our lock on the btree between calls. + */ +int +xfs_rtrefcount_finish_one( + struct xfs_trans *tp, + struct xfs_refcount_intent *ri, + struct xfs_btree_cur **pcur) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + /* * Record a refcount intent for later processing. */ @@ -1461,6 +1483,7 @@ static void __xfs_refcount_add( struct xfs_trans *tp, enum xfs_refcount_intent_type type, + bool isrt, xfs_fsblock_t startblock, xfs_extlen_t blockcount) { @@ -1472,6 +1495,7 @@ __xfs_refcount_add( ri->ri_type = type; ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; + ri->ri_realtime = isrt; xfs_refcount_defer_add(tp, ri); } @@ -1482,12 +1506,13 @@ __xfs_refcount_add( void xfs_refcount_increase_extent( struct xfs_trans *tp, + bool isrt, struct xfs_bmbt_irec *PREV) { if (!xfs_has_reflink(tp->t_mountp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, + __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, isrt, PREV->br_startblock, PREV->br_blockcount); } @@ -1497,12 +1522,13 @@ xfs_refcount_increase_extent( void xfs_refcount_decrease_extent( struct xfs_trans *tp, + bool isrt, struct xfs_bmbt_irec *PREV) { if (!xfs_has_reflink(tp->t_mountp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, + __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, isrt, PREV->br_startblock, PREV->br_blockcount); } @@ -1858,6 +1884,7 @@ __xfs_refcount_cow_free( void xfs_refcount_alloc_cow_extent( struct xfs_trans *tp, + bool isrt, xfs_fsblock_t fsb, xfs_extlen_t len) { @@ -1866,16 +1893,17 @@ xfs_refcount_alloc_cow_extent( if (!xfs_has_reflink(mp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); + __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, isrt, fsb, len); /* Add rmap entry */ - xfs_rmap_alloc_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW); + xfs_rmap_alloc_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. 
*/ void xfs_refcount_free_cow_extent( struct xfs_trans *tp, + bool isrt, xfs_fsblock_t fsb, xfs_extlen_t len) { @@ -1885,8 +1913,8 @@ xfs_refcount_free_cow_extent( return; /* Remove rmap entry */ - xfs_rmap_free_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW); - __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); + xfs_rmap_free_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW); + __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, isrt, fsb, len); } struct xfs_refcount_recovery { @@ -1992,7 +2020,7 @@ xfs_refcount_recover_cow_leftovers( /* Free the orphan record */ fsb = xfs_agbno_to_fsb(pag, rr->rr_rrec.rc_startblock); - xfs_refcount_free_cow_extent(tp, fsb, + xfs_refcount_free_cow_extent(tp, false, fsb, rr->rr_rrec.rc_blockcount); /* Free the block. */ diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 9cd58d48716f..be11df25abcc 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -61,6 +61,7 @@ struct xfs_refcount_intent { enum xfs_refcount_intent_type ri_type; xfs_extlen_t ri_blockcount; xfs_fsblock_t ri_startblock; + bool ri_realtime; }; /* Check that the refcount is appropriate for the record domain. */ @@ -75,22 +76,24 @@ xfs_refcount_check_domain( return true; } -void xfs_refcount_increase_extent(struct xfs_trans *tp, +void xfs_refcount_increase_extent(struct xfs_trans *tp, bool isrt, struct xfs_bmbt_irec *irec); -void xfs_refcount_decrease_extent(struct xfs_trans *tp, +void xfs_refcount_decrease_extent(struct xfs_trans *tp, bool isrt, struct xfs_bmbt_irec *irec); -extern int xfs_refcount_finish_one(struct xfs_trans *tp, +int xfs_refcount_finish_one(struct xfs_trans *tp, + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); +int xfs_rtrefcount_finish_one(struct xfs_trans *tp, struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_end_of_shared); -void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, - xfs_extlen_t len); -void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, - xfs_extlen_t len); +void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt, + xfs_fsblock_t fsb, xfs_extlen_t len); +void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt, + xfs_fsblock_t fsb, xfs_extlen_t len); extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, struct xfs_perag *pag); diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index 5b6194cef3e5..ba695dd21f8b 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -343,7 +343,7 @@ xrep_cow_alloc( if (args.fsbno == NULLFSBLOCK) return -ENOSPC; - xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len); + xfs_refcount_alloc_cow_extent(sc->tp, false, args.fsbno, args.len); repl->fsbno = args.fsbno; repl->len = args.len; diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 4d7f1b82dc55..534c7696e9a1 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -419,7 +419,8 @@ xreap_agextent_iter( * records from the refcountbt, which will remove the * rmap record as well. 
*/ - xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + xfs_refcount_free_cow_extent(sc->tp, false, fsbno, + *aglenp); return 0; } @@ -451,7 +452,7 @@ xreap_agextent_iter( if (rs->oinfo == &XFS_RMAP_OINFO_COW) { ASSERT(rs->resv == XFS_AG_RESV_NONE); - xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL, rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD); if (error) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5c95c97519c7..b3c27dbccce8 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1822,6 +1822,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { &xlog_rtefd_item_ops, &xlog_rtrui_item_ops, &xlog_rtrud_item_ops, + &xlog_rtcui_item_ops, + &xlog_rtcud_item_ops, }; static const struct xlog_recover_item_ops * diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index c807c4b90c44..2086d40514d0 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -23,6 +23,7 @@ #include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_trace.h" +#include "xfs_rtgroup.h" struct kmem_cache *xfs_cui_cache; struct kmem_cache *xfs_cud_cache; @@ -94,8 +95,9 @@ xfs_cui_item_format( ASSERT(atomic_read(&cuip->cui_next_extent) == cuip->cui_format.cui_nextents); + ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT); - cuip->cui_format.cui_type = XFS_LI_CUI; + cuip->cui_format.cui_type = lip->li_type; cuip->cui_format.cui_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format, @@ -138,12 +140,14 @@ xfs_cui_item_release( STATIC struct xfs_cui_log_item * xfs_cui_init( struct xfs_mount *mp, + unsigned short item_type, uint nextents) - { struct xfs_cui_log_item *cuip; ASSERT(nextents > 0); + ASSERT(item_type == XFS_LI_CUI || item_type == XFS_LI_CUI_RT); + if (nextents > XFS_CUI_MAX_FAST_EXTENTS) cuip = kzalloc(xfs_cui_log_item_sizeof(nextents), GFP_KERNEL | __GFP_NOFAIL); @@ -151,7 +155,7 @@ xfs_cui_init( cuip = kmem_cache_zalloc(xfs_cui_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); + xfs_log_item_init(mp, &cuip->cui_item, item_type, &xfs_cui_item_ops); cuip->cui_format.cui_nextents = nextents; cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; atomic_set(&cuip->cui_next_extent, 0); @@ -190,7 +194,9 @@ xfs_cud_item_format( struct xfs_cud_log_item *cudp = CUD_ITEM(lip); struct xfs_log_iovec *vecp = NULL; - cudp->cud_format.cud_type = XFS_LI_CUD; + ASSERT(lip->li_type == XFS_LI_CUD || lip->li_type == XFS_LI_CUD_RT); + + cudp->cud_format.cud_type = lip->li_type; cudp->cud_format.cud_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format, @@ -234,6 +240,14 @@ static inline struct xfs_refcount_intent *ci_entry(const struct list_head *e) return list_entry(e, struct xfs_refcount_intent, ri_list); } +static inline bool +xfs_cui_item_isrt(const struct xfs_log_item *lip) +{ + ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT); + + return lip->li_type == XFS_LI_CUI_RT; +} + /* Sort refcount intents by AG. 
*/ static int xfs_refcount_update_diff_items( @@ -282,18 +296,20 @@ xfs_refcount_update_log_item( } static struct xfs_log_item * -xfs_refcount_update_create_intent( +__xfs_refcount_update_create_intent( struct xfs_trans *tp, struct list_head *items, unsigned int count, - bool sort) + bool sort, + unsigned short item_type) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); + struct xfs_cui_log_item *cuip; struct xfs_refcount_intent *ri; ASSERT(count > 0); + cuip = xfs_cui_init(mp, item_type, count); if (sort) list_sort(mp, items, xfs_refcount_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -301,6 +317,23 @@ xfs_refcount_update_create_intent( return &cuip->cui_item; } +static struct xfs_log_item * +xfs_refcount_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_refcount_update_create_intent(tp, items, count, sort, + XFS_LI_CUI); +} + +static inline unsigned short +xfs_cud_type_from_cui(const struct xfs_cui_log_item *cuip) +{ + return xfs_cui_item_isrt(&cuip->cui_item) ? XFS_LI_CUD_RT : XFS_LI_CUD; +} + /* Get an CUD so we can process all the deferred refcount updates. */ static struct xfs_log_item * xfs_refcount_update_create_done( @@ -312,8 +345,8 @@ xfs_refcount_update_create_done( struct xfs_cud_log_item *cudp; cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, - &xfs_cud_item_ops); + xfs_log_item_init(tp->t_mountp, &cudp->cud_item, + xfs_cud_type_from_cui(cuip), &xfs_cud_item_ops); cudp->cud_cuip = cuip; cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; @@ -328,10 +361,20 @@ xfs_refcount_defer_add( { struct xfs_mount *mp = tp->t_mountp; - ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock, XG_TYPE_AG); + /* + * Deferred refcount updates for the realtime and data sections must + * use separate transactions to finish deferred work because updates to + * realtime metadata files can lock AGFs to allocate btree blocks and + * we don't want that mixing with the AGF locks taken to finish data + * section updates. + */ + ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock, + ri->ri_realtime ? XG_TYPE_RTG : XG_TYPE_AG); trace_xfs_refcount_defer(mp, ri); - xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); + xfs_defer_add(tp, &ri->ri_list, ri->ri_realtime ? + &xfs_rtrefcount_update_defer_type : + &xfs_refcount_update_defer_type); } /* Cancel a deferred refcount update. 
*/ @@ -381,7 +424,7 @@ xfs_refcount_finish_one_cleanup( return; agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); - if (error) + if (error && agbp) xfs_trans_brelse(tp, agbp); } @@ -515,10 +558,13 @@ xfs_refcount_relog_intent( struct xfs_phys_extent *pmap; unsigned int count; + ASSERT(intent->li_type == XFS_LI_CUI || + intent->li_type == XFS_LI_CUI_RT); + count = CUI_ITEM(intent)->cui_format.cui_nextents; pmap = CUI_ITEM(intent)->cui_format.cui_extents; - cuip = xfs_cui_init(tp->t_mountp, count); + cuip = xfs_cui_init(tp->t_mountp, intent->li_type, count); memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); @@ -538,6 +584,71 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { .relog_intent = xfs_refcount_relog_intent, }; +#ifdef CONFIG_XFS_RT +static struct xfs_log_item * +xfs_rtrefcount_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_refcount_update_create_intent(tp, items, count, sort, + XFS_LI_CUI_RT); +} + +/* Process a deferred realtime refcount update. */ +STATIC int +xfs_rtrefcount_update_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + struct xfs_refcount_intent *ri = ci_entry(item); + int error; + + error = xfs_rtrefcount_finish_one(tp, ri, state); + + /* Did we run out of reservation? Requeue what we didn't finish. */ + if (!error && ri->ri_blockcount > 0) { + ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || + ri->ri_type == XFS_REFCOUNT_DECREASE); + return -EAGAIN; + } + + xfs_refcount_update_cancel_item(item); + return error; +} + +/* Clean up after calling xfs_rtrefcount_finish_one. */ +STATIC void +xfs_rtrefcount_finish_one_cleanup( + struct xfs_trans *tp, + struct xfs_btree_cur *rcur, + int error) +{ + if (rcur) + xfs_btree_del_cursor(rcur, error); +} + +const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = { + .name = "rtrefcount", + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .create_intent = xfs_rtrefcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_rtrefcount_update_finish_item, + .finish_cleanup = xfs_rtrefcount_finish_one_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, + .recover_work = xfs_refcount_recover_work, + .relog_intent = xfs_refcount_relog_intent, +}; +#else +const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = { + .name = "rtrefcount", +}; +#endif /* CONFIG_XFS_RT */ + STATIC bool xfs_cui_item_match( struct xfs_log_item *lip, @@ -603,7 +714,7 @@ xlog_recover_cui_commit_pass2( return -EFSCORRUPTED; } - cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents); xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); @@ -617,6 +728,61 @@ const struct xlog_recover_item_ops xlog_cui_item_ops = { .commit_pass2 = xlog_recover_cui_commit_pass2, }; +#ifdef CONFIG_XFS_RT +STATIC int +xlog_recover_rtcui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_cui_log_item *cuip; + struct xfs_cui_log_format *cui_formatp; + size_t len; + + cui_formatp = item->ri_buf[0].i_addr; + + if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) { + 
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents); + xfs_cui_copy_format(&cuip->cui_format, cui_formatp); + atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); + + xlog_recover_intent_item(log, &cuip->cui_item, lsn, + &xfs_rtrefcount_update_defer_type); + return 0; +} +#else +STATIC int +xlog_recover_rtcui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; +} +#endif + +const struct xlog_recover_item_ops xlog_rtcui_item_ops = { + .item_type = XFS_LI_CUI_RT, + .commit_pass2 = xlog_recover_rtcui_commit_pass2, +}; + /* * This routine is called when an CUD format structure is found in a committed * transaction in the log. Its purpose is to cancel the corresponding CUI if it @@ -648,3 +814,33 @@ const struct xlog_recover_item_ops xlog_cud_item_ops = { .item_type = XFS_LI_CUD, .commit_pass2 = xlog_recover_cud_commit_pass2, }; + +#ifdef CONFIG_XFS_RT +STATIC int +xlog_recover_rtcud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_cud_log_format *cud_formatp; + + cud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_CUI_RT, + cud_formatp->cud_cui_id); + return 0; +} +#else +# define xlog_recover_rtcud_commit_pass2 xlog_recover_rtcui_commit_pass2 +#endif + +const struct xlog_recover_item_ops xlog_rtcud_item_ops = { + .item_type = XFS_LI_CUD_RT, + .commit_pass2 = xlog_recover_rtcud_commit_pass2, +}; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index b11769c009ef..02457e176c42 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -571,6 +571,7 @@ xfs_reflink_cancel_cow_blocks( struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; + bool isrt = XFS_IS_REALTIME_INODE(ip); int error = 0; if (!xfs_inode_has_cow_data(ip)) @@ -598,12 +599,13 @@ xfs_reflink_cancel_cow_blocks( ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); /* Free the CoW orphan record. */ - xfs_refcount_free_cow_extent(*tpp, del.br_startblock, - del.br_blockcount); + xfs_refcount_free_cow_extent(*tpp, isrt, + del.br_startblock, del.br_blockcount); error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL, - XFS_AG_RESV_NONE, 0); + XFS_AG_RESV_NONE, + isrt ? XFS_FREE_EXTENT_REALTIME : 0); if (error) break; @@ -710,6 +712,7 @@ xfs_reflink_end_cow_extent( struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); unsigned int resblks; int nmaps; + bool isrt = XFS_IS_REALTIME_INODE(ip); int error; resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); @@ -779,7 +782,7 @@ xfs_reflink_end_cow_extent( * or not), unmap the extent and drop its refcount. 
*/ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); - xfs_refcount_decrease_extent(tp, &data); + xfs_refcount_decrease_extent(tp, isrt, &data); xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -data.br_blockcount); } else if (data.br_startblock == DELAYSTARTBLOCK) { @@ -799,7 +802,8 @@ xfs_reflink_end_cow_extent( } /* Free the CoW orphan record. */ - xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); + xfs_refcount_free_cow_extent(tp, isrt, del.br_startblock, + del.br_blockcount); /* Map the new blocks into the data fork. */ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del); @@ -1135,6 +1139,7 @@ xfs_reflink_remap_extent( bool quota_reserved = true; bool smap_real; bool dmap_written = xfs_bmap_is_written_extent(dmap); + bool isrt = XFS_IS_REALTIME_INODE(ip); int iext_delta = 0; int nimaps; int error; @@ -1264,7 +1269,7 @@ xfs_reflink_remap_extent( * or not), unmap the extent and drop its refcount. */ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap); - xfs_refcount_decrease_extent(tp, &smap); + xfs_refcount_decrease_extent(tp, isrt, &smap); qdelta -= smap.br_blockcount; } else if (smap.br_startblock == DELAYSTARTBLOCK) { int done; @@ -1287,7 +1292,7 @@ xfs_reflink_remap_extent( * its refcount and map it into the file. */ if (dmap_written) { - xfs_refcount_increase_extent(tp, dmap); + xfs_refcount_increase_extent(tp, isrt, dmap); xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap); qdelta += dmap->br_blockcount; } -- cgit v1.2.3 From ee6d43447923260830285c7ca2d4ae4efab4f31f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:51 -0800 Subject: xfs: support recovering refcount intent items targetting realtime extents Now that we have reflink on the realtime device, refcount intent items have to support remapping extents on the realtime volume. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_refcount_item.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 2086d40514d0..fe2d7aab8554 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -440,6 +440,7 @@ xfs_refcount_update_abort_intent( static inline bool xfs_cui_validate_phys( struct xfs_mount *mp, + bool isrt, struct xfs_phys_extent *pmap) { if (!xfs_has_reflink(mp)) @@ -458,6 +459,9 @@ xfs_cui_validate_phys( return false; } + if (isrt) + return xfs_verify_rtbext(mp, pmap->pe_startblock, pmap->pe_len); + return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); } @@ -465,6 +469,7 @@ static inline void xfs_cui_recover_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp, + bool isrt, struct xfs_phys_extent *pmap) { struct xfs_refcount_intent *ri; @@ -475,7 +480,8 @@ xfs_cui_recover_work( ri->ri_startblock = pmap->pe_startblock; ri->ri_blockcount = pmap->pe_len; ri->ri_group = xfs_group_intent_get(mp, pmap->pe_startblock, - XG_TYPE_AG); + isrt ? XG_TYPE_RTG : XG_TYPE_AG); + ri->ri_realtime = isrt; xfs_defer_add_item(dfp, &ri->ri_list); } @@ -494,6 +500,7 @@ xfs_refcount_recover_work( struct xfs_cui_log_item *cuip = CUI_ITEM(lip); struct xfs_trans *tp; struct xfs_mount *mp = lip->li_log->l_mp; + bool isrt = xfs_cui_item_isrt(lip); int i; int error = 0; @@ -503,7 +510,7 @@ xfs_refcount_recover_work( * just toss the CUI. 
*/ for (i = 0; i < cuip->cui_format.cui_nextents; i++) { - if (!xfs_cui_validate_phys(mp, + if (!xfs_cui_validate_phys(mp, isrt, &cuip->cui_format.cui_extents[i])) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &cuip->cui_format, @@ -511,7 +518,8 @@ xfs_refcount_recover_work( return -EFSCORRUPTED; } - xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]); + xfs_cui_recover_work(mp, dfp, isrt, + &cuip->cui_format.cui_extents[i]); } /* -- cgit v1.2.3 From e08d0f2004cd925c6b68f982ac7de6ef77dab8ea Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:51 -0800 Subject: xfs: add realtime refcount btree block detection to log recovery Identify rt refcount btree blocks in the log correctly so that we can validate them during log recovery. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_buf_item_recover.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 4f2e4ea29e1f..b05d5b81f642 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -271,6 +271,9 @@ xlog_recover_validate_buf_type( case XFS_REFC_CRC_MAGIC: bp->b_ops = &xfs_refcountbt_buf_ops; break; + case XFS_RTREFC_CRC_MAGIC: + bp->b_ops = &xfs_rtrefcountbt_buf_ops; + break; default: warnmsg = "Bad btree block magic!"; break; @@ -859,6 +862,7 @@ xlog_recover_get_buf_lsn( break; } case XFS_RTRMAP_CRC_MAGIC: + case XFS_RTREFC_CRC_MAGIC: case XFS_BMAP_CRC_MAGIC: case XFS_BMAP_MAGIC: { struct xfs_btree_block *btb = blk; -- cgit v1.2.3 From eaed472c40527e526217aff3737816b44b08b363 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:52 -0800 Subject: xfs: add realtime refcount btree inode to metadata directory Add a metadir path to select the realtime refcount btree inode and load it at mount time. The rtrefcountbt inode will have a unique extent format code, which means that we also have to update the inode validation and flush routines to look for it. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_format.h | 4 +++- fs/xfs/libxfs/xfs_inode_buf.c | 5 +++++ fs/xfs/libxfs/xfs_inode_fork.c | 6 ++++++ fs/xfs/libxfs/xfs_rtgroup.c | 7 +++++++ fs/xfs/libxfs/xfs_rtgroup.h | 6 ++++++ fs/xfs/libxfs/xfs_rtrefcount_btree.c | 6 +++--- 6 files changed, 30 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 17f7c0d1aaa4..b6828f92c131 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -858,6 +858,7 @@ enum xfs_metafile_type { XFS_METAFILE_RTBITMAP, /* rt bitmap */ XFS_METAFILE_RTSUMMARY, /* rt summary */ XFS_METAFILE_RTRMAP, /* rt rmap */ + XFS_METAFILE_RTREFCOUNT, /* rt refcount */ XFS_METAFILE_MAX } __packed; @@ -870,7 +871,8 @@ enum xfs_metafile_type { { XFS_METAFILE_PRJQUOTA, "prjquota" }, \ { XFS_METAFILE_RTBITMAP, "rtbitmap" }, \ { XFS_METAFILE_RTSUMMARY, "rtsummary" }, \ - { XFS_METAFILE_RTRMAP, "rtrmap" } + { XFS_METAFILE_RTRMAP, "rtrmap" }, \ + { XFS_METAFILE_RTREFCOUNT, "rtrefcount" } /* * On-disk inode structure. 
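The inode buffer and fork changes below add the corresponding validation. As an illustration-only sketch (every name here is a stand-in, not a kernel definition), the check amounts to: accept an inode carrying the realtime refcount metafile type only when the reflink feature is enabled and its data fork uses the metadata-btree format.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in data fork format codes; only the names matter for this sketch. */
enum dfork_fmt { DFORK_FMT_EXTENTS, DFORK_FMT_BTREE, DFORK_FMT_META_BTREE };

/* Accept an inode that claims the rt refcount metafile type? */
static bool rtrefcount_metafile_ok(bool has_reflink, enum dfork_fmt fmt)
{
	/* the reflink feature must exist before the metadir inode can */
	if (!has_reflink)
		return false;
	/* the btree root is embedded in the data fork, so it must be META_BTREE */
	return fmt == DFORK_FMT_META_BTREE;
}

int main(void)
{
	printf("%d\n", rtrefcount_metafile_ok(true, DFORK_FMT_META_BTREE));
	printf("%d\n", rtrefcount_metafile_ok(false, DFORK_FMT_META_BTREE));
	return 0;
}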
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 17cb91b89fca..65eec8f60376 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -456,6 +456,11 @@ xfs_dinode_verify_fork( if (!xfs_has_rmapbt(mp)) return __this_address; break; + case XFS_METAFILE_RTREFCOUNT: + /* same comment about growfs and rmap inodes applies */ + if (!xfs_has_reflink(mp)) + return __this_address; + break; default: return __this_address; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index d9b3c182cb40..0c4bc12401a1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -272,6 +272,9 @@ xfs_iformat_data_fork( switch (ip->i_metatype) { case XFS_METAFILE_RTRMAP: return xfs_iformat_rtrmap(ip, dip); + case XFS_METAFILE_RTREFCOUNT: + ASSERT(0); /* to be implemented later */ + return -EFSCORRUPTED; default: break; } @@ -620,6 +623,9 @@ xfs_iflush_fork( case XFS_METAFILE_RTRMAP: xfs_iflush_rtrmap(ip, dip); break; + case XFS_METAFILE_RTREFCOUNT: + ASSERT(0); /* to be implemented later */ + break; default: ASSERT(0); break; diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index b7ed2d27d545..6aebe9f48490 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -367,6 +367,13 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { .enabled = xfs_has_rmapbt, .create = xfs_rtrmapbt_create, }, + [XFS_RTGI_REFCOUNT] = { + .name = "refcount", + .metafile_type = XFS_METAFILE_RTREFCOUNT, + .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, + /* same comment about growfs and rmap inodes applies here */ + .enabled = xfs_has_reflink, + }, }; /* Return the shortname of this rtgroup inode. */ diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 733da7417c9c..385ea8e2f28b 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -15,6 +15,7 @@ enum xfs_rtg_inodes { XFS_RTGI_BITMAP, /* allocation bitmap */ XFS_RTGI_SUMMARY, /* allocation summary */ XFS_RTGI_RMAP, /* rmap btree inode */ + XFS_RTGI_REFCOUNT, /* refcount btree inode */ XFS_RTGI_MAX, }; @@ -80,6 +81,11 @@ static inline struct xfs_inode *rtg_rmap(const struct xfs_rtgroup *rtg) return rtg->rtg_inodes[XFS_RTGI_RMAP]; } +static inline struct xfs_inode *rtg_refcount(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_REFCOUNT]; +} + /* Passive rtgroup references */ static inline struct xfs_rtgroup * xfs_rtgroup_get( diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c index e30af9415816..ebbeab112d14 100644 --- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c @@ -26,6 +26,7 @@ #include "xfs_extent_busy.h" #include "xfs_rtgroup.h" #include "xfs_rtbitmap.h" +#include "xfs_metafile.h" static struct kmem_cache *xfs_rtrefcountbt_cur_cache; @@ -281,12 +282,10 @@ xfs_rtrefcountbt_init_cursor( struct xfs_trans *tp, struct xfs_rtgroup *rtg) { - struct xfs_inode *ip = NULL; + struct xfs_inode *ip = rtg_refcount(rtg); struct xfs_mount *mp = rtg_mount(rtg); struct xfs_btree_cur *cur; - return NULL; /* XXX */ - xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrefcountbt_ops, @@ -316,6 +315,7 @@ xfs_rtrefcountbt_commit_staged_btree( int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE); /* * Free any resources hanging off the real fork, then 
shallow-copy the -- cgit v1.2.3 From bf0b99411335db18a9ed4fcef278ce9e313f6076 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:53 -0800 Subject: xfs: add metadata reservations for realtime refcount btree Reserve some free blocks so that we will always have enough free blocks in the data volume to handle expansion of the realtime refcount btree. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtrefcount_btree.c | 38 ++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_rtrefcount_btree.h | 4 ++++ fs/xfs/xfs_rtalloc.c | 6 ++++++ 3 files changed, 48 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c index ebbeab112d14..ff72ed09e75f 100644 --- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c @@ -419,3 +419,41 @@ xfs_rtrefcountbt_compute_maxlevels( /* Add one level to handle the inode root level. */ mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1; } + +/* Calculate the rtrefcount btree size for some records. */ +unsigned long long +xfs_rtrefcountbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_rtrefc_mnr, len); +} + +/* + * Calculate the maximum refcount btree size. + */ +static unsigned long long +xfs_rtrefcountbt_max_size( + struct xfs_mount *mp, + xfs_rtblock_t rtblocks) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_rtrefc_mxr[0] == 0) + return 0; + + return xfs_rtrefcountbt_calc_size(mp, rtblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + * We need enough space to hold one record for every rt extent in the rtgroup. + */ +xfs_filblks_t +xfs_rtrefcountbt_calc_reserves( + struct xfs_mount *mp) +{ + if (!xfs_has_rtreflink(mp)) + return 0; + + return xfs_rtrefcountbt_max_size(mp, mp->m_sb.sb_rgextents); +} diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h index b713b3381880..3cd44590c930 100644 --- a/fs/xfs/libxfs/xfs_rtrefcount_btree.h +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h @@ -67,4 +67,8 @@ unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void); int __init xfs_rtrefcountbt_init_cur_cache(void); void xfs_rtrefcountbt_destroy_cur_cache(void); +xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp); +unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp, + unsigned long long len); + #endif /* __XFS_RTREFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index a69967f9d88e..294aa0739be3 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -31,6 +31,7 @@ #include "xfs_rtgroup.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_rtrefcount_btree.h" /* * Return whether there are any free extents in the size range given @@ -1547,6 +1548,11 @@ xfs_rt_resv_init( err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask); if (err2 && !error) error = err2; + + ask = xfs_rtrefcountbt_calc_reserves(mp); + err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask); + if (err2 && !error) + error = err2; } return error; -- cgit v1.2.3 From f0415af60f482a2192065be8b334b409495ca8a3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:54 -0800 Subject: xfs: wire up a new metafile type for the realtime refcount Plumb in the pieces we need to embed the root of the realtime refcount btree in an inode's data fork, complete with metafile type and on-disk interpretation functions. 
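To make the space accounting concrete, the standalone sketch below mirrors the xfs_rtrefcount_droot_space_calc() helper added later in this patch: the 4-byte on-disk root header is followed either by leaf records or by keys plus block pointers. The struct layouts are simplified stand-ins for illustration, not the kernel's definitions.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct droot_hdr { uint16_t bb_level; uint16_t bb_numrecs; };	/* analogous to xfs_rtrefcount_root */
struct refc_rec  { uint32_t rc_startblock, rc_blockcount, rc_refcount; };
struct refc_key  { uint32_t rc_startblock; };
typedef uint64_t droot_ptr_t;

/* Bytes needed for an inode-rooted on-disk root holding nrecs entries. */
static size_t droot_space(unsigned int level, unsigned int nrecs)
{
	size_t sz = sizeof(struct droot_hdr);

	if (level > 0)		/* node root: one key and one pointer per entry */
		return sz + nrecs * (sizeof(struct refc_key) + sizeof(droot_ptr_t));
	return sz + nrecs * sizeof(struct refc_rec);	/* leaf root */
}

int main(void)
{
	printf("leaf root, 8 records: %zu bytes\n", droot_space(0, 8));
	printf("node root, 8 entries: %zu bytes\n", droot_space(1, 8));
	return 0;
}

The assertions added to xfs_iflush_rtrefcount() further down check that this computed size fits within the inode's data fork.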
Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_format.h | 8 ++ fs/xfs/libxfs/xfs_inode_fork.c | 6 +- fs/xfs/libxfs/xfs_ondisk.h | 1 + fs/xfs/libxfs/xfs_rtrefcount_btree.c | 264 +++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_rtrefcount_btree.h | 112 +++++++++++++++ fs/xfs/xfs_inode_item_recover.c | 4 + 6 files changed, 392 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index b6828f92c131..b1007fb661ba 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1805,6 +1805,14 @@ typedef __be32 xfs_refcount_ptr_t; */ #define XFS_RTREFC_CRC_MAGIC 0x52434e54 /* 'RCNT' */ +/* + * rt refcount root header, on-disk form only. + */ +struct xfs_rtrefcount_root { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +}; + /* inode-rooted btree pointer type */ typedef __be64 xfs_rtrefcount_ptr_t; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 0c4bc12401a1..4f99b90add55 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -28,6 +28,7 @@ #include "xfs_health.h" #include "xfs_symlink_remote.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" struct kmem_cache *xfs_ifork_cache; @@ -273,8 +274,7 @@ xfs_iformat_data_fork( case XFS_METAFILE_RTRMAP: return xfs_iformat_rtrmap(ip, dip); case XFS_METAFILE_RTREFCOUNT: - ASSERT(0); /* to be implemented later */ - return -EFSCORRUPTED; + return xfs_iformat_rtrefcount(ip, dip); default: break; } @@ -624,7 +624,7 @@ xfs_iflush_fork( xfs_iflush_rtrmap(ip, dip); break; case XFS_METAFILE_RTREFCOUNT: - ASSERT(0); /* to be implemented later */ + xfs_iflush_rtrefcount(ip, dip); break; default: ASSERT(0); diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index efb035050c00..a85ecddaa48e 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -86,6 +86,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root, 4); XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtrefcount_root, 4); /* * m68k has problems with struct xfs_attr_leaf_name_remote, but we pad diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c index ff72ed09e75f..6a5bc7ea42fb 100644 --- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c @@ -77,6 +77,41 @@ xfs_rtrefcountbt_get_maxrecs( return cur->bc_mp->m_rtrefc_mxr[level != 0]; } +/* + * Calculate number of records in a realtime refcount btree inode root. + */ +unsigned int +xfs_rtrefcountbt_droot_maxrecs( + unsigned int blocklen, + bool leaf) +{ + blocklen -= sizeof(struct xfs_rtrefcount_root); + + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (2 * sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_rtrefcountbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork so that + * we can resize the in-memory buffer to match it. After a resize to the + * maximum size this function returns the same value as + * xfs_rtrefcountbt_get_maxrecs for the root node, too. 
+ */ +STATIC int +xfs_rtrefcountbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_rtrefc_mxr[level != 0]; + return xfs_rtrefcountbt_droot_maxrecs(cur->bc_ino.forksize, level == 0); +} + STATIC void xfs_rtrefcountbt_init_key_from_rec( union xfs_btree_key *key, @@ -247,6 +282,87 @@ xfs_rtrefcountbt_keys_contiguous( be32_to_cpu(key2->refc.rc_startblock)); } +static inline void +xfs_rtrefcountbt_move_ptrs( + struct xfs_mount *mp, + struct xfs_btree_block *broot, + short old_size, + size_t new_size, + unsigned int numrecs) +{ + void *dptr; + void *sptr; + + sptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, old_size); + dptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, new_size); + memmove(dptr, sptr, numrecs * sizeof(xfs_rtrefcount_ptr_t)); +} + +static struct xfs_btree_block * +xfs_rtrefcountbt_broot_realloc( + struct xfs_btree_cur *cur, + unsigned int new_numrecs) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + struct xfs_btree_block *broot; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; + const unsigned int level = cur->bc_nlevels - 1; + + new_size = xfs_rtrefcount_broot_space_calc(mp, level, new_numrecs); + + /* Handle the nop case quietly. */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; + + /* + * If there wasn't any memory allocated before, just allocate + * it now and get out. + */ + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); + + /* + * If there is already an existing if_broot, then we need to + * realloc it and possibly move the node block pointers because + * those are not butted up against the btree block header. + */ + old_numrecs = xfs_rtrefcountbt_maxrecs(mp, old_size, level); + broot = xfs_broot_realloc(ifp, new_size); + if (level > 0) + xfs_rtrefcountbt_move_ptrs(mp, broot, old_size, + new_size, old_numrecs); + goto out_broot; + } + + /* + * We're reducing numrecs. If we're going all the way to zero, just + * free the block. + */ + ASSERT(ifp->if_broot != NULL && old_size > 0); + if (new_size == 0) + return xfs_broot_realloc(ifp, 0); + + /* + * Shrink the btree root by possibly moving the rtrmapbt pointers, + * since they are not butted up against the btree block header. Then + * reallocate broot. + */ + if (level > 0) + xfs_rtrefcountbt_move_ptrs(mp, ifp->if_broot, old_size, + new_size, new_numrecs); + broot = xfs_broot_realloc(ifp, new_size); + +out_broot: + ASSERT(xfs_rtrefcount_droot_space(broot) <= + xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork)); + return broot; +} + const struct xfs_btree_ops xfs_rtrefcountbt_ops = { .name = "rtrefcount", .type = XFS_BTREE_TYPE_INODE, @@ -264,6 +380,7 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = { .free_block = xfs_btree_free_metafile_block, .get_minrecs = xfs_rtrefcountbt_get_minrecs, .get_maxrecs = xfs_rtrefcountbt_get_maxrecs, + .get_dmaxrecs = xfs_rtrefcountbt_get_dmaxrecs, .init_key_from_rec = xfs_rtrefcountbt_init_key_from_rec, .init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec, .init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur, @@ -274,6 +391,7 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = { .keys_inorder = xfs_rtrefcountbt_keys_inorder, .recs_inorder = xfs_rtrefcountbt_recs_inorder, .keys_contiguous = xfs_rtrefcountbt_keys_contiguous, + .broot_realloc = xfs_rtrefcountbt_broot_realloc, }; /* Allocate a new rt refcount btree cursor. 
*/ @@ -457,3 +575,149 @@ xfs_rtrefcountbt_calc_reserves( return xfs_rtrefcountbt_max_size(mp, mp->m_sb.sb_rgextents); } + +/* + * Convert on-disk form of btree root to in-memory form. + */ +STATIC void +xfs_rtrefcountbt_from_disk( + struct xfs_inode *ip, + struct xfs_rtrefcount_root *dblock, + int dblocklen, + struct xfs_btree_block *rblock) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_refcount_key *fkp; + __be64 *fpp; + struct xfs_refcount_key *tkp; + __be64 *tpp; + struct xfs_refcount_rec *frp; + struct xfs_refcount_rec *trp; + unsigned int numrecs; + unsigned int maxrecs; + unsigned int rblocklen; + + rblocklen = xfs_rtrefcount_broot_space(mp, dblock); + + xfs_btree_init_block(mp, rblock, &xfs_rtrefcountbt_ops, 0, 0, + ip->i_ino); + + rblock->bb_level = dblock->bb_level; + rblock->bb_numrecs = dblock->bb_numrecs; + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrefcount_droot_key_addr(dblock, 1); + tkp = xfs_rtrefcount_key_addr(rblock, 1); + fpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); + tpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); + numrecs = be16_to_cpu(dblock->bb_numrecs); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrefcount_droot_rec_addr(dblock, 1); + trp = xfs_rtrefcount_rec_addr(rblock, 1); + numrecs = be16_to_cpu(dblock->bb_numrecs); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Load a realtime reference count btree root in from disk. */ +int +xfs_iformat_rtrefcount( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + struct xfs_btree_block *broot; + unsigned int numrecs; + unsigned int level; + int dsize; + + /* + * growfs must create the rtrefcount inodes before adding a realtime + * volume to the filesystem, so we cannot use the rtrefcount predicate + * here. + */ + if (!xfs_has_reflink(ip->i_mount)) + return -EFSCORRUPTED; + + dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); + numrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > mp->m_rtrefc_maxlevels || + xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize) + return -EFSCORRUPTED; + + broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), + xfs_rtrefcount_broot_space_calc(mp, level, numrecs)); + if (broot) + xfs_rtrefcountbt_from_disk(ip, dfp, dsize, broot); + return 0; +} + +/* + * Convert in-memory form of btree root to on-disk form. 
+ */ +void +xfs_rtrefcountbt_to_disk( + struct xfs_mount *mp, + struct xfs_btree_block *rblock, + int rblocklen, + struct xfs_rtrefcount_root *dblock, + int dblocklen) +{ + struct xfs_refcount_key *fkp; + __be64 *fpp; + struct xfs_refcount_key *tkp; + __be64 *tpp; + struct xfs_refcount_rec *frp; + struct xfs_refcount_rec *trp; + unsigned int maxrecs; + unsigned int numrecs; + + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTREFC_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); + + dblock->bb_level = rblock->bb_level; + dblock->bb_numrecs = rblock->bb_numrecs; + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrefcount_key_addr(rblock, 1); + tkp = xfs_rtrefcount_droot_key_addr(dblock, 1); + fpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); + tpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); + numrecs = be16_to_cpu(rblock->bb_numrecs); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrefcount_rec_addr(rblock, 1); + trp = xfs_rtrefcount_droot_rec_addr(dblock, 1); + numrecs = be16_to_cpu(rblock->bb_numrecs); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Flush a realtime reference count btree root out to disk. */ +void +xfs_iflush_rtrefcount( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + ASSERT(ifp->if_broot != NULL); + ASSERT(ifp->if_broot_bytes > 0); + ASSERT(xfs_rtrefcount_droot_space(ifp->if_broot) <= + xfs_inode_fork_size(ip, XFS_DATA_FORK)); + xfs_rtrefcountbt_to_disk(ip->i_mount, ifp->if_broot, + ifp->if_broot_bytes, dfp, + XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); +} diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h index 3cd44590c930..e558a10c4744 100644 --- a/fs/xfs/libxfs/xfs_rtrefcount_btree.h +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h @@ -25,6 +25,7 @@ void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur, unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, bool leaf); void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp); +unsigned int xfs_rtrefcountbt_droot_maxrecs(unsigned int blocklen, bool leaf); /* * Addresses of records, keys, and pointers within an incore rtrefcountbt block. @@ -71,4 +72,115 @@ xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp); unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp, unsigned long long len); +/* Addresses of key, pointers, and records within an ondisk rtrefcount block. 
*/ + +static inline struct xfs_refcount_rec * +xfs_rtrefcount_droot_rec_addr( + struct xfs_rtrefcount_root *block, + unsigned int index) +{ + return (struct xfs_refcount_rec *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_refcount_rec)); +} + +static inline struct xfs_refcount_key * +xfs_rtrefcount_droot_key_addr( + struct xfs_rtrefcount_root *block, + unsigned int index) +{ + return (struct xfs_refcount_key *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_refcount_key)); +} + +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_droot_ptr_addr( + struct xfs_rtrefcount_root *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrefcount_ptr_t *) + ((char *)(block + 1) + + maxrecs * sizeof(struct xfs_refcount_key) + + (index - 1) * sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Address of pointers within the incore btree root. + * + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_broot_ptr_addr( + struct xfs_mount *mp, + struct xfs_btree_block *bb, + unsigned int index, + unsigned int block_size) +{ + return xfs_rtrefcount_ptr_addr(bb, index, + xfs_rtrefcountbt_maxrecs(mp, block_size, false)); +} + +/* + * Compute the space required for the incore btree root containing the given + * number of records. + */ +static inline size_t +xfs_rtrefcount_broot_space_calc( + struct xfs_mount *mp, + unsigned int level, + unsigned int nrecs) +{ + size_t sz = XFS_RTREFCOUNT_BLOCK_LEN; + + if (level > 0) + return sz + nrecs * (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); + return sz + nrecs * sizeof(struct xfs_refcount_rec); +} + +/* + * Compute the space required for the incore btree root given the ondisk + * btree root block. + */ +static inline size_t +xfs_rtrefcount_broot_space(struct xfs_mount *mp, struct xfs_rtrefcount_root *bb) +{ + return xfs_rtrefcount_broot_space_calc(mp, be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +/* Compute the space required for the ondisk root block. */ +static inline size_t +xfs_rtrefcount_droot_space_calc( + unsigned int level, + unsigned int nrecs) +{ + size_t sz = sizeof(struct xfs_rtrefcount_root); + + if (level > 0) + return sz + nrecs * (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); + return sz + nrecs * sizeof(struct xfs_refcount_rec); +} + +/* + * Compute the space required for the ondisk root block given an incore root + * block. 
+ */ +static inline size_t +xfs_rtrefcount_droot_space(struct xfs_btree_block *bb) +{ + return xfs_rtrefcount_droot_space_calc(be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +int xfs_iformat_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip); +void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp, + struct xfs_btree_block *rblock, int rblocklen, + struct xfs_rtrefcount_root *dblock, int dblocklen); +void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip); + #endif /* __XFS_RTREFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 5de1d3563b76..f3bfb814378c 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -23,6 +23,7 @@ #include "xfs_icache.h" #include "xfs_bmap_btree.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" STATIC void xlog_recover_inode_ra_pass2( @@ -286,6 +287,9 @@ xlog_recover_inode_dbroot( case XFS_METAFILE_RTRMAP: xfs_rtrmapbt_to_disk(mp, src, len, dfork, dsize); return 0; + case XFS_METAFILE_RTREFCOUNT: + xfs_rtrefcountbt_to_disk(mp, src, len, dfork, dsize); + return 0; default: ASSERT(0); return -EFSCORRUPTED; -- cgit v1.2.3 From 4e87047539c7d695571173ad71471b2ac5b8e7e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Nov 2024 16:20:54 -0800 Subject: xfs: refactor xfs_reflink_find_shared Move lookup of the perag structure from the callers into the helpers, and return the offset into the extent of the shared region instead of the block number that needs post-processing. This prepares the callsites for the creation of an rt-specific variant in the next patch. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" [djwong: port to the middle of the rtreflink series for cleanliness] Signed-off-by: "Darrick J. Wong" --- fs/xfs/xfs_reflink.c | 110 +++++++++++++++++++++++---------------------------- fs/xfs/xfs_reflink.h | 2 +- 2 files changed, 50 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 02457e176c42..71b4743ffb77 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -120,38 +120,46 @@ */ /* - * Given an AG extent, find the lowest-numbered run of shared blocks - * within that range and return the range in fbno/flen. If - * find_end_of_shared is true, return the longest contiguous extent of - * shared blocks. If there are no shared extents, fbno and flen will - * be set to NULLAGBLOCK and 0, respectively. + * Given a file mapping for the data device, find the lowest-numbered run of + * shared blocks within that mapping and return it in shared_offset/shared_len. + * The offset is relative to the start of irec. + * + * If find_end_of_shared is true, return the longest contiguous extent of shared + * blocks. 
If there are no shared extents, shared_offset and shared_len will be + * set to 0; */ static int xfs_reflink_find_shared( - struct xfs_perag *pag, + struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agblock_t agbno, - xfs_extlen_t aglen, - xfs_agblock_t *fbno, - xfs_extlen_t *flen, + const struct xfs_bmbt_irec *irec, + xfs_extlen_t *shared_offset, + xfs_extlen_t *shared_len, bool find_end_of_shared) { struct xfs_buf *agbp; + struct xfs_perag *pag; struct xfs_btree_cur *cur; int error; + xfs_agblock_t orig_bno, found_bno; + + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); + orig_bno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) - return error; - - cur = xfs_refcountbt_init_cursor(pag_mount(pag), tp, agbp, pag); - - error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, - find_end_of_shared); + goto out; + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); + error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount, + &found_bno, shared_len, find_end_of_shared); xfs_btree_del_cursor(cur, error); - xfs_trans_brelse(tp, agbp); + + if (!error && *shared_len) + *shared_offset = found_bno - orig_bno; +out: + xfs_perag_put(pag); return error; } @@ -172,11 +180,7 @@ xfs_reflink_trim_around_shared( bool *shared) { struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - xfs_agblock_t agbno; - xfs_extlen_t aglen; - xfs_agblock_t fbno; - xfs_extlen_t flen; + xfs_extlen_t shared_offset, shared_len; int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ @@ -187,41 +191,33 @@ xfs_reflink_trim_around_shared( trace_xfs_reflink_trim_around_shared(ip, irec); - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); - agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); - aglen = irec->br_blockcount; - - error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen, - true); - xfs_perag_put(pag); + error = xfs_reflink_find_shared(mp, NULL, irec, &shared_offset, + &shared_len, true); if (error) return error; - *shared = false; - if (fbno == NULLAGBLOCK) { + if (!shared_len) { /* No shared blocks at all. */ - return 0; - } - - if (fbno == agbno) { + *shared = false; + } else if (!shared_offset) { /* - * The start of this extent is shared. Truncate the - * mapping at the end of the shared region so that a - * subsequent iteration starts at the start of the - * unshared region. + * The start of this mapping points to shared space. Truncate + * the mapping at the end of the shared region so that a + * subsequent iteration starts at the start of the unshared + * region. */ - irec->br_blockcount = flen; + irec->br_blockcount = shared_len; *shared = true; - return 0; + } else { + /* + * There's a shared region that doesn't start at the beginning + * of the mapping. Truncate the mapping at the start of the + * shared extent so that a subsequent iteration starts at the + * start of the shared region. + */ + irec->br_blockcount = shared_offset; + *shared = false; } - - /* - * There's a shared extent midway through this extent. - * Truncate the mapping at the start of the shared - * extent so that a subsequent iteration starts at the - * start of the shared region. 
- */ - irec->br_blockcount = fbno - agbno; return 0; } @@ -1552,27 +1548,19 @@ xfs_reflink_inode_has_shared_extents( *has_shared = false; found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got); while (found) { - struct xfs_perag *pag; - xfs_agblock_t agbno; - xfs_extlen_t aglen; - xfs_agblock_t rbno; - xfs_extlen_t rlen; + xfs_extlen_t shared_offset, shared_len; if (isnullstartblock(got.br_startblock) || got.br_state != XFS_EXT_NORM) goto next; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock)); - agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock); - aglen = got.br_blockcount; - error = xfs_reflink_find_shared(pag, tp, agbno, aglen, - &rbno, &rlen, false); - xfs_perag_put(pag); + error = xfs_reflink_find_shared(mp, tp, &got, &shared_offset, + &shared_len, false); if (error) return error; /* Is there still a shared block here? */ - if (rbno != NULLAGBLOCK) { + if (shared_len) { *has_shared = true; return 0; } diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 4a58e4533671..3bfd7ab9e114 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -25,7 +25,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip) return true; } -extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, +int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared); -- cgit v1.2.3 From e5a171729baf61b703069b11fa0d2955890e9b6b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:55 -0800 Subject: xfs: wire up realtime refcount btree cursors Wire up realtime refcount btree cursors wherever they're needed throughout the code base. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.h | 2 +- fs/xfs/libxfs/xfs_refcount.c | 100 ++++++++++++++++++++++++++++++++++++++++++- fs/xfs/libxfs/xfs_rtgroup.c | 9 ++++ fs/xfs/libxfs/xfs_rtgroup.h | 5 ++- fs/xfs/xfs_fsmap.c | 25 +++++------ fs/xfs/xfs_reflink.c | 66 ++++++++++++++++++++++++++-- 6 files changed, 187 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index dbc047b2fb2c..355b304696e6 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -297,7 +297,7 @@ struct xfs_btree_cur struct { unsigned int nr_ops; /* # record updates */ unsigned int shape_changes; /* # of extent splits */ - } bc_refc; /* refcountbt */ + } bc_refc; /* refcountbt/rtrefcountbt */ }; /* Must be at the end of the struct! */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 8007d1585625..11bff098db2d 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -27,6 +27,7 @@ #include "xfs_refcount_item.h" #include "xfs_rtgroup.h" #include "xfs_rtalloc.h" +#include "xfs_rtrefcount_btree.h" struct kmem_cache *xfs_refcount_intent_cache; @@ -1462,6 +1463,32 @@ xfs_refcount_finish_one( return error; } +/* + * Set up a continuation a deferred rtrefcount operation by updating the + * intent. Checks to make sure we're not going to run off the end of the + * rtgroup. 
+ */ +static inline int +xfs_rtrefcount_continue_op( + struct xfs_btree_cur *cur, + struct xfs_refcount_intent *ri, + xfs_agblock_t new_agbno) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + + if (XFS_IS_CORRUPT(mp, !xfs_verify_rgbext(rtg, new_agbno, + ri->ri_blockcount))) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + + ri->ri_startblock = xfs_rgbno_to_rtb(rtg, new_agbno); + + ASSERT(xfs_verify_rtbext(mp, ri->ri_startblock, ri->ri_blockcount)); + return 0; +} + /* * Process one of the deferred realtime refcount operations. We pass back the * btree cursor to maintain our lock on the btree between calls. @@ -1472,8 +1499,77 @@ xfs_rtrefcount_finish_one( struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur) { - ASSERT(0); - return -EFSCORRUPTED; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + struct xfs_btree_cur *rcur = *pcur; + int error = 0; + xfs_rgblock_t bno; + unsigned long nr_ops = 0; + int shape_changes = 0; + + bno = xfs_rtb_to_rgbno(mp, ri->ri_startblock); + + trace_xfs_refcount_deferred(mp, ri); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + return -EIO; + + /* + * If we haven't gotten a cursor or the cursor AG doesn't match + * the startblock, get one now. + */ + if (rcur != NULL && rcur->bc_group != ri->ri_group) { + nr_ops = rcur->bc_refc.nr_ops; + shape_changes = rcur->bc_refc.shape_changes; + xfs_btree_del_cursor(rcur, 0); + rcur = NULL; + *pcur = NULL; + } + if (rcur == NULL) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_REFCOUNT); + *pcur = rcur = xfs_rtrefcountbt_init_cursor(tp, rtg); + + rcur->bc_refc.nr_ops = nr_ops; + rcur->bc_refc.shape_changes = shape_changes; + } + + switch (ri->ri_type) { + case XFS_REFCOUNT_INCREASE: + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_INCREASE); + if (error) + return error; + if (ri->ri_blockcount > 0) + error = xfs_rtrefcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_DECREASE: + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_DECREASE); + if (error) + return error; + if (ri->ri_blockcount > 0) + error = xfs_rtrefcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_ALLOC_COW: + error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); + if (error) + return error; + ri->ri_blockcount = 0; + break; + case XFS_REFCOUNT_FREE_COW: + error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); + if (error) + return error; + ri->ri_blockcount = 0; + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + if (!error && ri->ri_blockcount > 0) + trace_xfs_refcount_finish_one_leftover(mp, ri); + return error; } /* diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index 6aebe9f48490..d5ecc2f6c5c2 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -206,6 +206,9 @@ xfs_rtgroup_lock( if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_EXCL); + + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_ilock(rtg_refcount(rtg), XFS_ILOCK_EXCL); } /* Unlock metadata inodes associated with this rt group. 
*/ @@ -218,6 +221,9 @@ xfs_rtgroup_unlock( ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || !(rtglock_flags & XFS_RTGLOCK_BITMAP)); + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_iunlock(rtg_refcount(rtg), XFS_ILOCK_EXCL); + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL); @@ -249,6 +255,9 @@ xfs_rtgroup_trans_join( if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) xfs_trans_ijoin(tp, rtg_rmap(rtg), XFS_ILOCK_EXCL); + + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_trans_ijoin(tp, rtg_refcount(rtg), XFS_ILOCK_EXCL); } /* Retrieve rt group geometry. */ diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 385ea8e2f28b..de4eeb381fc9 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -273,10 +273,13 @@ int xfs_update_last_rtgroup_size(struct xfs_mount *mp, #define XFS_RTGLOCK_BITMAP_SHARED (1U << 1) /* Lock the rt rmap inode in exclusive mode */ #define XFS_RTGLOCK_RMAP (1U << 2) +/* Lock the rt refcount inode in exclusive mode */ +#define XFS_RTGLOCK_REFCOUNT (1U << 3) #define XFS_RTGLOCK_ALL_FLAGS (XFS_RTGLOCK_BITMAP | \ XFS_RTGLOCK_BITMAP_SHARED | \ - XFS_RTGLOCK_RMAP) + XFS_RTGLOCK_RMAP | \ + XFS_RTGLOCK_REFCOUNT) void xfs_rtgroup_lock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 3e3ef16f65a3..1dbd2d75f7ae 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -27,6 +27,7 @@ #include "xfs_ag.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" /* Convert an xfs_fsmap to an fsmap. */ static void @@ -212,21 +213,20 @@ xfs_getfsmap_is_shared( struct xfs_mount *mp = tp->t_mountp; struct xfs_btree_cur *cur; xfs_agblock_t fbno; - xfs_extlen_t flen; + xfs_extlen_t flen = 0; int error; *stat = false; - if (!xfs_has_reflink(mp)) - return 0; - /* rt files will have no perag structure */ - if (!info->group) + if (!xfs_has_reflink(mp) || !info->group) return 0; - /* Are there any shared blocks here? */ - flen = 0; - cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, - to_perag(info->group)); + if (info->group->xg_type == XG_TYPE_RTG) + cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(info->group)); + else + cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, + to_perag(info->group)); + /* Are there any shared blocks here? */ error = xfs_refcount_find_shared(cur, frec->rec_key, XFS_BB_TO_FSBT(mp, frec->len_daddr), &fbno, &flen, false); @@ -863,7 +863,7 @@ xfs_getfsmap_rtdev_rmapbt_query( struct xfs_rtgroup *rtg = to_rtg(info->group); /* Query the rtrmapbt */ - xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT); *curpp = xfs_rtrmapbt_init_cursor(tp, rtg); return xfs_rmap_query_range(*curpp, &info->low, &info->high, xfs_getfsmap_rtdev_rmapbt_helper, info); @@ -950,7 +950,8 @@ xfs_getfsmap_rtdev_rmapbt( if (bt_cur) { xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group), - XFS_RTGLOCK_RMAP); + XFS_RTGLOCK_RMAP | + XFS_RTGLOCK_REFCOUNT); xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR); bt_cur = NULL; } @@ -988,7 +989,7 @@ xfs_getfsmap_rtdev_rmapbt( if (bt_cur) { xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group), - XFS_RTGLOCK_RMAP); + XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT); xfs_btree_del_cursor(bt_cur, error < 0 ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR); } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 71b4743ffb77..66ce29101462 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -30,6 +30,9 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_health.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_rtgroup.h" /* * Copy on Write of Shared Blocks @@ -163,6 +166,53 @@ out: return error; } +/* + * Given a file mapping for the rt device, find the lowest-numbered run of + * shared blocks within that mapping and return it in shared_offset/shared_len. + * The offset is relative to the start of irec. + * + * If find_end_of_shared is true, return the longest contiguous extent of shared + * blocks. If there are no shared extents, shared_offset and shared_len will be + * set to 0; + */ +static int +xfs_reflink_find_rtshared( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_bmbt_irec *irec, + xfs_extlen_t *shared_offset, + xfs_extlen_t *shared_len, + bool find_end_of_shared) +{ + struct xfs_rtgroup *rtg; + struct xfs_btree_cur *cur; + xfs_rgblock_t orig_bno; + xfs_agblock_t found_bno; + int error; + + BUILD_BUG_ON(NULLRGBLOCK != NULLAGBLOCK); + + /* + * Note: this uses the not quite correct xfs_agblock_t type because + * xfs_refcount_find_shared is shared between the RT and data device + * refcount code. + */ + orig_bno = xfs_rtb_to_rgbno(mp, irec->br_startblock); + rtg = xfs_rtgroup_get(mp, xfs_rtb_to_rgno(mp, irec->br_startblock)); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT); + cur = xfs_rtrefcountbt_init_cursor(tp, rtg); + error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount, + &found_bno, shared_len, find_end_of_shared); + xfs_btree_del_cursor(cur, error); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_REFCOUNT); + xfs_rtgroup_put(rtg); + + if (!error && *shared_len) + *shared_offset = found_bno - orig_bno; + return error; +} + /* * Trim the mapping to the next block where there's a change in the * shared/unshared status. More specifically, this means that we @@ -191,8 +241,12 @@ xfs_reflink_trim_around_shared( trace_xfs_reflink_trim_around_shared(ip, irec); - error = xfs_reflink_find_shared(mp, NULL, irec, &shared_offset, - &shared_len, true); + if (XFS_IS_REALTIME_INODE(ip)) + error = xfs_reflink_find_rtshared(mp, NULL, irec, + &shared_offset, &shared_len, true); + else + error = xfs_reflink_find_shared(mp, NULL, irec, + &shared_offset, &shared_len, true); if (error) return error; @@ -1554,8 +1608,12 @@ xfs_reflink_inode_has_shared_extents( got.br_state != XFS_EXT_NORM) goto next; - error = xfs_reflink_find_shared(mp, tp, &got, &shared_offset, - &shared_len, false); + if (XFS_IS_REALTIME_INODE(ip)) + error = xfs_reflink_find_rtshared(mp, tp, &got, + &shared_offset, &shared_len, false); + else + error = xfs_reflink_find_shared(mp, tp, &got, + &shared_offset, &shared_len, false); if (error) return error; -- cgit v1.2.3 From 4ee3113aaf3f6a3c24fcf952d8489363f56ab375 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:56 -0800 Subject: xfs: create routine to allocate and initialize a realtime refcount btree inode Create a library routine to allocate and initialize an empty realtime refcountbt inode. We'll use this for growfs, mkfs, and repair. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtgroup.c | 2 ++ fs/xfs/libxfs/xfs_rtrefcount_btree.c | 28 ++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_rtrefcount_btree.h | 3 +++ 3 files changed, 33 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index d5ecc2f6c5c2..eab655a4a9ef 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -34,6 +34,7 @@ #include "xfs_metafile.h" #include "xfs_metadir.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" /* Find the first usable fsblock in this rtgroup. */ static inline uint32_t @@ -382,6 +383,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, /* same comment about growfs and rmap inodes applies here */ .enabled = xfs_has_reflink, + .create = xfs_rtrefcountbt_create, }, }; diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c index 6a5bc7ea42fb..151fb1ef7db1 100644 --- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c @@ -721,3 +721,31 @@ xfs_iflush_rtrefcount( ifp->if_broot_bytes, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); } + +/* + * Create a realtime refcount btree inode. + */ +int +xfs_rtrefcountbt_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *broot; + + ifp->if_format = XFS_DINODE_FMT_META_BTREE; + ASSERT(ifp->if_broot_bytes == 0); + ASSERT(ifp->if_bytes == 0); + + /* Initialize the empty incore btree root. */ + broot = xfs_broot_realloc(ifp, + xfs_rtrefcount_broot_space_calc(mp, 0, 0)); + if (broot) + xfs_btree_init_block(mp, broot, &xfs_rtrefcountbt_ops, 0, 0, + ip->i_ino); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h index e558a10c4744..a99b7a8aec86 100644 --- a/fs/xfs/libxfs/xfs_rtrefcount_btree.h +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h @@ -183,4 +183,7 @@ void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp, struct xfs_rtrefcount_root *dblock, int dblocklen); void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip); +int xfs_rtrefcountbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); + #endif /* __XFS_RTREFCOUNT_BTREE_H__ */ -- cgit v1.2.3 From 0bada82331238bd366aaa0566d125c6338b42590 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:57 -0800 Subject: xfs: update rmap to allow cow staging extents in the rt rmap Don't error out on CoW staging extent records when realtime reflink is enabled. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index f8415fd96cc2..3cdf50563fec 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -285,6 +285,13 @@ xfs_rtrmap_check_meta_irec( if (irec->rm_blockcount != mp->m_sb.sb_rextsize) return __this_address; return NULL; + case XFS_RMAP_OWN_COW: + if (!xfs_has_rtreflink(mp)) + return __this_address; + if (!xfs_verify_rgbext(rtg, irec->rm_startblock, + irec->rm_blockcount)) + return __this_address; + return NULL; default: return __this_address; } -- cgit v1.2.3 From c2694ff678c9b667ab4cb7c0b45d45309c4dd64b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:57 -0800 Subject: xfs: compute rtrmap btree max levels when reflink enabled Compute the maximum possible height of the realtime rmap btree when reflink is enabled. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtrmap_btree.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index b90901e39e92..e4ec36943cb7 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -718,6 +718,7 @@ xfs_rtrmapbt_maxrecs( unsigned int xfs_rtrmapbt_maxlevels_ondisk(void) { + unsigned long long max_dblocks; unsigned int minrecs[2]; unsigned int blocklen; @@ -726,8 +727,20 @@ xfs_rtrmapbt_maxlevels_ondisk(void) minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2; minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2; - /* We need at most one record for every block in an rt group. */ - return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS); + /* + * Compute the asymptotic maxlevels for an rtrmapbt on any rtreflink fs. + * + * On a reflink filesystem, each block in an rtgroup can have up to + * 2^32 (per the refcount record format) owners, which means that + * theoretically we could face up to 2^64 rmap records. However, we're + * likely to run out of blocks in the data device long before that + * happens, which means that we must compute the max height based on + * what the btree will look like if it consumes almost all the blocks + * in the data device due to maximal sharing factor. + */ + max_dblocks = -1U; /* max ag count */ + max_dblocks *= XFS_MAX_CRC_AG_BLOCKS; + return xfs_btree_space_to_height(minrecs, max_dblocks); } int __init @@ -766,9 +779,20 @@ xfs_rtrmapbt_compute_maxlevels( * maximum height is constrained by the size of the data device and * the height required to store one rmap record for each block in an * rt group. + * + * On a reflink filesystem, each rt block can have up to 2^32 (per the + * refcount record format) owners, which means that theoretically we + * could face up to 2^64 rmap records. This makes the computation of + * maxlevels based on record count meaningless, so we only consider the + * size of the data device. */ d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr, mp->m_sb.sb_dblocks); + if (xfs_has_rtreflink(mp)) { + mp->m_rtrmap_maxlevels = d_maxlevels + 1; + return; + } + r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr, mp->m_groups[XG_TYPE_RTG].blocks); -- cgit v1.2.3 From 3639c63d4643f1949e4c71baa8c8d02e63d184b1 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Wed, 20 Nov 2024 16:20:58 -0800 Subject: xfs: refactor reflink quota updates Hoist all quota updates for reflink into a helper function, since things are about to become more complicated. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_reflink.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 66ce29101462..29574b015fdd 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -739,6 +739,35 @@ out: return error; } +#ifdef CONFIG_XFS_QUOTA +/* + * Update quota accounting for a remapping operation. When we're remapping + * something from the CoW fork to the data fork, we must update the quota + * accounting for delayed allocations. For remapping from the data fork to the + * data fork, use regular block accounting. + */ +static inline void +xfs_reflink_update_quota( + struct xfs_trans *tp, + struct xfs_inode *ip, + bool is_cow, + int64_t blocks) +{ + unsigned int qflag; + + if (XFS_IS_REALTIME_INODE(ip)) { + qflag = is_cow ? XFS_TRANS_DQ_DELRTBCOUNT : + XFS_TRANS_DQ_RTBCOUNT; + } else { + qflag = is_cow ? XFS_TRANS_DQ_DELBCOUNT : + XFS_TRANS_DQ_BCOUNT; + } + xfs_trans_mod_dquot_byino(tp, ip, qflag, blocks); +} +#else +# define xfs_reflink_update_quota(tp, ip, is_cow, blocks) ((void)0) +#endif + /* * Remap part of the CoW fork into the data fork. * @@ -833,8 +862,7 @@ xfs_reflink_end_cow_extent( */ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); xfs_refcount_decrease_extent(tp, isrt, &data); - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, - -data.br_blockcount); + xfs_reflink_update_quota(tp, ip, false, -data.br_blockcount); } else if (data.br_startblock == DELAYSTARTBLOCK) { int done; @@ -859,8 +887,7 @@ xfs_reflink_end_cow_extent( xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del); /* Charge this new data fork mapping to the on-disk quota. */ - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, - (long)del.br_blockcount); + xfs_reflink_update_quota(tp, ip, true, del.br_blockcount); /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); @@ -1347,7 +1374,7 @@ xfs_reflink_remap_extent( qdelta += dmap->br_blockcount; } - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta); + xfs_reflink_update_quota(tp, ip, false, qdelta); /* Update dest isize if needed. */ newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount); -- cgit v1.2.3 From 26e97d9b4b7601bc29535b9e9edc8f8aa5dae8d4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:20:59 -0800 Subject: xfs: enable CoW for realtime data Update our write paths to support copy on write on the rt volume. This works in more or less the same way as it does on the data device, with the major exception that we never do delalloc on the rt volume. Because we consider unwritten CoW fork staging extents to be incore quota reservation, we update xfs_quota_reserve_blkres to support this case. Though xfs doesn't allow rt and quota together, the change is trivial and we shouldn't leave a logic bomb here. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_reflink.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 29574b015fdd..24f545687b87 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -439,20 +439,26 @@ xfs_reflink_fill_cow_hole( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; xfs_filblks_t resaligned; - xfs_extlen_t resblks; + unsigned int dblocks = 0, rblocks = 0; int nimaps; int error; bool found; resaligned = xfs_aligned_fsb_count(imap->br_startoff, imap->br_blockcount, xfs_get_cowextsz_hint(ip)); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + if (XFS_IS_REALTIME_INODE(ip)) { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resaligned; + } else { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + rblocks = 0; + } xfs_iunlock(ip, *lockmode); *lockmode = 0; - error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, - false, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, + rblocks, false, &tp); if (error) return error; @@ -1212,7 +1218,7 @@ xfs_reflink_remap_extent( struct xfs_trans *tp; xfs_off_t newlen; int64_t qdelta = 0; - unsigned int resblks; + unsigned int dblocks, rblocks, resblks; bool quota_reserved = true; bool smap_real; bool dmap_written = xfs_bmap_is_written_extent(dmap); @@ -1243,8 +1249,15 @@ xfs_reflink_remap_extent( * we're remapping. */ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + if (XFS_IS_REALTIME_INODE(ip)) { + dblocks = resblks; + rblocks = dmap->br_blockcount; + } else { + dblocks = resblks + dmap->br_blockcount; + rblocks = 0; + } error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, - resblks + dmap->br_blockcount, 0, false, &tp); + dblocks, rblocks, false, &tp); if (error == -EDQUOT || error == -ENOSPC) { quota_reserved = false; error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, @@ -1324,8 +1337,15 @@ xfs_reflink_remap_extent( * done. */ if (!quota_reserved && !smap_real && dmap_written) { - error = xfs_trans_reserve_quota_nblks(tp, ip, - dmap->br_blockcount, 0, false); + if (XFS_IS_REALTIME_INODE(ip)) { + dblocks = 0; + rblocks = dmap->br_blockcount; + } else { + dblocks = dmap->br_blockcount; + rblocks = 0; + } + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, + false); if (error) goto out_cancel; } -- cgit v1.2.3 From 5519251da0b0ae1224d7e0ce8dee3612b571c01c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:00 -0800 Subject: xfs: enable sharing of realtime file blocks Update the remapping routines to be able to handle realtime files. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_reflink.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 24f545687b87..78b47b2ac124 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -33,6 +33,7 @@ #include "xfs_rtrefcount_btree.h" #include "xfs_rtalloc.h" #include "xfs_rtgroup.h" +#include "xfs_metafile.h" /* * Copy on Write of Shared Blocks @@ -1187,14 +1188,28 @@ out_error: static int xfs_reflink_ag_has_free_space( struct xfs_mount *mp, - xfs_agnumber_t agno) + struct xfs_inode *ip, + xfs_fsblock_t fsb) { struct xfs_perag *pag; + xfs_agnumber_t agno; int error = 0; if (!xfs_has_rmapbt(mp)) return 0; + if (XFS_IS_REALTIME_INODE(ip)) { + struct xfs_rtgroup *rtg; + xfs_rgnumber_t rgno; + + rgno = xfs_rtb_to_rgno(mp, fsb); + rtg = xfs_rtgroup_get(mp, rgno); + if (xfs_metafile_resv_critical(rtg_rmap(rtg))) + error = -ENOSPC; + xfs_rtgroup_put(rtg); + return error; + } + agno = XFS_FSB_TO_AGNO(mp, fsb); pag = xfs_perag_get(mp, agno); if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) || xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) @@ -1308,8 +1323,8 @@ xfs_reflink_remap_extent( /* No reflinking if the AG of the dest mapping is low on space. */ if (dmap_written) { - error = xfs_reflink_ag_has_free_space(mp, - XFS_FSB_TO_AGNO(mp, dmap->br_startblock)); + error = xfs_reflink_ag_has_free_space(mp, ip, + dmap->br_startblock); if (error) goto out_cancel; } @@ -1568,8 +1583,8 @@ xfs_reflink_remap_prep( /* Check file eligibility and prepare for block sharing. */ ret = -EINVAL; - /* Don't reflink realtime inodes */ - if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) + /* Can't reflink between data and rt volumes */ + if (XFS_IS_REALTIME_INODE(src) != XFS_IS_REALTIME_INODE(dest)) goto out_unlock; /* Don't share DAX file data with non-DAX file. */ -- cgit v1.2.3 From c3d3605f9661a2451c437a037d338dc79fb78f37 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:00 -0800 Subject: xfs: allow inodes to have the realtime and reflink flags Now that we can share blocks between realtime files, allow this combination. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_buf.c | 3 ++- fs/xfs/scrub/inode.c | 5 +++-- fs/xfs/scrub/inode_repair.c | 6 ------ fs/xfs/xfs_ioctl.c | 4 ---- 4 files changed, 5 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 65eec8f60376..4273d096fb0a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -748,7 +748,8 @@ xfs_dinode_verify( return __this_address; /* don't let reflink and realtime mix */ - if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME) && + !xfs_has_rtreflink(mp)) return __this_address; /* COW extent size hint validation */ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 8e702121dc86..c7bbc3f78e90 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -360,8 +360,9 @@ xchk_inode_flags2( if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode)) goto bad; - /* realtime and reflink make no sense, currently */ - if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK)) + /* realtime and reflink don't always go together */ + if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK) && + !xfs_has_rtreflink(mp)) goto bad; /* no bigtime iflag without the bigtime feature */ diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index d7e3f033b160..938a18721f36 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -564,8 +564,6 @@ xrep_dinode_flags( flags2 |= XFS_DIFLAG2_REFLINK; else flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); - if (flags & XFS_DIFLAG_REALTIME) - flags2 &= ~XFS_DIFLAG2_REFLINK; if (!xfs_has_bigtime(mp)) flags2 &= ~XFS_DIFLAG2_BIGTIME; if (!xfs_has_large_extent_counts(mp)) @@ -1790,10 +1788,6 @@ xrep_inode_flags( /* DAX only applies to files and dirs. */ if (!(S_ISREG(mode) || S_ISDIR(mode))) sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; - - /* No reflink files on the realtime device. */ - if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) - sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0789c18aaa18..4caf29cc59b9 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -541,10 +541,6 @@ xfs_ioctl_setattr_xflags( if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || xfs_extlen_to_rtxmod(mp, ip->i_extsize)) return -EINVAL; - - /* Clear reflink if we are actually able to set the rt flag. */ - if (xfs_is_reflink_inode(ip)) - ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; } /* diflags2 only valid for v3 inodes. */ -- cgit v1.2.3 From 51e232674975ff138d0e892272fdde9bc444c572 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:01 -0800 Subject: xfs: recover CoW leftovers in the realtime volume Scan the realtime refcount tree at mount time to get rid of leftover CoW staging extents. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 47 +++++++++++++++++++++++++++++++------------- fs/xfs/libxfs/xfs_refcount.h | 3 +-- fs/xfs/xfs_reflink.c | 15 +++++++++++--- 3 files changed, 46 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 11bff098db2d..cebe83f7842a 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -2054,12 +2054,13 @@ xfs_refcount_recover_extent( /* Find and remove leftover CoW reservations. 
*/ int xfs_refcount_recover_cow_leftovers( - struct xfs_mount *mp, - struct xfs_perag *pag) + struct xfs_group *xg) { + struct xfs_mount *mp = xg->xg_mount; + bool isrt = xg->xg_type == XG_TYPE_RTG; struct xfs_trans *tp; struct xfs_btree_cur *cur; - struct xfs_buf *agbp; + struct xfs_buf *agbp = NULL; struct xfs_refcount_recovery *rr, *n; struct list_head debris; union xfs_btree_irec low = { @@ -2072,10 +2073,19 @@ xfs_refcount_recover_cow_leftovers( xfs_fsblock_t fsb; int error; - /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */ + /* reflink filesystems must not have groups larger than 2^31-1 blocks */ + BUILD_BUG_ON(XFS_MAX_RGBLOCKS >= XFS_REFC_COWFLAG); BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG); - if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS) - return -EOPNOTSUPP; + + if (isrt) { + if (!xfs_has_rtgroups(mp)) + return 0; + if (xfs_group_max_blocks(xg) >= XFS_MAX_RGBLOCKS) + return -EOPNOTSUPP; + } else { + if (xfs_group_max_blocks(xg) > XFS_MAX_CRC_AG_BLOCKS) + return -EOPNOTSUPP; + } INIT_LIST_HEAD(&debris); @@ -2093,16 +2103,24 @@ xfs_refcount_recover_cow_leftovers( if (error) return error; - error = xfs_alloc_read_agf(pag, tp, 0, &agbp); - if (error) - goto out_trans; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); + if (isrt) { + xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); + cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(xg)); + } else { + error = xfs_alloc_read_agf(to_perag(xg), tp, 0, &agbp); + if (error) + goto out_trans; + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, to_perag(xg)); + } /* Find all the leftover CoW staging extents. */ error = xfs_btree_query_range(cur, &low, &high, xfs_refcount_recover_extent, &debris); xfs_btree_del_cursor(cur, error); - xfs_trans_brelse(tp, agbp); + if (agbp) + xfs_trans_brelse(tp, agbp); + else + xfs_rtgroup_unlock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); xfs_trans_cancel(tp); if (error) goto out_free; @@ -2115,14 +2133,15 @@ xfs_refcount_recover_cow_leftovers( goto out_free; /* Free the orphan record */ - fsb = xfs_agbno_to_fsb(pag, rr->rr_rrec.rc_startblock); - xfs_refcount_free_cow_extent(tp, false, fsb, + fsb = xfs_gbno_to_fsb(xg, rr->rr_rrec.rc_startblock); + xfs_refcount_free_cow_extent(tp, isrt, fsb, rr->rr_rrec.rc_blockcount); /* Free the block. */ error = xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL, - XFS_AG_RESV_NONE, 0); + XFS_AG_RESV_NONE, + isrt ? 
XFS_FREE_EXTENT_REALTIME : 0); if (error) goto out_trans; diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index be11df25abcc..f2e299a716a4 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -94,8 +94,7 @@ void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsb, xfs_extlen_t len); void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsb, xfs_extlen_t len); -extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, - struct xfs_perag *pag); +int xfs_refcount_recover_cow_leftovers(struct xfs_group *xg); /* * While we're adjusting the refcounts records of an extent, we have diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 78b47b2ac124..d9b33e22c176 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -983,20 +983,29 @@ xfs_reflink_recover_cow( struct xfs_mount *mp) { struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; int error = 0; if (!xfs_has_reflink(mp)) return 0; while ((pag = xfs_perag_next(mp, pag))) { - error = xfs_refcount_recover_cow_leftovers(mp, pag); + error = xfs_refcount_recover_cow_leftovers(pag_group(pag)); if (error) { xfs_perag_rele(pag); - break; + return error; } } - return error; + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + error = xfs_refcount_recover_cow_leftovers(rtg_group(rtg)); + if (error) { + xfs_rtgroup_rele(rtg); + return error; + } + } + + return 0; } /* -- cgit v1.2.3 From 6853d23badd0f1852d3b711128924e2456d27634 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:02 -0800 Subject: xfs: fix xfs_get_extsz_hint behavior with realtime alwayscow files Currently, we (ab)use xfs_get_extsz_hint so that it always returns a nonzero value for realtime files. This apparently was done to disable delayed allocation for realtime files. However, once we enable realtime reflink, we can also turn on the alwayscow flag to force CoW writes to realtime files. In this case, the logic will incorrectly send the write through the delalloc write path. Fix this by adjusting the logic slightly. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index d63713630236..ae3d33d60761 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6500,9 +6500,8 @@ xfs_get_extsz_hint( * No point in aligning allocations if we need to COW to actually * write to them. */ - if (xfs_is_always_cow_inode(ip)) - return 0; - if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) + if (!xfs_is_always_cow_inode(ip) && + (ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) return ip->i_extsize; if (XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1) -- cgit v1.2.3 From 4de1a7ba4171db681691bd80506d0cf43c5cb46a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:03 -0800 Subject: xfs: apply rt extent alignment constraints to CoW extsize hint The copy-on-write extent size hint is subject to the same alignment constraints as the regular extent size hint. Since we're in the process of adding reflink (and therefore CoW) to the realtime device, we must apply the same scattered rextsize alignment validation strategies to both hints to deal with the possibility of rextsize changing. 
Therefore, fix the inode validator to perform rextsize alignment checks on regular realtime files, and to remove misaligned directory hints. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_buf.c | 25 ++++++++++++++++++++----- fs/xfs/xfs_inode_item.c | 14 ++++++++++++++ fs/xfs/xfs_ioctl.c | 17 +++++++++++++++-- 3 files changed, 49 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 4273d096fb0a..f24fa628fecf 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -910,11 +910,29 @@ xfs_inode_validate_cowextsize( bool rt_flag; bool hint_flag; uint32_t cowextsize_bytes; + uint32_t blocksize_bytes; rt_flag = (flags & XFS_DIFLAG_REALTIME); hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); + /* + * Similar to extent size hints, a directory can be configured to + * propagate realtime status and a CoW extent size hint to newly + * created files even if there is no realtime device, and the hints on + * disk can become misaligned if the sysadmin changes the rt extent + * size while adding the realtime device. + * + * Therefore, we can only enforce the rextsize alignment check against + * regular realtime files, and rely on callers to decide when alignment + * checks are appropriate, and fix things up as needed. + */ + + if (rt_flag) + blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); + else + blocksize_bytes = mp->m_sb.sb_blocksize; + if (hint_flag && !xfs_has_reflink(mp)) return __this_address; @@ -928,16 +946,13 @@ xfs_inode_validate_cowextsize( if (mode && !hint_flag && cowextsize != 0) return __this_address; - if (hint_flag && rt_flag) - return __this_address; - - if (cowextsize_bytes % mp->m_sb.sb_blocksize) + if (cowextsize_bytes % blocksize_bytes) return __this_address; if (cowextsize > XFS_MAX_BMBT_EXTLEN) return __this_address; - if (cowextsize > mp->m_sb.sb_agblocks / 2) + if (!rt_flag && cowextsize > mp->m_sb.sb_agblocks / 2) return __this_address; return NULL; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index a174f64b8bb2..70283c6419fd 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -157,6 +157,20 @@ xfs_inode_item_precommit( if (flags & XFS_ILOG_IVERSION) flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + /* + * Inode verifiers do not check that the CoW extent size hint is an + * integer multiple of the rt extent size on a directory with both + * rtinherit and cowextsize flags set. If we're logging a directory + * that is misconfigured in this way, clear the hint. + */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + flags |= XFS_ILOG_CORE; + } + if (!iip->ili_item.li_buf) { struct xfs_buf *bp; int error; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 4caf29cc59b9..726282e74d54 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -469,8 +469,21 @@ xfs_fill_fsxattr( } } - if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) - fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize); + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { + /* + * Don't let a misaligned CoW extent size hint on a directory + * escape to userspace if it won't pass the setattr checks + * later. 
+ */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + ip->i_cowextsize % mp->m_sb.sb_rextsize > 0) { + fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; + fa->fsx_cowextsize = 0; + } else { + fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize); + } + } + fa->fsx_projid = ip->i_projid; if (ifp && !xfs_need_iread_extents(ifp)) fa->fsx_nextents = xfs_iext_count(ifp); -- cgit v1.2.3 From 8e84e8052bc283ebb37f929eb9fb97483ea7385e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:03 -0800 Subject: xfs: enable extent size hints for CoW operations Wire up the copy-on-write extent size hint for realtime files, and connect it to the rt allocator so that we avoid fragmentation on rt filesystems. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 8 +++++++- fs/xfs/xfs_rtalloc.c | 5 ++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ae3d33d60761..40ad22fb808b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6524,7 +6524,13 @@ xfs_get_cowextsz_hint( a = 0; if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) a = ip->i_cowextsize; - b = xfs_get_extsz_hint(ip); + if (XFS_IS_REALTIME_INODE(ip)) { + b = 0; + if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) + b = ip->i_extsize; + } else { + b = xfs_get_extsz_hint(ip); + } a = max(a, b); if (a == 0) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 294aa0739be3..f5a3d5f8c948 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -2021,7 +2021,10 @@ xfs_rtallocate_align( if (*noalign) { align = mp->m_sb.sb_rextsize; } else { - align = xfs_get_extsz_hint(ap->ip); + if (ap->flags & XFS_BMAPI_COWFORK) + align = xfs_get_cowextsz_hint(ap->ip); + else + align = xfs_get_extsz_hint(ap->ip); if (!align) align = 1; if (align == mp->m_sb.sb_rextsize) -- cgit v1.2.3 From 88a70768df138b97b36bf1571fcde92907812a3d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:04 -0800 Subject: xfs: check that the rtrefcount maxlevels doesn't increase when growing fs The size of filesystem transaction reservations depends on the maximum height (maxlevels) of the realtime btrees. Since we don't want a grow operation to increase the reservation size enough that we'll fail the minimum log size checks on the next mount, constrain growfs operations if they would cause an increase in the rt refcount btree maxlevels. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_fsops.c | 2 ++ fs/xfs/xfs_rtalloc.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 9df5a09c0acd..455298503d01 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -23,6 +23,7 @@ #include "xfs_trace.h" #include "xfs_rtalloc.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" /* * Write new AG headers to disk. Non-transactional, but need to be @@ -231,6 +232,7 @@ xfs_growfs_data_private( /* Compute new maxlevels for rt btrees. 
*/ xfs_rtrmapbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); } return error; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index f5a3d5f8c948..a5de5405800a 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -995,6 +995,7 @@ xfs_growfs_rt_bmblock( */ mp->m_features |= XFS_FEAT_REALTIME; xfs_rtrmapbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); kfree(nmp); return 0; @@ -1178,6 +1179,7 @@ xfs_growfs_check_rtgeom( nmp->m_sb.sb_dblocks = dblocks; xfs_rtrmapbt_compute_maxlevels(nmp); + xfs_rtrefcountbt_compute_maxlevels(nmp); xfs_trans_resv_calc(nmp, M_RES(nmp)); /* -- cgit v1.2.3 From 026c8ed8d4580228949f177445c605d475880c93 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:05 -0800 Subject: xfs: report realtime refcount btree corruption errors to the health system Whenever we encounter corrupt realtime refcount btree blocks, we should report that to the health monitoring system for later reporting. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_fs.h | 1 + fs/xfs/libxfs/xfs_health.h | 4 +++- fs/xfs/libxfs/xfs_rtgroup.c | 1 + fs/xfs/libxfs/xfs_rtrefcount_btree.c | 10 ++++++++-- fs/xfs/xfs_health.c | 1 + 5 files changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index d42d3a5617e3..ea9e58a89d92 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -996,6 +996,7 @@ struct xfs_rtgroup_geometry { #define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */ #define XFS_RTGROUP_GEOM_SICK_SUMMARY (1U << 2) /* rtsummary */ #define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */ +#define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */ /* * ioctl commands that are used by Linux filesystems diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 5c8a0aff6ba6..b31000f7190c 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -71,6 +71,7 @@ struct xfs_rtgroup; #define XFS_SICK_RG_BITMAP (1 << 1) /* rt group bitmap */ #define XFS_SICK_RG_SUMMARY (1 << 2) /* rt groups summary */ #define XFS_SICK_RG_RMAPBT (1 << 3) /* reverse mappings */ +#define XFS_SICK_RG_REFCNTBT (1 << 4) /* reference counts */ /* Observable health issues for AG metadata. 
*/ #define XFS_SICK_AG_SB (1 << 0) /* superblock */ @@ -117,7 +118,8 @@ struct xfs_rtgroup; #define XFS_SICK_RG_PRIMARY (XFS_SICK_RG_SUPER | \ XFS_SICK_RG_BITMAP | \ XFS_SICK_RG_SUMMARY | \ - XFS_SICK_RG_RMAPBT) + XFS_SICK_RG_RMAPBT | \ + XFS_SICK_RG_REFCNTBT) #define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \ XFS_SICK_AG_AGF | \ diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index eab655a4a9ef..a6468e591232 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -380,6 +380,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { [XFS_RTGI_REFCOUNT] = { .name = "refcount", .metafile_type = XFS_METAFILE_RTREFCOUNT, + .sick = XFS_SICK_RG_REFCNTBT, .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, /* same comment about growfs and rmap inodes applies here */ .enabled = xfs_has_reflink, diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c index 151fb1ef7db1..3db5e7a4a945 100644 --- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c @@ -27,6 +27,7 @@ #include "xfs_rtgroup.h" #include "xfs_rtbitmap.h" #include "xfs_metafile.h" +#include "xfs_health.h" static struct kmem_cache *xfs_rtrefcountbt_cur_cache; @@ -374,6 +375,7 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = { .lru_refs = XFS_REFC_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_rtrefcbt_2), + .sick_mask = XFS_SICK_RG_REFCNTBT, .dup_cursor = xfs_rtrefcountbt_dup_cursor, .alloc_block = xfs_btree_alloc_metafile_block, @@ -640,16 +642,20 @@ xfs_iformat_rtrefcount( * volume to the filesystem, so we cannot use the rtrefcount predicate * here. */ - if (!xfs_has_reflink(ip->i_mount)) + if (!xfs_has_reflink(ip->i_mount)) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; + } dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); numrecs = be16_to_cpu(dfp->bb_numrecs); level = be16_to_cpu(dfp->bb_level); if (level > mp->m_rtrefc_maxlevels || - xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize) + xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; + } broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), xfs_rtrefcount_broot_space_calc(mp, level, numrecs)); diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index d438c3c001c8..7c541fb373d5 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -448,6 +448,7 @@ static const struct ioctl_sick_map rtgroup_map[] = { { XFS_SICK_RG_BITMAP, XFS_RTGROUP_GEOM_SICK_BITMAP }, { XFS_SICK_RG_SUMMARY, XFS_RTGROUP_GEOM_SICK_SUMMARY }, { XFS_SICK_RG_RMAPBT, XFS_RTGROUP_GEOM_SICK_RMAPBT }, + { XFS_SICK_RG_REFCNTBT, XFS_RTGROUP_GEOM_SICK_REFCNTBT }, }; /* Fill out rtgroup geometry health info. */ -- cgit v1.2.3 From c27929670de144ec76a0dab2f3a168cb4897b314 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:06 -0800 Subject: xfs: scrub the realtime refcount btree Add code to scrub realtime refcount btrees. Similar to the refcount btree checking code for the data device, we walk the rmap btree for each refcount record to confirm that the reference counts are correct. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_fs.h | 3 +- fs/xfs/scrub/common.c | 10 + fs/xfs/scrub/common.h | 5 +- fs/xfs/scrub/health.c | 1 + fs/xfs/scrub/rtrefcount.c | 487 ++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/scrub.c | 7 + fs/xfs/scrub/scrub.h | 3 + fs/xfs/scrub/stats.c | 1 + fs/xfs/scrub/trace.h | 4 +- 10 files changed, 519 insertions(+), 3 deletions(-) create mode 100644 fs/xfs/scrub/rtrefcount.c (limited to 'fs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index b9356d01416e..9dd9921e5356 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -195,6 +195,7 @@ xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rgsuper.o \ rtbitmap.o \ + rtrefcount.o \ rtrmap.o \ rtsummary.o \ ) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index ea9e58a89d92..a4bd6a39c6ba 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -738,9 +738,10 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_METAPATH 29 /* metadata directory tree paths */ #define XFS_SCRUB_TYPE_RGSUPER 30 /* realtime superblock */ #define XFS_SCRUB_TYPE_RTRMAPBT 31 /* rtgroup reverse mapping btree */ +#define XFS_SCRUB_TYPE_RTREFCBT 32 /* realtime reference count btree */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 32 +#define XFS_SCRUB_TYPE_NR 33 /* * This special type code only applies to the vectored scrub implementation. diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 06cb61e63498..28ad341df8ee 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -37,6 +37,7 @@ #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" #include "xfs_bmap_util.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -797,6 +798,9 @@ xchk_rtgroup_lock( if (xfs_has_rtrmapbt(sc->mp) && (rtglock_flags & XFS_RTGLOCK_RMAP)) sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg); + if (xfs_has_rtreflink(sc->mp) && (rtglock_flags & XFS_RTGLOCK_REFCOUNT)) + sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg); + return 0; } @@ -811,7 +815,10 @@ xchk_rtgroup_btcur_free( { if (sr->rmap_cur) xfs_btree_del_cursor(sr->rmap_cur, XFS_BTREE_ERROR); + if (sr->refc_cur) + xfs_btree_del_cursor(sr->refc_cur, XFS_BTREE_ERROR); + sr->refc_cur = NULL; sr->rmap_cur = NULL; } @@ -1687,6 +1694,9 @@ xchk_meta_btree_count_blocks( case XFS_METAFILE_RTRMAP: cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg); break; + case XFS_METAFILE_RTREFCOUNT: + cur = xfs_rtrefcountbt_init_cursor(sc->tp, sc->sr.rtg); + break; default: ASSERT(0); return -EFSCORRUPTED; diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 50ac6cca18fe..bdcd40f0ec74 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -82,11 +82,13 @@ int xchk_setup_rtbitmap(struct xfs_scrub *sc); int xchk_setup_rtsummary(struct xfs_scrub *sc); int xchk_setup_rgsuperblock(struct xfs_scrub *sc); int xchk_setup_rtrmapbt(struct xfs_scrub *sc); +int xchk_setup_rtrefcountbt(struct xfs_scrub *sc); #else # define xchk_setup_rtbitmap xchk_setup_nothing # define xchk_setup_rtsummary xchk_setup_nothing # define xchk_setup_rgsuperblock xchk_setup_nothing # define xchk_setup_rtrmapbt xchk_setup_nothing +# define xchk_setup_rtrefcountbt xchk_setup_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_ino_dqattach(struct xfs_scrub *sc); @@ -129,7 +131,8 @@ xchk_ag_init_existing( /* All the locks we need to check an rtgroup. 
*/ #define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ - XFS_RTGLOCK_RMAP) + XFS_RTGLOCK_RMAP | \ + XFS_RTGLOCK_REFCOUNT) int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno, struct xchk_rt *sr); diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index bcc4244e3b55..3c0f25098b69 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -115,6 +115,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_METAPATH] = { XHG_FS, XFS_SICK_FS_METAPATH }, [XFS_SCRUB_TYPE_RGSUPER] = { XHG_RTGROUP, XFS_SICK_RG_SUPER }, [XFS_SCRUB_TYPE_RTRMAPBT] = { XHG_RTGROUP, XFS_SICK_RG_RMAPBT }, + [XFS_SCRUB_TYPE_RTREFCBT] = { XHG_RTGROUP, XFS_SICK_RG_REFCNTBT }, }; /* Return the health status mask for this scrub type. */ diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c new file mode 100644 index 000000000000..92ae2e15ae9f --- /dev/null +++ b/fs/xfs/scrub/rtrefcount.c @@ -0,0 +1,487 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_inode.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_metafile.h" +#include "xfs_rtrefcount_btree.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" + +/* Set us up with the realtime refcount metadata locked. */ +int +xchk_setup_rtrefcountbt( + struct xfs_scrub *sc) +{ + int error; + + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); + if (error) + return error; + + error = xchk_setup_rt(sc); + if (error) + return error; + + error = xchk_install_live_inode(sc, rtg_refcount(sc->sr.rtg)); + if (error) + return error; + + return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); +} + +/* Realtime Reference count btree scrubber. */ + +/* + * Confirming Reference Counts via Reverse Mappings + * + * We want to count the reverse mappings overlapping a refcount record + * (bno, len, refcount), allowing for the possibility that some of the + * overlap may come from smaller adjoining reverse mappings, while some + * comes from single extents which overlap the range entirely. The + * outer loop is as follows: + * + * 1. For all reverse mappings overlapping the refcount extent, + * a. If a given rmap completely overlaps, mark it as seen. + * b. Otherwise, record the fragment (in agbno order) for later + * processing. + * + * Once we've seen all the rmaps, we know that for all blocks in the + * refcount record we want to find $refcount owners and we've already + * visited $seen extents that overlap all the blocks. Therefore, we + * need to find ($refcount - $seen) owners for every block in the + * extent; call that quantity $target_nr. Proceed as follows: + * + * 2. Pull the first $target_nr fragments from the list; all of them + * should start at or before the start of the extent. + * Call this subset of fragments the working set. + * 3. Until there are no more unprocessed fragments, + * a. Find the shortest fragments in the set and remove them. + * b. Note the block number of the end of these fragments. + * c. Pull the same number of fragments from the list. All of these + * fragments should start at the block number recorded in the + * previous step. 
+ * d. Put those fragments in the set. + * 4. Check that there are $target_nr fragments remaining in the list, + * and that they all end at or beyond the end of the refcount extent. + * + * If the refcount is correct, all the check conditions in the algorithm + * should always hold true. If not, the refcount is incorrect. + */ +struct xchk_rtrefcnt_frag { + struct list_head list; + struct xfs_rmap_irec rm; +}; + +struct xchk_rtrefcnt_check { + struct xfs_scrub *sc; + struct list_head fragments; + + /* refcount extent we're examining */ + xfs_rgblock_t bno; + xfs_extlen_t len; + xfs_nlink_t refcount; + + /* number of owners seen */ + xfs_nlink_t seen; +}; + +/* + * Decide if the given rmap is large enough that we can redeem it + * towards refcount verification now, or if it's a fragment, in + * which case we'll hang onto it in the hopes that we'll later + * discover that we've collected exactly the correct number of + * fragments as the rtrefcountbt says we should have. + */ +STATIC int +xchk_rtrefcountbt_rmap_check( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xchk_rtrefcnt_check *refchk = priv; + struct xchk_rtrefcnt_frag *frag; + xfs_rgblock_t rm_last; + xfs_rgblock_t rc_last; + int error = 0; + + if (xchk_should_terminate(refchk->sc, &error)) + return error; + + rm_last = rec->rm_startblock + rec->rm_blockcount - 1; + rc_last = refchk->bno + refchk->len - 1; + + /* Confirm that a single-owner refc extent is a CoW stage. */ + if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) { + xchk_btree_xref_set_corrupt(refchk->sc, cur, 0); + return 0; + } + + if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) { + /* + * The rmap overlaps the refcount record, so we can confirm + * one refcount owner seen. + */ + refchk->seen++; + } else { + /* + * This rmap covers only part of the refcount record, so + * save the fragment for later processing. If the rmapbt + * is healthy each rmap_irec we see will be in agbno order + * so we don't need insertion sort here. + */ + frag = kmalloc(sizeof(struct xchk_rtrefcnt_frag), + XCHK_GFP_FLAGS); + if (!frag) + return -ENOMEM; + memcpy(&frag->rm, rec, sizeof(frag->rm)); + list_add_tail(&frag->list, &refchk->fragments); + } + + return 0; +} + +/* + * Given a bunch of rmap fragments, iterate through them, keeping + * a running tally of the refcount. If this ever deviates from + * what we expect (which is the rtrefcountbt's refcount minus the + * number of extents that totally covered the rtrefcountbt extent), + * we have a rtrefcountbt error. + */ +STATIC void +xchk_rtrefcountbt_process_rmap_fragments( + struct xchk_rtrefcnt_check *refchk) +{ + struct list_head worklist; + struct xchk_rtrefcnt_frag *frag; + struct xchk_rtrefcnt_frag *n; + xfs_rgblock_t bno; + xfs_rgblock_t rbno; + xfs_rgblock_t next_rbno; + xfs_nlink_t nr; + xfs_nlink_t target_nr; + + target_nr = refchk->refcount - refchk->seen; + if (target_nr == 0) + return; + + /* + * There are (refchk->rc.rc_refcount - refchk->nr refcount) + * references we haven't found yet. Pull that many off the + * fragment list and figure out where the smallest rmap ends + * (and therefore the next rmap should start). All the rmaps + * we pull off should start at or before the beginning of the + * refcount record's range. + */ + INIT_LIST_HEAD(&worklist); + rbno = NULLRGBLOCK; + + /* Make sure the fragments actually /are/ in bno order. 
*/ + bno = 0; + list_for_each_entry(frag, &refchk->fragments, list) { + if (frag->rm.rm_startblock < bno) + goto done; + bno = frag->rm.rm_startblock; + } + + /* + * Find all the rmaps that start at or before the refc extent, + * and put them on the worklist. + */ + nr = 0; + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + if (frag->rm.rm_startblock > refchk->bno || nr > target_nr) + break; + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno < rbno) + rbno = bno; + list_move_tail(&frag->list, &worklist); + nr++; + } + + /* + * We should have found exactly $target_nr rmap fragments starting + * at or before the refcount extent. + */ + if (nr != target_nr) + goto done; + + while (!list_empty(&refchk->fragments)) { + /* Discard any fragments ending at rbno from the worklist. */ + nr = 0; + next_rbno = NULLRGBLOCK; + list_for_each_entry_safe(frag, n, &worklist, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno != rbno) { + if (bno < next_rbno) + next_rbno = bno; + continue; + } + list_del(&frag->list); + kfree(frag); + nr++; + } + + /* Try to add nr rmaps starting at rbno to the worklist. */ + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (frag->rm.rm_startblock != rbno) + goto done; + list_move_tail(&frag->list, &worklist); + if (next_rbno > bno) + next_rbno = bno; + nr--; + if (nr == 0) + break; + } + + /* + * If we get here and nr > 0, this means that we added fewer + * items to the worklist than we discarded because the fragment + * list ran out of items. Therefore, we cannot maintain the + * required refcount. Something is wrong, so we're done. + */ + if (nr) + goto done; + + rbno = next_rbno; + } + + /* + * Make sure the last extent we processed ends at or beyond + * the end of the refcount extent. + */ + if (rbno < refchk->bno + refchk->len) + goto done; + + /* Actually record us having seen the remaining refcount. */ + refchk->seen = refchk->refcount; +done: + /* Delete fragments and work list. */ + list_for_each_entry_safe(frag, n, &worklist, list) { + list_del(&frag->list); + kfree(frag); + } + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + list_del(&frag->list); + kfree(frag); + } +} + +/* Use the rmap entries covering this extent to verify the refcount. */ +STATIC void +xchk_rtrefcountbt_xref_rmap( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) +{ + struct xchk_rtrefcnt_check refchk = { + .sc = sc, + .bno = irec->rc_startblock, + .len = irec->rc_blockcount, + .refcount = irec->rc_refcount, + .seen = 0, + }; + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + struct xchk_rtrefcnt_frag *frag; + struct xchk_rtrefcnt_frag *n; + int error; + + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + /* Cross-reference with the rmapbt to confirm the refcount. 
*/ + memset(&low, 0, sizeof(low)); + low.rm_startblock = irec->rc_startblock; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1; + + INIT_LIST_HEAD(&refchk.fragments); + error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high, + xchk_rtrefcountbt_rmap_check, &refchk); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + goto out_free; + + xchk_rtrefcountbt_process_rmap_fragments(&refchk); + if (irec->rc_refcount != refchk.seen) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + +out_free: + list_for_each_entry_safe(frag, n, &refchk.fragments, list) { + list_del(&frag->list); + kfree(frag); + } +} + +/* Cross-reference with the other btrees. */ +STATIC void +xchk_rtrefcountbt_xref( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_xref_is_used_rt_space(sc, + xfs_rgbno_to_rtb(sc->sr.rtg, irec->rc_startblock), + irec->rc_blockcount); + xchk_rtrefcountbt_xref_rmap(sc, irec); +} + +struct xchk_rtrefcbt_records { + /* Previous refcount record. */ + struct xfs_refcount_irec prev_rec; + + /* Number of CoW blocks we expect. */ + xfs_extlen_t cow_blocks; +}; + +static inline bool +xchk_rtrefcount_mergeable( + struct xchk_rtrefcbt_records *rrc, + const struct xfs_refcount_irec *r2) +{ + const struct xfs_refcount_irec *r1 = &rrc->prev_rec; + + /* Ignore if prev_rec is not yet initialized. */ + if (r1->rc_blockcount > 0) + return false; + + if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock) + return false; + if (r1->rc_refcount != r2->rc_refcount) + return false; + if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > + XFS_REFC_LEN_MAX) + return false; + + return true; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_rtrefcountbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_rtrefcbt_records *rrc, + const struct xfs_refcount_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_rtrefcount_mergeable(rrc, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec)); +} + +/* Scrub a rtrefcountbt record. */ +STATIC int +xchk_rtrefcountbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + struct xchk_rtrefcbt_records *rrc = bs->private; + struct xfs_refcount_irec irec; + u32 mod; + + xfs_refcount_btrec_to_irec(rec, &irec); + if (xfs_rtrefcount_check_irec(to_rtg(bs->cur->bc_group), &irec) != + NULL) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + /* We can only share full rt extents. */ + mod = xfs_rgbno_to_rtxoff(mp, irec.rc_startblock); + if (mod) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + mod = xfs_extlen_to_rtxmod(mp, irec.rc_blockcount); + if (mod) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (irec.rc_domain == XFS_REFC_DOMAIN_COW) + rrc->cow_blocks += irec.rc_blockcount; + + xchk_rtrefcountbt_check_mergeable(bs, rrc, &irec); + xchk_rtrefcountbt_xref(bs->sc, &irec); + + return 0; +} + +/* Make sure we have as many refc blocks as the rmap says. 
*/ +STATIC void +xchk_refcount_xref_rmap( + struct xfs_scrub *sc, + const struct xfs_owner_info *btree_oinfo, + xfs_extlen_t cow_blocks) +{ + xfs_filblks_t refcbt_blocks = 0; + xfs_filblks_t blocks; + int error; + + if (!sc->sr.rmap_cur || !sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + /* Check that we saw as many refcbt blocks as the rmap knows about. */ + error = xfs_btree_count_blocks(sc->sr.refc_cur, &refcbt_blocks); + if (!xchk_btree_process_error(sc, sc->sr.refc_cur, 0, &error)) + return; + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, btree_oinfo, + &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != refcbt_blocks) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + + /* Check that we saw as many cow blocks as the rmap knows about. */ + error = xchk_count_rmap_ownedby_ag(sc, sc->sr.rmap_cur, + &XFS_RMAP_OINFO_COW, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + return; + if (blocks != cow_blocks) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); +} + +/* Scrub the refcount btree for some AG. */ +int +xchk_rtrefcountbt( + struct xfs_scrub *sc) +{ + struct xfs_owner_info btree_oinfo; + struct xchk_rtrefcbt_records rrc = { + .cow_blocks = 0, + }; + int error; + + error = xchk_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + xfs_rmap_ino_bmbt_owner(&btree_oinfo, rtg_refcount(sc->sr.rtg)->i_ino, + XFS_DATA_FORK); + error = xchk_btree(sc, sc->sr.refc_cur, xchk_rtrefcountbt_rec, + &btree_oinfo, &rrc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + xchk_refcount_xref_rmap(sc, &btree_oinfo, rrc.cow_blocks); + + return 0; +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 16da054b2eb0..6e31f12cef4c 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -467,6 +467,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .has = xfs_has_rtrmapbt, .repair = xrep_rtrmapbt, }, + [XFS_SCRUB_TYPE_RTREFCBT] = { /* realtime refcountbt */ + .type = ST_RTGROUP, + .setup = xchk_setup_rtrefcountbt, + .scrub = xchk_rtrefcountbt, + .has = xfs_has_rtreflink, + .repair = xrep_notsupported, + }, }; static int diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index cba4e89a3a62..ab3d221dd9ed 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -129,6 +129,7 @@ struct xchk_rt { /* rtgroup btrees */ struct xfs_btree_cur *rmap_cur; + struct xfs_btree_cur *refc_cur; }; struct xfs_scrub { @@ -284,11 +285,13 @@ int xchk_rtbitmap(struct xfs_scrub *sc); int xchk_rtsummary(struct xfs_scrub *sc); int xchk_rgsuperblock(struct xfs_scrub *sc); int xchk_rtrmapbt(struct xfs_scrub *sc); +int xchk_rtrefcountbt(struct xfs_scrub *sc); #else # define xchk_rtbitmap xchk_nothing # define xchk_rtsummary xchk_nothing # define xchk_rgsuperblock xchk_nothing # define xchk_rtrmapbt xchk_nothing +# define xchk_rtrefcountbt xchk_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_quota(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index eb6bb170c902..f8a37ea97791 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -83,6 +83,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_METAPATH] = "metapath", [XFS_SCRUB_TYPE_RGSUPER] = "rgsuper", [XFS_SCRUB_TYPE_RTRMAPBT] = "rtrmapbt", + [XFS_SCRUB_TYPE_RTREFCBT] = "rtrefcountbt", }; /* Format the scrub stats into a text buffer, similar to pcp style. 
*/ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index fb86b746bc17..289e39d1f418 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -77,6 +77,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTREFCBT); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -111,7 +112,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT); { XFS_SCRUB_TYPE_BARRIER, "barrier" }, \ { XFS_SCRUB_TYPE_METAPATH, "metapath" }, \ { XFS_SCRUB_TYPE_RGSUPER, "rgsuper" }, \ - { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" } + { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" }, \ + { XFS_SCRUB_TYPE_RTREFCBT, "rtrefcountbt" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ -- cgit v1.2.3 From 91683bb3f264c04e6f0710de23dbd6c9605c550a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:06 -0800 Subject: xfs: cross-reference checks with the rt refcount btree Use the realtime refcount btree to implement cross-reference checks in other data structures. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/bmap.c | 30 +++++++++++++---- fs/xfs/scrub/rtbitmap.c | 2 ++ fs/xfs/scrub/rtrefcount.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/rtrmap.c | 37 ++++++++++++++++++++ fs/xfs/scrub/scrub.h | 9 +++++ 5 files changed, 158 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index f6077b0cba8a..66da7d4d56ba 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -347,13 +347,31 @@ xchk_bmap_rt_iextent_xref( goto out_cur; rgbno = xfs_rtb_to_rgbno(info->sc->mp, irec->br_startblock); - xchk_bmap_xref_rmap(info, irec, rgbno); - - xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, info->whichfork, - irec->br_startoff); - xchk_xref_is_only_rt_owned_by(info->sc, rgbno, - irec->br_blockcount, &oinfo); + switch (info->whichfork) { + case XFS_DATA_FORK: + xchk_bmap_xref_rmap(info, irec, rgbno); + if (!xfs_is_reflink_inode(info->sc->ip)) { + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, + info->whichfork, irec->br_startoff); + xchk_xref_is_only_rt_owned_by(info->sc, rgbno, + irec->br_blockcount, &oinfo); + xchk_xref_is_not_rt_shared(info->sc, rgbno, + irec->br_blockcount); + } + xchk_xref_is_not_rt_cow_staging(info->sc, rgbno, + irec->br_blockcount); + break; + case XFS_COW_FORK: + xchk_bmap_xref_rmap_cow(info, irec, rgbno); + xchk_xref_is_only_rt_owned_by(info->sc, rgbno, + irec->br_blockcount, &XFS_RMAP_OINFO_COW); + xchk_xref_is_rt_cow_staging(info->sc, rgbno, + irec->br_blockcount); + xchk_xref_is_not_rt_shared(info->sc, rgbno, + irec->br_blockcount); + break; + } out_cur: xchk_rtgroup_btcur_free(&info->sc->sr); out_free: diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 28c90a31f4c3..e8c776a34c1d 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -105,6 +105,8 @@ xchk_rtbitmap_xref( return; xchk_xref_has_no_rt_owner(sc, rgbno, blockcount); + xchk_xref_is_not_rt_shared(sc, rgbno, blockcount); + xchk_xref_is_not_rt_cow_staging(sc, rgbno, blockcount); if (rtb->next_free_rgbno < rgbno) xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno, diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c index 92ae2e15ae9f..f56f2cb7a9f7 100644 --- a/fs/xfs/scrub/rtrefcount.c +++ b/fs/xfs/scrub/rtrefcount.c @@ -485,3 +485,89 @@ xchk_rtrefcountbt( return 0; } + +/* xref check 
that a cow staging extent is marked in the rtrefcountbt. */ +void +xchk_xref_is_rt_cow_staging( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + struct xfs_refcount_irec rc; + int has_refcount; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + /* Find the CoW staging extent. */ + error = xfs_refcount_lookup_le(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW, + bno, &has_refcount); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (!has_refcount) { + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); + return; + } + + error = xfs_refcount_get_rec(sc->sr.refc_cur, &rc, &has_refcount); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (!has_refcount) { + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); + return; + } + + /* CoW lookup returned a shared extent record? */ + if (rc.rc_domain != XFS_REFC_DOMAIN_COW) + xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); + + /* Must be at least as long as what was passed in */ + if (rc.rc_blockcount < len) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} + +/* + * xref check that the extent is not shared. Only file data blocks + * can have multiple owners. + */ +void +xchk_xref_is_not_rt_shared( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_refcount_has_records(sc->sr.refc_cur, + XFS_REFC_DOMAIN_SHARED, bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} + +/* xref check that the extent is not being used for CoW staging. */ +void +xchk_xref_is_not_rt_cow_staging( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_refcount_has_records(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW, + bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c index 300a1e85b3d6..3d5419682d65 100644 --- a/fs/xfs/scrub/rtrmap.c +++ b/fs/xfs/scrub/rtrmap.c @@ -22,6 +22,7 @@ #include "xfs_rtalloc.h" #include "xfs_rtgroup.h" #include "xfs_metafile.h" +#include "xfs_refcount.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -149,6 +150,37 @@ xchk_rtrmapbt_check_mergeable( memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec)); } +/* Cross-reference a rmap against the refcount btree. */ +STATIC void +xchk_rtrmapbt_xref_rtrefc( + struct xfs_scrub *sc, + struct xfs_rmap_irec *irec) +{ + xfs_rgblock_t fbno; + xfs_extlen_t flen; + bool is_inode; + bool is_bmbt; + bool is_attr; + bool is_unwritten; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); + is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK; + is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK; + is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN; + + /* If this is shared, must be a data fork extent. 
*/ + error = xfs_refcount_find_shared(sc->sr.refc_cur, irec->rm_startblock, + irec->rm_blockcount, &fbno, &flen, false); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (flen != 0 && (!is_inode || is_attr || is_bmbt || is_unwritten)) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} + /* Cross-reference with other metadata. */ STATIC void xchk_rtrmapbt_xref( @@ -161,6 +193,11 @@ xchk_rtrmapbt_xref( xchk_xref_is_used_rt_space(sc, xfs_rgbno_to_rtb(sc->sr.rtg, irec->rm_startblock), irec->rm_blockcount); + if (irec->rm_owner == XFS_RMAP_OWN_COW) + xchk_xref_is_cow_staging(sc, irec->rm_startblock, + irec->rm_blockcount); + else + xchk_rtrmapbt_xref_rtrefc(sc, irec); } /* Scrub a realtime rmapbt record. */ diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index ab3d221dd9ed..a1086f1f06d0 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -331,11 +331,20 @@ void xchk_xref_has_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno, xfs_extlen_t len); void xchk_xref_is_only_rt_owned_by(struct xfs_scrub *sc, xfs_rgblock_t rgbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); +void xchk_xref_is_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_is_not_rt_shared(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_is_not_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); #else # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) # define xchk_xref_has_no_rt_owner(sc, rtbno, len) do { } while (0) # define xchk_xref_has_rt_owner(sc, rtbno, len) do { } while (0) # define xchk_xref_is_only_rt_owned_by(sc, bno, len, oinfo) do { } while (0) +# define xchk_xref_is_rt_cow_staging(sc, bno, len) do { } while (0) +# define xchk_xref_is_not_rt_shared(sc, bno, len) do { } while (0) +# define xchk_xref_is_not_rt_cow_staging(sc, bno, len) do { } while (0) #endif #endif /* __XFS_SCRUB_SCRUB_H__ */ -- cgit v1.2.3 From 2d9a3e98053e8cfa6ec9c113efb7f8883cd148e6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:07 -0800 Subject: xfs: allow overlapping rtrmapbt records for shared data extents Allow overlapping realtime reverse mapping records if they both describe shared data extents and the fs supports reflink on the realtime volume. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/rtrmap.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c index 3d5419682d65..12989fe80e8b 100644 --- a/fs/xfs/scrub/rtrmap.c +++ b/fs/xfs/scrub/rtrmap.c @@ -78,6 +78,18 @@ struct xchk_rtrmap { struct xfs_rmap_irec prev_rec; }; +static inline bool +xchk_rtrmapbt_is_shareable( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *irec) +{ + if (!xfs_has_rtreflink(sc->mp)) + return false; + if (irec->rm_flags & XFS_RMAP_UNWRITTEN) + return false; + return true; +} + /* Flag failures for records that overlap but cannot. */ STATIC void xchk_rtrmapbt_check_overlapping( @@ -99,7 +111,10 @@ xchk_rtrmapbt_check_overlapping( if (pnext <= irec->rm_startblock) goto set_prev; - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + /* Overlap is only allowed if both records are data fork mappings. */ + if (!xchk_rtrmapbt_is_shareable(bs->sc, &cr->overlap_rec) || + !xchk_rtrmapbt_is_shareable(bs->sc, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); /* Save whichever rmap record extends furthest. 
*/ inext = irec->rm_startblock + irec->rm_blockcount; -- cgit v1.2.3 From 30f47950dc2ebaf906d389506c3a9ab048a8c08d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:08 -0800 Subject: xfs: check reference counts of gaps between rt refcount records If there's a gap between records in the rt refcount btree, we ought to cross-reference the gap with the rtrmap records to make sure that there aren't any overlapping records for a region that doesn't have any shared ownership. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/rtrefcount.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c index f56f2cb7a9f7..18c9bcb0e821 100644 --- a/fs/xfs/scrub/rtrefcount.c +++ b/fs/xfs/scrub/rtrefcount.c @@ -17,6 +17,7 @@ #include "xfs_rtgroup.h" #include "xfs_metafile.h" #include "xfs_rtrefcount_btree.h" +#include "xfs_rtalloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" @@ -348,8 +349,14 @@ struct xchk_rtrefcbt_records { /* Previous refcount record. */ struct xfs_refcount_irec prev_rec; + /* The next rtgroup block where we aren't expecting shared extents. */ + xfs_rgblock_t next_unshared_rgbno; + /* Number of CoW blocks we expect. */ xfs_extlen_t cow_blocks; + + /* Was the last record a shared or CoW staging extent? */ + enum xfs_refc_domain prev_domain; }; static inline bool @@ -390,6 +397,53 @@ xchk_rtrefcountbt_check_mergeable( memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec)); } +STATIC int +xchk_rtrefcountbt_rmap_check_gap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + xfs_rgblock_t *next_bno = priv; + + if (*next_bno != NULLRGBLOCK && rec->rm_startblock < *next_bno) + return -ECANCELED; + + *next_bno = rec->rm_startblock + rec->rm_blockcount; + return 0; +} + +/* + * Make sure that a gap in the reference count records does not correspond to + * overlapping records (i.e. shared extents) in the reverse mappings. + */ +static inline void +xchk_rtrefcountbt_xref_gaps( + struct xfs_scrub *sc, + struct xchk_rtrefcbt_records *rrc, + xfs_rtblock_t bno) +{ + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + xfs_rgblock_t next_bno = NULLRGBLOCK; + int error; + + if (bno <= rrc->next_unshared_rgbno || !sc->sr.rmap_cur || + xchk_skip_xref(sc->sm)) + return; + + memset(&low, 0, sizeof(low)); + low.rm_startblock = rrc->next_unshared_rgbno; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = bno - 1; + + error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high, + xchk_rtrefcountbt_rmap_check_gap, &next_bno); + if (error == -ECANCELED) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + else + xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur); +} + /* Scrub a rtrefcountbt record. */ STATIC int xchk_rtrefcountbt_rec( @@ -419,9 +473,26 @@ xchk_rtrefcountbt_rec( if (irec.rc_domain == XFS_REFC_DOMAIN_COW) rrc->cow_blocks += irec.rc_blockcount; + /* Shared records always come before CoW records. */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED && + rrc->prev_domain == XFS_REFC_DOMAIN_COW) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + rrc->prev_domain = irec.rc_domain; + xchk_rtrefcountbt_check_mergeable(bs, rrc, &irec); xchk_rtrefcountbt_xref(bs->sc, &irec); + /* + * If this is a record for a shared extent, check that all blocks + * between the previous record and this one have at most one reverse + * mapping. 
+ */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) { + xchk_rtrefcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock); + rrc->next_unshared_rgbno = irec.rc_startblock + + irec.rc_blockcount; + } + return 0; } @@ -466,7 +537,9 @@ xchk_rtrefcountbt( { struct xfs_owner_info btree_oinfo; struct xchk_rtrefcbt_records rrc = { - .cow_blocks = 0, + .cow_blocks = 0, + .next_unshared_rgbno = 0, + .prev_domain = XFS_REFC_DOMAIN_SHARED, }; int error; @@ -481,6 +554,12 @@ xchk_rtrefcountbt( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; + /* + * Check that all blocks between the last refcount > 1 record and the + * end of the rt volume have at most one reverse mapping. + */ + xchk_rtrefcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_rblocks); + xchk_refcount_xref_rmap(sc, &btree_oinfo, rrc.cow_blocks); return 0; -- cgit v1.2.3 From 48bc170f2cb54371a062d2fae6d9518f6f47d878 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:09 -0800 Subject: xfs: allow dquot rt block count to exceed rt blocks on reflink fs Update the quota scrubber to allow dquots where the realtime block count exceeds the block count of the rt volume if reflink is enabled. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/quota.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 183d531875ea..58d6d4ed2853 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -212,12 +212,18 @@ xchk_quota_item( if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (mp->m_sb.sb_rblocks < dq->q_rtb.count) + xchk_fblock_set_warning(sc, XFS_DATA_FORK, + offset); } else { if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (mp->m_sb.sb_rblocks < dq->q_rtb.count) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, + offset); } - if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks) + if (dq->q_ino.count > fs_icount) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); /* -- cgit v1.2.3 From a9600db96f74af50cc025a02e9e39a9515e2256d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:09 -0800 Subject: xfs: detect and repair misaligned rtinherit directory cowextsize hints If we encounter a directory that has been configured to pass on a CoW extent size hint to a new realtime file and the hint isn't an integer multiple of the rt extent size, we should flag the hint for administrative review and/or turn it off because that is a misconfiguration. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/inode.c | 26 +++++++++++++++++--------- fs/xfs/scrub/inode_repair.c | 15 +++++++++++++++ 2 files changed, 32 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index c7bbc3f78e90..db6edd5a5fe5 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -260,12 +260,7 @@ xchk_inode_extsize( xchk_ino_set_warning(sc, ino); } -/* - * Validate di_cowextsize hint. - * - * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). - * These functions must be kept in sync with each other. - */ +/* Validate di_cowextsize hint. 
*/ STATIC void xchk_inode_cowextsize( struct xfs_scrub *sc, @@ -276,12 +271,25 @@ xchk_inode_cowextsize( uint64_t flags2) { xfs_failaddr_t fa; + uint32_t value = be32_to_cpu(dip->di_cowextsize); - fa = xfs_inode_validate_cowextsize(sc->mp, - be32_to_cpu(dip->di_cowextsize), mode, flags, - flags2); + fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2); if (fa) xchk_ino_set_corrupt(sc, ino); + + /* + * XFS allows a sysadmin to change the rt extent size when adding a rt + * section to a filesystem after formatting. If there are any + * directories with cowextsize and rtinherit set, the hint could become + * misaligned with the new rextsize. The verifier doesn't check this, + * because we allow rtinherit directories even without an rt device. + * Flag this as an administrative warning since we will clean this up + * eventually. + */ + if ((flags & XFS_DIFLAG_RTINHERIT) && + (flags2 & XFS_DIFLAG2_COWEXTSIZE) && + value % sc->mp->m_sb.sb_rextsize > 0) + xchk_ino_set_warning(sc, ino); } /* Make sure the di_flags make sense for the inode. */ diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 938a18721f36..701baee144a6 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -1903,6 +1903,20 @@ xrep_inode_pptr( sizeof(struct xfs_attr_sf_hdr), true); } +/* Fix COW extent size hint problems. */ +STATIC void +xrep_inode_cowextsize( + struct xfs_scrub *sc) +{ + /* Fix misaligned CoW extent size hints on a directory. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (sc->ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + sc->ip->i_extsize % sc->mp->m_sb.sb_rextsize > 0) { + sc->ip->i_cowextsize = 0; + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + } +} + /* Fix any irregularities in an inode that the verifiers don't catch. */ STATIC int xrep_inode_problems( @@ -1926,6 +1940,7 @@ xrep_inode_problems( if (S_ISDIR(VFS_I(sc->ip)->i_mode)) xrep_inode_dir_size(sc); xrep_inode_extsize(sc); + xrep_inode_cowextsize(sc); trace_xrep_inode_fixed(sc); xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); -- cgit v1.2.3 From ca757af07fccf527f91ad49f3b6648e6783b0bc8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:10 -0800 Subject: xfs: scrub the metadir path of rt refcount btree files Add a new XFS_SCRUB_METAPATH subtype so that we can scrub the metadata directory tree path to the refcount btree file for each rt group. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_fs.h | 3 ++- fs/xfs/scrub/metapath.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index a4bd6a39c6ba..2c3171262b44 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -832,9 +832,10 @@ struct xfs_scrub_vec_head { #define XFS_SCRUB_METAPATH_GRPQUOTA (6) /* group quota */ #define XFS_SCRUB_METAPATH_PRJQUOTA (7) /* project quota */ #define XFS_SCRUB_METAPATH_RTRMAPBT (8) /* realtime reverse mapping */ +#define XFS_SCRUB_METAPATH_RTREFCOUNTBT (9) /* realtime refcount */ /* Number of metapath sm_ino values */ -#define XFS_SCRUB_METAPATH_NR (9) +#define XFS_SCRUB_METAPATH_NR (10) /* * ioctl limits diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index 74d71373e7ed..e21c16fbd15d 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -22,6 +22,7 @@ #include "xfs_attr.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -249,6 +250,8 @@ xchk_setup_metapath( return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_PROJ); case XFS_SCRUB_METAPATH_RTRMAPBT: return xchk_setup_metapath_rtginode(sc, XFS_RTGI_RMAP); + case XFS_SCRUB_METAPATH_RTREFCOUNTBT: + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_REFCOUNT); default: return -ENOENT; } -- cgit v1.2.3 From cca34a305446f0eeea7c6c76bbb01ecf58be9cbc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:11 -0800 Subject: xfs: don't flag quota rt block usage on rtreflink filesystems Quota space usage is allowed to exceed the size of the physical storage when reflink is enabled. Now that we have reflink for the realtime volume, apply this same logic to the rtb repair logic. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/quota_repair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c index cd51f10f2920..8f4c8d41f308 100644 --- a/fs/xfs/scrub/quota_repair.c +++ b/fs/xfs/scrub/quota_repair.c @@ -233,7 +233,7 @@ xrep_quota_item( rqi->need_quotacheck = true; dirty = true; } - if (dq->q_rtb.count > mp->m_sb.sb_rblocks) { + if (!xfs_has_reflink(mp) && dq->q_rtb.count > mp->m_sb.sb_rblocks) { dq->q_rtb.reserved -= dq->q_rtb.count; dq->q_rtb.reserved += mp->m_sb.sb_rblocks; dq->q_rtb.count = mp->m_sb.sb_rblocks; -- cgit v1.2.3 From 6470ceef325c886cac909631d212839ee653dbc6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:11 -0800 Subject: xfs: check new rtbitmap records against rt refcount btree When we're rebuilding the realtime bitmap, check the proposed free extents against the rt refcount btree to make sure we don't commit any grievous errors. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/repair.c | 6 ++++++ fs/xfs/scrub/rtbitmap_repair.c | 24 +++++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 61e414c81253..3b5288d3ef4e 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -42,6 +42,7 @@ #include "xfs_rtgroup.h" #include "xfs_rtalloc.h" #include "xfs_metafile.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -1009,6 +1010,11 @@ xrep_rtgroup_btcur_init( (sr->rtlock_flags & XFS_RTGLOCK_RMAP) && xfs_has_rtrmapbt(mp)) sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg); + + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTREFCBT && + (sr->rtlock_flags & XFS_RTGLOCK_REFCOUNT) && + xfs_has_rtreflink(mp)) + sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg); } /* diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c index c6e33834c5ae..203a1a97c502 100644 --- a/fs/xfs/scrub/rtbitmap_repair.c +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -23,6 +23,7 @@ #include "xfs_rtbitmap.h" #include "xfs_rtgroup.h" #include "xfs_extent_busy.h" +#include "xfs_refcount.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -183,7 +184,8 @@ xrep_rtbitmap_mark_free( xfs_rgblock_t rgbno) { struct xfs_mount *mp = rtb->sc->mp; - struct xfs_rtgroup *rtg = rtb->sc->sr.rtg; + struct xchk_rt *sr = &rtb->sc->sr; + struct xfs_rtgroup *rtg = sr->rtg; xfs_rtxnum_t startrtx; xfs_rtxnum_t nextrtx; xrep_wordoff_t wordoff, nextwordoff; @@ -191,6 +193,7 @@ xrep_rtbitmap_mark_free( unsigned int bufwsize; xfs_extlen_t mod; xfs_rtword_t mask; + enum xbtree_recpacking outcome; int error; if (!xfs_verify_rgbext(rtg, rtb->next_rgbno, rgbno - rtb->next_rgbno)) @@ -210,6 +213,25 @@ xrep_rtbitmap_mark_free( if (mod != mp->m_sb.sb_rextsize - 1) return -EFSCORRUPTED; + /* Must not be shared or CoW staging. */ + if (sr->refc_cur) { + error = xfs_refcount_has_records(sr->refc_cur, + XFS_REFC_DOMAIN_SHARED, rtb->next_rgbno, + rgbno - rtb->next_rgbno, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + error = xfs_refcount_has_records(sr->refc_cur, + XFS_REFC_DOMAIN_COW, rtb->next_rgbno, + rgbno - rtb->next_rgbno, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + } + trace_xrep_rtbitmap_record_free(mp, startrtx, nextrtx - 1); /* Set bits as needed to round startrtx up to the nearest word. */ -- cgit v1.2.3 From 477493082fe8a480e33cb26ff545b5976818c071 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:12 -0800 Subject: xfs: walk the rt reference count tree when rebuilding rmap When we're rebuilding the data device rmap, if we encounter a "refcount" format fork, we have to walk the (realtime) refcount btree inode to build the appropriate mappings. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/rmap_repair.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index c2c7b76cc25a..f5f73078ffe2 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -33,6 +33,7 @@ #include "xfs_ag.h" #include "xfs_rtrmap_btree.h" #include "xfs_rtgroup.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -519,6 +520,9 @@ xrep_rmap_scan_meta_btree( case XFS_METAFILE_RTRMAP: type = XFS_RTGI_RMAP; break; + case XFS_METAFILE_RTREFCOUNT: + type = XFS_RTGI_REFCOUNT; + break; default: ASSERT(0); return -EFSCORRUPTED; @@ -545,6 +549,9 @@ found: case XFS_METAFILE_RTRMAP: cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg); break; + case XFS_METAFILE_RTREFCOUNT: + cur = xfs_rtrefcountbt_init_cursor(sc->tp, rtg); + break; default: ASSERT(0); error = -EFSCORRUPTED; -- cgit v1.2.3 From fe2efe95082a592955c03f26d00ae8e34bb814ed Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:13 -0800 Subject: xfs: capture realtime CoW staging extents when rebuilding rt rmapbt Walk the realtime refcount btree to find the CoW staging extents when we're rebuilding the realtime rmap btree. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/repair.h | 1 + fs/xfs/scrub/rgb_bitmap.h | 37 ++++++++++++++++ fs/xfs/scrub/rtrmap_repair.c | 103 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 141 insertions(+) create mode 100644 fs/xfs/scrub/rgb_bitmap.h (limited to 'fs') diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index ac5962732d26..773438132053 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -50,6 +50,7 @@ xrep_trans_commit( struct xbitmap; struct xagb_bitmap; +struct xrgb_bitmap; struct xfsb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags); diff --git a/fs/xfs/scrub/rgb_bitmap.h b/fs/xfs/scrub/rgb_bitmap.h new file mode 100644 index 000000000000..4c3126b66dcb --- /dev/null +++ b/fs/xfs/scrub/rgb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#ifndef __XFS_SCRUB_RGB_BITMAP_H__ +#define __XFS_SCRUB_RGB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_rgblock_t */ + +struct xrgb_bitmap { + struct xbitmap32 rgbitmap; +}; + +static inline void xrgb_bitmap_init(struct xrgb_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->rgbitmap); +} + +static inline void xrgb_bitmap_destroy(struct xrgb_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->rgbitmap); +} + +static inline int xrgb_bitmap_set(struct xrgb_bitmap *bitmap, + xfs_rgblock_t start, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->rgbitmap, start, len); +} + +static inline int xrgb_bitmap_walk(struct xrgb_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->rgbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_RGB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c index 49de8bc2dd17..f2fdd7a9fc24 100644 --- a/fs/xfs/scrub/rtrmap_repair.c +++ b/fs/xfs/scrub/rtrmap_repair.c @@ -30,6 +30,7 @@ #include "xfs_rtalloc.h" #include "xfs_ag.h" #include "xfs_rtgroup.h" +#include "xfs_refcount.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -38,6 +39,7 @@ #include "scrub/repair.h" #include "scrub/bitmap.h" #include "scrub/fsb_bitmap.h" +#include "scrub/rgb_bitmap.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/iscan.h" @@ -423,6 +425,100 @@ xrep_rtrmap_scan_ag( return error; } +struct xrep_rtrmap_stash_run { + struct xrep_rtrmap *rr; + uint64_t owner; +}; + +static int +xrep_rtrmap_stash_run( + uint32_t start, + uint32_t len, + void *priv) +{ + struct xrep_rtrmap_stash_run *rsr = priv; + struct xrep_rtrmap *rr = rsr->rr; + xfs_rgblock_t rgbno = start; + + return xrep_rtrmap_stash(rr, rgbno, len, rsr->owner, 0, 0); +} + +/* + * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure + * that the ranges are in units of FS blocks. + */ +STATIC int +xrep_rtrmap_stash_bitmap( + struct xrep_rtrmap *rr, + struct xrgb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xrep_rtrmap_stash_run rsr = { + .rr = rr, + .owner = oinfo->oi_owner, + }; + + return xrgb_bitmap_walk(bitmap, xrep_rtrmap_stash_run, &rsr); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_rtrmap_walk_cowblocks( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec, + void *priv) +{ + struct xrgb_bitmap *bitmap = priv; + + if (!xfs_refcount_check_domain(irec) || + irec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + return xrgb_bitmap_set(bitmap, irec->rc_startblock, + irec->rc_blockcount); +} + +/* + * Collect rmaps for the blocks containing the refcount btree, and all CoW + * staging extents. + */ +STATIC int +xrep_rtrmap_find_refcount_rmaps( + struct xrep_rtrmap *rr) +{ + struct xrgb_bitmap cow_blocks; /* COWBIT */ + struct xfs_refcount_irec low = { + .rc_startblock = 0, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_refcount_irec high = { + .rc_startblock = -1U, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_scrub *sc = rr->sc; + int error; + + if (!xfs_has_rtreflink(sc->mp)) + return 0; + + xrgb_bitmap_init(&cow_blocks); + + /* Collect rmaps for CoW staging extents. */ + error = xfs_refcount_query_range(sc->sr.refc_cur, &low, &high, + xrep_rtrmap_walk_cowblocks, &cow_blocks); + if (error) + goto out_bitmap; + + /* Generate rmaps for everything. 
*/ + error = xrep_rtrmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW); + if (error) + goto out_bitmap; + +out_bitmap: + xrgb_bitmap_destroy(&cow_blocks); + return error; +} + /* Count and check all collected records. */ STATIC int xrep_rtrmap_check_record( @@ -460,6 +556,13 @@ xrep_rtrmap_find_rmaps( return error; } + /* Find CoW staging extents. */ + xrep_rtgroup_btcur_init(sc, &sc->sr); + error = xrep_rtrmap_find_refcount_rmaps(rr); + xchk_rtgroup_btcur_free(&sc->sr); + if (error) + return error; + /* * Set up for a potentially lengthy filesystem scan by reducing our * transaction resource usage for the duration. Specifically: -- cgit v1.2.3 From 83ccffc489975db625d7f2600d39faa8b0ff69d6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:14 -0800 Subject: xfs: online repair of the realtime refcount btree Port the data device's refcount btree repair code to the realtime refcount btree. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/scrub/refcount_repair.c | 2 +- fs/xfs/scrub/repair.h | 5 + fs/xfs/scrub/rtrefcount.c | 9 + fs/xfs/scrub/rtrefcount_repair.c | 783 +++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/scrub.c | 2 +- fs/xfs/scrub/trace.h | 14 +- 7 files changed, 809 insertions(+), 7 deletions(-) create mode 100644 fs/xfs/scrub/rtrefcount_repair.c (limited to 'fs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 9dd9921e5356..7afa51e41427 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -236,6 +236,7 @@ xfs-y += $(addprefix scrub/, \ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtbitmap_repair.o \ + rtrefcount_repair.o \ rtrmap_repair.o \ rtsummary_repair.o \ ) diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c index 1ee6d4aeb308..9c8cb5332da0 100644 --- a/fs/xfs/scrub/refcount_repair.c +++ b/fs/xfs/scrub/refcount_repair.c @@ -189,7 +189,7 @@ xrep_refc_stash( if (error) return error; - trace_xrep_refc_found(sc->sa.pag, &irec); + trace_xrep_refc_found(pag_group(sc->sa.pag), &irec); return xfarray_append(rr->refcount_records, &irec); } diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 773438132053..8f8f18b48a44 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -99,6 +99,7 @@ int xrep_setup_nlinks(struct xfs_scrub *sc); int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks); int xrep_setup_dirtree(struct xfs_scrub *sc); int xrep_setup_rtrmapbt(struct xfs_scrub *sc); +int xrep_setup_rtrefcountbt(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); @@ -158,11 +159,13 @@ int xrep_rtbitmap(struct xfs_scrub *sc); int xrep_rtsummary(struct xfs_scrub *sc); int xrep_rgsuperblock(struct xfs_scrub *sc); int xrep_rtrmapbt(struct xfs_scrub *sc); +int xrep_rtrefcountbt(struct xfs_scrub *sc); #else # define xrep_rtbitmap xrep_notsupported # define xrep_rtsummary xrep_notsupported # define xrep_rgsuperblock xrep_notsupported # define xrep_rtrmapbt xrep_notsupported +# define xrep_rtrefcountbt xrep_notsupported #endif /* CONFIG_XFS_RT */ #ifdef CONFIG_XFS_QUOTA @@ -236,6 +239,7 @@ xrep_setup_nothing( #define xrep_setup_dirtree xrep_setup_nothing #define xrep_setup_metapath xrep_setup_nothing #define xrep_setup_rtrmapbt xrep_setup_nothing +#define xrep_setup_rtrefcountbt xrep_setup_nothing #define xrep_setup_inode(sc, imap) ((void)0) @@ -274,6 +278,7 @@ static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x) #define xrep_metapath xrep_notsupported #define xrep_rgsuperblock 
xrep_notsupported #define xrep_rtrmapbt xrep_notsupported +#define xrep_rtrefcountbt xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c index 18c9bcb0e821..4c5dffc73641 100644 --- a/fs/xfs/scrub/rtrefcount.c +++ b/fs/xfs/scrub/rtrefcount.c @@ -7,8 +7,10 @@ #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_btree.h" #include "xfs_rmap.h" #include "xfs_refcount.h" @@ -21,6 +23,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "scrub/repair.h" /* Set us up with the realtime refcount metadata locked. */ int @@ -32,6 +35,12 @@ xchk_setup_rtrefcountbt( if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + if (xchk_could_repair(sc)) { + error = xrep_setup_rtrefcountbt(sc); + if (error) + return error; + } + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); if (error) return error; diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c new file mode 100644 index 000000000000..257cfb24beb4 --- /dev/null +++ b/fs/xfs/scrub/rtrefcount_repair.c @@ -0,0 +1,783 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_rtalloc.h" +#include "xfs_ag.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" +#include "scrub/rcbag.h" + +/* + * Rebuilding the Reference Count Btree + * ==================================== + * + * This algorithm is "borrowed" from xfs_repair. Imagine the rmap + * entries as rectangles representing extents of physical blocks, and + * that the rectangles can be laid down to allow them to overlap each + * other; then we know that we must emit a refcnt btree entry wherever + * the amount of overlap changes, i.e. the emission stimulus is + * level-triggered: + * + * - --- + * -- ----- ---- --- ------ + * -- ---- ----------- ---- --------- + * -------------------------------- ----------- + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 + * + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). + * + * Note that in the actual refcnt btree we don't store the refcount < 2 + * cases because the bnobt tells us which blocks are free; single-use + * blocks aren't recorded in the bnobt or the refcntbt. 
If the rmapbt + * supports storing multiple entries covering a given block we could + * theoretically dispense with the refcntbt and simply count rmaps, but + * that's inefficient in the (hot) write path, so we'll take the cost of + * the extra tree to save time. Also there's no guarantee that rmap + * will be enabled. + * + * Given an array of rmaps sorted by physical block number, a starting + * physical block (sp), a bag to hold rmaps that cover sp, and the next + * physical block where the level changes (np), we can reconstruct the + * rt refcount btree as follows: + * + * While there are still unprocessed rmaps in the array, + * - Set sp to the physical block (pblk) of the next unprocessed rmap. + * - Add to the bag all rmaps in the array where startblock == sp. + * - Set np to the physical block where the bag size will change. This + * is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * - Record the bag size as old_bag_size. + * + * - While the bag isn't empty, + * - Remove from the bag all rmaps where startblock + len == np. + * - Add to the bag all rmaps in the array where startblock == np. + * - If the bag size isn't old_bag_size, store the refcount entry + * (sp, np - sp, bag_size) in the refcnt btree. + * - If the bag is empty, break out of the inner loop. + * - Set old_bag_size to the bag size + * - Set sp = np. + * - Set np to the physical block where the bag size will change. + * This is the minimum of (the pblk of the next unprocessed rmap) + * and (startblock + len of each rmap in the bag). + * + * Like all the other repairers, we make a list of all the refcount + * records we need, then reinitialize the rt refcount btree root and + * insert all the records. + */ + +struct xrep_rtrefc { + /* refcount extents */ + struct xfarray *refcount_records; + + /* new refcountbt information */ + struct xrep_newbt new_btree; + + /* old refcountbt blocks */ + struct xfsb_bitmap old_rtrefcountbt_blocks; + + struct xfs_scrub *sc; + + /* get_records()'s position in the rt refcount record array. */ + xfarray_idx_t array_cur; + + /* # of refcountbt blocks */ + xfs_filblks_t btblocks; +}; + +/* Set us up to repair refcount btrees. */ +int +xrep_setup_rtrefcountbt( + struct xfs_scrub *sc) +{ + char *descr; + int error; + + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + return error; +} + +/* Check for any obvious conflicts with this shared/CoW staging extent. */ +STATIC int +xrep_rtrefc_check_ext( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *rec) +{ + xfs_rgblock_t last; + + if (xfs_rtrefcount_check_irec(sc->sr.rtg, rec) != NULL) + return -EFSCORRUPTED; + + if (xfs_rgbno_to_rtxoff(sc->mp, rec->rc_startblock) != 0) + return -EFSCORRUPTED; + + last = rec->rc_startblock + rec->rc_blockcount - 1; + if (xfs_rgbno_to_rtxoff(sc->mp, last) != sc->mp->m_sb.sb_rextsize - 1) + return -EFSCORRUPTED; + + /* Make sure this isn't free space or misaligned. */ + return xrep_require_rtext_inuse(sc, rec->rc_startblock, + rec->rc_blockcount); +} + +/* Record a reference count extent. 
*/ +STATIC int +xrep_rtrefc_stash( + struct xrep_rtrefc *rr, + enum xfs_refc_domain domain, + xfs_rgblock_t bno, + xfs_extlen_t len, + uint64_t refcount) +{ + struct xfs_refcount_irec irec = { + .rc_startblock = bno, + .rc_blockcount = len, + .rc_refcount = refcount, + .rc_domain = domain, + }; + int error = 0; + + if (xchk_should_terminate(rr->sc, &error)) + return error; + + irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount); + + error = xrep_rtrefc_check_ext(rr->sc, &irec); + if (error) + return error; + + trace_xrep_refc_found(rtg_group(rr->sc->sr.rtg), &irec); + + return xfarray_append(rr->refcount_records, &irec); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_rtrefc_stash_cow( + struct xrep_rtrefc *rr, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + return xrep_rtrefc_stash(rr, XFS_REFC_DOMAIN_COW, bno, len, 1); +} + +/* Decide if an rmap could describe a shared extent. */ +static inline bool +xrep_rtrefc_rmap_shareable( + const struct xfs_rmap_irec *rmap) +{ + /* rt metadata are never sharable */ + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + + /* Unwritten file blocks are not shareable. */ + if (rmap->rm_flags & XFS_RMAP_UNWRITTEN) + return false; + + return true; +} + +/* Grab the next (abbreviated) rmap record from the rmapbt. */ +STATIC int +xrep_rtrefc_walk_rmaps( + struct xrep_rtrefc *rr, + struct xfs_rmap_irec *rmap, + bool *have_rec) +{ + struct xfs_btree_cur *cur = rr->sc->sr.rmap_cur; + struct xfs_mount *mp = cur->bc_mp; + int have_gt; + int error = 0; + + *have_rec = false; + + /* + * Loop through the remaining rmaps. Remember CoW staging + * extents and the refcountbt blocks from the old tree for later + * disposal. We can only share written data fork extents, so + * keep looping until we find an rmap for one. + */ + do { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + return error; + if (!have_gt) + return 0; + + error = xfs_rmap_get_rec(cur, rmap, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(mp, !have_gt)) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + + if (rmap->rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_rtrefc_stash_cow(rr, rmap->rm_startblock, + rmap->rm_blockcount); + if (error) + return error; + } else if (xfs_is_sb_inum(mp, rmap->rm_owner) || + (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | + XFS_RMAP_BMBT_BLOCK))) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + } while (!xrep_rtrefc_rmap_shareable(rmap)); + + *have_rec = true; + return 0; +} + +static inline uint32_t +xrep_rtrefc_encode_startblock( + const struct xfs_refcount_irec *irec) +{ + uint32_t start; + + start = irec->rc_startblock & ~XFS_REFC_COWFLAG; + if (irec->rc_domain == XFS_REFC_DOMAIN_COW) + start |= XFS_REFC_COWFLAG; + + return start; +} + +/* + * Compare two refcount records. We want to sort in order of increasing block + * number. + */ +static int +xrep_rtrefc_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_refcount_irec *ap = a; + const struct xfs_refcount_irec *bp = b; + uint32_t sa, sb; + + sa = xrep_rtrefc_encode_startblock(ap); + sb = xrep_rtrefc_encode_startblock(bp); + + if (sa > sb) + return 1; + if (sa < sb) + return -1; + return 0; +} + +/* + * Sort the refcount extents by startblock or else the btree records will be in + * the wrong order. Make sure the records do not overlap in physical space. 
+ */ +STATIC int +xrep_rtrefc_sort_records( + struct xrep_rtrefc *rr) +{ + struct xfs_refcount_irec irec; + xfarray_idx_t cur; + enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED; + xfs_rgblock_t next_rgbno = 0; + int error; + + error = xfarray_sort(rr->refcount_records, xrep_rtrefc_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rr->refcount_records, cur) { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfarray_load(rr->refcount_records, cur, &irec); + if (error) + return error; + + if (dom == XFS_REFC_DOMAIN_SHARED && + irec.rc_domain == XFS_REFC_DOMAIN_COW) { + dom = irec.rc_domain; + next_rgbno = 0; + } + + if (dom != irec.rc_domain) + return -EFSCORRUPTED; + if (irec.rc_startblock < next_rgbno) + return -EFSCORRUPTED; + + next_rgbno = irec.rc_startblock + irec.rc_blockcount; + } + + return error; +} + +/* Record extents that belong to the realtime refcount inode. */ +STATIC int +xrep_rtrefc_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rtrefc *rr = priv; + int error = 0; + + if (xchk_should_terminate(rr->sc, &error)) + return error; + + /* Skip extents which are not owned by this inode and fork. */ + if (rec->rm_owner != rr->sc->ip->i_ino) + return 0; + + error = xrep_check_ino_btree_mapping(rr->sc, rec); + if (error) + return error; + + return xfsb_bitmap_set(&rr->old_rtrefcountbt_blocks, + xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock), + rec->rm_blockcount); +} + +/* + * Walk forward through the rmap btree to collect all rmaps starting at + * @bno in @rmap_bag. These represent the file(s) that share ownership of + * the current block. Upon return, the rmap cursor points to the last record + * satisfying the startblock constraint. + */ +static int +xrep_rtrefc_push_rmaps_at( + struct xrep_rtrefc *rr, + struct rcbag *rcstack, + xfs_rgblock_t bno, + struct xfs_rmap_irec *rmap, + bool *have) +{ + struct xfs_scrub *sc = rr->sc; + int have_gt; + int error; + + while (*have && rmap->rm_startblock == bno) { + error = rcbag_add(rcstack, rr->sc->tp, rmap); + if (error) + return error; + + error = xrep_rtrefc_walk_rmaps(rr, rmap, have); + if (error) + return error; + } + + error = xfs_btree_decrement(sc->sr.rmap_cur, 0, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) { + xfs_btree_mark_sick(sc->sr.rmap_cur); + return -EFSCORRUPTED; + } + + return 0; +} + +/* Scan one AG for reverse mappings for the realtime refcount btree. */ +STATIC int +xrep_rtrefc_scan_ag( + struct xrep_rtrefc *rr, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rr->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrefc_walk_rmap, rr); + xchk_ag_free(sc, &sc->sa); + return error; +} + +/* Iterate all the rmap records to generate reference count data. */ +STATIC int +xrep_rtrefc_find_refcounts( + struct xrep_rtrefc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct rcbag *rcstack; + struct xfs_perag *pag = NULL; + uint64_t old_stack_height; + xfs_rgblock_t sbno; + xfs_rgblock_t cbno; + xfs_rgblock_t nbno; + bool have; + int error; + + /* Scan for old rtrefc btree blocks. 
*/ + while ((pag = xfs_perag_next(sc->mp, pag))) { + error = xrep_rtrefc_scan_ag(rr, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + xrep_rtgroup_btcur_init(sc, &sc->sr); + + /* + * Set up a bag to store all the rmap records that we're tracking to + * generate a reference count record. If this exceeds + * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount. + */ + error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); + if (error) + goto out_cur; + + /* Start the rtrmapbt cursor to the left of all records. */ + error = xfs_btree_goto_left_edge(sc->sr.rmap_cur); + if (error) + goto out_bag; + + /* Process reverse mappings into refcount data. */ + while (xfs_btree_has_more_records(sc->sr.rmap_cur)) { + struct xfs_rmap_irec rmap; + + /* Push all rmaps with pblk == sbno onto the stack */ + error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have); + if (error) + goto out_bag; + if (!have) + break; + sbno = cbno = rmap.rm_startblock; + error = xrep_rtrefc_push_rmaps_at(rr, rcstack, sbno, &rmap, + &have); + if (error) + goto out_bag; + + /* Set nbno to the bno of the next refcount change */ + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + old_stack_height = rcbag_count(rcstack); + + /* While stack isn't empty... */ + while (rcbag_count(rcstack) > 0) { + /* Pop all rmaps that end at nbno */ + error = rcbag_remove_ending_at(rcstack, sc->tp, nbno); + if (error) + goto out_bag; + + /* Push array items that start at nbno */ + error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have); + if (error) + goto out_bag; + if (have) { + error = xrep_rtrefc_push_rmaps_at(rr, rcstack, + nbno, &rmap, &have); + if (error) + goto out_bag; + } + + /* Emit refcount if necessary */ + ASSERT(nbno > cbno); + if (rcbag_count(rcstack) != old_stack_height) { + if (old_stack_height > 1) { + error = xrep_rtrefc_stash(rr, + XFS_REFC_DOMAIN_SHARED, + cbno, nbno - cbno, + old_stack_height); + if (error) + goto out_bag; + } + cbno = nbno; + } + + /* Stack empty, go find the next rmap */ + if (rcbag_count(rcstack) == 0) + break; + old_stack_height = rcbag_count(rcstack); + sbno = nbno; + + /* Set nbno to the bno of the next refcount change */ + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, + &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + } + } + + ASSERT(rcbag_count(rcstack) == 0); +out_bag: + rcbag_free(&rcstack); +out_cur: + xchk_rtgroup_btcur_free(&sc->sr); + return error; +} + +/* Retrieve refcountbt data for bulk load. */ +STATIC int +xrep_rtrefc_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xrep_rtrefc *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(rr->refcount_records, rr->array_cur++, + &cur->bc_rec.rc); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_rtrefc_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_rtrefc *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. 
*/ +STATIC size_t +xrep_rtrefc_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + return xfs_rtrefcount_broot_space_calc(cur->bc_mp, level, + nr_this_level); +} + +/* + * Use the collected refcount information to stage a new rt refcount btree. If + * this is successful we'll return with the new btree root information logged + * to the repair transaction but not yet committed. + */ +STATIC int +xrep_rtrefc_build_new_tree( + struct xrep_rtrefc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_btree_cur *refc_cur; + int error; + + error = xrep_rtrefc_sort_records(rr); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the realtime refcount inode. + */ + error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc); + if (error) + return error; + + rr->new_btree.bload.get_records = xrep_rtrefc_get_records; + rr->new_btree.bload.claim_block = xrep_rtrefc_claim_block; + rr->new_btree.bload.iroot_size = xrep_rtrefc_iroot_size; + + refc_cur = xfs_rtrefcountbt_init_cursor(NULL, rtg); + xfs_btree_stage_ifakeroot(refc_cur, &rr->new_btree.ifake); + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(refc_cur, &rr->new_btree.bload, + xfarray_length(rr->refcount_records)); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* + * Guess how many blocks we're going to need to rebuild an entire + * rtrefcountbt from the number of extents we found, and pump up our + * transaction to have sufficient block reservation. We're allowed + * to exceed quota to repair inconsistent metadata, though this is + * unlikely. + */ + error = xfs_trans_reserve_more_inode(sc->tp, rtg_refcount(rtg), + rr->new_btree.bload.nr_blocks, 0, true); + if (error) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* Add all observed refcount records. */ + rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE; + rr->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr); + if (error) + goto err_cur; + + /* + * Install the new rtrefc btree in the inode. After this point the old + * btree is no longer accessible, the new tree is live, and we can + * delete the cursor. + */ + xfs_rtrefcountbt_commit_staged_btree(refc_cur, sc->tp); + xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks); + xfs_btree_del_cursor(refc_cur, 0); + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_trans(sc); +err_cur: + xfs_btree_del_cursor(refc_cur, error); + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_rtrefc_remove_old_tree( + struct xrep_rtrefc *rr) +{ + int error; + + /* + * Free all the extents that were allocated to the former rtrefcountbt + * and aren't cross-linked with something else. 
+ */ + error = xrep_reap_metadir_fsblocks(rr->sc, + &rr->old_rtrefcountbt_blocks); + if (error) + return error; + + /* + * Ensure the proper reservation for the rtrefcount inode so that we + * don't fail to expand the btree. + */ + return xrep_reset_metafile_resv(rr->sc); +} + +/* Rebuild the rt refcount btree. */ +int +xrep_rtrefcountbt( + struct xfs_scrub *sc) +{ + struct xrep_rtrefc *rr; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rtrmapbt(mp)) + return -EOPNOTSUPP; + + /* Make sure any problems with the fork are fixed. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + rr = kzalloc(sizeof(struct xrep_rtrefc), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + rr->sc = sc; + + /* Set up enough storage to handle one refcount record per rt extent. */ + descr = xchk_xfile_ag_descr(sc, "reference count records"); + error = xfarray_create(descr, mp->m_sb.sb_rextents, + sizeof(struct xfs_refcount_irec), + &rr->refcount_records); + kfree(descr); + if (error) + goto out_rr; + + /* Collect all reference counts. */ + xfsb_bitmap_init(&rr->old_rtrefcountbt_blocks); + error = xrep_rtrefc_find_refcounts(rr); + if (error) + goto out_bitmap; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the refcount information. */ + error = xrep_rtrefc_build_new_tree(rr); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_rtrefc_remove_old_tree(rr); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&rr->old_rtrefcountbt_blocks); + xfarray_destroy(rr->refcount_records); +out_rr: + kfree(rr); + return error; +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 6e31f12cef4c..7567dd5cad14 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -472,7 +472,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .setup = xchk_setup_rtrefcountbt, .scrub = xchk_rtrefcountbt, .has = xfs_has_rtreflink, - .repair = xrep_notsupported, + .repair = xrep_rtrefcountbt, }, }; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 289e39d1f418..56811862aa82 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2116,29 +2116,33 @@ TRACE_EVENT(xrep_ibt_found, ) TRACE_EVENT(xrep_refc_found, - TP_PROTO(const struct xfs_perag *pag, + TP_PROTO(const struct xfs_group *xg, const struct xfs_refcount_irec *rec), - TP_ARGS(pag, rec), + TP_ARGS(xg, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, domain) + __field(enum xfs_group_type, type) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) ), TP_fast_assign( - __entry->dev = pag_mount(pag)->m_super->s_dev; - __entry->agno = pag_agno(pag); + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->agno = xg->xg_gno; + __entry->type = xg->xg_type; __entry->domain = rec->rc_domain; __entry->startblock = rec->rc_startblock; __entry->blockcount = rec->rc_blockcount; __entry->refcount = rec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d %sno 0x%x dom %s %sbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) -- cgit v1.2.3 From 92b2019493d114d2fad7219376a2448d9efba4a7 
Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:15 -0800 Subject: xfs: repair inodes that have a refcount btree in the data fork Plumb knowledge of refcount btrees into the inode core repair code. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/inode_repair.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'fs') diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 701baee144a6..2f641b6d663e 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -40,6 +40,7 @@ #include "xfs_symlink_remote.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -970,6 +971,34 @@ xrep_dinode_bad_rtrmapbt_fork( return false; } +/* Return true if this refcount-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_rtrefcountbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size) +{ + struct xfs_rtrefcount_root *dfp; + unsigned int nrecs; + unsigned int level; + + if (dfork_size < sizeof(struct xfs_rtrefcount_root)) + return true; + + dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > sc->mp->m_rtrefc_maxlevels) + return true; + if (xfs_rtrefcount_droot_space_calc(level, nrecs) > dfork_size) + return true; + if (level > 0 && nrecs == 0) + return true; + + return false; +} + /* Check a metadata-btree fork. */ STATIC bool xrep_dinode_bad_metabt_fork( @@ -984,6 +1013,8 @@ xrep_dinode_bad_metabt_fork( switch (be16_to_cpu(dip->di_metatype)) { case XFS_METAFILE_RTRMAP: return xrep_dinode_bad_rtrmapbt_fork(sc, dip, dfork_size); + case XFS_METAFILE_RTREFCOUNT: + return xrep_dinode_bad_rtrefcountbt_fork(sc, dip, dfork_size); default: return true; } @@ -1249,6 +1280,7 @@ xrep_dinode_ensure_forkoff( { struct xfs_bmdr_block *bmdr; struct xfs_rtrmap_root *rmdr; + struct xfs_rtrefcount_root *rcdr; struct xfs_scrub *sc = ri->sc; xfs_extnum_t attr_extents, data_extents; size_t bmdr_minsz = xfs_bmdr_space_calc(1); @@ -1361,6 +1393,10 @@ xrep_dinode_ensure_forkoff( rmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); dfork_min = xfs_rtrmap_broot_space(sc->mp, rmdr); break; + case XFS_METAFILE_RTREFCOUNT: + rcdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + dfork_min = xfs_rtrefcount_broot_space(sc->mp, rcdr); + break; default: dfork_min = 0; break; -- cgit v1.2.3 From 12f4d203289d1b4f59288389fd0349da3a528dee Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:15 -0800 Subject: xfs: check for shared rt extents when rebuilding rt file's data fork When we're rebuilding the data fork of a realtime file, we need to cross-reference each mapping with the rt refcount btree to ensure that the reflink flag is set if there are any shared extents found. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/bmap_repair.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index fd64bdf4e138..1084213b8e9b 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -101,14 +101,21 @@ xrep_bmap_discover_shared( xfs_filblks_t blockcount) { struct xfs_scrub *sc = rb->sc; + struct xfs_btree_cur *cur; xfs_agblock_t agbno; xfs_agblock_t fbno; xfs_extlen_t flen; int error; - agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); - error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount, - &fbno, &flen, false); + if (XFS_IS_REALTIME_INODE(sc->ip)) { + agbno = xfs_rtb_to_rgbno(sc->mp, startblock); + cur = sc->sr.refc_cur; + } else { + agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); + cur = sc->sa.refc_cur; + } + error = xfs_refcount_find_shared(cur, agbno, blockcount, &fbno, &flen, + false); if (error) return error; @@ -450,7 +457,9 @@ xrep_bmap_scan_rtgroup( return 0; error = xrep_rtgroup_init(sc, rtg, &sc->sr, - XFS_RTGLOCK_RMAP | XFS_RTGLOCK_BITMAP_SHARED); + XFS_RTGLOCK_RMAP | + XFS_RTGLOCK_REFCOUNT | + XFS_RTGLOCK_BITMAP_SHARED); if (error) return error; @@ -903,10 +912,6 @@ xrep_bmap_init_reflink_scan( if (whichfork != XFS_DATA_FORK) return RLS_IRRELEVANT; - /* cannot share realtime extents */ - if (XFS_IS_REALTIME_INODE(sc->ip)) - return RLS_IRRELEVANT; - return RLS_UNKNOWN; } -- cgit v1.2.3 From fd97fe1112088c16594dcd45e809f1643e623309 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:16 -0800 Subject: xfs: fix CoW forks for realtime files Port the copy on write fork repair to realtime files. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/agheader_repair.c | 2 +- fs/xfs/scrub/cow_repair.c | 178 +++++++++++++++++++++++++++--- fs/xfs/scrub/reap.c | 242 +++++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/reap.h | 7 ++ fs/xfs/scrub/repair.h | 1 + fs/xfs/scrub/rtb_bitmap.h | 37 +++++++ fs/xfs/scrub/trace.h | 36 +++--- fs/xfs/xfs_rtalloc.c | 4 +- fs/xfs/xfs_rtalloc.h | 5 + 9 files changed, 470 insertions(+), 42 deletions(-) create mode 100644 fs/xfs/scrub/rtb_bitmap.h (limited to 'fs') diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index b45d2b32051a..cd6f0223879f 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -647,7 +647,7 @@ xrep_agfl_fill( xfs_agblock_t agbno = start; int error; - trace_xrep_agfl_insert(sc->sa.pag, agbno, len); + trace_xrep_agfl_insert(pag_group(sc->sa.pag), agbno, len); while (agbno < start + len && af->fl_off < af->flcount) af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++); diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index ba695dd21f8b..38a246b8bf11 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -26,6 +26,9 @@ #include "xfs_errortag.h" #include "xfs_icache.h" #include "xfs_refcount_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -34,6 +37,7 @@ #include "scrub/bitmap.h" #include "scrub/off_bitmap.h" #include "scrub/fsb_bitmap.h" +#include "scrub/rtb_bitmap.h" #include "scrub/reap.h" /* @@ -61,7 +65,10 @@ struct xrep_cow { struct xoff_bitmap bad_fileoffs; /* Bitmap of fsblocks that were removed from the CoW fork. 
*/ - struct xfsb_bitmap old_cowfork_fsblocks; + union { + struct xfsb_bitmap old_cowfork_fsblocks; + struct xrtb_bitmap old_cowfork_rtblocks; + }; /* CoW fork mappings used to scan for bad CoW staging extents. */ struct xfs_bmbt_irec irec; @@ -145,8 +152,7 @@ xrep_cow_mark_shared_staging( xrep_cow_trim_refcount(xc, &rrec, rec); return xrep_cow_mark_file_range(xc, - xfs_agbno_to_fsb(to_perag(cur->bc_group), - rrec.rc_startblock), + xfs_gbno_to_fsb(cur->bc_group, rrec.rc_startblock), rrec.rc_blockcount); } @@ -177,9 +183,8 @@ xrep_cow_mark_missing_staging( if (xc->next_bno >= rrec.rc_startblock) goto next; - error = xrep_cow_mark_file_range(xc, - xfs_agbno_to_fsb(to_perag(cur->bc_group), xc->next_bno), + xfs_gbno_to_fsb(cur->bc_group, xc->next_bno), rrec.rc_startblock - xc->next_bno); if (error) return error; @@ -222,8 +227,7 @@ xrep_cow_mark_missing_staging_rmap( } return xrep_cow_mark_file_range(xc, - xfs_agbno_to_fsb(to_perag(cur->bc_group), rec_bno), - rec_len); + xfs_gbno_to_fsb(cur->bc_group, rec_bno), rec_len); } /* @@ -310,6 +314,92 @@ out_pag: return 0; } +/* + * Find any part of the CoW fork mapping that isn't a single-owner CoW staging + * extent and mark the corresponding part of the file range in the bitmap. + */ +STATIC int +xrep_cow_find_bad_rt( + struct xrep_cow *xc) +{ + struct xfs_refcount_irec rc_low = { 0 }; + struct xfs_refcount_irec rc_high = { 0 }; + struct xfs_rmap_irec rm_low = { 0 }; + struct xfs_rmap_irec rm_high = { 0 }; + struct xfs_scrub *sc = xc->sc; + struct xfs_rtgroup *rtg; + int error = 0; + + xc->irec_startbno = xfs_rtb_to_rgbno(sc->mp, xc->irec.br_startblock); + + rtg = xfs_rtgroup_get(sc->mp, + xfs_rtb_to_rgno(sc->mp, xc->irec.br_startblock)); + if (!rtg) + return -EFSCORRUPTED; + + error = xrep_rtgroup_init(sc, rtg, &sc->sr, + XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT); + if (error) + goto out_rtg; + + /* Mark any CoW fork extents that are shared. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED; + error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_shared_staging, xc); + if (error) + goto out_sr; + + /* Make sure there are CoW staging extents for the whole mapping. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW; + xc->next_bno = xc->irec_startbno; + error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_missing_staging, xc); + if (error) + goto out_sr; + + if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { + error = xrep_cow_mark_file_range(xc, + xfs_rgbno_to_rtb(rtg, xc->next_bno), + xc->irec_startbno + xc->irec.br_blockcount - + xc->next_bno); + if (error) + goto out_sr; + } + + /* Mark any area has an rmap that isn't a COW staging extent. */ + rm_low.rm_startblock = xc->irec_startbno; + memset(&rm_high, 0xFF, sizeof(rm_high)); + rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + error = xfs_rmap_query_range(sc->sr.rmap_cur, &rm_low, &rm_high, + xrep_cow_mark_missing_staging_rmap, xc); + if (error) + goto out_sr; + + /* + * If userspace is forcing us to rebuild the CoW fork or someone + * turned on the debugging knob, replace everything in the + * CoW fork and then scan for staging extents in the refcountbt. 
+ */ + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, + xc->irec.br_blockcount); + if (error) + goto out_rtg; + } + +out_sr: + xchk_rtgroup_btcur_free(&sc->sr); + xchk_rtgroup_free(sc, &sc->sr); +out_rtg: + xfs_rtgroup_put(rtg); + return error; +} + /* * Allocate a replacement CoW staging extent of up to the given number of * blocks, and fill out the mapping. @@ -350,6 +440,32 @@ xrep_cow_alloc( return 0; } +/* + * Allocate a replacement rt CoW staging extent of up to the given number of + * blocks, and fill out the mapping. + */ +STATIC int +xrep_cow_alloc_rt( + struct xfs_scrub *sc, + xfs_extlen_t maxlen, + struct xrep_cow_extent *repl) +{ + xfs_rtxlen_t maxrtx = xfs_rtb_to_rtx(sc->mp, maxlen); + int error; + + error = xfs_trans_reserve_more(sc->tp, 0, maxrtx); + if (error) + return error; + + error = xfs_rtallocate_rtgs(sc->tp, NULLRTBLOCK, 1, maxrtx, 1, false, + false, &repl->fsbno, &repl->len); + if (error) + return error; + + xfs_refcount_alloc_cow_extent(sc->tp, true, repl->fsbno, repl->len); + return 0; +} + /* * Look up the current CoW fork mapping so that we only allocate enough to * replace a single mapping. If we don't find a mapping that covers the start @@ -467,7 +583,10 @@ xrep_cow_replace_range( */ alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN, nextoff - startoff); - error = xrep_cow_alloc(sc, alloc_len, &repl); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrep_cow_alloc_rt(sc, alloc_len, &repl); + else + error = xrep_cow_alloc(sc, alloc_len, &repl); if (error) return error; @@ -483,8 +602,12 @@ xrep_cow_replace_range( return error; /* Note the old CoW staging extents; we'll reap them all later. */ - error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock, - repl.len); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrtb_bitmap_set(&xc->old_cowfork_rtblocks, + got.br_startblock, repl.len); + else + error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, + got.br_startblock, repl.len); if (error) return error; @@ -540,8 +663,16 @@ xrep_bmap_cow( if (!ifp) return 0; - /* realtime files aren't supported yet */ - if (XFS_IS_REALTIME_INODE(sc->ip)) + /* + * Realtime files with large extent sizes are not supported because + * we could encounter an CoW mapping that has been partially written + * out *and* requires replacement, and there's no solution to that. + */ + if (xfs_inode_has_bigrtalloc(sc->ip)) + return -EOPNOTSUPP; + + /* Metadata inodes aren't supposed to have data on the rt volume. */ + if (xfs_is_metadir_inode(sc->ip) && XFS_IS_REALTIME_INODE(sc->ip)) return -EOPNOTSUPP; /* @@ -562,7 +693,10 @@ xrep_bmap_cow( xc->sc = sc; xoff_bitmap_init(&xc->bad_fileoffs); - xfsb_bitmap_init(&xc->old_cowfork_fsblocks); + if (XFS_IS_REALTIME_INODE(sc->ip)) + xrtb_bitmap_init(&xc->old_cowfork_rtblocks); + else + xfsb_bitmap_init(&xc->old_cowfork_fsblocks); for_each_xfs_iext(ifp, &icur, &xc->irec) { if (xchk_should_terminate(sc, &error)) @@ -585,7 +719,10 @@ xrep_bmap_cow( if (xfs_bmap_is_written_extent(&xc->irec)) continue; - error = xrep_cow_find_bad(xc); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrep_cow_find_bad_rt(xc); + else + error = xrep_cow_find_bad(xc); if (error) goto out_bitmap; } @@ -600,13 +737,20 @@ xrep_bmap_cow( * by the refcount btree, not the inode, so it is correct to treat them * like inode metadata. 
*/ - error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, - &XFS_RMAP_OINFO_COW); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrep_reap_rtblocks(sc, &xc->old_cowfork_rtblocks, + &XFS_RMAP_OINFO_COW); + else + error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, + &XFS_RMAP_OINFO_COW); if (error) goto out_bitmap; out_bitmap: - xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); + if (XFS_IS_REALTIME_INODE(sc->ip)) + xrtb_bitmap_destroy(&xc->old_cowfork_rtblocks); + else + xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); xoff_bitmap_destroy(&xc->bad_fileoffs); kfree(xc); return error; diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 534c7696e9a1..b32fb233cf84 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -34,6 +34,8 @@ #include "xfs_attr_remote.h" #include "xfs_defer.h" #include "xfs_metafile.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -41,6 +43,7 @@ #include "scrub/bitmap.h" #include "scrub/agb_bitmap.h" #include "scrub/fsb_bitmap.h" +#include "scrub/rtb_bitmap.h" #include "scrub/reap.h" /* @@ -311,7 +314,7 @@ xreap_agextent_binval( } out: - trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp); + trace_xreap_agextent_binval(pag_group(sc->sa.pag), agbno, *aglenp); } /* @@ -370,7 +373,8 @@ xreap_agextent_select( out_found: *aglenp = len; - trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked); + trace_xreap_agextent_select(pag_group(sc->sa.pag), agbno, len, + *crosslinked); out_cur: xfs_btree_del_cursor(cur, error); return error; @@ -409,7 +413,8 @@ xreap_agextent_iter( * to run xfs_repair. */ if (crosslinked) { - trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); + trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno, + *aglenp); rs->force_roll = true; @@ -428,7 +433,7 @@ xreap_agextent_iter( *aglenp, rs->oinfo); } - trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp); + trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp); /* * Invalidate as many buffers as we can, starting at agbno. If this @@ -679,6 +684,225 @@ xrep_reap_fsblocks( return 0; } +#ifdef CONFIG_XFS_RT +/* + * Figure out the longest run of blocks that we can dispose of with a single + * call. Cross-linked blocks should have their reverse mappings removed, but + * single-owner extents can be freed. Units are rt blocks, not rt extents. + */ +STATIC int +xreap_rgextent_select( + struct xreap_state *rs, + xfs_rgblock_t rgbno, + xfs_rgblock_t rgbno_next, + bool *crosslinked, + xfs_extlen_t *rglenp) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_btree_cur *cur; + xfs_rgblock_t bno = rgbno + 1; + xfs_extlen_t len = 1; + int error; + + /* + * Determine if there are any other rmap records covering the first + * block of this extent. If so, the block is crosslinked. + */ + cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg); + error = xfs_rmap_has_other_keys(cur, rgbno, 1, rs->oinfo, + crosslinked); + if (error) + goto out_cur; + + /* + * Figure out how many of the subsequent blocks have the same crosslink + * status. 
+ */ + while (bno < rgbno_next) { + bool also_crosslinked; + + error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo, + &also_crosslinked); + if (error) + goto out_cur; + + if (*crosslinked != also_crosslinked) + break; + + len++; + bno++; + } + + *rglenp = len; + trace_xreap_agextent_select(rtg_group(sc->sr.rtg), rgbno, len, + *crosslinked); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Dispose of as much of the beginning of this rtgroup extent as possible. + * The number of blocks disposed of will be returned in @rglenp. + */ +STATIC int +xreap_rgextent_iter( + struct xreap_state *rs, + xfs_rgblock_t rgbno, + xfs_extlen_t *rglenp, + bool crosslinked) +{ + struct xfs_scrub *sc = rs->sc; + xfs_rtblock_t rtbno; + int error; + + /* + * The only caller so far is CoW fork repair, so we only know how to + * unlink or free CoW staging extents. Here we don't have to worry + * about invalidating buffers! + */ + if (rs->oinfo != &XFS_RMAP_OINFO_COW) { + ASSERT(rs->oinfo == &XFS_RMAP_OINFO_COW); + return -EFSCORRUPTED; + } + ASSERT(rs->resv == XFS_AG_RESV_NONE); + + rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno); + + /* + * If there are other rmappings, this block is cross linked and must + * not be freed. Remove the forward and reverse mapping and move on. + */ + if (crosslinked) { + trace_xreap_dispose_unmap_extent(rtg_group(sc->sr.rtg), rgbno, + *rglenp); + + xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); + rs->deferred++; + return 0; + } + + trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp); + + /* + * The CoW staging extent is not crosslinked. Use deferred work items + * to remove the refcountbt records (which removes the rmap records) + * and free the extent. We're not worried about the system going down + * here because log recovery walks the refcount btree to clean out the + * CoW staging extents. + */ + xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); + error = xfs_free_extent_later(sc->tp, rtbno, *rglenp, NULL, + rs->resv, + XFS_FREE_EXTENT_REALTIME | + XFS_FREE_EXTENT_SKIP_DISCARD); + if (error) + return error; + + rs->deferred++; + return 0; +} + +#define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ + XFS_RTGLOCK_RMAP | \ + XFS_RTGLOCK_REFCOUNT) + +/* + * Break a rt file metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. The extent must + * be aligned to a realtime extent. + */ +STATIC int +xreap_rtmeta_extent( + uint64_t rtbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, rtbno); + xfs_rgblock_t rgbno_next = rgbno + len; + int error = 0; + + ASSERT(sc->ip != NULL); + ASSERT(!sc->sr.rtg); + + /* + * We're reaping blocks after repairing file metadata, which means that + * we have to init the xchk_ag structure ourselves. 
+ */ + sc->sr.rtg = xfs_rtgroup_get(sc->mp, xfs_rtb_to_rgno(sc->mp, rtbno)); + if (!sc->sr.rtg) + return -EFSCORRUPTED; + + xfs_rtgroup_lock(sc->sr.rtg, XREAP_RTGLOCK_ALL); + + while (rgbno < rgbno_next) { + xfs_extlen_t rglen; + bool crosslinked; + + error = xreap_rgextent_select(rs, rgbno, rgbno_next, + &crosslinked, &rglen); + if (error) + goto out_unlock; + + error = xreap_rgextent_iter(rs, rgbno, &rglen, crosslinked); + if (error) + goto out_unlock; + + if (xreap_want_defer_finish(rs)) { + error = xfs_defer_finish(&sc->tp); + if (error) + goto out_unlock; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + error = xfs_trans_roll_inode(&sc->tp, sc->ip); + if (error) + goto out_unlock; + xreap_reset(rs); + } + + rgbno += rglen; + } + +out_unlock: + xfs_rtgroup_unlock(sc->sr.rtg, XREAP_RTGLOCK_ALL); + xfs_rtgroup_put(sc->sr.rtg); + sc->sr.rtg = NULL; + return error; +} + +/* + * Dispose of every block of every rt metadata extent in the bitmap. + * Do not use this to dispose of the mappings in an ondisk inode fork. + */ +int +xrep_reap_rtblocks( + struct xfs_scrub *sc, + struct xrtb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + + error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} +#endif /* CONFIG_XFS_RT */ + /* * Dispose of every block of an old metadata btree that used to be rooted in a * metadata directory file. @@ -771,7 +995,8 @@ xreap_bmapi_select( } imap->br_blockcount = len; - trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked); + trace_xreap_bmapi_select(pag_group(sc->sa.pag), agbno, len, + *crosslinked); out_cur: xfs_btree_del_cursor(cur, error); return error; @@ -910,7 +1135,8 @@ xreap_bmapi_binval( } out: - trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount); + trace_xreap_bmapi_binval(pag_group(sc->sa.pag), agbno, + imap->br_blockcount); return 0; } @@ -937,7 +1163,7 @@ xrep_reap_bmapi_iter( * anybody else who thinks they own the block, even though that * runs the risk of stale buffer warnings in the future. */ - trace_xreap_dispose_unmap_extent(sc->sa.pag, + trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), imap->br_blockcount); @@ -960,7 +1186,7 @@ xrep_reap_bmapi_iter( * by a block starting before the first block of the extent but overlap * anyway. */ - trace_xreap_dispose_free_extent(sc->sa.pag, + trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), imap->br_blockcount); diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index 70e5e6bbb8d3..4c8f62701fb3 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -17,6 +17,13 @@ int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork); int xrep_reap_metadir_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap); +#ifdef CONFIG_XFS_RT +int xrep_reap_rtblocks(struct xfs_scrub *sc, struct xrtb_bitmap *bitmap, + const struct xfs_owner_info *oinfo); +#else +# define xrep_reap_rtblocks(...) (-EOPNOTSUPP) +#endif /* CONFIG_XFS_RT */ + /* Buffer cache scan context. */ struct xrep_bufscan { /* Disk address for the buffers we want to scan. 
*/ diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 8f8f18b48a44..823c00d1a502 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -52,6 +52,7 @@ struct xbitmap; struct xagb_bitmap; struct xrgb_bitmap; struct xfsb_bitmap; +struct xrtb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags); diff --git a/fs/xfs/scrub/rtb_bitmap.h b/fs/xfs/scrub/rtb_bitmap.h new file mode 100644 index 000000000000..1313ef605511 --- /dev/null +++ b/fs/xfs/scrub/rtb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_RTB_BITMAP_H__ +#define __XFS_SCRUB_RTB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_rtblock_t */ + +struct xrtb_bitmap { + struct xbitmap64 rtbitmap; +}; + +static inline void xrtb_bitmap_init(struct xrtb_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->rtbitmap); +} + +static inline void xrtb_bitmap_destroy(struct xrtb_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->rtbitmap); +} + +static inline int xrtb_bitmap_set(struct xrtb_bitmap *bitmap, + xfs_rtblock_t start, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->rtbitmap, start, len); +} + +static inline int xrtb_bitmap_walk(struct xrtb_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->rtbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_RTB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 56811862aa82..d7c4ced47c15 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1964,32 +1964,36 @@ DEFINE_XCHK_METAPATH_EVENT(xchk_metapath_lookup); #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) DECLARE_EVENT_CLASS(xrep_extent_class, - TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, xfs_extlen_t len), - TP_ARGS(pag, agbno, len), + TP_ARGS(xg, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) ), TP_fast_assign( - __entry->dev = pag_mount(pag)->m_super->s_dev; - __entry->agno = pag_agno(pag); + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len) ); #define DEFINE_REPAIR_EXTENT_EVENT(name) \ DEFINE_EVENT(xrep_extent_class, name, \ - TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \ + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ xfs_extlen_t len), \ - TP_ARGS(pag, agbno, len)) + TP_ARGS(xg, agbno, len)) DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); @@ -1997,35 +2001,39 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); DECLARE_EVENT_CLASS(xrep_reap_find_class, - TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, xfs_extlen_t len, bool crosslinked), - TP_ARGS(pag, agbno, len, crosslinked), + TP_ARGS(xg, agbno, len, crosslinked), TP_STRUCT__entry( __field(dev_t, 
dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) __field(bool, crosslinked) ), TP_fast_assign( - __entry->dev = pag_mount(pag)->m_super->s_dev; - __entry->agno = pag_agno(pag); + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; __entry->crosslinked = crosslinked; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d", + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x crosslinked %d", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len, __entry->crosslinked ? 1 : 0) ); #define DEFINE_REPAIR_REAP_FIND_EVENT(name) \ DEFINE_EVENT(xrep_reap_find_class, name, \ - TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \ + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ xfs_extlen_t len, bool crosslinked), \ - TP_ARGS(pag, agbno, len, crosslinked)) + TP_ARGS(xg, agbno, len, crosslinked)) DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select); DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index a5de5405800a..7135a6717a9e 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -594,7 +594,7 @@ xfs_rtalloc_sumlevel( * specified. If we don't get maxlen then use prod to trim * the length, if given. The lengths are all in rtextents. */ -STATIC int +static int xfs_rtallocate_extent_size( struct xfs_rtalloc_args *args, xfs_rtxlen_t minlen, /* minimum length to allocate */ @@ -1958,7 +1958,7 @@ out_unlock: goto out_release; } -static int +int xfs_rtallocate_rtgs( struct xfs_trans *tp, xfs_fsblock_t bno_hint, diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 9044f7226ab6..0d95b29092c9 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -77,4 +77,9 @@ xfs_growfs_check_rtgeom(const struct xfs_mount *mp, } #endif /* CONFIG_XFS_RT */ +int xfs_rtallocate_rtgs(struct xfs_trans *tp, xfs_fsblock_t bno_hint, + xfs_rtxlen_t minlen, xfs_rtxlen_t maxlen, xfs_rtxlen_t prod, + bool wasdel, bool initial_user_data, xfs_rtblock_t *bno, + xfs_extlen_t *blen); + #endif /* __XFS_RTALLOC_H__ */ -- cgit v1.2.3 From 155debbe7e627baf6439e75f15856d40ec115c5d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 20 Nov 2024 16:21:17 -0800 Subject: xfs: enable realtime reflink Enable reflink for realtime devices, as long as the realtime allocation unit is a single fsblock. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_reflink.c | 25 +++++++++++++++++++++++++ fs/xfs/xfs_reflink.h | 2 ++ fs/xfs/xfs_rtalloc.c | 7 +++++-- fs/xfs/xfs_super.c | 6 ++++-- 4 files changed, 36 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index d9b33e22c176..59f7fc16eb80 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1822,3 +1822,28 @@ out: trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; } + +/* + * Can we use reflink with this realtime extent size? Note that we don't check + * for rblocks > 0 here because this can be called as part of attaching a new + * rt section. 
+ */ +bool +xfs_reflink_supports_rextsize( + struct xfs_mount *mp, + unsigned int rextsize) +{ + /* reflink on the realtime device requires rtgroups */ + if (!xfs_has_rtgroups(mp)) + return false; + + /* + * Reflink doesn't support rt extent size larger than a single fsblock + * because we would have to perform CoW-around for unaligned write + * requests to guarantee that we always remap entire rt extents. + */ + if (rextsize != 1) + return false; + + return true; +} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 3bfd7ab9e114..cc4e92278279 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -62,4 +62,6 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, unsigned int remap_flags); +bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize); + #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 7135a6717a9e..d8e6d073d64d 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -32,6 +32,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_rtrefcount_btree.h" +#include "xfs_reflink.h" /* * Return whether there are any free extents in the size range given @@ -1292,8 +1293,10 @@ xfs_growfs_rt( goto out_unlock; if (xfs_has_quota(mp)) goto out_unlock; - } - if (xfs_has_reflink(mp)) + if (xfs_has_reflink(mp)) + goto out_unlock; + } else if (xfs_has_reflink(mp) && + !xfs_reflink_supports_rextsize(mp, in->extsize)) goto out_unlock; error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ecd5a9f444d8..7c3f996cd39e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1754,9 +1754,11 @@ xfs_fs_fill_super( xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); if (xfs_has_reflink(mp)) { - if (mp->m_sb.sb_rblocks) { + if (xfs_has_realtime(mp) && + !xfs_reflink_supports_rextsize(mp, mp->m_sb.sb_rextsize)) { xfs_alert(mp, - "reflink not compatible with realtime device!"); + "reflink not compatible with realtime extent size %u!", + mp->m_sb.sb_rextsize); error = -EINVAL; goto out_filestream_unmount; } -- cgit v1.2.3 From fca432e73db2bec0fdbfbf6d98d3ebcd5388a977 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 18 Dec 2024 17:00:56 +1030 Subject: btrfs: sysfs: fix direct super block member reads The following sysfs entries are reading super block member directly, which can have a different endian and cause wrong values: - sys/fs/btrfs//nodesize - sys/fs/btrfs//sectorsize - sys/fs/btrfs//clone_alignment Thankfully those values (nodesize and sectorsize) are always aligned inside the btrfs_super_block, so it won't trigger unaligned read errors, just endian problems. Fix them by using the native cached members instead. 
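As a rough illustration (a sketch only, not the btrfs accessor helpers): the on-disk
super block stores these fields as little-endian values, so a raw read of the member
is only correct on little-endian hosts, while the cached btrfs_fs_info fields used by
the fix are already in CPU byte order.

	__le32 raw = fs_info->super_copy->nodesize;	/* on-disk little-endian value */
	u32 bogus  = (u32)raw;				/* byte-swapped on big-endian CPUs */
	u32 fixed  = le32_to_cpu(raw);			/* explicit conversion also works */
	u32 cached = fs_info->nodesize;			/* native cached copy, as used here */
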
Fixes: df93589a1737 ("btrfs: export more from FS_INFO to sysfs") CC: stable@vger.kernel.org Reviewed-by: Naohiro Aota Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index fdcbf650ac31..7f09b6c9cc2d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1118,7 +1118,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); + return sysfs_emit(buf, "%u\n", fs_info->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -1128,7 +1128,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -1180,7 +1180,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); -- cgit v1.2.3 From d0855e210675b8018f4e89ca77cbfa133bce3a71 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 04:03:32 -0500 Subject: bcachefs: Kill snapshot_t->equiv Now entirely dead code. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 108 +++++------------------------------------- fs/bcachefs/snapshot.h | 15 ------ fs/bcachefs/subvolume_types.h | 1 - 3 files changed, 13 insertions(+), 111 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 0d60251946f1..8e6d85504efb 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -280,23 +280,6 @@ fsck_err: return ret; } -static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) -{ - struct snapshot_t *t = snapshot_t_mut(c, id); - u32 parent = id; - - while ((parent = bch2_snapshot_parent_early(c, parent)) && - parent - id - 1 < IS_ANCESTOR_BITMAP) - __set_bit(parent - id - 1, t->is_ancestor); -} - -static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) -{ - mutex_lock(&c->snapshot_table_lock); - __set_is_ancestor_bitmap(c, id); - mutex_unlock(&c->snapshot_table_lock); -} - static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -337,7 +320,11 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, t->skip[2] = 0; } - __set_is_ancestor_bitmap(c, id); + u32 parent = id; + + while ((parent = bch2_snapshot_parent_early(c, parent)) && + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); if (BCH_SNAPSHOT_DELETED(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); @@ -367,70 +354,6 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, BTREE_ITER_with_updates, snapshot, s); } -static int bch2_snapshot_live(struct btree_trans *trans, u32 id) -{ - struct bch_snapshot v; - int ret; - - if (!id) - return 0; - - ret = bch2_snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot node %u not found", id); - if (ret) - return ret; - - return !BCH_SNAPSHOT_DELETED(&v); -} - -/* - * If @k is a snapshot with just one live 
child, it's part of a linear chain, - * which we consider to be an equivalence class: and then after snapshot - * deletion cleanup, there should only be a single key at a given position in - * this equivalence class. - * - * This sets the equivalence class of @k to be the child's equivalence class, if - * it's part of such a linear chain: this correctly sets equivalence classes on - * startup if we run leaf to root (i.e. in natural key order). - */ -static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - unsigned i, nr_live = 0, live_idx = 0; - struct bkey_s_c_snapshot snap; - u32 id = k.k->p.offset, child[2]; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - snap = bkey_s_c_to_snapshot(k); - - child[0] = le32_to_cpu(snap.v->children[0]); - child[1] = le32_to_cpu(snap.v->children[1]); - - for (i = 0; i < 2; i++) { - int ret = bch2_snapshot_live(trans, child[i]); - - if (ret < 0) - return ret; - - if (ret) - live_idx = i; - nr_live += ret; - } - - mutex_lock(&c->snapshot_table_lock); - - snapshot_t_mut(c, id)->equiv = nr_live == 1 - ? snapshot_t_mut(c, child[live_idx])->equiv - : id; - - mutex_unlock(&c->snapshot_table_lock); - - return 0; -} - /* fsck: */ static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) @@ -964,8 +887,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: - bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i)); + bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); } /* Figure out which snapshot nodes belong in the same tree: */ @@ -1309,10 +1231,6 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, goto err; new_snapids[i] = iter.pos.offset; - - mutex_lock(&c->snapshot_table_lock); - snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i]; - mutex_unlock(&c->snapshot_table_lock); } err: bch2_trans_iter_exit(trans, &iter); @@ -1774,15 +1692,15 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct int bch2_snapshots_read(struct bch_fs *c) { + /* + * Initializing the is_ancestor bitmaps requires ancestors to already be + * initialized - so mark in reverse: + */ int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, + for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots, + POS_MAX, 0, k, __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(trans, k) ?: - bch2_check_snapshot_needs_deletion(trans, k)) ?: - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); + bch2_check_snapshot_needs_deletion(trans, k))); bch_err_fn(c, ret); /* diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 3ff0ffa774f5..00373cf32e7b 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -134,21 +134,6 @@ static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) return ret; } -static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - return s ? 
s->equiv : 0; -} - -static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) -{ - rcu_read_lock(); - id = __bch2_snapshot_equiv(c, id); - rcu_read_unlock(); - - return id; -} - static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { rcu_read_lock(); diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 8a7f7e87c381..1549d6daf7af 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -16,7 +16,6 @@ struct snapshot_t { u32 children[2]; u32 subvol; /* Nonzero only if a subvolume points to this node: */ u32 tree; - u32 equiv; unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; }; -- cgit v1.2.3 From 54c9b92fc7c0ffe7662eeb7f4874f885a15e5d1a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 00:44:28 -0500 Subject: bcachefs: bch2_trans_log_msg() Export a helper for logging to the journal when we're already in a transaction context. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 13 ++++++++++--- fs/bcachefs/btree_update.h | 1 + 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 06fd5aa62296..a4b70e3fe4c3 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -823,10 +823,17 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, return bch2_trans_update_buffered(trans, btree, &k); } -static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s) +int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) { + unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); + prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); + + int ret = buf->allocation_failure ? 
-BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + return ret; + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); - int ret = PTR_ERR_OR_ZERO(e); + ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; @@ -862,7 +869,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, - __bch2_trans_log_msg(trans, &buf, u64s)); + bch2_trans_log_msg(trans, &buf)); } err: printbuf_exit(&buf); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 58df20194306..8f22ef9a7651 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -159,6 +159,7 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *, unsigned); +int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); __printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); __printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); -- cgit v1.2.3 From 17d678bcdd839be6e85cc637b9ac9e672be9a270 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 04:00:40 -0500 Subject: bcachefs: Log message in journal for snapshot deletion Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 8e6d85504efb..1506445eaaf4 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1568,6 +1568,22 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (!delete_leaves.nr && !delete_interior.nr) goto err; + { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "deleting leaves"); + darray_for_each(delete_leaves, i) + prt_printf(&buf, " %u", *i); + + prt_printf(&buf, " interior"); + darray_for_each(delete_interior, i) + prt_printf(&buf, " %u->%u", i->id, i->live_child); + + ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); + printbuf_exit(&buf); + if (ret) + goto err; + } + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { struct disk_reservation res = { 0 }; -- cgit v1.2.3 From 07c1a6fa901dc478a2685614399dedece910d6f5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Dec 2024 05:43:00 -0500 Subject: bcachefs: trace_key_cache_fill Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 10 ++++++++++ fs/bcachefs/trace.h | 27 ++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 3bd40ea0fa3d..4eba2871f289 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -309,6 +309,16 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, ret = btree_key_cache_create(trans, ck_path, k); if (ret) goto err; + + if (trace_key_cache_fill_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, ck_path->pos); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, trans->c, k); + trace_key_cache_fill(trans, buf.buf); + printbuf_exit(&buf); + } out: /* We're not likely to need this iterator again: */ bch2_set_btree_iter_dontneed(&iter); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 11e6547f91d6..9d40b7d4ea29 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1338,6 +1338,12 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, __entry->new_u64s) ); 
+DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + TRACE_EVENT(path_downgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1374,10 +1380,21 @@ TRACE_EVENT(path_downgrade, __entry->pos_snapshot) ); -DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) +TRACE_EVENT(key_cache_fill, + TP_PROTO(struct btree_trans *trans, const char *key), + TP_ARGS(trans, key), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __string(key, key ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __assign_str(key); + ), + + TP_printk("%s %s", __entry->trans_fn, __get_str(key)) ); TRACE_EVENT(write_buffer_flush, @@ -1443,7 +1460,7 @@ TRACE_EVENT(write_buffer_maybe_flush, TP_STRUCT__entry( __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) - __string(key, key ) + __string(key, key ) ), TP_fast_assign( -- cgit v1.2.3 From 6679e363f44121ec07e9daeb0c78464df410bc47 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Dec 2024 05:58:34 -0500 Subject: bcachefs: bch2_btree_path_peek_slot() doesn't return errors Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b27944b62087..a1c5fcced24e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2229,14 +2229,15 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); - if (k.k && !bkey_err(k)) { - if ((iter->flags & BTREE_ITER_all_snapshots) && - !bpos_eq(pos, k.k->p)) - return bkey_s_c_null; + if (!k.k) + return k; - iter->k = u; - k.k = &iter->k; - } + if ((iter->flags & BTREE_ITER_all_snapshots) && + !bpos_eq(pos, k.k->p)) + return bkey_s_c_null; + + iter->k = u; + k.k = &iter->k; return k; } -- cgit v1.2.3 From ebdca072683844e04fe8bd31f64745554b9119d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Nov 2024 23:53:07 -0500 Subject: bcachefs: bcachefs_metadata_version_backpointer_bucket_gen New on disk format version: backpointers new include the generation number of the bucket they refer to, and the obsolete bucket_offset field (no longer needed because we no longer store backpointers in alloc keys) is gone. This is an expensive forced upgrade - hopefully the last; we have to run the extents_to_backpointers recovery pass to regenerate backpointers. It's a forced incompatible upgrade because the alternative would've been permamently making backpointers bigger, and as one of the biggest btrees (along with the extents btree) that's not an ideal option. It's worth it though, because this allows us to make the check_extents_to_backpointers pass drastically cheaper: an upcoming patch changes it to sum up backpointers in a bucket and check the sum against the sector counts for that bucket, only looking for missing backpointers if they don't match (and then only for specific buckets). 
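A rough sketch of the cheaper check described above (hypothetical iterator and helper
names; the real implementation lands in a later patch):

	/* Sum the backpointer lengths charged to one bucket and compare with
	 * the allocator's sector count; only fall back to the expensive
	 * extent-by-extent search when the totals disagree. */
	u64 bp_sectors = 0;

	for_each_bucket_backpointer(ca, bucket, bp)		/* hypothetical */
		bp_sectors += bp->bucket_len;

	if (bp_sectors != bucket_sectors(ca, bucket))		/* hypothetical */
		ret = check_bucket_backpointers_slowpath(trans, ca, bucket);
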
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 27 +++++---------------------- fs/bcachefs/backpointers.h | 2 +- fs/bcachefs/bcachefs_format.h | 6 ++++-- fs/bcachefs/sb-downgrade.c | 15 +++++++++++++-- 4 files changed, 23 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 0e3b7b5d626e..b19719b02df8 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -28,23 +28,6 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, c, backpointer_dev_bad, "backpointer for BCH_SB_MEMBER_INVALID"); - - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); - if (!ca) { - /* these will be caught by fsck */ - rcu_read_unlock(); - return 0; - } - - struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p); - struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset); - rcu_read_unlock(); - - bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || - !bpos_eq(bp.k->p, bp_pos), - c, backpointer_bucket_offset_wrong, - "backpointer bucket_offset wrong (%llu)", (u64) bp.v->bucket_offset); fsck_err: return ret; } @@ -59,16 +42,17 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke u32 bucket_offset; struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); rcu_read_unlock(); - prt_printf(out, "bucket=%llu:%llu:%u", bucket.inode, bucket.offset, bucket_offset); + prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); } else { rcu_read_unlock(); - prt_printf(out, "sector=%llu:%llu", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); + prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); - prt_printf(out, " suboffset=%u len=%u pos=", + prt_printf(out, " suboffset=%u len=%u gen=%u pos=", (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), - bp.v->bucket_len); + bp.v->bucket_len, + bp.v->bucket_gen); bch2_bpos_to_text(out, bp.v->pos); } @@ -76,7 +60,6 @@ void bch2_backpointer_swab(struct bkey_s k) { struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); - bp.v->bucket_offset = swab40(bp.v->bucket_offset); bp.v->bucket_len = swab32(bp.v->bucket_len); bch2_bpos_swab(&bp.v->pos); } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 95caeabb8978..caffc68407ab 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -158,7 +158,7 @@ static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, .btree_id = btree_id, .level = level, .data_type = bch2_bkey_ptr_data_type(k, p, entry), - .bucket_offset = bp_bucket_offset, + .bucket_gen = p.ptr.gen, .bucket_len = sectors, .pos = k.k->p, }; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index dc14bfe37e3b..e4bb74d6f439 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -463,7 +463,8 @@ struct bch_backpointer { __u8 btree_id; __u8 level; __u8 data_type; - __u64 bucket_offset:40; + __u8 bucket_gen; + __u32 pad; __u32 bucket_len; struct bpos pos; } __packed __aligned(8); @@ -677,7 +678,8 @@ struct bch_sb_field_ext { x(disk_accounting_v3, BCH_VERSION(1, 10)) \ x(disk_accounting_inum, BCH_VERSION(1, 11)) \ x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ - 
x(inode_has_child_snapshots, BCH_VERSION(1, 13)) + x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \ + x(backpointer_bucket_gen, BCH_VERSION(1, 14)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 8767c33c2b51..9879845413a6 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -81,7 +81,11 @@ BCH_FSCK_ERR_accounting_mismatch) \ x(inode_has_child_snapshots, \ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ - BCH_FSCK_ERR_inode_has_child_snapshots_wrong) + BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \ + x(backpointer_bucket_gen, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_backpointer_to_missing_ptr, \ + BCH_FSCK_ERR_ptr_to_missing_backpointer) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ @@ -117,7 +121,14 @@ BCH_FSCK_ERR_bkey_version_in_future) \ x(rebalance_work_acct_fix, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) \ + x(backpointer_bucket_gen, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \ + BCH_FSCK_ERR_backpointer_to_missing_ptr, \ + BCH_FSCK_ERR_ptr_to_missing_backpointer) struct upgrade_downgrade_entry { u64 recovery_passes; -- cgit v1.2.3 From ba9752e5f43637096d636abb934b8172fc23ea62 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2024 17:41:43 -0500 Subject: bcachefs: bcachefs_metadata_version_disk_accounting_big_endian Fix sort order for disk accounting keys, in order to fix a regression on mount times. The typetag is now the most significant byte of the key, meaning disk accounting keys of the same type now sort together. This lets us skip over disk accounting keys that aren't mirrored in memory when reading accounting at startup, instead of having them interleaved with other counter types. 
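Roughly, the trick works like this (illustrative sketch; see the accessors changed
below): the type tag is the first byte of struct disk_accounting_pos, and packing the
struct into a struct bpos with its bytes reversed makes that tag the most significant
byte of the key, so btree key order now groups each accounting type into one
contiguous range.

	struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_replicas };
	struct bpos p;

	/* little-endian host: byte-reverse so acc.type becomes the key's MSB */
	memcpy_swab(&p, &acc, sizeof(p));
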
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/disk_accounting.c | 22 +++++++++++++++++----- fs/bcachefs/disk_accounting.h | 18 +++++++++++------- fs/bcachefs/sb-downgrade.c | 14 ++++++++++++-- fs/bcachefs/util.h | 9 +++++++++ 5 files changed, 51 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index e4bb74d6f439..cef22c15c256 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -679,7 +679,8 @@ struct bch_sb_field_ext { x(disk_accounting_inum, BCH_VERSION(1, 11)) \ x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \ - x(backpointer_bucket_gen, BCH_VERSION(1, 14)) + x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ + x(disk_accounting_big_endian, BCH_VERSION(1, 15)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 22a7db63e50c..72c8dcb9226f 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -698,8 +698,11 @@ int bch2_accounting_read(struct bch_fs *c) percpu_memset(c->usage, 0, sizeof(*c->usage)); percpu_up_write(&c->mark_lock); - int ret = for_each_btree_key(trans, iter, - BTREE_ID_accounting, POS_MIN, + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + iter.flags &= ~BTREE_ITER_with_journal; + int ret = for_each_btree_key_continue(trans, iter, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); @@ -710,8 +713,14 @@ int bch2_accounting_read(struct bch_fs *c) struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); - if (!bch2_accounting_is_mem(acc_k)) + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + break; + + if (!bch2_accounting_is_mem(acc_k)) { + struct disk_accounting_pos next = { .type = acc_k.type + 1 }; + bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); continue; + } accounting_read_key(trans, k); })); @@ -896,10 +905,13 @@ void bch2_verify_accounting_clean(struct bch_fs *c) bpos_to_disk_accounting_pos(&acc_k, k.k->p); if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - continue; + break; - if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + if (!bch2_accounting_is_mem(acc_k)) { + struct disk_accounting_pos next = { .type = acc_k.type + 1 }; + bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); continue; + } bch2_accounting_mem_read(c, k.k->p, v, nr); diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 2560de10b09d..fc1b673689c8 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -63,20 +63,24 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) { - acc->_pad = p; + BUILD_BUG_ON(sizeof(*acc) != sizeof(p)); + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - bch2_bpos_swab(&acc->_pad); + acc->_pad = p; +#else + memcpy_swab(acc, &p, sizeof(p)); #endif } -static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *k) +static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc) { - struct bpos ret = k->_pad; - + struct bpos p; #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - bch2_bpos_swab(&ret); + p = acc->_pad; 
+#else + memcpy_swab(&p, acc, sizeof(p)); #endif - return ret; + return p; } int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 9879845413a6..051214fdc735 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -85,7 +85,12 @@ x(backpointer_bucket_gen, \ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ BCH_FSCK_ERR_backpointer_to_missing_ptr, \ - BCH_FSCK_ERR_ptr_to_missing_backpointer) + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(disk_accounting_big_endian, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ @@ -128,7 +133,12 @@ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \ BCH_FSCK_ERR_backpointer_to_missing_ptr, \ - BCH_FSCK_ERR_ptr_to_missing_backpointer) + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(disk_accounting_big_endian, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) struct upgrade_downgrade_entry { u64 recovery_passes; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 5e4820c8fa44..c292b9ce8240 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -709,4 +709,13 @@ static inline bool test_bit_le64(size_t bit, __le64 *addr) return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; } +static inline void memcpy_swab(void *_dst, void *_src, size_t len) +{ + u8 *dst = _dst + len; + u8 *src = _src; + + while (len--) + *--dst = *src++; +} + #endif /* _BCACHEFS_UTIL_H */ -- cgit v1.2.3 From aca7a26f7f47f0290e00250c6ad53a7814584159 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 23:58:21 -0500 Subject: bcachefs: bch2_extent_ptr_to_bp() no longer depends on device bch_backpointer no longer contains the bucket_offset field, it's just a direct LBA mapping (with low bits to account for compressed extent splitting), so we don't need to refer to the device to construct it anymore. 
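As a rough standalone sketch (not bcachefs code), the new mapping can be pictured like this: the backpointer key is built from the extent pointer alone, the device index plus the pointer's offset shifted up to leave low bits for the position within a compressed extent. The struct, helper name and shift value below are assumptions made for illustration; the real definitions are MAX_EXTENT_COMPRESS_RATIO_SHIFT and bch2_extent_ptr_to_bp() in the diff that follows.

#include <stdint.h>
#include <stdio.h>

/* Assumed value for illustration only; bcachefs defines MAX_EXTENT_COMPRESS_RATIO_SHIFT itself. */
#define COMPRESS_RATIO_SHIFT 4

struct bp_pos {
	uint32_t dev;
	uint64_t offset;
};

/*
 * The backpointer key is derived purely from the extent pointer:
 * (device, LBA << shift + offset into the possibly-compressed extent),
 * so no device lookup is needed to construct it.
 */
static struct bp_pos extent_ptr_to_bp_pos(uint32_t dev, uint64_t ptr_offset, uint32_t crc_offset)
{
	struct bp_pos p = {
		.dev	= dev,
		.offset	= (ptr_offset << COMPRESS_RATIO_SHIFT) + crc_offset,
	};
	return p;
}

int main(void)
{
	/* pointer at LBA 1024 on device 2, 3 sectors into a compressed extent */
	struct bp_pos p = extent_ptr_to_bp_pos(2, 1024, 3);

	printf("backpointer key: dev %u offset %llu\n", p.dev, (unsigned long long) p.offset);
	return 0;
}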
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 26 ++++---------------------- fs/bcachefs/backpointers.h | 17 ++++++----------- fs/bcachefs/buckets.c | 7 ++++--- 3 files changed, 14 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index b19719b02df8..98d89133fc75 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -73,26 +73,14 @@ static bool extent_matches_bp(struct bch_fs *c, const union bch_extent_entry *entry; struct extent_ptr_decoded p; - rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket2; struct bkey_i_backpointer bp2; + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2); - if (p.ptr.cached) - continue; - - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); - if (!ca) - continue; - - bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2); if (bpos_eq(bp.k->p, bp2.k.p) && - !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) { - rcu_read_unlock(); + !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) return true; - } } - rcu_read_unlock(); return false; } @@ -586,21 +574,15 @@ static int check_extent_to_backpointers(struct btree_trans *trans, ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos; struct bkey_i_backpointer bp; if (p.ptr.cached) continue; - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - if (ca) - bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp); - rcu_read_unlock(); - - if (!ca) + if (p.ptr.dev == BCH_SB_MEMBER_INVALID) continue; + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); ret = check_bp_exists(trans, s, &bp, k); if (ret) return ret; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index caffc68407ab..d126d40dda99 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -140,20 +140,15 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, } } -static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, +static inline void __bch2_extent_ptr_to_bp( enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - struct bpos *bucket, struct bkey_i_backpointer *bp, + struct bkey_i_backpointer *bp, u64 sectors) { - u32 bucket_offset; - *bucket = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); - - u64 bp_bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset; - bkey_backpointer_init(&bp->k_i); - bp->k.p = bucket_pos_to_bp(ca, *bucket, bp_bucket_offset); + bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset); bp->v = (struct bch_backpointer) { .btree_id = btree_id, .level = level, @@ -164,15 +159,15 @@ static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, }; } -static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, +static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - struct bpos *bucket_pos, struct bkey_i_backpointer *bp) + struct bkey_i_backpointer *bp) { u64 sectors = ptr_disk_sectors(level ? 
btree_sectors(c) : k.k->size, p); - __bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors); + __bch2_extent_ptr_to_bp(btree_id, level, k, p, entry, bp, sectors); } struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index bbd37b1ed5d2..30b983cf9780 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -572,6 +572,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans, u64 abs_sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); *sectors = insert ? abs_sectors : -abs_sectors; + struct bkey_i_backpointer bp; + __bch2_extent_ptr_to_bp(btree_id, level, k, p, entry, &bp, abs_sectors); + struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (unlikely(!ca)) { if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) @@ -579,9 +582,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, goto err; } - struct bpos bucket; - struct bkey_i_backpointer bp; - __bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp, abs_sectors); + struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); -- cgit v1.2.3 From 7171b1fd270e22a157e1a51bb684154bc146ab6f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 18:37:41 -0500 Subject: bcachefs: kill __bch2_extent_ptr_to_bp() Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.h | 18 +++--------------- fs/bcachefs/buckets.c | 7 +++---- 2 files changed, 6 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index d126d40dda99..65ede8adbc36 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -140,12 +140,11 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, } } -static inline void __bch2_extent_ptr_to_bp( +static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - struct bkey_i_backpointer *bp, - u64 sectors) + struct bkey_i_backpointer *bp) { bkey_backpointer_init(&bp->k_i); bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset); @@ -154,22 +153,11 @@ static inline void __bch2_extent_ptr_to_bp( .level = level, .data_type = bch2_bkey_ptr_data_type(k, p, entry), .bucket_gen = p.ptr.gen, - .bucket_len = sectors, + .bucket_len = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p), .pos = k.k->p, }; } -static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - const union bch_extent_entry *entry, - struct bkey_i_backpointer *bp) -{ - u64 sectors = ptr_disk_sectors(level ? 
btree_sectors(c) : k.k->size, p); - - __bch2_extent_ptr_to_bp(btree_id, level, k, p, entry, bp, sectors); -} - struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, struct btree_iter *, unsigned); struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 30b983cf9780..56d3e3800a89 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -569,11 +569,10 @@ static int bch2_trigger_pointer(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - u64 abs_sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); - *sectors = insert ? abs_sectors : -abs_sectors; - struct bkey_i_backpointer bp; - __bch2_extent_ptr_to_bp(btree_id, level, k, p, entry, &bp, abs_sectors); + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp); + + *sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len; struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (unlikely(!ca)) { -- cgit v1.2.3 From 056cae1c00b9773aa69791f4703262f690c28cdb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 22:13:29 -0500 Subject: bcachefs: Add write buffer flush param to backpointer_get_key() In an upcoming patch bch2_backpointer_get_key() will be repairing when it finds a dangling backpointer; it will need to flush the btree write buffer before it can definitively say there's an error. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 12 +++++++----- fs/bcachefs/backpointers.h | 5 +++-- fs/bcachefs/ec.c | 14 ++++++++++---- fs/bcachefs/move.c | 8 ++++++-- 4 files changed, 26 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 98d89133fc75..d2f0b3140983 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -218,7 +218,8 @@ static void backpointer_target_not_found(struct btree_trans *trans, struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct btree_iter *iter, - unsigned iter_flags) + unsigned iter_flags, + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; @@ -245,7 +246,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, backpointer_target_not_found(trans, bp, k); return bkey_s_c_null; } else { - struct btree *b = bch2_backpointer_get_node(trans, bp, iter); + struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); if (IS_ERR_OR_NULL(b)) { bch2_trans_iter_exit(trans, iter); @@ -257,7 +258,8 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct btree *bch2_backpointer_get_node(struct btree_trans *trans, struct bkey_s_c_backpointer bp, - struct btree_iter *iter) + struct btree_iter *iter, + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; @@ -486,7 +488,7 @@ check_existing_bp: struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); struct bkey_s_c other_extent = - bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0); + bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, &s->last_flushed); ret = bkey_err(other_extent); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) ret = 0; @@ -866,7 +868,7 @@ static int check_one_backpointer(struct btree_trans *trans, return 0; struct btree_iter iter; - struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0); + struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed); int ret = 
bkey_err(k); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) return 0; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 65ede8adbc36..060dad1521ee 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -158,10 +158,11 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, }; } +struct bkey_buf; struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, - struct btree_iter *, unsigned); + struct btree_iter *, unsigned, struct bkey_buf *); struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, - struct btree_iter *); + struct btree_iter *, struct bkey_buf *); int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 3e7e41cd8380..d2a5e76e6479 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1265,7 +1265,8 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct bch_dev *ca, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, - struct bkey_s_c_backpointer bp) + struct bkey_s_c_backpointer bp, + struct bkey_buf *last_flushed) { struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_fs *c = trans->c; @@ -1282,7 +1283,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct btree_iter node_iter; struct btree *b; - b = bch2_backpointer_get_node(trans, bp, &node_iter); + b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed); bch2_trans_iter_exit(trans, &node_iter); if (!b) @@ -1296,7 +1297,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, return -EIO; } - k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent); + k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); ret = bkey_err(k); if (ret) return ret; @@ -1363,6 +1364,10 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers, bucket_pos_to_bp_start(ca, bucket_pos), bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, @@ -1376,9 +1381,10 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b continue; ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, - bkey_s_c_to_backpointer(bp_k)); + bkey_s_c_to_backpointer(bp_k), &last_flushed); })); + bch2_bkey_buf_exit(&last_flushed, c); bch2_dev_put(ca); return ret; } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 6d38afcaaaab..184620d5a3b4 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -677,6 +677,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bkey_s_c k; struct data_update_opts data_opts; unsigned sectors_moved = 0; + struct bkey_buf last_flushed; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); @@ -685,6 +686,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, trace_bucket_evacuate(c, &bucket); + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); bch2_bkey_buf_init(&sk); /* @@ -726,7 +729,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); if (!bp.v->level) { - k = bch2_backpointer_get_key(trans, bp, &iter, 0); + k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); ret = bkey_err(k); if 
(bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -784,7 +787,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, } else { struct btree *b; - b = bch2_backpointer_get_node(trans, bp, &iter); + b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) goto next; @@ -822,6 +825,7 @@ err: bch2_trans_iter_exit(trans, &bp_iter); bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); + bch2_bkey_buf_exit(&last_flushed, c); return ret; } -- cgit v1.2.3 From c738866e47ef2e4d543698f0ab370ffe2b7e0d59 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Nov 2024 16:31:54 -0500 Subject: bcachefs: check_extents_to_backpointers() now only checks buckets with mismatches Instead of walking every extent and every backpointer it points to, first sum up backpointers in each bucket and check for mismatches, and only look for missing backpointers if mismatches were detected, and only check extents in those buckets. This is a major fsck scalability improvement, since the two backpointers passes (backpointers -> extents and extents -> backpointers) are the most expensive fsck passes by far. Additionally, to speed up the upgrade for backpointer bucket gens, or in situations when we have to rebuild alloc info, add a special case for when no backpointers are found in a bucket - don't check each individual backpointer (in particular, avoiding the write buffer flushes), just recreate them. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 339 ++++++++++++++++++++++++++++++++++++++++++--- fs/bcachefs/bcachefs.h | 3 + fs/bcachefs/btree_cache.c | 1 - fs/bcachefs/errcode.h | 1 + 4 files changed, 325 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index d2f0b3140983..892939763c3a 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -569,25 +569,33 @@ static int check_extent_to_backpointers(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - int ret; - ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bkey_i_backpointer bp; - if (p.ptr.cached) continue; if (p.ptr.dev == BCH_SB_MEMBER_INVALID) continue; - bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); - ret = check_bp_exists(trans, s, &bp, k); - if (ret) - return ret; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); + bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); + bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); + rcu_read_unlock(); + + if (check || empty) { + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); + + int ret = check + ? 
check_bp_exists(trans, s, &bp, k) + : bch2_bucket_backpointer_mod(trans, k, &bp, true); + if (ret) + return ret; + } } return 0; @@ -796,26 +804,295 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, return 0; } +enum alloc_sector_counter { + ALLOC_dirty, + ALLOC_cached, + ALLOC_stripe, + ALLOC_SECTORS_NR +}; + +static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) +{ + switch (t) { + case BCH_DATA_btree: + case BCH_DATA_user: + return ALLOC_dirty; + case BCH_DATA_cached: + return ALLOC_cached; + case BCH_DATA_stripe: + return ALLOC_stripe; + default: + BUG(); + } +} + +static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); + +static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); + bool need_commit = false; + + if (a->data_type == BCH_DATA_sb || + a->data_type == BCH_DATA_journal || + a->data_type == BCH_DATA_parity) + return 0; + + u32 sectors[ALLOC_SECTORS_NR]; + memset(sectors, 0, sizeof(sectors)); + + struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); + if (!ca) + return 0; + + struct btree_iter iter; + struct bkey_s_c bp_k; + int ret = 0; + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, alloc_k.k->p), + bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { + if (bp_k.k->type != KEY_TYPE_backpointer) + continue; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && + (bp.v->bucket_gen != a->gen || + bp.v->pad)) { + ret = bch2_backpointer_del(trans, bp_k.k->p); + if (ret) + break; + + need_commit = true; + continue; + } + + if (bp.v->bucket_gen != a->gen) + continue; + + sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; + }; + bch2_trans_iter_exit(trans, &iter); + if (ret) + goto err; + + if (need_commit) { + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + } + + /* Cached pointers don't have backpointers: */ + + if (sectors[ALLOC_dirty] != a->dirty_sectors || + sectors[ALLOC_stripe] != a->stripe_sectors) { + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { + ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); + if (ret) + goto err; + } + + if (sectors[ALLOC_dirty] > a->dirty_sectors || + sectors[ALLOC_stripe] > a->stripe_sectors) { + ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: + -BCH_ERR_transaction_restart_nested; + goto err; + } + + if (!sectors[ALLOC_dirty] && + !sectors[ALLOC_stripe]) + __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); + else + __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); + } +err: + bch2_dev_put(ca); + return ret; +} + +static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_btree_ptr_v2: { + bool ret = false; + + rcu_read_lock(); + struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; + while (pos.inode <= k.k->p.inode) { + if (pos.inode >= c->sb.nr_devices) + break; + + struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); + if (!ca) + goto next; + + struct bpos bucket = bp_pos_to_bucket(ca, pos); + 
bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, + ca->mi.nbuckets, bucket.offset); + if (bucket.offset == ca->mi.nbuckets) + goto next; + + ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p); + if (ret) + break; +next: + pos = SPOS(pos.inode + 1, 0, 0); + } + rcu_read_unlock(); + + return ret; + } + case KEY_TYPE_btree_ptr: + return true; + default: + return false; + } +} + +static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, + enum btree_id btree, unsigned level) +{ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + if (b) + bch2_node_pin(trans->c, b); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, + struct bpos start, struct bpos *end) +{ + struct bch_fs *c = trans->c; + int ret = 0; + + struct bkey_buf tmp; + bch2_bkey_buf_init(&tmp); + + bch2_btree_cache_unpin(c); + + *end = SPOS_MAX; + + s64 mem_may_pin = mem_may_pin_bytes(c); + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, + 0, 1, BTREE_ITER_prefetch); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + if (!backpointer_node_has_missing(c, k)) + continue; + + mem_may_pin -= c->opts.btree_node_size; + if (mem_may_pin <= 0) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + struct btree_path *path = btree_iter_path(trans, &iter); + + BUG_ON(path->level != 1); + + bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); + })); + if (ret) + return ret; + + struct bpos pinned = SPOS_MAX; + mem_may_pin = mem_may_pin_bytes(c); + bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, + 0, 1, BTREE_ITER_prefetch); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + if (!backpointer_node_has_missing(c, k)) + continue; + + mem_may_pin -= c->opts.btree_node_size; + if (mem_may_pin <= 0) { + *end = pinned; + break; + } + + bch2_bkey_buf_reassemble(&tmp, c, k); + struct btree_path *path = btree_iter_path(trans, &iter); + + BUG_ON(path->level != 1); + + int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1); + + if (!ret2) + pinned = tmp.k->k.p; + + ret; + })); + if (ret) + return ret; + + return ret; +} + int bch2_check_extents_to_backpointers(struct bch_fs *c) { + int ret = 0; + + /* + * Can't allow devices to come/go/resize while we have bucket bitmaps + * allocated + */ + lockdep_assert_held(&c->state_lock); + + for_each_member_device(c, ca) { + BUG_ON(ca->bucket_backpointer_mismatches); + ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), + GFP_KERNEL); + ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), + GFP_KERNEL); + if (!ca->bucket_backpointer_mismatches || + !ca->bucket_backpointer_empty) { + bch2_dev_put(ca); + ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + goto err_free_bitmaps; + } + } + struct btree_trans *trans = bch2_trans_get(c); struct extents_to_bp_state s = { .bp_start = POS_MIN }; - int ret; bch2_bkey_buf_init(&s.last_flushed); bkey_init(&s.last_flushed.k->k); + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, + POS_MIN, BTREE_ITER_prefetch, k, ({ + check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); + })); + if (ret) + goto err; + + u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 
0; + for_each_member_device(c, ca) { + nr_buckets += ca->mi.nbuckets; + nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); + nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); + } + + if (!nr_mismatches && !nr_empty) + goto err; + + bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", + nr_mismatches + nr_empty, nr_buckets); + while (1) { - struct bbpos end; - ret = bch2_get_btree_in_memory_pos(trans, - BIT_ULL(BTREE_ID_backpointers), - BIT_ULL(BTREE_ID_backpointers), - BBPOS(BTREE_ID_backpointers, s.bp_start), &end); + ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); if (ret) break; - s.bp_end = end.pos; - if ( bpos_eq(s.bp_start, POS_MIN) && !bpos_eq(s.bp_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", @@ -840,10 +1117,17 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) s.bp_start = bpos_successor(s.bp_end); } +err: bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); - bch2_btree_cache_unpin(c); +err_free_bitmaps: + for_each_member_device(c, ca) { + kvfree(ca->bucket_backpointer_empty); + ca->bucket_backpointer_empty = NULL; + kvfree(ca->bucket_backpointer_mismatches); + ca->bucket_backpointer_mismatches = NULL; + } bch_err_fn(c, ret); return ret; @@ -895,6 +1179,25 @@ fsck_err: return ret; } +static int check_bucket_backpointers_to_extents(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket) +{ + u32 restart_count = trans->restart_count; + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, bucket), + bucket_pos_to_bp_end(ca, bucket), + 0, k, + check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) + ); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + return ret ?: trans_was_restarted(trans, restart_count); +} + static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3a3cb79d8518..7b0959fb35dd 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -555,6 +555,9 @@ struct bch_dev { u8 *oldest_gen; unsigned long *buckets_nouse; + unsigned long *bucket_backpointer_mismatches; + unsigned long *bucket_backpointer_empty; + struct bch_dev_usage __percpu *usage; /* Allocator: */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 1117be901cf0..b00c6a20be27 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -222,7 +222,6 @@ void bch2_node_pin(struct bch_fs *c, struct btree *b) struct btree_cache *bc = &c->btree_cache; mutex_lock(&bc->lock); - BUG_ON(!__btree_node_pinned(bc, b)); if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { set_btree_node_pinned(b); list_move(&b->list, &bc->live[1].list); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 5d17ceb1e83a..c0df2587a580 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -54,6 +54,7 @@ x(ENOMEM, ENOMEM_compression_bounce_read_init) \ x(ENOMEM, ENOMEM_compression_bounce_write_init) \ x(ENOMEM, ENOMEM_compression_workspace_init) \ + x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \ x(EIO, compression_workspace_not_initialized) \ x(ENOMEM, ENOMEM_bucket_gens) \ x(ENOMEM, ENOMEM_buckets_nouse) \ -- cgit v1.2.3 From c2c2a4d6420bbfd584aa5e159a6f78b7b5124fd7 Mon 
Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Nov 2024 03:46:31 -0500 Subject: bcachefs: bch2_backpointer_get_key() now repairs dangling backpointers Continuing on with the self healing theme, we should be running any check and repair code at runtime that we can - instead of declaring the filesystem inconsistent. This will also let us skip running the backpointers -> extents fsck pass except in debug mode. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 70 +++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 892939763c3a..5cc4aaa3a325 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -172,9 +172,10 @@ err: static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) { - return likely(!bch2_backpointers_no_use_write_buffer) - ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) - : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0); + return (likely(!bch2_backpointers_no_use_write_buffer) + ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) + : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, @@ -186,20 +187,25 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, : 0; } -static void backpointer_target_not_found(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct bkey_s_c target_k) +static int backpointer_target_not_found(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct bkey_s_c target_k, + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + int ret = 0; /* * If we're using the btree write buffer, the backpointer we were * looking at may have already been deleted - failure to find what it * pointed to is not an error: */ - if (likely(!bch2_backpointers_no_use_write_buffer)) - return; + ret = last_flushed + ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed) + : 0; + if (ret) + return ret; prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", bp.v->level ? "btree node" : "extent"); bch2_bkey_val_to_text(&buf, c, bp.s_c); @@ -207,12 +213,13 @@ static void backpointer_target_not_found(struct btree_trans *trans, prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, target_k); - if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) - bch_err_ratelimited(c, "%s", buf.buf); - else - bch2_trans_inconsistent(trans, "%s", buf.buf); + if (fsck_err(trans, backpointer_to_missing_ptr, + "%s", buf.buf)) + ret = bch2_backpointer_del(trans, bp.k->p); +fsck_err: printbuf_exit(&buf); + return ret; } struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, @@ -243,15 +250,13 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, return k; bch2_trans_iter_exit(trans, iter); - backpointer_target_not_found(trans, bp, k); - return bkey_s_c_null; + int ret = backpointer_target_not_found(trans, bp, k, last_flushed); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; } else { struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); + if (IS_ERR_OR_NULL(b)) + return ((struct bkey_s_c) { .k = ERR_CAST(b) }); - if (IS_ERR_OR_NULL(b)) { - bch2_trans_iter_exit(trans, iter); - return IS_ERR(b) ?
bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null; - } return bkey_i_to_s_c(&b->key); } } @@ -284,8 +289,8 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { - backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key)); - b = NULL; + int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); + b = ret ? ERR_PTR(ret) : NULL; } err: bch2_trans_iter_exit(trans, iter); @@ -488,7 +493,7 @@ check_existing_bp: struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); struct bkey_s_c other_extent = - bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, &s->last_flushed); + bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); ret = bkey_err(other_extent); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) ret = 0; @@ -1143,9 +1148,7 @@ static int check_one_backpointer(struct btree_trans *trans, return 0; struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); - struct bch_fs *c = trans->c; struct bbpos pos = bp_to_bbpos(*bp.v); - struct printbuf buf = PRINTBUF; if (bbpos_cmp(pos, start) < 0 || bbpos_cmp(pos, end) > 0) @@ -1159,23 +1162,7 @@ static int check_one_backpointer(struct btree_trans *trans, if (ret) return ret; - if (!k.k) { - ret = bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed); - if (ret) - goto out; - - if (fsck_err(trans, backpointer_to_missing_ptr, - "backpointer for missing %s\n %s", - bp.v->level ? "btree node" : "extent", - (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { - ret = bch2_backpointer_del(trans, bp.k->p); - goto out; - } - } -out: -fsck_err: bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); return ret; } @@ -1210,9 +1197,8 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, bkey_init(&last_flushed.k->k); progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); - int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, - POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, + POS_MIN, BTREE_ITER_prefetch, k, ({ progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); check_one_backpointer(trans, start, end, k, &last_flushed); })); -- cgit v1.2.3 From 7611d6b5d1c12e2210c6f1283232fcb16b685de1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Dec 2024 14:04:39 -0500 Subject: bcachefs: better backpointer_target_not_found() error message Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 5cc4aaa3a325..b93ddfa00fdd 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -210,10 +210,21 @@ static int backpointer_target_not_found(struct btree_trans *trans, prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", bp.v->level ? 
"btree node" : "extent"); bch2_bkey_val_to_text(&buf, c, bp.s_c); - prt_printf(&buf, "\n "); + prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, target_k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) + if (p.ptr.dev == bp.k->p.inode) { + prt_printf(&buf, "\n "); + struct bkey_i_backpointer bp2; + bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); + } + if (fsck_err(trans, backpointer_to_missing_ptr, "%s", buf.buf)) ret = bch2_backpointer_del(trans, bp.k->p); -- cgit v1.2.3 From d884cf189a92881c02fa22b2318bf8fcbbd61297 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 20:47:32 -0500 Subject: bcachefs: Only run check_backpointers_to_extents in debug mode The backpointers passes, check_backpointers_to_extents() and check_extents_to_backpointers() are the most expensive fsck passes. Now that we're running the same check and repair code when using a backpointer at runtime (via bch2_backpointer_get_key()) that fsck does, there's no reason fsck needs to - except to verify that the filesystem really has no errors in debug mode. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes_types.h | 92 ++++++++++++++++++++----------------- fs/bcachefs/sb-errors_format.h | 4 +- 2 files changed, 51 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 2b3ef3980fc3..71baad41d8c5 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -8,53 +8,59 @@ #define PASS_ALWAYS BIT(3) #define PASS_ONLINE BIT(4) +#ifdef CONFIG_BCACHEFS_DEBUG +#define PASS_FSCK_DEBUG BIT(1) +#else +#define PASS_FSCK_DEBUG 0 +#endif + /* * Passes may be reordered, but the second field is a persistent identifier and * must never change: */ -#define BCH_RECOVERY_PASSES() \ - x(recovery_pass_empty, 41, PASS_SILENT) \ - x(scan_for_btree_nodes, 37, 0) \ - x(check_topology, 4, 0) \ - x(accounting_read, 39, PASS_ALWAYS) \ - x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, PASS_ALWAYS) \ - x(initialize_subvolumes, 2, 0) \ - x(snapshots_read, 3, PASS_ALWAYS) \ - x(check_allocations, 5, PASS_FSCK) \ - x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ - x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ - x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ - x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ - x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ - x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ - x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \ - x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ - x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ - x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, 17, 0) \ - x(reconstruct_snapshots, 38, 0) \ - x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ - x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ - x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ - x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ - x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ - x(fs_upgrade_for_subvolumes, 22, 0) \ - x(check_inodes, 24, PASS_FSCK) \ - x(check_extents, 25, PASS_FSCK) \ - x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ - x(check_dirents, 27, PASS_FSCK) \ - x(check_xattrs, 28, PASS_FSCK) \ - x(check_root, 29, 
PASS_ONLINE|PASS_FSCK) \ - x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ - x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ - x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ - x(check_nlinks, 31, PASS_FSCK) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ - x(delete_dead_inodes, 32, PASS_ALWAYS) \ - x(fix_reflink_p, 33, 0) \ - x(set_fs_needs_rebalance, 34, 0) \ +#define BCH_RECOVERY_PASSES() \ + x(recovery_pass_empty, 41, PASS_SILENT) \ + x(scan_for_btree_nodes, 37, 0) \ + x(check_topology, 4, 0) \ + x(accounting_read, 39, PASS_ALWAYS) \ + x(alloc_read, 0, PASS_ALWAYS) \ + x(stripes_read, 1, PASS_ALWAYS) \ + x(initialize_subvolumes, 2, 0) \ + x(snapshots_read, 3, PASS_ALWAYS) \ + x(check_allocations, 5, PASS_FSCK) \ + x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ + x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ + x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, 9, PASS_ALWAYS) \ + x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ + x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ + x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ + x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ + x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ + x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ + x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, 17, 0) \ + x(reconstruct_snapshots, 38, 0) \ + x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ + x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ + x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ + x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ + x(fs_upgrade_for_subvolumes, 22, 0) \ + x(check_inodes, 24, PASS_FSCK) \ + x(check_extents, 25, PASS_FSCK) \ + x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ + x(check_dirents, 27, PASS_FSCK) \ + x(check_xattrs, 28, PASS_FSCK) \ + x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ + x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ + x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ + x(check_nlinks, 31, PASS_FSCK) \ + x(resume_logged_ops, 23, PASS_ALWAYS) \ + x(delete_dead_inodes, 32, PASS_ALWAYS) \ + x(fix_reflink_p, 33, 0) \ + x(set_fs_needs_rebalance, 34, 0) /* We normally enumerate recovery passes in the order we run them: */ enum bch_recovery_pass { diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 0bc4cec2926c..806486635075 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -140,8 +140,8 @@ enum bch_fsck_flags { x(backpointer_bucket_offset_wrong, 125, 0) \ x(backpointer_level_bad, 294, 0) \ x(backpointer_dev_bad, 297, 0) \ - x(backpointer_to_missing_device, 126, 0) \ - x(backpointer_to_missing_alloc, 127, 0) \ + x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \ + x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \ x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \ x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \ x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \ -- cgit v1.2.3 From a36d8f0e0e3d427ffafae30694587efffda16f7c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Nov 2024 21:50:29 -0500 Subject: bcachefs: BCH_SB_VERSION_INCOMPAT We've been getting away from feature bits: they don't have any kind of ordering, and thus it's possible for people to enable weird combinations of features that were never tested or intended to be run. 
Much better to just give every new feature, compatible or incompatible, a version number. Additionally, we probably won't ever rev the major version number: major version numbers represent incompatible versions, but that doesn't really fit with how we actually roll out incompatible features - we need a better way of rolling out incompatible features. So, this patch adds two new superblock fields: - BCH_SB_VERSION_INCOMPAT - BCH_SB_VERSION_INCOMPAT_ALLOWED BCH_SB_VERSION_INCOMPAT_ALLOWED indicates that incompatible features up to version number x are allowed to be used without user prompting, but it does not by itself deny old versions from mounting. BCH_SB_VERSION_INCOMPAT does deny old versions from mounting, and must be <= BCH_SB_VERSION_INCOMPAT_ALLOWED. BCH_SB_VERSION_INCOMPAT will only be set when a codepath attempts to use an incompatible feature, so as to not unnecessarily break compatibility with old versions. bch2_request_incompat_feature() is the new interface to check if an incompatible feature may be used. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/bcachefs_format.h | 24 +++++++++++-------- fs/bcachefs/recovery.c | 27 ++++++++++++++++++---- fs/bcachefs/super-io.c | 54 ++++++++++++++++++++++++++++++++++++++++--- fs/bcachefs/super-io.h | 19 ++++++++++++--- 5 files changed, 106 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7b0959fb35dd..b749c4ecad1b 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -760,6 +760,8 @@ struct bch_fs { __uuid_t user_uuid; u16 version; + u16 version_incompat; + u16 version_incompat_allowed; u16 version_min; u16 version_upgrade_complete; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index cef22c15c256..0c6dfc4c1743 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -845,6 +845,9 @@ LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, struct bch_sb, flags[5], 0, 16); LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, struct bch_sb, flags[5], 16, 32); +LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); +LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, + struct bch_sb, flags[5], 48, 64); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -897,21 +900,22 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u x(new_varint, 15) \ x(journal_no_flush, 16) \ x(alloc_v2, 17) \ - x(extents_across_btree_nodes, 18) + x(extents_across_btree_nodes, 18) \ + x(incompat_version_field, 19) #define BCH_SB_FEATURES_ALWAYS \ - ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ - (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ - (1ULL << BCH_FEATURE_btree_updates_journalled)|\ - (1ULL << BCH_FEATURE_alloc_v2)|\ - (1ULL << BCH_FEATURE_extents_across_btree_nodes)) + (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ + BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\ + BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\ + BIT_ULL(BCH_FEATURE_alloc_v2)|\ + BIT_ULL(BCH_FEATURE_extents_across_btree_nodes)) #define BCH_SB_FEATURES_ALL \ (BCH_SB_FEATURES_ALWAYS| \ - (1ULL << BCH_FEATURE_new_siphash)| \ - (1ULL << BCH_FEATURE_btree_ptr_v2)| \ - (1ULL << BCH_FEATURE_new_varint)| \ - (1ULL << BCH_FEATURE_journal_no_flush)) + BIT_ULL(BCH_FEATURE_new_siphash)| \ + BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ + BIT_ULL(BCH_FEATURE_new_varint)| \ + BIT_ULL(BCH_FEATURE_journal_no_flush)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git 
a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index fbef6579d884..383e03606d6e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -612,6 +612,7 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_latest_compatible_version(c->sb.version)); unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; unsigned new_version = 0; + bool ret = false; if (old_version < bcachefs_metadata_required_upgrade_below) { if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || @@ -667,14 +668,32 @@ static bool check_version_upgrade(struct bch_fs *c) } bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + + ret = true; + } - bch2_sb_upgrade(c, new_version); + if (new_version > c->sb.version_incompat && + c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Now allowing incompatible features up to "); + bch2_version_to_text(&buf, new_version); + prt_str(&buf, ", previously allowed up to "); + bch2_version_to_text(&buf, c->sb.version_incompat_allowed); + prt_newline(&buf); + bch_info(c, "%s", buf.buf); printbuf_exit(&buf); - return true; + + ret = true; } - return false; + if (ret) + bch2_sb_upgrade(c, new_version, + c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible); + + return ret; } int bch2_fs_recovery(struct bch_fs *c) @@ -1074,7 +1093,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_check_version_downgrade(c); if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { - bch2_sb_upgrade(c, bcachefs_metadata_version_current); + bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); bch2_write_super(c); } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6a086c1c4b14..b0d52b6ccad4 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -42,7 +42,7 @@ static const struct bch2_metadata_version bch2_metadata_versions[] = { #undef x }; -void bch2_version_to_text(struct printbuf *out, unsigned v) +void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v) { const char *str = "(unknown version)"; @@ -55,7 +55,7 @@ void bch2_version_to_text(struct printbuf *out, unsigned v) prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); } -unsigned bch2_latest_compatible_version(unsigned v) +enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v) { if (!BCH_VERSION_MAJOR(v)) return v; @@ -69,6 +69,16 @@ unsigned bch2_latest_compatible_version(unsigned v) return v; } +void bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) +{ + mutex_lock(&c->sb_lock); + SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, + max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + const char * const bch2_sb_fields[] = { #define x(name, nr) #name, BCH_SB_FIELDS() @@ -369,6 +379,12 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, return -BCH_ERR_invalid_sb_features; } + if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || + BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { + prt_printf(out, "Filesystem has incompatible version"); + return -BCH_ERR_invalid_sb_features; + } + block_size = le16_to_cpu(sb->block_size); if (block_size > PAGE_SECTORS) { @@ -407,6 +423,21 @@ static 
int bch2_sb_validate(struct bch_sb_handle *disk_sb, return -BCH_ERR_invalid_sb_time_precision; } + /* old versions didn't know to downgrade this field */ + if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version)) + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version)); + + if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) { + prt_printf(out, "Invalid version_incompat "); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); + prt_str(out, " > incompat_allowed "); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); + if (flags & BCH_VALIDATE_write) + return -BCH_ERR_invalid_sb_version; + else + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb)); + } + if (!flags) { /* * Been seeing a bug where these are getting inexplicably @@ -520,6 +551,9 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.uuid = src->uuid; c->sb.user_uuid = src->user_uuid; c->sb.version = le16_to_cpu(src->version); + c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src); + c->sb.version_incompat_allowed + = BCH_SB_VERSION_INCOMPAT_ALLOWED(src); c->sb.version_min = le16_to_cpu(src->version_min); c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); c->sb.nr_devices = src->nr_devices; @@ -1152,6 +1186,8 @@ bool bch2_check_version_downgrade(struct bch_fs *c) */ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); + if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current) + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current); if (c->sb.version > bcachefs_metadata_version_current) c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); if (c->sb.version_min > bcachefs_metadata_version_current) @@ -1160,7 +1196,7 @@ bool bch2_check_version_downgrade(struct bch_fs *c) return ret; } -void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) +void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) { lockdep_assert_held(&c->sb_lock); @@ -1170,6 +1206,10 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) c->disk_sb.sb->version = cpu_to_le16(new_version); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + + if (incompat) + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, + max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); } static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, @@ -1334,6 +1374,14 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bch2_version_to_text(out, le16_to_cpu(sb->version)); prt_newline(out); + prt_printf(out, "Incompatible features allowed:\t"); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); + prt_newline(out); + + prt_printf(out, "Incompatible features in use:\t"); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); + prt_newline(out); + prt_printf(out, "Version upgrade complete:\t"); bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); prt_newline(out); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 90e7b176cdd0..f1ab4f943720 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -18,8 +18,21 @@ static inline bool bch2_version_compatible(u16 version) version >= bcachefs_metadata_version_min; } -void bch2_version_to_text(struct printbuf *, unsigned); -unsigned bch2_latest_compatible_version(unsigned); +void bch2_version_to_text(struct 
printbuf *, enum bcachefs_metadata_version); +enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); + +void bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); + +static inline bool bch2_request_incompat_feature(struct bch_fs *c, + enum bcachefs_metadata_version version) +{ + if (unlikely(version > c->sb.version_incompat)) { + if (version > c->sb.version_incompat_allowed) + return false; + bch2_set_version_incompat(c, version); + } + return true; +} static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) { @@ -94,7 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) } bool bch2_check_version_downgrade(struct bch_fs *); -void bch2_sb_upgrade(struct bch_fs *, unsigned); +void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); -- cgit v1.2.3 From ea4f9e75ecfb1203980716d6306de3f8789a049e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Nov 2024 23:16:24 -0500 Subject: bcachefs: bcachefs_metadata_version_reflink_p_may_update_opts Previously, io path option changes on a file would be picked up automatically and applied to existing data - but not for reflinked data, as we had no way of doing this safely. A user may have had permission to copy (and reflink) a given file, but not write to it, and if so they shouldn't be allowed to change e.g. nr_replicas or other options. This uses the incompat feature mechanism in the previous patch to add a new incompatible flag to bch_reflink_p, indicating whether a given reflink pointer may propagate io path option changes back to the indirect extent. In this initial patch we're only setting it for the source extents. We'd like to set it for the destination in a reflink copy, when the user has write access to the source, but that requires mnt_idmap which is not currently plumbed up to remap_file_range.
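A rough standalone sketch of how such a one-bit flag can share a 64-bit word with the reflink index: the bit positions mirror the LE64_BITMASK() declarations in the diff below, while the helper functions and macro names are assumptions made for illustration and ignore the on-disk little-endian handling.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Bit layout mirroring the reflink_format.h declarations in the patch below. */
#define P_IDX_START			0	/* 56-bit index into the reflink btree */
#define P_IDX_BITS			56
#define P_ERROR_BIT			56
#define P_MAY_UPDATE_OPTIONS_BIT	57

/* Simplified stand-ins for the LE64_BITMASK()-generated accessors. */
static uint64_t get_field(uint64_t word, unsigned start, unsigned bits)
{
	return (word >> start) & ((1ULL << bits) - 1);
}

static uint64_t set_flag(uint64_t word, unsigned bit, bool val)
{
	return val ? word | (1ULL << bit) : word & ~(1ULL << bit);
}

int main(void)
{
	uint64_t idx_flags = 12345;	/* index only, no flags set */

	/* source extent: allow io option changes to propagate to the indirect extent */
	idx_flags = set_flag(idx_flags, P_MAY_UPDATE_OPTIONS_BIT, true);

	printf("idx %llu error %llu may_update_options %llu\n",
	       (unsigned long long) get_field(idx_flags, P_IDX_START, P_IDX_BITS),
	       (unsigned long long) get_field(idx_flags, P_ERROR_BIT, 1),
	       (unsigned long long) get_field(idx_flags, P_MAY_UPDATE_OPTIONS_BIT, 1));
	return 0;
}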
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/fs-io.c | 9 ++++++++- fs/bcachefs/reflink.c | 18 +++++++++++++++--- fs/bcachefs/reflink.h | 3 ++- fs/bcachefs/reflink_format.h | 2 ++ 5 files changed, 29 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0c6dfc4c1743..c6cc2690aa26 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -680,7 +680,8 @@ struct bch_sb_field_ext { x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \ x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ - x(disk_accounting_big_endian, BCH_VERSION(1, 15)) + x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ + x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 33d0e7080bf6..94bf34b9b65f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -906,11 +906,18 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, bch2_mark_pagecache_unallocated(src, pos_src >> 9, (pos_src + aligned_len) >> 9); + /* + * XXX: we'd like to be telling bch2_remap_range() if we have + * permission to write to the source file, and thus if io path option + * changes should be propagated through the copy, but we need mnt_idmap + * from the pathwalk, awkward + */ ret = bch2_remap_range(c, inode_inum(dst), pos_dst >> 9, inode_inum(src), pos_src >> 9, aligned_len >> 9, - pos_dst + len, &i_sectors_delta); + pos_dst + len, &i_sectors_delta, + false); if (ret < 0) goto err; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index e1911b9beb61..93ba4f4e47ca 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -482,7 +482,8 @@ int bch2_trigger_indirect_inline_data(struct btree_trans *trans, static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, - struct bkey_i *orig) + struct bkey_i *orig, + bool reflink_p_may_update_opts_field) { struct bch_fs *c = trans->c; struct btree_iter reflink_iter = { NULL }; @@ -548,6 +549,9 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k)); + if (reflink_p_may_update_opts_field) + SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true); + ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, BTREE_UPDATE_internal_snapshot_node); err: @@ -578,7 +582,8 @@ s64 bch2_remap_range(struct bch_fs *c, subvol_inum dst_inum, u64 dst_offset, subvol_inum src_inum, u64 src_offset, u64 remap_sectors, - u64 new_i_size, s64 *i_sectors_delta) + u64 new_i_size, s64 *i_sectors_delta, + bool may_change_src_io_path_opts) { struct btree_trans *trans; struct btree_iter dst_iter, src_iter; @@ -591,6 +596,8 @@ s64 bch2_remap_range(struct bch_fs *c, struct bpos src_want; u64 dst_done = 0; u32 dst_snapshot, src_snapshot; + bool reflink_p_may_update_opts_field = + bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); int ret = 0, ret2 = 0; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) @@ -672,7 +679,8 @@ s64 bch2_remap_range(struct bch_fs *c, src_k = bkey_i_to_s_c(new_src.k); ret = bch2_make_extent_indirect(trans, &src_iter, - new_src.k); + new_src.k, + reflink_p_may_update_opts_field); if (ret) continue; @@ -690,6 +698,10 @@ s64 bch2_remap_range(struct bch_fs *c, bkey_start_offset(src_k.k)); SET_REFLINK_P_IDX(&dst_p->v, offset); + + if 
(reflink_p_may_update_opts_field && + may_change_src_io_path_opts) + SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); } else { BUG(); } diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index f119316adc81..1632780bdf18 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -78,7 +78,8 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_i bool, unsigned); s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, - subvol_inum, u64, u64, u64, s64 *); + subvol_inum, u64, u64, u64, s64 *, + bool); int bch2_gc_reflink_done(struct bch_fs *); int bch2_gc_reflink_start(struct bch_fs *); diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h index 53502627b2c5..92995e4f898e 100644 --- a/fs/bcachefs/reflink_format.h +++ b/fs/bcachefs/reflink_format.h @@ -19,6 +19,8 @@ struct bch_reflink_p { LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56); LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57); +LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS, + struct bch_reflink_p, idx_flags, 57, 58); struct bch_reflink_v { struct bch_val v; -- cgit v1.2.3 From 80c6352c2c98bc7b399ce94ae7e54b5b36aad731 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 02:12:21 -0400 Subject: bcachefs: Option changes now get propagated to reflinked data Now that bch2_move_get_io_opts() re-propagates changed inode io options to bch_extent_rebalance, we can properly support changing IO path options for reflinked data. Changing a per-file IO path option, either via the xattr interface or via the BCHFS_IOC_REINHERIT_ATTRS ioctl, will now trigger a scan (the inode number is marked as needing a scan, via bch2_set_rebalance_needs_scan()), and rebalance will use bch2_move_data(), which will walk the inode number and pick up the new options. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 184620d5a3b4..c493ea625553 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -22,6 +22,7 @@ #include "keylist.h" #include "move.h" #include "rebalance.h" +#include "reflink.h" #include "replicas.h" #include "snapshot.h" #include "super-io.h" @@ -389,6 +390,7 @@ err: static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct per_snapshot_io_opts *io_opts, + struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ struct btree_iter *extent_iter, struct bkey_s_c extent_k) { @@ -400,12 +402,12 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, if (extent_k.k->type == KEY_TYPE_reflink_v) goto out; - if (io_opts->cur_inum != extent_k.k->p.inode) { + if (io_opts->cur_inum != extent_pos.inode) { io_opts->d.nr = 0; - ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), + ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode), BTREE_ITER_all_snapshots, k, ({ - if (k.k->p.offset != extent_k.k->p.inode) + if (k.k->p.offset != extent_pos.inode) break; if (!bkey_is_inode(k.k)) @@ -421,7 +423,7 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, darray_push(&io_opts->d, e); })); - io_opts->cur_inum = extent_k.k->p.inode; + io_opts->cur_inum = extent_pos.inode; } ret = ret ?: trans_was_restarted(trans, restart_count); @@ -527,9 +529,15 @@ static int bch2_move_data_btree(struct moving_context *ctxt, struct per_snapshot_io_opts snapshot_io_opts; struct bch_io_opts *io_opts; struct bkey_buf sk; - struct btree_iter iter; + struct btree_iter iter, reflink_iter = {}; struct bkey_s_c k; struct data_update_opts data_opts; + /* + * If we're moving a single file, also process reflinked data it points + * to (this includes propagating changed io_opts from the inode to the + * extent): + */ + bool walk_indirect = start.inode == end.inode; int ret = 0, ret2; per_snapshot_io_opts_init(&snapshot_io_opts, c); @@ -549,6 +557,8 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_ratelimit_reset(ctxt->rate); while (!bch2_move_ratelimit(ctxt)) { + struct btree_iter *extent_iter = &iter; + bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); @@ -567,10 +577,36 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (ctxt->stats) ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + if (walk_indirect && + k.k->type == KEY_TYPE_reflink_p && + REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + + bch2_trans_iter_exit(trans, &reflink_iter); + k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + if (bkey_deleted(k.k)) + goto next_nondata; + + /* + * XXX: reflink pointers may point to multiple indirect + * extents, so don't advance past the entire reflink + * pointer - need to fixup iter->k + */ + extent_iter = &reflink_iter; + } + if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, &iter, k); + io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, + 
iter.pos, extent_iter, k); ret = PTR_ERR_OR_ZERO(io_opts); if (ret) continue; @@ -586,7 +622,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); + ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; @@ -607,6 +643,7 @@ next_nondata: bch2_btree_iter_advance(&iter); } + bch2_trans_iter_exit(trans, &reflink_iter); bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); per_snapshot_io_opts_exit(&snapshot_io_opts); -- cgit v1.2.3 From 59c50511f7a8ecded211656425c9b92e31329333 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Aug 2023 20:27:38 -0400 Subject: bcachefs: bcachefs_metadata_version_inode_depth This adds a new inode field, bi_depth, for directory inodes: this allows us to make the check_directory_structure pass much more efficient. Currently, to ensure the filesystem is fully connected and has no loops, for every directory we follow backpointers until we find the root. But by adding a depth counter, it suffices to only check the parent of each directory, and check that the parent's bi_depth is smaller. (fsck doesn't require that bi_depth = parent->bi_depth + 1; if a rename leaves bi_depth off, but the chain to the root is still strictly decreasing, then the algorithm still works and there's no need for fsck to fixup the bi_depth fields). We've already checked backpointers, so we know that every directory (excluding the root) has a valid parent: if bi_depth is always decreasing, every chain must terminate, and terminate at the root directory. bi_depth will not necessarily be correct when fsck runs, due to directory renames - we can't change bi_depth on every child directory when renaming a directory. That's ok; fsck will silently fix the bi_depth field as needed, and future fsck runs will be much faster. 
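For illustration only: a minimal userspace sketch of why the bi_depth invariant rules out loops. The types and helper below are assumptions made for the sketch, not the fsck code; only the "parent's bi_depth must be smaller" rule comes from the patch.

#include <stdbool.h>
#include <stdio.h>

struct dir {
	struct dir *parent;	/* NULL for the root */
	unsigned bi_depth;
};

/*
 * One comparison per directory: a walk towards the root can only loop if
 * some parent edge fails to decrease bi_depth, so strictly decreasing
 * depth along every parent edge rules out cycles.
 */
static bool depth_ok(const struct dir *d)
{
	return !d->parent || d->parent->bi_depth < d->bi_depth;
}

int main(void)
{
	struct dir root = { .parent = NULL, .bi_depth = 0 };
	struct dir a = { .parent = &root, .bi_depth = 1 };
	struct dir b = { .parent = &a, .bi_depth = 7 };	/* gaps are fine, only "<" matters */

	printf("a ok: %d, b ok: %d\n", depth_ok(&a), depth_ok(&b));

	/* A corrupted loop necessarily violates the check somewhere along it. */
	root.parent = &b;	/* corrupt: root now "under" b */
	printf("root ok after corruption: %d\n", depth_ok(&root));
	return 0;
}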
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/fs-common.c | 13 ++++++ fs/bcachefs/fsck.c | 94 ++++++++++++++++++++++++++++++++++--------- fs/bcachefs/inode.h | 14 +++++++ fs/bcachefs/inode_format.h | 3 +- 5 files changed, 105 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c6cc2690aa26..f140c3366e65 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -681,7 +681,8 @@ struct bch_sb_field_ext { x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \ x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ - x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) + x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ + x(inode_depth, BCH_VERSION(1, 17)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index f8d27244e1d6..7d279f211312 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -170,6 +170,10 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_dir_offset = dir_offset; } + if (S_ISDIR(mode) && + !new_inode->bi_subvol) + new_inode->bi_depth = dir_u->bi_depth + 1; + inode_iter.flags &= ~BTREE_ITER_all_snapshots; bch2_btree_iter_set_snapshot(&inode_iter, snapshot); @@ -510,6 +514,15 @@ int bch2_rename_trans(struct btree_trans *trans, dst_dir_u->bi_nlink++; } + if (S_ISDIR(src_inode_u->bi_mode) && + !src_inode_u->bi_subvol) + src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; + + if (mode == BCH_RENAME_EXCHANGE && + S_ISDIR(dst_inode_u->bi_mode) && + !dst_inode_u->bi_subvol) + dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; + if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b8ced64cce2c..ea8c8ed06940 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2597,6 +2597,48 @@ struct pathbuf_entry { typedef DARRAY(struct pathbuf_entry) pathbuf; +static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p, + u32 new_depth) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, p->inum, p->snapshot), 0); + + struct bch_inode_unpacked inode; + int ret = bkey_err(k) ?: + !bkey_is_inode(k.k) ? 
-BCH_ERR_ENOENT_inode + : bch2_inode_unpack(k, &inode); + if (ret) + goto err; + + if (inode.bi_depth != new_depth) { + inode.bi_depth = new_depth; + ret = __bch2_fsck_write_inode(trans, &inode) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth) +{ + u32 restart_count = trans->restart_count; + int ret = 0; + + darray_for_each_reverse(*path, i) { + ret = nested_lockrestart_do(trans, + bch2_bi_depth_renumber_one(trans, i, new_bi_depth)); + bch_err_fn(trans->c, ret); + if (ret) + break; + + new_bi_depth++; + } + + return ret ?: trans_was_restarted(trans, restart_count); +} + static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) { darray_for_each(*p, i) @@ -2606,24 +2648,22 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) return false; } -static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) +static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; struct btree_iter inode_iter = {}; + pathbuf path = {}; struct printbuf buf = PRINTBUF; u32 snapshot = inode_k.k->p.snapshot; + bool redo_bi_depth = false; + u32 min_bi_depth = U32_MAX; int ret = 0; - p->nr = 0; - struct bch_inode_unpacked inode; ret = bch2_inode_unpack(inode_k, &inode); if (ret) return ret; - if (!S_ISDIR(inode.bi_mode)) - return 0; - while (!inode.bi_subvol) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; @@ -2632,7 +2672,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) - break; + goto out; if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) bch2_trans_iter_exit(trans, &dirent_iter); @@ -2647,7 +2687,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino bch2_trans_iter_exit(trans, &dirent_iter); - ret = darray_push(p, ((struct pathbuf_entry) { + ret = darray_push(&path, ((struct pathbuf_entry) { .inum = inode.bi_inum, .snapshot = snapshot, })); @@ -2659,22 +2699,32 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino bch2_trans_iter_exit(trans, &inode_iter); inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, SPOS(0, inode.bi_dir, snapshot), 0); + + struct bch_inode_unpacked parent_inode; ret = bkey_err(inode_k) ?: !bkey_is_inode(inode_k.k) ? 
-BCH_ERR_ENOENT_inode - : bch2_inode_unpack(inode_k, &inode); + : bch2_inode_unpack(inode_k, &parent_inode); if (ret) { /* Should have been caught in dirents pass */ bch_err_msg(c, ret, "error looking up parent directory"); - break; + goto out; } + min_bi_depth = parent_inode.bi_depth; + + if (parent_inode.bi_depth < inode.bi_depth && + min_bi_depth < U16_MAX) + break; + + inode = parent_inode; snapshot = inode_k.k->p.snapshot; + redo_bi_depth = true; - if (path_is_dup(p, inode.bi_inum, snapshot)) { + if (path_is_dup(&path, inode.bi_inum, snapshot)) { /* XXX print path */ bch_err(c, "directory structure loop"); - darray_for_each(*p, i) + darray_for_each(path, i) pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode.bi_inum, snapshot); @@ -2687,12 +2737,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino ret = reattach_inode(trans, &inode); bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); } - break; + + goto out; } } + + if (inode.bi_subvol) + min_bi_depth = 0; + + if (redo_bi_depth) + ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth); out: fsck_err: bch2_trans_iter_exit(trans, &inode_iter); + darray_exit(&path); printbuf_exit(&buf); bch_err_fn(c, ret); return ret; @@ -2704,24 +2762,20 @@ fsck_err: */ int bch2_check_directory_structure(struct bch_fs *c) { - pathbuf path = { 0, }; - int ret; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_intent| BTREE_ITER_prefetch| BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - if (!bkey_is_inode(k.k)) + if (!S_ISDIR(bkey_inode_mode(k))) continue; if (bch2_inode_flags(k) & BCH_INODE_unlinked) continue; - check_path(trans, &path, k); + check_path_loop(trans, k); }))); - darray_exit(&path); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 927c875976da..5bca6950f20e 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -219,6 +219,20 @@ static inline u32 bch2_inode_flags(struct bkey_s_c k) } } +static inline unsigned bkey_inode_mode(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode); + case KEY_TYPE_inode_v2: + return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode); + case KEY_TYPE_inode_v3: + return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v); + default: + return 0; + } +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 7928d0c6954f..be1e747629d2 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -101,7 +101,8 @@ struct bch_inode_generation { x(bi_dir_offset, 64) \ x(bi_subvol, 32) \ x(bi_parent_subvol, 32) \ - x(bi_nocow, 8) + x(bi_nocow, 8) \ + x(bi_depth, 32) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ -- cgit v1.2.3 From 2afd4d267e6dbaec8d3ccd4f5396cb84bc67aa2e Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Mon, 30 Dec 2024 10:33:34 +0300 Subject: fs/ntfs3: Mark inode as bad as soon as error detected in mi_enum_attr() Extended the `mi_enum_attr()` function interface with an additional parameter, `struct ntfs_inode *ni`, to allow marking the inode as bad as soon as an error is detected. 
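For illustration only: a simplified userspace model of the pattern this change introduces, where every validation failure in the attribute enumerator funnels through one exit path that marks the owning inode bad. All names below are invented for the sketch; the real code is mi_enum_attr() and _ntfs_bad_inode() in the diff that follows.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct inode { bool bad; };

struct attr { unsigned type; unsigned size; };

struct record {
	struct inode *owner;
	struct attr attrs[4];
	unsigned nr;
};

/* Return the next attribute, or NULL; on corruption, mark the owner bad once. */
static struct attr *enum_attr(struct record *rec, struct attr *prev)
{
	unsigned i = prev ? (unsigned)(prev - rec->attrs) + 1 : 0;

	if (i >= rec->nr)
		return NULL;			/* normal end, not an error */

	if (!rec->attrs[i].size || rec->attrs[i].type < (prev ? prev->type : 0))
		goto bad;			/* every sanity check funnels here */

	return &rec->attrs[i];
bad:
	rec->owner->bad = true;			/* stands in for _ntfs_bad_inode() */
	return NULL;
}

int main(void)
{
	struct inode ino = { .bad = false };
	struct record rec = {
		.owner = &ino, .nr = 2,
		.attrs = { { .type = 0x10, .size = 24 }, { .type = 0x30, .size = 0 } },
	};
	struct attr *a = NULL;

	while ((a = enum_attr(&rec, a)))
		printf("attr type 0x%x\n", a->type);
	printf("inode bad: %d\n", ino.bad);	/* second attr has size 0, so it is marked bad */
	return 0;
}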
Reported-by: syzbot+73d8fc29ec7cba8286fa@syzkaller.appspotmail.com Signed-off-by: Konstantin Komarov --- fs/ntfs3/attrib.c | 11 ++++---- fs/ntfs3/frecord.c | 59 +++++++++++++++++++++------------------- fs/ntfs3/ntfs_fs.h | 21 ++++++++------- fs/ntfs3/record.c | 79 +++++++++++++++++++++++++++++------------------------- 4 files changed, 90 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 8d789b017fa9..795cf8e75d2e 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -787,7 +787,8 @@ pack_runs: if (err) goto out; - attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id); + attr = mi_find_attr(ni, mi, NULL, type, name, name_len, + &le->id); if (!attr) { err = -EINVAL; goto bad_inode; @@ -1181,7 +1182,7 @@ repack: goto out; } - attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, &le->id); + attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, &le->id); if (!attr) { err = -EINVAL; goto out; @@ -1796,7 +1797,7 @@ repack: goto out; } - attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, + attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, &le->id); if (!attr) { err = -EINVAL; @@ -2041,8 +2042,8 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) } /* Look for required attribute. */ - attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, - 0, &le->id); + attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, + NULL, 0, &le->id); if (!attr) { err = -EINVAL; goto out; diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index c57f0686b14b..b6be7dfafcbd 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -75,7 +75,7 @@ struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni) { const struct ATTRIB *attr; - attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); + attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_STD, NULL, 0, NULL); return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) : NULL; } @@ -89,7 +89,7 @@ struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni) { const struct ATTRIB *attr; - attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); + attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_STD, NULL, 0, NULL); return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) : NULL; @@ -201,7 +201,8 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, *mi = &ni->mi; /* Look for required attribute in primary record. */ - return mi_find_attr(&ni->mi, attr, type, name, name_len, NULL); + return mi_find_attr(ni, &ni->mi, attr, type, name, name_len, + NULL); } /* First look for list entry of required type. */ @@ -217,7 +218,7 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, return NULL; /* Look for required attribute. */ - attr = mi_find_attr(m, NULL, type, name, name_len, &le->id); + attr = mi_find_attr(ni, m, NULL, type, name, name_len, &le->id); if (!attr) goto out; @@ -259,7 +260,7 @@ struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, if (mi) *mi = &ni->mi; /* Enum attributes in primary record. */ - return mi_enum_attr(&ni->mi, attr); + return mi_enum_attr(ni, &ni->mi, attr); } /* Get next list entry. */ @@ -275,7 +276,7 @@ struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, *mi = mi2; /* Find attribute in loaded record. 
*/ - return rec_find_attr_le(mi2, le2); + return rec_find_attr_le(ni, mi2, le2); } /* @@ -293,7 +294,8 @@ struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, if (!ni->attr_list.size) { if (pmi) *pmi = &ni->mi; - return mi_find_attr(&ni->mi, NULL, type, name, name_len, NULL); + return mi_find_attr(ni, &ni->mi, NULL, type, name, name_len, + NULL); } le = al_find_ex(ni, NULL, type, name, name_len, NULL); @@ -319,7 +321,7 @@ struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, if (pmi) *pmi = mi; - attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id); + attr = mi_find_attr(ni, mi, NULL, type, name, name_len, &le->id); if (!attr) return NULL; @@ -398,7 +400,8 @@ int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, int diff; if (base_only || type == ATTR_LIST || !ni->attr_list.size) { - attr = mi_find_attr(&ni->mi, NULL, type, name, name_len, id); + attr = mi_find_attr(ni, &ni->mi, NULL, type, name, name_len, + id); if (!attr) return -ENOENT; @@ -437,7 +440,7 @@ next_le2: al_remove_le(ni, le); - attr = mi_find_attr(mi, NULL, type, name, name_len, id); + attr = mi_find_attr(ni, mi, NULL, type, name, name_len, id); if (!attr) return -ENOENT; @@ -485,7 +488,7 @@ ni_ins_new_attr(struct ntfs_inode *ni, struct mft_inode *mi, name = le->name; } - attr = mi_insert_attr(mi, type, name, name_len, asize, name_off); + attr = mi_insert_attr(ni, mi, type, name, name_len, asize, name_off); if (!attr) { if (le_added) al_remove_le(ni, le); @@ -673,7 +676,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) if (err) return err; - attr_list = mi_find_attr(&ni->mi, NULL, ATTR_LIST, NULL, 0, NULL); + attr_list = mi_find_attr(ni, &ni->mi, NULL, ATTR_LIST, NULL, 0, NULL); if (!attr_list) return 0; @@ -695,7 +698,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) if (!mi) return 0; - attr = mi_find_attr(mi, NULL, le->type, le_name(le), + attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le), le->name_len, &le->id); if (!attr) return 0; @@ -731,7 +734,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) goto out; } - attr = mi_find_attr(mi, NULL, le->type, le_name(le), + attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le), le->name_len, &le->id); if (!attr) { /* Should never happened, 'cause already checked. */ @@ -740,7 +743,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) asize = le32_to_cpu(attr->size); /* Insert into primary record. 
*/ - attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le), + attr_ins = mi_insert_attr(ni, &ni->mi, le->type, le_name(le), le->name_len, asize, le16_to_cpu(attr->name_off)); if (!attr_ins) { @@ -768,7 +771,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) if (!mi) continue; - attr = mi_find_attr(mi, NULL, le->type, le_name(le), + attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le), le->name_len, &le->id); if (!attr) continue; @@ -831,7 +834,7 @@ int ni_create_attr_list(struct ntfs_inode *ni) free_b = 0; attr = NULL; - for (; (attr = mi_enum_attr(&ni->mi, attr)); le = Add2Ptr(le, sz)) { + for (; (attr = mi_enum_attr(ni, &ni->mi, attr)); le = Add2Ptr(le, sz)) { sz = le_size(attr->name_len); le->type = attr->type; le->size = cpu_to_le16(sz); @@ -886,7 +889,7 @@ int ni_create_attr_list(struct ntfs_inode *ni) u32 asize = le32_to_cpu(b->size); u16 name_off = le16_to_cpu(b->name_off); - attr = mi_insert_attr(mi, b->type, Add2Ptr(b, name_off), + attr = mi_insert_attr(ni, mi, b->type, Add2Ptr(b, name_off), b->name_len, asize, name_off); if (!attr) goto out; @@ -909,7 +912,7 @@ int ni_create_attr_list(struct ntfs_inode *ni) goto out; } - attr = mi_insert_attr(&ni->mi, ATTR_LIST, NULL, 0, + attr = mi_insert_attr(ni, &ni->mi, ATTR_LIST, NULL, 0, lsize + SIZEOF_RESIDENT, SIZEOF_RESIDENT); if (!attr) goto out; @@ -993,13 +996,13 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, mi = rb_entry(node, struct mft_inode, node); if (is_mft_data && - (mi_enum_attr(mi, NULL) || + (mi_enum_attr(ni, mi, NULL) || vbo <= ((u64)mi->rno << sbi->record_bits))) { /* We can't accept this record 'cause MFT's bootstrapping. */ continue; } if (is_mft && - mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, NULL)) { + mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, NULL)) { /* * This child record already has a ATTR_DATA. * So it can't accept any other records. @@ -1008,7 +1011,7 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, } if ((type != ATTR_NAME || name_len) && - mi_find_attr(mi, NULL, type, name, name_len, NULL)) { + mi_find_attr(ni, mi, NULL, type, name, name_len, NULL)) { /* Only indexed attributes can share same record. */ continue; } @@ -1157,7 +1160,7 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, /* Estimate the result of moving all possible attributes away. */ attr = NULL; - while ((attr = mi_enum_attr(&ni->mi, attr))) { + while ((attr = mi_enum_attr(ni, &ni->mi, attr))) { if (attr->type == ATTR_STD) continue; if (attr->type == ATTR_LIST) @@ -1175,7 +1178,7 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, attr = NULL; for (;;) { - attr = mi_enum_attr(&ni->mi, attr); + attr = mi_enum_attr(ni, &ni->mi, attr); if (!attr) { /* We should never be here 'cause we have already check this case. */ err = -EINVAL; @@ -1259,7 +1262,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni) for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) { mi = rb_entry(node, struct mft_inode, node); - attr = mi_enum_attr(mi, NULL); + attr = mi_enum_attr(ni, mi, NULL); if (!attr) { mft_min = mi->rno; @@ -1280,7 +1283,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni) ni_remove_mi(ni, mi_new); } - attr = mi_find_attr(&ni->mi, NULL, ATTR_DATA, NULL, 0, NULL); + attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_DATA, NULL, 0, NULL); if (!attr) { err = -EINVAL; goto out; @@ -1397,7 +1400,7 @@ int ni_expand_list(struct ntfs_inode *ni) continue; /* Find attribute in primary record. 
*/ - attr = rec_find_attr_le(&ni->mi, le); + attr = rec_find_attr_le(ni, &ni->mi, le); if (!attr) { err = -EINVAL; goto out; @@ -3344,7 +3347,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) if (!mi->dirty) continue; - is_empty = !mi_enum_attr(mi, NULL); + is_empty = !mi_enum_attr(ni, mi, NULL); if (is_empty) clear_rec_inuse(mi->mrec); diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index cd8e8374bb5a..382820464dee 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -745,23 +745,24 @@ int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi); void mi_put(struct mft_inode *mi); int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno); int mi_read(struct mft_inode *mi, bool is_mft); -struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr); -// TODO: id? -struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr, - enum ATTR_TYPE type, const __le16 *name, - u8 name_len, const __le16 *id); -static inline struct ATTRIB *rec_find_attr_le(struct mft_inode *rec, +struct ATTRIB *mi_enum_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr); +struct ATTRIB *mi_find_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, const __le16 *id); +static inline struct ATTRIB *rec_find_attr_le(struct ntfs_inode *ni, + struct mft_inode *rec, struct ATTR_LIST_ENTRY *le) { - return mi_find_attr(rec, NULL, le->type, le_name(le), le->name_len, + return mi_find_attr(ni, rec, NULL, le->type, le_name(le), le->name_len, &le->id); } int mi_write(struct mft_inode *mi, int wait); int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, __le16 flags, bool is_mft); -struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, - const __le16 *name, u8 name_len, u32 asize, - u16 name_off); +struct ATTRIB *mi_insert_attr(struct ntfs_inode *ni, struct mft_inode *mi, + enum ATTR_TYPE type, const __le16 *name, + u8 name_len, u32 asize, u16 name_off); bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, struct ATTRIB *attr); diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 61d53d39f3b9..714c7ecedca8 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -31,7 +31,7 @@ static inline int compare_attr(const struct ATTRIB *left, enum ATTR_TYPE type, * * Return: Unused attribute id that is less than mrec->next_attr_id. 
*/ -static __le16 mi_new_attt_id(struct mft_inode *mi) +static __le16 mi_new_attt_id(struct ntfs_inode *ni, struct mft_inode *mi) { u16 free_id, max_id, t16; struct MFT_REC *rec = mi->mrec; @@ -52,7 +52,7 @@ static __le16 mi_new_attt_id(struct mft_inode *mi) attr = NULL; for (;;) { - attr = mi_enum_attr(mi, attr); + attr = mi_enum_attr(ni, mi, attr); if (!attr) { rec->next_attr_id = cpu_to_le16(max_id + 1); mi->dirty = true; @@ -195,7 +195,8 @@ out: * NOTE: mi->mrec - memory of size sbi->record_size * here we sure that mi->mrec->total == sbi->record_size (see mi_read) */ -struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) +struct ATTRIB *mi_enum_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr) { const struct MFT_REC *rec = mi->mrec; u32 used = le32_to_cpu(rec->used); @@ -209,11 +210,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) off = le16_to_cpu(rec->attr_off); if (used > total) - return NULL; + goto out; if (off >= used || off < MFTRECORD_FIXUP_OFFSET_1 || !IS_ALIGNED(off, 8)) { - return NULL; + goto out; } /* Skip non-resident records. */ @@ -243,7 +244,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) */ if (off + 8 > used) { static_assert(ALIGN(sizeof(enum ATTR_TYPE), 8) == 8); - return NULL; + goto out; } if (attr->type == ATTR_END) { @@ -254,112 +255,116 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) /* 0x100 is last known attribute for now. */ t32 = le32_to_cpu(attr->type); if (!t32 || (t32 & 0xf) || (t32 > 0x100)) - return NULL; + goto out; /* attributes in record must be ordered by type */ if (t32 < prev_type) - return NULL; + goto out; asize = le32_to_cpu(attr->size); if (!IS_ALIGNED(asize, 8)) - return NULL; + goto out; /* Check overflow and boundary. */ if (off + asize < off || off + asize > used) - return NULL; + goto out; /* Can we use the field attr->non_res. */ if (off + 9 > used) - return NULL; + goto out; /* Check size of attribute. */ if (!attr->non_res) { /* Check resident fields. */ if (asize < SIZEOF_RESIDENT) - return NULL; + goto out; t16 = le16_to_cpu(attr->res.data_off); if (t16 > asize) - return NULL; + goto out; if (le32_to_cpu(attr->res.data_size) > asize - t16) - return NULL; + goto out; t32 = sizeof(short) * attr->name_len; if (t32 && le16_to_cpu(attr->name_off) + t32 > t16) - return NULL; + goto out; return attr; } /* Check nonresident fields. */ if (attr->non_res != 1) - return NULL; + goto out; /* Can we use memory including attr->nres.valid_size? */ if (asize < SIZEOF_NONRESIDENT) - return NULL; + goto out; t16 = le16_to_cpu(attr->nres.run_off); if (t16 > asize) - return NULL; + goto out; t32 = sizeof(short) * attr->name_len; if (t32 && le16_to_cpu(attr->name_off) + t32 > t16) - return NULL; + goto out; /* Check start/end vcn. */ if (le64_to_cpu(attr->nres.svcn) > le64_to_cpu(attr->nres.evcn) + 1) - return NULL; + goto out; data_size = le64_to_cpu(attr->nres.data_size); if (le64_to_cpu(attr->nres.valid_size) > data_size) - return NULL; + goto out; alloc_size = le64_to_cpu(attr->nres.alloc_size); if (data_size > alloc_size) - return NULL; + goto out; t32 = mi->sbi->cluster_mask; if (alloc_size & t32) - return NULL; + goto out; if (!attr->nres.svcn && is_attr_ext(attr)) { /* First segment of sparse/compressed attribute */ /* Can we use memory including attr->nres.total_size? 
*/ if (asize < SIZEOF_NONRESIDENT_EX) - return NULL; + goto out; tot_size = le64_to_cpu(attr->nres.total_size); if (tot_size & t32) - return NULL; + goto out; if (tot_size > alloc_size) - return NULL; + goto out; } else { if (attr->nres.c_unit) - return NULL; + goto out; if (alloc_size > mi->sbi->volume.size) - return NULL; + goto out; } return attr; + +out: + _ntfs_bad_inode(&ni->vfs_inode); + return NULL; } /* * mi_find_attr - Find the attribute by type and name and id. */ -struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr, - enum ATTR_TYPE type, const __le16 *name, - u8 name_len, const __le16 *id) +struct ATTRIB *mi_find_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, const __le16 *id) { u32 type_in = le32_to_cpu(type); u32 atype; next_attr: - attr = mi_enum_attr(mi, attr); + attr = mi_enum_attr(ni, mi, attr); if (!attr) return NULL; @@ -467,9 +472,9 @@ int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, * * Return: Not full constructed attribute or NULL if not possible to create. */ -struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, - const __le16 *name, u8 name_len, u32 asize, - u16 name_off) +struct ATTRIB *mi_insert_attr(struct ntfs_inode *ni, struct mft_inode *mi, + enum ATTR_TYPE type, const __le16 *name, + u8 name_len, u32 asize, u16 name_off) { size_t tail; struct ATTRIB *attr; @@ -488,7 +493,7 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, * at which we should insert it. */ attr = NULL; - while ((attr = mi_enum_attr(mi, attr))) { + while ((attr = mi_enum_attr(ni, mi, attr))) { int diff = compare_attr(attr, type, name, name_len, upcase); if (diff < 0) @@ -508,7 +513,7 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, tail = used - PtrOffset(rec, attr); } - id = mi_new_attt_id(mi); + id = mi_new_attt_id(ni, mi); memmove(Add2Ptr(attr, asize), attr, tail); memset(attr, 0, asize); -- cgit v1.2.3 From 55ad333de0f80bc0caee10c6c27196cdcf8891bb Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Mon, 30 Dec 2024 10:34:08 +0300 Subject: fs/ntfs3: Unify inode corruption marking with _ntfs_bad_inode() Also reworked error handling in a couple of places. 
Signed-off-by: Konstantin Komarov --- fs/ntfs3/attrib.c | 4 ++-- fs/ntfs3/dir.c | 2 +- fs/ntfs3/frecord.c | 12 +++++++----- fs/ntfs3/fsntfs.c | 6 +++++- fs/ntfs3/index.c | 6 ++---- fs/ntfs3/inode.c | 3 +++ 6 files changed, 20 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 795cf8e75d2e..af94e3737470 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -1407,7 +1407,7 @@ int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr, */ if (!attr->non_res) { if (vbo[1] + bytes_per_off > le32_to_cpu(attr->res.data_size)) { - ntfs_inode_err(&ni->vfs_inode, "is corrupted"); + _ntfs_bad_inode(&ni->vfs_inode); return -EINVAL; } addr = resident_data(attr); @@ -2588,7 +2588,7 @@ int attr_force_nonresident(struct ntfs_inode *ni) attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, &mi); if (!attr) { - ntfs_bad_inode(&ni->vfs_inode, "no data attribute"); + _ntfs_bad_inode(&ni->vfs_inode); return -ENOENT; } diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index fc6a8aa29e3a..b6da80c69ca6 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -512,7 +512,7 @@ out: ctx->pos = pos; } else if (err < 0) { if (err == -EINVAL) - ntfs_inode_err(dir, "directory corrupted"); + _ntfs_bad_inode(dir); ctx->pos = eod; } diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index b6be7dfafcbd..5df6a0b5add9 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -148,8 +148,10 @@ int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi) goto out; err = mi_get(ni->mi.sbi, rno, &r); - if (err) + if (err) { + _ntfs_bad_inode(&ni->vfs_inode); return err; + } ni_add_mi(ni, r); @@ -239,8 +241,7 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, return attr; out: - ntfs_inode_err(&ni->vfs_inode, "failed to parse mft record"); - ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + _ntfs_bad_inode(&ni->vfs_inode); return NULL; } @@ -332,6 +333,7 @@ struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, vcn <= le64_to_cpu(attr->nres.evcn)) return attr; + _ntfs_bad_inode(&ni->vfs_inode); return NULL; } @@ -1607,8 +1609,8 @@ int ni_delete_all(struct ntfs_inode *ni) roff = le16_to_cpu(attr->nres.run_off); if (roff > asize) { - _ntfs_bad_inode(&ni->vfs_inode); - return -EINVAL; + /* ni_enum_attr_ex checks this case. */ + continue; } /* run==1 means unpack and deallocate. */ diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 03471bc9371c..938d351ebac7 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -908,7 +908,11 @@ void ntfs_bad_inode(struct inode *inode, const char *hint) ntfs_inode_err(inode, "%s", hint); make_bad_inode(inode); - ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + /* Avoid recursion if bad inode is $Volume. 
*/ + if (inode->i_ino != MFT_REC_VOL && + !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING)) { + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + } } /* diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 9089c58a005c..7eb9fae22f8d 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -1094,8 +1094,7 @@ int indx_read(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn, ok: if (!index_buf_check(ib, bytes, &vbn)) { - ntfs_inode_err(&ni->vfs_inode, "directory corrupted"); - ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + _ntfs_bad_inode(&ni->vfs_inode); err = -EINVAL; goto out; } @@ -1117,8 +1116,7 @@ ok: out: if (err == -E_NTFS_CORRUPT) { - ntfs_inode_err(&ni->vfs_inode, "directory corrupted"); - ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + _ntfs_bad_inode(&ni->vfs_inode); err = -EINVAL; } diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index be04d2845bb7..a1e11228dafd 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -410,6 +410,9 @@ end_enum: if (!std5) goto out; + if (is_bad_inode(inode)) + goto out; + if (!is_match && name) { err = -ENOENT; goto out; -- cgit v1.2.3 From 3754137d263f52f4b507cf9ae913f8f0497d1b0e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 17 Dec 2024 20:50:00 +0100 Subject: fs/proc/task_mmu: fix pagemap flags with PMD THP entries on 32bit Entries (including flags) are u64, even on 32bit. So right now we are cutting of the flags on 32bit. This way, for example the cow selftest complains about: # ./cow ... Bail Out! read and ioctl return unmatched results for populated: 0 1 Link: https://lkml.kernel.org/r/20241217195000.1734039-1-david@redhat.com Fixes: 2c1f057e5be6 ("fs/proc/task_mmu: properly detect PM_MMAP_EXCLUSIVE per page of PMD-mapped THPs") Signed-off-by: David Hildenbrand Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 38a5a3e9cba2..f02cd362309a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1810,7 +1810,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, } for (; addr != end; addr += PAGE_SIZE, idx++) { - unsigned long cur_flags = flags; + u64 cur_flags = flags; pagemap_entry_t pme; if (folio && (flags & PM_PRESENT) && -- cgit v1.2.3 From 5f3fd772d152229d94602bca243fbb658068a597 Mon Sep 17 00:00:00 2001 From: Dennis Lam Date: Tue, 17 Dec 2024 21:39:25 -0500 Subject: ocfs2: fix slab-use-after-free due to dangling pointer dqi_priv When mounting ocfs2 and then remounting it as read-only, a slab-use-after-free occurs after the user uses a syscall to quota_getnextquota. Specifically, sb_dqinfo(sb, type)->dqi_priv is the dangling pointer. During the remounting process, the pointer dqi_priv is freed but is never set as null leaving it to be accessed. Additionally, the read-only option for remounting sets the DQUOT_SUSPENDED flag instead of setting the DQUOT_USAGE_ENABLED flags. Moreover, later in the process of getting the next quota, the function ocfs2_get_next_id is called and only checks the quota usage flags and not the quota suspended flags. To fix this, I set dqi_priv to null when it is freed after remounting with read-only and put a check for DQUOT_SUSPENDED in ocfs2_get_next_id. 
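For illustration only: the two ingredients of the ocfs2 fix in a minimal userspace form, clearing the owning pointer as soon as the object is freed and having later callers test state rather than trusting the pointer. Names below are invented; only the dqi_priv = NULL assignment and the active-state check correspond to the diff that follows.

#include <stdio.h>
#include <stdlib.h>

struct quota_info {
	void *dqi_priv;	/* per-fs private data, freed on read-only remount */
	int active;	/* stands in for the DQUOT_* state flags */
};

static void disable_quota(struct quota_info *qi)
{
	free(qi->dqi_priv);
	qi->dqi_priv = NULL;	/* the missing assignment from the bug report */
	qi->active = 0;
}

static int get_next_id(struct quota_info *qi)
{
	if (!qi->active)	/* check state, never dereference a stale pointer */
		return -1;	/* -ESRCH in the kernel code */
	/* ... would use qi->dqi_priv here ... */
	return 0;
}

int main(void)
{
	struct quota_info qi = { .dqi_priv = malloc(32), .active = 1 };

	disable_quota(&qi);
	printf("get_next_id after ro remount: %d\n", get_next_id(&qi));
	return 0;
}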
[akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20241218023924.22821-2-dennis.lamerice@gmail.com Fixes: 8f9e8f5fcc05 ("ocfs2: Fix Q_GETNEXTQUOTA for filesystem without quotas") Signed-off-by: Dennis Lam Reported-by: syzbot+d173bf8a5a7faeede34c@syzkaller.appspotmail.com Tested-by: syzbot+d173bf8a5a7faeede34c@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6731d26f.050a0220.1fb99c.014b.GAE@google.com/T/ Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/quota_global.c | 2 +- fs/ocfs2/quota_local.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 2b0daced98eb..3404e7a30c33 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -893,7 +893,7 @@ static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid) int status = 0; trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type); - if (!sb_has_quota_loaded(sb, type)) { + if (!sb_has_quota_active(sb, type)) { status = -ESRCH; goto out; } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 73d3367c533b..2956d888c131 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -867,6 +867,7 @@ out: brelse(oinfo->dqi_libh); brelse(oinfo->dqi_lqi_bh); kfree(oinfo); + info->dqi_priv = NULL; return status; } -- cgit v1.2.3 From fee873761bd978d077d8c55334b4966ac4cb7b59 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Fri, 13 Dec 2024 13:08:37 +0800 Subject: exfat: fix the infinite loop in exfat_readdir() If the file system is corrupted so that a cluster is linked to itself in the cluster chain, and there is an unused directory entry in the cluster, 'dentry' will not be incremented, causing condition 'dentry < max_dentries' unable to prevent an infinite loop. This infinite loop causes s_lock not to be released, and other tasks will hang, such as exfat_sync_fs(). This commit stops traversing the cluster chain when there is unused directory entry in the cluster to avoid this infinite loop. Reported-by: syzbot+205c2644abdff9d3f9fc@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=205c2644abdff9d3f9fc Tested-by: syzbot+205c2644abdff9d3f9fc@syzkaller.appspotmail.com Fixes: ca06197382bd ("exfat: add directory operations") Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index fe0a9b8a0cd0..3103b932b674 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -122,7 +122,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent type = exfat_get_entry_type(ep); if (type == TYPE_UNUSED) { brelse(bh); - break; + goto out; } if (type != TYPE_FILE && type != TYPE_DIR) { @@ -170,6 +170,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent } } +out: dir_entry->namebuf.lfn[0] = '\0'; *cpos = EXFAT_DEN_TO_B(dentry); return 0; -- cgit v1.2.3 From 98e2fb26d1a9eafe79f46d15d54e68e014d81d8c Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 12 Dec 2024 16:29:23 +0800 Subject: exfat: fix the new buffer was not zeroed before writing Before writing, if a buffer_head marked as new, its data must be zeroed, otherwise uninitialized data in the page cache will be written. 
So this commit uses folio_zero_new_buffers() to zero the new buffers before ->write_end(). Fixes: 6630ea49103c ("exfat: move extend valid_size into ->page_mkwrite()") Reported-by: syzbot+91ae49e1c1a2634d20c0@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=91ae49e1c1a2634d20c0 Tested-by: syzbot+91ae49e1c1a2634d20c0@syzkaller.appspotmail.com Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/file.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/exfat/file.c b/fs/exfat/file.c index fb38769c3e39..05b51e721783 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -545,6 +545,7 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) while (pos < new_valid_size) { u32 len; struct folio *folio; + unsigned long off; len = PAGE_SIZE - (pos & (PAGE_SIZE - 1)); if (pos + len > new_valid_size) @@ -554,6 +555,9 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) if (err) goto out; + off = offset_in_folio(folio, pos); + folio_zero_new_buffers(folio, off, off + len); + err = ops->write_end(file, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; @@ -563,6 +567,8 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) cond_resched(); } + return 0; + out: return err; } -- cgit v1.2.3 From a5324b3a488d883aa2d42f72260054e87d0940a0 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Mon, 16 Dec 2024 13:39:42 +0800 Subject: exfat: fix the infinite loop in __exfat_free_cluster() In __exfat_free_cluster(), the cluster chain is traversed until the EOF cluster. If the cluster chain includes a loop due to file system corruption, the EOF cluster cannot be traversed, resulting in an infinite loop. This commit uses the total number of clusters to prevent this infinite loop. Reported-by: syzbot+1de5a37cb85a2d536330@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=1de5a37cb85a2d536330 Tested-by: syzbot+1de5a37cb85a2d536330@syzkaller.appspotmail.com Fixes: 31023864e67a ("exfat: add fat entry operations") Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/fatent.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 773c320d68f3..9e5492ac409b 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -216,6 +216,16 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain if (err) goto dec_used_clus; + + if (num_clusters >= sbi->num_clusters - EXFAT_FIRST_CLUSTER) { + /* + * The cluster chain includes a loop, scan the + * bitmap to get the number of used clusters. + */ + exfat_count_used_clusters(sb, &sbi->used_clusters); + + return 0; + } } while (clu != EXFAT_EOF_CLUSTER); } -- cgit v1.2.3 From 03f275adb8fbd7b4ebe96a1ad5044d8e602692dc Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 1 Jan 2025 14:00:37 +0100 Subject: fuse: respect FOPEN_KEEP_CACHE on opendir The re-factoring of fuse_dir_open() missed the need to invalidate directory inode page cache with open flag FOPEN_KEEP_CACHE. 
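For illustration only: the loop-bounding idea from the __exfat_free_cluster() fix above, in a self-contained form. No valid cluster chain can be longer than the total number of clusters, so counting steps against that total turns a corrupted, cyclic chain into a detectable error. Names below are invented, not exfat code.

#include <stdbool.h>
#include <stdio.h>

#define EOF_CLUSTER 0xFFFFFFFFu

/* Walk a FAT-style chain; return false if it must contain a loop. */
static bool walk_chain(const unsigned *next, unsigned nr_clusters, unsigned start)
{
	unsigned steps = 0;

	for (unsigned clu = start; clu != EOF_CLUSTER; clu = next[clu]) {
		if (++steps > nr_clusters)	/* longer than every cluster once: a cycle */
			return false;
	}
	return true;
}

int main(void)
{
	/* 0 -> 1 -> 2 -> EOF is fine; 3 -> 4 -> 3 loops forever. */
	unsigned next[] = { 1, 2, EOF_CLUSTER, 4, 3 };
	unsigned nr = sizeof(next) / sizeof(next[0]);

	printf("chain at 0 ok: %d\n", walk_chain(next, nr, 0));
	printf("chain at 3 ok: %d\n", walk_chain(next, nr, 3));
	return 0;
}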
Fixes: 7de64d521bf92 ("fuse: break up fuse_open_common()") Reported-by: Prince Kumar Closes: https://lore.kernel.org/linux-fsdevel/CAEW=TRr7CYb4LtsvQPLj-zx5Y+EYBmGfM24SuzwyDoGVNoKm7w@mail.gmail.com/ Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250101130037.96680-1-amir73il@gmail.com Reviewed-by: Bernd Schubert Signed-off-by: Christian Brauner --- fs/fuse/dir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 494ac372ace0..e540d05549ff 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1681,6 +1681,8 @@ static int fuse_dir_open(struct inode *inode, struct file *file) */ if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) nonseekable_open(inode, file); + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); } return err; -- cgit v1.2.3 From ab251dacfbae28772c897f068a4184f478189ff2 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 2 Jan 2025 09:22:56 +0100 Subject: fs/proc: do_task_stat: Fix ESP not readable during coredump The field "eip" (instruction pointer) and "esp" (stack pointer) of a task can be read from /proc/PID/stat. These fields can be interesting for coredump. However, these fields were disabled by commit 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in /proc/PID/stat"), because it is generally unsafe to do so. But it is safe for a coredumping process, and therefore exceptions were made: - for a coredumping thread by commit fd7d56270b52 ("fs/proc: Report eip/esp in /prod/PID/stat for coredumping"). - for all other threads in a coredumping process by commit cb8f381f1613 ("fs/proc/array.c: allow reporting eip/esp for all coredumping threads"). The above two commits check the PF_DUMPCORE flag to determine a coredump thread and the PF_EXITING flag for the other threads. Unfortunately, commit 92307383082d ("coredump: Don't perform any cleanups before dumping core") moved coredump to happen earlier and before PF_EXITING is set. Thus, checking PF_EXITING is no longer the correct way to determine threads in a coredumping process. Instead of PF_EXITING, use PF_POSTCOREDUMP to determine the other threads. Checking of PF_EXITING was added for coredumping, so it probably can now be removed. But it doesn't hurt to keep. Fixes: 92307383082d ("coredump: Don't perform any cleanups before dumping core") Cc: stable@vger.kernel.org Cc: Eric W. Biederman Acked-by: Oleg Nesterov Acked-by: Kees Cook Signed-off-by: Nam Cao Link: https://lore.kernel.org/r/d89af63d478d6c64cc46a01420b46fd6eb147d6f.1735805772.git.namcao@linutronix.de Signed-off-by: Christian Brauner --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 55ed3510d2bb..d6a0369caa93 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -500,7 +500,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * a program is not able to use ptrace(2) in that case. It is * safe because the task has stopped executing permanently. 
*/ - if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) { + if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE|PF_POSTCOREDUMP))) { if (try_get_task_stack(task)) { eip = KSTK_EIP(task); esp = KSTK_ESP(task); -- cgit v1.2.3 From aaec5a95d59615523db03dd53c2052f0a87beea7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Jan 2025 15:07:15 +0100 Subject: pipe_read: don't wake up the writer if the pipe is still full wake_up(pipe->wr_wait) makes no sense if pipe_full() is still true after the reading, the writer sleeping in wait_event(wr_wait, pipe_writable()) will check the pipe_writable() == !pipe_full() condition and sleep again. Only wake the writer if we actually released a pipe buf, and the pipe was full before we did so. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/all/20241229135737.GA3293@redhat.com/ Link: https://lore.kernel.org/r/20250102140715.GA7091@redhat.com Reported-by: WangYuli Signed-off-by: Christian Brauner --- fs/pipe.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 12b22c2723b7..82fede0f2111 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -253,7 +253,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; - bool was_full, wake_next_reader = false; + bool wake_writer = false, wake_next_reader = false; ssize_t ret; /* Null read succeeds. */ @@ -264,14 +264,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) mutex_lock(&pipe->mutex); /* - * We only wake up writers if the pipe was full when we started - * reading in order to avoid unnecessary wakeups. + * We only wake up writers if the pipe was full when we started reading + * and it is no longer full after reading to avoid unnecessary wakeups. * * But when we do wake up writers, we do so using a sync wakeup * (WF_SYNC), because we want them to get going and generate more * data for us. */ - was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { /* Read ->head with a barrier vs post_one_notification() */ unsigned int head = smp_load_acquire(&pipe->head); @@ -340,8 +339,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) buf->len = 0; } - if (!buf->len) + if (!buf->len) { + wake_writer |= pipe_full(head, tail, pipe->max_usage); tail = pipe_update_tail(pipe, buf, tail); + } total_len -= chars; if (!total_len) break; /* common path: read succeeded */ @@ -377,7 +378,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) * _very_ unlikely case that the pipe was full, but we got * no data. 
*/ - if (unlikely(was_full)) + if (unlikely(wake_writer)) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); @@ -390,15 +391,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - mutex_lock(&pipe->mutex); - was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); + wake_writer = false; wake_next_reader = true; + mutex_lock(&pipe->mutex); } if (pipe_empty(pipe->head, pipe->tail)) wake_next_reader = false; mutex_unlock(&pipe->mutex); - if (was_full) + if (wake_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); -- cgit v1.2.3 From 903dc9c43a155e0893280c7472d4a9a3a83d75a6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 28 Dec 2024 12:55:17 -0500 Subject: libfs: Return ENOSPC when the directory offset range is exhausted Testing shows that the EBUSY error return from mtree_alloc_cyclic() leaks into user space. The ERRORS section of "man creat(2)" says: > EBUSY O_EXCL was specified in flags and pathname refers > to a block device that is in use by the system > (e.g., it is mounted). ENOSPC is closer to what applications expect in this situation. Note that the normal range of simple directory offset values is 2..2^63, so hitting this error is going to be rare to impossible. Fixes: 6faddda69f62 ("libfs: Add directory operations for stable offsets") Cc: stable@vger.kernel.org # v6.9+ Reviewed-by: Jeff Layton Reviewed-by: Yang Erkun Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/20241228175522.1854234-2-cel@kernel.org Signed-off-by: Christian Brauner --- fs/libfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 748ac5923154..3da58a92f48f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -292,8 +292,8 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, LONG_MAX, &octx->next_offset, GFP_KERNEL); - if (ret < 0) - return ret; + if (unlikely(ret < 0)) + return ret == -EBUSY ? -ENOSPC : ret; offset_set(dentry, offset); return 0; -- cgit v1.2.3 From d7bde4f27ceef3dc6d72010a20d4da23db835a32 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 28 Dec 2024 12:55:18 -0500 Subject: Revert "libfs: Add simple_offset_empty()" simple_empty() and simple_offset_empty() perform the same task. The latter's use as a canary to find bugs has not found any new issues. A subsequent patch will remove the use of the mtree for iterating directory contents, so revert back to using a similar mechanism for determining whether a directory is indeed empty. Only one such mechanism is ever needed. 
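For illustration only: the error-translation idea from the ENOSPC patch above in a minimal userspace form, remapping an allocator-internal failure code at the filesystem boundary to the errno applications actually expect. Only the -EBUSY to -ENOSPC mapping comes from the patch; everything else below is invented for the sketch.

#include <errno.h>
#include <stdio.h>

/* Pretend allocator: returns 0 on success, -EBUSY when the range is exhausted. */
static int alloc_cyclic(int *out, int exhausted)
{
	if (exhausted)
		return -EBUSY;
	*out = 2;	/* models DIR_OFFSET_MIN, the first valid directory offset */
	return 0;
}

static int offset_add(int exhausted)
{
	int off, ret = alloc_cyclic(&off, exhausted);

	/* Translate the allocator-internal code to what creat(2) documents. */
	if (ret < 0)
		return ret == -EBUSY ? -ENOSPC : ret;
	return 0;
}

int main(void)
{
	printf("fresh dir: %d\n", offset_add(0));
	printf("exhausted: %d (-ENOSPC is %d)\n", offset_add(1), -ENOSPC);
	return 0;
}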
Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/20241228175522.1854234-3-cel@kernel.org Reviewed-by: Yang Erkun Signed-off-by: Christian Brauner --- fs/libfs.c | 32 -------------------------------- include/linux/fs.h | 1 - mm/shmem.c | 4 ++-- 3 files changed, 2 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 3da58a92f48f..8380d9314ebd 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -329,38 +329,6 @@ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) offset_set(dentry, 0); } -/** - * simple_offset_empty - Check if a dentry can be unlinked - * @dentry: dentry to be tested - * - * Returns 0 if @dentry is a non-empty directory; otherwise returns 1. - */ -int simple_offset_empty(struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - struct offset_ctx *octx; - struct dentry *child; - unsigned long index; - int ret = 1; - - if (!inode || !S_ISDIR(inode->i_mode)) - return ret; - - index = DIR_OFFSET_MIN; - octx = inode->i_op->get_offset_ctx(inode); - mt_for_each(&octx->mt, child, index, LONG_MAX) { - spin_lock(&child->d_lock); - if (simple_positive(child)) { - spin_unlock(&child->d_lock); - ret = 0; - break; - } - spin_unlock(&child->d_lock); - } - - return ret; -} - /** * simple_offset_rename - handle directory offsets for rename * @old_dir: parent directory of source entry diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..f7efc6866ebc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3468,7 +3468,6 @@ struct offset_ctx { void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); -int simple_offset_empty(struct dentry *dentry); int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int simple_offset_rename_exchange(struct inode *old_dir, diff --git a/mm/shmem.c b/mm/shmem.c index ccb9629a0f70..274c2666f457 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3818,7 +3818,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { - if (!simple_offset_empty(dentry)) + if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); @@ -3875,7 +3875,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); - if (!simple_offset_empty(new_dentry)) + if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { -- cgit v1.2.3 From b662d858131da9a8a14e68661656989b14dbf113 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 28 Dec 2024 12:55:19 -0500 Subject: Revert "libfs: fix infinite directory reads for offset dir" The current directory offset allocator (based on mtree_alloc_cyclic) stores the next offset value to return in octx->next_offset. This mechanism typically returns values that increase monotonically over time. Eventually, though, the newly allocated offset value wraps back to a low number (say, 2) which is smaller than other already- allocated offset values. Yu Kuai reports that, after commit 64a7ce76fb90 ("libfs: fix infinite directory reads for offset dir"), if a directory's offset allocator wraps, existing entries are no longer visible via readdir/getdents because offset_readdir() stops listing entries once an entry's offset is larger than octx->next_offset. 
These entries vanish persistently -- they can be looked up, but will never again appear in readdir(3) output. The reason for this is that the commit treats directory offsets as monotonically increasing integer values rather than opaque cookies, and introduces this comparison: if (dentry2offset(dentry) >= last_index) { On 64-bit platforms, the directory offset value upper bound is 2^63 - 1. Directory offsets will monotonically increase for millions of years without wrapping. On 32-bit platforms, however, LONG_MAX is 2^31 - 1. The allocator can wrap after only a few weeks (at worst). Revert commit 64a7ce76fb90 ("libfs: fix infinite directory reads for offset dir") to prepare for a fix that can work properly on 32-bit systems and might apply to recent LTS kernels where shmem employs the simple_offset mechanism. Reported-by: Yu Kuai Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/20241228175522.1854234-4-cel@kernel.org Reviewed-by: Yang Erkun Signed-off-by: Christian Brauner --- fs/libfs.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 8380d9314ebd..8c9364a0174c 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -422,14 +422,6 @@ void simple_offset_destroy(struct offset_ctx *octx) mtree_destroy(&octx->mt); } -static int offset_dir_open(struct inode *inode, struct file *file) -{ - struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); - - file->private_data = (void *)ctx->next_offset; - return 0; -} - /** * offset_dir_llseek - Advance the read position of a directory descriptor * @file: an open directory whose position is to be updated @@ -443,9 +435,6 @@ static int offset_dir_open(struct inode *inode, struct file *file) */ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) { - struct inode *inode = file->f_inode; - struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); - switch (whence) { case SEEK_CUR: offset += file->f_pos; @@ -459,8 +448,7 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) } /* In this case, ->private_data is protected by f_pos_lock */ - if (!offset) - file->private_data = (void *)ctx->next_offset; + file->private_data = NULL; return vfs_setpos(file, offset, LONG_MAX); } @@ -491,7 +479,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } -static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index) +static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) { struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *dentry; @@ -499,21 +487,17 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, lon while (true) { dentry = offset_find_next(octx, ctx->pos); if (!dentry) - return; - - if (dentry2offset(dentry) >= last_index) { - dput(dentry); - return; - } + return ERR_PTR(-ENOENT); if (!offset_dir_emit(ctx, dentry)) { dput(dentry); - return; + break; } ctx->pos = dentry2offset(dentry) + 1; dput(dentry); } + return NULL; } /** @@ -540,19 +524,22 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, lon static int offset_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; - long last_index = (long)file->private_data; lockdep_assert_held(&d_inode(dir)->i_rwsem); if (!dir_emit_dots(file, ctx)) return 0; - offset_iterate_dir(d_inode(dir), ctx, last_index); + /* In 
this case, ->private_data is protected by f_pos_lock */ + if (ctx->pos == DIR_OFFSET_MIN) + file->private_data = NULL; + else if (file->private_data == ERR_PTR(-ENOENT)) + return 0; + file->private_data = offset_iterate_dir(d_inode(dir), ctx); return 0; } const struct file_operations simple_offset_dir_operations = { - .open = offset_dir_open, .llseek = offset_dir_llseek, .iterate_shared = offset_readdir, .read = generic_read_dir, -- cgit v1.2.3 From 68a3a65003145644efcbb651e91db249ccd96281 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 28 Dec 2024 12:55:20 -0500 Subject: libfs: Replace simple_offset end-of-directory detection According to getdents(3), the d_off field in each returned directory entry points to the next entry in the directory. The d_off field in the last returned entry in the readdir buffer must contain a valid offset value, but if it points to an actual directory entry, then readdir/getdents can loop. This patch introduces a specific fixed offset value that is placed in the d_off field of the last entry in a directory. Some user space applications assume that the EOD offset value is larger than the offsets of real directory entries, so the largest valid offset value is reserved for this purpose. This new value is never allocated by simple_offset_add(). When ->iterate_dir() returns, getdents{64} inserts the ctx->pos value into the d_off field of the last valid entry in the readdir buffer. When it hits EOD, offset_readdir() sets ctx->pos to the EOD offset value so the last entry is updated to point to the EOD marker. When trying to read the entry at the EOD offset, offset_readdir() terminates immediately. It is worth noting that using a Maple tree for directory offset value allocation does not guarantee a 63-bit range of values -- on platforms where "long" is a 32-bit type, the directory offset value range is still 0..(2^31 - 1). For broad compatibility with 32-bit user space, the largest tmpfs directory cookie value is now S32_MAX. Fixes: 796432efab1e ("libfs: getdents() should return 0 after reaching EOD") Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/20241228175522.1854234-5-cel@kernel.org Signed-off-by: Christian Brauner --- fs/libfs.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 8c9364a0174c..47399f90511a 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -245,9 +245,15 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); -/* 0 is '.', 1 is '..', so always start with offset 2 or more */ +/* simple_offset_add() never assigns these to a dentry */ enum { - DIR_OFFSET_MIN = 2, + DIR_OFFSET_EOD = S32_MAX, +}; + +/* simple_offset_add() allocation range */ +enum { + DIR_OFFSET_MIN = 2, + DIR_OFFSET_MAX = DIR_OFFSET_EOD - 1, }; static void offset_set(struct dentry *dentry, long offset) @@ -291,7 +297,8 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) return -EBUSY; ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, - LONG_MAX, &octx->next_offset, GFP_KERNEL); + DIR_OFFSET_MAX, &octx->next_offset, + GFP_KERNEL); if (unlikely(ret < 0)) return ret == -EBUSY ? 
-ENOSPC : ret; @@ -447,8 +454,6 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) return -EINVAL; } - /* In this case, ->private_data is protected by f_pos_lock */ - file->private_data = NULL; return vfs_setpos(file, offset, LONG_MAX); } @@ -458,7 +463,7 @@ static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) struct dentry *child, *found = NULL; rcu_read_lock(); - child = mas_find(&mas, LONG_MAX); + child = mas_find(&mas, DIR_OFFSET_MAX); if (!child) goto out; spin_lock(&child->d_lock); @@ -479,7 +484,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } -static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx) { struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *dentry; @@ -487,7 +492,7 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) while (true) { dentry = offset_find_next(octx, ctx->pos); if (!dentry) - return ERR_PTR(-ENOENT); + goto out_eod; if (!offset_dir_emit(ctx, dentry)) { dput(dentry); @@ -497,7 +502,10 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) ctx->pos = dentry2offset(dentry) + 1; dput(dentry); } - return NULL; + return; + +out_eod: + ctx->pos = DIR_OFFSET_EOD; } /** @@ -517,6 +525,8 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) * * On return, @ctx->pos contains an offset that will read the next entry * in this directory when offset_readdir() is called again with @ctx. + * Caller places this value in the d_off field of the last entry in the + * user's buffer. * * Return values: * %0 - Complete @@ -529,13 +539,8 @@ static int offset_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - - /* In this case, ->private_data is protected by f_pos_lock */ - if (ctx->pos == DIR_OFFSET_MIN) - file->private_data = NULL; - else if (file->private_data == ERR_PTR(-ENOENT)) - return 0; - file->private_data = offset_iterate_dir(d_inode(dir), ctx); + if (ctx->pos != DIR_OFFSET_EOD) + offset_iterate_dir(d_inode(dir), ctx); return 0; } -- cgit v1.2.3 From b9b588f22a0c049a14885399e27625635ae6ef91 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 28 Dec 2024 12:55:21 -0500 Subject: libfs: Use d_children list to iterate simple_offset directories The mtree mechanism has been effective at creating directory offsets that are stable over multiple opendir instances. However, it has not been able to handle the subtleties of renames that are concurrent with readdir. Instead of using the mtree to emit entries in the order of their offset values, use it only to map incoming ctx->pos to a starting entry. Then use the directory's d_children list, which is already maintained properly by the dcache, to find the next child to emit. One of the sneaky things about this is that when the mtree-allocated offset value wraps (which is very rare), looking up ctx->pos++ is not going to find the next entry; it will return NULL. Instead, by following the d_children list, the offset values can appear in any order but all of the entries in the directory will be visited eventually. Note also that the readdir() is guaranteed to reach the tail of this list. Entries are added only at the head of d_children, and readdir walks from its current position in that list towards its tail. 
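One detail worth noting in the hunks below: offset_dir_emit() now calls
dir_emit() instead of invoking ctx->actor() with an explicitly computed
offset. dir_emit() takes the offset from ctx->pos, roughly:

	static inline bool dir_emit(struct dir_context *ctx,
				    const char *name, int namelen,
				    u64 ino, unsigned type)
	{
		return ctx->actor(ctx, name, namelen, ctx->pos, ino, type);
	}

so the iterator must assign ctx->pos = dentry2offset(dentry) before emitting
each entry, which is exactly what the reworked offset_iterate_dir() loop does.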
Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/20241228175522.1854234-6-cel@kernel.org Signed-off-by: Christian Brauner --- fs/libfs.c | 84 +++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 47399f90511a..279442b1fe96 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -247,12 +247,13 @@ EXPORT_SYMBOL(simple_dir_inode_operations); /* simple_offset_add() never assigns these to a dentry */ enum { + DIR_OFFSET_FIRST = 2, /* Find first real entry */ DIR_OFFSET_EOD = S32_MAX, }; /* simple_offset_add() allocation range */ enum { - DIR_OFFSET_MIN = 2, + DIR_OFFSET_MIN = DIR_OFFSET_FIRST + 1, DIR_OFFSET_MAX = DIR_OFFSET_EOD - 1, }; @@ -457,51 +458,82 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, offset, LONG_MAX); } -static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) +static struct dentry *find_positive_dentry(struct dentry *parent, + struct dentry *dentry, + bool next) { - MA_STATE(mas, &octx->mt, offset, offset); + struct dentry *found = NULL; + + spin_lock(&parent->d_lock); + if (next) + dentry = d_next_sibling(dentry); + else if (!dentry) + dentry = d_first_child(parent); + hlist_for_each_entry_from(dentry, d_sib) { + if (!simple_positive(dentry)) + continue; + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(dentry)) + found = dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + if (likely(found)) + break; + } + spin_unlock(&parent->d_lock); + return found; +} + +static noinline_for_stack struct dentry * +offset_dir_lookup(struct dentry *parent, loff_t offset) +{ + struct inode *inode = d_inode(parent); + struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *child, *found = NULL; - rcu_read_lock(); - child = mas_find(&mas, DIR_OFFSET_MAX); - if (!child) - goto out; - spin_lock(&child->d_lock); - if (simple_positive(child)) - found = dget_dlock(child); - spin_unlock(&child->d_lock); -out: - rcu_read_unlock(); + MA_STATE(mas, &octx->mt, offset, offset); + + if (offset == DIR_OFFSET_FIRST) + found = find_positive_dentry(parent, NULL, false); + else { + rcu_read_lock(); + child = mas_find(&mas, DIR_OFFSET_MAX); + found = find_positive_dentry(parent, child, false); + rcu_read_unlock(); + } return found; } static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - long offset = dentry2offset(dentry); - return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, - inode->i_ino, fs_umode_to_dtype(inode->i_mode)); + return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, + inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } -static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx) +static void offset_iterate_dir(struct file *file, struct dir_context *ctx) { - struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); + struct dentry *dir = file->f_path.dentry; struct dentry *dentry; + dentry = offset_dir_lookup(dir, ctx->pos); + if (!dentry) + goto out_eod; while (true) { - dentry = offset_find_next(octx, ctx->pos); - if (!dentry) - goto out_eod; + struct dentry *next; - if (!offset_dir_emit(ctx, dentry)) { - dput(dentry); + ctx->pos = dentry2offset(dentry); + if (!offset_dir_emit(ctx, dentry)) break; - } - ctx->pos = dentry2offset(dentry) + 1; + next = find_positive_dentry(dir, dentry, true); dput(dentry); + + if (!next) + goto out_eod; + 
dentry = next; } + dput(dentry); return; out_eod: @@ -540,7 +572,7 @@ static int offset_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos != DIR_OFFSET_EOD) - offset_iterate_dir(d_inode(dir), ctx); + offset_iterate_dir(file, ctx); return 0; } -- cgit v1.2.3 From 4c16e1cadcbcaf3c82d5fc310fbd34d0f5d0db7c Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Mon, 23 Dec 2024 23:30:50 +0800 Subject: ksmbd: fix a missing return value check bug In the smb2_send_interim_resp(), if ksmbd_alloc_work_struct() fails to allocate a node, it returns a NULL pointer to the in_work pointer. This can lead to an illegal memory write of in_work->response_buf when allocate_interim_rsp_buf() attempts to perform a kzalloc() on it. To address this issue, incorporating a check for the return value of ksmbd_alloc_work_struct() ensures that the function returns immediately upon allocation failure, thereby preventing the aforementioned illegal memory access. Fixes: 041bba4414cd ("ksmbd: fix wrong interim response on compound") Signed-off-by: Wentao Liang Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 23e21845f928..433e33c04039 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -695,6 +695,9 @@ void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status) struct smb2_hdr *rsp_hdr; struct ksmbd_work *in_work = ksmbd_alloc_work_struct(); + if (!in_work) + return; + if (allocate_interim_rsp_buf(in_work)) { pr_err("smb_allocate_rsp_buf failed!\n"); ksmbd_free_work_struct(in_work); -- cgit v1.2.3 From c7f3cd1b245dbdd846ae376cc022c22af8059717 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 30 Dec 2024 13:44:56 +0100 Subject: ksmbd: Remove unneeded if check in ksmbd_rdma_capable_netdev() Remove the unnecessary if check and assign the result directly. Signed-off-by: Thorsten Blum Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 0ef3c9f0bfeb..c3785a5434f9 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -2283,8 +2283,7 @@ out: ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); if (ibdev) { - if (rdma_frwr_is_supported(&ibdev->attrs)) - rdma_capable = true; + rdma_capable = rdma_frwr_is_supported(&ibdev->attrs); ib_device_put(ibdev); } } -- cgit v1.2.3 From 0c0214df28f0dba8de084cb4dedc0c459dfbc083 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 6 Jan 2025 12:08:42 +0100 Subject: fanotify: Fix crash in fanotify_init(2) The rrror handling in fanotify_init(2) is buggy and overwrites 'fd' before calling put_unused_fd() leading to possible access beyond the end of fd bitmap. Fix it. 
Reported-by: syzbot+6a3aa63412255587b21b@syzkaller.appspotmail.com Fixes: ebe559609d78 ("fs: get rid of __FMODE_NONOTIFY kludge") Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 19435cd2c41f..6ff94e312232 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1624,8 +1624,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group, f_flags, FMODE_NONOTIFY); if (IS_ERR(file)) { - fd = PTR_ERR(file); put_unused_fd(fd); + fd = PTR_ERR(file); goto out_destroy_group; } fd_install(fd, file); -- cgit v1.2.3 From c21b89d495bab6ae7ce0a1592bb955e5e80127fd Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Fri, 3 Jan 2025 15:02:26 +0000 Subject: btrfs: don't read from userspace twice in btrfs_uring_encoded_read() If we return -EAGAIN the first time because we need to block, btrfs_uring_encoded_read() will get called twice. Take a copy of args, the iovs, and the iter the first time, as by the time we are called the second time these may have gone out of scope. Reported-by: Jens Axboe Fixes: 34310c442e17 ("btrfs: add io_uring command for encoded reads (ENCODED_READ ioctl)") Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 122 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 65 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f8680e7cc974..149259180faa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4879,25 +4879,29 @@ out_fail: return ret; } +struct btrfs_uring_encoded_data { + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov; + struct iov_iter iter; +}; + static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) { size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; - struct btrfs_ioctl_encoded_io_args args = { 0 }; int ret; u64 disk_bytenr, disk_io_size; struct file *file; struct btrfs_inode *inode; struct btrfs_fs_info *fs_info; struct extent_io_tree *io_tree; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; loff_t pos; struct kiocb kiocb; struct extent_state *cached_state = NULL; u64 start, lockend; void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4911,43 +4915,64 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) - struct btrfs_ioctl_encoded_io_args_32 args32; - copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); - if (copy_from_user(&args32, sqe_addr, copy_end)) { - ret = -EFAULT; - goto out_acct; - } - args.iov = compat_ptr(args32.iov); - args.iovcnt = args32.iovcnt; - args.offset = args32.offset; - args.flags = args32.flags; #else return -ENOTTY; #endif } else { copy_end = copy_end_kernel; - if (copy_from_user(&args, sqe_addr, copy_end)) { - ret = -EFAULT; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; goto out_acct; } - } - if (args.flags != 0) - return -EINVAL; + io_uring_cmd_get_async_data(cmd)->op_data = data; 
- ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), - &iov, &iter); - if (ret < 0) - goto out_acct; + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; - if (iov_iter_count(&iter) == 0) { - ret = 0; - goto out_free; + if (copy_from_user(&args32, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if (data->args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + data->iov = data->iovstack; + ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_free; + } } - pos = args.offset; - ret = rw_verify_area(READ, file, &pos, args.len); + pos = data->args.offset; + ret = rw_verify_area(READ, file, &pos, data->args.len); if (ret < 0) goto out_free; @@ -4960,15 +4985,16 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue start = ALIGN_DOWN(pos, fs_info->sectorsize); lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, &disk_bytenr, &disk_io_size); if (ret < 0 && ret != -EIOCBQUEUED) goto out_free; file_accessed(file); - if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel, - sizeof(args) - copy_end_kernel)) { + if (copy_to_user(sqe_addr + copy_end, + (const char *)&data->args + copy_end_kernel, + sizeof(data->args) - copy_end_kernel)) { if (ret == -EIOCBQUEUED) { unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -4978,40 +5004,22 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue } if (ret == -EIOCBQUEUED) { - u64 count; - - /* - * If we've optimized things by storing the iovecs on the stack, - * undo this. - */ - if (!iov) { - iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS); - if (!iov) { - unlock_extent(io_tree, start, lockend, &cached_state); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - ret = -ENOMEM; - goto out_acct; - } - - memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt); - } - - count = min_t(u64, iov_iter_count(&iter), disk_io_size); + u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size); /* Match ioctl by not returning past EOF if uncompressed. 
*/ - if (!args.compression) - count = min_t(u64, count, args.len); + if (!data->args.compression) + count = min_t(u64, count, data->args.len); - ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, - cached_state, disk_bytenr, - disk_io_size, count, - args.compression, iov, cmd); + ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend, + cached_state, disk_bytenr, disk_io_size, + count, data->args.compression, + data->iov, cmd); goto out_acct; } out_free: - kfree(iov); + kfree(data->iov); out_acct: if (ret > 0) -- cgit v1.2.3 From 19d97ac5aabec2e253fd40d110ecbe326040d917 Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Fri, 22 Nov 2024 10:18:25 +0800 Subject: nfsd: trace: remove redundant stateid even deleg_recall Since commit e56dc9e2949e ("nfsd: remove fault injection code") remove all nfsd_recall_delegations codes, we don't need trace_nfsd_deleg_recall any more. Signed-off-by: Chen Hanxiao Signed-off-by: Chuck Lever --- fs/nfsd/trace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 696c89f68a9e..ad2c0c432d08 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -626,7 +626,6 @@ DEFINE_STATEID_EVENT(open); DEFINE_STATEID_EVENT(deleg_read); DEFINE_STATEID_EVENT(deleg_write); DEFINE_STATEID_EVENT(deleg_return); -DEFINE_STATEID_EVENT(deleg_recall); DECLARE_EVENT_CLASS(nfsd_stateseqid_class, TP_PROTO(u32 seqid, const stateid_t *stp), -- cgit v1.2.3 From 935fee5d5ba8073b223a9cc9906a62950f0e13ed Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Dec 2024 13:55:52 +1100 Subject: nfsd: use new wake_up_var interfaces. The wake_up_var interface is fragile as barriers are sometimes needed. There are now new interfaces so that most wake-ups can use an interface that is guaranteed to have all barriers needed. This patch changes the wake up on cl_cb_inflight to use atomic_dec_and_wake_up(). It also changes the wake up on rp_locked to use store_release_wake_up(). This involves changing rp_locked from atomic_t to int. 
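The fragility being fixed is the classic lost wake-up: wake_up_var() only
wakes threads already queued on the variable's waitqueue, so the store that
satisfies the wait condition must be ordered before the waker looks at that
queue. The new helpers bundle the store (or decrement), the required barrier
and the wake-up. A minimal sketch of the rp_locked pairing after this patch
(call sites only; the helpers are assumed to provide the ordering internally):

	/* waiter: claim the replay "lock", or see that the owner is unhashed */
	wait_var_event(&so->so_replay.rp_locked,
		       cmpxchg(&so->so_replay.rp_locked,
			       RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);

	/* waker: release it and wake any waiter in one call */
	store_release_wake_up(&so->so_replay.rp_locked, RP_UNLOCKED);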
Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 3 +-- fs/nfsd/nfs4state.c | 16 ++++++---------- fs/nfsd/state.h | 2 +- 3 files changed, 8 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c083e539e898..94479483c3d6 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1036,8 +1036,7 @@ static void nfsd41_cb_inflight_begin(struct nfs4_client *clp) static void nfsd41_cb_inflight_end(struct nfs4_client *clp) { - if (atomic_dec_and_test(&clp->cl_cb_inflight)) - wake_up_var(&clp->cl_cb_inflight); + atomic_dec_and_wake_up(&clp->cl_cb_inflight); } static void nfsd41_cb_inflight_wait_complete(struct nfs4_client *clp) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 741b9449f727..9fbf7c8f0a3e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4739,7 +4739,7 @@ static void init_nfs4_replay(struct nfs4_replay *rp) rp->rp_status = nfserr_serverfault; rp->rp_buflen = 0; rp->rp_buf = rp->rp_ibuf; - atomic_set(&rp->rp_locked, RP_UNLOCKED); + rp->rp_locked = RP_UNLOCKED; } static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate, @@ -4747,9 +4747,9 @@ static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate, { if (!nfsd4_has_session(cstate)) { wait_var_event(&so->so_replay.rp_locked, - atomic_cmpxchg(&so->so_replay.rp_locked, - RP_UNLOCKED, RP_LOCKED) != RP_LOCKED); - if (atomic_read(&so->so_replay.rp_locked) == RP_UNHASHED) + cmpxchg(&so->so_replay.rp_locked, + RP_UNLOCKED, RP_LOCKED) != RP_LOCKED); + if (so->so_replay.rp_locked == RP_UNHASHED) return -EAGAIN; cstate->replay_owner = nfs4_get_stateowner(so); } @@ -4762,9 +4762,7 @@ void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate) if (so != NULL) { cstate->replay_owner = NULL; - atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED); - smp_mb__after_atomic(); - wake_up_var(&so->so_replay.rp_locked); + store_release_wake_up(&so->so_replay.rp_locked, RP_UNLOCKED); nfs4_put_stateowner(so); } } @@ -5069,9 +5067,7 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) * Some threads with a reference might be waiting for rp_locked, * so tell them to stop waiting. */ - atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED); - smp_mb__after_atomic(); - wake_up_var(&oo->oo_owner.so_replay.rp_locked); + store_release_wake_up(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED); wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2); release_all_access(s); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e16bb3717fb9..ba30b2335b66 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -505,7 +505,7 @@ struct nfs4_replay { unsigned int rp_buflen; char *rp_buf; struct knfsd_fh rp_openfh; - atomic_t rp_locked; + int rp_locked; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; -- cgit v1.2.3 From 6f035c99acd6ef6b875ac4ac28e6117e60db8f89 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 6 Dec 2024 16:36:33 -0500 Subject: NFSD: Clean up unused variable @sb should have been removed by commit 7e64c5bc497c ("NLM/NFSD: Fix lock notifications for async-capable filesystems"). 
Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9fbf7c8f0a3e..b9d0a300d376 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -7962,7 +7962,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; - struct super_block *sb; __be32 status = 0; int lkflg; int err; @@ -7982,7 +7981,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); if (status != nfs_ok) return status; - sb = cstate->current_fh.fh_dentry->d_sb; if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) -- cgit v1.2.3 From de71d4e211eddb670b285a0ea477a299601ce1ca Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Tue, 10 Dec 2024 07:25:54 -0500 Subject: nfsd: fix legacy client tracking initialization Get rid of the nfsd4_legacy_tracking_ops->init() call in check_for_legacy_methods(). That will be handled in the caller (nfsd4_client_tracking_init()). Otherwise, we'll wind up calling nfsd4_legacy_tracking_ops->init() twice, and the second time we'll trigger the BUG_ON() in nfsd4_init_recdir(). Fixes: 74fd48739d04 ("nfsd: new Kconfig option for legacy client tracking") Reported-by: Jur van der Burg Link: https://bugzilla.kernel.org/show_bug.cgi?id=219580 Signed-off-by: Scott Mayhew Reviewed-by: Jeff Layton Tested-by: Salvatore Bonaccorso Signed-off-by: Chuck Lever --- fs/nfsd/nfs4recover.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 4a765555bf84..1c8fcb04b3cd 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -2052,7 +2052,6 @@ static inline int check_for_legacy_methods(int status, struct net *net) path_put(&path); if (status) return -ENOTDIR; - status = nn->client_tracking_ops->init(net); } return status; } -- cgit v1.2.3 From eccbbc7c00a5aae5e704d4002adfaf4c3fa4b30d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 Dec 2024 11:41:26 +1100 Subject: nfsd: don't use sv_nrthreads in connection limiting calculations. The heuristic for limiting the number of incoming connections to nfsd currently uses sv_nrthreads - allowing more connections if more threads were configured. A future patch will allow number of threads to grow dynamically so that there will be no need to configure sv_nrthreads. So we need a different solution for limiting connections. It isn't clear what problem is solved by limiting connections (as mentioned in a code comment) but the most likely problem is a connection storm - many connections that are not doing productive work. These will be closed after about 6 minutes already but it might help to slow down a storm. This patch adds a per-connection flag XPT_PEER_VALID which indicates that the peer has presented a filehandle for which it has some sort of access. i.e the peer is known to be trusted in some way. We now only count connections which have NOT been determined to be valid. There should be relative few of these at any given time. If the number of non-validated peer exceed a limit - currently 64 - we close the oldest non-validated peer to avoid having too many of these useless connections. Note that this patch significantly changes the meaning of the various configuration parameters for "max connections". 
The next patch will remove all of these. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfs/callback.c | 4 ---- fs/nfs/callback_xdr.c | 1 + fs/nfsd/netns.h | 4 ++-- fs/nfsd/nfsfh.c | 2 ++ include/linux/sunrpc/svc.h | 2 +- include/linux/sunrpc/svc_xprt.h | 16 ++++++++++++++++ net/sunrpc/svc_xprt.c | 32 ++++++++++++++++---------------- 7 files changed, 38 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 6cf92498a5ac..86bdc7d23fb9 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -211,10 +211,6 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) return ERR_PTR(-ENOMEM); } cb_info->serv = serv; - /* As there is only one thread we need to over-ride the - * default maximum of 80 connections - */ - serv->sv_maxconn = 1024; dprintk("nfs_callback_create_svc: service created\n"); return serv; } diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index fdeb0b34a3d3..4254ba3ee7c5 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -984,6 +984,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) nfs_put_client(cps.clp); goto out_invalidcred; } + svc_xprt_set_valid(rqstp->rq_xprt); } cps.minorversion = hdr_arg.minorversion; diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 26f7b34d1a03..a05a45bb1978 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -129,8 +129,8 @@ struct nfsd_net { unsigned char writeverf[8]; /* - * Max number of connections this nfsd container will allow. Defaults - * to '0' which is means that it bases this on the number of threads. + * Max number of non-validated connections this nfsd container + * will allow. Defaults to '0' gets mapped to 64. */ unsigned int max_connections; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 6a831cb242df..bf59f83c6224 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -382,6 +382,8 @@ __fh_verify(struct svc_rqst *rqstp, if (error) goto out; + svc_xprt_set_valid(rqstp->rq_xprt); + /* Finally, check access permissions. */ error = nfsd_permission(cred, exp, dentry, access); out: diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e4f09f58d58c..4f9418cbf8c9 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -81,7 +81,7 @@ struct svc_serv { unsigned int sv_xdrsize; /* XDR buffer size */ struct list_head sv_permsocks; /* all permanent sockets */ struct list_head sv_tempsocks; /* all temporary sockets */ - int sv_tmpcnt; /* count of temporary sockets */ + int sv_tmpcnt; /* count of temporary "valid" sockets */ struct timer_list sv_temptimer; /* timer for aging temporary sockets */ char * sv_name; /* service name */ diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 0981e35a9fed..7064ebbd550b 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -99,8 +99,24 @@ enum { XPT_HANDSHAKE, /* xprt requests a handshake */ XPT_TLS_SESSION, /* transport-layer security established */ XPT_PEER_AUTH, /* peer has been authenticated */ + XPT_PEER_VALID, /* peer has presented a filehandle that + * it has access to. It is NOT counted + * in ->sv_tmpcnt. 
+ */ }; +static inline void svc_xprt_set_valid(struct svc_xprt *xpt) +{ + if (test_bit(XPT_TEMP, &xpt->xpt_flags) && + !test_and_set_bit(XPT_PEER_VALID, &xpt->xpt_flags)) { + struct svc_serv *serv = xpt->xpt_server; + + spin_lock(&serv->sv_lock); + serv->sv_tmpcnt -= 1; + spin_unlock(&serv->sv_lock); + } +} + static inline void unregister_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u) { spin_lock(&xpt->xpt_lock); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 43c57124de52..dbd96b295dfa 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -606,7 +606,8 @@ int svc_port_is_privileged(struct sockaddr *sin) } /* - * Make sure that we don't have too many active connections. If we have, + * Make sure that we don't have too many connections that have not yet + * demonstrated that they have access to the NFS server. If we have, * something must be dropped. It's not clear what will happen if we allow * "too many" connections, but when dealing with network-facing software, * we have to code defensively. Here we do that by imposing hard limits. @@ -625,27 +626,25 @@ int svc_port_is_privileged(struct sockaddr *sin) */ static void svc_check_conn_limits(struct svc_serv *serv) { - unsigned int limit = serv->sv_maxconn ? serv->sv_maxconn : - (serv->sv_nrthreads+3) * 20; + unsigned int limit = serv->sv_maxconn ? serv->sv_maxconn : 64; if (serv->sv_tmpcnt > limit) { - struct svc_xprt *xprt = NULL; + struct svc_xprt *xprt = NULL, *xprti; spin_lock_bh(&serv->sv_lock); if (!list_empty(&serv->sv_tempsocks)) { - /* Try to help the admin */ - net_notice_ratelimited("%s: too many open connections, consider increasing the %s\n", - serv->sv_name, serv->sv_maxconn ? - "max number of connections" : - "number of threads"); /* * Always select the oldest connection. It's not fair, - * but so is life + * but nor is life. */ - xprt = list_entry(serv->sv_tempsocks.prev, - struct svc_xprt, - xpt_list); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_get(xprt); + list_for_each_entry_reverse(xprti, &serv->sv_tempsocks, + xpt_list) { + if (!test_bit(XPT_PEER_VALID, &xprti->xpt_flags)) { + xprt = xprti; + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_get(xprt); + break; + } + } } spin_unlock_bh(&serv->sv_lock); @@ -1039,7 +1038,8 @@ static void svc_delete_xprt(struct svc_xprt *xprt) spin_lock_bh(&serv->sv_lock); list_del_init(&xprt->xpt_list); - if (test_bit(XPT_TEMP, &xprt->xpt_flags)) + if (test_bit(XPT_TEMP, &xprt->xpt_flags) && + !test_bit(XPT_PEER_VALID, &xprt->xpt_flags)) serv->sv_tmpcnt--; spin_unlock_bh(&serv->sv_lock); -- cgit v1.2.3 From a4b853f183a19a88ad635f9ae8ba97e7cb377a23 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 Dec 2024 11:41:27 +1100 Subject: sunrpc: remove all connection limit configuration Now that the connection limit only apply to unconfirmed connections, there is no need to configure it. 
So remove all the configuration and fix the number of unconfirmed connections as always 64 - which is now given a name: XPT_MAX_TMP_CONN Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 8 -------- fs/nfsd/netns.h | 6 ------ fs/nfsd/nfsctl.c | 42 ----------------------------------------- fs/nfsd/nfssvc.c | 5 ----- include/linux/sunrpc/svc.h | 4 ---- include/linux/sunrpc/svc_xprt.h | 6 ++++++ net/sunrpc/svc_xprt.c | 8 +------- 7 files changed, 7 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 4ec22c2f2ea3..7ded57ec3a60 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -70,9 +70,6 @@ static unsigned long nlm_grace_period; unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; static int nlm_udpport, nlm_tcpport; -/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */ -static unsigned int nlm_max_connections = 1024; - /* * Constants needed for the sysctl interface. */ @@ -136,9 +133,6 @@ lockd(void *vrqstp) * NFS mount or NFS daemon has gone away. */ while (!svc_thread_should_stop(rqstp)) { - /* update sv_maxconn if it has changed */ - rqstp->rq_server->sv_maxconn = nlm_max_connections; - nlmsvc_retry_blocked(rqstp); svc_recv(rqstp); } @@ -340,7 +334,6 @@ static int lockd_get(void) return -ENOMEM; } - serv->sv_maxconn = nlm_max_connections; error = svc_set_num_threads(serv, NULL, 1); if (error < 0) { svc_destroy(&serv); @@ -542,7 +535,6 @@ module_param_call(nlm_udpport, param_set_port, param_get_int, module_param_call(nlm_tcpport, param_set_port, param_get_int, &nlm_tcpport, 0644); module_param(nsm_use_hostnames, bool, 0644); -module_param(nlm_max_connections, uint, 0644); static int lockd_init_net(struct net *net) { diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index a05a45bb1978..4a07b8d0837b 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -128,12 +128,6 @@ struct nfsd_net { seqlock_t writeverf_lock; unsigned char writeverf[8]; - /* - * Max number of non-validated connections this nfsd container - * will allow. Defaults to '0' gets mapped to 64. 
- */ - unsigned int max_connections; - u32 clientid_base; u32 clientid_counter; u32 clverifier_counter; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3adbc05ebaac..95ea4393305b 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -48,7 +48,6 @@ enum { NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, - NFSD_MaxConnections, NFSD_Filecache, NFSD_Leasetime, NFSD_Gracetime, @@ -68,7 +67,6 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); static ssize_t write_versions(struct file *file, char *buf, size_t size); static ssize_t write_ports(struct file *file, char *buf, size_t size); static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); -static ssize_t write_maxconn(struct file *file, char *buf, size_t size); #ifdef CONFIG_NFSD_V4 static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_gracetime(struct file *file, char *buf, size_t size); @@ -87,7 +85,6 @@ static ssize_t (*const write_op[])(struct file *, char *, size_t) = { [NFSD_Versions] = write_versions, [NFSD_Ports] = write_ports, [NFSD_MaxBlkSize] = write_maxblksize, - [NFSD_MaxConnections] = write_maxconn, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = write_leasetime, [NFSD_Gracetime] = write_gracetime, @@ -902,44 +899,6 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) nfsd_max_blksize); } -/* - * write_maxconn - Set or report the current max number of connections - * - * Input: - * buf: ignored - * size: zero - * OR - * - * Input: - * buf: C string containing an unsigned - * integer value representing the new - * number of max connections - * size: non-zero length of C string in @buf - * Output: - * On success: passed-in buffer filled with '\n'-terminated C string - * containing numeric value of max_connections setting - * for this net namespace; - * return code is the size in bytes of the string - * On error: return code is zero or a negative errno value - */ -static ssize_t write_maxconn(struct file *file, char *buf, size_t size) -{ - char *mesg = buf; - struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); - unsigned int maxconn = nn->max_connections; - - if (size > 0) { - int rv = get_uint(&mesg, &maxconn); - - if (rv) - return rv; - trace_nfsd_ctl_maxconn(netns(file), maxconn); - nn->max_connections = maxconn; - } - - return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn); -} - #ifdef CONFIG_NFSD_V4 static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time64_t *time, struct nfsd_net *nn) @@ -1372,7 +1331,6 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, - [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO}, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 49e2f32102ab..b77097de5936 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -668,7 +668,6 @@ int nfsd_create_serv(struct net *net) if (serv == NULL) return -ENOMEM; - serv->sv_maxconn = nn->max_connections; error = svc_bind(serv, net); if (error < 0) { svc_destroy(&serv); @@ -954,11 +953,7 @@ nfsd(void *vrqstp) * The main request loop */ while (!svc_thread_should_stop(rqstp)) { - /* 
Update sv_maxconn if it has changed */ - rqstp->rq_server->sv_maxconn = nn->max_connections; - svc_recv(rqstp); - nfsd_file_net_dispose(nn); } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4f9418cbf8c9..74658cca0f38 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -72,10 +72,6 @@ struct svc_serv { spinlock_t sv_lock; unsigned int sv_nprogs; /* Number of sv_programs */ unsigned int sv_nrthreads; /* # of server threads */ - unsigned int sv_maxconn; /* max connections allowed or - * '0' causing max to be based - * on number of threads. */ - unsigned int sv_max_payload; /* datagram payload size */ unsigned int sv_max_mesg; /* max_payload + 1 page for overheads */ unsigned int sv_xdrsize; /* XDR buffer size */ diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 7064ebbd550b..72be60952579 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -105,6 +105,12 @@ enum { */ }; +/* + * Maximum number of "tmp" connections - those without XPT_PEER_VALID - + * permitted on any service. + */ +#define XPT_MAX_TMP_CONN 64 + static inline void svc_xprt_set_valid(struct svc_xprt *xpt) { if (test_bit(XPT_TEMP, &xpt->xpt_flags) && diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index dbd96b295dfa..06779b4cdd0a 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -619,16 +619,10 @@ int svc_port_is_privileged(struct sockaddr *sin) * The only somewhat efficient mechanism would be if drop old * connections from the same IP first. But right now we don't even * record the client IP in svc_sock. - * - * single-threaded services that expect a lot of clients will probably - * need to set sv_maxconn to override the default value which is based - * on the number of threads */ static void svc_check_conn_limits(struct svc_serv *serv) { - unsigned int limit = serv->sv_maxconn ? serv->sv_maxconn : 64; - - if (serv->sv_tmpcnt > limit) { + if (serv->sv_tmpcnt > XPT_MAX_TMP_CONN) { struct svc_xprt *xprt = NULL, *xprti; spin_lock_bh(&serv->sv_lock); if (!list_empty(&serv->sv_tempsocks)) { -- cgit v1.2.3 From 0b6e14242630ad5f65839b23400bd67c5166e2b4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 12 Dec 2024 08:47:04 +1100 Subject: nfsd: use an xarray to store v4.1 session slots Using an xarray to store session slots will make it easier to change the number of active slots based on demand, and removes an unnecessary limit. To achieve good throughput with a high-latency server it can be helpful to have hundreds of concurrent writes, which means hundreds of slots. So increase the limit to 2048 (twice what the Linux client will currently use). This limit is only a sanity check, not a hard limit. 
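For readers less familiar with the xarray API, the access pattern the patch
below relies on looks roughly like this in isolation (illustrative only,
declarations of i and slot elided):

	struct xarray slots;

	xa_init(&slots);			/* empty, nothing preallocated */

	/* xa_store() returns an xa_err value only if its internal
	 * nodes could not be allocated */
	if (xa_is_err(xa_store(&slots, i, slot, GFP_KERNEL)))
		kfree(slot);

	slot = xa_load(&slots, seq->slotid);	/* NULL if never stored */
	xa_erase(&slots, i);			/* drop a single entry */
	xa_destroy(&slots);			/* free the xarray's own memory */

Unlike the old fixed se_slots[] array sized once at session creation, entries
can be added or removed at any index later, which is what makes growing (and
eventually shrinking) the slot table on demand straightforward.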
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 28 ++++++++++++++++++---------- fs/nfsd/state.h | 9 ++++++--- 2 files changed, 24 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b9d0a300d376..aabe5223f95e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1915,8 +1915,11 @@ free_session_slots(struct nfsd4_session *ses) int i; for (i = 0; i < ses->se_fchannel.maxreqs; i++) { - free_svc_cred(&ses->se_slots[i]->sl_cred); - kfree(ses->se_slots[i]); + struct nfsd4_slot *slot = xa_load(&ses->se_slots, i); + + xa_erase(&ses->se_slots, i); + free_svc_cred(&slot->sl_cred); + kfree(slot); } } @@ -1996,17 +1999,20 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, struct nfsd4_session *new; int i; - BUILD_BUG_ON(struct_size(new, se_slots, NFSD_MAX_SLOTS_PER_SESSION) - > PAGE_SIZE); - - new = kzalloc(struct_size(new, se_slots, numslots), GFP_KERNEL); + new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; + xa_init(&new->se_slots); /* allocate each struct nfsd4_slot and data cache in one piece */ for (i = 0; i < numslots; i++) { - new->se_slots[i] = kzalloc(slotsize, GFP_KERNEL); - if (!new->se_slots[i]) + struct nfsd4_slot *slot; + slot = kzalloc(slotsize, GFP_KERNEL); + if (!slot) goto out_free; + if (xa_is_err(xa_store(&new->se_slots, i, slot, GFP_KERNEL))) { + kfree(slot); + goto out_free; + } } memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); @@ -2017,7 +2023,8 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, return new; out_free: while (i--) - kfree(new->se_slots[i]); + kfree(xa_load(&new->se_slots, i)); + xa_destroy(&new->se_slots); kfree(new); return NULL; } @@ -2124,6 +2131,7 @@ static void nfsd4_del_conns(struct nfsd4_session *s) static void __free_session(struct nfsd4_session *ses) { free_session_slots(ses); + xa_destroy(&ses->se_slots); kfree(ses); } @@ -4278,7 +4286,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (seq->slotid >= session->se_fchannel.maxreqs) goto out_put_session; - slot = session->se_slots[seq->slotid]; + slot = xa_load(&session->se_slots, seq->slotid); dprintk("%s: slotid %d\n", __func__, seq->slotid); /* We do not negotiate the number of slots yet, so set the diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ba30b2335b66..e2ddb25b8dfc 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -227,8 +227,11 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) return container_of(s, struct nfs4_delegation, dl_stid); } -/* Maximum number of slots per session. 160 is useful for long haul TCP */ -#define NFSD_MAX_SLOTS_PER_SESSION 160 +/* Maximum number of slots per session. This is for sanity-check only. + * It could be increased if we had a mechanism to shutdown misbehaving clients. + * A large number can be needed to get good throughput on high-latency servers. 
+ */ +#define NFSD_MAX_SLOTS_PER_SESSION 2048 /* Maximum session per slot cache size */ #define NFSD_SLOT_CACHE_SIZE 2048 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ @@ -327,7 +330,7 @@ struct nfsd4_session { struct nfsd4_cb_sec se_cb_sec; struct list_head se_conns; u32 se_cb_seq_nr[NFSD_BC_SLOT_TABLE_SIZE]; - struct nfsd4_slot *se_slots[]; /* forward channel slots */ + struct xarray se_slots; /* forward channel slots */ }; /* formatted contents of nfs4_sessionid */ -- cgit v1.2.3 From b5fba969a2e445e2f36b699582d8410478a99374 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 12 Dec 2024 08:47:05 +1100 Subject: nfsd: remove artificial limits on the session-based DRC Rather than guessing how much space it might be safe to use for the DRC, simply try allocating slots and be prepared to accept failure. The first slot for each session is allocated with GFP_KERNEL which is unlikely to fail. Subsequent slots are allocated with the addition of __GFP_NORETRY which is expected to fail if there isn't much free memory. This is probably too aggressive but clears the way for adding a shrinker interface to free extra slots when memory is tight. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 94 +++++++++-------------------------------------------- fs/nfsd/nfsd.h | 3 -- fs/nfsd/nfssvc.c | 32 ------------------ 3 files changed, 16 insertions(+), 113 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index aabe5223f95e..84d364975ae1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1938,65 +1938,13 @@ static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) return size + sizeof(struct nfsd4_slot); } -/* - * XXX: If we run out of reserved DRC memory we could (up to a point) - * re-negotiate active sessions and reduce their slot usage to make - * room for new connections. For now we just fail the create session. - */ -static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn) -{ - u32 slotsize = slot_bytes(ca); - u32 num = ca->maxreqs; - unsigned long avail, total_avail; - unsigned int scale_factor; - - spin_lock(&nfsd_drc_lock); - if (nfsd_drc_max_mem > nfsd_drc_mem_used) - total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; - else - /* We have handed out more space than we chose in - * set_max_drc() to allow. That isn't really a - * problem as long as that doesn't make us think we - * have lots more due to integer overflow. - */ - total_avail = 0; - avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); - /* - * Never use more than a fraction of the remaining memory, - * unless it's the only way to give this client a slot. - * The chosen fraction is either 1/8 or 1/number of threads, - * whichever is smaller. This ensures there are adequate - * slots to support multiple clients per thread. - * Give the client one slot even if that would require - * over-allocation--it is better than failure. 
- */ - scale_factor = max_t(unsigned int, 8, nn->nfsd_serv->sv_nrthreads); - - avail = clamp_t(unsigned long, avail, slotsize, - total_avail/scale_factor); - num = min_t(int, num, avail / slotsize); - num = max_t(int, num, 1); - nfsd_drc_mem_used += num * slotsize; - spin_unlock(&nfsd_drc_lock); - - return num; -} - -static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca) -{ - int slotsize = slot_bytes(ca); - - spin_lock(&nfsd_drc_lock); - nfsd_drc_mem_used -= slotsize * ca->maxreqs; - spin_unlock(&nfsd_drc_lock); -} - static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, struct nfsd4_channel_attrs *battrs) { int numslots = fattrs->maxreqs; int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; + struct nfsd4_slot *slot; int i; new = kzalloc(sizeof(*new), GFP_KERNEL); @@ -2004,17 +1952,21 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, return NULL; xa_init(&new->se_slots); /* allocate each struct nfsd4_slot and data cache in one piece */ - for (i = 0; i < numslots; i++) { - struct nfsd4_slot *slot; - slot = kzalloc(slotsize, GFP_KERNEL); + slot = kzalloc(slotsize, GFP_KERNEL); + if (!slot || xa_is_err(xa_store(&new->se_slots, 0, slot, GFP_KERNEL))) + goto out_free; + + for (i = 1; i < numslots; i++) { + const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; + slot = kzalloc(slotsize, gfp); if (!slot) - goto out_free; - if (xa_is_err(xa_store(&new->se_slots, i, slot, GFP_KERNEL))) { + break; + if (xa_is_err(xa_store(&new->se_slots, i, slot, gfp))) { kfree(slot); - goto out_free; + break; } } - + fattrs->maxreqs = i; memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); new->se_cb_slot_avail = ~0U; new->se_cb_highest_slot = min(battrs->maxreqs - 1, @@ -2022,8 +1974,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, spin_lock_init(&new->se_lock); return new; out_free: - while (i--) - kfree(xa_load(&new->se_slots, i)); + kfree(slot); xa_destroy(&new->se_slots); kfree(new); return NULL; @@ -2138,7 +2089,6 @@ static void __free_session(struct nfsd4_session *ses) static void free_session(struct nfsd4_session *ses) { nfsd4_del_conns(ses); - nfsd4_put_drc_mem(&ses->se_fchannel); __free_session(ses); } @@ -3786,17 +3736,6 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs ca->maxresp_cached = min_t(u32, ca->maxresp_cached, NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ); ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION); - /* - * Note decreasing slot size below client's request may make it - * difficult for client to function correctly, whereas - * decreasing the number of slots will (just?) affect - * performance. When short on memory we therefore prefer to - * decrease number of slots instead of their size. Clients that - * request larger slots than they need will get poor results: - * Note that we always allow at least one slot, because our - * accounting is soft and provides no guarantees either way. 
- */ - ca->maxreqs = nfsd4_get_drc_mem(ca, nn); return nfs_ok; } @@ -3874,11 +3813,11 @@ nfsd4_create_session(struct svc_rqst *rqstp, return status; status = check_backchannel_attrs(&cr_ses->back_channel); if (status) - goto out_release_drc_mem; + goto out_err; status = nfserr_jukebox; new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel); if (!new) - goto out_release_drc_mem; + goto out_err; conn = alloc_conn_from_crses(rqstp, cr_ses); if (!conn) goto out_free_session; @@ -3987,8 +3926,7 @@ out_free_conn: free_conn(conn); out_free_session: __free_session(new); -out_release_drc_mem: - nfsd4_put_drc_mem(&cr_ses->fore_channel); +out_err: return status; } diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 4b56ba1e8e48..3eb21e63b921 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -88,9 +88,6 @@ struct nfsd_genl_rqstp { extern struct svc_program nfsd_programs[]; extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; extern struct mutex nfsd_mutex; -extern spinlock_t nfsd_drc_lock; -extern unsigned long nfsd_drc_max_mem; -extern unsigned long nfsd_drc_mem_used; extern atomic_t nfsd_th_cnt; /* number of available threads */ extern const struct seq_operations nfs_exports_op; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b77097de5936..3f5104ed70bf 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -70,16 +70,6 @@ static __be32 nfsd_init_request(struct svc_rqst *, */ DEFINE_MUTEX(nfsd_mutex); -/* - * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. - * nfsd_drc_max_pages limits the total amount of memory available for - * version 4.1 DRC caches. - * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. - */ -DEFINE_SPINLOCK(nfsd_drc_lock); -unsigned long nfsd_drc_max_mem; -unsigned long nfsd_drc_mem_used; - #if IS_ENABLED(CONFIG_NFS_LOCALIO) static const struct svc_version *localio_versions[] = { [1] = &localio_version1, @@ -575,27 +565,6 @@ void nfsd_reset_versions(struct nfsd_net *nn) } } -/* - * Each session guarantees a negotiated per slot memory cache for replies - * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated - * NFSv4.1 server might want to use more memory for a DRC than a machine - * with mutiple services. - * - * Impose a hard limit on the number of pages for the DRC which varies - * according to the machines free pages. This is of course only a default. - * - * For now this is a #defined shift which could be under admin control - * in the future. - */ -static void set_max_drc(void) -{ - #define NFSD_DRC_SIZE_SHIFT 7 - nfsd_drc_max_mem = (nr_free_buffer_pages() - >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; - nfsd_drc_mem_used = 0; - dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem); -} - static int nfsd_get_default_max_blksize(void) { struct sysinfo i; @@ -677,7 +646,6 @@ int nfsd_create_serv(struct net *net) nn->nfsd_serv = serv; spin_unlock(&nfsd_notifier_lock); - set_max_drc(); /* check if the notifier is already set */ if (atomic_inc_return(&nfsd_notifier_refcount) == 1) { register_inetaddr_notifier(&nfsd_inetaddr_notifier); -- cgit v1.2.3 From 601c8cb349c2d9a3a6cea6f53e0bf838e2e60893 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 12 Dec 2024 08:47:06 +1100 Subject: nfsd: add session slot count to /proc/fs/nfsd/clients/*/info Each client now reports the number of slots allocated in each session. 
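Concretely, the growth step in the hunk below is s += DIV_ROUND_UP(s, 5) per
trigger, so a session that starts with 8 slots and keeps touching its highest
slot would grow roughly

	8 -> 10 -> 12 -> 15 -> 18 -> 22 -> 27 -> ...

with growth stopping as soon as the top slot goes unused or a GFP_NOWAIT
allocation fails, leaving the table settling a little above the client's
actual concurrency.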
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 84d364975ae1..bdfe791b02f1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2643,6 +2643,7 @@ static const char *cb_state2str(int state) static int client_info_show(struct seq_file *m, void *v) { struct inode *inode = file_inode(m->file); + struct nfsd4_session *ses; struct nfs4_client *clp; u64 clid; @@ -2679,6 +2680,13 @@ static int client_info_show(struct seq_file *m, void *v) seq_printf(m, "callback address: \"%pISpc\"\n", &clp->cl_cb_conn.cb_addr); seq_printf(m, "admin-revoked states: %d\n", atomic_read(&clp->cl_admin_revoked)); + spin_lock(&clp->cl_lock); + seq_printf(m, "session slots:"); + list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) + seq_printf(m, " %u", ses->se_fchannel.maxreqs); + spin_unlock(&clp->cl_lock); + seq_puts(m, "\n"); + drop_client(clp); return 0; -- cgit v1.2.3 From 60aa6564317db29ea72b6db821b5bbb45f1e879d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 12 Dec 2024 08:47:07 +1100 Subject: nfsd: allocate new session-based DRC slots on demand. If a client ever uses the highest available slot for a given session, attempt to allocate more slots so there is room for the client to use them if wanted. GFP_NOWAIT is used so if there is not plenty of free memory, failure is expected - which is what we want. It also allows the allocation while holding a spinlock. Each time we increase the number of slots by 20% (rounded up). This allows fairly quick growth while avoiding excessive over-shoot. We would expect to stablise with around 10% more slots available than the client actually uses. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bdfe791b02f1..fcc0153b6b90 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4235,11 +4235,6 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, slot = xa_load(&session->se_slots, seq->slotid); dprintk("%s: slotid %d\n", __func__, seq->slotid); - /* We do not negotiate the number of slots yet, so set the - * maxslots to the session maxreqs which is used to encode - * sr_highest_slotid and the sr_target_slot id to maxslots */ - seq->maxslots = session->se_fchannel.maxreqs; - trace_nfsd_slot_seqid_sequence(clp, seq, slot); status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_flags & NFSD4_SLOT_INUSE); @@ -4289,6 +4284,38 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->session = session; cstate->clp = clp; + /* + * If the client ever uses the highest available slot, + * gently try to allocate another 20%. This allows + * fairly quick growth without grossly over-shooting what + * the client might use. + */ + if (seq->slotid == session->se_fchannel.maxreqs - 1 && + session->se_fchannel.maxreqs < NFSD_MAX_SLOTS_PER_SESSION) { + int s = session->se_fchannel.maxreqs; + int cnt = DIV_ROUND_UP(s, 5); + + do { + /* + * GFP_NOWAIT both allows allocation under a + * spinlock, and only succeeds if there is + * plenty of memory. 
+ */ + slot = kzalloc(slot_bytes(&session->se_fchannel), + GFP_NOWAIT); + if (slot && + !xa_is_err(xa_store(&session->se_slots, s, slot, + GFP_NOWAIT))) { + s += 1; + session->se_fchannel.maxreqs = s; + } else { + kfree(slot); + slot = NULL; + } + } while (slot && --cnt > 0); + } + seq->maxslots = session->se_fchannel.maxreqs; + out: switch (clp->cl_cb_state) { case NFSD4_CB_DOWN: -- cgit v1.2.3 From fc8738c68d0bbf5033dd98b4f63d277ecbd49fd7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 12 Dec 2024 08:47:08 +1100 Subject: nfsd: add support for freeing unused session-DRC slots Reducing the number of slots in the session slot table requires confirmation from the client. This patch adds reduce_session_slots() which starts the process of getting confirmation, but never calls it. That will come in a later patch. Before we can free a slot we need to confirm that the client won't try to use it again. This involves returning a lower cr_maxrequests in a SEQUENCE reply and then seeing a ca_maxrequests on the same slot which is not larger than we limit we are trying to impose. So for each slot we need to remember that we have sent a reduced cr_maxrequests. To achieve this we introduce a concept of request "generations". Each time we decide to reduce cr_maxrequests we increment the generation number, and record this when we return the lower cr_maxrequests to the client. When a slot with the current generation reports a low ca_maxrequests, we commit to that level and free extra slots. We use an 16 bit generation number (64 seems wasteful) and if it cycles we iterate all slots and reset the generation number to avoid false matches. When we free a slot we store the seqid in the slot pointer so that it can be restored when we reactivate the slot. The RFC can be read as suggesting that the slot number could restart from one after a slot is retired and reactivated, but also suggests that retiring slots is not required. So when we reactive a slot we accept with the next seqid in sequence, or 1. When decoding sa_highest_slotid into maxslots we need to add 1 - this matches how it is encoded for the reply. se_dead is moved in struct nfsd4_session to remove a hole. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++------ fs/nfsd/nfs4xdr.c | 5 +-- fs/nfsd/state.h | 6 +++- fs/nfsd/xdr4.h | 2 -- 4 files changed, 92 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fcc0153b6b90..d29737018f04 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1910,17 +1910,69 @@ gen_sessionid(struct nfsd4_session *ses) #define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) static void -free_session_slots(struct nfsd4_session *ses) +free_session_slots(struct nfsd4_session *ses, int from) { int i; - for (i = 0; i < ses->se_fchannel.maxreqs; i++) { + if (from >= ses->se_fchannel.maxreqs) + return; + + for (i = from; i < ses->se_fchannel.maxreqs; i++) { struct nfsd4_slot *slot = xa_load(&ses->se_slots, i); - xa_erase(&ses->se_slots, i); + /* + * Save the seqid in case we reactivate this slot. 
+ * This will never require a memory allocation so GFP + * flag is irrelevant + */ + xa_store(&ses->se_slots, i, xa_mk_value(slot->sl_seqid), 0); free_svc_cred(&slot->sl_cred); kfree(slot); } + ses->se_fchannel.maxreqs = from; + if (ses->se_target_maxslots > from) + ses->se_target_maxslots = from; +} + +/** + * reduce_session_slots - reduce the target max-slots of a session if possible + * @ses: The session to affect + * @dec: how much to decrease the target by + * + * This interface can be used by a shrinker to reduce the target max-slots + * for a session so that some slots can eventually be freed. + * It uses spin_trylock() as it may be called in a context where another + * spinlock is held that has a dependency on client_lock. As shrinkers are + * best-effort, skiping a session is client_lock is already held has no + * great coast + * + * Return value: + * The number of slots that the target was reduced by. + */ +static int __maybe_unused +reduce_session_slots(struct nfsd4_session *ses, int dec) +{ + struct nfsd_net *nn = net_generic(ses->se_client->net, + nfsd_net_id); + int ret = 0; + + if (ses->se_target_maxslots <= 1) + return ret; + if (!spin_trylock(&nn->client_lock)) + return ret; + ret = min(dec, ses->se_target_maxslots-1); + ses->se_target_maxslots -= ret; + ses->se_slot_gen += 1; + if (ses->se_slot_gen == 0) { + int i; + ses->se_slot_gen = 1; + for (i = 0; i < ses->se_fchannel.maxreqs; i++) { + struct nfsd4_slot *slot = xa_load(&ses->se_slots, i); + slot->sl_generation = 0; + } + } + spin_unlock(&nn->client_lock); + return ret; } /* @@ -1968,6 +2020,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, } fattrs->maxreqs = i; memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); + new->se_target_maxslots = i; new->se_cb_slot_avail = ~0U; new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_SIZE - 1); @@ -2081,7 +2134,7 @@ static void nfsd4_del_conns(struct nfsd4_session *s) static void __free_session(struct nfsd4_session *ses) { - free_session_slots(ses); + free_session_slots(ses, 0); xa_destroy(&ses->se_slots); kfree(ses); } @@ -2684,6 +2737,9 @@ static int client_info_show(struct seq_file *m, void *v) seq_printf(m, "session slots:"); list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) seq_printf(m, " %u", ses->se_fchannel.maxreqs); + seq_printf(m, "\nsession target slots:"); + list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) + seq_printf(m, " %u", ses->se_target_maxslots); spin_unlock(&clp->cl_lock); seq_puts(m, "\n"); @@ -3674,10 +3730,10 @@ nfsd4_exchange_id_release(union nfsd4_op_u *u) kfree(exid->server_impl_name); } -static __be32 check_slot_seqid(u32 seqid, u32 slot_seqid, bool slot_inuse) +static __be32 check_slot_seqid(u32 seqid, u32 slot_seqid, u8 flags) { /* The slot is in use, and no response has been sent. 
*/ - if (slot_inuse) { + if (flags & NFSD4_SLOT_INUSE) { if (seqid == slot_seqid) return nfserr_jukebox; else @@ -3686,6 +3742,8 @@ static __be32 check_slot_seqid(u32 seqid, u32 slot_seqid, bool slot_inuse) /* Note unsigned 32-bit arithmetic handles wraparound: */ if (likely(seqid == slot_seqid + 1)) return nfs_ok; + if ((flags & NFSD4_SLOT_REUSED) && seqid == 1) + return nfs_ok; if (seqid == slot_seqid) return nfserr_replay_cache; return nfserr_seq_misordered; @@ -4236,8 +4294,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("%s: slotid %d\n", __func__, seq->slotid); trace_nfsd_slot_seqid_sequence(clp, seq, slot); - status = check_slot_seqid(seq->seqid, slot->sl_seqid, - slot->sl_flags & NFSD4_SLOT_INUSE); + status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_flags); if (status == nfserr_replay_cache) { status = nfserr_seq_misordered; if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED)) @@ -4262,6 +4319,12 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out_put_session; + if (session->se_target_maxslots < session->se_fchannel.maxreqs && + slot->sl_generation == session->se_slot_gen && + seq->maxslots <= session->se_target_maxslots) + /* Client acknowledged our reduce maxreqs */ + free_session_slots(session, session->se_target_maxslots); + buflen = (seq->cachethis) ? session->se_fchannel.maxresp_cached : session->se_fchannel.maxresp_sz; @@ -4272,9 +4335,11 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, svc_reserve(rqstp, buflen); status = nfs_ok; - /* Success! bump slot seqid */ + /* Success! accept new slot seqid */ slot->sl_seqid = seq->seqid; + slot->sl_flags &= ~NFSD4_SLOT_REUSED; slot->sl_flags |= NFSD4_SLOT_INUSE; + slot->sl_generation = session->se_slot_gen; if (seq->cachethis) slot->sl_flags |= NFSD4_SLOT_CACHETHIS; else @@ -4291,9 +4356,11 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * the client might use. */ if (seq->slotid == session->se_fchannel.maxreqs - 1 && + session->se_target_maxslots >= session->se_fchannel.maxreqs && session->se_fchannel.maxreqs < NFSD_MAX_SLOTS_PER_SESSION) { int s = session->se_fchannel.maxreqs; int cnt = DIV_ROUND_UP(s, 5); + void *prev_slot; do { /* @@ -4303,18 +4370,25 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ slot = kzalloc(slot_bytes(&session->se_fchannel), GFP_NOWAIT); + prev_slot = xa_load(&session->se_slots, s); + if (xa_is_value(prev_slot) && slot) { + slot->sl_seqid = xa_to_value(prev_slot); + slot->sl_flags |= NFSD4_SLOT_REUSED; + } if (slot && !xa_is_err(xa_store(&session->se_slots, s, slot, GFP_NOWAIT))) { s += 1; session->se_fchannel.maxreqs = s; + session->se_target_maxslots = s; } else { kfree(slot); slot = NULL; } } while (slot && --cnt > 0); } - seq->maxslots = session->se_fchannel.maxreqs; + seq->maxslots = max(session->se_target_maxslots, seq->maxslots); + seq->target_maxslots = session->se_target_maxslots; out: switch (clp->cl_cb_state) { diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 53fac037611c..4dcb03cd9292 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1884,7 +1884,8 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, return nfserr_bad_xdr; seq->seqid = be32_to_cpup(p++); seq->slotid = be32_to_cpup(p++); - seq->maxslots = be32_to_cpup(p++); + /* sa_highest_slotid counts from 0 but maxslots counts from 1 ... 
*/ + seq->maxslots = be32_to_cpup(p++) + 1; seq->cachethis = be32_to_cpup(p); seq->status_flags = 0; @@ -4968,7 +4969,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, if (nfserr != nfs_ok) return nfserr; /* sr_target_highest_slotid */ - nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1); + nfserr = nfsd4_encode_slotid4(xdr, seq->target_maxslots - 1); if (nfserr != nfs_ok) return nfserr; /* sr_status_flags */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e2ddb25b8dfc..2b0e6148a87a 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -245,10 +245,12 @@ struct nfsd4_slot { struct svc_cred sl_cred; u32 sl_datalen; u16 sl_opcnt; + u16 sl_generation; #define NFSD4_SLOT_INUSE (1 << 0) #define NFSD4_SLOT_CACHETHIS (1 << 1) #define NFSD4_SLOT_INITIALIZED (1 << 2) #define NFSD4_SLOT_CACHED (1 << 3) +#define NFSD4_SLOT_REUSED (1 << 4) u8 sl_flags; char sl_data[]; }; @@ -321,7 +323,6 @@ struct nfsd4_session { u32 se_cb_slot_avail; /* bitmap of available slots */ u32 se_cb_highest_slot; /* highest slot client wants */ u32 se_cb_prog; - bool se_dead; struct list_head se_hash; /* hash by sessionid */ struct list_head se_perclnt; struct nfs4_client *se_client; @@ -331,6 +332,9 @@ struct nfsd4_session { struct list_head se_conns; u32 se_cb_seq_nr[NFSD_BC_SLOT_TABLE_SIZE]; struct xarray se_slots; /* forward channel slots */ + u16 se_slot_gen; + bool se_dead; + u32 se_target_maxslots; }; /* formatted contents of nfs4_sessionid */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 382cc1389396..c26ba86dbdfd 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -576,9 +576,7 @@ struct nfsd4_sequence { u32 slotid; /* request/response */ u32 maxslots; /* request/response */ u32 cachethis; /* request */ -#if 0 u32 target_maxslots; /* response */ -#endif /* not yet */ u32 status_flags; /* response */ }; -- cgit v1.2.3 From 35e34642b5996df91e2879d59f726df6072c82f9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 12 Dec 2024 08:47:09 +1100 Subject: nfsd: add shrinker to reduce number of slots allocated per session Add a shrinker which frees unused slots and may ask the clients to use fewer slots on each session. We keep a global count of the number of freeable slots, which is the sum of one less than the current "target" slots in all sessions in all clients in all net-namespaces. This number is reported by the shrinker. When the shrinker is asked to free some, we call xxx on each session in a round-robin asking each to reduce the slot count by 1. This will reduce the "target" so the number reported by the shrinker will reduce immediately. The memory will only be freed later when the client confirmed that it is no longer needed. We use a global list of sessions and move the "head" to after the last session that we asked to reduce, so the next callback from the shrinker will move on to the next session. This pressure should be applied "evenly" across all sessions over time. 
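As a worked example (numbers hypothetical): with three sessions whose target slot counts are 8, 4 and 1, the count callback reports (8-1) + (4-1) + (1-1) = 10 freeable slots, since slot 0 of a session can never be freed while the session is active. The scan callback then walks the global session list and asks each session in turn to give up one slot via reduce_session_slots(), moving the list head past the last session visited so the next scan continues from there.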
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++---- fs/nfsd/state.h | 1 + 2 files changed, 73 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d29737018f04..b38b3a1c0307 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1909,6 +1909,16 @@ gen_sessionid(struct nfsd4_session *ses) */ #define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) +static struct shrinker *nfsd_slot_shrinker; +static DEFINE_SPINLOCK(nfsd_session_list_lock); +static LIST_HEAD(nfsd_session_list); +/* The sum of "target_slots-1" on every session. The shrinker can push this + * down, though it can take a little while for the memory to actually + * be freed. The "-1" is because we can never free slot 0 while the + * session is active. + */ +static atomic_t nfsd_total_target_slots = ATOMIC_INIT(0); + static void free_session_slots(struct nfsd4_session *ses, int from) { @@ -1930,8 +1940,11 @@ free_session_slots(struct nfsd4_session *ses, int from) kfree(slot); } ses->se_fchannel.maxreqs = from; - if (ses->se_target_maxslots > from) - ses->se_target_maxslots = from; + if (ses->se_target_maxslots > from) { + int new_target = from ?: 1; + atomic_sub(ses->se_target_maxslots - new_target, &nfsd_total_target_slots); + ses->se_target_maxslots = new_target; + } } /** @@ -1949,7 +1962,7 @@ free_session_slots(struct nfsd4_session *ses, int from) * Return value: * The number of slots that the target was reduced by. */ -static int __maybe_unused +static int reduce_session_slots(struct nfsd4_session *ses, int dec) { struct nfsd_net *nn = net_generic(ses->se_client->net, @@ -1962,6 +1975,7 @@ reduce_session_slots(struct nfsd4_session *ses, int dec) return ret; ret = min(dec, ses->se_target_maxslots-1); ses->se_target_maxslots -= ret; + atomic_sub(ret, &nfsd_total_target_slots); ses->se_slot_gen += 1; if (ses->se_slot_gen == 0) { int i; @@ -2021,6 +2035,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, fattrs->maxreqs = i; memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); new->se_target_maxslots = i; + atomic_add(i - 1, &nfsd_total_target_slots); new->se_cb_slot_avail = ~0U; new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_SIZE - 1); @@ -2145,6 +2160,36 @@ static void free_session(struct nfsd4_session *ses) __free_session(ses); } +static unsigned long +nfsd_slot_count(struct shrinker *s, struct shrink_control *sc) +{ + unsigned long cnt = atomic_read(&nfsd_total_target_slots); + + return cnt ? 
cnt : SHRINK_EMPTY; +} + +static unsigned long +nfsd_slot_scan(struct shrinker *s, struct shrink_control *sc) +{ + struct nfsd4_session *ses; + unsigned long scanned = 0; + unsigned long freed = 0; + + spin_lock(&nfsd_session_list_lock); + list_for_each_entry(ses, &nfsd_session_list, se_all_sessions) { + freed += reduce_session_slots(ses, 1); + scanned += 1; + if (scanned >= sc->nr_to_scan) { + /* Move starting point for next scan */ + list_move(&nfsd_session_list, &ses->se_all_sessions); + break; + } + } + spin_unlock(&nfsd_session_list_lock); + sc->nr_scanned = scanned; + return freed; +} + static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) { int idx; @@ -2169,6 +2214,10 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); + spin_lock(&nfsd_session_list_lock); + list_add_tail(&new->se_all_sessions, &nfsd_session_list); + spin_unlock(&nfsd_session_list_lock); + { struct sockaddr *sa = svc_addr(rqstp); /* @@ -2238,6 +2287,9 @@ unhash_session(struct nfsd4_session *ses) spin_lock(&ses->se_client->cl_lock); list_del(&ses->se_perclnt); spin_unlock(&ses->se_client->cl_lock); + spin_lock(&nfsd_session_list_lock); + list_del(&ses->se_all_sessions); + spin_unlock(&nfsd_session_list_lock); } /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ @@ -2373,8 +2425,12 @@ unhash_client_locked(struct nfs4_client *clp) } list_del_init(&clp->cl_lru); spin_lock(&clp->cl_lock); - list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) + spin_lock(&nfsd_session_list_lock); + list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) { list_del_init(&ses->se_hash); + list_del_init(&ses->se_all_sessions); + } + spin_unlock(&nfsd_session_list_lock); spin_unlock(&clp->cl_lock); } @@ -4380,6 +4436,8 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, GFP_NOWAIT))) { s += 1; session->se_fchannel.maxreqs = s; + atomic_add(s - session->se_target_maxslots, + &nfsd_total_target_slots); session->se_target_maxslots = s; } else { kfree(slot); @@ -8770,7 +8828,6 @@ skip_grace: } /* initialization to perform when the nfsd service is started: */ - int nfs4_state_start(void) { @@ -8780,6 +8837,15 @@ nfs4_state_start(void) if (ret) return ret; + nfsd_slot_shrinker = shrinker_alloc(0, "nfsd-DRC-slot"); + if (!nfsd_slot_shrinker) { + rhltable_destroy(&nfs4_file_rhltable); + return -ENOMEM; + } + nfsd_slot_shrinker->count_objects = nfsd_slot_count; + nfsd_slot_shrinker->scan_objects = nfsd_slot_scan; + shrinker_register(nfsd_slot_shrinker); + set_max_delegations(); return 0; } @@ -8821,6 +8887,7 @@ void nfs4_state_shutdown(void) { rhltable_destroy(&nfs4_file_rhltable); + shrinker_free(nfsd_slot_shrinker); } static void diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 2b0e6148a87a..b31a8523c8e5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -325,6 +325,7 @@ struct nfsd4_session { u32 se_cb_prog; struct list_head se_hash; /* hash by sessionid */ struct list_head se_perclnt; + struct list_head se_all_sessions;/* global list of sessions */ struct nfs4_client *se_client; struct nfs4_sessionid se_sessionid; struct nfsd4_channel_attrs se_fchannel; -- cgit v1.2.3 From 1b3e26a5ccbfc2f85bda1930cc278e313165e353 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 19 Dec 2024 15:12:04 -0500 Subject: NFSD: fix decoding in nfs4_xdr_dec_cb_getattr If a client were to send an error to a CB_GETATTR call, the code 
erroneously continues trying to decode past the error code. It ends up returning a BAD_XDR error to the rpc layer, which in turn triggers a WARN_ONCE in the nfsd4_cb_done() function. Fixes: 6487a13b5c6b ("NFSD: add support for CB_GETATTR callback") Signed-off-by: Olga Kornievskaia Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 94479483c3d6..151de0285d22 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -647,7 +647,7 @@ static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, return status; status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status); - if (status) + if (unlikely(status || cb->cb_seq_status)) return status; if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0) return -NFSERR_BAD_XDR; -- cgit v1.2.3 From cb80ecf75ac38df30cea1163563391ef7e76a24e Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 19 Dec 2024 16:57:48 -0500 Subject: NFSD: add cb opcode to WARN_ONCE on failed callback It helps to know what kind of callback triggered the WARN_ONCE in the nfsd4_cb_done() function when diagnosing what can set the uncommon state where both cb_status and tk_status are set at the same time. Signed-off-by: Olga Kornievskaia Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 151de0285d22..a8e33f561bb3 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1396,8 +1396,9 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) return; if (cb->cb_status) { - WARN_ONCE(task->tk_status, "cb_status=%d tk_status=%d", - cb->cb_status, task->tk_status); + WARN_ONCE(task->tk_status, + "cb_status=%d tk_status=%d cb_opcode=%d", + cb->cb_status, task->tk_status, cb->cb_ops->opcode); task->tk_status = cb->cb_status; } -- cgit v1.2.3 From c224edca7af028828e2ad866b61d731b5e72b46d Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Wed, 25 Dec 2024 14:59:06 +0800 Subject: nfsd: no need get cache ref when protected by rcu rcu_read_lock/rcu_read_unlock already provides protection for the pointer we reference when we call e_show. Therefore, there is no need to obtain a cache reference to help protect cache_head. Additionally, the .put callbacks such as expkey_put/svc_export_put will invoke dput, which can sleep and break RCU. Stop taking the cache reference to fix them all.
Fixes: ae74136b4bb6 ("SUNRPC: Allow cache lookups to use RCU protection rather than the r/w spinlock") Suggested-by: NeilBrown Signed-off-by: Yang Erkun Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index aa4712362b3b..c6168bccfb6c 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1425,13 +1425,9 @@ static int e_show(struct seq_file *m, void *p) return 0; } - if (!cache_get_rcu(&exp->h)) + if (cache_check_rcu(cd, &exp->h, NULL)) return 0; - if (cache_check(cd, &exp->h, NULL)) - return 0; - - exp_put(exp); return svc_export_show(m, cd, cp); } -- cgit v1.2.3 From 2530766492ec7726582bcde44575ec3ff7487cd2 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Wed, 25 Dec 2024 14:59:08 +0800 Subject: nfsd: fix UAF when access ex_uuid or ex_stats We can access exp->ex_stats or exp->ex_uuid in rcu context(c_show and e_show). All these resources should be released using kfree_rcu. Fix this by using call_rcu, clean them all after a rcu grace period. ================================================================== BUG: KASAN: slab-use-after-free in svc_export_show+0x362/0x430 [nfsd] Read of size 1 at addr ff11000010fdc120 by task cat/870 CPU: 1 UID: 0 PID: 870 Comm: cat Not tainted 6.12.0-rc3+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014 Call Trace: dump_stack_lvl+0x53/0x70 print_address_description.constprop.0+0x2c/0x3a0 print_report+0xb9/0x280 kasan_report+0xae/0xe0 svc_export_show+0x362/0x430 [nfsd] c_show+0x161/0x390 [sunrpc] seq_read_iter+0x589/0x770 seq_read+0x1e5/0x270 proc_reg_read+0xe1/0x140 vfs_read+0x125/0x530 ksys_read+0xc1/0x160 do_syscall_64+0x5f/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e Allocated by task 830: kasan_save_stack+0x20/0x40 kasan_save_track+0x14/0x30 __kasan_kmalloc+0x8f/0xa0 __kmalloc_node_track_caller_noprof+0x1bc/0x400 kmemdup_noprof+0x22/0x50 svc_export_parse+0x8a9/0xb80 [nfsd] cache_do_downcall+0x71/0xa0 [sunrpc] cache_write_procfs+0x8e/0xd0 [sunrpc] proc_reg_write+0xe1/0x140 vfs_write+0x1a5/0x6d0 ksys_write+0xc1/0x160 do_syscall_64+0x5f/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e Freed by task 868: kasan_save_stack+0x20/0x40 kasan_save_track+0x14/0x30 kasan_save_free_info+0x3b/0x60 __kasan_slab_free+0x37/0x50 kfree+0xf3/0x3e0 svc_export_put+0x87/0xb0 [nfsd] cache_purge+0x17f/0x1f0 [sunrpc] nfsd_destroy_serv+0x226/0x2d0 [nfsd] nfsd_svc+0x125/0x1e0 [nfsd] write_threads+0x16a/0x2a0 [nfsd] nfsctl_transaction_write+0x74/0xa0 [nfsd] vfs_write+0x1a5/0x6d0 ksys_write+0xc1/0x160 do_syscall_64+0x5f/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: ae74136b4bb6 ("SUNRPC: Allow cache lookups to use RCU protection rather than the r/w spinlock") Signed-off-by: Yang Erkun Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c6168bccfb6c..0363720280d4 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -355,16 +355,25 @@ static void export_stats_destroy(struct export_stats *stats) EXP_STATS_COUNTERS_NUM); } -static void svc_export_put(struct kref *ref) +static void svc_export_release(struct rcu_head *rcu_head) { - struct svc_export *exp = container_of(ref, struct svc_export, h.ref); - path_put(&exp->ex_path); - auth_domain_put(exp->ex_client); + struct svc_export *exp = 
container_of(rcu_head, struct svc_export, + ex_rcu); + nfsd4_fslocs_free(&exp->ex_fslocs); export_stats_destroy(exp->ex_stats); kfree(exp->ex_stats); kfree(exp->ex_uuid); - kfree_rcu(exp, ex_rcu); + kfree(exp); +} + +static void svc_export_put(struct kref *ref) +{ + struct svc_export *exp = container_of(ref, struct svc_export, h.ref); + + path_put(&exp->ex_path); + auth_domain_put(exp->ex_client); + call_rcu(&exp->ex_rcu, svc_export_release); } static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) -- cgit v1.2.3 From 07aeefae7ff44d80524375253980b1bdee2396b0 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 5 Jan 2025 17:24:03 +0100 Subject: ovl: pass realinode to ovl_encode_real_fh() instead of realdentry We want to be able to encode an fid from an inode with no alias. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250105162404.357058-2-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/overlayfs/copy_up.c | 11 ++++++----- fs/overlayfs/export.c | 5 +++-- fs/overlayfs/namei.c | 4 ++-- fs/overlayfs/overlayfs.h | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 56eee9f23ea9..0c28e5fa3407 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -415,13 +415,13 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry, return err; } -struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, +struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode, bool is_upper) { struct ovl_fh *fh; int fh_type, dwords; int buflen = MAX_HANDLE_SZ; - uuid_t *uuid = &real->d_sb->s_uuid; + uuid_t *uuid = &realinode->i_sb->s_uuid; int err; /* Make sure the real fid stays 32bit aligned */ @@ -438,7 +438,8 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, * the price or reconnecting the dentry. */ dwords = buflen >> 2; - fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0); + fh_type = exportfs_encode_inode_fh(realinode, (void *)fh->fb.fid, + &dwords, NULL, 0); buflen = (dwords << 2); err = -EIO; @@ -479,7 +480,7 @@ struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin) if (!ovl_can_decode_fh(origin->d_sb)) return NULL; - return ovl_encode_real_fh(ofs, origin, false); + return ovl_encode_real_fh(ofs, d_inode(origin), false); } int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, @@ -504,7 +505,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, const struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, upper, true); + fh = ovl_encode_real_fh(ofs, d_inode(upper), true); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 5868cb222955..036c9f39a14d 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -223,6 +223,7 @@ static int ovl_check_encode_origin(struct dentry *dentry) static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, u32 *fid, int buflen) { + struct inode *inode = d_inode(dentry); struct ovl_fh *fh = NULL; int err, enc_lower; int len; @@ -236,8 +237,8 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, goto fail; /* Encode an upper or lower file handle */ - fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_dentry_lower(dentry) : - ovl_dentry_upper(dentry), !enc_lower); + fh = ovl_encode_real_fh(ofs, enc_lower ? 
ovl_inode_lower(inode) : + ovl_inode_upper(inode), !enc_lower); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 7e27b7d4adee..cea820cb3b55 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -542,7 +542,7 @@ int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, real, is_upper); + fh = ovl_encode_real_fh(ofs, d_inode(real), is_upper); err = PTR_ERR(fh); if (IS_ERR(fh)) { fh = NULL; @@ -738,7 +738,7 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, origin, false); + fh = ovl_encode_real_fh(ofs, d_inode(origin), false); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index b361f35762be..0021e2025020 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -865,7 +865,7 @@ int ovl_copy_up_with_data(struct dentry *dentry); int ovl_maybe_copy_up(struct dentry *dentry, int flags); int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new); int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); -struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, +struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode, bool is_upper); struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin); int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, -- cgit v1.2.3 From c45beebfde34aa71afbc48b2c54cdda623515037 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 5 Jan 2025 17:24:04 +0100 Subject: ovl: support encoding fid from inode with no alias Dmitry Safonov reported that a WARN_ON() assertion can be trigered by userspace when calling inotify_show_fdinfo() for an overlayfs watched inode, whose dentry aliases were discarded with drop_caches. The WARN_ON() assertion in inotify_show_fdinfo() was removed, because it is possible for encoding file handle to fail for other reason, but the impact of failing to encode an overlayfs file handle goes beyond this assertion. As shown in the LTP test case mentioned in the link below, failure to encode an overlayfs file handle from a non-aliased inode also leads to failure to report an fid with FAN_DELETE_SELF fanotify events. As Dmitry notes in his analyzis of the problem, ovl_encode_fh() fails if it cannot find an alias for the inode, but this failure can be fixed. ovl_encode_fh() seldom uses the alias and in the case of non-decodable file handles, as is often the case with fanotify fid info, ovl_encode_fh() never needs to use the alias to encode a file handle. Defer finding an alias until it is actually needed so ovl_encode_fh() will not fail in the common case of FAN_DELETE_SELF fanotify events. 
Fixes: 16aac5ad1fa9 ("ovl: support encoding non-decodable file handles") Reported-by: Dmitry Safonov Closes: https://lore.kernel.org/linux-fsdevel/CAOQ4uxiie81voLZZi2zXS1BziXZCM24nXqPAxbu8kxXCUWdwOg@mail.gmail.com/ Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250105162404.357058-3-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/overlayfs/export.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 036c9f39a14d..444aeeccb6da 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -176,35 +176,37 @@ static int ovl_connect_layer(struct dentry *dentry) * * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error. */ -static int ovl_check_encode_origin(struct dentry *dentry) +static int ovl_check_encode_origin(struct inode *inode) { - struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct ovl_fs *ofs = OVL_FS(inode->i_sb); bool decodable = ofs->config.nfs_export; + struct dentry *dentry; + int err; /* No upper layer? */ if (!ovl_upper_mnt(ofs)) return 1; /* Lower file handle for non-upper non-decodable */ - if (!ovl_dentry_upper(dentry) && !decodable) + if (!ovl_inode_upper(inode) && !decodable) return 1; /* Upper file handle for pure upper */ - if (!ovl_dentry_lower(dentry)) + if (!ovl_inode_lower(inode)) return 0; /* * Root is never indexed, so if there's an upper layer, encode upper for * root. */ - if (dentry == dentry->d_sb->s_root) + if (inode == d_inode(inode->i_sb->s_root)) return 0; /* * Upper decodable file handle for non-indexed upper. */ - if (ovl_dentry_upper(dentry) && decodable && - !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + if (ovl_inode_upper(inode) && decodable && + !ovl_test_flag(OVL_INDEX, inode)) return 0; /* @@ -213,17 +215,25 @@ static int ovl_check_encode_origin(struct dentry *dentry) * ovl_connect_layer() will try to make origin's layer "connected" by * copying up a "connectable" ancestor. */ - if (d_is_dir(dentry) && decodable) - return ovl_connect_layer(dentry); + if (!decodable || !S_ISDIR(inode->i_mode)) + return 1; + + dentry = d_find_any_alias(inode); + if (!dentry) + return -ENOENT; + + err = ovl_connect_layer(dentry); + dput(dentry); + if (err < 0) + return err; /* Lower file handle for indexed and non-upper dir/non-dir */ return 1; } -static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, +static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct inode *inode, u32 *fid, int buflen) { - struct inode *inode = d_inode(dentry); struct ovl_fh *fh = NULL; int err, enc_lower; int len; @@ -232,7 +242,7 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, * Check if we should encode a lower or upper file handle and maybe * copy up an ancestor to make lower file handle connectable. 
*/ - err = enc_lower = ovl_check_encode_origin(dentry); + err = enc_lower = ovl_check_encode_origin(inode); if (enc_lower < 0) goto fail; @@ -252,8 +262,8 @@ out: return err; fail: - pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i)\n", - dentry, err); + pr_warn_ratelimited("failed to encode file handle (ino=%lu, err=%i)\n", + inode->i_ino, err); goto out; } @@ -261,19 +271,13 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); - struct dentry *dentry; int bytes, buflen = *max_len << 2; /* TODO: encode connectable file handles */ if (parent) return FILEID_INVALID; - dentry = d_find_any_alias(inode); - if (!dentry) - return FILEID_INVALID; - - bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen); - dput(dentry); + bytes = ovl_dentry_to_fid(ofs, inode, fid, buflen); if (bytes <= 0) return FILEID_INVALID; -- cgit v1.2.3 From 6aecd91a5c5b68939cf4169e32bc49f3cd2dd329 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 2 Jan 2025 14:44:16 +1030 Subject: btrfs: avoid NULL pointer dereference if no valid extent tree [BUG] Syzbot reported a crash with the following call trace: BTRFS info (device loop0): scrub: started on devid 1 BUG: kernel NULL pointer dereference, address: 0000000000000208 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 106e70067 P4D 106e70067 PUD 107143067 PMD 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 UID: 0 PID: 689 Comm: repro Kdump: loaded Tainted: G O 6.13.0-rc4-custom+ #206 Tainted: [O]=OOT_MODULE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 02/02/2022 RIP: 0010:find_first_extent_item+0x26/0x1f0 [btrfs] Call Trace: scrub_find_fill_first_stripe+0x13d/0x3b0 [btrfs] scrub_simple_mirror+0x175/0x260 [btrfs] scrub_stripe+0x5d4/0x6c0 [btrfs] scrub_chunk+0xbb/0x170 [btrfs] scrub_enumerate_chunks+0x2f4/0x5f0 [btrfs] btrfs_scrub_dev+0x240/0x600 [btrfs] btrfs_ioctl+0x1dc8/0x2fa0 [btrfs] ? do_sys_openat2+0xa5/0xf0 __x64_sys_ioctl+0x97/0xc0 do_syscall_64+0x4f/0x120 entry_SYSCALL_64_after_hwframe+0x76/0x7e [CAUSE] The reproducer is using a corrupted image where extent tree root is corrupted, thus forcing to use "rescue=all,ro" mount option to mount the image. Then it triggered a scrub, but since scrub relies on extent tree to find where the data/metadata extents are, scrub_find_fill_first_stripe() relies on an non-empty extent root. But unfortunately scrub_find_fill_first_stripe() doesn't really expect an NULL pointer for extent root, it use extent_root to grab fs_info and triggered a NULL pointer dereference. [FIX] Add an extra check for a valid extent root at the beginning of scrub_find_fill_first_stripe(). The new error path is introduced by 42437a6386ff ("btrfs: introduce mount option rescue=ignorebadroots"), but that's pretty old, and later commit b979547513ff ("btrfs: scrub: introduce helper to find and fill sector info for a scrub_stripe") changed how we do scrub. So for kernels older than 6.6, the fix will need manual backport. 
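An illustrative reproduction outline, assuming an image whose extent tree root is bad enough that only rescue=all allows mounting (device and mount point hypothetical):

    mount -o ro,rescue=all /dev/sdX /mnt
    btrfs scrub start -B /mnt

Before this fix the scrub could dereference the NULL extent root; with it, scrub_find_fill_first_stripe() fails cleanly with -EUCLEAN.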
Reported-by: syzbot+339e9dbe3a2ca419b85d@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/67756935.050a0220.25abdd.0a12.GAE@google.com/ Fixes: 42437a6386ff ("btrfs: introduce mount option rescue=ignorebadroots") Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 204c928beaf9..531312efee8d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1541,6 +1541,10 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; + if (unlikely(!extent_root)) { + btrfs_err(fs_info, "no valid extent root for scrub"); + return -EUCLEAN; + } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * stripe->nr_sectors); scrub_stripe_reset_bitmaps(stripe); -- cgit v1.2.3 From 7467bc5959bf02ef5210ea7e7948e548565c799c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Dec 2024 07:43:43 +0100 Subject: btrfs: zoned: calculate max_extent_size properly on non-zoned setup Since commit 559218d43ec9 ("block: pre-calculate max_zone_append_sectors"), queue_limits's max_zone_append_sectors is default to be 0 and it is only updated when there is a zoned device. So, we have lim->max_zone_append_sectors = 0 when there is no zoned device in the filesystem. That leads to fs_info->max_zone_append_size and thus fs_info->max_extent_size to be 0, which is wrong and can for example lead to a divide by zero in count_max_extents(). Fix this by only capping fs_info->max_extent_size to fs_info->max_zone_append_size when it is non-zero. Based on a patch from Naohiro Aota , from which much of this commit message is stolen as well. Reported-by: Shinichiro Kawasaki Fixes: 559218d43ec9 ("block: pre-calculate max_zone_append_sectors") Tested-by: Shinichiro Kawasaki Reviewed-by: Johannes Thumshirn Reviewed-by: Naohiro Aota Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index cb32966380f5..5f9d3be1234a 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -745,8 +745,9 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) (u64)lim->max_segments << PAGE_SHIFT), fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; - if (fs_info->max_zone_append_size < fs_info->max_extent_size) - fs_info->max_extent_size = fs_info->max_zone_append_size; + + fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size, + fs_info->max_zone_append_size); /* * Check mount options here, because we might change fs_info->zoned -- cgit v1.2.3 From 0ee4736c003daded513de0ff112d4a1e9c85bbab Mon Sep 17 00:00:00 2001 From: Mikhail Zaslonko Date: Wed, 18 Dec 2024 11:32:51 +0100 Subject: btrfs: zlib: fix avail_in bytes for s390 zlib HW compression path Since the input data length passed to zlib_compress_folios() can be arbitrary, always setting strm.avail_in to a multiple of PAGE_SIZE may cause read-in bytes to exceed the input range. Currently this triggers an assert in btrfs_compress_folios() on the debug kernel (see below). Fix strm.avail_in calculation for S390 hardware acceleration path. assertion failed: *total_in <= orig_len, in fs/btrfs/compression.c:1041 ------------[ cut here ]------------ kernel BUG at fs/btrfs/compression.c:1041! 
monitor event: 0040 ilc:2 [#1] PREEMPT SMP CPU: 16 UID: 0 PID: 325 Comm: kworker/u273:3 Not tainted 6.13.0-20241204.rc1.git6.fae3b21430ca.300.fc41.s390x+debug #1 Hardware name: IBM 3931 A01 703 (z/VM 7.4.0) Workqueue: btrfs-delalloc btrfs_work_helper Krnl PSW : 0704d00180000000 0000021761df6538 (btrfs_compress_folios+0x198/0x1a0) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3 Krnl GPRS: 0000000080000000 0000000000000001 0000000000000047 0000000000000000 0000000000000006 ffffff01757bb000 000001976232fcc0 000000000000130c 000001976232fcd0 000001976232fcc8 00000118ff4a0e30 0000000000000001 00000111821ab400 0000011100000000 0000021761df6534 000001976232fb58 Krnl Code: 0000021761df6528: c020006f5ef4 larl %r2,0000021762be2310 0000021761df652e: c0e5ffbd09d5 brasl %r14,00000217615978d8 #0000021761df6534: af000000 mc 0,0 >0000021761df6538: 0707 bcr 0,%r7 0000021761df653a: 0707 bcr 0,%r7 0000021761df653c: 0707 bcr 0,%r7 0000021761df653e: 0707 bcr 0,%r7 0000021761df6540: c004004bb7ec brcl 0,000002176276d518 Call Trace: [<0000021761df6538>] btrfs_compress_folios+0x198/0x1a0 ([<0000021761df6534>] btrfs_compress_folios+0x194/0x1a0) [<0000021761d97788>] compress_file_range+0x3b8/0x6d0 [<0000021761dcee7c>] btrfs_work_helper+0x10c/0x160 [<0000021761645760>] process_one_work+0x2b0/0x5d0 [<000002176164637e>] worker_thread+0x20e/0x3e0 [<000002176165221a>] kthread+0x15a/0x170 [<00000217615b859c>] __ret_from_fork+0x3c/0x60 [<00000217626e72d2>] ret_from_fork+0xa/0x38 INFO: lockdep is turned off. Last Breaking-Event-Address: [<0000021761597924>] _printk+0x4c/0x58 Kernel panic - not syncing: Fatal exception: panic_on_oops Fixes: fd1e75d0105d ("btrfs: make compression path to be subpage compatible") CC: stable@vger.kernel.org # 6.12+ Acked-by: Ilya Leoshkevich Reviewed-by: Qu Wenruo Signed-off-by: Mikhail Zaslonko Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ddf0d5a448a7..c9e92c6941ec 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -174,10 +174,10 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; - workspace->strm.avail_in = - (in_buf_folios << PAGE_SHIFT); } workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = min(bytes_left, + in_buf_folios << PAGE_SHIFT); } else { unsigned int pg_off; unsigned int cur_len; -- cgit v1.2.3 From 8fd56ad6e7c90ac2bddb0741c6b248c8c5d56ac8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 6 Jan 2025 16:21:00 +0000 Subject: afs: Fix the maximum cell name length The kafs filesystem limits the maximum length of a cell to 256 bytes, but a problem occurs if someone actually does that: kafs tries to create a directory under /proc/net/afs/ with the name of the cell, but that fails with a warning: WARNING: CPU: 0 PID: 9 at fs/proc/generic.c:405 because procfs limits the maximum filename length to 255. However, the DNS limits the maximum lookup length and, by extension, the maximum cell name, to 255 less two (length count and trailing NUL). Fix this by limiting the maximum acceptable cellname length to 253. This also allows us to be sure we can create the "/afs/./" mountpoint too. Further, split the YFS VL record cell name maximum to be the 256 allowed by the protocol and ignore the record retrieved by YFSVL.GetCellName if it exceeds 253. 
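In other words: DNS limits a name to 255 octets on the wire, and two of those octets are consumed by the length count and the trailing NUL, so the longest usable cell name is 255 - 2 = 253 bytes (the new AFS_MAXCELLNAME), which also fits under procfs's 255-byte filename limit; YFS_VL_MAXCELLNAME stays at the protocol's 256 so oversized records can still be parsed and then ignored.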
Fixes: c3e9f888263b ("afs: Implement client support for the YFSVL.GetCellName RPC op") Reported-by: syzbot+7848fee1f1e5c53f912b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/6776d25d.050a0220.3a8527.0048.GAE@google.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/376236.1736180460@warthog.procyon.org.uk Tested-by: syzbot+7848fee1f1e5c53f912b@syzkaller.appspotmail.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/afs.h | 2 +- fs/afs/afs_vl.h | 1 + fs/afs/vl_alias.c | 8 ++++++-- fs/afs/vlclient.c | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/afs/afs.h b/fs/afs/afs.h index b488072aee87..ec3db00bd081 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -10,7 +10,7 @@ #include -#define AFS_MAXCELLNAME 256 /* Maximum length of a cell name */ +#define AFS_MAXCELLNAME 253 /* Maximum length of a cell name (DNS limited) */ #define AFS_MAXVOLNAME 64 /* Maximum length of a volume name */ #define AFS_MAXNSERVERS 8 /* Maximum servers in a basic volume record */ #define AFS_NMAXNSERVERS 13 /* Maximum servers in a N/U-class volume record */ diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h index a06296c8827d..b835e25a2c02 100644 --- a/fs/afs/afs_vl.h +++ b/fs/afs/afs_vl.h @@ -13,6 +13,7 @@ #define AFS_VL_PORT 7003 /* volume location service port */ #define VL_SERVICE 52 /* RxRPC service ID for the Volume Location service */ #define YFS_VL_SERVICE 2503 /* Service ID for AuriStor upgraded VL service */ +#define YFS_VL_MAXCELLNAME 256 /* Maximum length of a cell name in YFS protocol */ enum AFSVL_Operations { VLGETENTRYBYID = 503, /* AFS Get VLDB entry by ID */ diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index 9f36e14f1c2d..f9e76b604f31 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -253,6 +253,7 @@ static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key) static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) { struct afs_cell *master; + size_t name_len; char *cell_name; cell_name = afs_vl_get_cell_name(cell, key); @@ -264,8 +265,11 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) return 0; } - master = afs_lookup_cell(cell->net, cell_name, strlen(cell_name), - NULL, false); + name_len = strlen(cell_name); + if (!name_len || name_len > AFS_MAXCELLNAME) + master = ERR_PTR(-EOPNOTSUPP); + else + master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false); kfree(cell_name); if (IS_ERR(master)) return PTR_ERR(master); diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index cac75f89b64a..55dd0fc5aad7 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -697,7 +697,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) return ret; namesz = ntohl(call->tmp); - if (namesz > AFS_MAXCELLNAME) + if (namesz > YFS_VL_MAXCELLNAME) return afs_protocol_error(call, afs_eproto_cellname_len); paddedsz = (namesz + 3) & ~3; call->count = namesz; -- cgit v1.2.3 From 5cf8f938bf5ca441a02a3bbf6ef772963aa387b3 Mon Sep 17 00:00:00 2001 From: Christian Kujau Date: Mon, 6 Jan 2025 16:32:05 +0100 Subject: vbox: Enable VBOXGUEST and VBOXSF_FS on ARM64 Now that VirtualBox is able to run as a host on arm64 (e.g. the Apple M3 processors) we can enable VBOXSF_FS (and in turn VBOXGUEST) for this architecture. Tested with various runs of bonnie++ and dbench on an Apple MacBook Pro with the latest Virtualbox 7.1.4 r165100 installed. 
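For example, an arm64 guest kernel can now enable shared-folder support with a config fragment like:

    CONFIG_VBOXGUEST=m
    CONFIG_VBOXSF_FS=m

after which a host share can be mounted as usual, e.g. mount -t vboxsf <sharename> /mnt (share name hypothetical).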
Signed-off-by: Christian Kujau Link: https://lore.kernel.org/r/7384d96c-2a77-39b0-2306-90129bae9342@nerdbynature.de Reviewed-by: Hans de Goede Signed-off-by: Christian Brauner --- drivers/virt/vboxguest/Kconfig | 2 +- fs/vboxsf/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/drivers/virt/vboxguest/Kconfig b/drivers/virt/vboxguest/Kconfig index cc329887bfae..11b153e7454e 100644 --- a/drivers/virt/vboxguest/Kconfig +++ b/drivers/virt/vboxguest/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config VBOXGUEST tristate "Virtual Box Guest integration support" - depends on X86 && PCI && INPUT + depends on (ARM64 || X86) && PCI && INPUT help This is a driver for the Virtual Box Guest PCI device used in Virtual Box virtual machines. Enabling this driver will add diff --git a/fs/vboxsf/Kconfig b/fs/vboxsf/Kconfig index b84586ae08b3..d4694026db8b 100644 --- a/fs/vboxsf/Kconfig +++ b/fs/vboxsf/Kconfig @@ -1,6 +1,6 @@ config VBOXSF_FS tristate "VirtualBox guest shared folder (vboxsf) support" - depends on X86 && VBOXGUEST + depends on (ARM64 || X86) && VBOXGUEST select NLS help VirtualBox hosts can share folders with guests, this driver -- cgit v1.2.3 From f8f25893a477a4da4414c3e40ddd51d77fac9cfc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 29 Nov 2024 13:15:37 +0100 Subject: fs: debugfs: differentiate short fops with proxy ops Geert reported that my previous short fops debugfs changes broke m68k, because it only has mandatory alignement of two, so we can't stash the "is it short" information into the pointer (as we already did with the "is it real" bit.) Instead, exploit the fact that debugfs_file_get() called on an already open file will already find that the fsdata is no longer the real fops but rather the allocated data that already distinguishes full/short ops, so only open() needs to be able to distinguish. We can achieve that by using two different open functions. Unfortunately this requires another set of full file ops, increasing the size by 536 bytes (x86-64), but that's still a reasonable trade-off given that only converting some of the wireless stack gained over 28k. This brings the total cost of this to around 1k, for wins of 28k (all x86-64). Reported-and-tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/CAMuHMdWu_9-L2Te101w8hU7H_2yobJFPXSwwUmGHSJfaPWDKiQ@mail.gmail.com Fixes: 8dc6d81c6b2a ("debugfs: add small file operations for most files") Signed-off-by: Johannes Berg Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20241129121536.30989-2-johannes@sipsolutions.net Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 72 +++++++++++++++++++++++++++++++++++---------------- fs/debugfs/inode.c | 11 +++----- fs/debugfs/internal.h | 6 +---- 3 files changed, 55 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 47dc96dfe386..bdb4f2ca0506 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -64,22 +64,13 @@ const struct file_operations *debugfs_real_fops(const struct file *filp) } EXPORT_SYMBOL_GPL(debugfs_real_fops); -/** - * debugfs_file_get - mark the beginning of file data access - * @dentry: the dentry object whose data is being accessed. - * - * Up to a matching call to debugfs_file_put(), any successive call - * into the file removing functions debugfs_remove() and - * debugfs_remove_recursive() will block. 
Since associated private - * file data may only get freed after a successful return of any of - * the removal functions, you may safely access it after a successful - * call to debugfs_file_get() without worrying about lifetime issues. - * - * If -%EIO is returned, the file has already been removed and thus, - * it is not safe to access any of its data. If, on the other hand, - * it is allowed to access the file data, zero is returned. - */ -int debugfs_file_get(struct dentry *dentry) +enum dbgfs_get_mode { + DBGFS_GET_ALREADY, + DBGFS_GET_REGULAR, + DBGFS_GET_SHORT, +}; + +static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) { struct debugfs_fsdata *fsd; void *d_fsd; @@ -96,15 +87,17 @@ int debugfs_file_get(struct dentry *dentry) if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) { fsd = d_fsd; } else { + if (WARN_ON(mode == DBGFS_GET_ALREADY)) + return -EINVAL; + fsd = kmalloc(sizeof(*fsd), GFP_KERNEL); if (!fsd) return -ENOMEM; - if ((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT) { + if (mode == DBGFS_GET_SHORT) { fsd->real_fops = NULL; fsd->short_fops = (void *)((unsigned long)d_fsd & - ~(DEBUGFS_FSDATA_IS_REAL_FOPS_BIT | - DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT)); + ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); } else { fsd->real_fops = (void *)((unsigned long)d_fsd & ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); @@ -138,6 +131,26 @@ int debugfs_file_get(struct dentry *dentry) return 0; } + +/** + * debugfs_file_get - mark the beginning of file data access + * @dentry: the dentry object whose data is being accessed. + * + * Up to a matching call to debugfs_file_put(), any successive call + * into the file removing functions debugfs_remove() and + * debugfs_remove_recursive() will block. Since associated private + * file data may only get freed after a successful return of any of + * the removal functions, you may safely access it after a successful + * call to debugfs_file_get() without worrying about lifetime issues. + * + * If -%EIO is returned, the file has already been removed and thus, + * it is not safe to access any of its data. If, on the other hand, + * it is allowed to access the file data, zero is returned. + */ +int debugfs_file_get(struct dentry *dentry) +{ + return __debugfs_file_get(dentry, DBGFS_GET_ALREADY); +} EXPORT_SYMBOL_GPL(debugfs_file_get); /** @@ -424,7 +437,8 @@ static void __full_proxy_fops_init(struct file_operations *proxy_fops, proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl; } -static int full_proxy_open(struct inode *inode, struct file *filp) +static int full_proxy_open(struct inode *inode, struct file *filp, + enum dbgfs_get_mode mode) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops; @@ -432,7 +446,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp) struct debugfs_fsdata *fsd; int r; - r = debugfs_file_get(dentry); + r = __debugfs_file_get(dentry, mode); if (r) return r == -EIO ? 
-ENOENT : r; @@ -491,8 +505,22 @@ out: return r; } +static int full_proxy_open_regular(struct inode *inode, struct file *filp) +{ + return full_proxy_open(inode, filp, DBGFS_GET_REGULAR); +} + const struct file_operations debugfs_full_proxy_file_operations = { - .open = full_proxy_open, + .open = full_proxy_open_regular, +}; + +static int full_proxy_open_short(struct inode *inode, struct file *filp) +{ + return full_proxy_open(inode, filp, DBGFS_GET_SHORT); +} + +const struct file_operations debugfs_full_short_proxy_file_operations = { + .open = full_proxy_open_short, }; ssize_t debugfs_attr_read(struct file *file, char __user *buf, diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 38a9c7eb97e6..65e46c7b6bf1 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -455,8 +455,7 @@ struct dentry *debugfs_create_file_full(const char *name, umode_t mode, const struct file_operations *fops) { if (WARN_ON((unsigned long)fops & - (DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT | - DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) + DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) return ERR_PTR(-EINVAL); return __debugfs_create_file(name, mode, parent, data, @@ -471,15 +470,13 @@ struct dentry *debugfs_create_file_short(const char *name, umode_t mode, const struct debugfs_short_fops *fops) { if (WARN_ON((unsigned long)fops & - (DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT | - DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) + DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) return ERR_PTR(-EINVAL); return __debugfs_create_file(name, mode, parent, data, - fops ? &debugfs_full_proxy_file_operations : + fops ? &debugfs_full_short_proxy_file_operations : &debugfs_noop_file_operations, - (const void *)((unsigned long)fops | - DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT)); + fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_short); diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index a3edfa4f0d8e..bbae4a228ef4 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -15,6 +15,7 @@ struct file_operations; extern const struct file_operations debugfs_noop_file_operations; extern const struct file_operations debugfs_open_proxy_file_operations; extern const struct file_operations debugfs_full_proxy_file_operations; +extern const struct file_operations debugfs_full_short_proxy_file_operations; struct debugfs_fsdata { const struct file_operations *real_fops; @@ -40,11 +41,6 @@ struct debugfs_fsdata { * pointer gets its lowest bit set. */ #define DEBUGFS_FSDATA_IS_REAL_FOPS_BIT BIT(0) -/* - * A dentry's ->d_fsdata, when pointing to real fops, is with - * short fops instead of full fops. 
- */ -#define DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT BIT(1) /* Access BITS */ #define DEBUGFS_ALLOW_API BIT(0) -- cgit v1.2.3 From 24edfbdedf19998366205130cfc93158a475497e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Dec 2024 08:12:04 +0000 Subject: debugfs: fix missing mutex_destroy() in short_fops case we need that in ->real_fops == NULL, ->short_fops != NULL case Fixes: 8dc6d81c6b2a "debugfs: add small file operations for most files" Signed-off-by: Al Viro Link: https://lore.kernel.org/r/20241229081223.3193228-1-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 65e46c7b6bf1..e752009de929 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -229,7 +229,7 @@ static void debugfs_release_dentry(struct dentry *dentry) return; /* check it wasn't a dir (no fsdata) or automount (no real_fops) */ - if (fsd && fsd->real_fops) { + if (fsd && (fsd->real_fops || fsd->short_fops)) { WARN_ON(!list_empty(&fsd->cancellations)); mutex_destroy(&fsd->cancellations_mtx); } -- cgit v1.2.3 From 2ac538e40278a2c0c051cca81bcaafc547d61372 Mon Sep 17 00:00:00 2001 From: He Wang Date: Mon, 6 Jan 2025 03:39:54 +0000 Subject: ksmbd: fix unexpectedly changed path in ksmbd_vfs_kern_path_locked When `ksmbd_vfs_kern_path_locked` met an error and it is not the last entry, it will exit without restoring changed path buffer. But later this buffer may be used as the filename for creation. Fixes: c5a709f08d40 ("ksmbd: handle caseless file creation") Signed-off-by: He Wang Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/vfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 88d167a5f8b7..40f08eac519c 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -1264,6 +1264,8 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, filepath, flags, path); + if (!is_last) + next[0] = '/'; if (err) goto out2; else if (is_last) @@ -1271,7 +1273,6 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, path_put(parent_path); *parent_path = *path; - next[0] = '/'; remain_len -= filename_len + 1; } -- cgit v1.2.3 From e8580b4c600e085b3c8e6404392de2f822d4c132 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 7 Jan 2025 17:41:21 +0900 Subject: ksmbd: Implement new SMB3 POSIX type As SMB3 posix extension specification, Give posix file type to posix mode. 
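The type is reported in the bits above the low permission bits of Mode. As a rough sketch of the resulting encoding (the helper below is illustrative only and not part of the patch; the real change open-codes this switch in smb2pdu.c using the POSIX_TYPE_* and POSIX_FILETYPE_SHIFT constants added to smb2pdu.h):

/* Illustrative only: build the SMB3 POSIX Mode field from a kernel mode.
 * Permission bits stay in the low (0777) bits, the POSIX file type is
 * placed at bit POSIX_FILETYPE_SHIFT (12) and up; POSIX_TYPE_FILE is 0,
 * so regular files need no extra bits. */
static __le32 example_posix_mode(umode_t mode)
{
	u32 m = mode & 0777;

	switch (mode & S_IFMT) {
	case S_IFDIR:	m |= POSIX_TYPE_DIR	<< POSIX_FILETYPE_SHIFT; break;
	case S_IFLNK:	m |= POSIX_TYPE_SYMLINK	<< POSIX_FILETYPE_SHIFT; break;
	case S_IFCHR:	m |= POSIX_TYPE_CHARDEV	<< POSIX_FILETYPE_SHIFT; break;
	case S_IFBLK:	m |= POSIX_TYPE_BLKDEV	<< POSIX_FILETYPE_SHIFT; break;
	case S_IFIFO:	m |= POSIX_TYPE_FIFO	<< POSIX_FILETYPE_SHIFT; break;
	case S_IFSOCK:	m |= POSIX_TYPE_SOCKET	<< POSIX_FILETYPE_SHIFT; break;
	default:	m |= POSIX_TYPE_FILE	<< POSIX_FILETYPE_SHIFT; break;
	}

	return cpu_to_le32(m);
}

A client decodes the field the same way: permissions are Mode & 0777 and the file type is Mode >> POSIX_FILETYPE_SHIFT.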
https://www.samba.org/~slow/SMB3_POSIX/fscc_posix_extensions.html#posix-file-type-definition Cc: stable@vger.kernel.org Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 40 ++++++++++++++++++++++++++++++++++++++++ fs/smb/server/smb2pdu.h | 10 ++++++++++ 2 files changed, 50 insertions(+) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 433e33c04039..772deec5b90f 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -3994,6 +3994,26 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, posix_info->DeviceId = cpu_to_le32(ksmbd_kstat->kstat->rdev); posix_info->HardLinks = cpu_to_le32(ksmbd_kstat->kstat->nlink); posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode & 0777); + switch (ksmbd_kstat->kstat->mode & S_IFMT) { + case S_IFDIR: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_DIR << POSIX_FILETYPE_SHIFT); + break; + case S_IFLNK: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_SYMLINK << POSIX_FILETYPE_SHIFT); + break; + case S_IFCHR: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_CHARDEV << POSIX_FILETYPE_SHIFT); + break; + case S_IFBLK: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_BLKDEV << POSIX_FILETYPE_SHIFT); + break; + case S_IFIFO: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_FIFO << POSIX_FILETYPE_SHIFT); + break; + case S_IFSOCK: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_SOCKET << POSIX_FILETYPE_SHIFT); + } + posix_info->Inode = cpu_to_le64(ksmbd_kstat->kstat->ino); posix_info->DosAttributes = S_ISDIR(ksmbd_kstat->kstat->mode) ? @@ -5184,6 +5204,26 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); file_info->HardLinks = cpu_to_le32(stat.nlink); file_info->Mode = cpu_to_le32(stat.mode & 0777); + switch (stat.mode & S_IFMT) { + case S_IFDIR: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_DIR << POSIX_FILETYPE_SHIFT); + break; + case S_IFLNK: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_SYMLINK << POSIX_FILETYPE_SHIFT); + break; + case S_IFCHR: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_CHARDEV << POSIX_FILETYPE_SHIFT); + break; + case S_IFBLK: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_BLKDEV << POSIX_FILETYPE_SHIFT); + break; + case S_IFIFO: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_FIFO << POSIX_FILETYPE_SHIFT); + break; + case S_IFSOCK: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_SOCKET << POSIX_FILETYPE_SHIFT); + } + file_info->DeviceId = cpu_to_le32(stat.rdev); /* diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h index 649dacf7e8c4..17a0b18a8406 100644 --- a/fs/smb/server/smb2pdu.h +++ b/fs/smb/server/smb2pdu.h @@ -502,4 +502,14 @@ static inline void *smb2_get_msg(void *buf) return buf + 4; } +#define POSIX_TYPE_FILE 0 +#define POSIX_TYPE_DIR 1 +#define POSIX_TYPE_SYMLINK 2 +#define POSIX_TYPE_CHARDEV 3 +#define POSIX_TYPE_BLKDEV 4 +#define POSIX_TYPE_FIFO 5 +#define POSIX_TYPE_SOCKET 6 + +#define POSIX_FILETYPE_SHIFT 12 + #endif /* _SMB2PDU_H */ -- cgit v1.2.3 From 7ee7c9b39ed36caf983706f5b893cc5c37a79071 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Dec 2024 05:27:35 +0100 Subject: xfs: don't return an error from xfs_update_last_rtgroup_size for !XFS_RT Non-rtg file systems have a fake RT group even if they do not have a RT device, and thus an rgcount of 1. Ensure xfs_update_last_rtgroup_size doesn't fail when called for !XFS_RT to handle this case. 
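A caller-side sketch of the failure mode (the function below is made up for illustration and is not the real growfs code; it only mimics how the grow path propagates the return value). Because rgcount is 1 even on !CONFIG_XFS_RT builds, the size update is reached unconditionally, so a stub returning -EOPNOTSUPP turned an otherwise valid grow into an error:

/* Hypothetical caller: the point is only that the return value is
 * propagated, so the !XFS_RT stub has to return 0 rather than
 * -EOPNOTSUPP. */
static int example_grow_data(struct xfs_mount *mp, xfs_rgnumber_t prev_rgcount)
{
	int error;

	error = xfs_update_last_rtgroup_size(mp, prev_rgcount);
	if (error)	/* always taken with the old -EOPNOTSUPP stub */
		return error;

	/* ... rest of the grow operation ... */
	return 0;
}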
Fixes: 87fe4c34a383 ("xfs: create incore realtime group structures") Reported-by: Brian Foster Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_rtgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 7e7e491ff06f..2d7822644eff 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -272,7 +272,7 @@ static inline int xfs_initialize_rtgroups(struct xfs_mount *mp, } # define xfs_rtgroup_extents(mp, rgno) (0) -# define xfs_update_last_rtgroup_size(mp, rgno) (-EOPNOTSUPP) +# define xfs_update_last_rtgroup_size(mp, rgno) (0) # define xfs_rtgroup_lock(rtg, gf) ((void)0) # define xfs_rtgroup_unlock(rtg, gf) ((void)0) # define xfs_rtgroup_trans_join(tp, rtg, gf) ((void)0) -- cgit v1.2.3 From 41f70d8e16349c65abdc0dd88a7d0ab94e5ce639 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:06 +0200 Subject: kthread: Unify kthread_create_on_cpu() and kthread_create_worker_on_cpu() automatic format kthread_create_on_cpu() uses the CPU argument as an implicit and unique printf argument to add to the format whereas kthread_create_worker_on_cpu() still relies on explicitly passing the printf arguments. This difference in behaviour is error prone and doesn't help standardizing per-CPU kthread names. Unify the behaviours and convert kthread_create_worker_on_cpu() to use the printf behaviour of kthread_create_on_cpu(). Signed-off-by: Frederic Weisbecker --- fs/erofs/zdata.c | 2 +- include/linux/kthread.h | 21 +++++++++++++----- kernel/kthread.c | 59 +++++++++++++++++++++++++++++-------------------- 3 files changed, 52 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 01f147505487..a23392327ce2 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -320,7 +320,7 @@ static void erofs_destroy_percpu_workers(void) static struct kthread_worker *erofs_init_percpu_worker(int cpu) { struct kthread_worker *worker = - kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu); + kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u"); if (IS_ERR(worker)) return worker; diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 30209bdf83a2..0c66e7c1092a 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -187,13 +187,24 @@ extern void __kthread_init_worker(struct kthread_worker *worker, int kthread_worker_fn(void *worker_ptr); -__printf(2, 3) -struct kthread_worker * -kthread_create_worker(unsigned int flags, const char namefmt[], ...); +__printf(3, 4) +struct kthread_worker *kthread_create_worker_on_node(unsigned int flags, + int node, + const char namefmt[], ...); -__printf(3, 4) struct kthread_worker * +#define kthread_create_worker(flags, namefmt, ...) 
\ +({ \ + struct kthread_worker *__kw \ + = kthread_create_worker_on_node(flags, NUMA_NO_NODE, \ + namefmt, ## __VA_ARGS__); \ + if (!IS_ERR(__kw)) \ + wake_up_process(__kw->task); \ + __kw; \ +}) + +struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, - const char namefmt[], ...); + const char namefmt[]); bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); diff --git a/kernel/kthread.c b/kernel/kthread.c index 32d620a99f11..2d52126d5da0 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1030,12 +1030,11 @@ repeat: EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * -__kthread_create_worker(int cpu, unsigned int flags, - const char namefmt[], va_list args) +__kthread_create_worker_on_node(unsigned int flags, int node, + const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; - int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) @@ -1043,20 +1042,14 @@ __kthread_create_worker(int cpu, unsigned int flags, kthread_init_worker(worker); - if (cpu >= 0) - node = cpu_to_node(cpu); - task = __kthread_create_on_node(kthread_worker_fn, worker, - node, namefmt, args); + node, namefmt, args); if (IS_ERR(task)) goto fail_task; - if (cpu >= 0) - kthread_bind(task, cpu); - worker->flags = flags; worker->task = task; - wake_up_process(task); + return worker; fail_task: @@ -1067,6 +1060,7 @@ fail_task: /** * kthread_create_worker - create a kthread worker * @flags: flags modifying the default behavior of the worker + * @node: task structure for the thread is allocated on this node * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) @@ -1074,25 +1068,49 @@ fail_task: * when the caller was killed by a fatal signal. */ struct kthread_worker * -kthread_create_worker(unsigned int flags, const char namefmt[], ...) +kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...) +{ + struct kthread_worker *worker; + va_list args; + + va_start(args, namefmt); + worker = __kthread_create_worker_on_node(flags, node, namefmt, args); + va_end(args); + + if (worker) + wake_up_process(worker->task); + + return worker; +} +EXPORT_SYMBOL(kthread_create_worker_on_node); + +static __printf(3, 4) struct kthread_worker * +__kthread_create_worker_on_cpu(int cpu, unsigned int flags, + const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); - worker = __kthread_create_worker(-1, flags, namefmt, args); + worker = __kthread_create_worker_on_node(flags, cpu_to_node(cpu), + namefmt, args); va_end(args); + if (worker) { + kthread_bind(worker->task, cpu); + wake_up_process(worker->task); + } + return worker; } -EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker - * @namefmt: printf-style name for the kthread worker (task). + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. @@ -1124,16 +1142,9 @@ EXPORT_SYMBOL(kthread_create_worker); */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, - const char namefmt[], ...) 
+ const char namefmt[]) { - struct kthread_worker *worker; - va_list args; - - va_start(args, namefmt); - worker = __kthread_create_worker(cpu, flags, namefmt, args); - va_end(args); - - return worker; + return __kthread_create_worker_on_cpu(cpu, flags, namefmt, cpu); } EXPORT_SYMBOL(kthread_create_worker_on_cpu); -- cgit v1.2.3 From b04e317b522630b46f78ee62ecbdc5734e8d43de Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:07 +0200 Subject: treewide: Introduce kthread_run_worker[_on_cpu]() kthread_create() creates a kthread without running it yet. kthread_run() creates a kthread and runs it. On the other hand, kthread_create_worker() creates a kthread worker and runs it. This difference in behaviours is confusing. Also there is no way to create a kthread worker and affine it using kthread_bind_mask() or kthread_affine_preferred() before starting it. Consolidate the behaviours and introduce kthread_run_worker[_on_cpu]() that behaves just like kthread_run(). kthread_create_worker[_on_cpu]() will now only create a kthread worker without starting it. Signed-off-by: Frederic Weisbecker Signed-off-by: Dan Carpenter --- arch/x86/kvm/i8254.c | 2 +- crypto/crypto_engine.c | 2 +- drivers/cpufreq/cppc_cpufreq.c | 2 +- drivers/gpu/drm/drm_vblank_work.c | 2 +- .../gpu/drm/i915/gem/selftests/i915_gem_context.c | 2 +- drivers/gpu/drm/i915/gt/selftest_execlists.c | 2 +- drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 2 +- drivers/gpu/drm/i915/gt/selftest_slpc.c | 2 +- drivers/gpu/drm/i915/selftests/i915_request.c | 8 ++-- drivers/gpu/drm/msm/disp/msm_disp_snapshot.c | 2 +- drivers/gpu/drm/msm/msm_atomic.c | 2 +- drivers/gpu/drm/msm/msm_gpu.c | 2 +- drivers/gpu/drm/msm/msm_kms.c | 2 +- .../media/platform/chips-media/wave5/wave5-vpu.c | 2 +- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- drivers/net/ethernet/intel/ice/ice_dpll.c | 2 +- drivers/net/ethernet/intel/ice/ice_gnss.c | 2 +- drivers/net/ethernet/intel/ice/ice_ptp.c | 2 +- drivers/platform/chrome/cros_ec_spi.c | 2 +- drivers/ptp/ptp_clock.c | 2 +- drivers/spi/spi.c | 2 +- drivers/usb/typec/tcpm/tcpm.c | 2 +- drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +- drivers/watchdog/watchdog_dev.c | 2 +- fs/erofs/zdata.c | 2 +- include/linux/kthread.h | 48 ++++++++++++++++++---- kernel/kthread.c | 31 ++++---------- kernel/rcu/tree.c | 4 +- kernel/sched/ext.c | 2 +- kernel/workqueue.c | 2 +- net/dsa/tag_ksz.c | 2 +- net/dsa/tag_ocelot_8021q.c | 2 +- net/dsa/tag_sja1105.c | 2 +- 33 files changed, 83 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index cd57a517d04a..d7ab8780ab9e 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -681,7 +681,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_nr = pid_vnr(pid); put_pid(pid); - pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr); + pit->worker = kthread_run_worker(0, "kvm-pit/%d", pid_nr); if (IS_ERR(pit->worker)) goto fail_kthread; diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c index e60a0eb628e8..c7c16da5e649 100644 --- a/crypto/crypto_engine.c +++ b/crypto/crypto_engine.c @@ -517,7 +517,7 @@ struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev, crypto_init_queue(&engine->queue, qlen); spin_lock_init(&engine->queue_lock); - engine->kworker = kthread_create_worker(0, "%s", engine->name); + engine->kworker = kthread_run_worker(0, "%s", engine->name); if (IS_ERR(engine->kworker)) { dev_err(dev, "failed to create crypto request pump task\n"); return NULL; diff --git 
a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index bd8f75accfa0..2486a6c5256a 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -225,7 +225,7 @@ static void __init cppc_freq_invariance_init(void) if (fie_disabled) return; - kworker_fie = kthread_create_worker(0, "cppc_fie"); + kworker_fie = kthread_run_worker(0, "cppc_fie"); if (IS_ERR(kworker_fie)) { pr_warn("%s: failed to create kworker_fie: %ld\n", __func__, PTR_ERR(kworker_fie)); diff --git a/drivers/gpu/drm/drm_vblank_work.c b/drivers/gpu/drm/drm_vblank_work.c index 1752ffb44e1d..9cc71120246f 100644 --- a/drivers/gpu/drm/drm_vblank_work.c +++ b/drivers/gpu/drm/drm_vblank_work.c @@ -277,7 +277,7 @@ int drm_vblank_worker_init(struct drm_vblank_crtc *vblank) INIT_LIST_HEAD(&vblank->pending_work); init_waitqueue_head(&vblank->work_wait_queue); - worker = kthread_create_worker(0, "card%d-crtc%d", + worker = kthread_run_worker(0, "card%d-crtc%d", vblank->dev->primary->index, vblank->pipe); if (IS_ERR(worker)) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c index 89d4dc8b60c6..eb0158e43417 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c @@ -369,7 +369,7 @@ static int live_parallel_switch(void *arg) if (!data[n].ce[0]) continue; - worker = kthread_create_worker(0, "igt/parallel:%s", + worker = kthread_run_worker(0, "igt/parallel:%s", data[n].ce[0]->engine->name); if (IS_ERR(worker)) { err = PTR_ERR(worker); diff --git a/drivers/gpu/drm/i915/gt/selftest_execlists.c b/drivers/gpu/drm/i915/gt/selftest_execlists.c index 222ca7c44951..81c31396eceb 100644 --- a/drivers/gpu/drm/i915/gt/selftest_execlists.c +++ b/drivers/gpu/drm/i915/gt/selftest_execlists.c @@ -3574,7 +3574,7 @@ static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags) arg[id].batch = NULL; arg[id].count = 0; - worker[id] = kthread_create_worker(0, "igt/smoke:%d", id); + worker[id] = kthread_run_worker(0, "igt/smoke:%d", id); if (IS_ERR(worker[id])) { err = PTR_ERR(worker[id]); break; diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c index 9ce8ff1c04fe..9d3aeb237295 100644 --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c @@ -1025,7 +1025,7 @@ static int __igt_reset_engines(struct intel_gt *gt, threads[tmp].engine = other; threads[tmp].flags = flags; - worker = kthread_create_worker(0, "igt/%s", + worker = kthread_run_worker(0, "igt/%s", other->name); if (IS_ERR(worker)) { err = PTR_ERR(worker); diff --git a/drivers/gpu/drm/i915/gt/selftest_slpc.c b/drivers/gpu/drm/i915/gt/selftest_slpc.c index 4ecc4ae74a54..e218b229681f 100644 --- a/drivers/gpu/drm/i915/gt/selftest_slpc.c +++ b/drivers/gpu/drm/i915/gt/selftest_slpc.c @@ -489,7 +489,7 @@ static int live_slpc_tile_interaction(void *arg) return -ENOMEM; for_each_gt(gt, i915, i) { - threads[i].worker = kthread_create_worker(0, "igt/slpc_parallel:%d", gt->info.id); + threads[i].worker = kthread_run_worker(0, "igt/slpc_parallel:%d", gt->info.id); if (IS_ERR(threads[i].worker)) { ret = PTR_ERR(threads[i].worker); diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c index acae30a04a94..88870844b5bd 100644 --- a/drivers/gpu/drm/i915/selftests/i915_request.c +++ b/drivers/gpu/drm/i915/selftests/i915_request.c @@ -492,7 +492,7 @@ static int 
mock_breadcrumbs_smoketest(void *arg) for (n = 0; n < ncpus; n++) { struct kthread_worker *worker; - worker = kthread_create_worker(0, "igt/%d", n); + worker = kthread_run_worker(0, "igt/%d", n); if (IS_ERR(worker)) { ret = PTR_ERR(worker); ncpus = n; @@ -1645,7 +1645,7 @@ static int live_parallel_engines(void *arg) for_each_uabi_engine(engine, i915) { struct kthread_worker *worker; - worker = kthread_create_worker(0, "igt/parallel:%s", + worker = kthread_run_worker(0, "igt/parallel:%s", engine->name); if (IS_ERR(worker)) { err = PTR_ERR(worker); @@ -1806,7 +1806,7 @@ static int live_breadcrumbs_smoketest(void *arg) unsigned int i = idx * ncpus + n; struct kthread_worker *worker; - worker = kthread_create_worker(0, "igt/%d.%d", idx, n); + worker = kthread_run_worker(0, "igt/%d.%d", idx, n); if (IS_ERR(worker)) { ret = PTR_ERR(worker); goto out_flush; @@ -3219,7 +3219,7 @@ static int perf_parallel_engines(void *arg) memset(&engines[idx].p, 0, sizeof(engines[idx].p)); - worker = kthread_create_worker(0, "igt:%s", + worker = kthread_run_worker(0, "igt:%s", engine->name); if (IS_ERR(worker)) { err = PTR_ERR(worker); diff --git a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c index e75b97127c0d..2be00b11e557 100644 --- a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c +++ b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c @@ -109,7 +109,7 @@ int msm_disp_snapshot_init(struct drm_device *drm_dev) mutex_init(&kms->dump_mutex); - kms->dump_worker = kthread_create_worker(0, "%s", "disp_snapshot"); + kms->dump_worker = kthread_run_worker(0, "%s", "disp_snapshot"); if (IS_ERR(kms->dump_worker)) DRM_ERROR("failed to create disp state task\n"); diff --git a/drivers/gpu/drm/msm/msm_atomic.c b/drivers/gpu/drm/msm/msm_atomic.c index 9c45d641b521..a7a2384044ff 100644 --- a/drivers/gpu/drm/msm/msm_atomic.c +++ b/drivers/gpu/drm/msm/msm_atomic.c @@ -115,7 +115,7 @@ int msm_atomic_init_pending_timer(struct msm_pending_timer *timer, timer->kms = kms; timer->crtc_idx = crtc_idx; - timer->worker = kthread_create_worker(0, "atomic-worker-%d", crtc_idx); + timer->worker = kthread_run_worker(0, "atomic-worker-%d", crtc_idx); if (IS_ERR(timer->worker)) { int ret = PTR_ERR(timer->worker); timer->worker = NULL; diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 0d4a3744cfcb..8557998e0c92 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -859,7 +859,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, gpu->funcs = funcs; gpu->name = name; - gpu->worker = kthread_create_worker(0, "gpu-worker"); + gpu->worker = kthread_run_worker(0, "gpu-worker"); if (IS_ERR(gpu->worker)) { ret = PTR_ERR(gpu->worker); gpu->worker = NULL; diff --git a/drivers/gpu/drm/msm/msm_kms.c b/drivers/gpu/drm/msm/msm_kms.c index f3326d09bdbc..dac831ba6219 100644 --- a/drivers/gpu/drm/msm/msm_kms.c +++ b/drivers/gpu/drm/msm/msm_kms.c @@ -269,7 +269,7 @@ int msm_drm_kms_init(struct device *dev, const struct drm_driver *drv) /* initialize event thread */ ev_thread = &priv->event_thread[drm_crtc_index(crtc)]; ev_thread->dev = ddev; - ev_thread->worker = kthread_create_worker(0, "crtc_event:%d", crtc->base.id); + ev_thread->worker = kthread_run_worker(0, "crtc_event:%d", crtc->base.id); if (IS_ERR(ev_thread->worker)) { ret = PTR_ERR(ev_thread->worker); DRM_DEV_ERROR(dev, "failed to create crtc_event kthread\n"); diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu.c b/drivers/media/platform/chips-media/wave5/wave5-vpu.c 
index 6b294a2d6717..d1320298a0f7 100644 --- a/drivers/media/platform/chips-media/wave5/wave5-vpu.c +++ b/drivers/media/platform/chips-media/wave5/wave5-vpu.c @@ -271,7 +271,7 @@ static int wave5_vpu_probe(struct platform_device *pdev) dev_err(&pdev->dev, "failed to get irq resource, falling back to polling\n"); hrtimer_init(&dev->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); dev->hrtimer.function = &wave5_vpu_timer_callback; - dev->worker = kthread_create_worker(0, "vpu_irq_thread"); + dev->worker = kthread_run_worker(0, "vpu_irq_thread"); if (IS_ERR(dev->worker)) { dev_err(&pdev->dev, "failed to create vpu irq worker\n"); ret = PTR_ERR(dev->worker); diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 3a792f79270d..377e66cf7a48 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -394,7 +394,7 @@ static int mv88e6xxx_irq_poll_setup(struct mv88e6xxx_chip *chip) kthread_init_delayed_work(&chip->irq_poll_work, mv88e6xxx_irq_poll); - chip->kworker = kthread_create_worker(0, "%s", dev_name(chip->dev)); + chip->kworker = kthread_run_worker(0, "%s", dev_name(chip->dev)); if (IS_ERR(chip->kworker)) return PTR_ERR(chip->kworker); diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index d5ad6d84007c..75570be61fef 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -2053,7 +2053,7 @@ static int ice_dpll_init_worker(struct ice_pf *pf) struct kthread_worker *kworker; kthread_init_delayed_work(&d->work, ice_dpll_periodic_work); - kworker = kthread_create_worker(0, "ice-dplls-%s", + kworker = kthread_run_worker(0, "ice-dplls-%s", dev_name(ice_pf_to_dev(pf))); if (IS_ERR(kworker)) return PTR_ERR(kworker); diff --git a/drivers/net/ethernet/intel/ice/ice_gnss.c b/drivers/net/ethernet/intel/ice/ice_gnss.c index f02e8ca55375..b2148dbe49b2 100644 --- a/drivers/net/ethernet/intel/ice/ice_gnss.c +++ b/drivers/net/ethernet/intel/ice/ice_gnss.c @@ -182,7 +182,7 @@ static struct gnss_serial *ice_gnss_struct_init(struct ice_pf *pf) pf->gnss_serial = gnss; kthread_init_delayed_work(&gnss->read_work, ice_gnss_read); - kworker = kthread_create_worker(0, "ice-gnss-%s", dev_name(dev)); + kworker = kthread_run_worker(0, "ice-gnss-%s", dev_name(dev)); if (IS_ERR(kworker)) { kfree(gnss); return NULL; diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index a999fface272..3154bb674dd3 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -3080,7 +3080,7 @@ static int ice_ptp_init_work(struct ice_pf *pf, struct ice_ptp *ptp) /* Allocate a kworker for handling work required for the ports * connected to the PTP hardware clock. 
*/ - kworker = kthread_create_worker(0, "ice-ptp-%s", + kworker = kthread_run_worker(0, "ice-ptp-%s", dev_name(ice_pf_to_dev(pf))); if (IS_ERR(kworker)) return PTR_ERR(kworker); diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c index 86a3d32a7763..08f566cc1480 100644 --- a/drivers/platform/chrome/cros_ec_spi.c +++ b/drivers/platform/chrome/cros_ec_spi.c @@ -715,7 +715,7 @@ static int cros_ec_spi_devm_high_pri_alloc(struct device *dev, int err; ec_spi->high_pri_worker = - kthread_create_worker(0, "cros_ec_spi_high_pri"); + kthread_run_worker(0, "cros_ec_spi_high_pri"); if (IS_ERR(ec_spi->high_pri_worker)) { err = PTR_ERR(ec_spi->high_pri_worker); diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 77a36e7bddd5..b932425ddc6a 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -296,7 +296,7 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, if (ptp->info->do_aux_work) { kthread_init_delayed_work(&ptp->aux_work, ptp_aux_kworker); - ptp->kworker = kthread_create_worker(0, "ptp%d", ptp->index); + ptp->kworker = kthread_run_worker(0, "ptp%d", ptp->index); if (IS_ERR(ptp->kworker)) { err = PTR_ERR(ptp->kworker); pr_err("failed to create ptp aux_worker %d\n", err); diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index ff1add2ecb91..e4aa8f838934 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2060,7 +2060,7 @@ static int spi_init_queue(struct spi_controller *ctlr) ctlr->busy = false; ctlr->queue_empty = true; - ctlr->kworker = kthread_create_worker(0, dev_name(&ctlr->dev)); + ctlr->kworker = kthread_run_worker(0, dev_name(&ctlr->dev)); if (IS_ERR(ctlr->kworker)) { dev_err(&ctlr->dev, "failed to create message pump kworker\n"); return PTR_ERR(ctlr->kworker); diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 6021eeb903fe..95c0c63119ac 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -7635,7 +7635,7 @@ struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc) mutex_init(&port->lock); mutex_init(&port->swap_lock); - port->wq = kthread_create_worker(0, dev_name(dev)); + port->wq = kthread_run_worker(0, dev_name(dev)); if (IS_ERR(port->wq)) return ERR_CAST(port->wq); sched_set_fifo(port->wq->task); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 8ffea8430f95..c204fc8e471a 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -229,7 +229,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, dev = &vdpasim->vdpa.dev; kthread_init_work(&vdpasim->work, vdpasim_work_fn); - vdpasim->worker = kthread_create_worker(0, "vDPA sim worker: %s", + vdpasim->worker = kthread_run_worker(0, "vDPA sim worker: %s", dev_attr->name); if (IS_ERR(vdpasim->worker)) goto err_iommu; diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index 4190cb800cc4..19698d87dc57 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -1229,7 +1229,7 @@ int __init watchdog_dev_init(void) { int err; - watchdog_kworker = kthread_create_worker(0, "watchdogd"); + watchdog_kworker = kthread_run_worker(0, "watchdogd"); if (IS_ERR(watchdog_kworker)) { pr_err("Failed to create watchdog kworker\n"); return PTR_ERR(watchdog_kworker); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index a23392327ce2..35381c00ee09 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -320,7 +320,7 @@ static void 
erofs_destroy_percpu_workers(void) static struct kthread_worker *erofs_init_percpu_worker(int cpu) { struct kthread_worker *worker = - kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u"); + kthread_run_worker_on_cpu(cpu, 0, "erofs_worker/%u"); if (IS_ERR(worker)) return worker; diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 0c66e7c1092a..8d27403888ce 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -193,19 +193,53 @@ struct kthread_worker *kthread_create_worker_on_node(unsigned int flags, const char namefmt[], ...); #define kthread_create_worker(flags, namefmt, ...) \ -({ \ - struct kthread_worker *__kw \ - = kthread_create_worker_on_node(flags, NUMA_NO_NODE, \ - namefmt, ## __VA_ARGS__); \ - if (!IS_ERR(__kw)) \ - wake_up_process(__kw->task); \ - __kw; \ + kthread_create_worker_on_node(flags, NUMA_NO_NODE, namefmt, ## __VA_ARGS__); + +/** + * kthread_run_worker - create and wake a kthread worker. + * @flags: flags modifying the default behavior of the worker + * @namefmt: printf-style name for the thread. + * + * Description: Convenient wrapper for kthread_create_worker() followed by + * wake_up_process(). Returns the kthread_worker or ERR_PTR(-ENOMEM). + */ +#define kthread_run_worker(flags, namefmt, ...) \ +({ \ + struct kthread_worker *__kw \ + = kthread_create_worker(flags, namefmt, ## __VA_ARGS__); \ + if (!IS_ERR(__kw)) \ + wake_up_process(__kw->task); \ + __kw; \ }) struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]); +/** + * kthread_run_worker_on_cpu - create and wake a cpu bound kthread worker. + * @cpu: CPU number + * @flags: flags modifying the default behavior of the worker + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. + * + * Description: Convenient wrapper for kthread_create_worker_on_cpu() + * followed by wake_up_process(). Returns the kthread_worker or + * ERR_PTR(-ENOMEM). + */ +static inline struct kthread_worker * +kthread_run_worker_on_cpu(int cpu, unsigned int flags, + const char namefmt[]) +{ + struct kthread_worker *kw; + + kw = kthread_create_worker_on_cpu(cpu, flags, namefmt); + if (!IS_ERR(kw)) + wake_up_process(kw->task); + + return kw; +} + bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); diff --git a/kernel/kthread.c b/kernel/kthread.c index 2d52126d5da0..922f15762ec3 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1077,33 +1077,10 @@ kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[] worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); - if (worker) - wake_up_process(worker->task); - return worker; } EXPORT_SYMBOL(kthread_create_worker_on_node); -static __printf(3, 4) struct kthread_worker * -__kthread_create_worker_on_cpu(int cpu, unsigned int flags, - const char namefmt[], ...) -{ - struct kthread_worker *worker; - va_list args; - - va_start(args, namefmt); - worker = __kthread_create_worker_on_node(flags, cpu_to_node(cpu), - namefmt, args); - va_end(args); - - if (worker) { - kthread_bind(worker->task, cpu); - wake_up_process(worker->task); - } - - return worker; -} - /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. 
@@ -1144,7 +1121,13 @@ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { - return __kthread_create_worker_on_cpu(cpu, flags, namefmt, cpu); + struct kthread_worker *worker; + + worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); + if (!IS_ERR(worker)) + kthread_bind(worker->task, cpu); + + return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4a4c49821058..d4b8e87a473b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4906,7 +4906,7 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) if (rnp->exp_kworker) return; - kworker = kthread_create_worker(0, name, rnp_index); + kworker = kthread_run_worker(0, name, rnp_index); if (IS_ERR_OR_NULL(kworker)) { pr_err("Failed to create par gp kworker on %d/%d\n", rnp->grplo, rnp->grphi); @@ -4933,7 +4933,7 @@ static void __init rcu_start_exp_gp_kworker(void) const char *name = "rcu_exp_gp_kthread_worker"; struct sched_param param = { .sched_priority = kthread_prio }; - rcu_exp_gp_kworker = kthread_create_worker(0, name); + rcu_exp_gp_kworker = kthread_run_worker(0, name); if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { pr_err("Failed to create %s!\n", name); rcu_exp_gp_kworker = NULL; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7fff1d045477..ab8962d2e9d3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5352,7 +5352,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) { struct kthread_worker *helper; - helper = kthread_create_worker(0, name); + helper = kthread_run_worker(0, name); if (helper) sched_set_fifo(helper->task); return helper; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8b07576814a5..fe01c1f8095c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7828,7 +7828,7 @@ static void __init wq_cpu_intensive_thresh_init(void) unsigned long thresh; unsigned long bogo; - pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); + pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release"); BUG_ON(IS_ERR(pwq_release_worker)); /* if the user set it to a specific value, keep it */ diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 281bbac5539d..c33d4bf17929 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -66,7 +66,7 @@ static int ksz_connect(struct dsa_switch *ds) if (!priv) return -ENOMEM; - xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit", + xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit", ds->dst->index, ds->index); if (IS_ERR(xmit_worker)) { ret = PTR_ERR(xmit_worker); diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 8e8b1bef6af6..6ce0bc166792 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -110,7 +110,7 @@ static int ocelot_connect(struct dsa_switch *ds) if (!priv) return -ENOMEM; - priv->xmit_worker = kthread_create_worker(0, "felix_xmit"); + priv->xmit_worker = kthread_run_worker(0, "felix_xmit"); if (IS_ERR(priv->xmit_worker)) { err = PTR_ERR(priv->xmit_worker); kfree(priv); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 3e902af7eea6..02adec693811 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -707,7 +707,7 @@ static int sja1105_connect(struct dsa_switch *ds) spin_lock_init(&priv->meta_lock); - xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit", + xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit", ds->dst->index, ds->index); if (IS_ERR(xmit_worker)) { err = 
PTR_ERR(xmit_worker); -- cgit v1.2.3 From 3d56fbb1f03f2abf8c806aee14df2f462350dfba Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Mon, 4 Nov 2024 11:45:41 +0800 Subject: f2fs: expand f2fs_invalidate_compress_page() to f2fs_invalidate_compress_pages_range() New function f2fs_invalidate_compress_pages_range() adds the @len parameter. So it can process some consecutive blocks at a time. Signed-off-by: Yi Sun Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 5 +++-- fs/f2fs/f2fs.h | 9 +++++---- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 0b55b2695c9b..be3e6f4d33a2 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1909,11 +1909,12 @@ struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi) return sbi->compress_inode->i_mapping; } -void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr) +void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, + block_t blkaddr, unsigned int len) { if (!sbi->compress_inode) return; - invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr); + invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr + len - 1); } void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f523dd302bf6..b88e3d1fea99 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4391,7 +4391,8 @@ void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); int __init f2fs_init_compress_cache(void); void f2fs_destroy_compress_cache(void); struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi); -void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, + block_t blkaddr, unsigned int len); void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, nid_t ino, block_t blkaddr); bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, @@ -4446,8 +4447,8 @@ static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } static inline void f2fs_destroy_compress_cache(void) { } -static inline void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, - block_t blkaddr) { } +static inline void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, + block_t blkaddr, unsigned int len) { } static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, nid_t ino, block_t blkaddr) { } static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, @@ -4766,7 +4767,7 @@ static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi, block_t blkaddr) { f2fs_truncate_meta_inode_pages(sbi, blkaddr, 1); - f2fs_invalidate_compress_page(sbi, blkaddr); + f2fs_invalidate_compress_pages_range(sbi, blkaddr, 1); } #define EFSBADCRC EBADMSG /* Bad CRC detected */ -- cgit v1.2.3 From d217b5cea488c0f644c189f91b636aeefa12deb9 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Mon, 4 Nov 2024 11:45:42 +0800 Subject: f2fs: add parameter @len to f2fs_invalidate_internal_cache() New function can process some consecutive blocks at a time. 
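A usage sketch (the caller below is made up; the real call sites are updated in the diff): a run of physically consecutive blocks can now be dropped from both the meta inode pages and the compress cache with one call instead of a per-block loop.

/* Illustrative caller only. Before this change the function took a single
 * block address, so releasing a range meant calling it once per block:
 *	for (i = 0; i < len; i++)
 *		f2fs_invalidate_internal_cache(sbi, start_blk + i);
 */
static void example_invalidate_range(struct f2fs_sb_info *sbi,
				     block_t start_blk, unsigned int len)
{
	f2fs_invalidate_internal_cache(sbi, start_blk, len);
}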
Signed-off-by: Yi Sun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 6 +++--- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.c | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2ec0cfb41260..35b9455fb899 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1414,7 +1414,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) return err; if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) - f2fs_invalidate_internal_cache(sbi, old_blkaddr); + f2fs_invalidate_internal_cache(sbi, old_blkaddr, 1); f2fs_update_data_blkaddr(dn, dn->data_blkaddr); return 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b88e3d1fea99..c537f6b5a413 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4764,10 +4764,10 @@ static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi, } static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi, - block_t blkaddr) + block_t blkaddr, unsigned int len) { - f2fs_truncate_meta_inode_pages(sbi, blkaddr, 1); - f2fs_invalidate_compress_pages_range(sbi, blkaddr, 1); + f2fs_truncate_meta_inode_pages(sbi, blkaddr, len); + f2fs_invalidate_compress_pages_range(sbi, blkaddr, len); } #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8029369bb71d..faf9fa1c804d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1415,7 +1415,7 @@ static int move_data_block(struct inode *inode, block_t bidx, page_address(mpage), PAGE_SIZE); f2fs_put_page(mpage, 1); - f2fs_invalidate_internal_cache(fio.sbi, fio.old_blkaddr); + f2fs_invalidate_internal_cache(fio.sbi, fio.old_blkaddr, 1); set_page_dirty(fio.encrypted_page); if (clear_page_dirty_for_io(fio.encrypted_page)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index eade36c5ef13..86e547f008f9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2537,7 +2537,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) if (addr == NEW_ADDR || addr == COMPRESS_ADDR) return; - f2fs_invalidate_internal_cache(sbi, addr); + f2fs_invalidate_internal_cache(sbi, addr, 1); /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); @@ -3857,7 +3857,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) goto out; } if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) - f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr); + f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr, 1); /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); @@ -4049,7 +4049,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, update_sit_entry(sbi, new_blkaddr, 1); } if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { - f2fs_invalidate_internal_cache(sbi, old_blkaddr); + f2fs_invalidate_internal_cache(sbi, old_blkaddr, 1); if (!from_gc) update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); -- cgit v1.2.3 From 91b587ba79e1b68bb718d12b0758dbcdab4e9cb7 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Fri, 20 Dec 2024 15:41:31 -0800 Subject: f2fs: Introduce linear search for dentries This patch addresses an issue where some files in case-insensitive directories become inaccessible due to changes in how the kernel function, utf8_casefold(), generates case-folded strings from the commit 5c26d2f1d3f5 ("unicode: Don't special case ignorable code points"). 
F2FS uses these case-folded names to calculate hash values for locating dentries and stores them on disk. Since utf8_casefold() can produce different output across kernel versions, stored hash values and newly calculated hash values may differ. This results in affected files no longer being found via the hash-based lookup. To resolve this, the patch introduces a linear search fallback. If the initial hash-based search fails, F2FS will sequentially scan the directory entries. Fixes: 5c26d2f1d3f5 ("unicode: Don't special case ignorable code points") Link: https://bugzilla.kernel.org/show_bug.cgi?id=219586 Signed-off-by: Daniel Lee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 53 ++++++++++++++++++++++++++++++++++++++--------------- fs/f2fs/f2fs.h | 6 ++++-- fs/f2fs/inline.c | 5 +++-- 3 files changed, 45 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 47a5c806cf16..54dd52de7269 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -175,7 +175,8 @@ static unsigned long dir_block_index(unsigned int level, static struct f2fs_dir_entry *find_in_block(struct inode *dir, struct page *dentry_page, const struct f2fs_filename *fname, - int *max_slots) + int *max_slots, + bool use_hash) { struct f2fs_dentry_block *dentry_blk; struct f2fs_dentry_ptr d; @@ -183,7 +184,7 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(dir, &d, dentry_blk); - return f2fs_find_target_dentry(&d, fname, max_slots); + return f2fs_find_target_dentry(&d, fname, max_slots, use_hash); } static inline int f2fs_match_name(const struct inode *dir, @@ -208,7 +209,8 @@ static inline int f2fs_match_name(const struct inode *dir, } struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, - const struct f2fs_filename *fname, int *max_slots) + const struct f2fs_filename *fname, int *max_slots, + bool use_hash) { struct f2fs_dir_entry *de; unsigned long bit_pos = 0; @@ -231,7 +233,7 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, continue; } - if (de->hash_code == fname->hash) { + if (!use_hash || de->hash_code == fname->hash) { res = f2fs_match_name(d->inode, fname, d->filename[bit_pos], le16_to_cpu(de->name_len)); @@ -258,11 +260,12 @@ found: static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int level, const struct f2fs_filename *fname, - struct page **res_page) + struct page **res_page, + bool use_hash) { int s = GET_DENTRY_SLOTS(fname->disk_name.len); unsigned int nbucket, nblock; - unsigned int bidx, end_block; + unsigned int bidx, end_block, bucket_no; struct page *dentry_page; struct f2fs_dir_entry *de = NULL; pgoff_t next_pgofs; @@ -272,8 +275,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); + bucket_no = use_hash ? 
le32_to_cpu(fname->hash) % nbucket : 0; + +start_find_bucket: bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, - le32_to_cpu(fname->hash) % nbucket); + bucket_no); end_block = bidx + nblock; while (bidx < end_block) { @@ -290,7 +296,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, } } - de = find_in_block(dir, dentry_page, fname, &max_slots); + de = find_in_block(dir, dentry_page, fname, &max_slots, use_hash); if (IS_ERR(de)) { *res_page = ERR_CAST(de); de = NULL; @@ -307,12 +313,18 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, bidx++; } - if (!de && room && F2FS_I(dir)->chash != fname->hash) { - F2FS_I(dir)->chash = fname->hash; - F2FS_I(dir)->clevel = level; - } + if (de) + return de; - return de; + if (likely(use_hash)) { + if (room && F2FS_I(dir)->chash != fname->hash) { + F2FS_I(dir)->chash = fname->hash; + F2FS_I(dir)->clevel = level; + } + } else if (++bucket_no < nbucket) { + goto start_find_bucket; + } + return NULL; } struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, @@ -323,11 +335,15 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, struct f2fs_dir_entry *de = NULL; unsigned int max_depth; unsigned int level; + bool use_hash = true; *res_page = NULL; +#if IS_ENABLED(CONFIG_UNICODE) +start_find_entry: +#endif if (f2fs_has_inline_dentry(dir)) { - de = f2fs_find_in_inline_dir(dir, fname, res_page); + de = f2fs_find_in_inline_dir(dir, fname, res_page, use_hash); goto out; } @@ -343,11 +359,18 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, } for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, fname, res_page); + de = find_in_level(dir, level, fname, res_page, use_hash); if (de || IS_ERR(*res_page)) break; } + out: +#if IS_ENABLED(CONFIG_UNICODE) + if (IS_CASEFOLDED(dir) && !de && use_hash) { + use_hash = false; + goto start_find_entry; + } +#endif /* This is to increase the speed of f2fs_create */ if (!de) F2FS_I(dir)->task = current; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c537f6b5a413..f265c26d0038 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3588,7 +3588,8 @@ int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry, struct f2fs_filename *fname); void f2fs_free_filename(struct f2fs_filename *fname); struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, - const struct f2fs_filename *fname, int *max_slots); + const struct f2fs_filename *fname, int *max_slots, + bool use_hash); int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr); void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, @@ -4224,7 +4225,8 @@ int f2fs_write_inline_data(struct inode *inode, struct folio *folio); int f2fs_recover_inline_data(struct inode *inode, struct page *npage); struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, const struct f2fs_filename *fname, - struct page **res_page); + struct page **res_page, + bool use_hash); int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage); int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index cbd2a0d34804..3e3c35d4c98b 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -352,7 +352,8 @@ process_inline: struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, const struct f2fs_filename *fname, - struct page **res_page) + struct page **res_page, + bool use_hash) { struct 
f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct f2fs_dir_entry *de; @@ -369,7 +370,7 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, inline_dentry = inline_data_addr(dir, ipage); make_dentry_ptr_inline(dir, &d, inline_dentry); - de = f2fs_find_target_dentry(&d, fname, NULL); + de = f2fs_find_target_dentry(&d, fname, NULL, use_hash); unlock_page(ipage); if (IS_ERR(de)) { *res_page = ERR_CAST(de); -- cgit v1.2.3 From cf5817ce66f6c75452027af243689da780dbddb4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Jan 2025 02:26:34 +0000 Subject: f2fs: don't call block truncation for aliased file This patch should avoid the below warning which does not corrupt the metadata tho. [ 51.508120][ T253] F2FS-fs (dm-59): access invalid blkaddr:36 [ 51.508156][ T253] __f2fs_is_valid_blkaddr+0x330/0x384 [ 51.508162][ T253] f2fs_is_valid_blkaddr_raw+0x10/0x24 [ 51.508163][ T253] f2fs_truncate_data_blocks_range+0x1ec/0x438 [ 51.508177][ T253] f2fs_remove_inode_page+0x8c/0x148 [ 51.508194][ T253] f2fs_evict_inode+0x230/0x76c Fixes: 128d333f0dff ("f2fs: introduce device aliasing file") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c04ee1a7ce57..00f340c91fcb 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1275,8 +1275,9 @@ int f2fs_remove_inode_page(struct inode *inode) } /* remove potential inline_data blocks */ - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) + if (!IS_DEVICE_ALIASING(inode) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) f2fs_truncate_data_blocks_range(&dn, 1); /* 0 is possible, after f2fs_new_inode() has failed */ -- cgit v1.2.3 From 66baee2b886d72ab6be11a08d4c7897f9612e25b Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Mon, 23 Dec 2024 16:10:41 +0800 Subject: f2fs: introduce update_sit_entry_for_release/alloc() No logical changes, just for cleanliness. Signed-off-by: Yi Sun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 162 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 93 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 86e547f008f9..bc761d9b889a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2426,16 +2426,103 @@ static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr, SIT_I(sbi)->max_mtime = ctime; } -static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) +static int update_sit_entry_for_release(struct f2fs_sb_info *sbi, struct seg_entry *se, + block_t blkaddr, unsigned int offset, int del) +{ + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif + + exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + mir_exist = f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_err(sbi, "Inconsistent error when clearing bitmap, blk:%u, old bit:%d", + blkaddr, exist); + f2fs_bug_on(sbi, 1); + } +#endif + if (unlikely(!exist)) { + f2fs_err(sbi, "Bitmap was wrongly cleared, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del = 0; + } else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + /* + * If checkpoints are off, we must not reuse data that + * was used in the previous checkpoint. If it was used + * before, we must track that to know how much space we + * really have. 
+ */ + if (f2fs_test_bit(offset, se->ckpt_valid_map)) { + spin_lock(&sbi->stat_lock); + sbi->unusable_block_count++; + spin_unlock(&sbi->stat_lock); + } + } + + if (f2fs_block_unit_discard(sbi) && + f2fs_test_and_clear_bit(offset, se->discard_map)) + sbi->discard_blks++; + + if (!f2fs_test_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks += del; + + return del; +} + +static int update_sit_entry_for_alloc(struct f2fs_sb_info *sbi, struct seg_entry *se, + block_t blkaddr, unsigned int offset, int del) { - struct seg_entry *se; - unsigned int segno, offset; - long int new_vblocks; bool exist; #ifdef CONFIG_F2FS_CHECK_FS bool mir_exist; #endif + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_err(sbi, "Inconsistent error when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); + f2fs_bug_on(sbi, 1); + } +#endif + if (unlikely(exist)) { + f2fs_err(sbi, "Bitmap was wrongly set, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; + } + + if (f2fs_block_unit_discard(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) + sbi->discard_blks--; + + /* + * SSR should never reuse block which is checkpointed + * or newly invalidated. + */ + if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks++; + } + + if (!f2fs_test_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks += del; + + return del; +} + +static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) +{ + struct seg_entry *se; + unsigned int segno, offset; + long int new_vblocks; + segno = GET_SEGNO(sbi, blkaddr); if (segno == NULL_SEGNO) return; @@ -2451,73 +2538,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); -#ifdef CONFIG_F2FS_CHECK_FS - mir_exist = f2fs_test_and_set_bit(offset, - se->cur_valid_map_mir); - if (unlikely(exist != mir_exist)) { - f2fs_err(sbi, "Inconsistent error when setting bitmap, blk:%u, old bit:%d", - blkaddr, exist); - f2fs_bug_on(sbi, 1); - } -#endif - if (unlikely(exist)) { - f2fs_err(sbi, "Bitmap was wrongly set, blk:%u", - blkaddr); - f2fs_bug_on(sbi, 1); - se->valid_blocks--; - del = 0; - } - - if (f2fs_block_unit_discard(sbi) && - !f2fs_test_and_set_bit(offset, se->discard_map)) - sbi->discard_blks--; - - /* - * SSR should never reuse block which is checkpointed - * or newly invalidated. - */ - if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { - if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) - se->ckpt_valid_blocks++; - } + del = update_sit_entry_for_alloc(sbi, se, blkaddr, offset, del); } else { - exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); -#ifdef CONFIG_F2FS_CHECK_FS - mir_exist = f2fs_test_and_clear_bit(offset, - se->cur_valid_map_mir); - if (unlikely(exist != mir_exist)) { - f2fs_err(sbi, "Inconsistent error when clearing bitmap, blk:%u, old bit:%d", - blkaddr, exist); - f2fs_bug_on(sbi, 1); - } -#endif - if (unlikely(!exist)) { - f2fs_err(sbi, "Bitmap was wrongly cleared, blk:%u", - blkaddr); - f2fs_bug_on(sbi, 1); - se->valid_blocks++; - del = 0; - } else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { - /* - * If checkpoints are off, we must not reuse data that - * was used in the previous checkpoint. 
If it was used - * before, we must track that to know how much space we - * really have. - */ - if (f2fs_test_bit(offset, se->ckpt_valid_map)) { - spin_lock(&sbi->stat_lock); - sbi->unusable_block_count++; - spin_unlock(&sbi->stat_lock); - } - } - - if (f2fs_block_unit_discard(sbi) && - f2fs_test_and_clear_bit(offset, se->discard_map)) - sbi->discard_blks++; + del = update_sit_entry_for_release(sbi, se, blkaddr, offset, del); } - if (!f2fs_test_bit(offset, se->ckpt_valid_map)) - se->ckpt_valid_blocks += del; __mark_sit_entry_dirty(sbi, segno); -- cgit v1.2.3 From 81ffbd224e5f926bf8df01d6107db9c8779f7d57 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Mon, 23 Dec 2024 16:10:42 +0800 Subject: f2fs: update_sit_entry_for_release() supports consecutive blocks. This function can process some consecutive blocks at a time. When using update_sit_entry() to release consecutive blocks, ensure that the consecutive blocks belong to the same segment. Because after update_sit_entry_for_realese(), @segno is still in use in update_sit_entry(). Signed-off-by: Yi Sun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 75 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bc761d9b889a..67d2859831e4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2426,6 +2426,10 @@ static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr, SIT_I(sbi)->max_mtime = ctime; } +/* + * NOTE: when updating multiple blocks at the same time, please ensure + * that the consecutive input blocks belong to the same segment. + */ static int update_sit_entry_for_release(struct f2fs_sb_info *sbi, struct seg_entry *se, block_t blkaddr, unsigned int offset, int del) { @@ -2433,42 +2437,48 @@ static int update_sit_entry_for_release(struct f2fs_sb_info *sbi, struct seg_ent #ifdef CONFIG_F2FS_CHECK_FS bool mir_exist; #endif + int i; + int del_count = -del; + + f2fs_bug_on(sbi, GET_SEGNO(sbi, blkaddr) != GET_SEGNO(sbi, blkaddr + del_count - 1)); - exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); + for (i = 0; i < del_count; i++) { + exist = f2fs_test_and_clear_bit(offset + i, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - mir_exist = f2fs_test_and_clear_bit(offset, - se->cur_valid_map_mir); - if (unlikely(exist != mir_exist)) { - f2fs_err(sbi, "Inconsistent error when clearing bitmap, blk:%u, old bit:%d", - blkaddr, exist); - f2fs_bug_on(sbi, 1); - } + mir_exist = f2fs_test_and_clear_bit(offset + i, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_err(sbi, "Inconsistent error when clearing bitmap, blk:%u, old bit:%d", + blkaddr + i, exist); + f2fs_bug_on(sbi, 1); + } #endif - if (unlikely(!exist)) { - f2fs_err(sbi, "Bitmap was wrongly cleared, blk:%u", blkaddr); - f2fs_bug_on(sbi, 1); - se->valid_blocks++; - del = 0; - } else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { - /* - * If checkpoints are off, we must not reuse data that - * was used in the previous checkpoint. If it was used - * before, we must track that to know how much space we - * really have. 
- */ - if (f2fs_test_bit(offset, se->ckpt_valid_map)) { - spin_lock(&sbi->stat_lock); - sbi->unusable_block_count++; - spin_unlock(&sbi->stat_lock); + if (unlikely(!exist)) { + f2fs_err(sbi, "Bitmap was wrongly cleared, blk:%u", blkaddr + i); + f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del += 1; + } else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + /* + * If checkpoints are off, we must not reuse data that + * was used in the previous checkpoint. If it was used + * before, we must track that to know how much space we + * really have. + */ + if (f2fs_test_bit(offset + i, se->ckpt_valid_map)) { + spin_lock(&sbi->stat_lock); + sbi->unusable_block_count++; + spin_unlock(&sbi->stat_lock); + } } - } - if (f2fs_block_unit_discard(sbi) && - f2fs_test_and_clear_bit(offset, se->discard_map)) - sbi->discard_blks++; + if (f2fs_block_unit_discard(sbi) && + f2fs_test_and_clear_bit(offset + i, se->discard_map)) + sbi->discard_blks++; - if (!f2fs_test_bit(offset, se->ckpt_valid_map)) - se->ckpt_valid_blocks += del; + if (!f2fs_test_bit(offset + i, se->ckpt_valid_map)) + se->ckpt_valid_blocks -= 1; + } return del; } @@ -2517,6 +2527,11 @@ static int update_sit_entry_for_alloc(struct f2fs_sb_info *sbi, struct seg_entry return del; } +/* + * If releasing blocks, this function supports updating multiple consecutive blocks + * at one time, but please note that these consecutive blocks need to belong to the + * same segment. + */ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) { struct seg_entry *se; -- cgit v1.2.3 From f7c848431632598ff9bce57a659db6af60d75b39 Mon Sep 17 00:00:00 2001 From: Mike Marshall Date: Wed, 8 Jan 2025 14:21:08 -0500 Subject: orangefs: fix a oob in orangefs_debug_write I got a syzbot report: slab-out-of-bounds Read in orangefs_debug_write... several people suggested fixes, I tested Al Viro's suggestion and made this patch. Signed-off-by: Mike Marshall Reported-by: syzbot+fc519d7875f2d9186c1f@syzkaller.appspotmail.com --- fs/orangefs/orangefs-debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 1b508f543384..fa41db088488 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -393,9 +393,9 @@ static ssize_t orangefs_debug_write(struct file *file, * Thwart users who try to jamb a ridiculous number * of bytes into the debug file... */ - if (count > ORANGEFS_MAX_DEBUG_STRING_LEN + 1) { + if (count > ORANGEFS_MAX_DEBUG_STRING_LEN) { silly = count; - count = ORANGEFS_MAX_DEBUG_STRING_LEN + 1; + count = ORANGEFS_MAX_DEBUG_STRING_LEN; } buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); -- cgit v1.2.3 From 542243af7182efaeaf6d0f4643f7de437541a9af Mon Sep 17 00:00:00 2001 From: Eugen Hristev Date: Wed, 1 Jan 2025 13:19:21 +0200 Subject: pstore/blk: trivial typo fixes Fix trivial typos in comments. 
Fixes: 2a03ddbde1e1 ("pstore/blk: Move verify_size() macro out of function") Fixes: 17639f67c1d6 ("pstore/blk: Introduce backend for block devices") Signed-off-by: Eugen Hristev Link: https://lore.kernel.org/r/20250101111921.850406-1-eugen.hristev@linaro.org Signed-off-by: Kees Cook --- fs/pstore/blk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 65b2473e22ff..fa6b8cb788a1 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -89,7 +89,7 @@ static struct pstore_device_info *pstore_device_info; _##name_ = check_size(name, alignsize); \ else \ _##name_ = 0; \ - /* Synchronize module parameters with resuls. */ \ + /* Synchronize module parameters with results. */ \ name = _##name_ / 1024; \ dev->zone.name = _##name_; \ } @@ -121,7 +121,7 @@ static int __register_pstore_device(struct pstore_device_info *dev) if (pstore_device_info) return -EBUSY; - /* zero means not limit on which backends to attempt to store. */ + /* zero means no limit on which backends attempt to store. */ if (!dev->flags) dev->flags = UINT_MAX; -- cgit v1.2.3 From 3675a926feefdf3afabea12f806f31ea582065e5 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 28 Dec 2024 09:43:41 +0100 Subject: sysfs: constify bin_attribute argument of sysfs_bin_attr_simple_read() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most users use this function through the BIN_ATTR_SIMPLE* macros, they can handle the switch transparently. Also adapt the two non-macro users in the same change. Signed-off-by: Thomas Weißschuh Acked-by: Madhavan Srinivasan Reviewed-by: Mahesh Salgaonkar Tested-by: Aditya Gupta Link: https://lore.kernel.org/r/20241228-sysfs-const-bin_attr-simple-v2-1-7c6f3f1767a3@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/powernv/opal.c | 2 +- fs/sysfs/file.c | 2 +- include/linux/sysfs.h | 4 ++-- kernel/module/sysfs.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 5d0f35bb917e..013637e2b2a8 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -818,7 +818,7 @@ static int opal_add_one_export(struct kobject *parent, const char *export_name, sysfs_bin_attr_init(attr); attr->attr.name = name; attr->attr.mode = 0400; - attr->read = sysfs_bin_attr_simple_read; + attr->read_new = sysfs_bin_attr_simple_read; attr->private = __va(vals[0]); attr->size = vals[1]; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 785408861c01..6931308876c4 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -817,7 +817,7 @@ EXPORT_SYMBOL_GPL(sysfs_emit_at); * Returns number of bytes written to @buf. 
*/ ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { memcpy(buf, attr->private + off, count); diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index b4368377fac9..18f7e1fd093c 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -511,7 +511,7 @@ __printf(3, 4) int sysfs_emit_at(char *buf, int at, const char *fmt, ...); ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t off, size_t count); #else /* CONFIG_SYSFS */ @@ -774,7 +774,7 @@ static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) static inline ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index 456358e1fdc4..254017b58b64 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -196,7 +196,7 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info) nattr->attr.mode = 0444; nattr->size = info->sechdrs[i].sh_size; nattr->private = (void *)info->sechdrs[i].sh_addr; - nattr->read = sysfs_bin_attr_simple_read; + nattr->read_new = sysfs_bin_attr_simple_read; ++nattr; } ++loaded; -- cgit v1.2.3 From c57b6e1d8a5c133dd5f6293de262701a55d11335 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Mon, 6 Jan 2025 18:35:20 -0800 Subject: efivarfs: remove unused efi_variable.Attributes and efivar_entry.kobj These fields look to be remnants of older code: Attributes was likely meant to stash the variable attributes, but doesn't because we always read them from the variable store and kobj was likely left over from an older iteration of code where we manually created the objects instead of using a filesystem. 
[ ardb: these fields were used by the sysfs based 'efivars' precursor to efivarfs, which was removed in commit 0f5b2c69a4cb ("efi: vars: Remove deprecated 'efivars' sysfs interface") ] Signed-off-by: James Bottomley Signed-off-by: Ard Biesheuvel --- fs/efivarfs/internal.h | 2 -- fs/efivarfs/super.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index 74f0602a9e01..64d15d1bb6bf 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -24,13 +24,11 @@ struct efivarfs_fs_info { struct efi_variable { efi_char16_t VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)]; efi_guid_t VendorGuid; - __u32 Attributes; }; struct efivar_entry { struct efi_variable var; struct list_head list; - struct kobject kobj; }; int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index beba15673be8..d3c8528274aa 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -245,7 +245,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, inode_lock(inode); inode->i_private = entry; - i_size_write(inode, size + sizeof(entry->var.Attributes)); + i_size_write(inode, size + sizeof(__u32)); /* attributes + data */ inode_unlock(inode); d_add(dentry, inode); -- cgit v1.2.3 From 1aba87f92d471222a89a5e7c27497489d37c67e1 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Mon, 6 Jan 2025 18:35:21 -0800 Subject: efivarfs: add helper to convert from UC16 name and GUID to utf8 name These will be used by a later patch to check for uniqueness on initial EFI variable iteration. Signed-off-by: James Bottomley Signed-off-by: Ard Biesheuvel --- fs/efivarfs/internal.h | 1 + fs/efivarfs/super.c | 17 +++-------------- fs/efivarfs/vars.c | 25 +++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index 64d15d1bb6bf..c10efc1ad0a7 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -55,6 +55,7 @@ bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data, unsigned long data_size); bool efivar_variable_is_removable(efi_guid_t vendor, const char *name, size_t len); +char *efivar_get_utf8name(const efi_char16_t *name16, efi_guid_t *vendor); extern const struct file_operations efivarfs_file_operations; extern const struct inode_operations efivarfs_dir_inode_operations; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index d3c8528274aa..b22441f7f7c6 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -205,27 +205,16 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, memcpy(entry->var.VariableName, name16, name_size); memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t)); - len = ucs2_utf8size(entry->var.VariableName); - - /* name, plus '-', plus GUID, plus NUL*/ - name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL); + name = efivar_get_utf8name(name16, &vendor); if (!name) goto fail; - ucs2_as_utf8(name, entry->var.VariableName, len); + /* length of the variable name itself: remove GUID and separator */ + len = strlen(name) - EFI_VARIABLE_GUID_LEN - 1; if (efivar_variable_is_removable(entry->var.VendorGuid, name, len)) is_removable = true; - name[len] = '-'; - - efi_guid_to_str(&entry->var.VendorGuid, name + len + 1); - - name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; - - /* replace invalid slashes like kobject_set_name_vargs does for /sys/firmware/efi/vars. 
*/ - strreplace(name, '/', '!'); - inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0, is_removable); if (!inode) diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index f7d43c847ee9..13594087d9ee 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -225,6 +225,31 @@ variable_matches(const char *var_name, size_t len, const char *match_name, } } +char * +efivar_get_utf8name(const efi_char16_t *name16, efi_guid_t *vendor) +{ + int len = ucs2_utf8size(name16); + char *name; + + /* name, plus '-', plus GUID, plus NUL*/ + name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL); + if (!name) + return NULL; + + ucs2_as_utf8(name, name16, len); + + name[len] = '-'; + + efi_guid_to_str(vendor, name + len + 1); + + name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; + + /* replace invalid slashes like kobject_set_name_vargs does for /sys/firmware/efi/vars. */ + strreplace(name, '/', '!'); + + return name; +} + bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data, unsigned long data_size) -- cgit v1.2.3 From 7e365c7e2cc587ac90c346a52156a6b08845d909 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Mon, 6 Jan 2025 18:35:22 -0800 Subject: efivarfs: make variable_is_present use dcache lookup Instead of searching the variable entry list for a variable, use the dcache lookup functions to find it instead. Also add an efivarfs_ prefix to the function now it is no longer static. Signed-off-by: James Bottomley Signed-off-by: Ard Biesheuvel --- fs/efivarfs/internal.h | 2 ++ fs/efivarfs/super.c | 29 +++++++++++++++++++++++++++++ fs/efivarfs/vars.c | 26 ++------------------------ 3 files changed, 33 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index c10efc1ad0a7..597ccaa60d37 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -56,6 +56,8 @@ bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data, bool efivar_variable_is_removable(efi_guid_t vendor, const char *name, size_t len); char *efivar_get_utf8name(const efi_char16_t *name16, efi_guid_t *vendor); +bool efivarfs_variable_is_present(efi_char16_t *variable_name, + efi_guid_t *vendor, void *data); extern const struct file_operations efivarfs_file_operations; extern const struct inode_operations efivarfs_dir_inode_operations; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index b22441f7f7c6..9e90823f8009 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -181,6 +181,35 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) return ERR_PTR(-ENOMEM); } +bool efivarfs_variable_is_present(efi_char16_t *variable_name, + efi_guid_t *vendor, void *data) +{ + char *name = efivar_get_utf8name(variable_name, vendor); + struct super_block *sb = data; + struct dentry *dentry; + struct qstr qstr; + + if (!name) + /* + * If the allocation failed there'll already be an + * error in the log (and likely a huge and growing + * number of them since they system will be under + * extreme memory pressure), so simply assume + * collision for safety but don't add to the log + * flood. 
+ */ + return true; + + qstr.name = name; + qstr.len = strlen(name); + dentry = d_hash_and_lookup(sb->s_root, &qstr); + kfree(name); + if (!IS_ERR_OR_NULL(dentry)) + dput(dentry); + + return dentry != NULL; +} + static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, unsigned long name_size, void *data, struct list_head *list) diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index 13594087d9ee..b2fc5bdc759a 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -313,28 +313,6 @@ efivar_variable_is_removable(efi_guid_t vendor, const char *var_name, return found; } -static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor, - struct list_head *head) -{ - struct efivar_entry *entry, *n; - unsigned long strsize1, strsize2; - bool found = false; - - strsize1 = ucs2_strsize(variable_name, EFI_VAR_NAME_LEN); - list_for_each_entry_safe(entry, n, head, list) { - strsize2 = ucs2_strsize(entry->var.VariableName, EFI_VAR_NAME_LEN); - if (strsize1 == strsize2 && - !memcmp(variable_name, &(entry->var.VariableName), - strsize2) && - !efi_guidcmp(entry->var.VendorGuid, - *vendor)) { - found = true; - break; - } - } - return found; -} - /* * Returns the size of variable_name, in bytes, including the * terminating NULL character, or variable_name_size if no NULL @@ -439,8 +417,8 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, * we'll ever see a different variable name, * and may end up looping here forever. */ - if (variable_is_present(variable_name, &vendor_guid, - head)) { + if (efivarfs_variable_is_present(variable_name, + &vendor_guid, data)) { dup_variable_bug(variable_name, &vendor_guid, variable_name_size); status = EFI_NOT_FOUND; -- cgit v1.2.3 From c13094b894de289514d84b8db56d1f2931a0bade Mon Sep 17 00:00:00 2001 From: Marco Nelissen Date: Wed, 8 Jan 2025 20:11:50 -0800 Subject: iomap: avoid avoid truncating 64-bit offset to 32 bits on 32-bit kernels, iomap_write_delalloc_scan() was inadvertently using a 32-bit position due to folio_next_index() returning an unsigned long. This could lead to an infinite loop when writing to an xfs filesystem. Signed-off-by: Marco Nelissen Link: https://lore.kernel.org/r/20250109041253.2494374-1-marco.nelissen@gmail.com Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 54dc27d92781..d303e6c8900c 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1138,7 +1138,7 @@ static void iomap_write_delalloc_scan(struct inode *inode, start_byte, end_byte, iomap, punch); /* move offset to start of next folio in range */ - start_byte = folio_next_index(folio) << PAGE_SHIFT; + start_byte = folio_pos(folio) + folio_size(folio); folio_unlock(folio); folio_put(folio); } -- cgit v1.2.3 From 7ed6cbe0f8caa6ee38a2dc8f1b925acb904cc01f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Jan 2025 09:31:02 +0100 Subject: fs: add STATX_DIO_READ_ALIGN Add a separate dio read align field, as many out of place write file systems can easily do reads aligned to the device sector size, but require bigger alignment for writes. This is usually papered over by falling back to buffered I/O for smaller writes and doing read-modify-write cycles, but performance for this sucks, so applications benefit from knowing the actual write alignment. 
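As a rough userspace illustration (not part of this patch), an application could query the new information along the following lines, assuming a libc statx() wrapper and uapi headers that already carry the STATX_DIO_READ_ALIGN bit and the stx_dio_read_offset_align member introduced here:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>

	int main(int argc, char **argv)
	{
		struct statx stx;

		if (argc < 2)
			return 2;
		/* Request both the classic DIO alignment info and the new read alignment. */
		if (statx(AT_FDCWD, argv[1], 0,
			  STATX_DIOALIGN | STATX_DIO_READ_ALIGN, &stx) != 0) {
			perror("statx");
			return 1;
		}
		/* Only trust a field the kernel actually reported in stx_mask. */
		if (stx.stx_mask & STATX_DIO_READ_ALIGN)
			printf("dio write offset align: %u, dio read offset align: %u\n",
			       stx.stx_dio_offset_align, stx.stx_dio_read_offset_align);
		return 0;
	}

On filesystems that do not report the asymmetric alignment, callers can keep falling back to stx_dio_offset_align for both directions.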
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250109083109.1441561-3-hch@lst.de Reviewed-by: John Garry Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/stat.c | 1 + include/linux/stat.h | 1 + include/uapi/linux/stat.h | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/stat.c b/fs/stat.c index 0870e969a8a0..2c0e111a098a 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -725,6 +725,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_mnt_id = stat->mnt_id; tmp.stx_dio_mem_align = stat->dio_mem_align; tmp.stx_dio_offset_align = stat->dio_offset_align; + tmp.stx_dio_read_offset_align = stat->dio_read_offset_align; tmp.stx_subvol = stat->subvol; tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min; tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max; diff --git a/include/linux/stat.h b/include/linux/stat.h index 3d900c86981c..9d8382e23a9c 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -52,6 +52,7 @@ struct kstat { u64 mnt_id; u32 dio_mem_align; u32 dio_offset_align; + u32 dio_read_offset_align; u64 change_cookie; u64 subvol; u32 atomic_write_unit_min; diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 8b35d7d511a2..f78ee3670dd5 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -179,7 +179,8 @@ struct statx { /* Max atomic write segment count */ __u32 stx_atomic_write_segments_max; - __u32 __spare1[1]; + /* File offset alignment for direct I/O reads */ + __u32 stx_dio_read_offset_align; /* 0xb8 */ __u64 __spare3[9]; /* Spare space for future expansion */ @@ -213,6 +214,7 @@ struct statx { #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ #define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ +#define STATX_DIO_READ_ALIGN 0x00020000U /* Want/got dio read alignment info */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ -- cgit v1.2.3 From 7e17483c7b151ed64f8959278def2fea164526f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Jan 2025 09:31:03 +0100 Subject: xfs: cleanup xfs_vn_getattr Split the two bits of optional statx reporting into their own helpers so that they are self-contained instead of deeply indented in the main getattr handler. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250109083109.1441561-4-hch@lst.de Reviewed-by: John Garry Reviewed-by: Darrick J. 
Wong Signed-off-by: Christian Brauner --- fs/xfs/xfs_iops.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 207e0dadffc3..6b0228a21617 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -573,17 +573,28 @@ xfs_stat_blksize( } static void -xfs_get_atomic_write_attr( +xfs_report_dioalign( struct xfs_inode *ip, - unsigned int *unit_min, - unsigned int *unit_max) + struct kstat *stat) { - if (!xfs_inode_can_atomicwrite(ip)) { - *unit_min = *unit_max = 0; - return; - } + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct block_device *bdev = target->bt_bdev; - *unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize; + stat->result_mask |= STATX_DIOALIGN; + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); +} + +static void +xfs_report_atomic_write( + struct xfs_inode *ip, + struct kstat *stat) +{ + unsigned int unit_min = 0, unit_max = 0; + + if (xfs_inode_can_atomicwrite(ip)) + unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize; + generic_fill_statx_atomic_writes(stat, unit_min, unit_max); } STATIC int @@ -647,22 +658,10 @@ xfs_vn_getattr( stat->rdev = inode->i_rdev; break; case S_IFREG: - if (request_mask & STATX_DIOALIGN) { - struct xfs_buftarg *target = xfs_inode_buftarg(ip); - struct block_device *bdev = target->bt_bdev; - - stat->result_mask |= STATX_DIOALIGN; - stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; - stat->dio_offset_align = bdev_logical_block_size(bdev); - } - if (request_mask & STATX_WRITE_ATOMIC) { - unsigned int unit_min, unit_max; - - xfs_get_atomic_write_attr(ip, &unit_min, - &unit_max); - generic_fill_statx_atomic_writes(stat, - unit_min, unit_max); - } + if (request_mask & STATX_DIOALIGN) + xfs_report_dioalign(ip, stat); + if (request_mask & STATX_WRITE_ATOMIC) + xfs_report_atomic_write(ip, stat); fallthrough; default: stat->blksize = xfs_stat_blksize(ip); -- cgit v1.2.3 From 7422bbd030210fbdc011acfd6f0f15b966fd2b46 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Jan 2025 09:31:04 +0100 Subject: xfs: report the correct read/write dio alignment for reflinked inodes For I/O to reflinked blocks we always need to write an entire new file system block, and the code enforces the file system block alignment for the entire file if it has any reflinked blocks. Use the new STATX_DIO_READ_ALIGN flag to report the asymmetric read vs write alignments for reflinked files. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250109083109.1441561-5-hch@lst.de Reviewed-by: John Garry Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/xfs/xfs_iops.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 6b0228a21617..40289fe6f5b2 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -580,9 +580,24 @@ xfs_report_dioalign( struct xfs_buftarg *target = xfs_inode_buftarg(ip); struct block_device *bdev = target->bt_bdev; - stat->result_mask |= STATX_DIOALIGN; + stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN; stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; - stat->dio_offset_align = bdev_logical_block_size(bdev); + + /* + * For COW inodes, we can only perform out of place writes of entire + * allocation units (blocks or RT extents). 
+ * For writes smaller than the allocation unit, we must fall back to + * buffered I/O to perform read-modify-write cycles. At best this is + * highly inefficient; at worst it leads to page cache invalidation + * races. Tell applications to avoid this by reporting the larger write + * alignment in dio_offset_align, and the smaller read alignment in + * dio_read_offset_align. + */ + stat->dio_read_offset_align = bdev_logical_block_size(bdev); + if (xfs_is_cow_inode(ip)) + stat->dio_offset_align = xfs_inode_alloc_unitsize(ip); + else + stat->dio_offset_align = stat->dio_read_offset_align; } static void @@ -658,7 +673,7 @@ xfs_vn_getattr( stat->rdev = inode->i_rdev; break; case S_IFREG: - if (request_mask & STATX_DIOALIGN) + if (request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) xfs_report_dioalign(ip, stat); if (request_mask & STATX_WRITE_ATOMIC) xfs_report_atomic_write(ip, stat); -- cgit v1.2.3 From 468210ec76e155bbc53f8fc41b2bd5e26a2f6d20 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Jan 2025 09:31:05 +0100 Subject: xfs: report larger dio alignment for COW inodes For I/O to reflinked blocks we always need to write an entire new file system block, and the code enforces the file system block alignment for the entire file if it has any reflinked blocks. Mirror the larger value reported in the statx in the dio_offset_align in the xfs-specific XFS_IOC_DIOINFO ioctl for the same reason. Don't bother adding a new field for the read alignment to this legacy ioctl as all new users should use statx instead. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250109083109.1441561-6-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/xfs/xfs_ioctl.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0789c18aaa18..f95103325318 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1204,7 +1204,16 @@ xfs_file_ioctl( struct xfs_buftarg *target = xfs_inode_buftarg(ip); struct dioattr da; - da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; + da.d_mem = target->bt_logical_sectorsize; + + /* + * See xfs_report_dioalign() for an explanation about why this + * reports a value larger than the sector size for COW inodes. + */ + if (xfs_is_cow_inode(ip)) + da.d_miniosz = xfs_inode_alloc_unitsize(ip); + else + da.d_miniosz = target->bt_logical_sectorsize; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) -- cgit v1.2.3 From 344bac8f0d73fe970cd9f5b2f132906317d29e8b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 15 Dec 2024 21:17:05 +0100 Subject: fs: kill MNT_ONRB Move mnt->mnt_node into the union with mnt->mnt_rcu and mnt->mnt_llist instead of keeping it with mnt->mnt_list. This allows us to use RB_CLEAR_NODE(&mnt->mnt_node) in umount_tree() as well as list_empty(&mnt->mnt_node). That in turn allows us to remove MNT_ONRB. This also fixes the bug reported in [1] where seemingly MNT_ONRB wasn't set in @mnt->mnt_flags even though the mount was present in the mount rbtree of the mount namespace. The root cause is the following race. When a btrfs subvolume is mounted a temporary mount is created: btrfs_get_tree_subvol() { mnt = fc_mount() // Register the newly allocated mount with sb->mounts: lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); } and registered on sb->s_mounts. 
Later it is added to an anonymous mount namespace via mount_subvol(): -> mount_subvol() -> mount_subtree() -> alloc_mnt_ns() mnt_add_to_ns() vfs_path_lookup() put_mnt_ns() The mnt_add_to_ns() call raises MNT_ONRB in @mnt->mnt_flags. If someone concurrently does a ro remount: reconfigure_super() -> sb_prepare_remount_readonly() { list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { } all mounts registered in sb->s_mounts are visited and first MNT_WRITE_HOLD is raised, then MNT_READONLY is raised, and finally MNT_WRITE_HOLD is removed again. The flag modification for MNT_WRITE_HOLD/MNT_READONLY and MNT_ONRB race so MNT_ONRB might be lost. Fixes: 2eea9ce4310d ("mounts: keep list of mounts in an rbtree") Cc: # v6.8+ Link: https://lore.kernel.org/r/20241215-vfs-6-14-mount-work-v1-1-fd55922c4af8@kernel.org Link: https://lore.kernel.org/r/ec6784ed-8722-4695-980a-4400d4e7bd1a@gmx.com [1] Signed-off-by: Christian Brauner --- fs/mount.h | 15 +++++++++------ fs/namespace.c | 14 ++++++-------- include/linux/mount.h | 3 +-- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/mount.h b/fs/mount.h index 185fc56afc13..179f690a0c72 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -38,6 +38,7 @@ struct mount { struct dentry *mnt_mountpoint; struct vfsmount mnt; union { + struct rb_node mnt_node; /* node in the ns->mounts rbtree */ struct rcu_head mnt_rcu; struct llist_node mnt_llist; }; @@ -51,10 +52,7 @@ struct mount { struct list_head mnt_child; /* and going through their mnt_child */ struct list_head mnt_instance; /* mount instance on sb->s_mounts */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ - union { - struct rb_node mnt_node; /* Under ns->mounts */ - struct list_head mnt_list; - }; + struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list;/* list of slave mounts */ @@ -145,11 +143,16 @@ static inline bool is_anon_ns(struct mnt_namespace *ns) return ns->seq == 0; } +static inline bool mnt_ns_attached(const struct mount *mnt) +{ + return !RB_EMPTY_NODE(&mnt->mnt_node); +} + static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) { - WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB)); - mnt->mnt.mnt_flags &= ~MNT_ONRB; + WARN_ON(!mnt_ns_attached(mnt)); rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts); + RB_CLEAR_NODE(&mnt->mnt_node); list_add_tail(&mnt->mnt_list, dt_list); } diff --git a/fs/namespace.c b/fs/namespace.c index 23e81c2a1e3f..847fa8443e8a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -344,6 +344,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; @@ -1124,7 +1125,7 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) struct rb_node **link = &ns->mounts.rb_node; struct rb_node *parent = NULL; - WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB); + WARN_ON(mnt_ns_attached(mnt)); mnt->mnt_ns = ns; while (*link) { parent = *link; @@ -1135,7 +1136,6 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) } rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); - mnt->mnt.mnt_flags |= MNT_ONRB; } /* @@ -1305,7 +1305,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags; - 
mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB); + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); @@ -1763,7 +1763,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; - if (p->mnt.mnt_flags & MNT_ONRB) + if (mnt_ns_attached(p)) move_from_ns(p, &tmp_list); else list_move(&p->mnt_list, &tmp_list); @@ -1912,16 +1912,14 @@ static int do_umount(struct mount *mnt, int flags) event++; if (flags & MNT_DETACH) { - if (mnt->mnt.mnt_flags & MNT_ONRB || - !list_empty(&mnt->mnt_list)) + if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { - if (mnt->mnt.mnt_flags & MNT_ONRB || - !list_empty(&mnt->mnt_list)) + if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } diff --git a/include/linux/mount.h b/include/linux/mount.h index c34c18b4e8f3..04213d8ef837 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,7 +50,7 @@ struct path; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) #define MNT_INTERNAL 0x4000 @@ -64,7 +64,6 @@ struct path; #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 -#define MNT_ONRB 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ -- cgit v1.2.3 From 056d33137bf9364456ee70aa265ccbb948daee49 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 15 Nov 2024 10:35:53 -0500 Subject: fs: prepend statmount.mnt_opts string with security_sb_mnt_opts() Currently these mount options aren't accessible via statmount(). The read handler for /proc/#/mountinfo calls security_sb_show_options() to emit the security options after emitting superblock flag options, but before calling sb->s_op->show_options. Have statmount_mnt_opts() call security_sb_show_options() before calling ->show_options. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20241115-statmount-v2-2-cd29aeff9cbb@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 847fa8443e8a..1af8da8e1e97 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5036,6 +5036,10 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) if (sb->s_op->show_options) { size_t start = seq->count; + err = security_sb_show_options(seq, sb); + if (err) + return err; + err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; -- cgit v1.2.3 From 62b8dee925023ed2a2b417dea657e3e3e57c4117 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:40 +0100 Subject: mount: remove inlude/nospec.h include It's not needed, so remove it. 
Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-1-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/namespace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 1af8da8e1e97..a1832e9a4340 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -32,7 +32,6 @@ #include #include #include -#include #include "pnode.h" #include "internal.h" -- cgit v1.2.3 From 144acef3334eb664fae4a2e1e35fdd693fc07d4e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:41 +0100 Subject: fs: add mount namespace to rbtree late There's no point doing that under the namespace semaphore it just gives the false impression that it protects the mount namespace rbtree and it simply doesn't. Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-2-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index a1832e9a4340..0ca2cda6bc16 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3981,7 +3981,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } - mnt_ns_tree_add(new_ns); namespace_unlock(); if (rootmnt) @@ -3989,6 +3988,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (pwdmnt) mntput(pwdmnt); + mnt_ns_tree_add(new_ns); return new_ns; } -- cgit v1.2.3 From 5dcbd85d35515532c93b337a3f8be54937aeea8a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:42 +0100 Subject: fs: lockless mntns rbtree lookup Currently we use a read-write lock but for the simple search case we can make this lockless. Creating a new mount namespace is a rather rare event compared with querying mounts in a foreign mount namespace. Once this is picked up by e.g., systemd to list mounts in another mount in it's isolated services or in containers this will be used a lot so this seems worthwhile doing. 
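In distilled form, the lookup idiom this switches to looks like the sketch below (simplified from the change itself; error handling and the passive reference count are omitted): a hit can be trusted immediately because false positives are impossible, and only a miss that may have raced with a concurrent insertion is retried, which the sequence counter detects.

	guard(rcu)();
	do {
		seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
		if (node)
			break;		/* positive result: no retry needed */
	} while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));	/* retry only a racy miss */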
Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-3-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/mount.h | 5 ++- fs/namespace.c | 116 ++++++++++++++++++++++++++++++++++----------------------- 2 files changed, 74 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/mount.h b/fs/mount.h index 179f690a0c72..bd8d6c36b421 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -12,7 +12,10 @@ struct mnt_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ - wait_queue_head_t poll; + union { + wait_queue_head_t poll; + struct rcu_head mnt_ns_rcu; + }; u64 event; unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; diff --git a/fs/namespace.c b/fs/namespace.c index 0ca2cda6bc16..d2dbbbc91cc0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static DEFINE_RWLOCK(mnt_ns_tree_lock); +static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock); + static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ struct mount_kattr { @@ -105,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); -static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns) -{ - u64 seq_b = ns->seq; - - if (seq < seq_b) - return -1; - if (seq > seq_b) - return 1; - return 0; -} - static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) { if (!node) @@ -123,19 +114,41 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); } -static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b) +static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b) { struct mnt_namespace *ns_a = node_to_mnt_ns(a); struct mnt_namespace *ns_b = node_to_mnt_ns(b); u64 seq_a = ns_a->seq; + u64 seq_b = ns_b->seq; - return mnt_ns_cmp(seq_a, ns_b) < 0; + if (seq_a < seq_b) + return -1; + if (seq_a > seq_b) + return 1; + return 0; +} + +static inline void mnt_ns_tree_write_lock(void) +{ + write_lock(&mnt_ns_tree_lock); + write_seqcount_begin(&mnt_ns_tree_seqcount); +} + +static inline void mnt_ns_tree_write_unlock(void) +{ + write_seqcount_end(&mnt_ns_tree_seqcount); + write_unlock(&mnt_ns_tree_lock); } static void mnt_ns_tree_add(struct mnt_namespace *ns) { - guard(write_lock)(&mnt_ns_tree_lock); - rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less); + struct rb_node *node; + + mnt_ns_tree_write_lock(); + node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp); + mnt_ns_tree_write_unlock(); + + WARN_ON_ONCE(node); } static void mnt_ns_release(struct mnt_namespace *ns) @@ -150,41 +163,33 @@ static void mnt_ns_release(struct mnt_namespace *ns) } DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) +static void mnt_ns_release_rcu(struct rcu_head *rcu) +{ + mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu)); +} + static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ if (!is_anon_ns(ns)) { - guard(write_lock)(&mnt_ns_tree_lock); + mnt_ns_tree_write_lock(); rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); + mnt_ns_tree_write_unlock(); } - mnt_ns_release(ns); + 
call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu); } -/* - * Returns the mount namespace which either has the specified id, or has the - * next smallest id afer the specified one. - */ -static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) +static int mnt_ns_find(const void *key, const struct rb_node *node) { - struct rb_node *node = mnt_ns_tree.rb_node; - struct mnt_namespace *ret = NULL; + const u64 mnt_ns_id = *(u64 *)key; + const struct mnt_namespace *ns = node_to_mnt_ns(node); - lockdep_assert_held(&mnt_ns_tree_lock); - - while (node) { - struct mnt_namespace *n = node_to_mnt_ns(node); - - if (mnt_ns_id <= n->seq) { - ret = node_to_mnt_ns(node); - if (mnt_ns_id == n->seq) - break; - node = node->rb_left; - } else { - node = node->rb_right; - } - } - return ret; + if (mnt_ns_id < ns->seq) + return -1; + if (mnt_ns_id > ns->seq) + return 1; + return 0; } /* @@ -194,18 +199,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) * namespace the @namespace_sem must first be acquired. If the namespace has * already shut down before acquiring @namespace_sem, {list,stat}mount() will * see that the mount rbtree of the namespace is empty. + * + * Note the lookup is lockless protected by a sequence counter. We only + * need to guard against false negatives as false positives aren't + * possible. So if we didn't find a mount namespace and the sequence + * counter has changed we need to retry. If the sequence counter is + * still the same we know the search actually failed. */ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { - struct mnt_namespace *ns; + struct mnt_namespace *ns; + struct rb_node *node; + unsigned int seq; + + guard(rcu)(); + do { + seq = read_seqcount_begin(&mnt_ns_tree_seqcount); + node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); + if (node) + break; + } while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq)); - guard(read_lock)(&mnt_ns_tree_lock); - ns = mnt_ns_find_id_at(mnt_ns_id); - if (!ns || ns->seq != mnt_ns_id) - return NULL; + if (!node) + return NULL; - refcount_inc(&ns->passive); - return ns; + /* + * The last reference count is put with RCU delay so we can + * unconditonally acquire a reference here. + */ + ns = node_to_mnt_ns(node); + refcount_inc(&ns->passive); + return ns; } static inline void lock_mount_hash(void) -- cgit v1.2.3 From 4368898b271ad4ea3b6ae64f5cd16486a6d65430 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:44 +0100 Subject: fs: lockless mntns lookup for nsfs We already made the rbtree lookup lockless for the simple lookup case. However, walking the list of mount namespaces via nsfs still happens with taking the read lock blocking concurrent additions of new mount namespaces pointlessly. Plus, such additions are rare anyway so allow lockless lookup of the previous and next mount namespace by keeping a separate list. This also allows to make some things simpler in the code. 
Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-5-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Suggested-by: Peter Zijlstra Signed-off-by: Christian Brauner --- fs/mount.h | 13 ++++--------- fs/namespace.c | 42 +++++++++++++++++++++++++++++------------- fs/nsfs.c | 5 +---- 3 files changed, 34 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/mount.h b/fs/mount.h index bd8d6c36b421..e9f48e563c0f 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -20,6 +20,7 @@ struct mnt_namespace { unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ + struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ } __randomize_layout; @@ -160,15 +161,9 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) } bool has_locked_children(struct mount *mnt, struct dentry *dentry); -struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous); -static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns) -{ - return __lookup_next_mnt_ns(mntns, false); -} -static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns) -{ - return __lookup_next_mnt_ns(mntns, true); -} +struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns, + bool previous); + static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); diff --git a/fs/namespace.c b/fs/namespace.c index d2dbbbc91cc0..8b50b26024fa 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -82,6 +82,7 @@ static DEFINE_RWLOCK(mnt_ns_tree_lock); static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock); static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ +static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ struct mount_kattr { unsigned int attr_set; @@ -142,10 +143,19 @@ static inline void mnt_ns_tree_write_unlock(void) static void mnt_ns_tree_add(struct mnt_namespace *ns) { - struct rb_node *node; + struct rb_node *node, *prev; mnt_ns_tree_write_lock(); node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp); + /* + * If there's no previous entry simply add it after the + * head and if there is add it after the previous entry. 
+ */ + prev = rb_prev(&ns->mnt_ns_tree_node); + if (!prev) + list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list); + else + list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list); mnt_ns_tree_write_unlock(); WARN_ON_ONCE(node); @@ -174,6 +184,7 @@ static void mnt_ns_tree_remove(struct mnt_namespace *ns) if (!is_anon_ns(ns)) { mnt_ns_tree_write_lock(); rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); + list_bidir_del_rcu(&ns->mnt_ns_list); mnt_ns_tree_write_unlock(); } @@ -2086,30 +2097,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) return &mnt->ns; } -struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous) +struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) { - guard(read_lock)(&mnt_ns_tree_lock); + guard(rcu)(); + for (;;) { - struct rb_node *node; + struct list_head *list; if (previous) - node = rb_prev(&mntns->mnt_ns_tree_node); + list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list)); else - node = rb_next(&mntns->mnt_ns_tree_node); - if (!node) + list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list)); + if (list_is_head(list, &mnt_ns_list)) return ERR_PTR(-ENOENT); - mntns = node_to_mnt_ns(node); - node = &mntns->mnt_ns_tree_node; + mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list); + /* + * The last passive reference count is put with RCU + * delay so accessing the mount namespace is not just + * safe but all relevant members are still valid. + */ if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN)) continue; /* - * Holding mnt_ns_tree_lock prevents the mount namespace from - * being freed but it may well be on it's deathbed. We want an - * active reference, not just a passive one here as we're - * persisting the mount namespace. + * We need an active reference count as we're persisting + * the mount namespace and it might already be on its + * deathbed. */ if (!refcount_inc_not_zero(&mntns->ns.count)) continue; @@ -3926,6 +3941,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a refcount_set(&new_ns->ns.count, 1); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; + INIT_LIST_HEAD(&new_ns->mnt_ns_list); RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); diff --git a/fs/nsfs.c b/fs/nsfs.c index c675fc40ce2d..663f8656158d 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -274,10 +274,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, if (usize < MNT_NS_INFO_SIZE_VER0) return -EINVAL; - if (previous) - mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns)); - else - mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns)); + mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous); if (IS_ERR(mnt_ns)) return PTR_ERR(mnt_ns); -- cgit v1.2.3 From e7c8dde36818ffa101045cfc887c49cd631048ee Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:45 +0100 Subject: fs: simplify rwlock to spinlock We're not taking the read_lock() anymore now that all lookup is lockless. Just use a simple spinlock. 
Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-6-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/namespace.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 8b50b26024fa..a382be402f62 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -78,8 +78,7 @@ static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ -static DEFINE_RWLOCK(mnt_ns_tree_lock); -static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock); +static DEFINE_SEQLOCK(mnt_ns_tree_lock); static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ @@ -131,14 +130,12 @@ static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b) static inline void mnt_ns_tree_write_lock(void) { - write_lock(&mnt_ns_tree_lock); - write_seqcount_begin(&mnt_ns_tree_seqcount); + write_seqlock(&mnt_ns_tree_lock); } static inline void mnt_ns_tree_write_unlock(void) { - write_seqcount_end(&mnt_ns_tree_seqcount); - write_unlock(&mnt_ns_tree_lock); + write_sequnlock(&mnt_ns_tree_lock); } static void mnt_ns_tree_add(struct mnt_namespace *ns) @@ -163,7 +160,7 @@ static void mnt_ns_tree_add(struct mnt_namespace *ns) static void mnt_ns_release(struct mnt_namespace *ns) { - lockdep_assert_not_held(&mnt_ns_tree_lock); + lockdep_assert_not_held(&mnt_ns_tree_lock.lock); /* keep alive for {list,stat}mount() */ if (refcount_dec_and_test(&ns->passive)) { @@ -225,11 +222,11 @@ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) guard(rcu)(); do { - seq = read_seqcount_begin(&mnt_ns_tree_seqcount); + seq = read_seqbegin(&mnt_ns_tree_lock); node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); if (node) break; - } while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq)); + } while (read_seqretry(&mnt_ns_tree_lock, seq)); if (!node) return NULL; -- cgit v1.2.3 From 2ce23285d704f6a8d90207ae8f115f5db93d3541 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 15 Dec 2024 21:17:06 +0100 Subject: fs: cache first and last mount Speed up listmount() by caching the first and last node making retrieval of the first and last mount of each mount namespace O(1). 
Link: https://lore.kernel.org/r/20241215-vfs-6-14-mount-work-v1-2-fd55922c4af8@kernel.org Signed-off-by: Christian Brauner --- fs/mount.h | 13 +++++++++++-- fs/namespace.c | 17 +++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/mount.h b/fs/mount.h index e9f48e563c0f..ffb613cdfeee 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -8,7 +8,11 @@ struct mnt_namespace { struct ns_common ns; struct mount * root; - struct rb_root mounts; /* Protected by namespace_sem */ + struct { + struct rb_root mounts; /* Protected by namespace_sem */ + struct rb_node *mnt_last_node; /* last (rightmost) mount in the rbtree */ + struct rb_node *mnt_first_node; /* first (leftmost) mount in the rbtree */ + }; struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ @@ -154,8 +158,13 @@ static inline bool mnt_ns_attached(const struct mount *mnt) static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) { + struct mnt_namespace *ns = mnt->mnt_ns; WARN_ON(!mnt_ns_attached(mnt)); - rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts); + if (ns->mnt_last_node == &mnt->mnt_node) + ns->mnt_last_node = rb_prev(&mnt->mnt_node); + if (ns->mnt_first_node == &mnt->mnt_node) + ns->mnt_first_node = rb_next(&mnt->mnt_node); + rb_erase(&mnt->mnt_node, &ns->mounts); RB_CLEAR_NODE(&mnt->mnt_node); list_add_tail(&mnt->mnt_list, dt_list); } diff --git a/fs/namespace.c b/fs/namespace.c index a382be402f62..5336e5f09996 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1155,16 +1155,25 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) { struct rb_node **link = &ns->mounts.rb_node; struct rb_node *parent = NULL; + bool mnt_first_node = true, mnt_last_node = true; WARN_ON(mnt_ns_attached(mnt)); mnt->mnt_ns = ns; while (*link) { parent = *link; - if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) + if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) { link = &parent->rb_left; - else + mnt_last_node = false; + } else { link = &parent->rb_right; + mnt_first_node = false; + } } + + if (mnt_last_node) + ns->mnt_last_node = &mnt->mnt_node; + if (mnt_first_node) + ns->mnt_first_node = &mnt->mnt_node; rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); } @@ -5563,9 +5572,9 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, if (!last_mnt_id) { if (reverse) - first = node_to_mount(rb_last(&ns->mounts)); + first = node_to_mount(ns->mnt_last_node); else - first = node_to_mount(rb_first(&ns->mounts)); + first = node_to_mount(ns->mnt_first_node); } else { if (reverse) first = mnt_find_id_at_reverse(ns, last_mnt_id - 1); -- cgit v1.2.3 From 7f9bfafc5f496cf2222659890477d0408455369e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 17 Dec 2024 13:21:55 +0100 Subject: fs: use xarray for old mount id While the ida does use the xarray internally we can use it explicitly which allows us to increment the unique mount id under the xa lock. This allows us to remove the atomic as we're now allocating both ids in one go. 
Link: https://lore.kernel.org/r/20241217-erhielten-regung-44bb1604ca8f@brauner Signed-off-by: Christian Brauner --- fs/namespace.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 5336e5f09996..e607762fd9c7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -65,12 +65,12 @@ static int __init set_mphash_entries(char *str) __setup("mphash_entries=", set_mphash_entries); static u64 event; -static DEFINE_IDA(mnt_id_ida); +static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); /* Don't allow confusion with old 32bit mount ID */ #define MNT_UNIQUE_ID_OFFSET (1ULL << 31) -static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET); +static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET; static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init; @@ -267,18 +267,19 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry) static int mnt_alloc_id(struct mount *mnt) { - int res = ida_alloc(&mnt_id_ida, GFP_KERNEL); + int res; - if (res < 0) - return res; - mnt->mnt_id = res; - mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr); - return 0; + xa_lock(&mnt_id_xa); + res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL); + if (!res) + mnt->mnt_id_unique = ++mnt_id_ctr; + xa_unlock(&mnt_id_xa); + return res; } static void mnt_free_id(struct mount *mnt) { - ida_free(&mnt_id_ida, mnt->mnt_id); + xa_erase(&mnt_id_xa, mnt->mnt_id); } /* -- cgit v1.2.3 From 22eb23b8a7b2536a475ac87244ee4ab50764eccd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 19 Dec 2024 14:59:14 +0100 Subject: fs: remove useless lockdep assertion mnt_ns_release() can run asynchronously via call_rcu() so hitting that lockdep assertion means someone else already grabbed the mnt_ns_tree_lock and causes a false positive. That assertion has likely always been wrong. call_rcu() just makes it more likely to hit. Link: https://lore.kernel.org/r/Z2PlT5rcRTIhCpft@ly-workstation Link: https://lore.kernel.org/r/20241219-darben-quietschen-b6e1d80327bb@brauner Reported-by: Lai, Yi Signed-off-by: Christian Brauner --- fs/namespace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index e607762fd9c7..371c860f49de 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -160,8 +160,6 @@ static void mnt_ns_tree_add(struct mnt_namespace *ns) static void mnt_ns_release(struct mnt_namespace *ns) { - lockdep_assert_not_held(&mnt_ns_tree_lock.lock); - /* keep alive for {list,stat}mount() */ if (refcount_dec_and_test(&ns->passive)) { put_user_ns(ns->user_ns); -- cgit v1.2.3 From 3f6bc9e3ab9b127171d39f9ac6eca1abb693b731 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 7 Jan 2025 18:39:27 +0000 Subject: netfs: Fix kernel async DIO Netfslib needs to be able to handle kernel-initiated asynchronous DIO that is supplied with a bio_vec[] array. Currently, because of the async flag, this gets passed to netfs_extract_user_iter() which throws a warning and fails because it only handles IOVEC and UBUF iterators. 
This can be triggered through a combination of cifs and a loopback blockdev with something like: mount //my/cifs/share /foo dd if=/dev/zero of=/foo/m0 bs=4K count=1K losetup --sector-size 4096 --direct-io=on /dev/loop2046 /foo/m0 echo hello >/dev/loop2046 This causes the following to appear in syslog: WARNING: CPU: 2 PID: 109 at fs/netfs/iterator.c:50 netfs_extract_user_iter+0x170/0x250 [netfs] and the write to fail. Fix this by removing the check in netfs_unbuffered_write_iter_locked() that causes async kernel DIO writes to be handled as userspace writes. Note that this change relies on the kernel caller maintaining the existence of the bio_vec array (or kvec[] or folio_queue) until the op is complete. Fixes: 153a9961b551 ("netfs: Implement unbuffered/DIO write support") Reported-by: Nicolas Baranger Closes: https://lore.kernel.org/r/fedd8a40d54b2969097ffa4507979858@3xo.fr/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/608725.1736275167@warthog.procyon.org.uk Tested-by: Nicolas Baranger Acked-by: Paulo Alcantara (Red Hat) cc: Steve French cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/direct_write.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 173e8b5e6a93..f9421f3e6d37 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -67,7 +67,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * * allocate a sufficiently large bvec array and may shorten the * request. */ - if (async || user_backed_iter(iter)) { + if (user_backed_iter(iter)) { n = netfs_extract_user_iter(iter, len, &wreq->iter, 0); if (n < 0) { ret = n; @@ -77,6 +77,11 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * wreq->direct_bv_count = n; wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); } else { + /* If this is a kernel-generated async DIO request, + * assume that any resources the iterator points to + * (eg. a bio_vec array) will persist till the end of + * the op. + */ wreq->iter = *iter; } -- cgit v1.2.3 From 904abff4b1b94184aaa0e9f5fce7821f7b5b81a3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 7 Jan 2025 14:43:30 +0000 Subject: netfs: Fix read-retry for fs with no ->prepare_read() Fix netfslib's read-retry to only call ->prepare_read() in the backing filesystem if such a function is provided. We can get to this point if there's an active cache, as failed reads from the cache need negotiating with the server instead.
Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Signed-off-by: David Howells Link: https://lore.kernel.org/r/529329.1736261010@warthog.procyon.org.uk cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_retry.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 21b4a54e545e..16b676c68dcd 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -152,7 +152,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) BUG_ON(!len); /* Renegotiate max_len (rsize) */ - if (rreq->netfs_ops->prepare_read(subreq) < 0) { + if (rreq->netfs_ops->prepare_read && + rreq->netfs_ops->prepare_read(subreq) < 0) { trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed); __set_bit(NETFS_SREQ_FAILED, &subreq->flags); } -- cgit v1.2.3 From 17a4fde81d3a7478d97d15304a6d61094a10c2e3 Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Tue, 7 Jan 2025 14:52:32 +0000 Subject: afs: Fix merge preference rule failure condition syzbot reported a lock held when returning to userspace[1]. This is because if argc is less than 0 and the function returns directly, the held inode lock is not released. Fix this by store the error in ret and jump to done to clean up instead of returning directly. [dh: Modified Lizhi Xu's original patch to make it honour the error code from afs_split_string()] [1] WARNING: lock held when returning to user space! 6.13.0-rc3-syzkaller-00209-g499551201b5f #0 Not tainted ------------------------------------------------ syz-executor133/5823 is leaving the kernel with locks still held! 1 lock held by syz-executor133/5823: #0: ffff888071cffc00 (&sb->s_type->i_mutex_key#9){++++}-{4:4}, at: inode_lock include/linux/fs.h:818 [inline] #0: ffff888071cffc00 (&sb->s_type->i_mutex_key#9){++++}-{4:4}, at: afs_proc_addr_prefs_write+0x2bb/0x14e0 fs/afs/addr_prefs.c:388 Reported-by: syzbot+76f33569875eb708e575@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=76f33569875eb708e575 Signed-off-by: Lizhi Xu Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241226012616.2348907-1-lizhi.xu@windriver.com/ Link: https://lore.kernel.org/r/529850.1736261552@warthog.procyon.org.uk Tested-by: syzbot+76f33569875eb708e575@syzkaller.appspotmail.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/addr_prefs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c index a189ff8a5034..c0384201b8fe 100644 --- a/fs/afs/addr_prefs.c +++ b/fs/afs/addr_prefs.c @@ -413,8 +413,10 @@ int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size) do { argc = afs_split_string(&buf, argv, ARRAY_SIZE(argv)); - if (argc < 0) - return argc; + if (argc < 0) { + ret = argc; + goto done; + } if (argc < 2) goto inval; -- cgit v1.2.3 From df448ca355ce84d62993166294f75517e4128598 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Dec 2024 21:44:38 -0500 Subject: bcachefs: bcachefs_metadata_version_persistent_inode_cursors Persistent cursors for inode allocation. A free inodes btree would add substantial overhead to inode allocation and freeing - a "next num to allocate" cursor is always going to be faster. We just need it to be persistent, to avoid scanning the inodes btree from the start on startup. 
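[Illustrative aside, not part of the patch: the cursor idea in rough, generic form - not the bcachefs code, and the names here are made up. Allocation just advances a persisted index; wrapping back to the minimum of the shard bumps a generation so reused inode numbers stay distinguishable. Persisting the struct is left to the caller, which in the real patch is a key in the logged_ops btree.]

    #include <linux/types.h>

    struct alloc_cursor {
            u64 idx;        /* next number to try */
            u32 gen;        /* bumped whenever idx wraps back to min */
    };

    /* @min/@max bound the shard this CPU allocates from */
    static u64 cursor_next(struct alloc_cursor *c, u64 min, u64 max, u32 *gen)
    {
            if (c->idx < min || c->idx >= max) {
                    c->idx = min;
                    c->gen++;
            }

            *gen = c->gen;
            return c->idx++;        /* caller still verifies the slot is free */
    }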
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 - fs/bcachefs/bcachefs_format.h | 10 +++- fs/bcachefs/btree_update.c | 2 +- fs/bcachefs/inode.c | 120 ++++++++++++++++++++++++++++------------ fs/bcachefs/inode.h | 10 ++++ fs/bcachefs/inode_format.h | 14 ++++- fs/bcachefs/logged_ops.c | 5 +- fs/bcachefs/logged_ops_format.h | 5 +- fs/bcachefs/opts.h | 10 ++-- fs/bcachefs/sb-errors_format.h | 3 +- fs/bcachefs/super-io.c | 5 ++ fs/bcachefs/super.c | 7 +-- 12 files changed, 135 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b749c4ecad1b..161cf2f05d2a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1063,9 +1063,6 @@ struct bch_fs { struct btree_node *verify_ondisk; struct mutex verify_lock; - u64 *unused_inode_hints; - unsigned inode_shard_bits; - /* * A btree node on disk could have too many bsets for an iterator to fit * on the stack - have to dynamically allocate them diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f140c3366e65..09e53bef1a30 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -418,7 +418,8 @@ static inline void bkey_init(struct bkey *k) x(snapshot_tree, 31) \ x(logged_op_truncate, 32) \ x(logged_op_finsert, 33) \ - x(accounting, 34) + x(accounting, 34) \ + x(inode_alloc_cursor, 35) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -682,7 +683,8 @@ struct bch_sb_field_ext { x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ - x(inode_depth, BCH_VERSION(1, 17)) + x(inode_depth, BCH_VERSION(1, 17)) \ + x(persistent_inode_cursors, BCH_VERSION(1, 18)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -850,6 +852,7 @@ LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, struct bch_sb, flags[5], 48, 64); +LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -1347,7 +1350,8 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_set)) \ x(logged_ops, 17, 0, \ BIT_ULL(KEY_TYPE_logged_op_truncate)| \ - BIT_ULL(KEY_TYPE_logged_op_finsert)) \ + BIT_ULL(KEY_TYPE_logged_op_finsert)| \ + BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ x(subvolume_children, 19, 0, \ diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index a4b70e3fe4c3..13d794f201a5 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -588,7 +588,7 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree, struct bpos end) { - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent); + bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); int ret = bkey_err(k); if (ret) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index f6245b78eb78..04ec05206f8c 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -799,6 +799,28 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); } +int 
bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) +{ + int ret = 0; + + bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors, + c, inode_alloc_cursor_inode_bad, + "k.p.inode bad"); +fsck_err: + return ret; +} + +void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k); + + prt_printf(out, "idx %llu generation %llu", + le64_to_cpu(i.v->idx), + le64_to_cpu(i.v->gen)); +} + void bch2_inode_init_early(struct bch_fs *c, struct bch_inode_unpacked *inode_u) { @@ -859,43 +881,78 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -/* - * This just finds an empty slot: - */ -int bch2_inode_create(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode_u, - u32 snapshot, u64 cpu) +static struct bkey_i_inode_alloc_cursor * +bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) { struct bch_fs *c = trans->c; - struct bkey_s_c k; - u64 min, max, start, pos, *hint; - int ret = 0; - unsigned bits = (c->opts.inodes_32bit ? 31 : 63); - if (c->opts.shard_inode_numbers) { - bits -= c->inode_shard_bits; + u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1; + + cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); - min = (cpu << bits); - max = (cpu << bits) | ~(ULLONG_MAX << bits); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_logged_ops, + POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), + BTREE_ITER_cached); + int ret = bkey_err(k); + if (ret) + return ERR_PTR(ret); + + struct bkey_i_inode_alloc_cursor *cursor = + k.k->type == KEY_TYPE_inode_alloc_cursor + ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) + : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); + ret = PTR_ERR_OR_ZERO(cursor); + if (ret) + goto err; - min = max_t(u64, min, BLOCKDEV_INODE_MAX); - hint = c->unused_inode_hints + cpu; + if (c->opts.inodes_32bit) { + *min = BLOCKDEV_INODE_MAX; + *max = INT_MAX; } else { - min = BLOCKDEV_INODE_MAX; - max = ~(ULLONG_MAX << bits); - hint = c->unused_inode_hints; + cursor->v.bits = c->opts.shard_inode_numbers_bits; + + unsigned bits = 63 - c->opts.shard_inode_numbers_bits; + + *min = max(cpu << bits, (u64) INT_MAX + 1); + *max = (cpu << bits) | ~(ULLONG_MAX << bits); } - start = READ_ONCE(*hint); + if (le64_to_cpu(cursor->v.idx) < *min) + cursor->v.idx = cpu_to_le64(*min); + + if (le64_to_cpu(cursor->v.idx) >= *max) { + cursor->v.idx = cpu_to_le64(*min); + le32_add_cpu(&cursor->v.gen, 1); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret ? 
ERR_PTR(ret) : cursor; +} + +/* + * This just finds an empty slot: + */ +int bch2_inode_create(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode_u, + u32 snapshot, u64 cpu) +{ + u64 min, max; + struct bkey_i_inode_alloc_cursor *cursor = + bch2_inode_alloc_cursor_get(trans, cpu, &min, &max); + int ret = PTR_ERR_OR_ZERO(cursor); + if (ret) + return ret; - if (start >= max || start < min) - start = min; + u64 start = le64_to_cpu(cursor->v.idx); + u64 pos = start; - pos = start; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), BTREE_ITER_all_snapshots| BTREE_ITER_intent); + struct bkey_s_c k; again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -925,6 +982,7 @@ again: /* Retry from start */ pos = start = min; bch2_btree_iter_set_pos(iter, POS(0, pos)); + le32_add_cpu(&cursor->v.gen, 1); goto again; found_slot: bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); @@ -935,9 +993,9 @@ found_slot: return ret; } - *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; - inode_u->bi_generation = bkey_generation(k); + inode_u->bi_generation = le64_to_cpu(cursor->v.gen); + cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); return 0; } @@ -999,8 +1057,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; - struct bkey_i_inode_generation delete; - struct bch_inode_unpacked inode_u; struct bkey_s_c k; u32 snapshot; int ret; @@ -1040,13 +1096,7 @@ retry: goto err; } - bch2_inode_unpack(k, &inode_u); - - bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter.pos; - delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - - ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + ret = bch2_btree_delete_at(trans, &iter, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); err: diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 5bca6950f20e..d2e134528f0e 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -68,6 +68,16 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bk .min_val_size = 8, \ }) +int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \ + .key_validate = bch2_inode_alloc_cursor_validate, \ + .val_to_text = bch2_inode_alloc_cursor_to_text, \ + .min_val_size = 16, \ +}) + #if 0 typedef struct { u64 lo; diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index be1e747629d2..b99a5bf1a75e 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -102,7 +102,8 @@ struct bch_inode_generation { x(bi_subvol, 32) \ x(bi_parent_subvol, 32) \ x(bi_nocow, 8) \ - x(bi_depth, 32) + x(bi_depth, 32) \ + x(bi_inodes_32bit, 8) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -115,7 +116,8 @@ struct bch_inode_generation { x(foreground_target, 16) \ x(background_target, 16) \ x(erasure_code, 16) \ - x(nocow, 8) + x(nocow, 8) \ + x(inodes_32bit, 8) enum inode_opt_id { #define x(name, ...) 
\ @@ -165,4 +167,12 @@ LE64_BITMASK(INODEv3_FIELDS_START, struct bch_inode_v3, bi_flags, 31, 36); LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); +struct bch_inode_alloc_cursor { + struct bch_val v; + __u8 bits; + __u8 pad; + __le32 gen; + __le64 idx; +}; + #endif /* _BCACHEFS_INODE_FORMAT_H */ diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 1ac51af16299..75f27ec26f85 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -65,7 +65,8 @@ int bch2_resume_logged_ops(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_max(trans, iter, BTREE_ID_logged_ops, - POS(LOGGED_OPS_INUM, 0), POS(LOGGED_OPS_INUM, U64_MAX), + POS(LOGGED_OPS_INUM_logged_ops, 0), + POS(LOGGED_OPS_INUM_logged_ops, U64_MAX), BTREE_ITER_prefetch, k, resume_logged_op(trans, &iter, k))); bch_err_fn(c, ret); @@ -76,7 +77,7 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) { struct btree_iter iter; int ret = bch2_bkey_get_empty_slot(trans, &iter, - BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM, U64_MAX)); + BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX)); if (ret) return ret; diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h index 0b370a963ac6..cfb67c95d4c8 100644 --- a/fs/bcachefs/logged_ops_format.h +++ b/fs/bcachefs/logged_ops_format.h @@ -2,7 +2,10 @@ #ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H #define _BCACHEFS_LOGGED_OPS_FORMAT_H -#define LOGGED_OPS_INUM 0 +enum logged_ops_inums { + LOGGED_OPS_INUM_logged_ops, + LOGGED_OPS_INUM_inode_cursors, +}; struct bch_logged_op_truncate { struct bch_val v; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index ea69099e681d..e763d52e0f38 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -222,14 +222,14 @@ enum fsck_err_opts { BCH_SB_ERASURE_CODE, false, \ NULL, "Enable erasure coding (DO NOT USE YET)") \ x(inodes_32bit, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, true, \ NULL, "Constrain inode numbers to 32 bits") \ - x(shard_inode_numbers, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_SHARD_INUMS, true, \ + x(shard_inode_numbers_bits, u8, \ + OPT_FS|OPT_FORMAT, \ + OPT_UINT(0, 8), \ + BCH_SB_SHARD_INUMS_NBITS, 0, \ NULL, "Shard new inode numbers by CPU id") \ x(inodes_use_key_cache, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 806486635075..e26317c367f7 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -211,6 +211,7 @@ enum bch_fsck_flags { x(bkey_in_missing_snapshot, 190, 0) \ x(inode_pos_inode_nonzero, 191, 0) \ x(inode_pos_blockdev_range, 192, 0) \ + x(inode_alloc_cursor_inode_bad, 301, 0) \ x(inode_unpack_error, 193, 0) \ x(inode_str_hash_invalid, 194, 0) \ x(inode_v3_fields_start_bad, 195, 0) \ @@ -311,7 +312,7 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 301, 0) + x(MAX, 302, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index b0d52b6ccad4..dbc09e305c27 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -460,6 +460,11 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); } +#ifdef __KERNEL__ + if (!BCH_SB_SHARD_INUMS_NBITS(sb)) + SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus()))); +#endif + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { const struct bch_option *opt = bch2_opt_table + opt_id; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 2b2e0835c8fe..7e97c198efe2 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -586,7 +586,6 @@ static void __bch2_fs_free(struct bch_fs *c) #endif kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); - kfree(c->unused_inode_hints); if (c->write_ref_wq) destroy_workqueue(c->write_ref_wq); @@ -872,8 +871,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); - c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", @@ -900,9 +897,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, c->opts.btree_node_size) || - mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || - !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, - sizeof(u64), GFP_KERNEL))) { + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { ret = -BCH_ERR_ENOMEM_fs_other_alloc; goto err; } -- cgit v1.2.3 From 3db3084a86c3690e4b50b8579b0b1bbc29897375 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Dec 2024 14:19:30 -0500 Subject: bcachefs: bcachefs_metadata_version_autofix_errors It's time to make self healing the default: change the error action for old filesystems to fix_safe, matching the default for current filesystems. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/recovery.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 09e53bef1a30..b0fac8b7915b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -684,7 +684,8 @@ struct bch_sb_field_ext { x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ x(inode_depth, BCH_VERSION(1, 17)) \ - x(persistent_inode_cursors, BCH_VERSION(1, 18)) + x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ + x(autofix_errors, BCH_VERSION(1, 19)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 383e03606d6e..98825437381c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -781,6 +781,11 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) { + SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe); + write_sb = true; + } + if (write_sb) bch2_write_super(c); mutex_unlock(&c->sb_lock); -- cgit v1.2.3 From d01ea14da718f7c30520c8498a7ba952b099ad83 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 12 Nov 2024 16:15:47 +0800 Subject: bcachefs: add counter_flags for counters In bcachefs, the io_read and io_write counters record the amount of data which has been read and written. They increase in units of sectors, so to display correctly they need to be shifted left by the sector size (9 bits). Other counters like io_move and move_extent_{read, write, finish} also have this problem. In order to support different units, we add an extra column to mark the counter type, using TYPE_COUNTER and TYPE_SECTORS in BCH_PERSISTENT_COUNTERS().
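[Illustrative aside, not part of the patch: what the TYPE_SECTORS flag means for display, with an assumed helper name. A sector is 512 bytes, so sector-valued counters are shifted left by 9 to become bytes before human-readable printing, while plain event counters are printed as-is.]

    #include <linux/printk.h>
    #include <linux/types.h>

    static void show_counter(u64 raw, bool is_sectors)
    {
            if (is_sectors)
                    pr_info("%llu bytes\n", raw << 9);      /* 512-byte sectors -> bytes */
            else
                    pr_info("%llu events\n", raw);
    }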
Fixes: 1c6fdbd8f246 ("bcachefs: Initial commit") Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-counters_format.h | 165 ++++++++++++++++++++------------------- fs/bcachefs/sysfs.c | 9 ++- 2 files changed, 93 insertions(+), 81 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index 62ea478215d0..fdcf598f08b1 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -2,86 +2,91 @@ #ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H #define _BCACHEFS_SB_COUNTERS_FORMAT_H -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0) \ - x(io_write, 1) \ - x(io_move, 2) \ - x(bucket_invalidate, 3) \ - x(bucket_discard, 4) \ - x(bucket_alloc, 5) \ - x(bucket_alloc_fail, 6) \ - x(btree_cache_scan, 7) \ - x(btree_cache_reap, 8) \ - x(btree_cache_cannibalize, 9) \ - x(btree_cache_cannibalize_lock, 10) \ - x(btree_cache_cannibalize_lock_fail, 11) \ - x(btree_cache_cannibalize_unlock, 12) \ - x(btree_node_write, 13) \ - x(btree_node_read, 14) \ - x(btree_node_compact, 15) \ - x(btree_node_merge, 16) \ - x(btree_node_split, 17) \ - x(btree_node_rewrite, 18) \ - x(btree_node_alloc, 19) \ - x(btree_node_free, 20) \ - x(btree_node_set_root, 21) \ - x(btree_path_relock_fail, 22) \ - x(btree_path_upgrade_fail, 23) \ - x(btree_reserve_get_fail, 24) \ - x(journal_entry_full, 25) \ - x(journal_full, 26) \ - x(journal_reclaim_finish, 27) \ - x(journal_reclaim_start, 28) \ - x(journal_write, 29) \ - x(read_promote, 30) \ - x(read_bounce, 31) \ - x(read_split, 33) \ - x(read_retry, 32) \ - x(read_reuse_race, 34) \ - x(move_extent_read, 35) \ - x(move_extent_write, 36) \ - x(move_extent_finish, 37) \ - x(move_extent_fail, 38) \ - x(move_extent_start_fail, 39) \ - x(copygc, 40) \ - x(copygc_wait, 41) \ - x(gc_gens_end, 42) \ - x(gc_gens_start, 43) \ - x(trans_blocked_journal_reclaim, 44) \ - x(trans_restart_btree_node_reused, 45) \ - x(trans_restart_btree_node_split, 46) \ - x(trans_restart_fault_inject, 47) \ - x(trans_restart_iter_upgrade, 48) \ - x(trans_restart_journal_preres_get, 49) \ - x(trans_restart_journal_reclaim, 50) \ - x(trans_restart_journal_res_get, 51) \ - x(trans_restart_key_cache_key_realloced, 52) \ - x(trans_restart_key_cache_raced, 53) \ - x(trans_restart_mark_replicas, 54) \ - x(trans_restart_mem_realloced, 55) \ - x(trans_restart_memory_allocation_failure, 56) \ - x(trans_restart_relock, 57) \ - x(trans_restart_relock_after_fill, 58) \ - x(trans_restart_relock_key_cache_fill, 59) \ - x(trans_restart_relock_next_node, 60) \ - x(trans_restart_relock_parent_for_fill, 61) \ - x(trans_restart_relock_path, 62) \ - x(trans_restart_relock_path_intent, 63) \ - x(trans_restart_too_many_iters, 64) \ - x(trans_restart_traverse, 65) \ - x(trans_restart_upgrade, 66) \ - x(trans_restart_would_deadlock, 67) \ - x(trans_restart_would_deadlock_write, 68) \ - x(trans_restart_injected, 69) \ - x(trans_restart_key_cache_upgrade, 70) \ - x(trans_traverse_all, 71) \ - x(transaction_commit, 72) \ - x(write_super, 73) \ - x(trans_restart_would_deadlock_recursion_limit, 74) \ - x(trans_restart_write_buffer_flush, 75) \ - x(trans_restart_split_race, 76) \ - x(write_buffer_flush_slowpath, 77) \ - x(write_buffer_flush_sync, 78) +enum counters_flags { + TYPE_COUNTER = BIT(0), /* event counters */ + TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */ +}; + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0, TYPE_SECTORS) \ + x(io_write, 1, TYPE_SECTORS) \ + x(io_move, 2, TYPE_SECTORS) \ + x(bucket_invalidate, 
3, TYPE_COUNTER) \ + x(bucket_discard, 4, TYPE_COUNTER) \ + x(bucket_alloc, 5, TYPE_COUNTER) \ + x(bucket_alloc_fail, 6, TYPE_COUNTER) \ + x(btree_cache_scan, 7, TYPE_COUNTER) \ + x(btree_cache_reap, 8, TYPE_COUNTER) \ + x(btree_cache_cannibalize, 9, TYPE_COUNTER) \ + x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \ + x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \ + x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \ + x(btree_node_write, 13, TYPE_COUNTER) \ + x(btree_node_read, 14, TYPE_COUNTER) \ + x(btree_node_compact, 15, TYPE_COUNTER) \ + x(btree_node_merge, 16, TYPE_COUNTER) \ + x(btree_node_split, 17, TYPE_COUNTER) \ + x(btree_node_rewrite, 18, TYPE_COUNTER) \ + x(btree_node_alloc, 19, TYPE_COUNTER) \ + x(btree_node_free, 20, TYPE_COUNTER) \ + x(btree_node_set_root, 21, TYPE_COUNTER) \ + x(btree_path_relock_fail, 22, TYPE_COUNTER) \ + x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \ + x(btree_reserve_get_fail, 24, TYPE_COUNTER) \ + x(journal_entry_full, 25, TYPE_COUNTER) \ + x(journal_full, 26, TYPE_COUNTER) \ + x(journal_reclaim_finish, 27, TYPE_COUNTER) \ + x(journal_reclaim_start, 28, TYPE_COUNTER) \ + x(journal_write, 29, TYPE_COUNTER) \ + x(read_promote, 30, TYPE_COUNTER) \ + x(read_bounce, 31, TYPE_COUNTER) \ + x(read_split, 33, TYPE_COUNTER) \ + x(read_retry, 32, TYPE_COUNTER) \ + x(read_reuse_race, 34, TYPE_COUNTER) \ + x(move_extent_read, 35, TYPE_SECTORS) \ + x(move_extent_write, 36, TYPE_SECTORS) \ + x(move_extent_finish, 37, TYPE_SECTORS) \ + x(move_extent_fail, 38, TYPE_COUNTER) \ + x(move_extent_start_fail, 39, TYPE_COUNTER) \ + x(copygc, 40, TYPE_COUNTER) \ + x(copygc_wait, 41, TYPE_COUNTER) \ + x(gc_gens_end, 42, TYPE_COUNTER) \ + x(gc_gens_start, 43, TYPE_COUNTER) \ + x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \ + x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \ + x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \ + x(trans_restart_fault_inject, 47, TYPE_COUNTER) \ + x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \ + x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \ + x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \ + x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \ + x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \ + x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \ + x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \ + x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \ + x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \ + x(trans_restart_relock, 57, TYPE_COUNTER) \ + x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \ + x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \ + x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \ + x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \ + x(trans_restart_relock_path, 62, TYPE_COUNTER) \ + x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \ + x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \ + x(trans_restart_traverse, 65, TYPE_COUNTER) \ + x(trans_restart_upgrade, 66, TYPE_COUNTER) \ + x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \ + x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \ + x(trans_restart_injected, 69, TYPE_COUNTER) \ + x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \ + x(trans_traverse_all, 71, TYPE_COUNTER) \ + x(transaction_commit, 72, TYPE_COUNTER) \ + x(write_super, 73, TYPE_COUNTER) \ + x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \ + x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \ + x(trans_restart_split_race, 76, TYPE_COUNTER) \ + 
x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \ + x(write_buffer_flush_sync, 78, TYPE_COUNTER) enum bch_persistent_counters { #define x(t, n, ...) BCH_COUNTER_##t, diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 48bc6ad03f09..a7eb1f511484 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -505,15 +505,22 @@ SHOW(bch2_fs_counters) printbuf_tabstop_push(out, 32); - #define x(t, ...) \ + #define x(t, n, f, ...) \ if (attr == &sysfs_##t) { \ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ + if (f & TYPE_SECTORS) { \ + counter <<= 9; \ + counter_since_mount <<= 9; \ + } \ + \ prt_printf(out, "since mount:\t"); \ + (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\ prt_human_readable_u64(out, counter_since_mount); \ prt_newline(out); \ \ prt_printf(out, "since filesystem creation:\t"); \ + (f & TYPE_COUNTER) ? prt_u64(out, counter) : \ prt_human_readable_u64(out, counter); \ prt_newline(out); \ } -- cgit v1.2.3 From 0475c7639edf3282783b04913889d2a6369d541e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 00:55:48 -0500 Subject: bcachefs: better check_bp_exists() error message Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index b93ddfa00fdd..ebeb6a5ff9d2 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -566,11 +566,11 @@ check_existing_bp: goto err; missing: printbuf_reset(&buf); - prt_str(&buf, "missing backpointer "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); - prt_newline(&buf); + prt_str(&buf, "missing backpointer\n for: "); bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\n got: "); + prt_printf(&buf, "\n want: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); + prt_printf(&buf, "\n got: "); bch2_bkey_val_to_text(&buf, c, bp_k); if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) -- cgit v1.2.3 From 6542afe299eaa1be530caa56c455492f470896cb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Dec 2024 01:52:54 -0500 Subject: bcachefs: Drop racy warning Checking for writing past i_size after unlocking the folio and clearing the dirty bit is racy, and we already check it at the start. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index ff8b8df50bf3..ab1d5db2fa56 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -625,15 +625,6 @@ do_io: BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, sectors << 9, offset << 9)); - /* Check for writing past i_size: */ - WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_emergency_ro, &c->flags), - "writing past i_size: %llu > %llu (unrounded %llu)\n", - bio_end_sector(&w->io->op.wbio.bio) << 9, - round_up(i_size, block_bytes(c)), - i_size); - w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; w->io->op.new_i_size = i_size; -- cgit v1.2.3 From 30e32692d6b898338db26f6caf6fef295ce7ec68 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Dec 2024 02:03:11 -0500 Subject: bcachefs: Drop redundant "read error" call from btree_gc The btree node read error path already calls topology error, so this is entirely redundant, and we're not specific enough about our error codes - this was triggering for bucket_ref_update() errors. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5aa11ca08c94..721dca551720 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -733,16 +733,8 @@ static int bch2_gc_btrees(struct bch_fs *c) continue; ret = bch2_gc_btree(trans, btree, true); - - if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), - trans, btree_node_read_error, - "btree node read error for %s", - (printbuf_reset(&buf), - bch2_btree_id_to_text(&buf, btree), - buf.buf))) - ret = bch2_btree_lost_data(c, btree); } -fsck_err: + printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); -- cgit v1.2.3 From baf13d83441b8c25fcb09097ec629a8388d1c505 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Dec 2024 02:24:30 -0500 Subject: bcachefs: kill __bch2_btree_iter_flags() bch2_btree_iter_flags() now takes a level parameter; this fixes a bug where using a node iterator on a leaf wouldn't set BTREE_ITER_with_key_cache, leading to fun cache coherency bugs. 
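[Illustrative aside, not part of the patch: the level dependence in simplified form, using the flag names from the diff below but not the real helper. The key cache only holds leaf keys of cached btrees, so only a level-0 iterator on such a btree should end up with BTREE_ITER_with_key_cache set.]

    static unsigned iter_flags_for_level(bool btree_is_cached, unsigned level,
                                         unsigned flags)
    {
            if (level || !btree_is_cached)
                    flags &= ~(BTREE_ITER_cached|BTREE_ITER_with_key_cache);
            else if (!(flags & BTREE_ITER_cached))
                    flags |= BTREE_ITER_with_key_cache;

            return flags;
    }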
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 7 +++++-- fs/bcachefs/btree_iter.h | 28 +++++++++++----------------- 2 files changed, 16 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a1c5fcced24e..291eb5eb0203 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3032,7 +3032,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans, unsigned flags) { bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, 0, flags), _RET_IP_); } @@ -3048,8 +3048,11 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, flags |= BTREE_ITER_snapshot_field; flags |= BTREE_ITER_all_snapshots; + if (!depth && btree_id_cached(trans->c, btree_id)) + flags |= BTREE_ITER_with_key_cache; + bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, - __bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, depth, flags), _RET_IP_); iter->min_depth = depth; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 3477fc8c0396..e23608d2a26d 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -446,10 +446,17 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); -static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, - unsigned btree_id, - unsigned flags) +static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, + unsigned level, + unsigned flags) { + if (level || !btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_cached; + flags &= ~BTREE_ITER_with_key_cache; + } else if (!(flags & BTREE_ITER_cached)) + flags |= BTREE_ITER_with_key_cache; + if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) && btree_id_is_extents(btree_id)) flags |= BTREE_ITER_is_extents; @@ -468,19 +475,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, return flags; } -static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, - unsigned btree_id, - unsigned flags) -{ - if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_cached; - flags &= ~BTREE_ITER_with_key_cache; - } else if (!(flags & BTREE_ITER_cached)) - flags |= BTREE_ITER_with_key_cache; - - return __bch2_btree_iter_flags(trans, btree_id, flags); -} - static inline void bch2_trans_iter_init_common(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, @@ -517,7 +511,7 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans, if (__builtin_constant_p(btree_id) && __builtin_constant_p(flags)) bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, 0, flags), _THIS_IP_); else bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); -- cgit v1.2.3 From 861cd0f60624440b51e3ff14b04055b59c13d1e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jun 2024 17:01:31 -0400 Subject: bcachefs: Write lock btree node in key cache fills this addresses a key cache coherency bug Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c 
index 4eba2871f289..382f99b774b8 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -197,7 +197,9 @@ out: return ck; } -static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path, +static int btree_key_cache_create(struct btree_trans *trans, + struct btree_path *path, + struct btree_path *ck_path, struct bkey_s_c k) { struct bch_fs *c = trans->c; @@ -217,7 +219,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * key_u64s = min(256U, (key_u64s * 3) / 2); key_u64s = roundup_pow_of_two(key_u64s); - struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s); + struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s); int ret = PTR_ERR_OR_ZERO(ck); if (ret) return ret; @@ -226,19 +228,19 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_id_str(path->btree_id)); + bch2_btree_id_str(ck_path->btree_id)); return -BCH_ERR_ENOMEM_btree_key_cache_create; } } ck->c.level = 0; - ck->c.btree_id = path->btree_id; - ck->key.btree_id = path->btree_id; - ck->key.pos = path->pos; + ck->c.btree_id = ck_path->btree_id; + ck->key.btree_id = ck_path->btree_id; + ck->key.pos = ck_path->pos; ck->flags = 1U << BKEY_CACHED_ACCESSED; if (unlikely(key_u64s > ck->u64s)) { - mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); + mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); struct bkey_i *new_k = allocate_dropping_locks(trans, ret, kmalloc(key_u64s * sizeof(u64), _gfp)); @@ -258,22 +260,29 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * bkey_reassemble(ck->k, k); + ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c); + if (unlikely(ret)) + goto err; + ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); + + bch2_btree_node_unlock_write(trans, path, path_l(path)->b); + if (unlikely(ret)) /* raced with another fill? 
*/ goto err; atomic_long_inc(&bc->nr_keys); six_unlock_write(&ck->c.lock); - enum six_lock_type lock_want = __btree_lock_want(path, 0); + enum six_lock_type lock_want = __btree_lock_want(ck_path, 0); if (lock_want == SIX_LOCK_read) six_lock_downgrade(&ck->c.lock); - btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); - path->uptodate = BTREE_ITER_UPTODATE; + btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want); + ck_path->uptodate = BTREE_ITER_UPTODATE; return 0; err: bkey_cached_free(bc, ck); - mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); + mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); return ret; } @@ -293,6 +302,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, + BTREE_ITER_intent| BTREE_ITER_key_cache_fill| BTREE_ITER_cached_nofill); iter.flags &= ~BTREE_ITER_with_journal; @@ -306,7 +316,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, if (unlikely(ret)) goto out; - ret = btree_key_cache_create(trans, ck_path, k); + ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k); if (ret) goto err; -- cgit v1.2.3 From cf3da2d627037a0848e25dbaeaedfadfe534c7c4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Dec 2024 13:58:02 -0500 Subject: bcachefs: Handle -BCH_ERR_need_mark_replicas in gc Locking considerations (possibly no longer relevant?) mean that when an accounting update needs a new superblock replicas entry to be created, it's deferred to the transaction commit error path. But accounting updates for gc/fcsk aren't done from the transaction commit path - so we need to handle -BCH_ERR_btree_insert_need_mark_replicas locally. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 72c8dcb9226f..b32e91ba8be8 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -79,6 +79,8 @@ static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_ memcpy_u64s_small(acc->v.d, d, nr); } +static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); + int bch2_disk_accounting_mod(struct btree_trans *trans, struct disk_accounting_pos *k, s64 *d, unsigned nr, bool gc) @@ -96,9 +98,16 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, accounting_key_init(&k_i.k, k, d, nr); - return likely(!gc) - ? bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k) - : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + if (unlikely(gc)) { + int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + if (ret == -BCH_ERR_btree_insert_need_mark_replicas) + ret = drop_locks_do(trans, + bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: + bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + return ret; + } else { + return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); + } } int bch2_mod_dev_cached_sectors(struct btree_trans *trans, -- cgit v1.2.3 From fa3e5135e4e7bfb38598a58caf592c940eff4b03 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Dec 2024 16:41:25 -0500 Subject: bcachefs: Fix assert for online fsck We can't check if we're racing with fsck ending until mark_lock is held. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index fc1b673689c8..5360cbb3ec29 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -138,7 +138,8 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, bpos_to_disk_accounting_pos(&acc_k, a.k->p); bool gc = mode == BCH_ACCOUNTING_gc; - EBUG_ON(gc && !acc->gc_running); + if (gc && !acc->gc_running) + return 0; if (!bch2_accounting_is_mem(acc_k)) return 0; -- cgit v1.2.3 From 45414083913f68cb7586924767a3fb2cc7710352 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Dec 2024 05:20:01 -0500 Subject: bcachefs: bch2_kvmalloc() Add a version of kvmalloc() that doesn't have the INT_MAX limit; large filesystems do hit this. We'll want to get rid of the in-memory bucket gens array, but we're not there quite yet. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 6 +++--- fs/bcachefs/util.h | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 56d3e3800a89..345b117a4a4a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1258,7 +1258,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c) for_each_member_device(c, ca) { BUG_ON(ca->buckets_nouse); - ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * + ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO); if (!ca->buckets_nouse) { @@ -1290,8 +1290,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (resize && ca->buckets_nouse) return -BCH_ERR_no_resize_with_buckets_nouse; - bucket_gens = kvmalloc(struct_size(bucket_gens, b, nbuckets), - GFP_KERNEL|__GFP_ZERO); + bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), + GFP_KERNEL|__GFP_ZERO); if (!bucket_gens) { ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index c292b9ce8240..1a1720116071 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -55,6 +55,16 @@ static inline size_t buf_pages(void *p, size_t len) PAGE_SIZE); } +static inline void *bch2_kvmalloc(size_t n, gfp_t flags) +{ + void *p = unlikely(n >= INT_MAX) + ? vmalloc(n) + : kvmalloc(n, flags & ~__GFP_ZERO); + if (p && (flags & __GFP_ZERO)) + memset(p, 0, n); + return p; +} + #define init_heap(heap, _size, gfp) \ ({ \ (heap)->nr = 0; \ -- cgit v1.2.3 From b5e4cd0871db885de05d531e8e72fa6059d81bd4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Dec 2024 23:56:42 -0500 Subject: bcachefs: Don't rely on snapshot_tree.master_subvol for reattaching Previously, fsck used the snapshot tree's master subvol for finding the root inode number - but the master subvol might have been deleting, and setting a new one should be a user operation; meaning we can't rely on it existing. Fortunately, for finding the root inode number in a tree of snapshots, finding any associated subvolume works. 
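[Illustrative aside, not part of the patch: the wrapper pattern and a typical call, under an assumed function name. kvmalloc() refuses INT_MAX-scale allocations, so those fall back to plain vmalloc(); since vmalloc() here isn't passed the zeroing flag, __GFP_ZERO is emulated with memset().]

    #include <linux/slab.h>
    #include <linux/vmalloc.h>
    #include <linux/string.h>

    static void *huge_kvmalloc(size_t n, gfp_t flags)
    {
            void *p = unlikely(n >= INT_MAX)
                    ? vmalloc(n)
                    : kvmalloc(n, flags & ~__GFP_ZERO);

            if (p && (flags & __GFP_ZERO))
                    memset(p, 0, n);
            return p;
    }

    /* e.g. one byte of generation data per bucket on a very large device */
    static u8 *alloc_bucket_gens(u64 nbuckets)
    {
            return huge_kvmalloc(nbuckets, GFP_KERNEL|__GFP_ZERO);
    }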
Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/fsck.c | 52 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index c0df2587a580..4590cd0c7c90 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -118,6 +118,7 @@ x(ENOENT, ENOENT_dev_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOENT, ENOENT_inode_no_backpointer) \ + x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(EEXIST, EEXIST_str_hash_set) \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ea8c8ed06940..206fc046610a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -205,6 +205,36 @@ err: return ret; } +/* + * Find any subvolume associated with a tree of snapshots + * We can't rely on master_subvol - it might have been deleted. + */ +static int find_snapshot_tree_subvol(struct btree_trans *trans, + u32 tree_id, u32 *subvol) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + if (le32_to_cpu(s.v->tree) != tree_id) + continue; + + if (s.v->subvol) { + *subvol = le32_to_cpu(s.v->subvol); + goto found; + } + } + ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol; +found: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + /* Get lost+found, create if it doesn't exist: */ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, struct bch_inode_unpacked *lostfound, @@ -223,19 +253,24 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, if (ret) return ret; - subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; + u32 subvolid; + ret = find_snapshot_tree_subvol(trans, + bch2_snapshot_tree(c, snapshot), &subvolid); + bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u", + bch2_snapshot_tree(c, snapshot)); + if (ret) + return ret; struct bch_subvolume subvol; - ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), false, &subvol); - bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", - le32_to_cpu(st.master_subvol), snapshot); + ret = bch2_subvolume_get(trans, subvolid, false, &subvol); + bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot); if (ret) return ret; if (!subvol.inode) { struct btree_iter iter; struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)), + BTREE_ID_subvolumes, POS(0, subvolid), 0, subvolume); ret = PTR_ERR_OR_ZERO(subvol); if (ret) @@ -245,13 +280,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, bch2_trans_iter_exit(trans, &iter); } - root_inum.inum = le64_to_cpu(subvol.inode); + subvol_inum root_inum = { + .subvol = subvolid, + .inum = le64_to_cpu(subvol.inode) + }; struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", - root_inum.inum, le32_to_cpu(st.master_subvol)); + root_inum.inum, subvolid); if (ret) return ret; -- cgit v1.2.3 From 4bd06f07bcb5c472b7d7a90e6ee890bd0900b3e1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Dec 2024 04:46:00 -0500 Subject: bcachefs: Fixes 
for snapshot_tree.master_subvol Ensure that snapshot_tree.master_subvol is cleared when we delete the master subvolume in a tree of snapshots, and allow for snapshot trees that don't have a master subvolume in fsck. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 12 +++++++----- fs/bcachefs/subvolume.c | 50 +++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 47 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 1506445eaaf4..cf6b3256d188 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -495,6 +495,9 @@ static int check_snapshot_tree(struct btree_trans *trans, goto err; } + if (!st.v->master_subvol) + goto out; + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -538,6 +541,7 @@ static int check_snapshot_tree(struct btree_trans *trans, u->v.master_subvol = cpu_to_le32(subvol_id); st = snapshot_tree_i_to_s_c(u); } +out: err: fsck_err: bch2_trans_iter_exit(trans, &snapshot_iter); @@ -1037,13 +1041,11 @@ fsck_err: int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) { struct btree_iter iter; - struct bkey_i_snapshot *s; - int ret = 0; - - s = bch2_bkey_get_mut_typed(trans, &iter, + struct bkey_i_snapshot *s = + bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), 0, snapshot); - ret = PTR_ERR_OR_ZERO(s); + int ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, "missing snapshot %u", id); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 0e756e35c3d9..e3d0475232e5 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -409,26 +409,56 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d */ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { - struct btree_iter iter; - struct bkey_s_c_subvolume subvol; - u32 snapid; - int ret = 0; + struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {}; - subvol = bch2_bkey_get_iter_typed(trans, &iter, + struct bkey_s_c_subvolume subvol = + bch2_bkey_get_iter_typed(trans, &subvol_iter, BTREE_ID_subvolumes, POS(0, subvolid), BTREE_ITER_cached|BTREE_ITER_intent, subvolume); - ret = bkey_err(subvol); + int ret = bkey_err(subvol); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, "missing subvolume %u", subvolid); if (ret) - return ret; + goto err; - snapid = le32_to_cpu(subvol.v->snapshot); + u32 snapid = le32_to_cpu(subvol.v->snapshot); + + struct bkey_s_c_snapshot snapshot = + bch2_bkey_get_iter_typed(trans, &snapshot_iter, + BTREE_ID_snapshots, POS(0, snapid), + 0, snapshot); + ret = bkey_err(subvol); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing snapshot %u", snapid); + if (ret) + goto err; + + u32 treeid = le32_to_cpu(snapshot.v->tree); - ret = bch2_btree_delete_at(trans, &iter, 0) ?: + struct bkey_s_c_snapshot_tree snapshot_tree = + bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter, + BTREE_ID_snapshot_trees, POS(0, treeid), + 0, snapshot_tree); + + if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) { + struct bkey_i_snapshot_tree *snapshot_tree_mut = + bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter, + &snapshot_tree.s_c, + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(snapshot_tree_mut); + if (ret) + goto err; + + snapshot_tree_mut->v.master_subvol = 0; + } + + ret = bch2_btree_delete_at(trans, 
&subvol_iter, 0) ?: bch2_snapshot_node_set_deleted(trans, snapid); - bch2_trans_iter_exit(trans, &iter); +err: + bch2_trans_iter_exit(trans, &snapshot_tree_iter); + bch2_trans_iter_exit(trans, &snapshot_iter); + bch2_trans_iter_exit(trans, &subvol_iter); return ret; } -- cgit v1.2.3 From bc6fce7870ceda943fdc8451d42407dacb1519e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Dec 2024 03:31:00 -0500 Subject: bcachefs: bch2_btree_node_write_trans() Avoiding screwing up path->lock_seq. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 32 ++++++++++++++++++++++++++++++-- fs/bcachefs/btree_io.h | 6 ++++-- fs/bcachefs/btree_locking.h | 19 ++++++++++++------- fs/bcachefs/btree_trans_commit.c | 2 +- fs/bcachefs/btree_update_interior.c | 16 ++++++++-------- 5 files changed, 55 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d99f8a78d286..e371e60e3133 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -489,8 +489,8 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) if (b->nsets == MAX_BSETS && !btree_node_write_in_flight(b) && should_compact_all(c, b)) { - bch2_btree_node_write(c, b, SIX_LOCK_write, - BTREE_WRITE_init_next_bset); + bch2_btree_node_write_trans(trans, b, SIX_LOCK_write, + BTREE_WRITE_init_next_bset); reinit_iter = true; } @@ -2345,6 +2345,34 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, } } +void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b, + enum six_lock_type lock_type_held, + unsigned flags) +{ + struct bch_fs *c = trans->c; + + if (lock_type_held == SIX_LOCK_intent || + (lock_type_held == SIX_LOCK_read && + six_lock_tryupgrade(&b->c.lock))) { + __bch2_btree_node_write(c, b, flags); + + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b) && + six_trylock_write(&b->c.lock)) { + bch2_btree_post_write_cleanup(c, b); + __bch2_btree_node_unlock_write(trans, b); + } + + if (lock_type_held == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); + } else { + __bch2_btree_node_write(c, b, flags); + if (lock_type_held == SIX_LOCK_write && + btree_node_just_written(b)) + bch2_btree_post_write_cleanup(c, b); + } +} + static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) { struct bucket_table *tbl; diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 9b01ca3de907..6f9e4a6dacf7 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -144,11 +144,13 @@ enum btree_write_flags { void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); void bch2_btree_node_write(struct bch_fs *, struct btree *, enum six_lock_type, unsigned); +void bch2_btree_node_write_trans(struct btree_trans *, struct btree *, + enum six_lock_type, unsigned); -static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, +static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b, enum six_lock_type lock_held) { - bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); + bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); } bool bch2_btree_flush_all_reads(struct bch_fs *); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 7474ab6ce019..fb3d04ddcb40 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -163,22 +163,27 @@ static inline void __bch2_btree_path_unlock(struct btree_trans *trans, * succeed: */ static inline void 
-bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, - struct btree *b) +__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b) { struct btree_path *linked; unsigned i; + trans_for_each_path_with_node(trans, b, linked, i) + linked->l[b->c.level].lock_seq++; + + six_unlock_write(&b->c.lock); +} + +static inline void +bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, + struct btree *b) +{ EBUG_ON(path->l[b->c.level].b != b); EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - - trans_for_each_path_with_node(trans, b, linked, i) - linked->l[b->c.level].lock_seq++; - - six_unlock_write(&b->c.lock); + __bch2_btree_node_unlock_write(trans, b); } void bch2_btree_node_unlock_write(struct btree_trans *, diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index c3a3bfd11e8c..2f1dd516318e 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -249,7 +249,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, new |= 1 << BTREE_NODE_need_write; } while (!try_cmpxchg(&b->flags, &old, new)); - btree_node_write_if_need(c, b, SIX_LOCK_read); + btree_node_write_if_need(trans, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); bch2_trans_put(trans); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 03a6eba7403d..76c8602601dd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -803,7 +803,7 @@ err: mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); six_unlock_write(&b->c.lock); - btree_node_write_if_need(c, b, SIX_LOCK_intent); + btree_node_write_if_need(trans, b, SIX_LOCK_intent); btree_node_unlock(trans, path, b->c.level); bch2_path_put(trans, path_idx, true); } @@ -824,7 +824,7 @@ err: b = as->new_nodes[i]; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - btree_node_write_if_need(c, b, SIX_LOCK_read); + btree_node_write_if_need(trans, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); } @@ -1709,14 +1709,14 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (n3) { bch2_btree_update_get_open_buckets(as, n3); - bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0); } if (n2) { bch2_btree_update_get_open_buckets(as, n2); - bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0); } bch2_btree_update_get_open_buckets(as, n1); - bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0); /* * The old node must be freed (in memory) _before_ unlocking the new @@ -1911,7 +1911,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans * BUG_ON(ret); bch2_btree_update_get_open_buckets(as, n); - bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); bch2_trans_node_add(trans, path, n); six_unlock_intent(&n->c.lock); @@ -2104,7 +2104,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_trans_verify_paths(trans); bch2_btree_update_get_open_buckets(as, n); - bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); 
bch2_btree_node_free_inmem(trans, trans->paths + path, b); bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m); @@ -2181,7 +2181,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_interior_update_will_free_node(as, b); bch2_btree_update_get_open_buckets(as, n); - bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b); -- cgit v1.2.3 From f908eacc34c7e8092bd47b7fcf5741943d3b734c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Dec 2024 04:14:28 -0500 Subject: bcachefs: fix bch2_btree_key_cache_drop() When evicting, we shouldn't leave a pointer to the key cache entry lying around - that screws up btree path asserts we're adding. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 382f99b774b8..7636a5e97409 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -613,8 +613,18 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, bkey_cached_free(bc, ck); mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - path->should_be_locked = false; + + struct btree_path *path2; + unsigned i; + trans_for_each_path(trans, path2, i) + if (path2->l[0].b == (void *) ck) { + __bch2_btree_path_unlock(trans, path2); + path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); + path2->should_be_locked = false; + btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE); + } + + bch2_trans_verify_locks(trans); } static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, -- cgit v1.2.3 From 6adc5af50a5401a19c1f23b3ed2ebfd3a0d8cdc6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Dec 2024 02:55:03 -0500 Subject: bcachefs: btree_path_very_locks(): verify lock seq If the btree_path's lock seq is wrong, the next bch2_trans_relock() operation is guaranteed to fail and we take an unnecessary transaction restart. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index d343df9f0ad2..b339c209345a 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -856,6 +856,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path) (want == BTREE_NODE_UNLOCKED || have != BTREE_NODE_WRITE_LOCKED) && want != have); + + BUG_ON(btree_node_locked(path, l) && + path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock)); } } -- cgit v1.2.3 From ce9a21713bfd9744ae4cbfef1b4d8bc895ab7283 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Dec 2024 05:11:46 -0500 Subject: bcachefs: bch2_inum_path() no longer returns an error for disconnected inums bch2_inum_path() should work even if the filesystem is corrupted - we don't want it to cause fsck to fail. 
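As a purely illustrative aside (not part of this patch): the path string here is assembled leaf-to-root, so each name component is appended in reverse and the whole buffer is flipped once at the end - the same trick the prt_bytes_reversed()/reverse_bytes() helpers in the diff below implement. A minimal stand-alone C sketch of that idea, with invented names and plain C strings:

#include <stdio.h>
#include <string.h>

static void append_reversed(char *buf, size_t *pos, const char *s)
{
	/* copy the component backwards; the final reverse() flips it the right way round */
	for (size_t n = strlen(s); n--; )
		buf[(*pos)++] = s[n];
}

static void reverse(char *b, size_t n)
{
	for (size_t i = 0; i < n / 2; i++) {
		char tmp = b[i];
		b[i] = b[n - 1 - i];
		b[n - 1 - i] = tmp;
	}
}

int main(void)
{
	const char *leaf_to_root[] = { "file", "dir", "home" };	/* walked upwards, leaf first */
	char buf[64];
	size_t pos = 0;

	for (int i = 0; i < 3; i++) {
		append_reversed(buf, &pos, leaf_to_root[i]);
		buf[pos++] = '/';
	}
	reverse(buf, pos);
	buf[pos] = '\0';
	printf("%s\n", buf);	/* prints "/home/dir/file" */
	return 0;
}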
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-common.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 7d279f211312..63619318c64a 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -574,6 +574,11 @@ static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsig printbuf_nul_terminate(out); } +static inline void prt_str_reversed(struct printbuf *out, const char *s) +{ + prt_bytes_reversed(out, s, strlen(s)); +} + static inline void reverse_bytes(void *b, size_t n) { char *e = b + n, *s = b; @@ -596,17 +601,17 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb struct bch_inode_unpacked inode; ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); if (ret) - goto err; + goto disconnected; if (!inode.bi_dir && !inode.bi_dir_offset) { ret = -BCH_ERR_ENOENT_inode_no_backpointer; - goto err; + goto disconnected; } u32 snapshot; ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - goto err; + goto disconnected; struct btree_iter d_iter; struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, @@ -614,7 +619,7 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb 0, dirent); ret = bkey_err(d.s_c); if (ret) - goto err; + goto disconnected; struct qstr dirent_name = bch2_dirent_get_name(d); prt_bytes_reversed(path, dirent_name.name, dirent_name.len); @@ -630,7 +635,7 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb if (orig_pos == path->pos) prt_char(path, '/'); - +out: ret = path->allocation_failure ? -ENOMEM : 0; if (ret) goto err; @@ -639,4 +644,10 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb return 0; err: return ret; +disconnected: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + + prt_str_reversed(path, "(disconnected)"); + goto out; } -- cgit v1.2.3 From bdedae70f51f570381095a59740d6a9df1262668 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Dec 2024 05:16:56 -0500 Subject: bcachefs: bch2_inum_path() now crosses subvolumes correctly The dirent that points to a subvolume root is in the parent subvolume. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-common.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 63619318c64a..2c3d46ac70c6 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -608,6 +608,9 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb goto disconnected; } + inum.subvol = inode.bi_parent_subvol ?: inum.subvol; + inum.inum = inode.bi_dir; + u32 snapshot; ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) @@ -626,10 +629,6 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb prt_char(path, '/'); - if (d.v->d_type == DT_SUBVOL) - inum.subvol = le32_to_cpu(d.v->d_parent_subvol); - inum.inum = d.k->p.inode; - bch2_trans_iter_exit(trans, &d_iter); } -- cgit v1.2.3 From cb3f34982c9dfccf1aadde4e69043c7da3107c92 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Dec 2024 16:57:24 -0500 Subject: bcachefs: Assert that btree write buffer only touches the right btrees More asserts, more better. Also, clean up the per-btree flags a bit. 
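Purely as an editor's illustration of the pattern the flag cleanup below relies on (invented names, not bcachefs code): each per-btree property is declared once in an x-macro list, and a query helper folds that list into a constant bitmask so the runtime check is a single bit test.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* one row per btree: x(name, id, uses_write_buffer) */
#define EXAMPLE_IDS()			\
	x(extents,	0, 1)		\
	x(inodes,	1, 0)		\
	x(lru,		2, 1)

enum example_id {
#define x(name, nr, wb) EX_ID_##name = nr,
	EXAMPLE_IDS()
#undef x
};

static bool example_uses_write_buffer(enum example_id id)
{
	/* the list expands into a compile-time constant mask */
	const uint64_t mask = 0
#define x(name, nr, wb) | ((uint64_t)(wb) << (nr))
	EXAMPLE_IDS()
#undef x
	;
	return (UINT64_C(1) << id) & mask;
}

int main(void)
{
	printf("%d %d %d\n",
	       example_uses_write_buffer(EX_ID_extents),
	       example_uses_write_buffer(EX_ID_inodes),
	       example_uses_write_buffer(EX_ID_lru));	/* prints "1 0 1" */
	return 0;
}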
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 48 +++++++++++++++++++++++++++------------- fs/bcachefs/btree_types.h | 39 ++++++++++++++++++++------------ fs/bcachefs/btree_write_buffer.c | 2 ++ 3 files changed, 60 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b0fac8b7915b..0680930508a3 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1288,14 +1288,18 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); /* Btree: */ enum btree_id_flags { - BTREE_ID_EXTENTS = BIT(0), - BTREE_ID_SNAPSHOTS = BIT(1), - BTREE_ID_SNAPSHOT_FIELD = BIT(2), - BTREE_ID_DATA = BIT(3), + BTREE_IS_extents = BIT(0), + BTREE_IS_snapshots = BIT(1), + BTREE_IS_snapshot_field = BIT(2), + BTREE_IS_data = BIT(3), + BTREE_IS_write_buffer = BIT(4), }; #define BCH_BTREE_IDS() \ - x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\ + x(extents, 0, \ + BTREE_IS_extents| \ + BTREE_IS_snapshots| \ + BTREE_IS_data, \ BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_error)| \ BIT_ULL(KEY_TYPE_cookie)| \ @@ -1303,17 +1307,20 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_reservation)| \ BIT_ULL(KEY_TYPE_reflink_p)| \ BIT_ULL(KEY_TYPE_inline_data)) \ - x(inodes, 1, BTREE_ID_SNAPSHOTS, \ + x(inodes, 1, \ + BTREE_IS_snapshots, \ BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_inode)| \ BIT_ULL(KEY_TYPE_inode_v2)| \ BIT_ULL(KEY_TYPE_inode_v3)| \ BIT_ULL(KEY_TYPE_inode_generation)) \ - x(dirents, 2, BTREE_ID_SNAPSHOTS, \ + x(dirents, 2, \ + BTREE_IS_snapshots, \ BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_hash_whiteout)| \ BIT_ULL(KEY_TYPE_dirent)) \ - x(xattrs, 3, BTREE_ID_SNAPSHOTS, \ + x(xattrs, 3, \ + BTREE_IS_snapshots, \ BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_cookie)| \ BIT_ULL(KEY_TYPE_hash_whiteout)| \ @@ -1327,7 +1334,9 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_quota)) \ x(stripes, 6, 0, \ BIT_ULL(KEY_TYPE_stripe)) \ - x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \ + x(reflink, 7, \ + BTREE_IS_extents| \ + BTREE_IS_data, \ BIT_ULL(KEY_TYPE_reflink_v)| \ BIT_ULL(KEY_TYPE_indirect_inline_data)| \ BIT_ULL(KEY_TYPE_error)) \ @@ -1335,29 +1344,38 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_subvolume)) \ x(snapshots, 9, 0, \ BIT_ULL(KEY_TYPE_snapshot)) \ - x(lru, 10, 0, \ + x(lru, 10, \ + BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_set)) \ - x(freespace, 11, BTREE_ID_EXTENTS, \ + x(freespace, 11, \ + BTREE_IS_extents, \ BIT_ULL(KEY_TYPE_set)) \ x(need_discard, 12, 0, \ BIT_ULL(KEY_TYPE_set)) \ - x(backpointers, 13, 0, \ + x(backpointers, 13, \ + BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_backpointer)) \ x(bucket_gens, 14, 0, \ BIT_ULL(KEY_TYPE_bucket_gens)) \ x(snapshot_trees, 15, 0, \ BIT_ULL(KEY_TYPE_snapshot_tree)) \ - x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \ + x(deleted_inodes, 16, \ + BTREE_IS_snapshot_field| \ + BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_set)) \ x(logged_ops, 17, 0, \ BIT_ULL(KEY_TYPE_logged_op_truncate)| \ BIT_ULL(KEY_TYPE_logged_op_finsert)| \ BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \ - x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ + x(rebalance_work, 18, \ + BTREE_IS_snapshot_field| \ + BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ x(subvolume_children, 19, 0, \ BIT_ULL(KEY_TYPE_set)) \ - x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \ + x(accounting, 20, \ + BTREE_IS_snapshot_field| \ + BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_accounting)) \ enum btree_id { diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h 
index baab5288ecc9..a6f251eb4164 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -790,53 +790,64 @@ static inline bool btree_node_type_has_triggers(enum btree_node_type type) return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS; } -static inline bool btree_node_type_is_extents(enum btree_node_type type) +static inline bool btree_id_is_extents(enum btree_id btree) { const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr) BCH_BTREE_IDS() #undef x ; - return BIT_ULL(type) & mask; + return BIT_ULL(btree) & mask; } -static inline bool btree_id_is_extents(enum btree_id btree) +static inline bool btree_node_type_is_extents(enum btree_node_type type) +{ + return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1); +} + +static inline bool btree_type_has_snapshots(enum btree_id btree) { - return btree_node_type_is_extents(__btree_node_type(0, btree)); + const u64 mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr) + BCH_BTREE_IDS() +#undef x + ; + + return BIT_ULL(btree) & mask; } -static inline bool btree_type_has_snapshots(enum btree_id id) +static inline bool btree_type_has_snapshot_field(enum btree_id btree) { const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) +#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr) BCH_BTREE_IDS() #undef x ; - return BIT_ULL(id) & mask; + return BIT_ULL(btree) & mask; } -static inline bool btree_type_has_snapshot_field(enum btree_id id) +static inline bool btree_type_has_ptrs(enum btree_id btree) { const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr) BCH_BTREE_IDS() #undef x ; - return BIT_ULL(id) & mask; + return BIT_ULL(btree) & mask; } -static inline bool btree_type_has_ptrs(enum btree_id id) +static inline bool btree_type_uses_write_buffer(enum btree_id btree) { const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr) +#define x(name, nr, flags, ...) 
|((!!((flags) & BTREE_IS_write_buffer)) << nr) BCH_BTREE_IDS() #undef x ; - return BIT_ULL(id) & mask; + return BIT_ULL(btree) & mask; } struct btree_root { diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 746db6d5a0fb..b56c4987b8c9 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -312,6 +312,8 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) darray_for_each(wb->sorted, i) { struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; + BUG_ON(!btree_type_uses_write_buffer(k->btree)); + for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) prefetch(&wb->flushing.keys.data[n->idx]); -- cgit v1.2.3 From 8f3aaa5d5d4e915c2486eafca84840d04b517b72 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Dec 2024 06:32:41 -0500 Subject: bcachefs: bch2_fs_btree_gc_init() Now returns errors, prep work for check_allocations_done_lock Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 11 +++++++++-- fs/bcachefs/btree_gc.h | 4 +++- fs/bcachefs/super.c | 6 ++---- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 721dca551720..dd1d9b74076e 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1238,9 +1238,16 @@ void bch2_gc_gens_async(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); } -void bch2_fs_gc_init(struct bch_fs *c) +void bch2_fs_btree_gc_exit(struct bch_fs *c) { - seqcount_init(&c->gc_pos_lock); +} +int bch2_fs_btree_gc_init(struct bch_fs *c) +{ + seqcount_init(&c->gc_pos_lock); INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); + + init_rwsem(&c->gc_lock); + mutex_init(&c->gc_gens_lock); + return 0; } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 8a47e8bd0791..9693a90a48a2 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -82,6 +82,8 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); int bch2_gc_gens(struct bch_fs *); void bch2_gc_gens_async(struct bch_fs *); -void bch2_fs_gc_init(struct bch_fs *); + +void bch2_fs_btree_gc_exit(struct bch_fs *); +int bch2_fs_btree_gc_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7e97c198efe2..d97ea7bd1171 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -563,6 +563,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); + bch2_fs_btree_gc_exit(c); bch2_journal_keys_put_initial(c); bch2_find_btree_nodes_exit(&c->found_btree_nodes); BUG_ON(atomic_read(&c->journal_keys.ref)); @@ -770,13 +771,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) spin_lock_init(&c->recovery_pass_lock); sema_init(&c->online_fsck_mutex, 1); - init_rwsem(&c->gc_lock); - mutex_init(&c->gc_gens_lock); - for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); - bch2_fs_gc_init(c); bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_btree_iter_init_early(c); @@ -911,6 +908,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_btree_gc_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: 
bch2_fs_subvolumes_init(c) ?: -- cgit v1.2.3 From 9a5232ef0a9f9dcc8d1645b361296772b03a3525 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Dec 2024 02:33:53 -0500 Subject: bcachefs: six locks: write locks can now be held recursively This is needed for the interior update locking rework, where we'll be holding node write locks for the duration of the update - which is needed for synchronizing with online check_allocations. Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 17 ++++++++++++----- fs/bcachefs/six.h | 1 + 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 617d07e53b20..537bf049618f 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -616,8 +616,6 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long if (type != SIX_LOCK_write) six_release(&lock->dep_map, ip); - else - lock->seq++; if (type == SIX_LOCK_intent && lock->intent_lock_recurse) { @@ -625,6 +623,15 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long return; } + if (type == SIX_LOCK_write && + lock->write_lock_recurse) { + --lock->write_lock_recurse; + return; + } + + if (type == SIX_LOCK_write) + lock->seq++; + do_six_unlock_type(lock, type); } EXPORT_SYMBOL_GPL(six_unlock_ip); @@ -735,13 +742,13 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) atomic_add(l[type].lock_val, &lock->state); } break; + case SIX_LOCK_write: + lock->write_lock_recurse++; + fallthrough; case SIX_LOCK_intent: EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); lock->intent_lock_recurse++; break; - case SIX_LOCK_write: - BUG(); - break; } } EXPORT_SYMBOL_GPL(six_lock_increment); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 68d46fd7f391..c142e06b7a3a 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -137,6 +137,7 @@ struct six_lock { atomic_t state; u32 seq; unsigned intent_lock_recurse; + unsigned write_lock_recurse; struct task_struct *owner; unsigned __percpu *readers; raw_spinlock_t wait_lock; -- cgit v1.2.3 From e1911d7a69b88f02f0f1542a25f714574153f196 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Dec 2024 05:57:30 -0500 Subject: bcachefs: btree_node_unlock() can now drop write locks Prep work for reworking btree node locking during interior btree updates. 
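Both this change and the six-lock patch above hinge on a write-lock recursion count: nested acquisitions by the same holder only bump a counter, and only the outermost unlock actually releases the lock and advances its sequence number. A rough, single-threaded sketch of that bookkeeping (illustrative only - it is not the six-lock implementation, which also tracks ownership and waiters):

#include <assert.h>
#include <stdio.h>

struct toy_lock {
	int		held;		/* 1 while write-held */
	unsigned	recurse;	/* extra acquisitions by the holder */
	unsigned	seq;		/* bumped only on the real release */
};

static void toy_write_lock(struct toy_lock *l)
{
	if (l->held) {
		l->recurse++;		/* nested acquisition */
		return;
	}
	l->held = 1;
}

static void toy_write_unlock(struct toy_lock *l)
{
	assert(l->held);
	if (l->recurse) {
		l->recurse--;		/* still held by the outer caller */
		return;
	}
	l->held = 0;
	l->seq++;			/* readers can tell the lock cycled */
}

int main(void)
{
	struct toy_lock l = {0};

	toy_write_lock(&l);
	toy_write_lock(&l);				/* nested */
	toy_write_unlock(&l);				/* inner unlock: still held */
	printf("held=%d seq=%u\n", l.held, l.seq);	/* held=1 seq=0 */
	toy_write_unlock(&l);				/* outer unlock: released */
	printf("held=%d seq=%u\n", l.held, l.seq);	/* held=0 seq=1 */
	return 0;
}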
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.h | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index fb3d04ddcb40..80f177078101 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -75,13 +75,6 @@ static inline void mark_btree_node_locked_noreset(struct btree_path *path, path->nodes_locked |= (type + 1) << (level << 1); } -static inline void mark_btree_node_unlocked(struct btree_path *path, - unsigned level) -{ - EBUG_ON(btree_node_write_locked(path, level)); - mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); -} - static inline void mark_btree_node_locked(struct btree_trans *trans, struct btree_path *path, unsigned level, @@ -124,19 +117,25 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, /* unlock: */ +void bch2_btree_node_unlock_write(struct btree_trans *, + struct btree_path *, struct btree *); + static inline void btree_node_unlock(struct btree_trans *trans, struct btree_path *path, unsigned level) { int lock_type = btree_node_locked_type(path, level); EBUG_ON(level >= BTREE_MAX_DEPTH); - EBUG_ON(lock_type == BTREE_NODE_WRITE_LOCKED); if (lock_type != BTREE_NODE_UNLOCKED) { + if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) { + bch2_btree_node_unlock_write(trans, path, path->l[level].b); + lock_type = BTREE_NODE_INTENT_LOCKED; + } six_unlock_type(&path->l[level].b->c.lock, lock_type); btree_trans_lock_hold_time_update(trans, path, level); + mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); } - mark_btree_node_unlocked(path, level); } static inline int btree_path_lowest_level_locked(struct btree_path *path) @@ -165,11 +164,13 @@ static inline void __bch2_btree_path_unlock(struct btree_trans *trans, static inline void __bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b) { - struct btree_path *linked; - unsigned i; + if (!b->c.lock.write_lock_recurse) { + struct btree_path *linked; + unsigned i; - trans_for_each_path_with_node(trans, b, linked, i) - linked->l[b->c.level].lock_seq++; + trans_for_each_path_with_node(trans, b, linked, i) + linked->l[b->c.level].lock_seq++; + } six_unlock_write(&b->c.lock); } @@ -186,9 +187,6 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat __bch2_btree_node_unlock_write(trans, b); } -void bch2_btree_node_unlock_write(struct btree_trans *, - struct btree_path *, struct btree *); - int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); /* lock: */ -- cgit v1.2.3 From 0971a72c3d1a8be44f54e72096c7d1fc5fbe512e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Dec 2024 05:40:17 -0500 Subject: bcachefs: bch2_trans_unlock_write() New helper for dropping all write locks; which is distinct from the helper the transaction commit path uses, which is faster and only touches updates. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 11 +++++++++++ fs/bcachefs/btree_locking.h | 1 + fs/bcachefs/btree_trans_commit.c | 6 +++--- 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index b339c209345a..8503931463d1 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -818,6 +818,17 @@ void bch2_trans_unlock_long(struct btree_trans *trans) bch2_trans_srcu_unlock(trans); } +void bch2_trans_unlock_write(struct btree_trans *trans) +{ + struct btree_path *path; + unsigned i; + + trans_for_each_path(trans, path, i) + for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) + if (btree_node_write_locked(path, l)) + bch2_btree_node_unlock_write(trans, path, path->l[l].b); +} + int __bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) { diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 80f177078101..b54ef48eb8cc 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -16,6 +16,7 @@ void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); void bch2_trans_unlock_noassert(struct btree_trans *); +void bch2_trans_unlock_write(struct btree_trans *); static inline bool is_btree_node(struct btree_path *path, unsigned l) { diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 2f1dd516318e..6b79b672e0b1 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -133,7 +133,7 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans) return 0; } -static inline void bch2_trans_unlock_write(struct btree_trans *trans) +static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans) { if (likely(trans->write_locked)) { trans_for_each_update(trans, i) @@ -384,7 +384,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, struct bkey_i *new_k; int ret; - bch2_trans_unlock_write(trans); + bch2_trans_unlock_updates_write(trans); bch2_trans_unlock(trans); new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); @@ -868,7 +868,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (!ret && unlikely(trans->journal_replay_not_finished)) bch2_drop_overwrites_from_journal(trans); - bch2_trans_unlock_write(trans); + bch2_trans_unlock_updates_write(trans); if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, -- cgit v1.2.3 From 8cfdc6ce1f785278ae2a1cf58e62d125986797fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Dec 2024 12:19:08 -0500 Subject: bcachefs: bch2_trans_node_drop() Factor out a small common helper. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 13 +++++++++++++ fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/btree_update_interior.c | 17 +++-------------- 3 files changed, 17 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 291eb5eb0203..c291d495b85b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -699,6 +699,19 @@ void bch2_trans_node_add(struct btree_trans *trans, bch2_trans_revalidate_updates_in_node(trans, b); } +void bch2_trans_node_drop(struct btree_trans *trans, + struct btree *b) +{ + struct btree_path *path; + unsigned i, level = b->c.level; + + trans_for_each_path(trans, path, i) + if (path->l[level].b == b) { + btree_node_unlock(trans, path, level); + path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); + } +} + /* * A btree node has been modified in such a way as to invalidate iterators - fix * them: diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index e23608d2a26d..b9538e6e6d65 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -372,6 +372,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans, void bch2_trans_downgrade(struct btree_trans *); void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *); +void bch2_trans_node_drop(struct btree_trans *trans, struct btree *); void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 76c8602601dd..f4aeadbe53c1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -238,7 +238,6 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; - unsigned i, level = b->c.level; bch2_btree_node_lock_write_nofail(trans, path, &b->c); @@ -249,13 +248,9 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, mutex_unlock(&c->btree_cache.lock); six_unlock_write(&b->c.lock); - mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); + mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - trans_for_each_path(trans, path, i) - if (path->l[level].b == b) { - btree_node_unlock(trans, path, level); - path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); - } + bch2_trans_node_drop(trans, b); } static void bch2_btree_node_free_never_used(struct btree_update *as, @@ -264,8 +259,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, { struct bch_fs *c = as->c; struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; - struct btree_path *path; - unsigned i, level = b->c.level; BUG_ON(!list_empty(&b->write_blocked)); BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); @@ -287,11 +280,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, six_unlock_intent(&b->c.lock); - trans_for_each_path(trans, path, i) - if (path->l[level].b == b) { - btree_node_unlock(trans, path, level); - path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); - } + bch2_trans_node_drop(trans, b); } static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, -- cgit v1.2.3 From 8b1f46bff38f075b8d4071e7ac1edecb441fd53c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Dec 2024 19:57:04 -0500 Subject: bcachefs: Dropped superblock write is no longer a fatal error Just emit a 
warning if errors=continue or fix_safe. Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index dbc09e305c27..8037ccbacf6a 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1084,9 +1084,16 @@ int bch2_write_super(struct bch_fs *c) ": Superblock write was silently dropped! (seq %llu expected %llu)", le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); - bch2_fs_fatal_error(c, "%s", buf.buf); + + if (c->opts.errors != BCH_ON_ERROR_continue && + c->opts.errors != BCH_ON_ERROR_fix_safe) { + ret = -BCH_ERR_erofs_sb_err; + bch2_fs_fatal_error(c, "%s", buf.buf); + } else { + bch_err(c, "%s", buf.buf); + } + printbuf_exit(&buf); - ret = -BCH_ERR_erofs_sb_err; } if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { -- cgit v1.2.3 From 5906dcb9937d2664db39598b65729a252eaf4aaf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 31 Dec 2024 09:55:09 -0500 Subject: bcachefs: Silence read-only errors when deleting snapshots Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index cf6b3256d188..c54091a28909 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1563,7 +1563,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) */ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); - bch_err_msg(c, ret, "walking snapshots"); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "walking snapshots"); if (ret) goto err; @@ -1602,7 +1603,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) bch2_disk_reservation_put(c, &res); - bch_err_msg(c, ret, "deleting keys from dying snapshots"); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "deleting keys from dying snapshots"); if (ret) goto err; } @@ -1610,7 +1612,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) darray_for_each(delete_leaves, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, *i)); - bch_err_msg(c, ret, "deleting snapshot %u", *i); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "deleting snapshot %u", *i); if (ret) goto err; } @@ -1630,7 +1633,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) darray_for_each(delete_interior, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, i->id)); - bch_err_msg(c, ret, "deleting snapshot %u", i->id); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "deleting snapshot %u", i->id); if (ret) goto err; } @@ -1638,7 +1642,8 @@ err: darray_exit(&delete_interior); darray_exit(&delete_leaves); bch2_trans_put(trans); - bch_err_fn(c, ret); + if (!bch2_err_matches(ret, EROFS)) + bch_err_fn(c, ret); return ret; } -- cgit v1.2.3 From 0a46ea9d4693d960fda990acf7b0230cb417306e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Dec 2024 15:31:14 -0500 Subject: bcachefs: printbuf_reset() handles tabstops Signed-off-by: Kent Overstreet --- fs/bcachefs/printbuf.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 1d570387b77f..d0dd398baa2b 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -251,16 +251,23 @@ static inline void prt_hex_byte_upper(struct 
printbuf *out, u8 byte) printbuf_nul_terminate_reserved(out); } +static inline void printbuf_reset_keep_tabstops(struct printbuf *buf) +{ + buf->pos = 0; + buf->allocation_failure = 0; + buf->last_newline = 0; + buf->last_field = 0; + buf->indent = 0; + buf->cur_tabstop = 0; +} + /** * printbuf_reset - re-use a printbuf without freeing and re-initializing it: */ static inline void printbuf_reset(struct printbuf *buf) { - buf->pos = 0; - buf->allocation_failure = 0; - buf->indent = 0; + printbuf_reset_keep_tabstops(buf); buf->nr_tabstops = 0; - buf->cur_tabstop = 0; } /** -- cgit v1.2.3 From cf67f46641abd71ec9c1e4b55481928c50975804 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Dec 2024 16:22:59 -0500 Subject: bcachefs: __bch2_btree_pos_to_text() Factor out a version of bch2_btree_pos_to_text() that doesn't take a pointer to a in-memory btree node, to be used for btree node scrub. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 16 +++++++++++----- fs/bcachefs/btree_cache.h | 2 ++ 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index b00c6a20be27..672ca2c1d37d 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1402,18 +1402,24 @@ void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsi prt_printf(out, " level=%u", level); } -void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) +void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, + enum btree_id btree, unsigned level, struct bkey_s_c k) { - bch2_btree_id_to_text(out, b->c.btree_id); - prt_printf(out, " level %u/", b->c.level); - struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); + bch2_btree_id_to_text(out, btree); + prt_printf(out, " level %u/", level); + struct btree_root *r = bch2_btree_id_root(c, btree); if (r) prt_printf(out, "%u", r->level); else prt_printf(out, "(unknown)"); prt_printf(out, "\n "); - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + bch2_bkey_val_to_text(out, c, k); +} + +void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) +{ + __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key)); } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index dcc34fe4996d..ca3c1b145330 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -147,6 +147,8 @@ const char *bch2_btree_id_str(enum btree_id); /* avoid */ void bch2_btree_id_to_text(struct printbuf *, enum btree_id); void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned); +void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, + enum btree_id, unsigned, struct bkey_s_c); void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); -- cgit v1.2.3 From bd5b09727f3d13bb3c782b5f05041c69784e7fe9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 31 Dec 2024 12:58:23 -0500 Subject: bcachefs: Don't set btree_path to updtodate if we don't fill This fixes various locking asserts, and a null ptr deref in bch2_btree_iter_peek_path(). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 7636a5e97409..3b62296c3100 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -291,10 +291,8 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, unsigned flags) { - if (flags & BTREE_ITER_cached_nofill) { - ck_path->uptodate = BTREE_ITER_UPTODATE; + if (flags & BTREE_ITER_cached_nofill) return 0; - } struct bch_fs *c = trans->c; struct btree_iter iter; -- cgit v1.2.3 From d3d0fac57d4679024199706a2aa6d2a4a477bfa3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 29 Dec 2024 09:37:15 -0500 Subject: bcachefs: bch2_btree_iter_peek_slot() handles navigating to nonexistent depth Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c291d495b85b..367231ab1980 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2770,6 +2770,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out_no_locked; } + struct btree_path *path = btree_iter_path(trans, iter); + if (unlikely(!btree_path_node(path, path->level))) + return bkey_s_c_null; + if ((iter->flags & BTREE_ITER_cached) || !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) { k = bkey_s_c_null; -- cgit v1.2.3 From 15734b5e6f8dfd44df08ecc1d846540476b7f7de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 31 Dec 2024 15:59:02 -0500 Subject: bcachefs: Check for dirents to overwritten inodes This fixes various "dirent to missing inode" errors. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 38 ++++++++++++++++++++++++++++++++++---- fs/bcachefs/sb-errors_format.h | 3 ++- 2 files changed, 36 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 206fc046610a..3917d75f3c98 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -832,11 +832,13 @@ struct inode_walker { struct bpos last_pos; DARRAY(struct inode_walker_entry) inodes; + snapshot_id_list deletes; }; static void inode_walker_exit(struct inode_walker *w) { darray_exit(&w->inodes); + darray_exit(&w->deletes); } static struct inode_walker inode_walker_init(void) @@ -960,8 +962,9 @@ static int get_visible_inodes(struct btree_trans *trans, int ret; w->inodes.nr = 0; + w->deletes.nr = 0; - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), BTREE_ITER_all_snapshots, k, ret) { if (k.k->p.offset != inum) break; @@ -969,10 +972,13 @@ static int get_visible_inodes(struct btree_trans *trans, if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) continue; - if (bkey_is_inode(k.k)) - add_inode(c, w, k); + if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) + continue; - if (k.k->p.snapshot >= s->pos.snapshot) + ret = bkey_is_inode(k.k) + ? 
add_inode(c, w, k) + : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); + if (ret) break; } bch2_trans_iter_exit(trans, &iter); @@ -2380,6 +2386,30 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; } + + darray_for_each(target->deletes, i) + if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i), + trans, dirent_to_overwritten_inode, + "dirent points to inode overwritten in snapshot %u:\n%s", + *i, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { + struct btree_iter delete_iter; + bch2_trans_iter_init(trans, &delete_iter, + BTREE_ID_dirents, + SPOS(k.k->p.inode, k.k->p.offset, *i), + BTREE_ITER_intent); + ret = bch2_btree_iter_traverse(&delete_iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + hash_info, + &delete_iter, + BTREE_UPDATE_internal_snapshot_node); + bch2_trans_iter_exit(trans, &delete_iter); + if (ret) + goto err; + + } } ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index e26317c367f7..80b6d589808b 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -256,6 +256,7 @@ enum bch_fsck_flags { x(dirent_in_missing_dir_inode, 227, 0) \ x(dirent_in_non_dir_inode, 228, 0) \ x(dirent_to_missing_inode, 229, 0) \ + x(dirent_to_overwritten_inode, 302, 0) \ x(dirent_to_missing_subvol, 230, 0) \ x(dirent_to_itself, 231, 0) \ x(quota_type_invalid, 232, 0) \ @@ -312,7 +313,7 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 302, 0) + x(MAX, 303, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, -- cgit v1.2.3 From ae153f2e11eea89bc1203c33f304939190b88614 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Jan 2025 12:09:52 -0500 Subject: bcachefs: Don't use BTREE_ITER_cached when walking alloc btree during fsck No need to pull the whole alloc btree into the btree key cache. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 94e7bc889cb1..fc2ef33b67b3 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1402,7 +1402,8 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite struct btree_iter alloc_iter; struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, - BTREE_ID_alloc, bucket, BTREE_ITER_cached); + BTREE_ID_alloc, bucket, + async_repair ? BTREE_ITER_cached : 0); int ret = bkey_err(alloc_k); if (ret) return ret; -- cgit v1.2.3 From 4204e3bf63c4b68968fd557f05fed32c99e2b18a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Jan 2025 12:10:25 -0500 Subject: bcachefs: check_unreachable_inodes is not actually PASS_ONLINE yet check_unreachable_inodes does work in online mode, with the one caveat that it assumes check_dirents has also run - and check_dirents is not PASS_ONLINE yet. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 71baad41d8c5..418557960ed6 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -53,7 +53,7 @@ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ - x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ + x(check_unreachable_inodes, 40, PASS_FSCK) \ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ -- cgit v1.2.3 From 67510d7e2e5f5bdc020bf9d759aa575cce48c8e1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Jan 2025 08:58:14 +0100 Subject: fs: debugfs: fix open proxy for unsafe files In the previous commit referenced below, I had to split the short fops handling into different proxy fops. This necessitated knowing out-of-band whether or not the ops are short or full, when attempting to convert from fops to allocated fsdata. Unfortunately, I only converted full_proxy_open() which is used for the new full_proxy_open_regular() and full_proxy_open_short(), but forgot about the call in open_proxy_open(), used for debugfs_create_file_unsafe(). Fix that, it never has short fops. Fixes: f8f25893a477 ("fs: debugfs: differentiate short fops with proxy ops") Reported-by: Suresh Kumar Kurmi Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202501101055.bb8bf3e7-lkp@intel.com Reported-by: Venkat Rao Bagalkote Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20250110085826.cd74f3b7a36b.I430c79c82ec3f954c2ff9665753bf6ac9e63eef8@changeid Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index bdb4f2ca0506..16e198a26339 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -280,7 +280,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) const struct file_operations *real_fops = NULL; int r; - r = debugfs_file_get(dentry); + r = __debugfs_file_get(dentry, DBGFS_GET_REGULAR); if (r) return r == -EIO ? -ENOENT : r; -- cgit v1.2.3 From 111d36d6278756128b7d7fab787fdcbf8221cd98 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 8 Jan 2025 16:54:02 -0800 Subject: xfs: lock dquot buffer before detaching dquot from b_li_list We have to lock the buffer before we can delete the dquot log item from the buffer's log item list. Cc: stable@vger.kernel.org # v6.13-rc3 Fixes: acc8f8628c3737 ("xfs: attach dquot buffer to dquot log item buffer") Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_dquot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index f11d475898f2..201c26322ede 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -87,8 +87,9 @@ xfs_dquot_detach_buf( } spin_unlock(&qlip->qli_lock); if (bp) { + xfs_buf_lock(bp); list_del_init(&qlip->qli_item.li_bio_list); - xfs_buf_rele(bp); + xfs_buf_relse(bp); } } -- cgit v1.2.3 From af10dd16655339a48178221fc4b73fbf171dd739 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 20 Dec 2024 05:14:58 +0000 Subject: hostfs: Convert to writepages If we add a migrate_folio operation, we can convert the writepage operation to writepages. The large folio support here is illusory; we would need to kmap each page in turn for proper support. But we do remove a few hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20241220051500.1919389-1-willy@infradead.org Signed-off-by: Johannes Berg --- fs/hostfs/hostfs_kern.c | 54 +++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 7e51d2cec64b..844c45253452 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -410,38 +410,33 @@ static const struct file_operations hostfs_dir_fops = { .fsync = hostfs_fsync, }; -static int hostfs_writepage(struct page *page, struct writeback_control *wbc) +static int hostfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - char *buffer; - loff_t base = page_offset(page); - int count = PAGE_SIZE; - int end_index = inode->i_size >> PAGE_SHIFT; - int err; - - if (page->index >= end_index) - count = inode->i_size & (PAGE_SIZE-1); - - buffer = kmap_local_page(page); - - err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count); - if (err != count) { - if (err >= 0) - err = -EIO; - mapping_set_error(mapping, err); - goto out; + struct folio *folio = NULL; + loff_t i_size = i_size_read(inode); + int err = 0; + + while ((folio = writeback_iter(mapping, wbc, folio, &err))) { + loff_t pos = folio_pos(folio); + size_t count = folio_size(folio); + char *buffer; + int ret; + + if (count > i_size - pos) + count = i_size - pos; + + buffer = kmap_local_folio(folio, 0); + ret = write_file(HOSTFS_I(inode)->fd, &pos, buffer, count); + kunmap_local(buffer); + folio_unlock(folio); + if (ret != count) { + err = ret < 0 ? ret : -EIO; + mapping_set_error(mapping, err); + } } - if (base > inode->i_size) - inode->i_size = base; - - err = 0; - - out: - kunmap_local(buffer); - unlock_page(page); - return err; } @@ -506,11 +501,12 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping, } static const struct address_space_operations hostfs_aops = { - .writepage = hostfs_writepage, + .writepages = hostfs_writepages, .read_folio = hostfs_read_folio, .dirty_folio = filemap_dirty_folio, .write_begin = hostfs_write_begin, .write_end = hostfs_write_end, + .migrate_folio = filemap_migrate_folio, }; static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st) -- cgit v1.2.3 From 92f08e9d3cf0f8005ac6fcb931e3c388efc3ac49 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 7 Jan 2025 18:34:49 +0000 Subject: afs: Make /afs/. 
as well as /afs/ mountpoints When a cell is instantiated, automatically create an /afs/. mountpoint to match the /afs/ mountpoint to match other AFS clients. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20250107183454.608451-2-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/cell.c | 13 ++++++++----- fs/afs/dynroot.c | 52 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 41 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/afs/cell.c b/fs/afs/cell.c index caa09875f520..1aba6d4d03a9 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -146,18 +146,20 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, return ERR_PTR(-ENOMEM); } - cell->name = kmalloc(namelen + 1, GFP_KERNEL); + cell->name = kmalloc(1 + namelen + 1, GFP_KERNEL); if (!cell->name) { kfree(cell); return ERR_PTR(-ENOMEM); } - cell->net = net; + cell->name[0] = '.'; + cell->name++; cell->name_len = namelen; for (i = 0; i < namelen; i++) cell->name[i] = tolower(name[i]); cell->name[i] = 0; + cell->net = net; refcount_set(&cell->ref, 1); atomic_set(&cell->active, 0); INIT_WORK(&cell->manager, afs_manage_cell_work); @@ -211,7 +213,7 @@ parse_failed: if (ret == -EINVAL) printk(KERN_ERR "kAFS: bad VL server IP address\n"); error: - kfree(cell->name); + kfree(cell->name - 1); kfree(cell); _leave(" = %d", ret); return ERR_PTR(ret); @@ -502,7 +504,7 @@ static void afs_cell_destroy(struct rcu_head *rcu) afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers)); afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias); key_put(cell->anonymous_key); - kfree(cell->name); + kfree(cell->name - 1); kfree(cell); afs_dec_cells_outstanding(net); @@ -710,7 +712,8 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) afs_proc_cell_remove(cell); mutex_lock(&net->proc_cells_lock); - hlist_del_rcu(&cell->proc_link); + if (!hlist_unhashed(&cell->proc_link)) + hlist_del_rcu(&cell->proc_link); afs_dynroot_rmdir(net, cell); mutex_unlock(&net->proc_cells_lock); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index c4d2711e20ad..f80a4244b9d2 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -271,7 +271,8 @@ const struct dentry_operations afs_dynroot_dentry_operations = { int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell) { struct super_block *sb = net->dynroot_sb; - struct dentry *root, *subdir; + struct dentry *root, *subdir, *dsubdir; + char *dotname = cell->name - 1; int ret; if (!sb || atomic_read(&sb->s_active) == 0) @@ -286,34 +287,31 @@ int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell) goto unlock; } - /* Note that we're retaining an extra ref on the dentry */ + dsubdir = lookup_one_len(dotname, root, cell->name_len + 1); + if (IS_ERR(dsubdir)) { + ret = PTR_ERR(dsubdir); + dput(subdir); + goto unlock; + } + + /* Note that we're retaining extra refs on the dentries. */ subdir->d_fsdata = (void *)1UL; + dsubdir->d_fsdata = (void *)1UL; ret = 0; unlock: inode_unlock(root->d_inode); return ret; } -/* - * Remove a manually added cell mount directory. 
- * - The caller must hold net->proc_cells_lock - */ -void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell) +static void afs_dynroot_rm_one_dir(struct dentry *root, const char *name, size_t name_len) { - struct super_block *sb = net->dynroot_sb; - struct dentry *root, *subdir; - - if (!sb || atomic_read(&sb->s_active) == 0) - return; - - root = sb->s_root; - inode_lock(root->d_inode); + struct dentry *subdir; /* Don't want to trigger a lookup call, which will re-add the cell */ - subdir = try_lookup_one_len(cell->name, root, cell->name_len); + subdir = try_lookup_one_len(name, root, name_len); if (IS_ERR_OR_NULL(subdir)) { _debug("lookup %ld", PTR_ERR(subdir)); - goto no_dentry; + return; } _debug("rmdir %pd %u", subdir, d_count(subdir)); @@ -324,8 +322,24 @@ void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell) dput(subdir); } dput(subdir); -no_dentry: - inode_unlock(root->d_inode); +} + +/* + * Remove a manually added cell mount directory. + * - The caller must hold net->proc_cells_lock + */ +void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell) +{ + struct super_block *sb = net->dynroot_sb; + char *dotname = cell->name - 1; + + if (!sb || atomic_read(&sb->s_active) == 0) + return; + + inode_lock(sb->s_root->d_inode); + afs_dynroot_rm_one_dir(sb->s_root, cell->name, cell->name_len); + afs_dynroot_rm_one_dir(sb->s_root, dotname, cell->name_len + 1); + inode_unlock(sb->s_root->d_inode); _leave(""); } -- cgit v1.2.3 From 3e914febd79a8d1a78ee6e67ff3fa4214d6d1d57 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 7 Jan 2025 18:34:50 +0000 Subject: afs: Add rootcell checks Add some checks for the validity of the cell name. It's may get put into a symlink, so preclude it containing any slashes or "..". Also disallow starting/ending with a dot. This makes /afs/@cell/ as a symlink less of a security risk. Also disallow multiple setting of /proc/net/afs/rootcell for any given network namespace. Once set, the value may not be changed. This makes it easier to only create /afs/@cell and /afs/.@cell if there's a rootcell. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20250107183454.608451-3-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/cell.c | 8 ++++++++ fs/afs/proc.c | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 1aba6d4d03a9..cee42646736c 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -367,6 +367,14 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) len = cp - rootcell; } + if (len == 0 || !rootcell[0] || rootcell[0] == '.' 
|| rootcell[len - 1] == '.') + return -EINVAL; + if (memchr(rootcell, '/', len)) + return -EINVAL; + cp = strstr(rootcell, ".."); + if (cp && cp < rootcell + len) + return -EINVAL; + /* allocate a cell record for the root cell */ new_root = afs_lookup_cell(net, rootcell, len, vllist, false); if (IS_ERR(new_root)) { diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 15eab053af6d..e7614f4f30c2 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -240,7 +240,13 @@ static int afs_proc_rootcell_write(struct file *file, char *buf, size_t size) /* determine command to perform */ _debug("rootcell=%s", buf); - ret = afs_cell_init(net, buf); + ret = -EEXIST; + inode_lock(file_inode(file)); + if (!net->ws_cell) + ret = afs_cell_init(net, buf); + else + printk("busy\n"); + inode_unlock(file_inode(file)); out: _leave(" = %d", ret); -- cgit v1.2.3 From 30bca65bbbae13f32ee4f2897c55a496ea8132cf Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 7 Jan 2025 18:34:51 +0000 Subject: afs: Make /afs/@cell and /afs/.@cell symlinks Make /afs/@cell a symlink in the /afs dynamic root to match what other AFS clients do rather than doing a substitution in the dentry name. This has the bonus of being tab-expandable also. Further, provide a /afs/.@cell symlink to point to the dotted cell share. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20250107183454.608451-4-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/dynroot.c | 177 +++++++++++++++++++++++++++++++++------------ include/trace/events/afs.h | 2 + 2 files changed, 131 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index f80a4244b9d2..d8bf52f77d93 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -185,50 +185,6 @@ out: return ret == -ENOENT ? NULL : ERR_PTR(ret); } -/* - * Look up @cell in a dynroot directory. This is a substitution for the - * local cell name for the net namespace. - */ -static struct dentry *afs_lookup_atcell(struct dentry *dentry) -{ - struct afs_cell *cell; - struct afs_net *net = afs_d2net(dentry); - struct dentry *ret; - char *name; - int len; - - if (!net->ws_cell) - return ERR_PTR(-ENOENT); - - ret = ERR_PTR(-ENOMEM); - name = kmalloc(AFS_MAXCELLNAME + 1, GFP_KERNEL); - if (!name) - goto out_p; - - down_read(&net->cells_lock); - cell = net->ws_cell; - if (cell) { - len = cell->name_len; - memcpy(name, cell->name, len + 1); - } - up_read(&net->cells_lock); - - ret = ERR_PTR(-ENOENT); - if (!cell) - goto out_n; - - ret = lookup_one_len(name, dentry->d_parent, len); - - /* We don't want to d_add() the @cell dentry here as we don't want to - * the cached dentry to hide changes to the local cell name. - */ - -out_n: - kfree(name); -out_p: - return ret; -} - /* * Look up an entry in a dynroot directory. */ @@ -247,10 +203,6 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr return ERR_PTR(-ENAMETOOLONG); } - if (dentry->d_name.len == 5 && - memcmp(dentry->d_name.name, "@cell", 5) == 0) - return afs_lookup_atcell(dentry); - return d_splice_alias(afs_try_auto_mntpt(dentry, dir), dentry); } @@ -343,6 +295,131 @@ void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell) _leave(""); } +static void afs_atcell_delayed_put_cell(void *arg) +{ + struct afs_cell *cell = arg; + + afs_put_cell(cell, afs_cell_trace_put_atcell); +} + +/* + * Read @cell or .@cell symlinks. 
+ */ +static const char *afs_atcell_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_cell *cell; + struct afs_net *net = afs_i2net(inode); + const char *name; + bool dotted = vnode->fid.vnode == 3; + + if (!net->ws_cell) + return ERR_PTR(-ENOENT); + + down_read(&net->cells_lock); + + cell = net->ws_cell; + if (dotted) + name = cell->name - 1; + else + name = cell->name; + afs_get_cell(cell, afs_cell_trace_get_atcell); + set_delayed_call(done, afs_atcell_delayed_put_cell, cell); + + up_read(&net->cells_lock); + return name; +} + +static const struct inode_operations afs_atcell_inode_operations = { + .get_link = afs_atcell_get_link, +}; + +/* + * Look up @cell or .@cell in a dynroot directory. This is a substitution for + * the local cell name for the net namespace. + */ +static struct dentry *afs_dynroot_create_symlink(struct dentry *root, const char *name) +{ + struct afs_vnode *vnode; + struct afs_fid fid = { .vnode = 2, .unique = 1, }; + struct dentry *dentry; + struct inode *inode; + + if (name[0] == '.') + fid.vnode = 3; + + dentry = d_alloc_name(root, name); + if (!dentry) + return ERR_PTR(-ENOMEM); + + inode = iget5_locked(dentry->d_sb, fid.vnode, + afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); + if (!inode) { + dput(dentry); + return ERR_PTR(-ENOMEM); + } + + vnode = AFS_FS_I(inode); + + /* there shouldn't be an existing inode */ + if (WARN_ON_ONCE(!(inode->i_state & I_NEW))) { + iput(inode); + dput(dentry); + return ERR_PTR(-EIO); + } + + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 1); + inode->i_size = 0; + inode->i_mode = S_IFLNK | 0555; + inode->i_op = &afs_atcell_inode_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_NOATIME; + + unlock_new_inode(inode); + d_splice_alias(inode, dentry); + return dentry; +} + +/* + * Create @cell and .@cell symlinks. + */ +static int afs_dynroot_symlink(struct afs_net *net) +{ + struct super_block *sb = net->dynroot_sb; + struct dentry *root, *symlink, *dsymlink; + int ret; + + /* Let the ->lookup op do the creation */ + root = sb->s_root; + inode_lock(root->d_inode); + symlink = afs_dynroot_create_symlink(root, "@cell"); + if (IS_ERR(symlink)) { + ret = PTR_ERR(symlink); + goto unlock; + } + + dsymlink = afs_dynroot_create_symlink(root, ".@cell"); + if (IS_ERR(dsymlink)) { + ret = PTR_ERR(dsymlink); + dput(symlink); + goto unlock; + } + + /* Note that we're retaining extra refs on the dentries. */ + symlink->d_fsdata = (void *)1UL; + dsymlink->d_fsdata = (void *)1UL; + ret = 0; +unlock: + inode_unlock(root->d_inode); + return ret; +} + /* * Populate a newly created dynamic root with cell names. 
*/ @@ -355,6 +432,10 @@ int afs_dynroot_populate(struct super_block *sb) mutex_lock(&net->proc_cells_lock); net->dynroot_sb = sb; + ret = afs_dynroot_symlink(net); + if (ret < 0) + goto error; + hlist_for_each_entry(cell, &net->proc_cells, proc_link) { ret = afs_dynroot_mkdir(net, cell); if (ret < 0) diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index a0aed1a428a1..de0e2301a037 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -168,12 +168,14 @@ enum yfs_cm_operation { #define afs_cell_traces \ EM(afs_cell_trace_alloc, "ALLOC ") \ EM(afs_cell_trace_free, "FREE ") \ + EM(afs_cell_trace_get_atcell, "GET atcell") \ EM(afs_cell_trace_get_queue_dns, "GET q-dns ") \ EM(afs_cell_trace_get_queue_manage, "GET q-mng ") \ EM(afs_cell_trace_get_queue_new, "GET q-new ") \ EM(afs_cell_trace_get_vol, "GET vol ") \ EM(afs_cell_trace_insert, "INSERT ") \ EM(afs_cell_trace_manage, "MANAGE ") \ + EM(afs_cell_trace_put_atcell, "PUT atcell") \ EM(afs_cell_trace_put_candidate, "PUT candid") \ EM(afs_cell_trace_put_destroy, "PUT destry") \ EM(afs_cell_trace_put_queue_work, "PUT q-work") \ -- cgit v1.2.3 From 55cf2f4b945f6a6416cc2524ba740b83cc9af25a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 4 Dec 2024 15:07:15 +0300 Subject: binfmt_flat: Fix integer overflow bug on 32 bit systems Most of these sizes and counts are capped at 256MB so the math doesn't result in an integer overflow. The "relocs" count needs to be checked as well. Otherwise on 32bit systems the calculation of "full_data" could be wrong. full_data = data_len + relocs * sizeof(unsigned long); Fixes: c995ee28d29d ("binfmt_flat: prevent kernel dammage from corrupted executable headers") Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Acked-by: Nicolas Pitre Link: https://lore.kernel.org/r/5be17f6c-5338-43be-91ef-650153b975cb@stanley.mountain Signed-off-by: Kees Cook --- fs/binfmt_flat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 390808ce935d..b5b5ca1a44f7 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -478,7 +478,7 @@ static int load_flat_file(struct linux_binprm *bprm, * 28 bits (256 MB) is way more than reasonable in this case. * If some top bits are set we have probable binary corruption. */ - if ((text_len | data_len | bss_len | stack_len | full_data) >> 28) { + if ((text_len | data_len | bss_len | stack_len | relocs | full_data) >> 28) { pr_err("bad header\n"); ret = -ENOEXEC; goto err; -- cgit v1.2.3 From 067cdf020329a07dd8ee1574c3086998343b1b2b Mon Sep 17 00:00:00 2001 From: Eugen Hristev Date: Fri, 10 Jan 2025 14:57:14 +0200 Subject: pstore/zone: avoid dereferencing zero sized ptr after init zones In psz_init_zones, if the requested area has a total_size less than record_size, kcalloc will be called with c == 0 and will return ZERO_SIZE_PTR. Further, this will lead to an oops. 
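For reference, the pre-patch failure path reduces to the following (excerpted and simplified from psz_init_zones(), with explanatory comments added; the exact later dereference site is omitted):

	c = total_size / record_size;                    /* 0 when total_size < record_size         */
	zones = kcalloc(c, sizeof(*zones), GFP_KERNEL);  /* kcalloc(0, ...) returns ZERO_SIZE_PTR    */
	if (!zones) {                                    /* never taken: ZERO_SIZE_PTR is non-NULL  */
		...                                      /* allocation-failure path               */
	}
	/* The zero-sized "array" is handed back to the caller; any later dereference
	 * of it is what produces the oops described above. */
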
With this patch, in this scenario, it will look like this : [ 6.865545] pstore_zone: total size : 28672 Bytes [ 6.865547] pstore_zone: kmsg size : 65536 Bytes [ 6.865549] pstore_zone: pmsg size : 0 Bytes [ 6.865551] pstore_zone: console size : 0 Bytes [ 6.865553] pstore_zone: ftrace size : 0 Bytes [ 6.872095] pstore_zone: zone dmesg total_size too small [ 6.878234] pstore_zone: alloc zones failed Signed-off-by: Eugen Hristev Link: https://lore.kernel.org/r/20250110125714.2594719-1-eugen.hristev@linaro.org Signed-off-by: Kees Cook --- fs/pstore/zone.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 694db616663f..ceb5639a0629 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -1212,6 +1212,11 @@ static struct pstore_zone **psz_init_zones(enum pstore_type_id type, } c = total_size / record_size; + if (unlikely(!c)) { + pr_err("zone %s total_size too small\n", name); + return ERR_PTR(-EINVAL); + } + zones = kcalloc(c, sizeof(*zones), GFP_KERNEL); if (!zones) { pr_err("allocate for zones %s failed\n", name); -- cgit v1.2.3 From 20b1aa912316ffb7fbb5f407f17c330f2a22ddff Mon Sep 17 00:00:00 2001 From: Meetakshi Setiya Date: Wed, 8 Jan 2025 05:10:34 -0500 Subject: smb: client: sync the root session and superblock context passwords before automounting In some cases, when password2 becomes the working password, the client swaps the two password fields in the root session struct, but not in the smb3_fs_context struct in cifs_sb. DFS automounts inherit fs context from their parent mounts. Therefore, they might end up getting the passwords in the stale order. The automount should succeed, because the mount function will end up retrying with the actual password anyway. But to reduce these unnecessary session setup retries for automounts, we can sync the parent context's passwords with the root session's passwords before duplicating it to the child's fs context. Cc: stable@vger.kernel.org Signed-off-by: Meetakshi Setiya Reviewed-by: Shyam Prasad N Acked-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/namespace.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c index 0f788031b740..e3f9213131c4 100644 --- a/fs/smb/client/namespace.c +++ b/fs/smb/client/namespace.c @@ -196,11 +196,28 @@ static struct vfsmount *cifs_do_automount(struct path *path) struct smb3_fs_context tmp; char *full_path; struct vfsmount *mnt; + struct cifs_sb_info *mntpt_sb; + struct cifs_ses *ses; if (IS_ROOT(mntpt)) return ERR_PTR(-ESTALE); - cur_ctx = CIFS_SB(mntpt->d_sb)->ctx; + mntpt_sb = CIFS_SB(mntpt->d_sb); + ses = cifs_sb_master_tcon(mntpt_sb)->ses; + cur_ctx = mntpt_sb->ctx; + + /* + * At this point, the root session should be in the mntpt sb. We should + * bring the sb context passwords in sync with the root session's + * passwords. This would help prevent unnecessary retries and password + * swaps for automounts. + */ + mutex_lock(&ses->session_mutex); + rc = smb3_sync_session_ctx_passwords(mntpt_sb, ses); + mutex_unlock(&ses->session_mutex); + + if (rc) + return ERR_PTR(rc); fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, mntpt); if (IS_ERR(fc)) -- cgit v1.2.3 From ef3675b45bcb6c17cabbbde620c6cea52ffb21ac Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Dec 2024 19:28:52 -0500 Subject: NFSD: Encode COMPOUND operation status on page boundaries J. 
David reports an odd corruption of a READDIR reply sent to a FreeBSD client. xdr_reserve_space() has to do a special trick when the @nbytes value requests more space than there is in the current page of the XDR buffer. In that case, xdr_reserve_space() returns a pointer to the start of the next page, and then the next call to xdr_reserve_space() invokes __xdr_commit_encode() to copy enough of the data item back into the previous page to make that data item contiguous across the page boundary. But we need to be careful in the case where buffer space is reserved early for a data item whose value will be inserted into the buffer later. One such caller, nfsd4_encode_operation(), reserves 8 bytes in the encoding buffer for each COMPOUND operation. However, a READDIR result can sometimes encode file names so that there are only 4 bytes left at the end of the current XDR buffer page (though plenty of pages are left to handle the remaining encoding tasks). If a COMPOUND operation follows the READDIR result (say, a GETATTR), then nfsd4_encode_operation() will reserve 8 bytes for the op number (9) and the op status (usually NFS4_OK). In this weird case, xdr_reserve_space() returns a pointer to byte zero of the next buffer page, as it assumes the data item will be copied back into place (in the previous page) on the next call to xdr_reserve_space(). nfsd4_encode_operation() writes the op num into the buffer, then saves the next 4-byte location for the op's status code. The next xdr_reserve_space() call is part of GETATTR encoding, so the op num gets copied back into the previous page, but the saved location for the op status continues to point to the wrong spot in the current XDR buffer page because __xdr_commit_encode() moved that data item. After GETATTR encoding is complete, nfsd4_encode_operation() writes the op status over the first XDR data item in the GETATTR result. The NFS4_OK status code (0) makes it look like there are zero items in the GETATTR's attribute bitmask. The patch description of commit 2825a7f90753 ("nfsd4: allow encoding across page boundaries") [2014] remarks that NFSD "can't handle a new operation starting close to the end of a page." This bug appears to be one reason for that remark. 
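The hazard generalizes to any append-only buffer whose already-encoded contents can be relocated while encoding continues: a raw pointer saved before further appends can silently go stale, whereas an offset remains valid. A minimal user-space sketch of the pattern (illustrative only; the buffer type, the reserve() helper and all names are invented here and are not the kernel XDR code):

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct buf { uint8_t *data; size_t len, cap; };

	/* Reserve n bytes at the tail; growing may relocate everything already written. */
	static uint8_t *reserve(struct buf *b, size_t n)
	{
		if (b->len + n > b->cap) {
			b->cap = (b->len + n) * 2;
			b->data = realloc(b->data, b->cap);  /* old pointers into b->data are now stale */
		}
		uint8_t *p = b->data + b->len;
		b->len += n;
		return p;
	}

	int main(void)
	{
		struct buf b = { .data = malloc(8), .len = 0, .cap = 8 };

		/* Unsafe: keep a raw pointer to a 4-byte slot we intend to fill in later. */
		uint8_t *status_slot = reserve(&b, 4);

		/* Safe: remember the same slot as an offset instead. */
		size_t status_off = b.len - 4;

		memset(reserve(&b, 64), 0xaa, 64);   /* growth relocates the buffer */

		/* status_slot may now point into freed memory; the offset is still correct. */
		memcpy(b.data + status_off, "\0\0\0\0", 4);
		(void)status_slot;

		printf("encoded %zu bytes\n", b.len);
		free(b.data);
		return 0;
	}

The fix below applies exactly this change: nfsd4_encode_operation() records op_status_offset and patches the status via write_bytes_to_xdr_buf() instead of writing through a pointer returned by an earlier xdr_reserve_space().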
Reported-by: J David Closes: https://lore.kernel.org/linux-nfs/3998d739-c042-46b4-8166-dbd6c5f0e804@oracle.com/T/#t Tested-by: Rick Macklem Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Cc: stable@vger.kernel.org Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 4dcb03cd9292..b2243b5dbf18 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -5761,15 +5761,14 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) struct nfs4_stateowner *so = resp->cstate.replay_owner; struct svc_rqst *rqstp = resp->rqstp; const struct nfsd4_operation *opdesc = op->opdesc; - int post_err_offset; + unsigned int op_status_offset; nfsd4_enc encoder; - __be32 *p; - p = xdr_reserve_space(xdr, 8); - if (!p) + if (xdr_stream_encode_u32(xdr, op->opnum) != XDR_UNIT) + goto release; + op_status_offset = xdr->buf->len; + if (!xdr_reserve_space(xdr, XDR_UNIT)) goto release; - *p++ = cpu_to_be32(op->opnum); - post_err_offset = xdr->buf->len; if (op->opnum == OP_ILLEGAL) goto status; @@ -5810,20 +5809,21 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) * bug if we had to do this on a non-idempotent op: */ warn_on_nonidempotent_op(op); - xdr_truncate_encode(xdr, post_err_offset); + xdr_truncate_encode(xdr, op_status_offset + XDR_UNIT); } if (so) { - int len = xdr->buf->len - post_err_offset; + int len = xdr->buf->len - (op_status_offset + XDR_UNIT); so->so_replay.rp_status = op->status; so->so_replay.rp_buflen = len; - read_bytes_from_xdr_buf(xdr->buf, post_err_offset, + read_bytes_from_xdr_buf(xdr->buf, op_status_offset + XDR_UNIT, so->so_replay.rp_buf, len); } status: op->status = nfsd4_map_status(op->status, resp->cstate.minorversion); - *p = op->status; + write_bytes_to_xdr_buf(xdr->buf, op_status_offset, + &op->status, XDR_UNIT); release: if (opdesc && opdesc->op_release) opdesc->op_release(&op->u); -- cgit v1.2.3 From 1a861150bd6a69ea14da7a0d752da2b442e6a5dc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Dec 2024 19:28:53 -0500 Subject: NFSD: Insulate nfsd4_encode_read() from page boundaries in the encode buffer Commit 28d5bc468efe ("NFSD: Optimize nfsd4_encode_readv()") replaced the use of write_bytes_to_xdr_buf() because it's expensive and the data items to be encoded are already properly aligned. However, the current code will corrupt the encoded data if the XDR data items that are reserved early and then poked into the XDR buffer later happen to fall on a page boundary in the XDR encoding buffer. __xdr_commit_encode can shift encoded data items in the encoding buffer so that pointers returned from xdr_reserve_space() no longer address the same part of the encoding stream. This isn't an issue for splice reads because the reserved encode buffer areas must fall in the XDR buffers header for the splice to work without error. For vectored reads, however, there is a possibility of send buffer corruption in rare cases. 
Fixes: 28d5bc468efe ("NFSD: Optimize nfsd4_encode_readv()") Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b2243b5dbf18..2399b17db052 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4317,6 +4317,15 @@ static __be32 nfsd4_encode_splice_read( int status, space_left; __be32 nfserr; + /* + * Splice read doesn't work if encoding has already wandered + * into the XDR buf's page array. + */ + if (unlikely(xdr->buf->page_len)) { + WARN_ON_ONCE(1); + return nfserr_serverfault; + } + /* * Make sure there is room at the end of buf->head for * svcxdr_encode_opaque_pages() to create a tail buffer @@ -4399,25 +4408,23 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_compoundargs *argp = resp->rqstp->rq_argp; struct nfsd4_read *read = &u->read; struct xdr_stream *xdr = resp->xdr; - int starting_len = xdr->buf->len; bool splice_ok = argp->splice_ok; + unsigned int eof_offset; unsigned long maxcount; + __be32 wire_data[2]; struct file *file; - __be32 *p; if (nfserr) return nfserr; + + eof_offset = xdr->buf->len; file = read->rd_nf->nf_file; - p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ - if (!p) { + /* Reserve space for the eof flag and byte count */ + if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT * 2))) { WARN_ON_ONCE(splice_ok); return nfserr_resource; } - if (resp->xdr->buf->page_len && splice_ok) { - WARN_ON_ONCE(1); - return nfserr_serverfault; - } xdr_commit_encode(xdr); maxcount = min_t(unsigned long, read->rd_length, @@ -4428,12 +4435,13 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, else nfserr = nfsd4_encode_readv(resp, read, file, maxcount); if (nfserr) { - xdr_truncate_encode(xdr, starting_len); + xdr_truncate_encode(xdr, eof_offset); return nfserr; } - p = xdr_encode_bool(p, read->rd_eof); - *p = cpu_to_be32(read->rd_length); + wire_data[0] = read->rd_eof ? xdr_one : xdr_zero; + wire_data[1] = cpu_to_be32(read->rd_length); + write_bytes_to_xdr_buf(xdr->buf, eof_offset, &wire_data, XDR_UNIT * 2); return nfs_ok; } @@ -5304,10 +5312,6 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, p = xdr_reserve_space(xdr, 4 + 8 + 4); if (!p) return nfserr_io; - if (resp->xdr->buf->page_len && splice_ok) { - WARN_ON_ONCE(splice_ok); - return nfserr_serverfault; - } maxcount = min_t(unsigned long, read->rd_length, (xdr->buf->buflen - xdr->buf->len)); -- cgit v1.2.3 From c9fc7772bacb28a8bd8efb08399c5af7217fbbb7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Dec 2024 19:28:54 -0500 Subject: NFSD: Insulate nfsd4_encode_read_plus() from page boundaries in the encode buffer Commit eeadcb757945 ("NFSD: Simplify READ_PLUS") replaced the use of write_bytes_to_xdr_buf(), copying what was in nfsd4_encode_read() at the time. However, the current code will corrupt the encoded data if the XDR data items that are reserved early and then poked into the XDR buffer later happen to fall on a page boundary in the XDR encoding buffer. __xdr_commit_encode can shift encoded data items in the encoding buffer so that pointers returned from xdr_reserve_space() no longer address the same part of the encoding stream. 
Fixes: eeadcb757945 ("NFSD: Simplify READ_PLUS") Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2399b17db052..474fd663b0a9 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -5337,16 +5337,17 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_read *read = &u->read; struct file *file = read->rd_nf->nf_file; struct xdr_stream *xdr = resp->xdr; - int starting_len = xdr->buf->len; + unsigned int eof_offset; + __be32 wire_data[2]; u32 segments = 0; - __be32 *p; if (nfserr) return nfserr; - /* eof flag, segment count */ - p = xdr_reserve_space(xdr, 4 + 4); - if (!p) + eof_offset = xdr->buf->len; + + /* Reserve space for the eof flag and segment count */ + if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT * 2))) return nfserr_io; xdr_commit_encode(xdr); @@ -5356,15 +5357,16 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, nfserr = nfsd4_encode_read_plus_data(resp, read); if (nfserr) { - xdr_truncate_encode(xdr, starting_len); + xdr_truncate_encode(xdr, eof_offset); return nfserr; } segments++; out: - p = xdr_encode_bool(p, read->rd_eof); - *p = cpu_to_be32(segments); + wire_data[0] = read->rd_eof ? xdr_one : xdr_zero; + wire_data[1] = cpu_to_be32(segments); + write_bytes_to_xdr_buf(xdr->buf, eof_offset, &wire_data, XDR_UNIT * 2); return nfserr; } -- cgit v1.2.3 From 26ea81638fa0fb9c02b76775301995722368356a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Dec 2024 19:28:55 -0500 Subject: NFSD: Insulate nfsd4_encode_read_plus_data() from page boundaries in the encode buffer Commit eeadcb757945 ("NFSD: Simplify READ_PLUS") replaced the use of write_bytes_to_xdr_buf(), copying what was in nfsd4_encode_read() at the time. However, the current code will corrupt the encoded data if the XDR data items that are reserved early and then poked into the XDR buffer later happen to fall on a page boundary in the XDR encoding buffer. __xdr_commit_encode can shift encoded data items in the encoding buffer so that pointers returned from xdr_reserve_space() no longer address the same part of the encoding stream. 
Fixes: eeadcb757945 ("NFSD: Simplify READ_PLUS") Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 474fd663b0a9..3a68bed1829d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -5305,13 +5305,20 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct file *file = read->rd_nf->nf_file; struct xdr_stream *xdr = resp->xdr; bool splice_ok = argp->splice_ok; + unsigned int offset_offset; + __be32 nfserr, wire_count; unsigned long maxcount; - __be32 nfserr, *p; + __be64 wire_offset; - /* Content type, offset, byte count */ - p = xdr_reserve_space(xdr, 4 + 8 + 4); - if (!p) + if (xdr_stream_encode_u32(xdr, NFS4_CONTENT_DATA) != XDR_UNIT) + return nfserr_io; + + offset_offset = xdr->buf->len; + + /* Reserve space for the byte offset and count */ + if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT * 3))) return nfserr_io; + xdr_commit_encode(xdr); maxcount = min_t(unsigned long, read->rd_length, (xdr->buf->buflen - xdr->buf->len)); @@ -5323,10 +5330,12 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, if (nfserr) return nfserr; - *p++ = cpu_to_be32(NFS4_CONTENT_DATA); - p = xdr_encode_hyper(p, read->rd_offset); - *p = cpu_to_be32(read->rd_length); - + wire_offset = cpu_to_be64(read->rd_offset); + write_bytes_to_xdr_buf(xdr->buf, offset_offset, &wire_offset, + XDR_UNIT * 2); + wire_count = cpu_to_be32(read->rd_length); + write_bytes_to_xdr_buf(xdr->buf, offset_offset + XDR_UNIT * 2, + &wire_count, XDR_UNIT); return nfs_ok; } -- cgit v1.2.3 From 201cb2048a926b041f80cbd7331de77b87867940 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Dec 2024 19:28:57 -0500 Subject: NFSD: Insulate nfsd4_encode_readlink() from page boundaries in the encode buffer There's no guarantee that the pointer returned from xdr_reserve_space() will still point to the correct reserved space in the encode buffer after one or more intervening calls to xdr_reserve_space(). It just happens to work with the current implementation of xdr_reserve_space(). Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 3a68bed1829d..2116a03830c3 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4450,25 +4450,21 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_readlink *readlink = &u->readlink; - __be32 *p, *maxcount_p, zero = xdr_zero; + __be32 *p, wire_count, zero = xdr_zero; struct xdr_stream *xdr = resp->xdr; - int length_offset = xdr->buf->len; + unsigned int length_offset; int maxcount, status; - maxcount_p = xdr_reserve_space(xdr, XDR_UNIT); - if (!maxcount_p) + /* linktext4.count */ + length_offset = xdr->buf->len; + if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT))) return nfserr_resource; - maxcount = PAGE_SIZE; + /* linktext4.data */ + maxcount = PAGE_SIZE; p = xdr_reserve_space(xdr, maxcount); if (!p) return nfserr_resource; - /* - * XXX: By default, vfs_readlink() will truncate symlinks if they - * would overflow the buffer. Is this kosher in NFSv4? If not, one - * easy fix is: if vfs_readlink() precisely fills the buffer, assume - * that truncation occurred, and return NFS4ERR_RESOURCE. 
- */ nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, (char *)p, &maxcount); if (nfserr == nfserr_isdir) @@ -4481,7 +4477,9 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, nfserr = nfserrno(status); goto out_err; } - *maxcount_p = cpu_to_be32(maxcount); + + wire_count = cpu_to_be32(maxcount); + write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, XDR_UNIT); xdr_truncate_encode(xdr, length_offset + 4 + xdr_align_size(maxcount)); write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, &zero, xdr_pad_size(maxcount)); -- cgit v1.2.3 From 825562bc7d5948723511046686217829dbeb067f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Dec 2024 19:28:58 -0500 Subject: NFSD: Refactor nfsd4_do_encode_secinfo() again Extract the code that encodes the secinfo4 union data type to clarify the logic. The removed warning is pretty well obscured and thus probably not terribly useful. Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 65 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2116a03830c3..5cc365a3ed0d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4613,13 +4613,41 @@ nfsd4_encode_rpcsec_gss_info(struct xdr_stream *xdr, return nfs_ok; } +static __be32 +nfsd4_encode_secinfo4(struct xdr_stream *xdr, rpc_authflavor_t pf, + u32 *supported) +{ + struct rpcsec_gss_info info; + __be32 status; + + if (rpcauth_get_gssinfo(pf, &info) == 0) { + (*supported)++; + + /* flavor */ + status = nfsd4_encode_uint32_t(xdr, RPC_AUTH_GSS); + if (status != nfs_ok) + return status; + /* flavor_info */ + status = nfsd4_encode_rpcsec_gss_info(xdr, &info); + if (status != nfs_ok) + return status; + } else if (pf < RPC_AUTH_MAXFLAVOR) { + (*supported)++; + + /* flavor */ + status = nfsd4_encode_uint32_t(xdr, pf); + if (status != nfs_ok) + return status; + } + return nfs_ok; +} + static __be32 nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) { u32 i, nflavs, supported; struct exp_flavor_info *flavs; struct exp_flavor_info def_flavs[2]; - static bool report = true; __be32 *flavorsp; __be32 status; @@ -4643,42 +4671,17 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) } } - supported = 0; flavorsp = xdr_reserve_space(xdr, XDR_UNIT); if (!flavorsp) return nfserr_resource; - for (i = 0; i < nflavs; i++) { - rpc_authflavor_t pf = flavs[i].pseudoflavor; - struct rpcsec_gss_info info; - - if (rpcauth_get_gssinfo(pf, &info) == 0) { - supported++; - - /* flavor */ - status = nfsd4_encode_uint32_t(xdr, RPC_AUTH_GSS); - if (status != nfs_ok) - return status; - /* flavor_info */ - status = nfsd4_encode_rpcsec_gss_info(xdr, &info); - if (status != nfs_ok) - return status; - } else if (pf < RPC_AUTH_MAXFLAVOR) { - supported++; - - /* flavor */ - status = nfsd4_encode_uint32_t(xdr, pf); - if (status != nfs_ok) - return status; - } else { - if (report) - pr_warn("NFS: SECINFO: security flavor %u " - "is not supported\n", pf); - } + for (i = 0, supported = 0; i < nflavs; i++) { + status = nfsd4_encode_secinfo4(xdr, flavs[i].pseudoflavor, + &supported); + if (status != nfs_ok) + return status; } - if (nflavs != supported) - report = false; *flavorsp = cpu_to_be32(supported); return 0; } -- cgit v1.2.3 From b786caa65d4baa73257487e27ebab9004dc768d1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Dec 2024 19:28:59 -0500 Subject: NFSD: Insulate 
nfsd4_encode_secinfo() from page boundaries in the encode buffer There's no guarantee that the pointer returned from xdr_reserve_space() will still point to the correct reserved space in the encode buffer after one or more intervening calls to xdr_reserve_space(). It just happens to work with the current implementation of xdr_reserve_space(). Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5cc365a3ed0d..ae4c6c0060bf 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4643,13 +4643,13 @@ nfsd4_encode_secinfo4(struct xdr_stream *xdr, rpc_authflavor_t pf, } static __be32 -nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) +nfsd4_encode_SECINFO4resok(struct xdr_stream *xdr, struct svc_export *exp) { u32 i, nflavs, supported; struct exp_flavor_info *flavs; struct exp_flavor_info def_flavs[2]; - __be32 *flavorsp; - __be32 status; + unsigned int count_offset; + __be32 status, wire_count; if (exp->ex_nflavors) { flavs = exp->ex_flavors; @@ -4671,8 +4671,8 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) } } - flavorsp = xdr_reserve_space(xdr, XDR_UNIT); - if (!flavorsp) + count_offset = xdr->buf->len; + if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT))) return nfserr_resource; for (i = 0, supported = 0; i < nflavs; i++) { @@ -4682,7 +4682,9 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) return status; } - *flavorsp = cpu_to_be32(supported); + wire_count = cpu_to_be32(supported); + write_bytes_to_xdr_buf(xdr->buf, count_offset, &wire_count, + XDR_UNIT); return 0; } @@ -4693,7 +4695,7 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo *secinfo = &u->secinfo; struct xdr_stream *xdr = resp->xdr; - return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp); + return nfsd4_encode_SECINFO4resok(xdr, secinfo->si_exp); } static __be32 @@ -4703,7 +4705,7 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo_no_name *secinfo = &u->secinfo_no_name; struct xdr_stream *xdr = resp->xdr; - return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp); + return nfsd4_encode_SECINFO4resok(xdr, secinfo->sin_exp); } static __be32 -- cgit v1.2.3 From 4163ee711cf141ada9e884a94e6e329431547fd7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Dec 2024 19:28:56 -0500 Subject: NFSD: Insulate nfsd4_encode_fattr4() from page boundaries in the encode buffer Commit ab04de60ae1c ("NFSD: Optimize nfsd4_encode_fattr()") replaced the use of write_bytes_to_xdr_buf() because it's expensive and the data items to be encoded are already properly aligned. However, there's no guarantee that the pointer returned from xdr_reserve_space() will still point to the correct reserved space in the encode buffer after one or more intervening calls to xdr_reserve_space(). It just happens to work with the current implementation of xdr_reserve_space(). This commit effectively reverts the optimization. 
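The resulting code follows the usual length-placeholder idiom: reserve the length field, remember only its offset, encode the variable-length body, then patch the length by offset. A self-contained sketch of the idiom (plain user-space C with invented names, not the nfsd code):

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	static uint8_t out[256];
	static size_t out_len;

	/* Append n bytes of space and return the offset at which they start. */
	static size_t reserve(size_t n) { size_t off = out_len; out_len += n; return off; }

	int main(void)
	{
		const char payload[] = "variable-length attribute data";

		/* Reserve the 4-byte length first, keeping only its offset. */
		size_t len_off = reserve(4);

		/* Encode the body; in nfsd this is a long chain of per-attribute encoders. */
		memcpy(out + reserve(sizeof(payload) - 1), payload, sizeof(payload) - 1);

		/* Patch the length afterwards, addressing the slot by offset only. */
		uint32_t be_len = htonl((uint32_t)(out_len - (len_off + 4)));
		memcpy(out + len_off, &be_len, sizeof(be_len));

		printf("total %zu bytes, body %u bytes\n", out_len, (unsigned)ntohl(be_len));
		return 0;
	}
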
Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index ae4c6c0060bf..8d373d2edff1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3507,8 +3507,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct nfsd4_fattr_args args; struct svc_fh *tempfh = NULL; int starting_len = xdr->buf->len; - __be32 *attrlen_p, status; - int attrlen_offset; + unsigned int attrlen_offset; + __be32 attrlen, status; u32 attrmask[3]; int err; struct nfsd4_compoundres *resp = rqstp->rq_resp; @@ -3629,8 +3629,7 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, /* attr_vals */ attrlen_offset = xdr->buf->len; - attrlen_p = xdr_reserve_space(xdr, XDR_UNIT); - if (!attrlen_p) + if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT))) goto out_resource; bitmap_from_arr32(attr_bitmap, attrmask, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)); @@ -3640,7 +3639,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (status != nfs_ok) goto out; } - *attrlen_p = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT); + attrlen = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT); + write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, XDR_UNIT); status = nfs_ok; out: -- cgit v1.2.3 From 60a6002432448bb3f291d80768ae98d62efc9c77 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Jan 2025 01:37:44 -0500 Subject: hostfs: fix string handling in __dentry_name() strcpy() should not be used with destination potentially overlapping the source; what's more, strscpy() in there is pointless - we already know the amount we want to copy; might as well use memcpy(). Fixes: c278e81b8a02 "hostfs: Remove open coded strcpy()" Signed-off-by: Al Viro --- fs/hostfs/hostfs_kern.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 7e51d2cec64b..bd6503b73142 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -95,32 +95,17 @@ __uml_setup("hostfs=", hostfs_args, static char *__dentry_name(struct dentry *dentry, char *name) { char *p = dentry_path_raw(dentry, name, PATH_MAX); - char *root; - size_t len; - struct hostfs_fs_info *fsi; - - fsi = dentry->d_sb->s_fs_info; - root = fsi->host_root_path; - len = strlen(root); - if (IS_ERR(p)) { - __putname(name); - return NULL; - } - - /* - * This function relies on the fact that dentry_path_raw() will place - * the path name at the end of the provided buffer. - */ - BUG_ON(p + strlen(p) + 1 != name + PATH_MAX); + struct hostfs_fs_info *fsi = dentry->d_sb->s_fs_info; + char *root = fsi->host_root_path; + size_t len = strlen(root); - strscpy(name, root, PATH_MAX); - if (len > p - name) { + if (IS_ERR(p) || len > p - name) { __putname(name); return NULL; } - if (p > name + len) - strcpy(name + len, p); + memcpy(name, root, len); + memmove(name + len, p, name + PATH_MAX - p); return name; } -- cgit v1.2.3 From cbc5dde0a461240046e8a41c43d7c3b76d5db952 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 10 Jan 2025 10:28:21 -0500 Subject: fs/proc: fix softlockup in __read_vmcore (part 2) Since commit 5cbcb62dddf5 ("fs/proc: fix softlockup in __read_vmcore") the number of softlockups in __read_vmcore at kdump time have gone down, but they still happen sometimes. 
In a memory constrained environment like the kdump image, a softlockup is not just a harmless message, but it can interfere with things like RCU freeing memory, causing the crashdump to get stuck. The second loop in __read_vmcore has a lot more opportunities for natural sleep points, like scheduling out while waiting for a data write to happen, but apparently that is not always enough. Add a cond_resched() to the second loop in __read_vmcore to (hopefully) get rid of the softlockups. Link: https://lkml.kernel.org/r/20250110102821.2a37581b@fangorn Fixes: 5cbcb62dddf5 ("fs/proc: fix softlockup in __read_vmcore") Signed-off-by: Rik van Riel Reported-by: Breno Leitao Cc: Baoquan He Cc: Dave Young Cc: Vivek Goyal Cc: Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 3d8a82cee63e..658bf199d424 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -404,6 +404,8 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) if (!iov_iter_count(iter)) return acc; } + + cond_resched(); } return acc; -- cgit v1.2.3 From f5ea0319ef4f05afe24645cbec9155c2b44d5174 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Tue, 10 Dec 2024 22:02:48 +0000 Subject: ceph: convert timeouts to secs_to_jiffies() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b35108a51cf7 ("jiffies: Define secs_to_jiffies()") introduced secs_to_jiffies(). As the value here is a multiple of 1000, use secs_to_jiffies() instead of msecs_to_jiffies to avoid the multiplication. This is converted using scripts/coccinelle/misc/secs_to_jiffies.cocci with the following Coccinelle rules: @@ constant C; @@ - msecs_to_jiffies(C * 1000) + secs_to_jiffies(C) @@ constant C; @@ - msecs_to_jiffies(C * MSEC_PER_SEC) + secs_to_jiffies(C) Link: https://lkml.kernel.org/r/20241210-converge-secs-to-jiffies-v3-17-ddfefd7e9f2a@linux.microsoft.com Signed-off-by: Easwar Hariharan Cc: Alexander Gordeev Cc: Andrew Lunn Cc: Anna-Maria Behnsen Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Mack Cc: David Airlie Cc: David S. Miller Cc: Dick Kennedy Cc: Eric Dumazet Cc: Florian Fainelli Cc: Greg Kroah-Hartman Cc: Haojian Zhuang Cc: Heiko Carstens Cc: Ilya Dryomov Cc: Jack Wang Cc: Jakub Kicinski Cc: James Bottomley Cc: James Smart Cc: Jaroslav Kysela Cc: Jeff Johnson Cc: Jeff Johnson Cc: Jens Axboe Cc: Jeroen de Borst Cc: Jiri Kosina Cc: Joe Lawrence Cc: Johan Hedberg Cc: Josh Poimboeuf Cc: Jozsef Kadlecsik Cc: Julia Lawall Cc: Kalle Valo Cc: Louis Peens Cc: Lucas De Marchi Cc: Luiz Augusto von Dentz Cc: Maarten Lankhorst Cc: Madhavan Srinivasan Cc: Marcel Holtmann Cc: Martin K. 
Petersen Cc: Maxime Ripard Cc: Michael Ellerman Cc: Miroslav Benes Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Nicolas Palix Cc: Oded Gabbay Cc: Ofir Bitton Cc: Pablo Neira Ayuso Cc: Paolo Abeni Cc: Petr Mladek Cc: Praveen Kaligineedi Cc: Ray Jui Cc: Robert Jarzmik Cc: Rodrigo Vivi Cc: Roger Pau Monné Cc: Russell King Cc: Scott Branden Cc: Shailend Chand Cc: Simona Vetter Cc: Simon Horman Cc: Sven Schnelle Cc: Takashi Iwai Cc: Thomas Hellström Cc: Thomas Zimmermann Cc: Vasily Gorbik Cc: Xiubo Li Signed-off-by: Andrew Morton --- fs/ceph/quota.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 06ee397e0c3a..d90eda19bcc4 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -166,7 +166,7 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc, if (IS_ERR(in)) { doutc(cl, "Can't lookup inode %llx (err: %ld)\n", realm->ino, PTR_ERR(in)); - qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */ + qri->timeout = jiffies + secs_to_jiffies(60); /* XXX */ } else { qri->timeout = 0; qri->inode = in; -- cgit v1.2.3 From 94e0c289020a2abafc6bfbb473b51857eed00943 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 28 Oct 2024 09:41:14 -0500 Subject: dlmfs: convert to the new mount API Patch series "ocfs2, dlmfs: convert to the new mount API". This patch (of 2): Convert dlmfs to the new mount API. Link: https://lkml.kernel.org/r/20241028144443.609151-1-sandeen@redhat.com Link: https://lkml.kernel.org/r/20241028144443.609151-2-sandeen@redhat.com Signed-off-by: Eric Sandeen Reviewed-by: Goldwyn Rodrigues Tested-by: Goldwyn Rodrigues Acked-by: Joseph Qi Tested-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dlmfs/dlmfs.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 7fc0e920eda7..2a7f36643895 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -506,9 +507,7 @@ bail: return status; } -static int dlmfs_fill_super(struct super_block * sb, - void * data, - int silent) +static int dlmfs_fill_super(struct super_block *sb, struct fs_context *fc) { sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; @@ -556,17 +555,27 @@ static const struct inode_operations dlmfs_file_inode_operations = { .setattr = dlmfs_file_setattr, }; -static struct dentry *dlmfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int dlmfs_get_tree(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, dlmfs_fill_super); + return get_tree_nodev(fc, dlmfs_fill_super); +} + +static const struct fs_context_operations dlmfs_context_ops = { + .get_tree = dlmfs_get_tree, +}; + +static int dlmfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &dlmfs_context_ops; + + return 0; } static struct file_system_type dlmfs_fs_type = { .owner = THIS_MODULE, .name = "ocfs2_dlmfs", - .mount = dlmfs_mount, .kill_sb = kill_litter_super, + .init_fs_context = dlmfs_init_fs_context, }; MODULE_ALIAS_FS("ocfs2_dlmfs"); -- cgit v1.2.3 From 9be53fe697eccf45e5c37e10ffd399eeca5eb489 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 28 Oct 2024 09:41:15 -0500 Subject: ocfs2: convert to the new mount API Convert ocfs2 to the new mount API. 
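As with the dlmfs patch above, the conversion centers on providing an init_fs_context hook plus fs_context_operations in place of the legacy .mount callback. The skeleton of such a conversion, for a hypothetical "examplefs" (a sketch only, not the ocfs2 code):

	#include <linux/fs.h>
	#include <linux/fs_context.h>
	#include <linux/module.h>

	static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc)
	{
		/* Set s_op, s_blocksize, allocate the root inode and s_root here. */
		return 0;
	}

	static int examplefs_get_tree(struct fs_context *fc)
	{
		return get_tree_nodev(fc, examplefs_fill_super);
	}

	static const struct fs_context_operations examplefs_context_ops = {
		.get_tree = examplefs_get_tree,
	};

	static int examplefs_init_fs_context(struct fs_context *fc)
	{
		/* Allocate and stash per-mount option state in fc->fs_private if needed. */
		fc->ops = &examplefs_context_ops;
		return 0;
	}

	/* Registered with register_filesystem() at module init, as usual. */
	static struct file_system_type examplefs_fs_type = {
		.owner           = THIS_MODULE,
		.name            = "examplefs",
		.init_fs_context = examplefs_init_fs_context,
		.kill_sb         = kill_litter_super,
	};

Option parsing then moves from a match_table_t into a struct fs_parameter_spec table consumed by fs_parse(), as the ocfs2_param_spec and ocfs2_parse_param hunks below show.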
Link: https://lkml.kernel.org/r/20241028144443.609151-3-sandeen@redhat.com Signed-off-by: Eric Sandeen Reviewed-by: Goldwyn Rodrigues Tested-by: Goldwyn Rodrigues Acked-by: Joseph Qi Tested-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/ocfs2_trace.h | 20 +- fs/ocfs2/super.c | 589 ++++++++++++++++++++++--------------------------- 2 files changed, 277 insertions(+), 332 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 0511c69c9fde..54ed1495de9a 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1658,34 +1658,34 @@ TRACE_EVENT(ocfs2_remount, ); TRACE_EVENT(ocfs2_fill_super, - TP_PROTO(void *sb, void *data, int silent), - TP_ARGS(sb, data, silent), + TP_PROTO(void *sb, void *fc, int silent), + TP_ARGS(sb, fc, silent), TP_STRUCT__entry( __field(void *, sb) - __field(void *, data) + __field(void *, fc) __field(int, silent) ), TP_fast_assign( __entry->sb = sb; - __entry->data = data; + __entry->fc = fc; __entry->silent = silent; ), TP_printk("%p %p %d", __entry->sb, - __entry->data, __entry->silent) + __entry->fc, __entry->silent) ); TRACE_EVENT(ocfs2_parse_options, - TP_PROTO(int is_remount, char *options), - TP_ARGS(is_remount, options), + TP_PROTO(int is_remount, const char *option), + TP_ARGS(is_remount, option), TP_STRUCT__entry( __field(int, is_remount) - __string(options, options) + __string(option, option) ), TP_fast_assign( __entry->is_remount = is_remount; - __assign_str(options); + __assign_str(option); ), - TP_printk("%d %s", __entry->is_remount, __get_str(options)) + TP_printk("%d %s", __entry->is_remount, __get_str(option)) ); DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c79b4291777f..4319cb2bf298 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -19,10 +19,10 @@ #include #include #include -#include +#include +#include #include #include -#include #include #include #include @@ -80,17 +80,15 @@ struct mount_options unsigned int resv_level; int dir_resv_level; char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + bool user_stack; }; -static int ocfs2_parse_options(struct super_block *sb, char *options, - struct mount_options *mopt, - int is_remount); +static int ocfs2_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options); static int ocfs2_show_options(struct seq_file *s, struct dentry *root); static void ocfs2_put_super(struct super_block *sb); static int ocfs2_mount_volume(struct super_block *sb); -static int ocfs2_remount(struct super_block *sb, int *flags, char *data); static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); static int ocfs2_initialize_mem_caches(void); static void ocfs2_free_mem_caches(void); @@ -135,7 +133,6 @@ static const struct super_operations ocfs2_sops = { .evict_inode = ocfs2_evict_inode, .sync_fs = ocfs2_sync_fs, .put_super = ocfs2_put_super, - .remount_fs = ocfs2_remount, .show_options = ocfs2_show_options, .quota_read = ocfs2_quota_read, .quota_write = ocfs2_quota_write, @@ -144,15 +141,10 @@ static const struct super_operations ocfs2_sops = { enum { Opt_barrier, - Opt_err_panic, - Opt_err_ro, + Opt_errors, Opt_intr, - Opt_nointr, - Opt_hb_none, - Opt_hb_local, - Opt_hb_global, - Opt_data_ordered, - Opt_data_writeback, + Opt_heartbeat, + Opt_data, Opt_atime_quantum, Opt_slot, Opt_commit, @@ -160,52 +152,64 @@ enum { Opt_localflocks, 
Opt_stack, Opt_user_xattr, - Opt_nouser_xattr, Opt_inode64, Opt_acl, - Opt_noacl, Opt_usrquota, Opt_grpquota, - Opt_coherency_buffered, - Opt_coherency_full, + Opt_coherency, Opt_resv_level, Opt_dir_resv_level, Opt_journal_async_commit, - Opt_err_cont, - Opt_err, }; -static const match_table_t tokens = { - {Opt_barrier, "barrier=%u"}, - {Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_intr, "intr"}, - {Opt_nointr, "nointr"}, - {Opt_hb_none, OCFS2_HB_NONE}, - {Opt_hb_local, OCFS2_HB_LOCAL}, - {Opt_hb_global, OCFS2_HB_GLOBAL}, - {Opt_data_ordered, "data=ordered"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_atime_quantum, "atime_quantum=%u"}, - {Opt_slot, "preferred_slot=%u"}, - {Opt_commit, "commit=%u"}, - {Opt_localalloc, "localalloc=%d"}, - {Opt_localflocks, "localflocks"}, - {Opt_stack, "cluster_stack=%s"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_inode64, "inode64"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_usrquota, "usrquota"}, - {Opt_grpquota, "grpquota"}, - {Opt_coherency_buffered, "coherency=buffered"}, - {Opt_coherency_full, "coherency=full"}, - {Opt_resv_level, "resv_level=%u"}, - {Opt_dir_resv_level, "dir_resv_level=%u"}, - {Opt_journal_async_commit, "journal_async_commit"}, - {Opt_err_cont, "errors=continue"}, - {Opt_err, NULL} +static const struct constant_table ocfs2_param_errors[] = { + {"panic", OCFS2_MOUNT_ERRORS_PANIC}, + {"remount-ro", OCFS2_MOUNT_ERRORS_ROFS}, + {"continue", OCFS2_MOUNT_ERRORS_CONT}, + {} +}; + +static const struct constant_table ocfs2_param_heartbeat[] = { + {"local", OCFS2_MOUNT_HB_LOCAL}, + {"none", OCFS2_MOUNT_HB_NONE}, + {"global", OCFS2_MOUNT_HB_GLOBAL}, + {} +}; + +static const struct constant_table ocfs2_param_data[] = { + {"writeback", OCFS2_MOUNT_DATA_WRITEBACK}, + {"ordered", 0}, + {} +}; + +static const struct constant_table ocfs2_param_coherency[] = { + {"buffered", OCFS2_MOUNT_COHERENCY_BUFFERED}, + {"full", 0}, + {} +}; + +static const struct fs_parameter_spec ocfs2_param_spec[] = { + fsparam_u32 ("barrier", Opt_barrier), + fsparam_enum ("errors", Opt_errors, ocfs2_param_errors), + fsparam_flag_no ("intr", Opt_intr), + fsparam_enum ("heartbeat", Opt_heartbeat, ocfs2_param_heartbeat), + fsparam_enum ("data", Opt_data, ocfs2_param_data), + fsparam_u32 ("atime_quantum", Opt_atime_quantum), + fsparam_u32 ("preferred_slot", Opt_slot), + fsparam_u32 ("commit", Opt_commit), + fsparam_s32 ("localalloc", Opt_localalloc), + fsparam_flag ("localflocks", Opt_localflocks), + fsparam_string ("cluster_stack", Opt_stack), + fsparam_flag_no ("user_xattr", Opt_user_xattr), + fsparam_flag ("inode64", Opt_inode64), + fsparam_flag_no ("acl", Opt_acl), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_enum ("coherency", Opt_coherency, ocfs2_param_coherency), + fsparam_u32 ("resv_level", Opt_resv_level), + fsparam_u32 ("dir_resv_level", Opt_dir_resv_level), + fsparam_flag ("journal_async_commit", Opt_journal_async_commit), + {} }; #ifdef CONFIG_DEBUG_FS @@ -600,32 +604,32 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits, return (((unsigned long long)bytes) << bitshift) - trim; } -static int ocfs2_remount(struct super_block *sb, int *flags, char *data) +static int ocfs2_reconfigure(struct fs_context *fc) { int incompat_features; int ret = 0; - struct mount_options parsed_options; + struct mount_options *parsed_options = fc->fs_private; + struct super_block *sb = fc->root->d_sb; struct ocfs2_super *osb = OCFS2_SB(sb); 
u32 tmp; sync_filesystem(sb); - if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || - !ocfs2_check_set_options(sb, &parsed_options)) { + if (!ocfs2_check_set_options(sb, parsed_options)) { ret = -EINVAL; goto out; } tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; - if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { + if ((osb->s_mount_opt & tmp) != (parsed_options->mount_opt & tmp)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); goto out; } if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != - (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { + (parsed_options->mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot change data mode on remount\n"); goto out; @@ -634,16 +638,16 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) /* Probably don't want this on remount; it might * mess with other nodes */ if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && - (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { + (parsed_options->mount_opt & OCFS2_MOUNT_INODE64)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); goto out; } /* We're going to/from readonly mode. */ - if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { + if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { /* Disable quota accounting before remounting RO */ - if (*flags & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { ret = ocfs2_susp_quotas(osb, 0); if (ret < 0) goto out; @@ -657,7 +661,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto unlock_osb; } - if (*flags & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; } else { @@ -678,11 +682,11 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~SB_RDONLY; osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; } - trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); + trace_ocfs2_remount(sb->s_flags, osb->osb_flags, fc->sb_flags); unlock_osb: spin_unlock(&osb->osb_lock); /* Enable quota accounting after remounting RW */ - if (!ret && !(*flags & SB_RDONLY)) { + if (!ret && !(fc->sb_flags & SB_RDONLY)) { if (sb_any_quota_suspended(sb)) ret = ocfs2_susp_quotas(osb, 1); else @@ -701,11 +705,11 @@ unlock_osb: if (!ret) { /* Only save off the new mount options in case of a successful * remount. 
*/ - osb->s_mount_opt = parsed_options.mount_opt; - osb->s_atime_quantum = parsed_options.atime_quantum; - osb->preferred_slot = parsed_options.slot; - if (parsed_options.commit_interval) - osb->osb_commit_interval = parsed_options.commit_interval; + osb->s_mount_opt = parsed_options->mount_opt; + osb->s_atime_quantum = parsed_options->atime_quantum; + osb->preferred_slot = parsed_options->slot; + if (parsed_options->commit_interval) + osb->osb_commit_interval = parsed_options->commit_interval; if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); @@ -966,23 +970,18 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) } } -static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) +static int ocfs2_fill_super(struct super_block *sb, struct fs_context *fc) { struct dentry *root; int status, sector_size; - struct mount_options parsed_options; + struct mount_options *parsed_options = fc->fs_private; struct inode *inode = NULL; struct ocfs2_super *osb = NULL; struct buffer_head *bh = NULL; char nodestr[12]; struct ocfs2_blockcheck_stats stats; - trace_ocfs2_fill_super(sb, data, silent); - - if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { - status = -EINVAL; - goto out; - } + trace_ocfs2_fill_super(sb, fc, fc->sb_flags & SB_SILENT); /* probe for superblock */ status = ocfs2_sb_probe(sb, &bh, §or_size, &stats); @@ -999,24 +998,24 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb = OCFS2_SB(sb); - if (!ocfs2_check_set_options(sb, &parsed_options)) { + if (!ocfs2_check_set_options(sb, parsed_options)) { status = -EINVAL; goto out_super; } - osb->s_mount_opt = parsed_options.mount_opt; - osb->s_atime_quantum = parsed_options.atime_quantum; - osb->preferred_slot = parsed_options.slot; - osb->osb_commit_interval = parsed_options.commit_interval; + osb->s_mount_opt = parsed_options->mount_opt; + osb->s_atime_quantum = parsed_options->atime_quantum; + osb->preferred_slot = parsed_options->slot; + osb->osb_commit_interval = parsed_options->commit_interval; - ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); - osb->osb_resv_level = parsed_options.resv_level; - osb->osb_dir_resv_level = parsed_options.resv_level; - if (parsed_options.dir_resv_level == -1) - osb->osb_dir_resv_level = parsed_options.resv_level; + ocfs2_la_set_sizes(osb, parsed_options->localalloc_opt); + osb->osb_resv_level = parsed_options->resv_level; + osb->osb_dir_resv_level = parsed_options->resv_level; + if (parsed_options->dir_resv_level == -1) + osb->osb_dir_resv_level = parsed_options->resv_level; else - osb->osb_dir_resv_level = parsed_options.dir_resv_level; + osb->osb_dir_resv_level = parsed_options->dir_resv_level; - status = ocfs2_verify_userspace_stack(osb, &parsed_options); + status = ocfs2_verify_userspace_stack(osb, parsed_options); if (status) goto out_super; @@ -1180,27 +1179,72 @@ out: return status; } -static struct dentry *ocfs2_mount(struct file_system_type *fs_type, - int flags, - const char *dev_name, - void *data) +static int ocfs2_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, ocfs2_fill_super); +} + +static void ocfs2_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); + kfree(fc->fs_private); +} + +static const struct fs_context_operations ocfs2_context_ops = { + .parse_param = ocfs2_parse_param, + .get_tree = ocfs2_get_tree, + .reconfigure = ocfs2_reconfigure, + .free = ocfs2_free_fc, +}; + +static int ocfs2_init_fs_context(struct fs_context *fc) +{ + struct 
mount_options *mopt; + + mopt = kzalloc(sizeof(struct mount_options), GFP_KERNEL); + if (!mopt) + return -EINVAL; + + mopt->commit_interval = 0; + mopt->mount_opt = OCFS2_MOUNT_NOINTR; + mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; + mopt->slot = OCFS2_INVALID_SLOT; + mopt->localalloc_opt = -1; + mopt->cluster_stack[0] = '\0'; + mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; + mopt->dir_resv_level = -1; + + fc->fs_private = mopt; + fc->ops = &ocfs2_context_ops; + + return 0; } static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", - .mount = ocfs2_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, - .next = NULL + .next = NULL, + .init_fs_context = ocfs2_init_fs_context, + .parameters = ocfs2_param_spec, }; MODULE_ALIAS_FS("ocfs2"); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options) { + if (options->user_stack == 0) { + u32 tmp; + + /* Ensure only one heartbeat mode */ + tmp = options->mount_opt & (OCFS2_MOUNT_HB_LOCAL | + OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE); + if (hweight32(tmp) != 1) { + mlog(ML_ERROR, "Invalid heartbeat mount options\n"); + return 0; + } + } if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { @@ -1232,241 +1276,142 @@ static int ocfs2_check_set_options(struct super_block *sb, return 1; } -static int ocfs2_parse_options(struct super_block *sb, - char *options, - struct mount_options *mopt, - int is_remount) +static int ocfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) { - int status, user_stack = 0; - char *p; - u32 tmp; - int token, option; - substring_t args[MAX_OPT_ARGS]; - - trace_ocfs2_parse_options(is_remount, options ? 
options : "(none)"); - - mopt->commit_interval = 0; - mopt->mount_opt = OCFS2_MOUNT_NOINTR; - mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; - mopt->slot = OCFS2_INVALID_SLOT; - mopt->localalloc_opt = -1; - mopt->cluster_stack[0] = '\0'; - mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; - mopt->dir_resv_level = -1; - - if (!options) { - status = 1; - goto bail; - } - - while ((p = strsep(&options, ",")) != NULL) { - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_hb_local: - mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; - break; - case Opt_hb_none: - mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; - break; - case Opt_hb_global: - mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; - break; - case Opt_barrier: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option) - mopt->mount_opt |= OCFS2_MOUNT_BARRIER; - else - mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; - break; - case Opt_intr: - mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; - break; - case Opt_nointr: + struct fs_parse_result result; + int opt; + struct mount_options *mopt = fc->fs_private; + bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE); + + trace_ocfs2_parse_options(is_remount, param->key); + + opt = fs_parse(fc, ocfs2_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_heartbeat: + mopt->mount_opt |= result.uint_32; + break; + case Opt_barrier: + if (result.uint_32) + mopt->mount_opt |= OCFS2_MOUNT_BARRIER; + else + mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; + break; + case Opt_intr: + if (result.negated) mopt->mount_opt |= OCFS2_MOUNT_NOINTR; - break; - case Opt_err_panic: - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; - mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; - break; - case Opt_err_ro: - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; - mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS; - break; - case Opt_err_cont: - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; - mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT; - break; - case Opt_data_ordered: - mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; - break; - case Opt_data_writeback: - mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; - break; - case Opt_user_xattr: - mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; - break; - case Opt_nouser_xattr: + else + mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; + break; + case Opt_errors: + mopt->mount_opt &= ~(OCFS2_MOUNT_ERRORS_CONT | + OCFS2_MOUNT_ERRORS_ROFS | + OCFS2_MOUNT_ERRORS_PANIC); + mopt->mount_opt |= result.uint_32; + break; + case Opt_data: + mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; + mopt->mount_opt |= result.uint_32; + break; + case Opt_user_xattr: + if (result.negated) mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; - break; - case Opt_atime_quantum: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= 0) - mopt->atime_quantum = option; - break; - case Opt_slot: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option) - mopt->slot = (u16)option; - break; - case Opt_commit: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option < 0) - return 0; - if (option == 0) - option = JBD2_DEFAULT_MAX_COMMIT_AGE; - mopt->commit_interval = HZ * option; - break; - case Opt_localalloc: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= 0) - mopt->localalloc_opt = option; - break; - case Opt_localflocks: - /* - * 
Changing this during remount could race - * flock() requests, or "unbalance" existing - * ones (e.g., a lock is taken in one mode but - * dropped in the other). If users care enough - * to flip locking modes during remount, we - * could add a "local" flag to individual - * flock structures for proper tracking of - * state. - */ - if (!is_remount) - mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; - break; - case Opt_stack: - /* Check both that the option we were passed - * is of the right length and that it is a proper - * string of the right length. - */ - if (((args[0].to - args[0].from) != - OCFS2_STACK_LABEL_LEN) || - (strnlen(args[0].from, - OCFS2_STACK_LABEL_LEN) != - OCFS2_STACK_LABEL_LEN)) { - mlog(ML_ERROR, - "Invalid cluster_stack option\n"); - status = 0; - goto bail; - } - memcpy(mopt->cluster_stack, args[0].from, - OCFS2_STACK_LABEL_LEN); - mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; - /* - * Open code the memcmp here as we don't have - * an osb to pass to - * ocfs2_userspace_stack(). - */ - if (memcmp(mopt->cluster_stack, - OCFS2_CLASSIC_CLUSTER_STACK, - OCFS2_STACK_LABEL_LEN)) - user_stack = 1; - break; - case Opt_inode64: - mopt->mount_opt |= OCFS2_MOUNT_INODE64; - break; - case Opt_usrquota: - mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; - break; - case Opt_grpquota: - mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; - break; - case Opt_coherency_buffered: - mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; - break; - case Opt_coherency_full: - mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; - break; - case Opt_acl: - mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; - mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; - break; - case Opt_noacl: + else + mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_atime_quantum: + mopt->atime_quantum = result.uint_32; + break; + case Opt_slot: + if (result.uint_32) + mopt->slot = (u16)result.uint_32; + break; + case Opt_commit: + if (result.uint_32 == 0) + mopt->commit_interval = HZ * JBD2_DEFAULT_MAX_COMMIT_AGE; + else + mopt->commit_interval = HZ * result.uint_32; + break; + case Opt_localalloc: + if (result.int_32 >= 0) + mopt->localalloc_opt = result.int_32; + break; + case Opt_localflocks: + /* + * Changing this during remount could race flock() requests, or + * "unbalance" existing ones (e.g., a lock is taken in one mode + * but dropped in the other). If users care enough to flip + * locking modes during remount, we could add a "local" flag to + * individual flock structures for proper tracking of state. + */ + if (!is_remount) + mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; + break; + case Opt_stack: + /* Check both that the option we were passed is of the right + * length and that it is a proper string of the right length. + */ + if (strlen(param->string) != OCFS2_STACK_LABEL_LEN) { + mlog(ML_ERROR, "Invalid cluster_stack option\n"); + return -EINVAL; + } + memcpy(mopt->cluster_stack, param->string, OCFS2_STACK_LABEL_LEN); + mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + /* + * Open code the memcmp here as we don't have an osb to pass + * to ocfs2_userspace_stack(). 
+ */ + if (memcmp(mopt->cluster_stack, + OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + mopt->user_stack = 1; + break; + case Opt_inode64: + mopt->mount_opt |= OCFS2_MOUNT_INODE64; + break; + case Opt_usrquota: + mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; + break; + case Opt_grpquota: + mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; + break; + case Opt_coherency: + mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; + mopt->mount_opt |= result.uint_32; + break; + case Opt_acl: + if (result.negated) { mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; + } else { + mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; + } + break; + case Opt_resv_level: + if (is_remount) break; - case Opt_resv_level: - if (is_remount) - break; - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= OCFS2_MIN_RESV_LEVEL && - option < OCFS2_MAX_RESV_LEVEL) - mopt->resv_level = option; - break; - case Opt_dir_resv_level: - if (is_remount) - break; - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= OCFS2_MIN_RESV_LEVEL && - option < OCFS2_MAX_RESV_LEVEL) - mopt->dir_resv_level = option; - break; - case Opt_journal_async_commit: - mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; + if (result.uint_32 >= OCFS2_MIN_RESV_LEVEL && + result.uint_32 < OCFS2_MAX_RESV_LEVEL) + mopt->resv_level = result.uint_32; + break; + case Opt_dir_resv_level: + if (is_remount) break; - default: - mlog(ML_ERROR, - "Unrecognized mount option \"%s\" " - "or missing value\n", p); - status = 0; - goto bail; - } - } - - if (user_stack == 0) { - /* Ensure only one heartbeat mode */ - tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | - OCFS2_MOUNT_HB_GLOBAL | - OCFS2_MOUNT_HB_NONE); - if (hweight32(tmp) != 1) { - mlog(ML_ERROR, "Invalid heartbeat mount options\n"); - status = 0; - goto bail; - } + if (result.uint_32 >= OCFS2_MIN_RESV_LEVEL && + result.uint_32 < OCFS2_MAX_RESV_LEVEL) + mopt->dir_resv_level = result.uint_32; + break; + case Opt_journal_async_commit: + mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; + break; + default: + return -EINVAL; } - status = 1; - -bail: - return status; + return 0; } static int ocfs2_show_options(struct seq_file *s, struct dentry *root) -- cgit v1.2.3 From 08de555ae59a8b062f3d84a5f5684129584dd4c6 Mon Sep 17 00:00:00 2001 From: Daniel Yang Date: Sun, 17 Nov 2024 13:52:18 -0800 Subject: ocfs2: heartbeat: replace simple_strtoul with kstrtoul simple_strtoul() is deprecated due to ignoring overflows and also requires clunkier error checking. Replacing with kstrtoul() leads to safer code and cleaner error checking. 
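For readers unfamiliar with the two interfaces, a minimal sketch of the old and new error-checking patterns (illustrative only, not part of the patch below; buffer and variable names are made up):

	/* old: overflow is silently ignored, the end pointer must be checked by hand */
	char *end;
	unsigned long val = simple_strtoul(buf, &end, 0);
	if (!end || (*end && *end != '\n'))
		return -EINVAL;

	/* new: kstrtoul() returns 0 on success, -EINVAL on bad input and -ERANGE on
	 * overflow, and tolerates a single trailing newline */
	unsigned long val2;
	int ret = kstrtoul(buf, 0, &val2);
	if (ret)
		return ret;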
Link: https://lkml.kernel.org/r/20241117215219.4012-1-danielyangkang@gmail.com Signed-off-by: Daniel Yang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/heartbeat.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4200a0341343..a67817e0e663 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1535,10 +1535,11 @@ static int o2hb_read_block_input(struct o2hb_region *reg, { unsigned long bytes; char *p = (char *)page; + int ret; - bytes = simple_strtoul(p, &p, 0); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 0, &bytes); + if (ret) + return ret; /* Heartbeat and fs min / max block sizes are the same. */ if (bytes > 4096 || bytes < 512) @@ -1622,13 +1623,14 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item, struct o2hb_region *reg = to_o2hb_region(item); unsigned long tmp; char *p = (char *)page; + int ret; if (reg->hr_bdev_file) return -EINVAL; - tmp = simple_strtoul(p, &p, 0); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 0, &tmp); + if (ret) + return ret; if (tmp > O2NM_MAX_NODES || tmp == 0) return -ERANGE; @@ -2136,10 +2138,11 @@ static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *ite { unsigned long tmp; char *p = (char *)page; + int ret; - tmp = simple_strtoul(p, &p, 10); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 10, &tmp); + if (ret) + return ret; /* this will validate ranges for us. */ o2hb_dead_threshold_set((unsigned int) tmp); -- cgit v1.2.3 From a0f8a9a9636e19a11ab80db4a45fb80a230c6c5a Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 15 Nov 2024 18:10:13 +0300 Subject: ocfs2: miscellaneous spelling fixes Correct spelling here and there as suggested by codespell. Link: https://lkml.kernel.org/r/20241115151013.1404929-1-dmantipov@yandex.ru Signed-off-by: Dmitry Antipov Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 10 +++++----- fs/ocfs2/aops.c | 2 +- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/cluster/masklog.h | 2 +- fs/ocfs2/cluster/quorum.c | 6 +++--- fs/ocfs2/cluster/tcp.c | 8 ++++---- fs/ocfs2/dlm/dlmapi.h | 2 +- fs/ocfs2/dlm/dlmrecovery.c | 6 +++--- fs/ocfs2/dlmglue.c | 12 ++++++------ fs/ocfs2/inode.c | 4 ++-- fs/ocfs2/ioctl.c | 2 +- fs/ocfs2/journal.c | 2 +- fs/ocfs2/move_extents.c | 8 ++++---- fs/ocfs2/ocfs2_fs.h | 8 ++++---- fs/ocfs2/ocfs2_ioctl.h | 2 +- fs/ocfs2/ocfs2_lockid.h | 2 +- fs/ocfs2/refcounttree.c | 6 +++--- fs/ocfs2/reservations.h | 4 ++-- fs/ocfs2/stack_o2cb.c | 2 +- fs/ocfs2/stackglue.h | 2 +- fs/ocfs2/super.c | 2 +- fs/ocfs2/xattr.c | 10 +++++----- 22 files changed, 52 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 395e23920632..b3fa953e5637 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -566,7 +566,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec); /* - * Reset the actual path elements so that we can re-use the structure + * Reset the actual path elements so that we can reuse the structure * to build another path. Generally, this involves freeing the buffer * heads. 
*/ @@ -1182,7 +1182,7 @@ static int ocfs2_add_branch(handle_t *handle, /* * If there is a gap before the root end and the real end - * of the righmost leaf block, we need to remove the gap + * of the rightmost leaf block, we need to remove the gap * between new_cpos and root_end first so that the tree * is consistent after we add a new branch(it will start * from new_cpos). @@ -1238,7 +1238,7 @@ static int ocfs2_add_branch(handle_t *handle, /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be * linked with the rest of the tree. - * conversly, new_eb_bhs[0] is the new bottommost leaf. + * conversely, new_eb_bhs[0] is the new bottommost leaf. * * when we leave the loop, new_last_eb_blk will point to the * newest leaf, and next_blkno will point to the topmost extent @@ -3712,7 +3712,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, * update split_index here. * * When the split_index is zero, we need to merge it to the - * prevoius extent block. It is more efficient and easier + * previous extent block. It is more efficient and easier * if we do merge_right first and merge_left later. */ ret = ocfs2_merge_rec_right(path, handle, et, split_rec, @@ -4517,7 +4517,7 @@ static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, } /* - * This should only be called against the righmost leaf extent list. + * This should only be called against the rightmost leaf extent list. * * ocfs2_figure_appending_type() will figure out whether we'll have to * insert at the tail of the rightmost leaf. diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index db72b3e924b3..64b3ddeb3555 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -305,7 +305,7 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) } /* - * i_size might have just been updated as we grabed the meta lock. We + * i_size might have just been updated as we grabbed the meta lock. We * might now be discovering a truncate that hit on another node. * block_read_full_folio->get_block freaks out if it is asked to read * beyond the end of a file, so we check here. Callers diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a67817e0e663..467127e361be 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1020,7 +1020,7 @@ fire_callbacks: if (list_empty(&slot->ds_live_item)) goto out; - /* live nodes only go dead after enough consequtive missed + /* live nodes only go dead after enough consecutive missed * samples.. reset the missed counter whenever we see * activity */ if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) { diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index b73fc42e46ff..630bd5a3dd0d 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -29,7 +29,7 @@ * just calling printk() so that this can eventually make its way through * relayfs along with the debugging messages. Everything else gets KERN_DEBUG. * The inline tests and macro dance give GCC the opportunity to quite cleverly - * only emit the appropriage printk() when the caller passes in a constant + * only emit the appropriate printk() when the caller passes in a constant * mask, as is almost always the case. * * All this bitmask nonsense is managed from the files under diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 8bf17231d7b7..bfb8b456876c 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -23,7 +23,7 @@ * race between when we see a node start heartbeating and when we connect * to it. 
* - * So nodes that are in this transtion put a hold on the quorum decision + * So nodes that are in this transition put a hold on the quorum decision * with a counter. As they fall out of this transition they drop the count * and if they're the last, they fire off the decision. */ @@ -189,7 +189,7 @@ static void o2quo_clear_hold(struct o2quo_state *qs, u8 node) } /* as a node comes up we delay the quorum decision until we know the fate of - * the connection. the hold will be droped in conn_up or hb_down. it might be + * the connection. the hold will be dropped in conn_up or hb_down. it might be * perpetuated by con_err until hb_down. if we already have a conn, we might * be dropping a hold that conn_up got. */ void o2quo_hb_up(u8 node) @@ -256,7 +256,7 @@ void o2quo_hb_still_up(u8 node) } /* This is analogous to hb_up. as a node's connection comes up we delay the - * quorum decision until we see it heartbeating. the hold will be droped in + * quorum decision until we see it heartbeating. the hold will be dropped in * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if * it's already heartbeating we might be dropping a hold that conn_up got. * */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2b8fa3e782fb..0f46b22561d6 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -5,13 +5,13 @@ * * ---- * - * Callers for this were originally written against a very simple synchronus + * Callers for this were originally written against a very simple synchronous * API. This implementation reflects those simple callers. Some day I'm sure * we'll need to move to a more robust posting/callback mechanism. * * Transmit calls pass in kernel virtual addresses and block copying this into * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting - * for a failed socket to timeout. TX callers can also pass in a poniter to an + * for a failed socket to timeout. TX callers can also pass in a pointer to an * 'int' which gets filled with an errno off the wire in response to the * message they send. * @@ -101,7 +101,7 @@ static struct socket *o2net_listen_sock; * o2net_wq. teardown detaches the callbacks before destroying the workqueue. * quorum work is queued as sock containers are shutdown.. stop_listening * tears down all the node's sock containers, preventing future shutdowns - * and queued quroum work, before canceling delayed quorum work and + * and queued quorum work, before canceling delayed quorum work and * destroying the work queue. */ static struct workqueue_struct *o2net_wq; @@ -1419,7 +1419,7 @@ out: return ret; } -/* this work func is triggerd by data ready. it reads until it can read no +/* this work func is triggered by data ready. it reads until it can read no * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing * our work the work struct will be marked and we'll be called again. 
*/ static void o2net_rx_until_empty(struct work_struct *work) diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h index 847a52dcbe7d..1969db8ffa9c 100644 --- a/fs/ocfs2/dlm/dlmapi.h +++ b/fs/ocfs2/dlm/dlmapi.h @@ -118,7 +118,7 @@ struct dlm_lockstatus { #define LKM_VALBLK 0x00000100 /* lock value block request */ #define LKM_NOQUEUE 0x00000200 /* non blocking request */ #define LKM_CONVERT 0x00000400 /* conversion request */ -#define LKM_NODLCKWT 0x00000800 /* this lock wont deadlock (U) */ +#define LKM_NODLCKWT 0x00000800 /* this lock won't deadlock (U) */ #define LKM_UNLOCK 0x00001000 /* deallocate this lock */ #define LKM_CANCEL 0x00002000 /* cancel conversion request */ #define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */ diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 50da8af988c1..54c548ef037a 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -207,7 +207,7 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) * 1) all recovery threads cluster wide will work on recovering * ONE node at a time * 2) negotiate who will take over all the locks for the dead node. - * thats right... ALL the locks. + * that's right... ALL the locks. * 3) once a new master is chosen, everyone scans all locks * and moves aside those mastered by the dead guy * 4) each of these locks should be locked until recovery is done @@ -1469,7 +1469,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, * The first one is handled at the end of this function. The * other two are handled in the worker thread after locks have * been attached. Yes, we don't wait for purge time to match - * kref_init. The lockres will still have atleast one ref + * kref_init. The lockres will still have at least one ref * added because it is in the hash __dlm_insert_lockres() */ extra_refs++; @@ -1735,7 +1735,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&res->spinlock); } } else { - /* put.. incase we are not the master */ + /* put.. in case we are not the master */ spin_unlock(&res->spinlock); dlm_lockres_put(res); } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 764ecbd5ad41..8f08e89be37d 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -794,7 +794,7 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) /* * Keep a list of processes who have interest in a lockres. - * Note: this is now only uesed for check recursive cluster locking. + * Note: this is now only used for check recursive cluster locking. */ static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, struct ocfs2_lock_holder *oh) @@ -2532,7 +2532,7 @@ bail: * locks while holding a page lock and the downconvert thread which * blocks dlm lock acquiry while acquiring page locks. * - * ** These _with_page variantes are only intended to be called from aop + * ** These _with_page variants are only intended to be called from aop * methods that hold page locks and return a very specific *positive* error * code that aop methods pass up to the VFS -- test for errors with != 0. ** * @@ -2630,7 +2630,7 @@ void ocfs2_inode_unlock(struct inode *inode, } /* - * This _tracker variantes are introduced to deal with the recursive cluster + * This _tracker variants are introduced to deal with the recursive cluster * locking issue. The idea is to keep track of a lock holder on the stack of * the current process. If there's a lock holder on the stack, we know the * task context is already protected by cluster locking. 
Currently, they're @@ -2735,7 +2735,7 @@ void ocfs2_inode_unlock_tracker(struct inode *inode, struct ocfs2_lock_res *lockres; lockres = &OCFS2_I(inode)->ip_inode_lockres; - /* had_lock means that the currect process already takes the cluster + /* had_lock means that the current process already takes the cluster * lock previously. * If had_lock is 1, we have nothing to do here. * If had_lock is 0, we will release the lock. @@ -3802,9 +3802,9 @@ recheck: * set when the ast is received for an upconvert just before the * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast * on the heels of the ast, we want to delay the downconvert just - * enough to allow the up requestor to do its task. Because this + * enough to allow the up requester to do its task. Because this * lock is in the blocked queue, the lock will be downconverted - * as soon as the requestor is done with the lock. + * as soon as the requester is done with the lock. */ if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) goto leave_requeue; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 2cc5c99fe941..cd3173062ae3 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1122,7 +1122,7 @@ static void ocfs2_clear_inode(struct inode *inode) dquot_drop(inode); - /* To preven remote deletes we hold open lock before, now it + /* To prevent remote deletes we hold open lock before, now it * is time to unlock PR and EX open locks. */ ocfs2_open_unlock(inode); @@ -1437,7 +1437,7 @@ static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, * Call ocfs2_validate_meta_ecc() first since it has ecc repair * function, but we should not return error immediately when ecc * validation fails, because the reason is quite likely the invalid - * inode number inputed. + * inode number inputted. */ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); if (rc) { diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 71beef7f8a60..7ae96fb8807a 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -796,7 +796,7 @@ bail: /* * OCFS2_IOC_INFO handles an array of requests passed from userspace. * - * ocfs2_info_handle() recevies a large info aggregation, grab and + * ocfs2_info_handle() receives a large info aggregation, grab and * validate the request count from header, then break it into small * pieces, later specific handlers can handle them one by one. * diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 1bf188b6866a..f1b4b3e611cb 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1956,7 +1956,7 @@ bail: /* * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some - * randomness to the timeout to minimize multple nodes firing the timer at the + * randomness to the timeout to minimize multiple nodes firing the timer at the * same time. */ static inline unsigned long ocfs2_orphan_scan_timeout(void) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index f9d6a4f9ca92..369c7d27befd 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -492,7 +492,7 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, bg = (struct ocfs2_group_desc *)gd_bh->b_data; /* - * moving goal is not allowd to start with a group desc blok(#0 blk) + * moving goal is not allowed to start with a group desc blok(#0 blk) * let's compromise to the latter cluster. 
*/ if (range->me_goal == le64_to_cpu(bg->bg_blkno)) @@ -658,7 +658,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, /* * probe the victim cluster group to find a proper - * region to fit wanted movement, it even will perfrom + * region to fit wanted movement, it even will perform * a best-effort attempt by compromising to a threshold * around the goal. */ @@ -920,7 +920,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) } /* - * rememer ip_xattr_sem also needs to be held if necessary + * remember ip_xattr_sem also needs to be held if necessary */ down_write(&OCFS2_I(inode)->ip_alloc_sem); @@ -1022,7 +1022,7 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) context->range = ⦥ /* - * ok, the default theshold for the defragmentation + * ok, the default threshold for the defragmentation * is 1M, since our maximum clustersize was 1M also. * any thought? */ diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index c93689b568fe..e8e94599e907 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -132,7 +132,7 @@ * well as the name of the cluster being joined. * mount.ocfs2 must pass in a matching stack name. * - * If not set, the classic stack will be used. This is compatbile with + * If not set, the classic stack will be used. This is compatible with * all older versions. */ #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 @@ -143,7 +143,7 @@ /* Support for extended attributes */ #define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 -/* Support for indexed directores */ +/* Support for indexed directories */ #define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400 /* Metadata checksum and error correction */ @@ -156,7 +156,7 @@ #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 /* - * Incompat bit to indicate useable clusterinfo with stackflags for all + * Incompat bit to indicate usable clusterinfo with stackflags for all * cluster stacks (userspace adnd o2cb). If this bit is set, * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set. */ @@ -1083,7 +1083,7 @@ struct ocfs2_xattr_block { struct ocfs2_xattr_header xb_header; /* xattr header if this block contains xattr */ struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this - block cotains xattr + block contains xattr tree. */ } xb_attrs; }; diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index 9680797bc531..2de2f8733283 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -215,7 +215,7 @@ struct ocfs2_move_extents { movement less likely to fail, may make fs even more fragmented */ -#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation +#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmentation completely gets done. */ diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 8ac357ce6a30..9b234c03d693 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -93,7 +93,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_DATA] = "Data", [OCFS2_LOCK_TYPE_SUPER] = "Super", [OCFS2_LOCK_TYPE_RENAME] = "Rename", - /* Need to differntiate from [R]ename.. serializing writes is the + /* Need to differentiate from [R]ename.. serializing writes is the * important job it does, anyway. 
*/ [OCFS2_LOCK_TYPE_RW] = "Write/Read", [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 004393b13c0a..73caf991ede5 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2420,7 +2420,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, * * If we will insert a new one, this is easy and only happens * during adding refcounted flag to the extent, so we don't - * have a chance of spliting. We just need one record. + * have a chance of splitting. We just need one record. * * If the refcount rec already exists, that would be a little * complicated. we may have to: @@ -2610,11 +2610,11 @@ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, /* * Calculate out the start and number of virtual clusters we need to CoW. * - * cpos is vitual start cluster position we want to do CoW in a + * cpos is virtual start cluster position we want to do CoW in a * file and write_len is the cluster length. * max_cpos is the place where we want to stop CoW intentionally. * - * Normal we will start CoW from the beginning of extent record cotaining cpos. + * Normal we will start CoW from the beginning of extent record containing cpos. * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we * get good I/O from the resulting extent tree. */ diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index ec8101ef5717..4fce17180342 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -31,7 +31,7 @@ struct ocfs2_alloc_reservation { #define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */ #define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be - * destroyed immedately after use */ + * destroyed immediately after use */ #define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed * directory btree */ @@ -125,7 +125,7 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, /** * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used. * @resmap: reservations bitmap - * @resv: optional reservation to recalulate based on new bitmap + * @resv: optional reservation to recalculate based on new bitmap * @cstart: start of allocation in clusters * @clen: end of allocation in clusters. * diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 10157d9d7a9c..f58e891aa2da 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -227,7 +227,7 @@ static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) } /* - * o2dlm aways has a "valid" LVB. If the dlm loses track of the LVB + * o2dlm always has a "valid" LVB. If the dlm loses track of the LVB * contents, it will zero out the LVB. Thus the caller can always trust * the contents. */ diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 02ab072c528a..5486a6dce70a 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -210,7 +210,7 @@ struct ocfs2_stack_operations { struct file_lock *fl); /* - * This is an optoinal debugging hook. If provided, the + * This is an optional debugging hook. If provided, the * stack can dump debugging information about this lock. 
*/ void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 4319cb2bf298..e0b91dbaa0ac 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1803,7 +1803,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); - /* Remove file check sysfs related directores/files, + /* Remove file check sysfs related directories/files, * and wait for the pending file check operations */ ocfs2_filecheck_remove_sysfs(osb); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 73a6f6fd8a8e..d70a20d29e3e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -648,7 +648,7 @@ int ocfs2_calc_xattr_init(struct inode *dir, * 256(name) + 80(value) + 16(entry) = 352 bytes, * The max space of acl xattr taken inline is * 80(value) + 16(entry) * 2(if directory) = 192 bytes, - * when blocksize = 512, may reserve one more cluser for + * when blocksize = 512, may reserve one more cluster for * xattr bucket, otherwise reserve one metadata block * for them is ok. * If this is a new directory with inline data, @@ -4371,7 +4371,7 @@ static int cmp_xe_offset(const void *a, const void *b) /* * defrag a xattr bucket if we find that the bucket has some - * holes beteen name/value pairs. + * holes between name/value pairs. * We will move all the name/value pairs to the end of the bucket * so that we can spare some space for insertion. */ @@ -5011,7 +5011,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode, * 2. If cluster_size == bucket_size: * a) If the previous extent rec has more than one cluster and the insert * place isn't in the last cluster, copy the entire last cluster to the - * new one. This time, we don't need to upate the first_bh and header_bh + * new one. This time, we don't need to update the first_bh and header_bh * since they will not be moved into the new cluster. * b) Otherwise, move the bottom half of the xattrs in the last cluster into * the new one. And we set the extend flag to zero if the insert place is @@ -6189,7 +6189,7 @@ struct ocfs2_xattr_reflink { /* * Given a xattr header and xe offset, * return the proper xv and the corresponding bh. - * xattr in inode, block and xattr tree have different implementaions. + * xattr in inode, block and xattr tree have different implementations. */ typedef int (get_xattr_value_root)(struct super_block *sb, struct buffer_head *bh, @@ -6269,7 +6269,7 @@ static int ocfs2_get_xattr_value_root(struct super_block *sb, } /* - * Lock the meta_ac and caculate how much credits we need for reflink xattrs. + * Lock the meta_ac and calculate how much credits we need for reflink xattrs. * It is only used for inline xattr and xattr block. */ static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb, -- cgit v1.2.3 From 876a5b25d54d2fbc67c3a80abbde097b321f7710 Mon Sep 17 00:00:00 2001 From: Daniel Yang Date: Fri, 15 Nov 2024 00:00:17 -0800 Subject: ocfs2: replace deprecated simple_strtol with kstrtol simple_strtol() ignores overflows and has an awkward interface for error checking. Replace with the recommended kstrtol function leads to clearer error checking and safer conversions. 
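To make the overflow point concrete, a small illustrative sketch (not code from the patch; the input string is made up):

	long v;
	int ret = kstrtol("99999999999999999999", 10, &v);
	/* ret is -ERANGE: the value exceeds LONG_MAX. simple_strtol() would have
	 * reported no error and handed back a silently wrapped value instead. */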
Link: https://lkml.kernel.org/r/20241115080018.5372-1-danielyangkang@gmail.com Signed-off-by: Daniel Yang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/heartbeat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 467127e361be..724350925aff 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -3,6 +3,7 @@ * Copyright (C) 2004, 2005 Oracle. All rights reserved. */ +#include "linux/kstrtox.h" #include #include #include @@ -1778,8 +1779,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, if (o2nm_this_node() == O2NM_MAX_NODES) return -EINVAL; - fd = simple_strtol(p, &p, 0); - if (!p || (*p && (*p != '\n'))) + ret = kstrtol(p, 0, &fd); + if (ret < 0) return -EINVAL; if (fd < 0 || fd >= INT_MAX) -- cgit v1.2.3 From 78188dd1d24b9d4cd78431feaa13120cfa37ca6a Mon Sep 17 00:00:00 2001 From: pangliyuan Date: Tue, 10 Dec 2024 17:08:42 +0800 Subject: Squashfs: don't allocate fragment caches more than fragments Sometimes the actual number of fragments in image is between 0 and SQUASHFS_CACHED_FRAGMENTS, which cause additional fragment caches to be allocated. Sets the number of fragment caches to the minimum of fragments and SQUASHFS_CACHED_FRAGMENTS. Link: https://lkml.kernel.org/r/20241210090842.160853-1-pangliyuan1@huawei.com Signed-off-by: pangliyuan Reviewed-by: Phillip Lougher Cc: Signed-off-by: Andrew Morton --- fs/squashfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 22e812808e5c..269c6d61bc29 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -405,7 +405,7 @@ handle_fragments: goto check_directory_table; msblk->fragment_cache = squashfs_cache_init("fragment", - SQUASHFS_CACHED_FRAGMENTS, msblk->block_size); + min(SQUASHFS_CACHED_FRAGMENTS, fragments), msblk->block_size); if (msblk->fragment_cache == NULL) { err = -ENOMEM; goto failed_mount; -- cgit v1.2.3 From 2b4c2094da6d84e69b843dd3317902e977bf64bd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Dec 2024 17:16:29 +0000 Subject: ocfs2: handle a symlink read error correctly Patch series "Convert ocfs2 to use folios". Mark did a conversion of ocfs2 to use folios and sent it to me as a giant patch for review ;-) So I've redone it as individual patches, and credited Mark for the patches where his code is substantially the same. It's not a bad way to do it; his patch had some bugs and my patches had some bugs. Hopefully all our bugs were different from each other. And hopefully Mark likes all the changes I made to his code! This patch (of 23): If we can't read the buffer, be sure to unlock the page before returning. 
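The rule being restored here, shown as a hedged generic sketch (not the ocfs2 code; fill_folio_from_disk() is a hypothetical helper): a ->read_folio() implementation is entered with the folio locked and must unlock it on every exit path, including errors, otherwise later lookups block forever on the folio lock:

	static int example_read_folio(struct file *file, struct folio *folio)
	{
		int err = fill_folio_from_disk(folio);	/* hypothetical helper */

		if (err < 0)
			goto out;	/* fall through so the folio still gets unlocked */
		folio_mark_uptodate(folio);
	out:
		folio_unlock(folio);
		return err;
	}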
Link: https://lkml.kernel.org/r/20241205171653.3179945-1-willy@infradead.org Link: https://lkml.kernel.org/r/20241205171653.3179945-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Mark Tinguely Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/symlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index d4c5fdcfa1e4..f5cf2255dc09 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -65,7 +65,7 @@ static int ocfs2_fast_symlink_read_folio(struct file *f, struct folio *folio) if (status < 0) { mlog_errno(status); - return status; + goto out; } fe = (struct ocfs2_dinode *) bh->b_data; @@ -76,9 +76,10 @@ static int ocfs2_fast_symlink_read_folio(struct file *f, struct folio *folio) memcpy(kaddr, link, len + 1); kunmap_atomic(kaddr); SetPageUptodate(page); +out: unlock_page(page); brelse(bh); - return 0; + return status; } const struct address_space_operations ocfs2_fast_symlink_aops = { -- cgit v1.2.3 From 3d9e356f0285325936486ab36e7b034ac471f244 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Dec 2024 17:16:30 +0000 Subject: ocfs2: convert ocfs2_page_mkwrite() to use a folio Pass the folio into __ocfs2_page_mkwrite() and use it throughout. Does not attempt to support large folios. Link: https://lkml.kernel.org/r/20241205171653.3179945-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Mark Tinguely Signed-off-by: Andrew Morton --- fs/ocfs2/mmap.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 6ef4cb045ccd..93b886f16c40 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -44,13 +44,13 @@ static vm_fault_t ocfs2_fault(struct vm_fault *vmf) } static vm_fault_t __ocfs2_page_mkwrite(struct file *file, - struct buffer_head *di_bh, struct page *page) + struct buffer_head *di_bh, struct folio *folio) { int err; vm_fault_t ret = VM_FAULT_NOPAGE; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; - loff_t pos = page_offset(page); + loff_t pos = folio_pos(folio); unsigned int len = PAGE_SIZE; pgoff_t last_index; struct folio *locked_folio = NULL; @@ -72,9 +72,9 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, * * Let VM retry with these cases. */ - if ((page->mapping != inode->i_mapping) || - (!PageUptodate(page)) || - (page_offset(page) >= size)) + if ((folio->mapping != inode->i_mapping) || + !folio_test_uptodate(folio) || + (pos >= size)) goto out; /* @@ -87,11 +87,11 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, * worry about ocfs2_write_begin() skipping some buffer reads * because the "write" would invalidate their data. 
*/ - if (page->index == last_index) + if (folio->index == last_index) len = ((size - 1) & ~PAGE_MASK) + 1; err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, - &locked_folio, &fsdata, di_bh, page); + &locked_folio, &fsdata, di_bh, &folio->page); if (err) { if (err != -ENOSPC) mlog_errno(err); @@ -112,7 +112,7 @@ out: static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); struct buffer_head *di_bh = NULL; sigset_t oldset; @@ -141,7 +141,7 @@ static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf) */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = __ocfs2_page_mkwrite(vmf->vma->vm_file, di_bh, page); + ret = __ocfs2_page_mkwrite(vmf->vma->vm_file, di_bh, folio); up_write(&OCFS2_I(inode)->ip_alloc_sem); -- cgit v1.2.3 From 5290687ae320ea81db2689e92d3737904963bf86 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:31 +0000 Subject: ocfs2: convert w_target_page to w_target_folio Pass a folio around instead of a page. Saves a few hidden calls to compound_head() and removes a call to kmap_atomic(). Link: https://lkml.kernel.org/r/20241205171653.3179945-4-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 63 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 64b3ddeb3555..cbb880ad887a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -731,22 +731,22 @@ struct ocfs2_write_ctxt { /* * Pages involved in this write. * - * w_target_page is the page being written to by the user. + * w_target_folio is the folio being written to by the user. * * w_pages is an array of pages which always contains - * w_target_page, and in the case of an allocating write with + * w_target_folio, and in the case of an allocating write with * page_size < cluster size, it will contain zero'd and mapped - * pages adjacent to w_target_page which need to be written + * pages adjacent to w_target_folio which need to be written * out in so that future reads from that region will get * zero's. */ unsigned int w_num_pages; struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; - struct page *w_target_page; + struct folio *w_target_folio; /* * w_target_locked is used for page_mkwrite path indicating no unlocking - * against w_target_page in ocfs2_write_end_nolock. + * against w_target_folio in ocfs2_write_end_nolock. */ unsigned int w_target_locked:1; @@ -791,18 +791,18 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) /* * w_target_locked is only set to true in the page_mkwrite() case. * The intent is to allow us to lock the target page from write_begin() - * to write_end(). The caller must hold a ref on w_target_page. + * to write_end(). The caller must hold a ref on w_target_folio. 
*/ if (wc->w_target_locked) { - BUG_ON(!wc->w_target_page); + BUG_ON(!wc->w_target_folio); for (i = 0; i < wc->w_num_pages; i++) { - if (wc->w_target_page == wc->w_pages[i]) { + if (&wc->w_target_folio->page == wc->w_pages[i]) { wc->w_pages[i] = NULL; break; } } - mark_page_accessed(wc->w_target_page); - put_page(wc->w_target_page); + folio_mark_accessed(wc->w_target_folio); + folio_put(wc->w_target_folio); } ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); } @@ -869,8 +869,9 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, * and dirty so they'll be written out (in order to prevent uninitialised * block data from leaking). And clear the new bit. */ -static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) +static void ocfs2_zero_new_buffers(struct folio *folio, unsigned from, unsigned to) { + struct page *page = &folio->page; unsigned int block_start, block_end; struct buffer_head *head, *bh; @@ -918,8 +919,8 @@ static void ocfs2_write_failure(struct inode *inode, to = user_pos + user_len; struct page *tmppage; - if (wc->w_target_page) - ocfs2_zero_new_buffers(wc->w_target_page, from, to); + if (wc->w_target_folio) + ocfs2_zero_new_buffers(wc->w_target_folio, from, to); for(i = 0; i < wc->w_num_pages; i++) { tmppage = wc->w_pages[i]; @@ -954,7 +955,7 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, new = new | ((i_size_read(inode) <= page_offset(page)) && (page_offset(page) <= user_pos)); - if (page == wc->w_target_page) { + if (page == &wc->w_target_folio->page) { map_from = user_pos & (PAGE_SIZE - 1); map_to = map_from + user_len; @@ -1097,7 +1098,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, wait_for_stable_page(wc->w_pages[i]); if (index == target_index) - wc->w_target_page = wc->w_pages[i]; + wc->w_target_folio = page_folio(wc->w_pages[i]); } out: if (ret) @@ -1494,7 +1495,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, * If we don't set w_num_pages then this page won't get unlocked * and freed on cleanup of the write context. */ - wc->w_pages[0] = wc->w_target_page = page; + wc->w_target_folio = page_folio(page); + wc->w_pages[0] = page; wc->w_num_pages = 1; ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, @@ -1803,7 +1805,7 @@ try_again: * the operation. 
*/ if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) { - BUG_ON(wc->w_target_page); + BUG_ON(wc->w_target_folio); ret = 0; goto out_quota; } @@ -1826,7 +1828,7 @@ try_again: success: if (foliop) - *foliop = page_folio(wc->w_target_page); + *foliop = wc->w_target_folio; *fsdata = wc; return 0; out_quota: @@ -1924,18 +1926,15 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, struct ocfs2_dinode *di, struct ocfs2_write_ctxt *wc) { - void *kaddr; - if (unlikely(*copied < len)) { - if (!PageUptodate(wc->w_target_page)) { + if (!folio_test_uptodate(wc->w_target_folio)) { *copied = 0; return; } } - kaddr = kmap_atomic(wc->w_target_page); - memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); - kunmap_atomic(kaddr); + memcpy_from_folio(di->id2.i_data.id_data + pos, wc->w_target_folio, + pos, *copied); trace_ocfs2_write_end_inline( (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1973,15 +1972,15 @@ int ocfs2_write_end_nolock(struct address_space *mapping, goto out_write_size; } - if (unlikely(copied < len) && wc->w_target_page) { + if (unlikely(copied < len) && wc->w_target_folio) { loff_t new_isize; - if (!PageUptodate(wc->w_target_page)) + if (!folio_test_uptodate(wc->w_target_folio)) copied = 0; new_isize = max_t(loff_t, i_size_read(inode), pos + copied); - if (new_isize > page_offset(wc->w_target_page)) - ocfs2_zero_new_buffers(wc->w_target_page, start+copied, + if (new_isize > folio_pos(wc->w_target_folio)) + ocfs2_zero_new_buffers(wc->w_target_folio, start+copied, start+len); else { /* @@ -1991,12 +1990,12 @@ int ocfs2_write_end_nolock(struct address_space *mapping, * put page & buffer dirty bits into inconsistent * state. */ - block_invalidate_folio(page_folio(wc->w_target_page), + block_invalidate_folio(wc->w_target_folio, 0, PAGE_SIZE); } } - if (wc->w_target_page) - flush_dcache_page(wc->w_target_page); + if (wc->w_target_folio) + flush_dcache_folio(wc->w_target_folio); for(i = 0; i < wc->w_num_pages; i++) { tmppage = wc->w_pages[i]; @@ -2005,7 +2004,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, if (tmppage == NULL) continue; - if (tmppage == wc->w_target_page) { + if (tmppage == &wc->w_target_folio->page) { from = wc->w_target_from; to = wc->w_target_to; -- cgit v1.2.3 From 0fad0a824e5cbe0920f119c7151dd594186687ef Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:32 +0000 Subject: ocfs2: use a folio in ocfs2_zero_new_buffers() Convert to the new APIs, saving at least one hidden call to compound_head(). Link: https://lkml.kernel.org/r/20241205171653.3179945-5-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index cbb880ad887a..76400bba5ab5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -869,30 +869,30 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, * and dirty so they'll be written out (in order to prevent uninitialised * block data from leaking). And clear the new bit. 
*/ -static void ocfs2_zero_new_buffers(struct folio *folio, unsigned from, unsigned to) +static void ocfs2_zero_new_buffers(struct folio *folio, size_t from, size_t to) { - struct page *page = &folio->page; unsigned int block_start, block_end; struct buffer_head *head, *bh; - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) + BUG_ON(!folio_test_locked(folio)); + head = folio_buffers(folio); + if (!head) return; - bh = head = page_buffers(page); + bh = head; block_start = 0; do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { unsigned start, end; start = max(from, block_start); end = min(to, block_end); - zero_user_segment(page, start, end); + folio_zero_segment(folio, start, end); set_buffer_uptodate(bh); } -- cgit v1.2.3 From e981f18e6d32716186410f848a96b6710d08a862 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:33 +0000 Subject: ocfs2: use a folio in ocfs2_write_begin_inline() Retrieve a folio from the page cache instead of a page and use that folio throught the function. Saves a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20241205171653.3179945-6-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 76400bba5ab5..46fb2b564367 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1473,7 +1473,7 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, { int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct page *page; + struct folio *folio; handle_t *handle; struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; @@ -1484,19 +1484,20 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, goto out; } - page = find_or_create_page(mapping, 0, GFP_NOFS); - if (!page) { + folio = __filemap_get_folio(mapping, 0, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { ocfs2_commit_trans(osb, handle); - ret = -ENOMEM; + ret = PTR_ERR(folio); mlog_errno(ret); goto out; } /* - * If we don't set w_num_pages then this page won't get unlocked + * If we don't set w_num_pages then this folio won't get unlocked * and freed on cleanup of the write context. */ - wc->w_target_folio = page_folio(page); - wc->w_pages[0] = page; + wc->w_target_folio = folio; + wc->w_pages[0] = &folio->page; wc->w_num_pages = 1; ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, @@ -1511,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) ocfs2_set_inode_data_inline(inode, di); - if (!PageUptodate(page)) { - ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh); + if (!folio_test_uptodate(folio)) { + ret = ocfs2_read_inline_data(inode, &folio->page, wc->w_di_bh); if (ret) { ocfs2_commit_trans(osb, handle); -- cgit v1.2.3 From b4f3b496dc2f1dd455f6d425ea0976d1ed96766c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Dec 2024 17:16:34 +0000 Subject: ocfs2: pass mmap_folio around instead of mmap_page Saves a few hidden calls to compound_head() and accesses to page->mapping. 
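For readers following the folio conversion, a rough sketch of where those savings come from (illustrative, not taken from the patch): each page-based helper re-derives the head page via compound_head() internally, while the folio variants operate on the folio directly:

	struct folio *folio = page_folio(page);	/* one head lookup, up front */
	folio_lock(folio);			/* lock_page(page) would redo the lookup */
	folio_mark_accessed(folio);		/* likewise mark_page_accessed(page) */
	folio_unlock(folio);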
Link: https://lkml.kernel.org/r/20241205171653.3179945-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Mark Tinguely Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 37 ++++++++++++++++++------------------- fs/ocfs2/aops.h | 6 +++--- fs/ocfs2/mmap.c | 2 +- 3 files changed, 22 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 46fb2b564367..c26853fb7dc9 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1023,7 +1023,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, unsigned user_len, int new, - struct page *mmap_page) + struct folio *mmap_folio) { int ret = 0, i; unsigned long start, target_index, end_index, index; @@ -1068,18 +1068,18 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, * and wants us to directly use the page * passed in. */ - lock_page(mmap_page); + folio_lock(mmap_folio); /* Exit and let the caller retry */ - if (mmap_page->mapping != mapping) { - WARN_ON(mmap_page->mapping); - unlock_page(mmap_page); + if (mmap_folio->mapping != mapping) { + WARN_ON(mmap_folio->mapping); + folio_unlock(mmap_folio); ret = -EAGAIN; goto out; } - get_page(mmap_page); - wc->w_pages[i] = mmap_page; + folio_get(mmap_folio); + wc->w_pages[i] = &mmap_folio->page; wc->w_target_locked = true; } else if (index >= target_index && index <= end_index && wc->w_type == OCFS2_WRITE_DIRECT) { @@ -1536,9 +1536,8 @@ int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size) } static int ocfs2_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, loff_t pos, - unsigned len, struct page *mmap_page, - struct ocfs2_write_ctxt *wc) + struct inode *inode, loff_t pos, size_t len, + struct folio *mmap_folio, struct ocfs2_write_ctxt *wc) { int ret, written = 0; loff_t end = pos + len; @@ -1553,7 +1552,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, * Handle inodes which already have inline data 1st. */ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - if (mmap_page == NULL && + if (mmap_folio == NULL && ocfs2_size_fits_inline_data(wc->w_di_bh, end)) goto do_inline_write; @@ -1577,7 +1576,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, * Check whether the write can fit. 
*/ di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; - if (mmap_page || + if (mmap_folio || end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) return 0; @@ -1644,9 +1643,9 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, } int ocfs2_write_begin_nolock(struct address_space *mapping, - loff_t pos, unsigned len, ocfs2_write_type_t type, - struct folio **foliop, void **fsdata, - struct buffer_head *di_bh, struct page *mmap_page) + loff_t pos, unsigned len, ocfs2_write_type_t type, + struct folio **foliop, void **fsdata, + struct buffer_head *di_bh, struct folio *mmap_folio) { int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0; @@ -1669,7 +1668,7 @@ try_again: if (ocfs2_supports_inline_data(osb)) { ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len, - mmap_page, wc); + mmap_folio, wc); if (ret == 1) { ret = 0; goto success; @@ -1721,7 +1720,7 @@ try_again: (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), - pos, len, type, mmap_page, + pos, len, type, mmap_folio, clusters_to_alloc, extents_to_split); /* @@ -1797,7 +1796,7 @@ try_again: * extent. */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, - cluster_of_pages, mmap_page); + cluster_of_pages, mmap_folio); if (ret) { /* * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock @@ -1848,7 +1847,7 @@ out: * to VM code. */ if (wc->w_target_locked) - unlock_page(mmap_page); + folio_unlock(mmap_folio); ocfs2_free_write_ctxt(inode, wc); diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 1d1b4b7edba0..1cb46072ad84 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -37,9 +37,9 @@ typedef enum { } ocfs2_write_type_t; int ocfs2_write_begin_nolock(struct address_space *mapping, - loff_t pos, unsigned len, ocfs2_write_type_t type, - struct folio **foliop, void **fsdata, - struct buffer_head *di_bh, struct page *mmap_page); + loff_t pos, unsigned len, ocfs2_write_type_t type, + struct folio **foliop, void **fsdata, + struct buffer_head *di_bh, struct folio *mmap_folio); int ocfs2_read_inline_data(struct inode *inode, struct page *page, struct buffer_head *di_bh); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 93b886f16c40..6a314e9f2b49 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -91,7 +91,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, len = ((size - 1) & ~PAGE_MASK) + 1; err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, - &locked_folio, &fsdata, di_bh, &folio->page); + &locked_folio, &fsdata, di_bh, folio); if (err) { if (err != -ENOSPC) mlog_errno(err); -- cgit v1.2.3 From 7654ed889943ee8eb4c5d7a07ce0e86301f1da79 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:35 +0000 Subject: ocfs2: convert ocfs2_readpage_inline() to take a folio Save a couple of calls to compound_head() by using a folio throughout this function. 
Link: https://lkml.kernel.org/r/20241205171653.3179945-8-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c26853fb7dc9..c0a6b82a6a68 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -252,12 +252,12 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, return 0; } -static int ocfs2_readpage_inline(struct inode *inode, struct page *page) +static int ocfs2_readpage_inline(struct inode *inode, struct folio *folio) { int ret; struct buffer_head *di_bh = NULL; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); ret = ocfs2_read_inode_block(inode, &di_bh); @@ -266,9 +266,9 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page) goto out; } - ret = ocfs2_read_inline_data(inode, page, di_bh); + ret = ocfs2_read_inline_data(inode, &folio->page, di_bh); out: - unlock_page(page); + folio_unlock(folio); brelse(di_bh); return ret; @@ -322,7 +322,7 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) } if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - ret = ocfs2_readpage_inline(inode, &folio->page); + ret = ocfs2_readpage_inline(inode, folio); else ret = block_read_full_folio(folio, ocfs2_get_block); unlock = 0; -- cgit v1.2.3 From 57ea4b89209929abcae298603236c89771e29086 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:36 +0000 Subject: ocfs2: convert ocfs2_inode_lock_with_page() to ocfs2_inode_lock_with_folio() Saves a hidden call to compound_head(). Link: https://lkml.kernel.org/r/20241205171653.3179945-9-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 2 +- fs/ocfs2/dlmglue.c | 18 ++++++++---------- fs/ocfs2/dlmglue.h | 6 ++---- 3 files changed, 11 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c0a6b82a6a68..87127a7f2f78 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -283,7 +283,7 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, folio->index); - ret = ocfs2_inode_lock_with_page(inode, NULL, 0, &folio->page); + ret = ocfs2_inode_lock_with_folio(inode, NULL, 0, folio); if (ret != 0) { if (ret == AOP_TRUNCATED_PAGE) unlock = 0; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 8f08e89be37d..c9b62a6d8673 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2529,30 +2529,28 @@ bail: /* * This is working around a lock inversion between tasks acquiring DLM - * locks while holding a page lock and the downconvert thread which - * blocks dlm lock acquiry while acquiring page locks. + * locks while holding a folio lock and the downconvert thread which + * blocks dlm lock acquiry while acquiring folio locks. 
* - * ** These _with_page variants are only intended to be called from aop - * methods that hold page locks and return a very specific *positive* error + * ** These _with_folio variants are only intended to be called from aop + * methods that hold folio locks and return a very specific *positive* error * code that aop methods pass up to the VFS -- test for errors with != 0. ** * * The DLM is called such that it returns -EAGAIN if it would have * blocked waiting for the downconvert thread. In that case we unlock - * our page so the downconvert thread can make progress. Once we've + * our folio so the downconvert thread can make progress. Once we've * done this we have to return AOP_TRUNCATED_PAGE so the aop method * that called us can bubble that back up into the VFS who will then * immediately retry the aop call. */ -int ocfs2_inode_lock_with_page(struct inode *inode, - struct buffer_head **ret_bh, - int ex, - struct page *page) +int ocfs2_inode_lock_with_folio(struct inode *inode, + struct buffer_head **ret_bh, int ex, struct folio *folio) { int ret; ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { - unlock_page(page); + folio_unlock(folio); /* * If we can't get inode lock immediately, we should not return * directly here, since this will lead to a softlockup problem. diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index e5da5809ed95..a3ebd7303ea2 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -137,10 +137,8 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, int ex, int arg_flags, int subclass); -int ocfs2_inode_lock_with_page(struct inode *inode, - struct buffer_head **ret_bh, - int ex, - struct page *page); +int ocfs2_inode_lock_with_folio(struct inode *inode, + struct buffer_head **ret_bh, int ex, struct folio *folio); /* Variants without special locking class or flags */ #define ocfs2_inode_lock_full(i, r, e, f)\ ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL) -- cgit v1.2.3 From 7e119cff9d0a21e39e53c7d27addc31f4b8c0000 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:37 +0000 Subject: ocfs2: convert w_pages to w_folios Pass around an array of folios instead of an array of pages. Removes a few calls to compound_head(). Link: https://lkml.kernel.org/r/20241205171653.3179945-10-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 118 ++++++++++++++++++++++++++++++-------------------------- fs/ocfs2/aops.h | 1 + 2 files changed, 65 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 87127a7f2f78..f088183fbae2 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -729,19 +729,19 @@ struct ocfs2_write_ctxt { unsigned int w_large_pages; /* - * Pages involved in this write. + * Folios involved in this write. * * w_target_folio is the folio being written to by the user. * - * w_pages is an array of pages which always contains + * w_folios is an array of folios which always contains * w_target_folio, and in the case of an allocating write with * page_size < cluster size, it will contain zero'd and mapped * pages adjacent to w_target_folio which need to be written * out in so that future reads from that region will get * zero's. 
*/ - unsigned int w_num_pages; - struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; + unsigned int w_num_folios; + struct folio *w_folios[OCFS2_MAX_CTXT_PAGES]; struct folio *w_target_folio; /* @@ -771,6 +771,19 @@ struct ocfs2_write_ctxt { unsigned int w_unwritten_count; }; +void ocfs2_unlock_and_free_folios(struct folio **folios, int num_folios) +{ + int i; + + for(i = 0; i < num_folios; i++) { + if (!folios[i]) + continue; + folio_unlock(folios[i]); + folio_mark_accessed(folios[i]); + folio_put(folios[i]); + } +} + void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) { int i; @@ -784,7 +797,7 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) } } -static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) +static void ocfs2_unlock_folios(struct ocfs2_write_ctxt *wc) { int i; @@ -795,16 +808,16 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) */ if (wc->w_target_locked) { BUG_ON(!wc->w_target_folio); - for (i = 0; i < wc->w_num_pages; i++) { - if (&wc->w_target_folio->page == wc->w_pages[i]) { - wc->w_pages[i] = NULL; + for (i = 0; i < wc->w_num_folios; i++) { + if (wc->w_target_folio == wc->w_folios[i]) { + wc->w_folios[i] = NULL; break; } } folio_mark_accessed(wc->w_target_folio); folio_put(wc->w_target_folio); } - ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); + ocfs2_unlock_and_free_folios(wc->w_folios, wc->w_num_folios); } static void ocfs2_free_unwritten_list(struct inode *inode, @@ -826,7 +839,7 @@ static void ocfs2_free_write_ctxt(struct inode *inode, struct ocfs2_write_ctxt *wc) { ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list); - ocfs2_unlock_pages(wc); + ocfs2_unlock_folios(wc); brelse(wc->w_di_bh); kfree(wc); } @@ -922,8 +935,8 @@ static void ocfs2_write_failure(struct inode *inode, if (wc->w_target_folio) ocfs2_zero_new_buffers(wc->w_target_folio, from, to); - for(i = 0; i < wc->w_num_pages; i++) { - tmppage = wc->w_pages[i]; + for (i = 0; i < wc->w_num_folios; i++) { + tmppage = &wc->w_folios[i]->page; if (tmppage && page_has_buffers(tmppage)) { if (ocfs2_should_order_data(inode)) @@ -935,12 +948,11 @@ static void ocfs2_write_failure(struct inode *inode, } } -static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, - struct ocfs2_write_ctxt *wc, - struct page *page, u32 cpos, - loff_t user_pos, unsigned user_len, - int new) +static int ocfs2_prepare_folio_for_write(struct inode *inode, u64 *p_blkno, + struct ocfs2_write_ctxt *wc, struct folio *folio, u32 cpos, + loff_t user_pos, unsigned user_len, int new) { + struct page *page = &folio->page; int ret; unsigned int map_from = 0, map_to = 0; unsigned int cluster_start, cluster_end; @@ -1019,11 +1031,9 @@ out: /* * This function will only grab one clusters worth of pages. */ -static int ocfs2_grab_pages_for_write(struct address_space *mapping, - struct ocfs2_write_ctxt *wc, - u32 cpos, loff_t user_pos, - unsigned user_len, int new, - struct folio *mmap_folio) +static int ocfs2_grab_folios_for_write(struct address_space *mapping, + struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, + unsigned user_len, int new, struct folio *mmap_folio) { int ret = 0, i; unsigned long start, target_index, end_index, index; @@ -1040,7 +1050,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, * last page of the write. 
*/ if (new) { - wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); + wc->w_num_folios = ocfs2_pages_per_cluster(inode->i_sb); start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); /* * We need the index *past* the last page we could possibly @@ -1050,15 +1060,15 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, last_byte = max(user_pos + user_len, i_size_read(inode)); BUG_ON(last_byte < 1); end_index = ((last_byte - 1) >> PAGE_SHIFT) + 1; - if ((start + wc->w_num_pages) > end_index) - wc->w_num_pages = end_index - start; + if ((start + wc->w_num_folios) > end_index) + wc->w_num_folios = end_index - start; } else { - wc->w_num_pages = 1; + wc->w_num_folios = 1; start = target_index; } end_index = (user_pos + user_len - 1) >> PAGE_SHIFT; - for(i = 0; i < wc->w_num_pages; i++) { + for(i = 0; i < wc->w_num_folios; i++) { index = start + i; if (index >= target_index && index <= end_index && @@ -1079,26 +1089,27 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, } folio_get(mmap_folio); - wc->w_pages[i] = &mmap_folio->page; + wc->w_folios[i] = mmap_folio; wc->w_target_locked = true; } else if (index >= target_index && index <= end_index && wc->w_type == OCFS2_WRITE_DIRECT) { /* Direct write has no mapping page. */ - wc->w_pages[i] = NULL; + wc->w_folios[i] = NULL; continue; } else { - wc->w_pages[i] = find_or_create_page(mapping, index, - GFP_NOFS); - if (!wc->w_pages[i]) { - ret = -ENOMEM; + wc->w_folios[i] = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + GFP_NOFS); + if (IS_ERR(wc->w_folios[i])) { + ret = PTR_ERR(wc->w_folios[i]); mlog_errno(ret); goto out; } } - wait_for_stable_page(wc->w_pages[i]); + folio_wait_stable(wc->w_folios[i]); if (index == target_index) - wc->w_target_folio = page_folio(wc->w_pages[i]); + wc->w_target_folio = wc->w_folios[i]; } out: if (ret) @@ -1182,19 +1193,18 @@ static int ocfs2_write_cluster(struct address_space *mapping, if (!should_zero) p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1); - for(i = 0; i < wc->w_num_pages; i++) { + for (i = 0; i < wc->w_num_folios; i++) { int tmpret; /* This is the direct io target page. */ - if (wc->w_pages[i] == NULL) { + if (wc->w_folios[i] == NULL) { p_blkno += (1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits)); continue; } - tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, - wc->w_pages[i], cpos, - user_pos, user_len, - should_zero); + tmpret = ocfs2_prepare_folio_for_write(inode, &p_blkno, wc, + wc->w_folios[i], cpos, user_pos, user_len, + should_zero); if (tmpret) { mlog_errno(tmpret); if (ret == 0) @@ -1493,12 +1503,12 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, goto out; } /* - * If we don't set w_num_pages then this folio won't get unlocked + * If we don't set w_num_folios then this folio won't get unlocked * and freed on cleanup of the write context. */ wc->w_target_folio = folio; - wc->w_pages[0] = &folio->page; - wc->w_num_pages = 1; + wc->w_folios[0] = folio; + wc->w_num_folios = 1; ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1791,18 +1801,18 @@ try_again: } /* - * Fill our page array first. That way we've grabbed enough so + * Fill our folio array first. That way we've grabbed enough so * that we can zero and flush if we error after adding the * extent. 
*/ - ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, - cluster_of_pages, mmap_folio); + ret = ocfs2_grab_folios_for_write(mapping, wc, wc->w_cpos, pos, len, + cluster_of_pages, mmap_folio); if (ret) { /* - * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock - * the target page. In this case, we exit with no error and no target - * page. This will trigger the caller, page_mkwrite(), to re-try - * the operation. + * ocfs2_grab_folios_for_write() returns -EAGAIN if it + * could not lock the target folio. In this case, we exit + * with no error and no target folio. This will trigger + * the caller, page_mkwrite(), to re-try the operation. */ if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) { BUG_ON(wc->w_target_folio); @@ -1997,8 +2007,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, if (wc->w_target_folio) flush_dcache_folio(wc->w_target_folio); - for(i = 0; i < wc->w_num_pages; i++) { - tmppage = wc->w_pages[i]; + for (i = 0; i < wc->w_num_folios; i++) { + tmppage = &wc->w_folios[i]->page; /* This is the direct io target page. */ if (tmppage == NULL) @@ -2059,7 +2069,7 @@ out: * this lock and will ask for the page lock when flushing the data. * put it here to preserve the unlock order. */ - ocfs2_unlock_pages(wc); + ocfs2_unlock_folios(wc); if (handle) ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 1cb46072ad84..dcf4b0dc82c4 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -17,6 +17,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new); +void ocfs2_unlock_and_free_folios(struct folio **folios, int num_folios); void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages); int walk_page_buffers( handle_t *handle, -- cgit v1.2.3 From ecee61651d8f5cf65a60a6dfc50d7eab7502a392 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:38 +0000 Subject: ocfs2: convert ocfs2_write_failure() to use a folio Remove the folio->page conversion and just use the folio. Link: https://lkml.kernel.org/r/20241205171653.3179945-11-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f088183fbae2..2b3a3f5566d5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -930,20 +930,19 @@ static void ocfs2_write_failure(struct inode *inode, int i; unsigned from = user_pos & (PAGE_SIZE - 1), to = user_pos + user_len; - struct page *tmppage; if (wc->w_target_folio) ocfs2_zero_new_buffers(wc->w_target_folio, from, to); for (i = 0; i < wc->w_num_folios; i++) { - tmppage = &wc->w_folios[i]->page; + struct folio *folio = wc->w_folios[i]; - if (tmppage && page_has_buffers(tmppage)) { + if (folio && folio_buffers(folio)) { if (ocfs2_should_order_data(inode)) ocfs2_jbd2_inode_add_write(wc->w_handle, inode, user_pos, user_len); - block_commit_write(tmppage, from, to); + block_commit_write(&folio->page, from, to); } } } -- cgit v1.2.3 From 1b426db11ba812a7805c4911a9629922781f52f1 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:39 +0000 Subject: ocfs2: use a folio in ocfs2_write_end_nolock() Remove an access to page->index. Remove some PAGE_SIZE assumptions. 
Link: https://lkml.kernel.org/r/20241205171653.3179945-12-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 2b3a3f5566d5..47c48f4e707c 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1952,17 +1952,16 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, le16_to_cpu(di->i_dyn_features)); } -int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, void *fsdata) +int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, + unsigned len, unsigned copied, void *fsdata) { int i, ret; - unsigned from, to, start = pos & (PAGE_SIZE - 1); + size_t from, to, start = pos & (PAGE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_write_ctxt *wc = fsdata; struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; handle_t *handle = wc->w_handle; - struct page *tmppage; BUG_ON(!list_empty(&wc->w_unwritten_list)); @@ -1993,32 +1992,32 @@ int ocfs2_write_end_nolock(struct address_space *mapping, start+len); else { /* - * When page is fully beyond new isize (data copy - * failed), do not bother zeroing the page. Invalidate + * When folio is fully beyond new isize (data copy + * failed), do not bother zeroing the folio. Invalidate * it instead so that writeback does not get confused * put page & buffer dirty bits into inconsistent * state. */ - block_invalidate_folio(wc->w_target_folio, - 0, PAGE_SIZE); + block_invalidate_folio(wc->w_target_folio, 0, + folio_size(wc->w_target_folio)); } } if (wc->w_target_folio) flush_dcache_folio(wc->w_target_folio); for (i = 0; i < wc->w_num_folios; i++) { - tmppage = &wc->w_folios[i]->page; + struct folio *folio = wc->w_folios[i]; - /* This is the direct io target page. */ - if (tmppage == NULL) + /* This is the direct io target folio */ + if (folio == NULL) continue; - if (tmppage == &wc->w_target_folio->page) { + if (folio == wc->w_target_folio) { from = wc->w_target_from; to = wc->w_target_to; - BUG_ON(from > PAGE_SIZE || - to > PAGE_SIZE || + BUG_ON(from > folio_size(folio) || + to > folio_size(folio) || to < from); } else { /* @@ -2027,19 +2026,17 @@ int ocfs2_write_end_nolock(struct address_space *mapping, * to flush their entire range. */ from = 0; - to = PAGE_SIZE; + to = folio_size(folio); } - if (page_has_buffers(tmppage)) { + if (folio_buffers(folio)) { if (handle && ocfs2_should_order_data(inode)) { - loff_t start_byte = - ((loff_t)tmppage->index << PAGE_SHIFT) + - from; + loff_t start_byte = folio_pos(folio) + from; loff_t length = to - from; ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length); } - block_commit_write(tmppage, from, to); + block_commit_write(&folio->page, from, to); } } -- cgit v1.2.3 From 38bfefaa12c68709dd01a9658d791c1b1511480f Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:40 +0000 Subject: ocfs2: use a folio in ocfs2_prepare_page_for_write() Update to the new APIs. Removes a few page->folio conversions. 
Link: https://lkml.kernel.org/r/20241205171653.3179945-13-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 47c48f4e707c..b23decd51882 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -963,10 +963,10 @@ static int ocfs2_prepare_folio_for_write(struct inode *inode, u64 *p_blkno, /* treat the write as new if the a hole/lseek spanned across * the page boundary. */ - new = new | ((i_size_read(inode) <= page_offset(page)) && - (page_offset(page) <= user_pos)); + new = new | ((i_size_read(inode) <= folio_pos(folio)) && + (folio_pos(folio) <= user_pos)); - if (page == &wc->w_target_folio->page) { + if (folio == wc->w_target_folio) { map_from = user_pos & (PAGE_SIZE - 1); map_to = map_from + user_len; @@ -990,7 +990,7 @@ static int ocfs2_prepare_folio_for_write(struct inode *inode, u64 *p_blkno, } } else { /* - * If we haven't allocated the new page yet, we + * If we haven't allocated the new folio yet, we * shouldn't be writing it out without copying user * data. This is likely a math error from the caller. */ @@ -1008,20 +1008,20 @@ static int ocfs2_prepare_folio_for_write(struct inode *inode, u64 *p_blkno, } /* - * Parts of newly allocated pages need to be zero'd. + * Parts of newly allocated folios need to be zero'd. * * Above, we have also rewritten 'to' and 'from' - as far as * the rest of the function is concerned, the entire cluster - * range inside of a page needs to be written. + * range inside of a folio needs to be written. * - * We can skip this if the page is up to date - it's already + * We can skip this if the folio is uptodate - it's already * been zero'd from being read in as a hole. */ - if (new && !PageUptodate(page)) + if (new && !folio_test_uptodate(folio)) ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), cpos, user_data_from, user_data_to); - flush_dcache_page(page); + flush_dcache_folio(folio); out: return ret; -- cgit v1.2.3 From b04d305df1171448df5e87802c4d1f1022cc5784 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:41 +0000 Subject: ocfs2: use a folio in ocfs2_map_and_dirty_page() Convert the incoming page to a folio and use it throughout the function. Removes a couple of calls to compound_head(). 
Link: https://lkml.kernel.org/r/20241205171653.3179945-14-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index b3fa953e5637..f65e8f1201fc 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6812,8 +6812,9 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, unsigned int from, unsigned int to, struct page *page, int zero, u64 *phys) { + struct folio *folio = page_folio(page); int ret, partial = 0; - loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from; + loff_t start_byte = folio_pos(folio) + from; loff_t length = to - from; ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); @@ -6821,14 +6822,14 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, mlog_errno(ret); if (zero) - zero_user_segment(page, from, to); + folio_zero_segment(folio, from, to); /* * Need to set the buffers we zero'd into uptodate * here if they aren't - ocfs2_map_page_blocks() * might've skipped some */ - ret = walk_page_buffers(handle, page_buffers(page), + ret = walk_page_buffers(handle, folio_buffers(folio), from, to, &partial, ocfs2_zero_func); if (ret < 0) @@ -6841,9 +6842,9 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, } if (!partial) - SetPageUptodate(page); + folio_mark_uptodate(folio); - flush_dcache_page(page); + flush_dcache_folio(folio); } static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, -- cgit v1.2.3 From 0313500d5d21778be60fdd699a18aedb6ac34a28 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:42 +0000 Subject: ocfs2: convert ocfs2_map_page_blocks() to ocfs2_map_folio_blocks() All callers now have a folio, so pass it in instead of converting folio->page->folio. Link: https://lkml.kernel.org/r/20241205171653.3179945-15-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/aops.c | 18 ++++++++---------- fs/ocfs2/aops.h | 2 +- 3 files changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f65e8f1201fc..fbadfe53a93f 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6817,7 +6817,7 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, loff_t start_byte = folio_pos(folio) + from; loff_t length = to - from; - ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); + ret = ocfs2_map_folio_blocks(folio, phys, inode, from, to, 0); if (ret) mlog_errno(ret); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index b23decd51882..68e93af85d1d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -560,7 +560,7 @@ static void ocfs2_clear_page_regions(struct page *page, /* * Nonsparse file systems fully allocate before we get to the write * code. This prevents ocfs2_write() from tagging the write as an - * allocating one, which means ocfs2_map_page_blocks() might try to + * allocating one, which means ocfs2_map_folio_blocks() might try to * read-in the blocks at the tail of our file. Avoid reading them by * testing i_size against each block offset. 
*/ @@ -585,11 +585,10 @@ static int ocfs2_should_read_blk(struct inode *inode, struct folio *folio, * * This will also skip zeroing, which is handled externally. */ -int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, +int ocfs2_map_folio_blocks(struct folio *folio, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new) { - struct folio *folio = page_folio(page); int ret = 0; struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; unsigned int block_end, block_start; @@ -971,12 +970,11 @@ static int ocfs2_prepare_folio_for_write(struct inode *inode, u64 *p_blkno, map_to = map_from + user_len; if (new) - ret = ocfs2_map_page_blocks(page, p_blkno, inode, - cluster_start, cluster_end, - new); + ret = ocfs2_map_folio_blocks(folio, p_blkno, inode, + cluster_start, cluster_end, new); else - ret = ocfs2_map_page_blocks(page, p_blkno, inode, - map_from, map_to, new); + ret = ocfs2_map_folio_blocks(folio, p_blkno, inode, + map_from, map_to, new); if (ret) { mlog_errno(ret); goto out; @@ -999,8 +997,8 @@ static int ocfs2_prepare_folio_for_write(struct inode *inode, u64 *p_blkno, map_from = cluster_start; map_to = cluster_end; - ret = ocfs2_map_page_blocks(page, p_blkno, inode, - cluster_start, cluster_end, new); + ret = ocfs2_map_folio_blocks(folio, p_blkno, inode, + cluster_start, cluster_end, new); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index dcf4b0dc82c4..17ca359c6051 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -13,7 +13,7 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, unsigned from, unsigned to); -int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, +int ocfs2_map_folio_blocks(struct folio *folio, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new); -- cgit v1.2.3 From 4d434f53746a6c7cd2e18b3c2fcc109bf99094d9 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:43 +0000 Subject: ocfs2: convert ocfs2_clear_page_regions() to ocfs2_clear_folio_regions() Pass in and use the folio instead of its page. 
Link: https://lkml.kernel.org/r/20241205171653.3179945-16-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 68e93af85d1d..ea8dc82cfe98 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -534,7 +534,7 @@ static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, * * from == to == 0 is code for "zero the entire cluster region" */ -static void ocfs2_clear_page_regions(struct page *page, +static void ocfs2_clear_folio_regions(struct folio *folio, struct ocfs2_super *osb, u32 cpos, unsigned from, unsigned to) { @@ -543,7 +543,7 @@ static void ocfs2_clear_page_regions(struct page *page, ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); - kaddr = kmap_atomic(page); + kaddr = kmap_local_folio(folio, 0); if (from || to) { if (from > cluster_start) @@ -554,7 +554,7 @@ static void ocfs2_clear_page_regions(struct page *page, memset(kaddr + cluster_start, 0, cluster_end - cluster_start); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); } /* @@ -950,7 +950,6 @@ static int ocfs2_prepare_folio_for_write(struct inode *inode, u64 *p_blkno, struct ocfs2_write_ctxt *wc, struct folio *folio, u32 cpos, loff_t user_pos, unsigned user_len, int new) { - struct page *page = &folio->page; int ret; unsigned int map_from = 0, map_to = 0; unsigned int cluster_start, cluster_end; @@ -1016,7 +1015,7 @@ static int ocfs2_prepare_folio_for_write(struct inode *inode, u64 *p_blkno, * been zero'd from being read in as a hole. */ if (new && !folio_test_uptodate(folio)) - ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), + ocfs2_clear_folio_regions(folio, OCFS2_SB(inode->i_sb), cpos, user_data_from, user_data_to); flush_dcache_folio(folio); -- cgit v1.2.3 From 9a5e08652dc4bea7de6495d84855439769208eab Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:44 +0000 Subject: ocfs2: use an array of folios instead of an array of pages The ocfs2_zero_cluster_folios() / ocfs2_grab_folios() / ocfs2_grab_eof_folios() family of functions pass around an array of pages. Convert them to pass around an array of folios. This removes the last caller of ocfs2_unlock_and_free_pages(), so delete it. 
Link: https://lkml.kernel.org/r/20241205171653.3179945-17-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 98 ++++++++++++++++++++++++++++---------------------------- fs/ocfs2/alloc.h | 2 -- fs/ocfs2/aops.c | 13 -------- fs/ocfs2/aops.h | 1 - 4 files changed, 49 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index fbadfe53a93f..0f6ddb534a44 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6847,87 +6847,87 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, flush_dcache_folio(folio); } -static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, - loff_t end, struct page **pages, - int numpages, u64 phys, handle_t *handle) +static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start, + loff_t end, struct folio **folios, int numfolios, + u64 phys, handle_t *handle) { int i; - struct page *page; unsigned int from, to = PAGE_SIZE; struct super_block *sb = inode->i_sb; BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); - if (numpages == 0) + if (numfolios == 0) goto out; to = PAGE_SIZE; - for(i = 0; i < numpages; i++) { - page = pages[i]; + for (i = 0; i < numfolios; i++) { + struct folio *folio = folios[i]; from = start & (PAGE_SIZE - 1); - if ((end >> PAGE_SHIFT) == page->index) + if ((end >> PAGE_SHIFT) == folio->index) to = end & (PAGE_SIZE - 1); BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); - ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1, + ocfs2_map_and_dirty_page(inode, handle, from, to, &folio->page, 1, &phys); - start = (page->index + 1) << PAGE_SHIFT; + start = (folio->index + 1) << PAGE_SHIFT; } out: - if (pages) - ocfs2_unlock_and_free_pages(pages, numpages); + if (folios) + ocfs2_unlock_and_free_folios(folios, numfolios); } -int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num) +static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end, + struct folio **folios, int *num) { - int numpages, ret = 0; + int numfolios, ret = 0; struct address_space *mapping = inode->i_mapping; unsigned long index; loff_t last_page_bytes; BUG_ON(start > end); - numpages = 0; + numfolios = 0; last_page_bytes = PAGE_ALIGN(end); index = start >> PAGE_SHIFT; do { - pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); - if (!pages[numpages]) { - ret = -ENOMEM; + folios[numfolios] = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folios[numfolios])) { + ret = PTR_ERR(folios[numfolios]); mlog_errno(ret); goto out; } - numpages++; - index++; + index = folio_next_index(folios[numfolios]); + numfolios++; } while (index < (last_page_bytes >> PAGE_SHIFT)); out: if (ret != 0) { - if (pages) - ocfs2_unlock_and_free_pages(pages, numpages); - numpages = 0; + if (folios) + ocfs2_unlock_and_free_folios(folios, numfolios); + numfolios = 0; } - *num = numpages; + *num = numfolios; return ret; } -static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num) +static int ocfs2_grab_eof_folios(struct inode *inode, loff_t start, loff_t end, + struct folio **folios, int *num) { struct super_block *sb = inode->i_sb; BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); - return ocfs2_grab_pages(inode, start, end, 
pages, num); + return ocfs2_grab_folios(inode, start, end, folios, num); } /* @@ -6941,8 +6941,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, u64 range_start, u64 range_end) { - int ret = 0, numpages; - struct page **pages = NULL; + int ret = 0, numfolios; + struct folio **folios = NULL; u64 phys; unsigned int ext_flags; struct super_block *sb = inode->i_sb; @@ -6955,17 +6955,17 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, return 0; /* - * Avoid zeroing pages fully beyond current i_size. It is pointless as - * underlying blocks of those pages should be already zeroed out and + * Avoid zeroing folios fully beyond current i_size. It is pointless as + * underlying blocks of those folios should be already zeroed out and * page writeback will skip them anyway. */ range_end = min_t(u64, range_end, i_size_read(inode)); if (range_start >= range_end) return 0; - pages = kcalloc(ocfs2_pages_per_cluster(sb), - sizeof(struct page *), GFP_NOFS); - if (pages == NULL) { + folios = kcalloc(ocfs2_pages_per_cluster(sb), + sizeof(struct folio *), GFP_NOFS); + if (folios == NULL) { ret = -ENOMEM; mlog_errno(ret); goto out; @@ -6986,18 +6986,18 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN) goto out; - ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, - &numpages); + ret = ocfs2_grab_eof_folios(inode, range_start, range_end, folios, + &numfolios); if (ret) { mlog_errno(ret); goto out; } - ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, - numpages, phys, handle); + ocfs2_zero_cluster_folios(inode, range_start, range_end, folios, + numfolios, phys, handle); /* - * Initiate writeout of the pages we zero'd here. We don't + * Initiate writeout of the folios we zero'd here. We don't * wait on them - the truncate_inode_pages() call later will * do that for us. */ @@ -7007,7 +7007,7 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, mlog_errno(ret); out: - kfree(pages); + kfree(folios); return ret; } @@ -7060,7 +7060,7 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct buffer_head *di_bh) { - int ret, has_data, num_pages = 0; + int ret, has_data, num_folios = 0; int need_free = 0; u32 bit_off, num; handle_t *handle; @@ -7069,7 +7069,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_alloc_context *data_ac = NULL; - struct page *page = NULL; + struct folio *folio = NULL; struct ocfs2_extent_tree et; int did_quota = 0; @@ -7124,8 +7124,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, */ block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); - ret = ocfs2_grab_eof_pages(inode, 0, page_end, &page, - &num_pages); + ret = ocfs2_grab_eof_folios(inode, 0, page_end, &folio, + &num_folios); if (ret) { mlog_errno(ret); need_free = 1; @@ -7136,14 +7136,14 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * This should populate the 1st page for us and mark * it up to date. 
*/ - ret = ocfs2_read_inline_data(inode, page, di_bh); + ret = ocfs2_read_inline_data(inode, &folio->page, di_bh); if (ret) { mlog_errno(ret); need_free = 1; goto out_unlock; } - ocfs2_map_and_dirty_page(inode, handle, 0, page_end, page, 0, + ocfs2_map_and_dirty_page(inode, handle, 0, page_end, &folio->page, 0, &phys); } @@ -7175,8 +7175,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } out_unlock: - if (page) - ocfs2_unlock_and_free_pages(&page, num_pages); + if (folio) + ocfs2_unlock_and_free_folios(&folio, num_folios); out_commit: if (ret < 0 && did_quota) diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 4af7abaa6e40..6a2aca1a062e 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -254,8 +254,6 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) return !rec->e_leaf_clusters; } -int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num); void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, unsigned int from, unsigned int to, struct page *page, int zero, u64 *phys); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ea8dc82cfe98..61fecfe7dce6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -783,19 +783,6 @@ void ocfs2_unlock_and_free_folios(struct folio **folios, int num_folios) } } -void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) -{ - int i; - - for(i = 0; i < num_pages; i++) { - if (pages[i]) { - unlock_page(pages[i]); - mark_page_accessed(pages[i]); - put_page(pages[i]); - } - } -} - static void ocfs2_unlock_folios(struct ocfs2_write_ctxt *wc) { int i; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 17ca359c6051..cf8d202d9a8b 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -18,7 +18,6 @@ int ocfs2_map_folio_blocks(struct folio *folio, u64 *p_blkno, unsigned int to, int new); void ocfs2_unlock_and_free_folios(struct folio **folios, int num_folios); -void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages); int walk_page_buffers( handle_t *handle, struct buffer_head *head, -- cgit v1.2.3 From 395ea2ab463087629b254657378f92b24f9a8fb2 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:45 +0000 Subject: ocfs2: convert ocfs2_duplicate_clusters_by_page() to use a folio Retrieve folios from the page cache, not pages, and use a folio throughout this function. Removes seven calls to compound_head(). 
Link: https://lkml.kernel.org/r/20241205171653.3179945-18-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/refcounttree.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 73caf991ede5..1d7325d4411f 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2902,7 +2902,6 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, int ret = 0, partial; struct super_block *sb = inode->i_sb; u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); - struct page *page; pgoff_t page_index; unsigned int from, to; loff_t offset, end, map_end; @@ -2921,6 +2920,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, end = i_size_read(inode); while (offset < end) { + struct folio *folio; page_index = offset >> PAGE_SHIFT; map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; if (map_end > end) @@ -2933,9 +2933,10 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, to = map_end & (PAGE_SIZE - 1); retry: - page = find_or_create_page(mapping, page_index, GFP_NOFS); - if (!page) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, page_index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); mlog_errno(ret); break; } @@ -2945,9 +2946,9 @@ retry: * page, so write it back. */ if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) { - if (PageDirty(page)) { - unlock_page(page); - put_page(page); + if (folio_test_dirty(folio)) { + folio_unlock(folio); + folio_put(folio); ret = filemap_write_and_wait_range(mapping, offset, map_end - 1); @@ -2955,9 +2956,7 @@ retry: } } - if (!PageUptodate(page)) { - struct folio *folio = page_folio(page); - + if (!folio_test_uptodate(folio)) { ret = block_read_full_folio(folio, ocfs2_get_block); if (ret) { mlog_errno(ret); @@ -2966,8 +2965,8 @@ retry: folio_lock(folio); } - if (page_has_buffers(page)) { - ret = walk_page_buffers(handle, page_buffers(page), + if (folio_buffers(folio)) { + ret = walk_page_buffers(handle, folio_buffers(folio), from, to, &partial, ocfs2_clear_cow_buffer); if (ret) { @@ -2978,12 +2977,11 @@ retry: ocfs2_map_and_dirty_page(inode, handle, from, to, - page, 0, &new_block); - mark_page_accessed(page); + &folio->page, 0, &new_block); + folio_mark_accessed(folio); unlock: - unlock_page(page); - put_page(page); - page = NULL; + folio_unlock(folio); + folio_put(folio); offset = map_end; if (ret) break; -- cgit v1.2.3 From 602f206a7cd3cbf9f80888cdc210806a8fb27ff0 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:46 +0000 Subject: ocfs2: convert ocfs2_map_and_dirty_page() to ocfs2_map_and_dirty_folio() All callers now have a folio, so skip the folio->page->folio conversion. 
Link: https://lkml.kernel.org/r/20241205171653.3179945-19-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 17 ++++++++--------- fs/ocfs2/alloc.h | 6 +++--- fs/ocfs2/refcounttree.c | 5 ++--- 3 files changed, 13 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0f6ddb534a44..88d43fb5bf7d 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6808,11 +6808,10 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) return 0; } -void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, - unsigned int from, unsigned int to, - struct page *page, int zero, u64 *phys) +void ocfs2_map_and_dirty_folio(struct inode *inode, handle_t *handle, + size_t from, size_t to, struct folio *folio, int zero, + u64 *phys) { - struct folio *folio = page_folio(page); int ret, partial = 0; loff_t start_byte = folio_pos(folio) + from; loff_t length = to - from; @@ -6871,8 +6870,8 @@ static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start, BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); - ocfs2_map_and_dirty_page(inode, handle, from, to, &folio->page, 1, - &phys); + ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1, + &phys); start = (folio->index + 1) << PAGE_SHIFT; } @@ -7120,7 +7119,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, /* * Save two copies, one for insert, and one that can - * be changed by ocfs2_map_and_dirty_page() below. + * be changed by ocfs2_map_and_dirty_folio() below. */ block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); @@ -7143,8 +7142,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, goto out_unlock; } - ocfs2_map_and_dirty_page(inode, handle, 0, page_end, &folio->page, 0, - &phys); + ocfs2_map_and_dirty_folio(inode, handle, 0, page_end, folio, 0, + &phys); } spin_lock(&oi->ip_lock); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 6a2aca1a062e..1c0c83362904 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -254,9 +254,9 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) return !rec->e_leaf_clusters; } -void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, - unsigned int from, unsigned int to, - struct page *page, int zero, u64 *phys); +void ocfs2_map_and_dirty_folio(struct inode *inode, handle_t *handle, + size_t from, size_t to, struct folio *folio, int zero, + u64 *phys); /* * Structures which describe a path through a btree, and functions to * manipulate them. diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 1d7325d4411f..8f732742b26e 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2975,9 +2975,8 @@ retry: } } - ocfs2_map_and_dirty_page(inode, - handle, from, to, - &folio->page, 0, &new_block); + ocfs2_map_and_dirty_folio(inode, handle, from, to, + folio, 0, &new_block); folio_mark_accessed(folio); unlock: folio_unlock(folio); -- cgit v1.2.3 From 03902e65616e18efc944c21ca1b871813875b257 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Dec 2024 17:16:47 +0000 Subject: ocfs2: convert ocfs2_read_inline_data() to take a folio All callers now have a folio, so pass it in. We can use folio_fill_tail() instead of open-coding it. Saves a call to compound_head(). 
Link: https://lkml.kernel.org/r/20241205171653.3179945-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Mark Tinguely Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/aops.c | 20 ++++++-------------- fs/ocfs2/aops.h | 2 +- 3 files changed, 8 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 88d43fb5bf7d..453be2d2c124 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7135,7 +7135,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * This should populate the 1st page for us and mark * it up to date. */ - ret = ocfs2_read_inline_data(inode, &folio->page, di_bh); + ret = ocfs2_read_inline_data(inode, folio, di_bh); if (ret) { mlog_errno(ret); need_free = 1; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 61fecfe7dce6..5bbeb6fbb1ac 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -215,10 +215,9 @@ bail: return err; } -int ocfs2_read_inline_data(struct inode *inode, struct page *page, +int ocfs2_read_inline_data(struct inode *inode, struct folio *folio, struct buffer_head *di_bh) { - void *kaddr; loff_t size; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; @@ -230,7 +229,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, size = i_size_read(inode); - if (size > PAGE_SIZE || + if (size > folio_size(folio) || size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, "Inode %llu has with inline data has bad size: %Lu\n", @@ -239,15 +238,8 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, return -EROFS; } - kaddr = kmap_atomic(page); - if (size) - memcpy(kaddr, di->id2.i_data.id_data, size); - /* Clear the remaining part of the page */ - memset(kaddr + size, 0, PAGE_SIZE - size); - flush_dcache_page(page); - kunmap_atomic(kaddr); - - SetPageUptodate(page); + folio_fill_tail(folio, 0, di->id2.i_data.id_data, size); + folio_mark_uptodate(folio); return 0; } @@ -266,7 +258,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct folio *folio) goto out; } - ret = ocfs2_read_inline_data(inode, &folio->page, di_bh); + ret = ocfs2_read_inline_data(inode, folio, di_bh); out: folio_unlock(folio); @@ -1506,7 +1498,7 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, ocfs2_set_inode_data_inline(inode, di); if (!folio_test_uptodate(folio)) { - ret = ocfs2_read_inline_data(inode, &folio->page, wc->w_di_bh); + ret = ocfs2_read_inline_data(inode, folio, wc->w_di_bh); if (ret) { ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index cf8d202d9a8b..6f25066e5756 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -41,7 +41,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, struct folio **foliop, void **fsdata, struct buffer_head *di_bh, struct folio *mmap_folio); -int ocfs2_read_inline_data(struct inode *inode, struct page *page, +int ocfs2_read_inline_data(struct inode *inode, struct folio *folio, struct buffer_head *di_bh); int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); -- cgit v1.2.3 From d254824ef15dbb67978d4a069d2535cfb913c8f7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Dec 2024 17:16:48 +0000 Subject: ocfs2: use a folio in ocfs2_fast_symlink_read_folio() Use folio_end_read() instead of SetPageUptodate() and unlock_page(). 
Use memcpy_to_folio() instead of open-coding a kmap_atomic() sequence. Link: https://lkml.kernel.org/r/20241205171653.3179945-21-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Mark Tinguely Signed-off-by: Andrew Morton --- fs/ocfs2/symlink.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index f5cf2255dc09..ad8be3300b49 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -54,13 +54,11 @@ static int ocfs2_fast_symlink_read_folio(struct file *f, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct buffer_head *bh = NULL; int status = ocfs2_read_inode_block(inode, &bh); struct ocfs2_dinode *fe; const char *link; - void *kaddr; size_t len; if (status < 0) { @@ -72,12 +70,9 @@ static int ocfs2_fast_symlink_read_folio(struct file *f, struct folio *folio) link = (char *) fe->id2.i_symlink; /* will be less than a page size */ len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb)); - kaddr = kmap_atomic(page); - memcpy(kaddr, link, len + 1); - kunmap_atomic(kaddr); - SetPageUptodate(page); + memcpy_to_folio(folio, 0, link, len + 1); out: - unlock_page(page); + folio_end_read(folio, status == 0); brelse(bh); return status; } -- cgit v1.2.3 From 9d063ebc7f830a8203535e8e45516f4c05deb307 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Dec 2024 17:16:49 +0000 Subject: ocfs2: remove ocfs2_start_walk_page_trans() prototype This function no longer exists. Link: https://lkml.kernel.org/r/20241205171653.3179945-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Mark Tinguely Signed-off-by: Andrew Morton --- fs/ocfs2/aops.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 6f25066e5756..114efc9111e4 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -8,11 +8,6 @@ #include -handle_t *ocfs2_start_walk_page_trans(struct inode *inode, - struct page *page, - unsigned from, - unsigned to); - int ocfs2_map_folio_blocks(struct folio *folio, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new); -- cgit v1.2.3 From 2ebe9efc3f455876489ad6ed65b9bf4db292dd35 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:50 +0000 Subject: ocfs2: support large folios in ocfs2_zero_cluster_folios() Remove assumptions that a folio is one page in size. 
Link: https://lkml.kernel.org/r/20241205171653.3179945-23-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 453be2d2c124..5cf698785fae 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6851,7 +6851,6 @@ static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start, u64 phys, handle_t *handle) { int i; - unsigned int from, to = PAGE_SIZE; struct super_block *sb = inode->i_sb; BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); @@ -6859,21 +6858,18 @@ static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start, if (numfolios == 0) goto out; - to = PAGE_SIZE; for (i = 0; i < numfolios; i++) { struct folio *folio = folios[i]; + size_t to = folio_size(folio); + size_t from = offset_in_folio(folio, start); - from = start & (PAGE_SIZE - 1); - if ((end >> PAGE_SHIFT) == folio->index) - to = end & (PAGE_SIZE - 1); - - BUG_ON(from > PAGE_SIZE); - BUG_ON(to > PAGE_SIZE); + if (to > end - folio_pos(folio)) + to = end - folio_pos(folio); ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1, &phys); - start = (folio->index + 1) << PAGE_SHIFT; + start = folio_next_index(folio) << PAGE_SHIFT; } out: if (folios) -- cgit v1.2.3 From e7606f4a03d5c84a32a7a2ada4270279079ee1fc Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 5 Dec 2024 17:16:51 +0000 Subject: ocfs2: support large folios in ocfs2_write_zero_page() Remove assumptions that a folio is PAGE_SIZE. Link: https://lkml.kernel.org/r/20241205171653.3179945-24-willy@infradead.org Signed-off-by: Mark Tinguely Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 957ced628eb1..e54f2c4b5a90 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -782,11 +782,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, goto out_commit_trans; } - /* Get the offsets within the page that we want to zero */ - zero_from = abs_from & (PAGE_SIZE - 1); - zero_to = abs_to & (PAGE_SIZE - 1); + /* Get the offsets within the folio that we want to zero */ + zero_from = offset_in_folio(folio, abs_from); + zero_to = offset_in_folio(folio, abs_to); if (!zero_to) - zero_to = PAGE_SIZE; + zero_to = folio_size(folio); trace_ocfs2_write_zero_page( (unsigned long long)OCFS2_I(inode)->ip_blkno, -- cgit v1.2.3 From b8ed9da102beb2d0926a1d7a7e652392190151c0 Mon Sep 17 00:00:00 2001 From: Meetakshi Setiya Date: Fri, 10 Jan 2025 07:10:27 -0500 Subject: cifs: support reconnect with alternate password for SMB1 SMB1 shares the mount and remount code paths with SMB2/3 and already supports password rotation in some scenarios. This patch extends the password rotation support to SMB1 reconnects as well. 
Cc: stable@vger.kernel.org Signed-off-by: Meetakshi Setiya Signed-off-by: Steve French --- fs/smb/client/cifssmb.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 6cb1e81993f8..ab0b949924d7 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -152,8 +152,17 @@ again: spin_unlock(&ses->ses_lock); rc = cifs_negotiate_protocol(0, ses, server); - if (!rc) + if (!rc) { rc = cifs_setup_session(0, ses, server, ses->local_nls); + if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) { + /* + * Try alternate password for next reconnect if an alternate + * password is available. + */ + if (ses->password2) + swap(ses->password2, ses->password); + } + } /* do we need to reconnect tcon? */ if (rc || !tcon->need_reconnect) { -- cgit v1.2.3 From 5473aeedffa49e036d9d313148e3fa96af85c0a6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Nov 2024 13:20:37 -0500 Subject: btrfs: move select_delayed_ref() and export it This helper is how we select the delayed ref to run once we've selected the delayed ref head. I need this exported to add a unit test for delayed refs, and it's more natural home is in delayed-ref.c. Rename it to btrfs_select_delayed_ref and move it into delayed-ref.c. Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 26 ++++++++++++++++++++++++++ fs/btrfs/delayed-ref.h | 1 + fs/btrfs/extent-tree.c | 26 +------------------------- 3 files changed, 28 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 0d878dbbabba..7e4cdae2a820 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -555,6 +555,32 @@ void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, delayed_refs->num_heads_ready--; } +struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_node *ref; + + lockdep_assert_held(&head->mutex); + lockdep_assert_held(&head->lock); + + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) + return NULL; + + /* + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This is to prevent a ref count from going down to zero, which deletes + * the extent item from the extent tree, when there still are references + * to add, which would fail because they would not find the extent item. + */ + if (!list_empty(&head->ref_add_list)) + return list_first_entry(&head->ref_add_list, + struct btrfs_delayed_ref_node, add_list); + + ref = rb_entry(rb_first_cached(&head->ref_tree), + struct btrfs_delayed_ref_node, ref_node); + ASSERT(list_empty(&ref->add_list)); + return ref; +} + /* * Helper to insert the ref_node to the tail or merge with tail. 
* diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 611fb3388f82..a35067cebb97 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -402,6 +402,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs); void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); +struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3c6f7fecbb9a..2ce9e69ee8f8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1803,30 +1803,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static inline struct btrfs_delayed_ref_node * -select_delayed_ref(struct btrfs_delayed_ref_head *head) -{ - struct btrfs_delayed_ref_node *ref; - - if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) - return NULL; - - /* - * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. - * This is to prevent a ref count from going down to zero, which deletes - * the extent item from the extent tree, when there still are references - * to add, which would fail because they would not find the extent item. - */ - if (!list_empty(&head->ref_add_list)) - return list_first_entry(&head->ref_add_list, - struct btrfs_delayed_ref_node, add_list); - - ref = rb_entry(rb_first_cached(&head->ref_tree), - struct btrfs_delayed_ref_node, ref_node); - ASSERT(list_empty(&ref->add_list)); - return ref; -} - static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { @@ -1959,7 +1935,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, lockdep_assert_held(&locked_ref->mutex); lockdep_assert_held(&locked_ref->lock); - while ((ref = select_delayed_ref(locked_ref))) { + while ((ref = btrfs_select_delayed_ref(locked_ref))) { if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); -- cgit v1.2.3 From 2b34879d97e27b74e8e933463f5e9073d9d39821 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Nov 2024 10:03:34 -0500 Subject: btrfs: selftests: add delayed ref self test cases The recent fix for a stupid mistake I made uncovered the fact that we don't have adequate testing in the delayed refs code, as it took a pretty extensive and long running stress test to uncover something that a unit test would have uncovered right away. Fix this by adding a delayed refs self test suite. This will validate that the btrfs_ref transformation does the correct thing, that we do the correct thing when merging delayed refs, and that we get the delayed refs in the order that we expect. These are all crucial to how the delayed refs operate. I introduced various bugs (including the original bug) into the delayed refs code to validate that these tests caught all of the shenanigans that I could think of. 
Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/delayed-ref.c | 13 +- fs/btrfs/tests/btrfs-tests.c | 18 + fs/btrfs/tests/btrfs-tests.h | 6 + fs/btrfs/tests/delayed-refs-tests.c | 1015 +++++++++++++++++++++++++++++++++++ 5 files changed, 1050 insertions(+), 4 deletions(-) create mode 100644 fs/btrfs/tests/delayed-refs-tests.c (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 3cfc440c636c..2d5f0482678b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -44,4 +44,4 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ tests/free-space-tree-tests.o tests/extent-map-tests.o \ - tests/raid-stripe-tree-tests.o + tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 7e4cdae2a820..44cfe7fd85bd 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -93,6 +93,9 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) u64 num_bytes; u64 reserved_bytes; + if (btrfs_is_testing(fs_info)) + return; + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, trans->delayed_ref_csum_deletions); @@ -1260,6 +1263,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) { struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; + bool testing = btrfs_is_testing(fs_info); spin_lock(&delayed_refs->lock); while (true) { @@ -1289,7 +1293,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); - if (pin_bytes) { + if (!testing && pin_bytes) { struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, head->bytenr); @@ -1321,12 +1325,15 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) btrfs_error_unpin_extent_range(fs_info, head->bytenr, head->bytenr + head->num_bytes - 1); } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + if (!testing) + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } - btrfs_qgroup_destroy_extent_records(trans); + + if (!testing) + btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); } diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index e607b5d52fb1..5eff8d7d2360 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -30,6 +30,7 @@ const char *test_error[] = { [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context", + [TEST_ALLOC_TRANSACTION] = "cannot allocate transaction", }; static const struct super_operations btrfs_test_super_ops = { @@ -142,6 +143,11 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + + /* CRC32C csum size. 
*/ + fs_info->csum_size = 4; + fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / + fs_info->csum_size; set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); test_mnt->mnt_sb->s_fs_info = fs_info; @@ -247,6 +253,15 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache) kfree(cache); } +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) +{ + memset(trans, 0, sizeof(*trans)); + trans->fs_info = fs_info; + xa_init(&trans->delayed_refs.head_refs); + xa_init(&trans->delayed_refs.dirty_extents); + spin_lock_init(&trans->delayed_refs.lock); +} + void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -295,6 +310,9 @@ int btrfs_run_sanity_tests(void) ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize); if (ret) goto out; + ret = btrfs_test_delayed_refs(sectorsize, nodesize); + if (ret) + goto out; } } ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index b524ecf2f452..4307bdaa6749 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -6,6 +6,8 @@ #ifndef BTRFS_TESTS_H #define BTRFS_TESTS_H +#include + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_run_sanity_tests(void); @@ -25,12 +27,14 @@ enum { TEST_ALLOC_EXTENT_MAP, TEST_ALLOC_CHUNK_MAP, TEST_ALLOC_IO_CONTEXT, + TEST_ALLOC_TRANSACTION, }; extern const char *test_error[]; struct btrfs_root; struct btrfs_trans_handle; +struct btrfs_transaction; int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); @@ -40,6 +44,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); int btrfs_test_extent_map(void); +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); @@ -49,6 +54,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info); #else static inline int btrfs_run_sanity_tests(void) diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c new file mode 100644 index 000000000000..6558508c2ddf --- /dev/null +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -0,0 +1,1015 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "btrfs-tests.h" +#include "../transaction.h" +#include "../delayed-ref.h" +#include "../extent-tree.h" + +#define FAKE_ROOT_OBJECTID 256 +#define FAKE_BYTENR 0 +#define FAKE_LEVEL 1 +#define FAKE_INO 256 +#define FAKE_FILE_OFFSET 0 +#define FAKE_PARENT SZ_1M + +struct ref_head_check { + u64 bytenr; + u64 num_bytes; + int ref_mod; + int total_ref_mod; + int must_insert; +}; + +struct ref_node_check { + u64 bytenr; + u64 num_bytes; + int ref_mod; + enum btrfs_delayed_ref_action action; + u8 type; + u64 parent; + u64 root; + u64 owner; + u64 offset; +}; + +static enum btrfs_ref_type ref_type_from_disk_ref_type(u8 type) +{ + if ((type == BTRFS_TREE_BLOCK_REF_KEY) 
|| + (type == BTRFS_SHARED_BLOCK_REF_KEY)) + return BTRFS_REF_METADATA; + return BTRFS_REF_DATA; +} + +static void delete_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + + spin_lock(&delayed_refs->lock); + spin_lock(&head->lock); + btrfs_delete_ref_head(fs_info, delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + + btrfs_delayed_ref_unlock(head); + btrfs_put_delayed_ref_head(head); +} + +static void delete_delayed_ref_node(struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *node) +{ + rb_erase_cached(&node->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&node->ref_node); + if (!list_empty(&node->add_list)) + list_del_init(&node->add_list); + btrfs_put_delayed_ref(node); +} + +static int validate_ref_head(struct btrfs_delayed_ref_head *head, + struct ref_head_check *check) +{ + if (head->bytenr != check->bytenr) { + test_err("invalid bytenr have: %llu want: %llu", head->bytenr, + check->bytenr); + return -EINVAL; + } + + if (head->num_bytes != check->num_bytes) { + test_err("invalid num_bytes have: %llu want: %llu", + head->num_bytes, check->num_bytes); + return -EINVAL; + } + + if (head->ref_mod != check->ref_mod) { + test_err("invalid ref_mod have: %d want: %d", head->ref_mod, + check->ref_mod); + return -EINVAL; + } + + if (head->total_ref_mod != check->total_ref_mod) { + test_err("invalid total_ref_mod have: %d want: %d", + head->total_ref_mod, check->total_ref_mod); + return -EINVAL; + } + + if (head->must_insert_reserved != check->must_insert) { + test_err("invalid must_insert have: %d want: %d", + head->must_insert_reserved, check->must_insert); + return -EINVAL; + } + + return 0; +} + +static int validate_ref_node(struct btrfs_delayed_ref_node *node, + struct ref_node_check *check) +{ + if (node->bytenr != check->bytenr) { + test_err("invalid bytenr have: %llu want: %llu", node->bytenr, + check->bytenr); + return -EINVAL; + } + + if (node->num_bytes != check->num_bytes) { + test_err("invalid num_bytes have: %llu want: %llu", + node->num_bytes, check->num_bytes); + return -EINVAL; + } + + if (node->ref_mod != check->ref_mod) { + test_err("invalid ref_mod have: %d want: %d", node->ref_mod, + check->ref_mod); + return -EINVAL; + } + + if (node->action != check->action) { + test_err("invalid action have: %d want: %d", node->action, + check->action); + return -EINVAL; + } + + if (node->parent != check->parent) { + test_err("invalid parent have: %llu want: %llu", node->parent, + check->parent); + return -EINVAL; + } + + if (node->ref_root != check->root) { + test_err("invalid root have: %llu want: %llu", node->ref_root, + check->root); + return -EINVAL; + } + + if (node->type != check->type) { + test_err("invalid type have: %d want: %d", node->type, + check->type); + return -EINVAL; + } + + if (btrfs_delayed_ref_owner(node) != check->owner) { + test_err("invalid owner have: %llu want: %llu", + btrfs_delayed_ref_owner(node), check->owner); + return -EINVAL; + } + + if (btrfs_delayed_ref_offset(node) != check->offset) { + test_err("invalid offset have: %llu want: %llu", + btrfs_delayed_ref_offset(node), check->offset); + return -EINVAL; + } + + return 0; +} + +static int simple_test(struct btrfs_trans_handle *trans, + struct ref_head_check *head_check, + struct ref_node_check *node_check) +{ + struct btrfs_delayed_ref_root *delayed_refs = + 
&trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = ref_type_from_disk_ref_type(node_check->type), + .action = node_check->action, + .parent = node_check->parent, + .ref_root = node_check->root, + .bytenr = node_check->bytenr, + .num_bytes = fs_info->nodesize, + }; + int ret; + + if (ref.type == BTRFS_REF_METADATA) + btrfs_init_tree_ref(&ref, node_check->owner, node_check->root, + false); + else + btrfs_init_data_ref(&ref, node_check->owner, node_check->offset, + node_check->root, true); + + if (ref.type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + return -EINVAL; + } + + ret = -EINVAL; + if (validate_ref_head(head, head_check)) + goto out; + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, node_check)) + goto out; + ret = 0; +out: + btrfs_unselect_ref_head(delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +/* + * These are simple tests, make sure that our btrfs_ref's get turned into the + * appropriate btrfs_delayed_ref_node based on their settings and action. + */ +static int simple_tests(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .total_ref_mod = 1, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .action = BTRFS_ADD_DELAYED_REF, + .type = BTRFS_TREE_BLOCK_REF_KEY, + .parent = 0, + .root = FAKE_ROOT_OBJECTID, + .owner = FAKE_LEVEL, + .offset = 0, + }; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add tree block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add extent data failed"); + return -EINVAL; + } + + node_check.parent = FAKE_PARENT; + node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add shared block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_SHARED_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add shared data failed"); + return -EINVAL; + } + + head_check.ref_mod = -1; + head_check.total_ref_mod = -1; + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.type = BTRFS_TREE_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + node_check.parent = 0; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop tree block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = 
FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop extent data failed"); + return -EINVAL; + } + + node_check.parent = FAKE_PARENT; + node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop shared block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_SHARED_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop shared data failed"); + return -EINVAL; + } + + return 0; +} + +/* + * Merge tests, validate that we do delayed ref merging properly, the ref counts + * all end up properly, and delayed refs are deleted once they're no longer + * needed. + */ +static int merge_tests(struct btrfs_trans_handle *trans, + enum btrfs_ref_type type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = type, + .action = BTRFS_ADD_DELAYED_REF, + .parent = 0, + .ref_root = FAKE_ROOT_OBJECTID, + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + }; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 0, + .total_ref_mod = 0, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 2, + .action = BTRFS_ADD_DELAYED_REF, + .parent = 0, + .root = FAKE_ROOT_OBJECTID, + }; + int ret; + + /* + * First add a ref and then drop it, make sure we get a head ref with a + * 0 total ref mod and no nodes. + */ + if (type == BTRFS_REF_METADATA) { + node_check.type = BTRFS_TREE_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); + } else { + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + btrfs_init_data_ref(&ref, FAKE_INO, FAKE_FILE_OFFSET, + FAKE_ROOT_OBJECTID, true); + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + ref.action = BTRFS_DROP_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("single add and drop failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Add a ref, then add another ref, make sure we get a head ref with a + * 2 total ref mod and 1 node. 
+ */ + ref.action = BTRFS_ADD_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + head_check.ref_mod = 2; + head_check.total_ref_mod = 2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double add failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Add two drop refs, make sure they are merged properly. */ + ref.action = BTRFS_DROP_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + head_check.ref_mod = -2; + head_check.total_ref_mod = -2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double drop failed"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Add multiple refs, then drop until we go negative again. 
*/ + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 10; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 12; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = -2; + head_check.total_ref_mod = -2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double drop failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Drop multiple refs, then add until we go positive again. */ + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 10; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 12; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = 2; + head_check.total_ref_mod = 2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("add and drop to positive failed"); + goto out; + } + + node_check.action = BTRFS_ADD_DELAYED_REF; + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Add a bunch of refs with different roots and parents, then drop them + * all, make sure everything is properly merged. 
+ */ + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 50; i++) { + if (!(i % 2)) { + ref.parent = 0; + ref.ref_root = FAKE_ROOT_OBJECTID + i; + } else { + ref.parent = FAKE_PARENT + (i * fs_info->nodesize); + } + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 50; i++) { + if (!(i % 2)) { + ref.parent = 0; + ref.ref_root = FAKE_ROOT_OBJECTID + i; + } else { + ref.parent = FAKE_PARENT + (i * fs_info->nodesize); + } + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = 0; + head_check.total_ref_mod = 0; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("add and drop multiple failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + ret = 0; +out: + if (!IS_ERR_OR_NULL(head)) + btrfs_unselect_ref_head(&trans->transaction->delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +/* + * Basic test to validate we always get the add operations first followed by any + * delete operations. + */ +static int select_delayed_refs_test(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = BTRFS_REF_METADATA, + .action = BTRFS_DROP_DELAYED_REF, + .parent = 0, + .ref_root = FAKE_ROOT_OBJECTID, + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + }; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 0, + .total_ref_mod = 0, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .action = BTRFS_ADD_DELAYED_REF, + .type = BTRFS_TREE_BLOCK_REF_KEY, + .parent = 0, + .owner = FAKE_LEVEL, + .offset = 0, + }; + int ret; + + /* Add the drop first. */ + btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + /* + * Now add the add, and make it a different root so it's logically later + * in the rb tree. 
+ */ + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 1; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + head = NULL; + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("head check failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.root = FAKE_ROOT_OBJECTID + 1; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Now we're going to do the same thing, but we're going to have an add + * that gets deleted because of a merge, and make sure we still have + * another add in place. + */ + ref.action = BTRFS_DROP_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 1; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_DROP_DELAYED_REF; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 2; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + head = NULL; + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("head check failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_ADD_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID + 2; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + ret 
= 0; +out: + if (head) + btrfs_unselect_ref_head(delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) +{ + struct btrfs_transaction *transaction; + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info; + int ret; + + test_msg("running delayed refs tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + transaction = kmalloc(sizeof(*transaction), GFP_KERNEL); + if (!transaction) { + test_std_err(TEST_ALLOC_TRANSACTION); + ret = -ENOMEM; + goto out_free_fs_info; + } + btrfs_init_dummy_trans(&trans, fs_info); + btrfs_init_dummy_transaction(transaction, fs_info); + trans.transaction = transaction; + + ret = simple_tests(&trans); + if (!ret) { + test_msg("running delayed refs merg tests on metadata refs"); + ret = merge_tests(&trans, BTRFS_REF_METADATA); + } + + if (!ret) { + test_msg("running delayed refs merg tests on data refs"); + ret = merge_tests(&trans, BTRFS_REF_DATA); + } + + if (!ret) + ret = select_delayed_refs_test(&trans); + +out_free_fs_info: + btrfs_free_dummy_fs_info(fs_info); + return ret; +} -- cgit v1.2.3 From 27602f1d1b6edfad7694b9979b86860e0bc00b36 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 25 Nov 2024 09:13:24 +1030 Subject: btrfs: use PTR_ERR() instead of PTR_ERR_OR_ZERO() for btrfs_get_extent() The function btrfs_get_extent() will only return an PTR_ERR() or a valid extent map pointer. It will not return NULL. Thus the usage of PTR_ERR_OR_ZERO() inside submit_one_sector() is not needed, use plain PTR_ERR() instead, and that is the only usage of PTR_ERR_OR_ZERO() after btrfs_get_extent(). Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b923d0cec61c..9725ff7f274d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1335,7 +1335,7 @@ static int submit_one_sector(struct btrfs_inode *inode, em = btrfs_get_extent(inode, NULL, filepos, sectorsize); if (IS_ERR(em)) - return PTR_ERR_OR_ZERO(em); + return PTR_ERR(em); extent_offset = filepos - em->start; em_end = extent_map_end(em); -- cgit v1.2.3 From d0ad40d730ef30e51d3ec4e3b8c9b6691053354a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 13 Nov 2024 13:00:12 +0000 Subject: btrfs: send: remove redundant assignments to variable ret The variable ret is being initialized to zero and also later re-assigned to zero. In both cases the assignment is redundant since the value is never read after the assignment and hence they can be removed. Signed-off-by: Colin Ian King Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 498c84323253..f437138fefbc 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7259,7 +7259,7 @@ static int changed_cb(struct btrfs_path *left_path, enum btrfs_compare_tree_result result, struct send_ctx *sctx) { - int ret = 0; + int ret; /* * We can not hold the commit root semaphore here. 
This is because in @@ -7319,7 +7319,6 @@ static int changed_cb(struct btrfs_path *left_path, return 0; } result = BTRFS_COMPARE_TREE_CHANGED; - ret = 0; } sctx->left_path = left_path; -- cgit v1.2.3 From bfcf6d04f8ee817b2cb7c238e0a14392a3245f45 Mon Sep 17 00:00:00 2001 From: Allison Karlitskaya Date: Tue, 26 Nov 2024 16:23:31 +0100 Subject: btrfs: handle FS_IOC_READ_VERITY_METADATA ioctl Commit 146054090b08 ("btrfs: initial fsverity support") introduced fs-verity support for btrfs, but didn't add support for FS_IOC_READ_VERITY_METADATA to directly query the Merkle tree, descriptor and signature blocks for fs-verity enabled files. Add the (trivial) implementation: we just need to wire it through to the fs-verity code, the same way as is done in the other two filesystems which support this ioctl (ext4, f2fs). The fs-verity code already has access to the required data. This is also safe to backport to older stable trees (5.15+) if needed. Signed-off-by: Allison Karlitskaya Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4d9305fa37a8..baecb19404ad 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5308,6 +5308,8 @@ long btrfs_ioctl(struct file *file, unsigned int return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); + case FS_IOC_READ_VERITY_METADATA: + return fsverity_ioctl_read_metadata(file, argp); case BTRFS_IOC_ENCODED_READ: return btrfs_ioctl_encoded_read(file, argp, false); case BTRFS_IOC_ENCODED_WRITE: -- cgit v1.2.3 From 3704db1013232465ee3db742dc31853c44daf68c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 14 Nov 2024 17:04:27 +0900 Subject: btrfs: factor out btrfs_return_free_space() Factor out a part of unpin_extent_range() that returns space back to the space info, prioritizing the global block reserve. Also, move the "len" variable into the loop to clarify we don't need to carry it beyond an iteration.
Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 25 ++++--------------------- fs/btrfs/space-info.c | 29 +++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 1 + 3 files changed, 34 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2ce9e69ee8f8..64a199d93d64 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2700,15 +2700,15 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *cache = NULL; struct btrfs_space_info *space_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_free_cluster *cluster = NULL; - u64 len; u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; int ret = 0; while (start <= end) { + u64 len; + readonly = false; if (!cache || start >= cache->start + cache->length) { @@ -2766,25 +2766,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, readonly = true; } spin_unlock(&cache->lock); - if (!readonly && return_free_space && - global_rsv->space_info == space_info) { - spin_lock(&global_rsv->lock); - if (!global_rsv->full) { - u64 to_add = min(len, global_rsv->size - - global_rsv->reserved); - - global_rsv->reserved += to_add; - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, to_add); - if (global_rsv->reserved >= global_rsv->size) - global_rsv->full = 1; - len -= to_add; - } - spin_unlock(&global_rsv->lock); - } - /* Add to any tickets we may have */ - if (!readonly && return_free_space && len) - btrfs_try_granting_tickets(fs_info, space_info); + if (!readonly && return_free_space) + btrfs_return_free_space(space_info, len); spin_unlock(&space_info->lock); } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 255e85f78313..8b6340c59bf2 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -2082,3 +2082,32 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) do_reclaim_sweep(space_info, raid); } } + +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + + lockdep_assert_held(&space_info->lock); + + /* Prioritize the global reservation to receive the freed space. */ + if (global_rsv->space_info != space_info) + goto grant; + + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + u64 to_add = min(len, global_rsv->size - global_rsv->reserved); + + global_rsv->reserved += to_add; + btrfs_space_info_update_bytes_may_use(fs_info, space_info, to_add); + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + len -= to_add; + } + spin_unlock(&global_rsv->lock); + +grant: + /* Add to any tickets we may have. 
*/ + if (len) + btrfs_try_granting_tickets(fs_info, space_info); +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index efbecc0c5258..4c9e8aabee51 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -295,5 +295,6 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len); #endif /* BTRFS_SPACE_INFO_H */ -- cgit v1.2.3 From 7de9ca1f30b7002a237e3119a8a4a54f26988b92 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 14 Nov 2024 17:04:28 +0900 Subject: btrfs: drop fs_info argument from btrfs_update_space_info_*() Since commit e1e577aafe41 ("btrfs: store fs_info in space_info"), we have the fs_info in a space_info. So, we can drop fs_info argument from btrfs_update_space_info_*. There is no behavior change. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 16 ++++++---------- fs/btrfs/block-rsv.c | 10 +++------- fs/btrfs/delalloc-space.c | 2 +- fs/btrfs/delayed-ref.c | 5 ++--- fs/btrfs/extent-tree.c | 10 +++------- fs/btrfs/inode.c | 2 +- fs/btrfs/space-info.c | 14 +++++--------- fs/btrfs/space-info.h | 9 ++++----- fs/btrfs/transaction.c | 3 +-- 9 files changed, 26 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 4427c1b835e8..5be029734cfa 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1223,7 +1223,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, block_group->space_info->total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); - btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info, + btrfs_space_info_update_bytes_zone_unusable(block_group->space_info, -block_group->zone_unusable); block_group->space_info->disk_total -= block_group->length * factor; @@ -1396,8 +1396,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes to readonly */ sinfo->bytes_readonly += cache->zone_unusable; - btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, - -cache->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable); cache->zone_unusable = 0; } cache->ro++; @@ -1645,8 +1644,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - btrfs_space_info_update_bytes_pinned(fs_info, space_info, - -block_group->pinned); + btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; block_group->pinned = 0; @@ -3060,8 +3058,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) (cache->alloc_offset - cache->used - cache->pinned - cache->reserved) + (cache->length - cache->zone_capacity); - btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, - cache->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable); sinfo->bytes_readonly -= cache->zone_unusable; } num_bytes = cache->length - cache->reserved - @@ -3699,7 +3696,7 @@ int btrfs_update_block_group(struct 
btrfs_trans_handle *trans, old_val -= num_bytes; cache->used = old_val; cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes); + btrfs_space_info_update_bytes_pinned(space_info, num_bytes); space_info->bytes_used -= num_bytes; space_info->disk_used -= num_bytes * factor; if (READ_ONCE(space_info->periodic_reclaim)) @@ -3781,8 +3778,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, space_info->bytes_reserved += num_bytes; trace_btrfs_space_reservation(cache->fs_info, "space_info", space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); + btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index a07b9594dc70..3f3608299c0b 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -150,9 +150,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_unlock(&dest->lock); } if (num_bytes) - btrfs_space_info_free_bytes_may_use(fs_info, - space_info, - num_bytes); + btrfs_space_info_free_bytes_may_use(space_info, num_bytes); } if (qgroup_to_release_ret) *qgroup_to_release_ret = qgroup_to_release; @@ -383,13 +381,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) if (block_rsv->reserved < block_rsv->size) { num_bytes = block_rsv->size - block_rsv->reserved; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - num_bytes); + btrfs_space_info_update_bytes_may_use(sinfo, num_bytes); block_rsv->reserved = block_rsv->size; } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - -num_bytes); + btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes); block_rsv->reserved = block_rsv->size; btrfs_try_granting_tickets(fs_info, sinfo); } diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 7aa8a395d838..88e900e5a43d 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -176,7 +176,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; - btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); + btrfs_space_info_free_bytes_may_use(data_sinfo, len); } /* diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 44cfe7fd85bd..30f7079fa28e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -257,7 +257,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); if (to_free > 0) - btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free); + btrfs_space_info_free_bytes_may_use(space_info, to_free); if (refilled_bytes > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, @@ -1311,8 +1311,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) spin_lock(&bg->space_info->lock); spin_lock(&bg->lock); bg->pinned += head->num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, - bg->space_info, + btrfs_space_info_update_bytes_pinned(bg->space_info, head->num_bytes); bg->reserved -= head->num_bytes; bg->space_info->bytes_reserved -= head->num_bytes; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 64a199d93d64..2f9126528a01 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2547,13 +2547,10 @@ static int pin_down_extent(struct btrfs_trans_handle 
*trans, struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_fs_info *fs_info = cache->fs_info; - spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, - num_bytes); + btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -2754,15 +2751,14 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); + btrfs_space_info_update_bytes_pinned(space_info, -len); space_info->max_extent_size = 0; if (cache->ro) { space_info->bytes_readonly += len; readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ - btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info, - len); + btrfs_space_info_update_bytes_zone_unusable(space_info, len); readonly = true; } spin_unlock(&cache->lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 27b2fe7f735d..283199d11642 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1809,7 +1809,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, bytes = range_bytes; spin_lock(&sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + btrfs_space_info_update_bytes_may_use(sinfo, bytes); spin_unlock(&sinfo->lock); if (count > 0) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 8b6340c59bf2..3ae97c974d95 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -316,7 +316,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, found->bytes_used += block_group->used; found->disk_used += block_group->used * factor; found->bytes_readonly += block_group->bytes_super; - btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(found, block_group->zone_unusable); if (block_group->length > 0) found->full = 0; btrfs_try_granting_tickets(info, found); @@ -489,9 +489,7 @@ again: if ((used + ticket->bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, - ticket->bytes); + btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); remove_ticket(space_info, ticket); ticket->bytes = 0; space_info->tickets_id++; @@ -1690,8 +1688,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } @@ -1703,8 +1700,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { used = btrfs_space_info_used(space_info, false); if (used + orig_bytes <= space_info->total_bytes) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } } @@ -2099,7 +2095,7 @@ void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) u64 to_add = min(len, global_rsv->size - global_rsv->reserved); global_rsv->reserved += to_add; - 
btrfs_space_info_update_bytes_may_use(fs_info, space_info, to_add); + btrfs_space_info_update_bytes_may_use(space_info, to_add); if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; len -= to_add; diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 4c9e8aabee51..69071afc0d47 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -229,10 +229,10 @@ static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_i */ #define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \ static inline void \ -btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ - struct btrfs_space_info *sinfo, \ +btrfs_space_info_update_##name(struct btrfs_space_info *sinfo, \ s64 bytes) \ { \ + struct btrfs_fs_info *fs_info = sinfo->fs_info; \ const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \ lockdep_assert_held(&sinfo->lock); \ trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ @@ -275,13 +275,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( - struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes) { spin_lock(&space_info->lock); - btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_space_info_update_bytes_may_use(space_info, -num_bytes); + btrfs_try_granting_tickets(space_info->fs_info, space_info); spin_unlock(&space_info->lock); } int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dc0b837efd5d..15312013f2a3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -795,8 +795,7 @@ alloc_fail: if (num_bytes) btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL); if (delayed_refs_bytes) - btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info, - delayed_refs_bytes); + btrfs_space_info_free_bytes_may_use(trans_rsv->space_info, delayed_refs_bytes); reserve_fail: btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); -- cgit v1.2.3 From 453a73c3069a268c3c4dd00695fc2a95f7880438 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 14 Nov 2024 17:04:29 +0900 Subject: btrfs: zoned: reclaim unused zone by zone resetting On the zoned mode, once used and freed region is still not reusable after the freeing. The underlying zone needs to be reset before reusing. Btrfs resets a zone when it removes a block group, and then new block group is allocated on the zones to reuse the zones. But, it is sometime too late to catch up with a write side. This commit introduces a new space-info reclaim method ZONE_RESET. That will pick a block group from the unused list and reset its zone to reuse the zone_unusable space. It is faster than removing the block group and re-creating a new block group on the same zones. For the first implementation, the ZONE_RESET is only applied to a block group whose region is fully zone_unusable. Reclaiming partial zone_unusable block group could be implemented later. 
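Conceptually, the new flush step boils down to the following loop (rough sketch only; the real implementation is btrfs_reset_unused_block_groups() in the diff below, which presumably hands the reclaimed bytes back via the btrfs_return_free_space() helper factored out earlier in this series):

    /*
     * RESET_ZONES reclaim, sketched:
     *
     *   while (num_bytes > 0) {
     *           bg = a block group on fs_info->unused_bgs whose type matches
     *                space_info and whose space is entirely zone_unusable;
     *           if (!bg)
     *                   break;
     *           reset the device zones backing bg;
     *           convert bg's zone_unusable bytes back to free space and
     *           return them to space_info;
     *           num_bytes -= reclaimed;
     *   }
     */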
Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 28 +++++++++- fs/btrfs/space-info.h | 5 ++ fs/btrfs/zoned.c | 124 +++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 7 +++ include/trace/events/btrfs.h | 3 +- 5 files changed, 164 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 3ae97c974d95..a341d087567a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -14,6 +14,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "zoned.h" /* * HOW DOES SPACE RESERVATION WORK @@ -127,6 +128,14 @@ * churn a lot and we can avoid making some extent tree modifications if we * are able to delay for as long as possible. * + * RESET_ZONES + * This state works only for the zoned mode. On the zoned mode, we cannot + * reuse once allocated then freed region until we reset the zone, due to + * the sequential write zone requirement. The RESET_ZONES state resets the + * zones of an unused block group and let us reuse the space. The reusing + * is faster than removing the block group and allocating another block + * group on the zones. + * * ALLOC_CHUNK * We will skip this the first time through space reservation, because of * overcommit and we don't want to have a lot of useless metadata space when @@ -832,6 +841,9 @@ static void flush_space(struct btrfs_fs_info *fs_info, */ ret = btrfs_commit_current_transaction(root); break; + case RESET_ZONES: + ret = btrfs_reset_unused_block_groups(space_info, num_bytes); + break; default: ret = -ENOSPC; break; @@ -1084,9 +1096,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; + enum btrfs_flush_state final_state; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + if (btrfs_is_zoned(fs_info)) + final_state = RESET_ZONES; + else + final_state = COMMIT_TRANS; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); @@ -1139,7 +1156,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) flush_state++; - if (flush_state > COMMIT_TRANS) { + if (flush_state > final_state) { commit_cycles++; if (commit_cycles > 2) { if (maybe_fail_all_tickets(fs_info, space_info)) { @@ -1153,7 +1170,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } } spin_unlock(&space_info->lock); - } while (flush_state <= COMMIT_TRANS); + } while (flush_state <= final_state); } /* @@ -1284,6 +1301,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * This is where we reclaim all of the pinned space generated by running the * iputs * + * RESET_ZONES + * This state works only for the zoned mode. We scan the unused block group + * list and reset the zones and reuse the block group. 
+ * * ALLOC_CHUNK_FORCE * For data we start with alloc chunk force, however we could have been full * before, and then the transaction commit could have freed new block groups, @@ -1293,6 +1314,7 @@ static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_FULL, RUN_DELAYED_IPUTS, COMMIT_TRANS, + RESET_ZONES, ALLOC_CHUNK_FORCE, }; @@ -1384,6 +1406,7 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) static const enum btrfs_flush_state priority_flush_states[] = { FLUSH_DELAYED_ITEMS_NR, FLUSH_DELAYED_ITEMS, + RESET_ZONES, ALLOC_CHUNK, }; @@ -1397,6 +1420,7 @@ static const enum btrfs_flush_state evict_flush_states[] = { FLUSH_DELALLOC_FULL, ALLOC_CHUNK, COMMIT_TRANS, + RESET_ZONES, }; static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 69071afc0d47..a96efdb5e681 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -79,6 +79,10 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_EMERGENCY, }; +/* + * Please be aware that the order of enum values will be the order of the reclaim + * process in btrfs_async_reclaim_metadata_space(). + */ enum btrfs_flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS = 2, @@ -91,6 +95,7 @@ enum btrfs_flush_state { ALLOC_CHUNK_FORCE = 9, RUN_DELAYED_IPUTS = 10, COMMIT_TRANS = 11, + RESET_ZONES = 12, }; struct btrfs_space_info { diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index df905ae82929..73e0aa9fc08a 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2652,3 +2652,127 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->zone_active_bgs_lock); } + +/* + * Reset the zones of unused block groups from @space_info->bytes_zone_unusable. + * + * @space_info: the space to work on + * @num_bytes: targeting reclaim bytes + * + * This one resets the zones of a block group, so we can reuse the region + * without removing the block group. On the other hand, btrfs_delete_unused_bgs() + * just removes a block group and frees up the underlying zones. So, we still + * need to allocate a new block group to reuse the zones. + * + * Resetting is faster than deleting/recreating a block group. It is similar + * to freeing the logical space on the regular mode. However, we cannot change + * the block group's profile with this operation. + */ +int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + while (num_bytes > 0) { + struct btrfs_chunk_map *map; + struct btrfs_block_group *bg = NULL; + bool found = false; + u64 reclaimed = 0; + + /* + * Here, we choose a fully zone_unusable block group. It's + * technically possible to reset a partly zone_unusable block + * group, which still has some free space left. However, + * handling that needs to cope with the allocation side, which + * makes the logic more complex. So, let's handle the easy case + * for now. + */ + spin_lock(&fs_info->unused_bgs_lock); + list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) { + if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags) + continue; + + /* + * Use trylock to avoid locking order violation. In + * btrfs_reclaim_bgs_work(), the lock order is + * &bg->lock -> &fs_info->unused_bgs_lock. We skip a + * block group if we cannot take its lock. 
+ */ + if (!spin_trylock(&bg->lock)) + continue; + if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) { + spin_unlock(&bg->lock); + continue; + } + spin_unlock(&bg->lock); + found = true; + break; + } + if (!found) { + spin_unlock(&fs_info->unused_bgs_lock); + return 0; + } + + list_del_init(&bg->bg_list); + btrfs_put_block_group(bg); + spin_unlock(&fs_info->unused_bgs_lock); + + /* + * Since the block group is fully zone_unusable and we cannot + * allocate from this block group anymore, we don't need to set + * this block group read-only. + */ + + down_read(&fs_info->dev_replace.rwsem); + map = bg->physical_map; + for (int i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + unsigned int nofs_flags; + int ret; + + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET, + stripe->physical >> SECTOR_SHIFT, + zone_size_sectors); + memalloc_nofs_restore(nofs_flags); + + if (ret) { + up_read(&fs_info->dev_replace.rwsem); + return ret; + } + } + up_read(&fs_info->dev_replace.rwsem); + + spin_lock(&space_info->lock); + spin_lock(&bg->lock); + ASSERT(!btrfs_is_block_group_used(bg)); + if (bg->ro) { + spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); + continue; + } + + reclaimed = bg->alloc_offset; + bg->zone_unusable = bg->length - bg->zone_capacity; + bg->alloc_offset = 0; + /* + * This holds because we currently reset fully used then freed + * block group. + */ + ASSERT(reclaimed == bg->zone_capacity); + bg->free_space_ctl->free_space += reclaimed; + space_info->bytes_zone_unusable -= reclaimed; + spin_unlock(&bg->lock); + btrfs_return_free_space(space_info, reclaimed); + spin_unlock(&space_info->lock); + + if (num_bytes <= reclaimed) + break; + num_bytes -= reclaimed; + } + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 7612e6572605..9672bf4c3335 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -96,6 +96,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, bool do_finish); void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); +int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) @@ -265,6 +266,12 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { } +static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, + u64 num_bytes) +{ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 4df93ca9b7a8..549ab3b41961 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -100,7 +100,8 @@ struct find_free_extent_ctl; EM( ALLOC_CHUNK, "ALLOC_CHUNK") \ EM( ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE") \ EM( RUN_DELAYED_IPUTS, "RUN_DELAYED_IPUTS") \ - EMe(COMMIT_TRANS, "COMMIT_TRANS") + EM( COMMIT_TRANS, "COMMIT_TRANS") \ + EMe(RESET_ZONES, "RESET_ZONES") /* * First define the enums in the above macros to be exported to userspace via -- cgit v1.2.3 From f6f0da564c668a9565c81c61c06b32e8ed981ec3 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Sun, 1 Dec 2024 21:48:53 -0800 Subject: btrfs: don't 
BUG_ON() in btrfs_drop_extents() btrfs_drop_extents() calls BUG_ON() in case the counter of to be deleted extents is greater than 0. But all of these code paths can handle errors, so there's no need to crash the kernel. Instead WARN() that the condition has been met and gracefully bail out. Reviewed-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/file.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 14e27473c5bc..d314a7e03a38 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -36,6 +36,7 @@ #include "ioctl.h" #include "file.h" #include "super.h" +#include "print-tree.h" /* * Helper to fault in page and copy. This should go away and be replaced with @@ -245,7 +246,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } ret = btrfs_next_leaf(root, path); if (ret < 0) break; @@ -321,7 +326,11 @@ next_slot: * | -------- extent -------- | */ if (args->start > key.offset && args->end < extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ -409,7 +418,11 @@ next_slot: * | -------- extent -------- | */ if (args->start > key.offset && args->end >= extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ -437,7 +450,11 @@ delete_extent_item: del_slot = path->slots[0]; del_nr = 1; } else { - BUG_ON(del_slot + del_nr != path->slots[0]); + if (WARN_ON(del_slot + del_nr != path->slots[0])) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } del_nr++; } -- cgit v1.2.3 From 5324c4e10e9c2ce307a037e904c0d9671d7137d9 Mon Sep 17 00:00:00 2001 From: Hao-ran Zheng Date: Tue, 3 Dec 2024 15:56:51 +0800 Subject: btrfs: fix data race when accessing the inode's disk_i_size at btrfs_drop_extents() A data race occurs when the function `insert_ordered_extent_file_extent()` and the function `btrfs_inode_safe_disk_i_size_write()` are executed concurrently. The function `insert_ordered_extent_file_extent()` is not locked when reading inode->disk_i_size, causing `btrfs_inode_safe_disk_i_size_write()` to cause data competition when writing inode->disk_i_size, thus affecting the value of `modify_tree`. 
The specific call stack that appears during testing is as follows: ============DATA_RACE============ btrfs_drop_extents+0x89a/0xa060 [btrfs] insert_reserved_file_extent+0xb54/0x2960 [btrfs] insert_ordered_extent_file_extent+0xff5/0x1760 [btrfs] btrfs_finish_one_ordered+0x1b85/0x36a0 [btrfs] btrfs_finish_ordered_io+0x37/0x60 [btrfs] finish_ordered_fn+0x3e/0x50 [btrfs] btrfs_work_helper+0x9c9/0x27a0 [btrfs] process_scheduled_works+0x716/0xf10 worker_thread+0xb6a/0x1190 kthread+0x292/0x330 ret_from_fork+0x4d/0x80 ret_from_fork_asm+0x1a/0x30 ============OTHER_INFO============ btrfs_inode_safe_disk_i_size_write+0x4ec/0x600 [btrfs] btrfs_finish_one_ordered+0x24c7/0x36a0 [btrfs] btrfs_finish_ordered_io+0x37/0x60 [btrfs] finish_ordered_fn+0x3e/0x50 [btrfs] btrfs_work_helper+0x9c9/0x27a0 [btrfs] process_scheduled_works+0x716/0xf10 worker_thread+0xb6a/0x1190 kthread+0x292/0x330 ret_from_fork+0x4d/0x80 ret_from_fork_asm+0x1a/0x30 ================================= The main purpose of the check of the inode's disk_i_size is to avoid taking write locks on a btree path when we have a write at or beyond EOF, since in these cases we don't expect to find extent items in the root to drop. However if we end up taking write locks due to a data race on disk_i_size, everything is still correct, we only add extra lock contention on the tree in case there's concurrency from other tasks. If the race causes us to not take write locks when we actually need them, then everything is functionally correct as well, since if we find out we have extent items to drop and we took read locks (modify_tree set to 0), we release the path and retry again with write locks. Since this data race does not affect the correctness of the function, it is a harmless data race, use data_race() to check inode->disk_i_size. Reviewed-by: Filipe Manana Signed-off-by: Hao-ran Zheng Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d314a7e03a38..c61f210259d8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -225,7 +225,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->drop_cache) btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); - if (args->start >= inode->disk_i_size && !args->replace_extent) + if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent) modify_tree = 0; update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); -- cgit v1.2.3 From 6a4730b325aaa48f7a5d5ba97aff0a955e2d9cec Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Oct 2024 11:43:03 -0400 Subject: btrfs: convert BUG_ON in btrfs_reloc_cow_block() to proper error handling This BUG_ON is meant to catch backref cache problems, but these can arise from either bugs in the backref cache or corruption in the extent tree. Fix it to be a proper error. 
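As an illustration of the pattern applied below (not part of the patch; the helper name, arguments and message are hypothetical), the conversion replaces an assertion with a logged, recoverable error:

/* Minimal sketch, assuming the usual btrfs message helpers are available. */
static int example_validate_backref(struct btrfs_fs_info *fs_info,
				    u64 found_bytenr, u64 expected_bytenr)
{
	/* Previously: BUG_ON(found_bytenr != expected_bytenr); */
	if (unlikely(found_bytenr != expected_bytenr)) {
		btrfs_err(fs_info,
			  "bytenr %llu found but %llu was expected",
			  found_bytenr, expected_bytenr);
		return -EUCLEAN;	/* report corruption instead of crashing */
	}
	return 0;
}
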
Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index db8b42f674b7..ab2de2d1b2be 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4405,8 +4405,18 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, WARN_ON(!first_cow && level == 0); node = rc->backref_cache.path[level]; - BUG_ON(node->bytenr != buf->start && - node->new_bytenr != buf->start); + + /* + * If node->bytenr != buf->start and node->new_bytenr != + * buf->start then we've got the wrong backref node for what we + * expected to see here and the cache is incorrect. + */ + if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) { + btrfs_err(fs_info, +"bytenr %llu was found but our backref cache was expecting %llu or %llu", + buf->start, node->bytenr, node->new_bytenr); + return -EUCLEAN; + } btrfs_backref_drop_node_buffer(node); atomic_inc(&cow->refs); -- cgit v1.2.3 From b1d4d5d1d8cf42a97e2e2bb7e7c2a965cef78dc4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Oct 2024 11:43:04 -0400 Subject: btrfs: remove the changed list for backref cache Now that we're not updating the backref cache when we switch transids we can remove the changed list. We're going to keep the new_bytenr field because it serves as a good sanity check for the backref cache and relocation, and can prevent us from making extent tree corruption worse. Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 2 -- fs/btrfs/backref.h | 2 -- fs/btrfs/relocation.c | 22 ++++++++-------------- 3 files changed, 8 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 04f53ca548e1..f686f01cdd9b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -3022,7 +3022,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, cache->rb_root = RB_ROOT; for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); - INIT_LIST_HEAD(&cache->changed); INIT_LIST_HEAD(&cache->detached); INIT_LIST_HEAD(&cache->leaves); INIT_LIST_HEAD(&cache->pending_edge); @@ -3190,7 +3189,6 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) } ASSERT(list_empty(&cache->pending_edge)); ASSERT(list_empty(&cache->useless_node)); - ASSERT(list_empty(&cache->changed)); ASSERT(list_empty(&cache->detached)); ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); ASSERT(!cache->nr_nodes); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e8c22cccb5c1..a810253d7b8a 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -393,8 +393,6 @@ struct btrfs_backref_cache { struct list_head pending[BTRFS_MAX_LEVEL]; /* List of backref nodes with no child node */ struct list_head leaves; - /* List of blocks that have been COWed in current transaction */ - struct list_head changed; /* List of detached backref node. */ struct list_head detached; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ab2de2d1b2be..5af1907e230b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2113,14 +2113,13 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (next->new_bytenr != root->node->start) { /* * We just created the reloc root, so we shouldn't have - * ->new_bytenr set and this shouldn't be in the changed - * list. 
If it is then we have multiple roots pointing - * at the same bytenr which indicates corruption, or - * we've made a mistake in the backref walking code. + * ->new_bytenr set yet. If it is then we have multiple + * roots pointing at the same bytenr which indicates + * corruption, or we've made a mistake in the backref + * walking code. */ ASSERT(next->new_bytenr == 0); - ASSERT(list_empty(&next->list)); - if (next->new_bytenr || !list_empty(&next->list)) { + if (next->new_bytenr) { btrfs_err(trans->fs_info, "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", node->bytenr, next->bytenr); @@ -2131,8 +2130,6 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, btrfs_put_root(next->root); next->root = btrfs_grab_root(root); ASSERT(next->root); - list_add_tail(&next->list, - &rc->backref_cache.changed); mark_block_processed(rc, next); break; } @@ -2442,7 +2439,7 @@ next: if (!ret && node->pending) { btrfs_backref_drop_node_buffer(node); - list_move_tail(&node->list, &rc->backref_cache.changed); + list_del_init(&node->list); node->pending = 0; } @@ -2605,8 +2602,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, /* * This block was the root block of a root, and this is * the first time we're processing the block and thus it - * should not have had the ->new_bytenr modified and - * should have not been included on the changed list. + * should not have had the ->new_bytenr modified. * * However in the case of corruption we could have * multiple refs pointing to the same block improperly, @@ -2616,8 +2612,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, * normal user in the case of corruption. */ ASSERT(node->new_bytenr == 0); - ASSERT(list_empty(&node->list)); - if (node->new_bytenr || !list_empty(&node->list)) { + if (node->new_bytenr) { btrfs_err(root->fs_info, "bytenr %llu has improper references to it", node->bytenr); @@ -2640,7 +2635,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, btrfs_put_root(node->root); node->root = btrfs_grab_root(root); ASSERT(node->root); - list_add_tail(&node->list, &rc->backref_cache.changed); } else { path->lowest_level = node->level; if (root == root->fs_info->chunk_root) -- cgit v1.2.3 From cb7de8ee9c50d86c7ea0f48c6bf50bab84613f22 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Oct 2024 11:43:05 -0400 Subject: btrfs: add a comment for new_bytenr in backref_cache_node Add a comment for this field so we know what it is used for. Previously we used it to update the backref cache, so people may mistakenly think it is useless, but in fact exists to make sure the backref cache makes sense. Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index a810253d7b8a..7220bde1fc31 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -318,6 +318,12 @@ struct btrfs_backref_node { u64 bytenr; }; /* Use rb_simple_node for search/insert */ + /* + * This is a sanity check, whenever we COW a block we will update + * new_bytenr with it's current location, and we will check this in + * various places to validate that the cache makes sense, it shouldn't + * be used for anything else. 
+ */ u64 new_bytenr; /* Objectid of tree block owner, can be not uptodate */ u64 owner; -- cgit v1.2.3 From 551d04a32a9e0b498c44e545ec27ed4553c59305 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Oct 2024 11:43:06 -0400 Subject: btrfs: simplify loop in select_reloc_root() We have this setup as a loop, but in reality we will never walk back up the backref tree, if we do then it's a bug. Get rid of the loop and handle the case where we have node->new_bytenr set at all. Previous check was only if node->new_bytenr != root->node->start, but if it did then we would hit the WARN_ON() and walk back up the tree. Instead we want to just return error if ->new_bytenr is set, and then do the normal updating of the node for the reloc root and carry on. Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 135 ++++++++++++++++++++------------------------------ 1 file changed, 55 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 5af1907e230b..4cad2540e3ae 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2058,97 +2058,72 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, int index = 0; int ret; - next = node; - while (1) { - cond_resched(); - next = walk_up_backref(next, edges, &index); - root = next->root; - - /* - * If there is no root, then our references for this block are - * incomplete, as we should be able to walk all the way up to a - * block that is owned by a root. - * - * This path is only for SHAREABLE roots, so if we come upon a - * non-SHAREABLE root then we have backrefs that resolve - * improperly. - * - * Both of these cases indicate file system corruption, or a bug - * in the backref walking code. - */ - if (!root) { - ASSERT(0); - btrfs_err(trans->fs_info, - "bytenr %llu doesn't have a backref path ending in a root", - node->bytenr); - return ERR_PTR(-EUCLEAN); - } - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { - ASSERT(0); - btrfs_err(trans->fs_info, - "bytenr %llu has multiple refs with one ending in a non-shareable root", - node->bytenr); - return ERR_PTR(-EUCLEAN); - } + next = walk_up_backref(node, edges, &index); + root = next->root; - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { - ret = record_reloc_root_in_trans(trans, root); - if (ret) - return ERR_PTR(ret); - break; - } + /* + * If there is no root, then our references for this block are + * incomplete, as we should be able to walk all the way up to a block + * that is owned by a root. + * + * This path is only for SHAREABLE roots, so if we come upon a + * non-SHAREABLE root then we have backrefs that resolve improperly. + * + * Both of these cases indicate file system corruption, or a bug in the + * backref walking code. + */ + if (unlikely(!root)) { + btrfs_err(trans->fs_info, + "bytenr %llu doesn't have a backref path ending in a root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_err(trans->fs_info, + "bytenr %llu has multiple refs with one ending in a non-shareable root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } - ret = btrfs_record_root_in_trans(trans, root); + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { + ret = record_reloc_root_in_trans(trans, root); if (ret) return ERR_PTR(ret); - root = root->reloc_root; - - /* - * We could have raced with another thread which failed, so - * root->reloc_root may not be set, return ENOENT in this case. 
- */ - if (!root) - return ERR_PTR(-ENOENT); + goto found; + } - if (next->new_bytenr != root->node->start) { - /* - * We just created the reloc root, so we shouldn't have - * ->new_bytenr set yet. If it is then we have multiple - * roots pointing at the same bytenr which indicates - * corruption, or we've made a mistake in the backref - * walking code. - */ - ASSERT(next->new_bytenr == 0); - if (next->new_bytenr) { - btrfs_err(trans->fs_info, - "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", - node->bytenr, next->bytenr); - return ERR_PTR(-EUCLEAN); - } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); + root = root->reloc_root; - next->new_bytenr = root->node->start; - btrfs_put_root(next->root); - next->root = btrfs_grab_root(root); - ASSERT(next->root); - mark_block_processed(rc, next); - break; - } + /* + * We could have raced with another thread which failed, so + * root->reloc_root may not be set, return ENOENT in this case. + */ + if (!root) + return ERR_PTR(-ENOENT); - WARN_ON(1); - root = NULL; - next = walk_down_backref(edges, &index); - if (!next || next->level <= node->level) - break; - } - if (!root) { + if (next->new_bytenr) { /* - * This can happen if there's fs corruption or if there's a bug - * in the backref lookup code. + * We just created the reloc root, so we shouldn't have + * ->new_bytenr set yet. If it is then we have multiple roots + * pointing at the same bytenr which indicates corruption, or + * we've made a mistake in the backref walking code. */ - ASSERT(0); - return ERR_PTR(-ENOENT); + ASSERT(next->new_bytenr == 0); + btrfs_err(trans->fs_info, + "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", + node->bytenr, next->bytenr); + return ERR_PTR(-EUCLEAN); } + next->new_bytenr = root->node->start; + btrfs_put_root(next->root); + next->root = btrfs_grab_root(root); + ASSERT(next->root); + mark_block_processed(rc, next); +found: next = node; /* setup backref node path for btrfs_reloc_cow_block */ while (1) { -- cgit v1.2.3 From 0097422c0dfe0a943cf879777cabf299bb6258f7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Oct 2024 11:43:07 -0400 Subject: btrfs: remove clone_backref_node() from relocation Since we no longer maintain backref cache across transactions, and this is only called when we're creating the reloc root for a newly created snapshot in the transaction critical section, we will end up doing a bunch of work that will just get thrown away when we start the transaction in the relocation loop. Delete this code as it no longer does anything for us. Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 91 +-------------------------------------------------- 1 file changed, 1 insertion(+), 90 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4cad2540e3ae..cb98ddd825d4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -469,92 +469,6 @@ out: return node; } -/* - * helper to add backref node for the newly created snapshot. 
- * the backref node is created by cloning backref node that - * corresponds to root of source tree - */ -static int clone_backref_node(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - const struct btrfs_root *src, - struct btrfs_root *dest) -{ - struct btrfs_root *reloc_root = src->reloc_root; - struct btrfs_backref_cache *cache = &rc->backref_cache; - struct btrfs_backref_node *node = NULL; - struct btrfs_backref_node *new_node; - struct btrfs_backref_edge *edge; - struct btrfs_backref_edge *new_edge; - struct rb_node *rb_node; - - rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); - if (node->detached) - node = NULL; - else - BUG_ON(node->new_bytenr != reloc_root->node->start); - } - - if (!node) { - rb_node = rb_simple_search(&cache->rb_root, - reloc_root->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct btrfs_backref_node, - rb_node); - BUG_ON(node->detached); - } - } - - if (!node) - return 0; - - new_node = btrfs_backref_alloc_node(cache, dest->node->start, - node->level); - if (!new_node) - return -ENOMEM; - - new_node->lowest = node->lowest; - new_node->checked = 1; - new_node->root = btrfs_grab_root(dest); - ASSERT(new_node->root); - - if (!node->lowest) { - list_for_each_entry(edge, &node->lower, list[UPPER]) { - new_edge = btrfs_backref_alloc_edge(cache); - if (!new_edge) - goto fail; - - btrfs_backref_link_edge(new_edge, edge->node[LOWER], - new_node, LINK_UPPER); - } - } else { - list_add_tail(&new_node->lower, &cache->leaves); - } - - rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr, - &new_node->rb_node); - if (rb_node) - btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST); - - if (!new_node->lowest) { - list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { - list_add_tail(&new_edge->list[LOWER], - &new_edge->node[LOWER]->upper); - } - } - return 0; -fail: - while (!list_empty(&new_node->lower)) { - new_edge = list_entry(new_node->lower.next, - struct btrfs_backref_edge, list[UPPER]); - list_del(&new_edge->list[UPPER]); - btrfs_backref_free_edge(cache, new_edge); - } - btrfs_backref_free_node(cache, new_node); - return -ENOMEM; -} - /* * helper to add 'address of tree root -> reloc tree' mapping */ @@ -4485,10 +4399,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, return ret; } new_root->reloc_root = btrfs_grab_root(reloc_root); - - if (rc->create_reloc_tree) - ret = clone_backref_node(trans, rc, root, reloc_root); - return ret; + return 0; } /* -- cgit v1.2.3 From 46bb6765d31138abe436b37a4f271895f7751bf2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Oct 2024 11:43:08 -0400 Subject: btrfs: don't build backref tree for COW-only blocks We already determine the owner for any blocks we find when we're relocating, and for COW-only blocks (and the data reloc tree) we COW down to the block and call it good enough. However we still build a whole backref tree for them, even though we're not going to use it, and then just don't put these blocks in the cache. Rework the code to check if the block belongs to a COW-only root or the data reloc root, and then just cow down to the block, skipping the backref cache generation. 
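For illustration only (not part of the patch), the ownership test that decides whether a block can simply be COWed down to, without building a backref tree, can be expressed as a predicate; the helper name is hypothetical:

/* Minimal sketch of the COW-only decision used by the patch below. */
static bool example_is_cow_only_block(u64 owner)
{
	/*
	 * Non-subvolume trees (extent, chunk, csum, ...) and the data reloc
	 * tree are never shareable, so no backref cache entry is needed.
	 */
	return owner &&
	       (!is_fstree(owner) || owner == BTRFS_DATA_RELOC_TREE_OBJECTID);
}
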
Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 92 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index cb98ddd825d4..c8f35d456a61 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2133,17 +2133,11 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, return num_bytes; } -static int reserve_metadata_space(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_backref_node *node) +static int refill_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, u64 num_bytes) { - struct btrfs_root *root = rc->extent_root; - struct btrfs_fs_info *fs_info = root->fs_info; - u64 num_bytes; + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - u64 tmp; - - num_bytes = calcu_metadata_size(rc, node) * 2; trans->block_rsv = rc->block_rsv; rc->reserved_bytes += num_bytes; @@ -2156,7 +2150,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { - tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; + u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) tmp <<= 1; /* @@ -2174,6 +2169,16 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, return 0; } +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_backref_node *node) +{ + u64 num_bytes; + + num_bytes = calcu_metadata_size(rc, node) * 2; + return refill_metadata_space(trans, rc, num_bytes); +} + /* * relocate a block tree, and then update pointers in upper level * blocks that reference the block to point to the new location. 
@@ -2525,15 +2530,11 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, node->root = btrfs_grab_root(root); ASSERT(node->root); } else { - path->lowest_level = node->level; - if (root == root->fs_info->chunk_root) - btrfs_reserve_chunk_metadata(trans, false); - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - btrfs_release_path(path); - if (root == root->fs_info->chunk_root) - btrfs_trans_release_chunk_metadata(trans); - if (ret > 0) - ret = 0; + btrfs_err(root->fs_info, + "bytenr %llu resolved to a non-shareable root", + node->bytenr); + ret = -EUCLEAN; + goto out; } if (!ret) update_processed_blocks(rc, node); @@ -2546,6 +2547,45 @@ out: return ret; } +static int relocate_cowonly_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct tree_block *block, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; + u64 num_bytes; + int nr_levels; + int ret; + + root = btrfs_get_fs_root(fs_info, block->owner, true); + if (IS_ERR(root)) + return PTR_ERR(root); + + nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1; + + num_bytes = fs_info->nodesize * nr_levels; + ret = refill_metadata_space(trans, rc, num_bytes); + if (ret) { + btrfs_put_root(root); + return ret; + } + path->lowest_level = block->level; + if (root == root->fs_info->chunk_root) + btrfs_reserve_chunk_metadata(trans, false); + + ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1); + path->lowest_level = 0; + btrfs_release_path(path); + + if (root == root->fs_info->chunk_root) + btrfs_trans_release_chunk_metadata(trans); + if (ret > 0) + ret = 0; + btrfs_put_root(root); + + return ret; +} + /* * relocate a list of blocks */ @@ -2585,6 +2625,20 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Do tree relocation */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { + /* + * For COWonly blocks, or the data reloc tree, we only need to + * COW down to the block, there's no need to generate a backref + * tree. + */ + if (block->owner && + (!is_fstree(block->owner) || + block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { + ret = relocate_cowonly_block(trans, rc, block, path); + if (ret) + break; + continue; + } + node = build_backref_tree(trans, rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { -- cgit v1.2.3 From 4eb8064dc9230a2f58c9df13d59e53265b0cc8e6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Oct 2024 11:43:09 -0400 Subject: btrfs: do not handle non-shareable roots in backref cache Now that we handle relocation for non-shareable roots without using the backref cache, remove the ->cowonly field from the backref nodes and update the handling to throw an error. Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/backref.c | 50 +++++++++++++++++++++++--------------------------- fs/btrfs/backref.h | 2 -- fs/btrfs/relocation.c | 2 +- 3 files changed, 24 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f686f01cdd9b..2e0e36487b33 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -3314,8 +3314,12 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, root = btrfs_get_fs_root(fs_info, ref_key->offset, false); if (IS_ERR(root)) return PTR_ERR(root); - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - cur->cowonly = 1; + + /* We shouldn't be using backref cache for non-shareable roots. 
*/ + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_put_root(root); + return -EUCLEAN; + } if (btrfs_root_level(&root->root_item) == cur->level) { /* Tree root */ @@ -3401,8 +3405,15 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, goto out; } upper->owner = btrfs_header_owner(eb); - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - upper->cowonly = 1; + + /* We shouldn't be using backref cache for non shareable roots. */ + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_put_root(root); + btrfs_backref_free_edge(cache, edge); + btrfs_backref_free_node(cache, upper); + ret = -EUCLEAN; + goto out; + } /* * If we know the block isn't shared we can avoid @@ -3593,15 +3604,10 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, ASSERT(start->checked); - /* Insert this node to cache if it's not COW-only */ - if (!start->cowonly) { - rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, - &start->rb_node); - if (rb_node) - btrfs_backref_panic(cache->fs_info, start->bytenr, - -EEXIST); - list_add_tail(&start->lower, &cache->leaves); - } + rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node); + if (rb_node) + btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST); + list_add_tail(&start->lower, &cache->leaves); /* * Use breadth first search to iterate all related edges. @@ -3655,23 +3661,13 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, return -EUCLEAN; } - /* Sanity check, COW-only node has non-COW-only parent */ - if (start->cowonly != upper->cowonly) { - ASSERT(0); + rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + if (unlikely(rb_node)) { + btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST); return -EUCLEAN; } - /* Only cache non-COW-only (subvolume trees) tree blocks */ - if (!upper->cowonly) { - rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); - if (rb_node) { - btrfs_backref_panic(cache->fs_info, - upper->bytenr, -EEXIST); - return -EUCLEAN; - } - } - list_add_tail(&edge->list[UPPER], &upper->lower); /* diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 7220bde1fc31..c52bc5f45041 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -341,8 +341,6 @@ struct btrfs_backref_node { struct extent_buffer *eb; /* Level of the tree block */ unsigned int level:8; - /* Is the block in a non-shareable tree */ - unsigned int cowonly:1; /* 1 if no child node is in the cache */ unsigned int lowest:1; /* Is the extent buffer locked */ diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c8f35d456a61..fe4e2528c806 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2542,7 +2542,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = do_relocation(trans, rc, node, key, path, 1); } out: - if (ret || node->level == 0 || node->cowonly) + if (ret || node->level == 0) btrfs_backref_cleanup_node(&rc->backref_cache, node); return ret; } -- cgit v1.2.3 From 29e74a12a31456ee29d83ea83a545767111517de Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Oct 2024 11:43:10 -0400 Subject: btrfs: simplify btrfs_backref_release_cache() We rely on finding all our nodes on the various lists in the backref cache, when they are all also in the rbtree. Instead just search through the rbtree and free everything. 
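For illustration only (not part of the patch), draining a cache by walking its rbtree rather than per-type lists follows this general pattern; the structure and helper names are hypothetical:

#include <linux/rbtree.h>
#include <linux/slab.h>

struct example_cache_node {
	struct rb_node rb_node;
	/* ... payload ... */
};

/* Minimal sketch: free every node reachable from the rbtree root. */
static void example_release_cache(struct rb_root *root)
{
	struct example_cache_node *node;

	while ((node = rb_entry_safe(rb_first(root),
				     struct example_cache_node, rb_node))) {
		rb_erase(&node->rb_node, root);	/* unlink before freeing */
		kfree(node);
	}
}
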
Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2e0e36487b33..1a21ff2a86f9 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -3165,32 +3165,14 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) { struct btrfs_backref_node *node; - int i; - while (!list_empty(&cache->detached)) { - node = list_entry(cache->detached.next, - struct btrfs_backref_node, list); + while ((node = rb_entry_safe(rb_first(&cache->rb_root), + struct btrfs_backref_node, rb_node))) btrfs_backref_cleanup_node(cache, node); - } - while (!list_empty(&cache->leaves)) { - node = list_entry(cache->leaves.next, - struct btrfs_backref_node, lower); - btrfs_backref_cleanup_node(cache, node); - } - - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - while (!list_empty(&cache->pending[i])) { - node = list_first_entry(&cache->pending[i], - struct btrfs_backref_node, - list); - btrfs_backref_cleanup_node(cache, node); - } - } ASSERT(list_empty(&cache->pending_edge)); ASSERT(list_empty(&cache->useless_node)); ASSERT(list_empty(&cache->detached)); - ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); ASSERT(!cache->nr_nodes); ASSERT(!cache->nr_edges); } -- cgit v1.2.3 From b61e0eb0374299ab5fdd5a767f2759907dc41e1e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Oct 2024 11:43:11 -0400 Subject: btrfs: remove the ->lowest and ->leaves members from struct btrfs_backref_node Before we were keeping all of our nodes on various lists in order to make sure everything got cleaned up correctly. We used node->lowest to indicate that node->lower was linked into the cache->leaves list. Now that we do cleanup based on the rb-tree both the list and the flag are useless, so delete them both. Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 19 ------------------- fs/btrfs/backref.h | 4 ---- fs/btrfs/relocation.c | 7 ------- 3 files changed, 30 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 1a21ff2a86f9..597d1d5f44ec 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -3023,7 +3023,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); INIT_LIST_HEAD(&cache->detached); - INIT_LIST_HEAD(&cache->leaves); INIT_LIST_HEAD(&cache->pending_edge); INIT_LIST_HEAD(&cache->useless_node); cache->fs_info = fs_info; @@ -3131,29 +3130,17 @@ void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node) { - struct btrfs_backref_node *upper; struct btrfs_backref_edge *edge; if (!node) return; - BUG_ON(!node->lowest && !node->detached); while (!list_empty(&node->upper)) { edge = list_entry(node->upper.next, struct btrfs_backref_edge, list[LOWER]); - upper = edge->node[UPPER]; list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); - - /* - * Add the node to leaf node list if no other child block - * cached. 
- */ - if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, &cache->leaves); - upper->lowest = 1; - } } btrfs_backref_drop_node(cache, node); @@ -3589,7 +3576,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node); if (rb_node) btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST); - list_add_tail(&start->lower, &cache->leaves); /* * Use breadth first search to iterate all related edges. @@ -3628,11 +3614,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, * parents have already been linked. */ if (!RB_EMPTY_NODE(&upper->rb_node)) { - if (upper->lowest) { - list_del_init(&upper->lower); - upper->lowest = 0; - } - list_add_tail(&edge->list[UPPER], &upper->lower); continue; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index c52bc5f45041..bf47f7ad08be 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -341,8 +341,6 @@ struct btrfs_backref_node { struct extent_buffer *eb; /* Level of the tree block */ unsigned int level:8; - /* 1 if no child node is in the cache */ - unsigned int lowest:1; /* Is the extent buffer locked */ unsigned int locked:1; /* Has the block been processed */ @@ -395,8 +393,6 @@ struct btrfs_backref_cache { * level blocks may not reflect the new location */ struct list_head pending[BTRFS_MAX_LEVEL]; - /* List of backref nodes with no child node */ - struct list_head leaves; /* List of detached backref node. */ struct list_head detached; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index fe4e2528c806..0f94dea8e329 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -342,12 +342,6 @@ static bool handle_useless_nodes(struct reloc_control *rc, if (cur == node) ret = true; - /* The node is the lowest node */ - if (cur->lowest) { - list_del_init(&cur->lower); - cur->lowest = 0; - } - /* Cleanup the lower edges */ while (!list_empty(&cur->lower)) { struct btrfs_backref_edge *edge; @@ -426,7 +420,6 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( goto out; } - node->lowest = 1; cur = node; /* Breadth-first search to build backref cache */ -- cgit v1.2.3 From f974bc3c9ac0025b89195d605ed8543763232eeb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Oct 2024 11:43:12 -0400 Subject: btrfs: remove detached list from struct btrfs_backref_cache We don't ever look at this list, remove it. 
Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 2 -- fs/btrfs/backref.h | 2 -- fs/btrfs/relocation.c | 1 - 3 files changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 597d1d5f44ec..6d9f39c1d89c 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -3022,7 +3022,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, cache->rb_root = RB_ROOT; for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); - INIT_LIST_HEAD(&cache->detached); INIT_LIST_HEAD(&cache->pending_edge); INIT_LIST_HEAD(&cache->useless_node); cache->fs_info = fs_info; @@ -3159,7 +3158,6 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) ASSERT(list_empty(&cache->pending_edge)); ASSERT(list_empty(&cache->useless_node)); - ASSERT(list_empty(&cache->detached)); ASSERT(!cache->nr_nodes); ASSERT(!cache->nr_edges); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index bf47f7ad08be..74e614031274 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -393,8 +393,6 @@ struct btrfs_backref_cache { * level blocks may not reflect the new location */ struct list_head pending[BTRFS_MAX_LEVEL]; - /* List of detached backref node. */ - struct list_head detached; u64 last_trans; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0f94dea8e329..cdd9a7b15a11 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -367,7 +367,6 @@ static bool handle_useless_nodes(struct reloc_control *rc, * cache to avoid unnecessary backref lookup. */ if (cur->level > 0) { - list_add(&cur->list, &cache->detached); cur->detached = 1; } else { rb_erase(&cur->rb_node, &cache->rb_root); -- cgit v1.2.3 From c0def46dec9c547679a25fe7552c4bcbec0b0dd2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 11 Nov 2024 07:29:07 +1030 Subject: btrfs: improve the warning and error message for btrfs_remove_qgroup() [WARNING] There are several warnings about the recently introduced qgroup auto-removal that it triggers WARN_ON() for the non-zero rfer/excl numbers, e.g: ------------[ cut here ]------------ WARNING: CPU: 67 PID: 2882 at fs/btrfs/qgroup.c:1854 btrfs_remove_qgroup+0x3df/0x450 CPU: 67 UID: 0 PID: 2882 Comm: btrfs-cleaner Kdump: loaded Not tainted 6.11.6-300.fc41.x86_64 #1 RIP: 0010:btrfs_remove_qgroup+0x3df/0x450 Call Trace: btrfs_qgroup_cleanup_dropped_subvolume+0x97/0xc0 btrfs_drop_snapshot+0x44e/0xa80 btrfs_clean_one_deleted_snapshot+0xc3/0x110 cleaner_kthread+0xd8/0x130 kthread+0xd2/0x100 ret_from_fork+0x34/0x50 ret_from_fork_asm+0x1a/0x30 ---[ end trace 0000000000000000 ]--- BTRFS warning (device sda): to be deleted qgroup 0/319 has non-zero numbers, rfer 258478080 rfer_cmpr 258478080 excl 0 excl_cmpr 0 [CAUSE] Although the root cause is still unclear: if the qgroup is consistent, a fully dropped subvolume (with an extra transaction committed) should lead to all zero numbers for the qgroup. My current guess is the subvolume drop triggered the new subtree drop threshold thus marked qgroup inconsistent, then rescan cleared it but some corner case is not properly handled during subvolume dropping. But at least for this particular case, since it's only the rfer/excl not properly reset to 0, and qgroup is already marked inconsistent, there is nothing for the end users to worry about. The user space tool utilizing qgroup would queue a rescan to handle everything, so the kernel warning is a little overkill.
[ENHANCEMENT] Enhance the warning inside btrfs_remove_qgroup() by: - Only do WARN() if CONFIG_BTRFS_DEBUG is enabled As explained the kernel can handle inconsistent qgroups by simply do a rescan, there is nothing to bother the end users. - Treat the reserved space leak the same as non-zero numbers By outputting the values and trigger a WARN() if it's a debug build. So far I haven't experienced any case related to reserved space so I hope we will never need to bother them. Fixes: 839d6ea4f86d ("btrfs: automatically remove the subvolume qgroup") Link: https://github.com/kdave/btrfs-progs/issues/922 Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f9b214992212..993b5e803699 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1838,9 +1838,19 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) * Thus its reserved space should all be zero, no matter if qgroup * is consistent or the mode. */ - WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); + if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_warn_rl(fs_info, +"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); + + } /* * The same for rfer/excl numbers, but that's only if our qgroup is * consistent and if it's in regular qgroup mode. @@ -1849,8 +1859,9 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL && !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { - if (WARN_ON(qgroup->rfer || qgroup->excl || - qgroup->rfer_cmpr || qgroup->excl_cmpr)) { + if (qgroup->rfer || qgroup->excl || + qgroup->rfer_cmpr || qgroup->excl_cmpr) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); btrfs_warn_rl(fs_info, "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", btrfs_qgroup_level(qgroup->qgroupid), -- cgit v1.2.3 From a883120b2d19630d3be56bc9649a71299ccbf95d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 7 Nov 2024 14:35:07 +1030 Subject: btrfs: open-code btrfs_copy_from_user() The function btrfs_copy_from_user() handles the folio dirtying for buffered write. The original design is to allow that function to handle multiple folios, but since commit c87c299776e4 ("btrfs: make buffered write to copy one page a time") there is no need to support multiple folios. So here open-code btrfs_copy_from_user() to copy_folio_from_iter_atomic() and flush_dcache_folio() calls. The short-copy check and revert are still kept as-is. 
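For illustration only (not part of the patch), the open-coded copy path amounts to the following pattern; the helper name and its arguments are hypothetical:

#include <linux/uio.h>
#include <linux/pagemap.h>

/* Minimal sketch: copy from an iov_iter into a folio and handle short copies. */
static size_t example_copy_into_folio(struct folio *folio, loff_t pos,
				      size_t bytes, struct iov_iter *i)
{
	size_t copied;

	copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, pos),
					     bytes, i);
	flush_dcache_folio(folio);

	/* A short copy into a not-yet-uptodate folio must be retried from scratch. */
	if (unlikely(copied < bytes) && !folio_test_uptodate(folio)) {
		iov_iter_revert(i, copied);
		copied = 0;
	}
	return copied;
}
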
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 66 ++++++++++++++++----------------------------------------- 1 file changed, 18 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c61f210259d8..3c00dc48b925 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -38,52 +38,6 @@ #include "super.h" #include "print-tree.h" -/* - * Helper to fault in page and copy. This should go away and be replaced with - * calls into generic code. - */ -static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, - struct folio *folio, struct iov_iter *i) -{ - size_t copied = 0; - size_t total_copied = 0; - int offset = offset_in_page(pos); - - while (write_bytes > 0) { - size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes); - /* - * Copy data from userspace to the current page - */ - copied = copy_folio_from_iter_atomic(folio, offset, count, i); - - /* Flush processor's dcache for this page */ - flush_dcache_folio(folio); - - /* - * if we get a partial write, we can end up with - * partially up to date page. These add - * a lot of complexity, so make sure they don't - * happen by forcing this copy to be retried. - * - * The rest of the btrfs_file_write code will fall - * back to page at a time copies after we return 0. - */ - if (unlikely(copied < count)) { - if (!folio_test_uptodate(folio)) { - iov_iter_revert(i, copied); - copied = 0; - } - if (!copied) - break; - } - - write_bytes -= copied; - total_copied += copied; - offset += copied; - } - return total_copied; -} - /* * Unlock folio after btrfs_file_write() is done with it. */ @@ -107,7 +61,7 @@ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio, } /* - * After btrfs_copy_from_user(), update the following things for delalloc: + * After copy_folio_from_iter_atomic(), update the following things for delalloc: * - Mark newly dirtied folio as DELALLOC in the io tree. * Used to advise which range is to be written back. * - Mark modified folio as Uptodate/Dirty and not needing COW fixup @@ -1269,7 +1223,23 @@ again: break; } - copied = btrfs_copy_from_user(pos, write_bytes, folio, i); + copied = copy_folio_from_iter_atomic(folio, + offset_in_folio(folio, pos), write_bytes, i); + flush_dcache_folio(folio); + + /* + * If we get a partial write, we can end up with partially + * uptodate page. Although if sector size < page size we can + * handle it, but if it's not sector aligned it can cause + * a lot of complexity, so make sure they don't happen by + * forcing retry this copy. + */ + if (unlikely(copied < write_bytes)) { + if (!folio_test_uptodate(folio)) { + iov_iter_revert(i, copied); + copied = 0; + } + } num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); dirty_sectors = round_up(copied + sector_offset, -- cgit v1.2.3 From d0f038104fa37380e2a725e669508e43d0c503e9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 10 Dec 2024 15:23:06 +1030 Subject: btrfs: output the reason for open_ctree() failure There is a recent ML report that mounting a large fs backed by hardware RAID56 controller (with one device missing) took too much time, and systemd seems to kill the mount attempt. In that case, the only error message is: BTRFS error (device sdj): open_ctree failed There is no reason on why the failure happened, making it very hard to understand the reason. At least output the error number (in the particular case it should be -EINTR) to provide some clue. 
Link: https://lore.kernel.org/linux-btrfs/9b9c4d2810abcca2f9f76e32220ed9a90febb235.camel@scientia.org/ Reported-by: Christoph Anton Mitterer Cc: stable@vger.kernel.org Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7dfe5005129a..f6eaaf20229d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -971,7 +971,7 @@ static int btrfs_fill_super(struct super_block *sb, err = open_ctree(sb, fs_devices); if (err) { - btrfs_err(fs_info, "open_ctree failed"); + btrfs_err(fs_info, "open_ctree failed: %d", err); return err; } -- cgit v1.2.3 From 4016358e852861d3a84a41fb3adea540443f7c96 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 12 Dec 2024 13:56:06 +0100 Subject: btrfs: remove unused variable length in btrfs_insert_one_raid_extent() Remove the variable length in btrfs_insert_one_raid_extent() as it is unused. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 9ffc79f250fb..45b823a0913a 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -199,12 +199,8 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, for (int i = 0; i < num_stripes; i++) { u64 devid = bioc->stripes[i].dev->devid; u64 physical = bioc->stripes[i].physical; - u64 length = bioc->stripes[i].length; struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i]; - if (length == 0) - length = bioc->size; - btrfs_set_stack_raid_stride_devid(raid_stride, devid); btrfs_set_stack_raid_stride_physical(raid_stride, physical); } -- cgit v1.2.3 From 6c440755244e0044b3de37c3e5b334e46b4d93a6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 14:54:51 +0000 Subject: btrfs: remove no longer needed strict argument from can_nocow_extent() All callers of can_nocow_extent() now pass a value of false for its 'strict' argument, making it redundant. So remove the argument from can_nocow_extent() as well as can_nocow_file_extent(), btrfs_cross_ref_exist() and check_committed_ref(), because this argument was used just to influence the behavior of check_committed_ref(). Also remove the 'strict' field from struct can_nocow_file_extent_args, which is now always false as well, as its value is taken from the argument to can_nocow_extent(). 
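A caller-side sketch after this change (illustrative only, modeled on the direct IO caller in the diff below, with error handling simplified):

    ret = can_nocow_extent(inode, start, &len, &file_extent, nowait);
    if (ret == 1) {
            /* The range can be written in place (NOCOW). */
    } else if (ret == 0) {
            /* Fall back to the regular COW write path. */
    } else {
            /* ret < 0: error. */
    }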
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/direct-io.c | 3 +-- fs/btrfs/extent-tree.c | 15 ++++++--------- fs/btrfs/extent-tree.h | 2 +- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 11 +++-------- 6 files changed, 13 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index aa1f55cd81b7..b2fa33911c28 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -526,7 +526,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, - bool nowait, bool strict); + bool nowait); void btrfs_del_delalloc_inode(struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index a7c3e221378d..8567af46e16f 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -248,8 +248,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); block_start = extent_map_block_start(em) + (start - em->start); - if (can_nocow_extent(inode, start, &len, - &file_extent, false, false) == 1) { + if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) { bg = btrfs_inc_nocow_writers(fs_info, block_start); if (bg) can_nocow = true; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2f9126528a01..46a3a4a4536b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2296,8 +2296,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr, - bool strict) + u64 objectid, u64 offset, u64 bytenr) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); @@ -2361,11 +2360,10 @@ static noinline int check_committed_ref(struct btrfs_root *root, /* * If extent created before last snapshot => it's shared unless the - * snapshot has been deleted. Use the heuristic if strict is false. + * snapshot has been deleted. 
*/ - if (!strict && - (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item))) + if (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item)) goto out; /* If this extent has SHARED_DATA_REF then it's shared */ @@ -2387,13 +2385,12 @@ out: } int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr, bool strict, struct btrfs_path *path) + u64 bytenr, struct btrfs_path *path) { int ret; do { - ret = check_committed_ref(root, path, objectid, - offset, bytenr, strict); + ret = check_committed_ref(root, path, objectid, offset, bytenr); if (ret && ret != -ENOENT) goto out; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 2ad51130c037..ee62035c4a71 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -117,7 +117,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict, + u64 objectid, u64 offset, u64 bytenr, struct btrfs_path *path); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3c00dc48b925..4775a17c4ee1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1023,7 +1023,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, &cached_state); } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, nowait, false); + NULL, nowait); if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); else diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 283199d11642..0965a29cf4f7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1837,7 +1837,6 @@ struct can_nocow_file_extent_args { /* End file offset (inclusive) of the range we want to NOCOW. */ u64 end; bool writeback_path; - bool strict; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore. @@ -1892,8 +1891,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, * for its subvolume was created, then this implies the extent is shared, * hence we must COW. */ - if (!args->strict && - btrfs_file_extent_generation(leaf, fi) <= + if (btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out; @@ -1924,7 +1922,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), key->offset - args->file_extent.offset, - args->file_extent.disk_bytenr, args->strict, path); + args->file_extent.disk_bytenr, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -7011,8 +7009,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent - * @strict: if true, omit optimizations that might force us into unnecessary - * cow. e.g., don't trust generation number. 
* * Return: * >0 and update @len if we can do nocow write @@ -7024,7 +7020,7 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, - bool nowait, bool strict) + bool nowait) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; @@ -7077,7 +7073,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, nocow_args.start = offset; nocow_args.end = offset + *len - 1; - nocow_args.strict = strict; nocow_args.free_path = true; ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); -- cgit v1.2.3 From 4f000a87fbfe6522a6f43d44d1dfc6af157ce476 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 15:52:13 +0000 Subject: btrfs: remove the snapshot check from check_committed_ref() At check_committed_ref() we have this check to see if the data extent was created in a generation lower than or equals to the generation where the last snapshot for the root was created, and if so we return immediately with 1, since it's very likely the extent is shared, referenced by other root. The only call chain for check_committed_ref() is the following: can_nocow_file_extent() btrfs_cross_ref_exist() check_committed_ref() And we already do that snapshot check at can_nocow_file_extent(), before we call btrfs_cross_ref_exist(). This makes the check done at check_committed_ref() redundant, so remove it. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 46a3a4a4536b..e81f4615ccdf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2358,14 +2358,6 @@ static noinline int check_committed_ref(struct btrfs_root *root, if (item_size != expected_size) goto out; - /* - * If extent created before last snapshot => it's shared unless the - * snapshot has been deleted. - */ - if (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item)) - goto out; - /* If this extent has SHARED_DATA_REF then it's shared */ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) -- cgit v1.2.3 From 78cdfba85df90fe0eb507f2fe86eba7c8f51fe4c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 16:07:12 +0000 Subject: btrfs: avoid redundant call to get inline ref type at check_committed_ref() At check_committed_ref() we are calling btrfs_get_extent_inline_ref_type() twice, once before we check if have an inline extent owner ref (for simple qgroups) and then once again sometime after that check. This second call is redundant when we have simple quotas disabled or we found an inline ref that is not of the owner ref type. So avoid this second call unless we have simple quotas enabled and found an owner ref, saving a function call that does inline ref validation again. 
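For reference, a sketch of the extent item layout that this fast path relies on (illustrative only, reconstructed from the checks in the surrounding diffs):

    /*
     * struct btrfs_extent_item                  refs / generation / flags
     * [struct btrfs_extent_inline_ref           BTRFS_EXTENT_OWNER_REF_KEY,
     *                                           present only with simple quotas]
     * struct btrfs_extent_inline_ref + data ref must be the only remaining
     *                                           inline ref, and of type
     *                                           BTRFS_EXTENT_DATA_REF_KEY
     */

If the item is any larger than that, or the remaining inline ref has a different type, the extent is treated as shared.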
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e81f4615ccdf..00e137c48a9b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2352,6 +2352,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); iref = (struct btrfs_extent_inline_ref *)(iref + 1); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); } /* If extent item has more than 1 inline ref then it's shared */ @@ -2359,7 +2360,6 @@ static noinline int check_committed_ref(struct btrfs_root *root, goto out; /* If this extent has SHARED_DATA_REF then it's shared */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) goto out; -- cgit v1.2.3 From adf7da3f261cee646a488d46f2086d6e98e72f57 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 16:24:30 +0000 Subject: btrfs: simplify return logic at check_committed_ref() Instead of setting the value to return in a local variable 'ret' and then jumping into a label named 'out' that does nothing but return that value, simplify everything by getting rid of the label and directly returning a value. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 00e137c48a9b..51c49b2f4991 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2316,35 +2316,32 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. */ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } - ret = -ENOENT; if (path->slots[0] == 0) - goto out; + return -ENOENT; path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) - goto out; + return -ENOENT; - ret = 1; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); /* No inline refs; we need to bail before checking for owner ref. */ if (item_size == sizeof(*ei)) - goto out; + return 1; /* Check for an owner ref; skip over it to the real inline refs. 
*/ iref = (struct btrfs_extent_inline_ref *)(ei + 1); @@ -2357,11 +2354,11 @@ static noinline int check_committed_ref(struct btrfs_root *root, /* If extent item has more than 1 inline ref then it's shared */ if (item_size != expected_size) - goto out; + return 1; /* If this extent has SHARED_DATA_REF then it's shared */ if (type != BTRFS_EXTENT_DATA_REF_KEY) - goto out; + return 1; ref = (struct btrfs_extent_data_ref *)(&iref->offset); if (btrfs_extent_refs(leaf, ei) != @@ -2369,11 +2366,9 @@ static noinline int check_committed_ref(struct btrfs_root *root, btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || btrfs_extent_data_ref_objectid(leaf, ref) != objectid || btrfs_extent_data_ref_offset(leaf, ref) != offset) - goto out; + return 1; - ret = 0; -out: - return ret; + return 0; } int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, -- cgit v1.2.3 From 9e0d43ea4e69aee29ab58f9cd8c90d2c807c862a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 16:59:23 +0000 Subject: btrfs: simplify arguments for btrfs_cross_ref_exist() Instead of passing a root and an objectid which matches an inode number, pass the inode instead, since the root is always the root associated to the inode and the objectid is the number of that inode. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 22 ++++++++++++---------- fs/btrfs/extent-tree.h | 3 +-- fs/btrfs/inode.c | 3 +-- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 51c49b2f4991..af3893ad784b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2206,10 +2206,11 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, return ret; } -static noinline int check_delayed_ref(struct btrfs_root *root, +static noinline int check_delayed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -2283,7 +2284,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, * then we have a cross reference. 
*/ if (ref->ref_root != btrfs_root_id(root) || - ref_owner != objectid || ref_offset != offset) { + ref_owner != btrfs_ino(inode) || ref_offset != offset) { ret = 1; break; } @@ -2294,10 +2295,11 @@ static noinline int check_delayed_ref(struct btrfs_root *root, return ret; } -static noinline int check_committed_ref(struct btrfs_root *root, +static noinline int check_committed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct extent_buffer *leaf; @@ -2364,29 +2366,29 @@ static noinline int check_committed_ref(struct btrfs_root *root, if (btrfs_extent_refs(leaf, ei) != btrfs_extent_data_ref_count(leaf, ref) || btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || - btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) || btrfs_extent_data_ref_offset(leaf, ref) != offset) return 1; return 0; } -int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, u64 bytenr, struct btrfs_path *path) { int ret; do { - ret = check_committed_ref(root, path, objectid, offset, bytenr); + ret = check_committed_ref(inode, path, offset, bytenr); if (ret && ret != -ENOENT) goto out; - ret = check_delayed_ref(root, path, objectid, offset, bytenr); + ret = check_delayed_ref(inode, path, offset, bytenr); } while (ret == -EAGAIN && !path->nowait); out: btrfs_release_path(path); - if (btrfs_is_data_reloc_root(root)) + if (btrfs_is_data_reloc_root(inode->root)) WARN_ON(ret > 0); return ret; } diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index ee62035c4a71..46b8e19022df 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -116,8 +116,7 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_exclude_logged_extents(struct extent_buffer *eb); -int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, u64 bytenr, struct btrfs_path *path); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0965a29cf4f7..8a173a24ac05 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1920,8 +1920,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, */ btrfs_release_path(path); - ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), - key->offset - args->file_extent.offset, + ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset, args->file_extent.disk_bytenr, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) -- cgit v1.2.3 From 2747c555958448eb32fc953b28798c1b55df4e1d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 10 Dec 2024 11:41:33 +0000 Subject: btrfs: add function comment for check_committed_ref() There are some not immediately obvious details about the operation of check_committed_ref(), namely that when it returns 0 it must return with the path having a locked leaf from the extent tree that contains the extent's extent item, so that we can later check for delayed refs when calling check_delayed_ref() in a way that 
doesn't race with a task running delayed references. For similar reasons, it must also return with a locked leaf when the extent item is not found, and that leaf is where the extent item should be located, because we may have delayed references that are going to create the extent item. Also document that the function can return false positives in order to not be too slow, and that the most important is to not return false negatives. So add a function comment to check_committed_ref(). Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index af3893ad784b..b76df4d9d1ee 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2295,6 +2295,48 @@ static noinline int check_delayed_ref(struct btrfs_inode *inode, return ret; } +/* + * Check if there are references for a data extent other than the one belonging + * to the given inode and offset. + * + * @inode: The only inode we expect to find associated with the data extent. + * @path: A path to use for searching the extent tree. + * @offset: The only offset we expect to find associated with the data extent. + * @bytenr: The logical address of the data extent. + * + * When the extent does not have any other references other than the one we + * expect to find, we always return a value of 0 with the path having a locked + * leaf that contains the extent's extent item - this is necessary to ensure + * we don't race with a task running delayed references, and our caller must + * have such a path when calling check_delayed_ref() - it must lock a delayed + * ref head while holding the leaf locked. In case the extent item is not found + * in the extent tree, we return -ENOENT with the path having the leaf (locked) + * where the extent item should be, in order to prevent races with another task + * running delayed references, so that we don't miss any reference when calling + * check_delayed_ref(). + * + * Note: this may return false positives, and this is because we want to be + * quick here as we're called in write paths (when flushing delalloc and + * in the direct IO write path). For example we can have an extent with + * a single reference but that reference is not inlined, or we may have + * many references in the extent tree but we also have delayed references + * that cancel all the reference except the one for our inode and offset, + * but it would be expensive to do such checks and complex due to all + * locking to avoid races between the checks and flushing delayed refs, + * plus non-inline references may be located on leaves other than the one + * that contains the extent item in the extent tree. The important thing + * here is to not return false negatives and that the false positives are + * not very common. + * + * Returns: 0 if there are no cross references and with the path having a locked + * leaf from the extent tree that contains the extent's extent item. + * + * 1 if there are cross references (false positives can happen). + * + * < 0 in case of an error. In case of -ENOENT the leaf in the extent + * tree where the extent item should be located at is read locked and + * accessible in the given path. 
+ */ static noinline int check_committed_ref(struct btrfs_inode *inode, struct btrfs_path *path, u64 offset, u64 bytenr) -- cgit v1.2.3 From 88694f74f4853b01dae678fb8d5f2c46948ae620 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 10 Dec 2024 11:51:11 +0000 Subject: btrfs: add assertions and comment about path expectations to btrfs_cross_ref_exist() We should always call check_delayed_ref() with a path having a locked leaf from the extent tree where either the extent item is located or where it should be located in case it doesn't exist yet (when there's a pending unflushed delayed ref to do it), as we need to lock any existing delayed ref head while holding such leaf locked in order to avoid races with flushing delayed references, which could make us think an extent is not shared when it really is. So add some assertions and a comment about such expectations to btrfs_cross_ref_exist(), which is the only caller of check_delayed_ref(). Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 25 +++++++++++++++++++++++++ fs/btrfs/locking.h | 5 +++++ 2 files changed, 30 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b76df4d9d1ee..3dfe651aeaa9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2425,6 +2425,31 @@ int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, if (ret && ret != -ENOENT) goto out; + /* + * The path must have a locked leaf from the extent tree where + * the extent item for our extent is located, in case it exists, + * or where it should be located in case it doesn't exist yet + * because it's new and its delayed ref was not yet flushed. + * We need to lock the delayed ref head at check_delayed_ref(), + * if one exists, while holding the leaf locked in order to not + * race with delayed ref flushing, missing references and + * incorrectly reporting that the extent is not shared. + */ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { + struct extent_buffer *leaf = path->nodes[0]; + + ASSERT(leaf != NULL); + btrfs_assert_tree_read_locked(leaf); + + if (ret != -ENOENT) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.objectid == bytenr); + ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY); + } + } + ret = check_delayed_ref(inode, path, offset, bytenr); } while (ret == -EAGAIN && !path->nowait); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 35036b151bf5..c69e57ff804b 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -199,8 +199,13 @@ static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { lockdep_assert_held_write(&eb->lock); } +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ + lockdep_assert_held_read(&eb->lock); +} #else static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { } +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { } #endif void btrfs_unlock_up_safe(struct btrfs_path *path, int level); -- cgit v1.2.3 From 68ab9825a6a9677b6eab07666750e3fbc006b000 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 16 Dec 2024 09:10:39 +0100 Subject: btrfs: cache stripe tree usage in struct btrfs_io_geometry Cache the return of btrfs_need_stripe_tree_update() in struct btrfs_io_geometry starting from btrfs_map_block(). 
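In short, the decision is now made once per btrfs_map_block() call and only consumed afterwards (condensed from the diff below, not a literal quote):

    io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);
    ...
    /* set_io_stripe() no longer re-checks the incompat flag / map type: */
    if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst)
            return btrfs_get_raid_extent_offset(fs_info, logical, length,
                                                map->type,
                                                io_geom->stripe_index, dst);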
Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1cccaf9c2b0d..fa190f710854 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -48,6 +48,7 @@ struct btrfs_io_geometry { u64 raid56_full_stripe_start; int max_errors; enum btrfs_map_op op; + bool use_rst; }; const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { @@ -6346,8 +6347,7 @@ static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical, { dst->dev = map->stripes[io_geom->stripe_index].dev; - if (io_geom->op == BTRFS_MAP_READ && - btrfs_need_stripe_tree_update(fs_info, map->type)) + if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst) return btrfs_get_raid_extent_offset(fs_info, logical, length, map->type, io_geom->stripe_index, dst); @@ -6579,6 +6579,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom.raid56_full_stripe_start = (u64)-1; max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); + io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type); if (dev_replace->replace_task != current) down_read(&dev_replace->rwsem); -- cgit v1.2.3 From 9c48bcec47c8dd36b66ce1363c29c6a39612f7ad Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 16 Dec 2024 09:10:40 +0100 Subject: btrfs: cache RAID stripe tree decision in btrfs_io_context Cache the decision if a particular I/O needs to update RAID stripe tree entries in struct btrfs_io_context. Signed-off-by: Johannes Thumshirn Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/bio.c | 3 +-- fs/btrfs/volumes.c | 1 + fs/btrfs/volumes.h | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 7ea6f0b43b95..bc80ee4f95a5 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -725,8 +725,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bio->bi_opf |= REQ_OP_ZONE_APPEND; } - if (is_data_bbio(bbio) && bioc && - btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) { + if (is_data_bbio(bbio) && bioc && bioc->use_rst) { /* * No locking for the list update, as we only add to * the list in the I/O submission path, and list diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fa190f710854..088ba0499e18 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6663,6 +6663,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, goto out; } bioc->map_type = map->type; + bioc->use_rst = io_geom.use_rst; /* * For RAID56 full map, we need to make sure the stripes[] follows the diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3a416b1bc24c..10bdd731e3fc 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -485,6 +485,7 @@ struct btrfs_io_context { struct bio *orig_bio; atomic_t error; u16 max_errors; + bool use_rst; u64 logical; u64 size; -- cgit v1.2.3 From 63e5f9df7cac7a5bf5da9ce6c36364d74be85f55 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 16 Dec 2024 09:10:41 +0100 Subject: btrfs: pass btrfs_io_geometry to is_single_device_io Now that we have the stripe tree decision saved in struct btrfs_io_geometry we can pass it into is_single_device_io() and get rid of another call to btrfs_need_raid_stripe_tree_update(). 
Signed-off-by: Johannes Thumshirn Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 088ba0499e18..fcd80ba9dd42 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6362,7 +6362,7 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info, const struct btrfs_io_stripe *smap, const struct btrfs_chunk_map *map, int num_alloc_stripes, - enum btrfs_map_op op, int mirror_num) + struct btrfs_io_geometry *io_geom) { if (!smap) return false; @@ -6370,10 +6370,10 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info, if (num_alloc_stripes != 1) return false; - if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ) + if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ) return false; - if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1) return false; return true; @@ -6648,8 +6648,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * physical block information on the stack instead of allocating an * I/O context structure. */ - if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, - io_geom.mirror_num)) { + if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) { ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom); if (mirror_num_ret) *mirror_num_ret = io_geom.mirror_num; -- cgit v1.2.3 From b815a78e17b9dd90398561ec7d91891d95f25301 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 16 Dec 2024 11:26:35 +0000 Subject: btrfs: move abort_should_print_stack() to transaction.h The function abort_should_print_stack() is declared in transaction.h but its definition is in ctree.c, which doesn't make sense since ctree.c is the btree implementation and the function is related to the transaction code. Move its definition into transaction.h as an inline function since it's a very short and trivial function, and also add the 'btrfs_' prefix into its name. This change also reduces the module size. Before this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1783148 161137 16920 1961205 1decf5 fs/btrfs/btrfs.ko After this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1782126 161045 16920 1960091 1de89b fs/btrfs/btrfs.ko Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 16 ---------------- fs/btrfs/transaction.h | 18 ++++++++++++++++-- 2 files changed, 16 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 185985a337b3..99a58ede387e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -225,22 +225,6 @@ noinline void btrfs_release_path(struct btrfs_path *p) } } -/* - * We want the transaction abort to print stack trace only for errors where the - * cause could be a bug, eg. due to ENOSPC, and not for common errors that are - * caused by external factors. - */ -bool __cold abort_should_print_stack(int error) -{ - switch (error) { - case -EIO: - case -EROFS: - case -ENOMEM: - return false; - } - return true; -} - /* * safely gets a reference on the root node of a tree. 
A lock * is not taken, so a concurrent writer may put a different node diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 184fa5c0062a..9f7c777af635 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -227,7 +227,21 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } -bool __cold abort_should_print_stack(int error); +/* + * We want the transaction abort to print stack trace only for errors where the + * cause could be a bug, eg. due to ENOSPC, and not for common errors that are + * caused by external factors. + */ +static inline bool btrfs_abort_should_print_stack(int error) +{ + switch (error) { + case -EIO: + case -EROFS: + case -ENOMEM: + return false; + } + return true; +} /* * Call btrfs_abort_transaction as early as possible when an error condition is @@ -240,7 +254,7 @@ do { \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ __first = true; \ - if (WARN(abort_should_print_stack(error), \ + if (WARN(btrfs_abort_should_print_stack(error), \ KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ (error))) { \ -- cgit v1.2.3 From a6f0bcf9b190219fa2686247dfc99a44e597aa11 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 16 Dec 2024 11:38:30 +0000 Subject: btrfs: move csum related functions from ctree.c into fs.c The ctree module is about the implementation of the btree data structure and not a place holder for generic filesystem things like the csum algorithm details. Move the functions related to the csum algorithm details away from ctree.c and into fs.c, which is a far better place for them. Also fix missing punctuation in comments and change one multiline comment to a single line comment since everything fits in under 80 characters. For some reason this also slightly reduces the module's size. Before this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1782126 161045 16920 1960091 1de89b fs/btrfs/btrfs.ko After this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1782094 161045 16920 1960059 1de87b fs/btrfs/btrfs.ko Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 51 --------------------------------------------------- fs/btrfs/ctree.h | 6 ------ fs/btrfs/fs.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/fs.h | 6 ++++++ 4 files changed, 55 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 99a58ede387e..c93f52a30a16 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,19 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); - -static const struct btrfs_csums { - u16 size; - const char name[10]; - const char driver[12]; -} btrfs_csums[] = { - [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, - [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, - [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, - [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", - .driver = "blake2b-256" }, -}; - /* * The leaf data grows from end-to-front in the node. this returns the address * of the start of the last item, which is the stop of the leaf data stack. 
@@ -148,44 +135,6 @@ static inline void copy_leaf_items(const struct extent_buffer *dst, nr_items * sizeof(struct btrfs_item)); } -/* This exists for btrfs-progs usages. */ -u16 btrfs_csum_type_size(u16 type) -{ - return btrfs_csums[type].size; -} - -int btrfs_super_csum_size(const struct btrfs_super_block *s) -{ - u16 t = btrfs_super_csum_type(s); - /* - * csum type is validated at mount time - */ - return btrfs_csum_type_size(t); -} - -const char *btrfs_super_csum_name(u16 csum_type) -{ - /* csum type is validated at mount time */ - return btrfs_csums[csum_type].name; -} - -/* - * Return driver name if defined, otherwise the name that's also a valid driver - * name - */ -const char *btrfs_super_csum_driver(u16 csum_type) -{ - /* csum type is validated at mount time */ - return btrfs_csums[csum_type].driver[0] ? - btrfs_csums[csum_type].driver : - btrfs_csums[csum_type].name; -} - -size_t __attribute_const__ btrfs_get_num_csums(void) -{ - return ARRAY_SIZE(btrfs_csums); -} - struct btrfs_path *btrfs_alloc_path(void) { might_sleep(); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2c341956a01c..a1bab0b3f193 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -756,12 +756,6 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } -u16 btrfs_csum_type_size(u16 type); -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - /* * We use folio flag owner_2 to indicate there is an ordered extent with * unfinished IO. diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 31c1648bc0b4..3756a3b9c9da 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -5,6 +5,55 @@ #include "fs.h" #include "accessors.h" +static const struct btrfs_csums { + u16 size; + const char name[10]; + const char driver[12]; +} btrfs_csums[] = { + [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, + [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, + [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, + [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", + .driver = "blake2b-256" }, +}; + +/* This exists for btrfs-progs usages. */ +u16 btrfs_csum_type_size(u16 type) +{ + return btrfs_csums[type].size; +} + +int btrfs_super_csum_size(const struct btrfs_super_block *s) +{ + u16 t = btrfs_super_csum_type(s); + + /* csum type is validated at mount time. */ + return btrfs_csum_type_size(t); +} + +const char *btrfs_super_csum_name(u16 csum_type) +{ + /* csum type is validated at mount time. */ + return btrfs_csums[csum_type].name; +} + +/* + * Return driver name if defined, otherwise the name that's also a valid driver + * name. + */ +const char *btrfs_super_csum_driver(u16 csum_type) +{ + /* csum type is validated at mount time */ + return btrfs_csums[csum_type].driver[0] ? 
+ btrfs_csums[csum_type].driver : + btrfs_csums[csum_type].name; +} + +size_t __attribute_const__ btrfs_get_num_csums(void) +{ + return ARRAY_SIZE(btrfs_csums); +} + void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name) { diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 79a1a3d6f04d..b05f2af97140 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -982,6 +982,12 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args); +u16 btrfs_csum_type_size(u16 type); +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); -- cgit v1.2.3 From 0b93369104ac5f65721793e038cafa4b3e58fdba Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 16 Dec 2024 12:10:19 +0000 Subject: btrfs: move the exclusive operation functions into fs.c The declarations for the exclusive operation functions are located at fs.h but their definitions are in ioctl.c, which doesn't make much sense since (most of them) are used in several files other than ioctl.c. Since they are used in several files and they are generic enough, move them out of ioctl.c and into fs.c, even the ones that are currently only used at ioctl.c, for the sake of having them all in the same C file. This also reduces the module's size. Before this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1782094 161045 16920 1960059 1de87b fs/btrfs/btrfs.ko After this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1781492 161037 16920 1959449 1de619 fs/btrfs/btrfs.ko Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.c | 80 ------------------------------------------------------- 2 files changed, 81 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 3756a3b9c9da..09cfb43580cb 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -4,6 +4,7 @@ #include "ctree.h" #include "fs.h" #include "accessors.h" +#include "volumes.h" static const struct btrfs_csums { u16 size; @@ -54,6 +55,86 @@ size_t __attribute_const__ btrfs_get_num_csums(void) return ARRAY_SIZE(btrfs_csums); } +/* + * Start exclusive operation @type, return true on success. + */ +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + bool ret = false; + + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { + fs_info->exclusive_operation = type; + ret = true; + } + spin_unlock(&fs_info->super_lock); + + return ret; +} + +/* + * Conditionally allow to enter the exclusive operation in case it's compatible + * with the running one. This must be paired with btrfs_exclop_start_unlock() + * and btrfs_exclop_finish(). 
+ * + * Compatibility: + * - the same type is already running + * - when trying to add a device and balance has been paused + * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller + * must check the condition first that would allow none -> @type + */ +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == type || + (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && + type == BTRFS_EXCLOP_DEV_ADD)) + return true; + + spin_unlock(&fs_info->super_lock); + return false; +} + +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) +{ + spin_unlock(&fs_info->super_lock); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ + spin_lock(&fs_info->super_lock); + WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + spin_unlock(&fs_info->super_lock); + sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); +} + +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op) +{ + switch (op) { + case BTRFS_EXCLOP_BALANCE_PAUSED: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || + fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || + fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || + fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; + spin_unlock(&fs_info->super_lock); + break; + case BTRFS_EXCLOP_BALANCE: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); + break; + default: + btrfs_warn(fs_info, + "invalid exclop balance operation %d requested", op); + } +} + void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index baecb19404ad..243d08f37c58 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -403,86 +403,6 @@ update_flags: return ret; } -/* - * Start exclusive operation @type, return true on success - */ -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type) -{ - bool ret = false; - - spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { - fs_info->exclusive_operation = type; - ret = true; - } - spin_unlock(&fs_info->super_lock); - - return ret; -} - -/* - * Conditionally allow to enter the exclusive operation in case it's compatible - * with the running one. This must be paired with btrfs_exclop_start_unlock and - * btrfs_exclop_finish. 
- * - * Compatibility: - * - the same type is already running - * - when trying to add a device and balance has been paused - * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller - * must check the condition first that would allow none -> @type - */ -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type) -{ - spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == type || - (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && - type == BTRFS_EXCLOP_DEV_ADD)) - return true; - - spin_unlock(&fs_info->super_lock); - return false; -} - -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) -{ - spin_unlock(&fs_info->super_lock); -} - -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) -{ - spin_lock(&fs_info->super_lock); - WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); - spin_unlock(&fs_info->super_lock); - sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); -} - -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation op) -{ - switch (op) { - case BTRFS_EXCLOP_BALANCE_PAUSED: - spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || - fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || - fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || - fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); - fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; - spin_unlock(&fs_info->super_lock); - break; - case BTRFS_EXCLOP_BALANCE: - spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); - fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; - spin_unlock(&fs_info->super_lock); - break; - default: - btrfs_warn(fs_info, - "invalid exclop balance operation %d requested", op); - } -} - static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) { return put_user(inode->i_generation, arg); -- cgit v1.2.3 From a5b3f117daead61c3c9c88cd1159d38fa4ad1362 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 16 Dec 2024 12:27:07 +0000 Subject: btrfs: move btrfs_is_empty_uuid() from ioctl.c into fs.c It's a generic helper not specific to ioctls and used in several places, so move it out from ioctl.c and into fs.c. While at it change its return type from int to bool and declare the loop variable in the loop itself. This also slightly reduces the module's size. Before this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1781492 161037 16920 1959449 1de619 fs/btrfs/btrfs.ko After this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1781340 161037 16920 1959297 1de581 fs/btrfs/btrfs.ko Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.c | 9 +++++++++ fs/btrfs/fs.h | 2 ++ fs/btrfs/ioctl.c | 11 ----------- fs/btrfs/ioctl.h | 1 - 4 files changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 09cfb43580cb..06a863252a85 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -55,6 +55,15 @@ size_t __attribute_const__ btrfs_get_num_csums(void) return ARRAY_SIZE(btrfs_csums); } +bool __pure btrfs_is_empty_uuid(const u8 *uuid) +{ + for (int i = 0; i < BTRFS_UUID_SIZE; i++) { + if (uuid[i] != 0) + return false; + } + return true; +} + /* * Start exclusive operation @type, return true on success. 
*/ diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index b05f2af97140..15c26c6f4d6e 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -988,6 +988,8 @@ const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); size_t __attribute_const__ btrfs_get_num_csums(void); +bool __pure btrfs_is_empty_uuid(const u8 *uuid); + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 243d08f37c58..415b20801d78 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -471,17 +471,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, return ret; } -int __pure btrfs_is_empty_uuid(const u8 *uuid) -{ - int i; - - for (i = 0; i < BTRFS_UUID_SIZE; i++) { - if (uuid[i]) - return 0; - } - return 1; -} - /* * Calculate the number of transaction items to reserve for creating a subvolume * or snapshot, not including the inode, directory entries, or parent directory. diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 2b760c8778f8..ce915fcda43b 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -19,7 +19,6 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int __pure btrfs_is_empty_uuid(const u8 *uuid); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); -- cgit v1.2.3 From 2205302298af2036e9c164fca025ba7a1ab2c816 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 16 Dec 2024 12:58:09 +0000 Subject: btrfs: move the folio ordered helpers from ctree.h into fs.h The folio ordered helper macros are defined at ctree.h but this is not the best place since ctree.{h,c} is all about the btree data structure implementation and not a generic module. So move these macros into the fs.h header. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 8 -------- fs/btrfs/fs.h | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a1bab0b3f193..3d9855d30057 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -756,12 +756,4 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } -/* - * We use folio flag owner_2 to indicate there is an ordered extent with - * unfinished IO. - */ -#define folio_test_ordered(folio) folio_test_owner_2(folio) -#define folio_set_ordered(folio) folio_set_owner_2(folio) -#define folio_clear_ordered(folio) folio_clear_owner_2(folio) - #endif diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 15c26c6f4d6e..7a27f5fe9bc2 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -1066,6 +1066,14 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) +/* + * We use folio flag owner_2 to indicate there is an ordered extent with + * unfinished IO. 
+ */ +#define folio_test_ordered(folio) folio_test_owner_2(folio) +#define folio_set_ordered(folio) folio_set_owner_2(folio) +#define folio_clear_ordered(folio) folio_clear_owner_2(folio) + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS #define EXPORT_FOR_TESTS -- cgit v1.2.3 From a4545b74e2de98989c1d14bc48c52c2f9fa734b6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 16 Dec 2024 16:18:35 +0000 Subject: btrfs: move BTRFS_BYTES_TO_BLKS() into fs.h Currently BTRFS_BYTES_TO_BLKS() is defined in ctree.h but it's not related at all to the btree data structure, so move it into fs.h. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- fs/btrfs/fs.h | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3d9855d30057..bf054470dcd0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -506,9 +506,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ - ((bytes) >> (fs_info)->sectorsize_bits) - static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 7a27f5fe9bc2..dd1a82297d4c 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -953,6 +953,8 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) +#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) ((bytes) >> (fs_info)->sectorsize_bits) + static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0; -- cgit v1.2.3 From 378f25d3fc429ad001fa686f615df5c0f9cd47e1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 16 Dec 2024 16:22:54 +0000 Subject: btrfs: move btrfs_alloc_write_mask() into fs.h Currently btrfs_alloc_write_mask() is defined in ctree.h but it's not related at all to the btree data structure, so move it into fs.h. 
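For context (not part of the patch): the helper only constrains the GFP flags used for page cache allocations in the write path so that memory reclaim cannot recurse back into the filesystem. Assuming mapping_gfp_constraint() keeps its usual meaning of masking the mapping's GFP flags, it is roughly equivalent to:

    static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
    {
            /* Drop __GFP_FS so allocations here cannot re-enter btrfs. */
            return mapping_gfp_mask(mapping) & ~__GFP_FS;
    }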
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ------ fs/btrfs/fs.h | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bf054470dcd0..53f9fc04f66f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -7,7 +7,6 @@ #define BTRFS_CTREE_H #include "linux/cleanup.h" -#include #include #include #include @@ -506,11 +505,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) -{ - return mapping_gfp_constraint(mapping, ~__GFP_FS); -} - void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index dd1a82297d4c..1113646374f3 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -887,6 +888,11 @@ struct btrfs_fs_info { #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ struct inode *: (_inode)))->root->fs_info) +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ + return mapping_gfp_constraint(mapping, ~__GFP_FS); +} + static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); -- cgit v1.2.3 From 07174a34295767389383fd4e27da2a41ebb6966e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 16 Dec 2024 16:29:45 +0000 Subject: btrfs: move extent-tree function declarations out of ctree.h We have 3 functions that have their prototypes declared in ctree.h but they are defined at extent-tree.c and they are unrelated to the btree data structure. Move the prototypes out of ctree.h and into extent-tree.h. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 5 ----- fs/btrfs/extent-tree.h | 4 ++++ fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/volumes.c | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 53f9fc04f66f..cdf10cca8194 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -505,11 +505,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); -int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 *actual_bytes); -int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); - /* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 46b8e19022df..cfa52264f678 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -162,5 +162,9 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); +int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 *actual_bytes); +int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); #endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cfa52ef40b06..17707c898eae 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -12,7 +12,7 @@ #include #include #include -#include "ctree.h" +#include "extent-tree.h" #include "fs.h" #include "messages.h" #include "misc.h" diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fcd80ba9dd42..d32913c51d69 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -13,8 +13,8 @@ #include #include #include "misc.h" -#include "ctree.h" #include "disk-io.h" +#include "extent-tree.h" #include "transaction.h" #include "volumes.h" #include "raid56.h" -- cgit v1.2.3 From de9c8265b763f98b4b8f7f13acf4888285ac5a47 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 16 Dec 2024 16:36:19 +0000 Subject: btrfs: remove pointless comment from ctree.h It's pointless to have a comment above the prototype declarations of btrfs_ctree_init() and btrfs_ctree_exit() mentioning that they are declared in ctree.c. This is from the old days when ctree.h was used to place anything that didn't fit in any other file. So remove it. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index cdf10cca8194..1096a80a64e7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -505,7 +505,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -/* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); -- cgit v1.2.3 From 6a2b3d7a36df2c7a7ad3a8ef00bc4ea194221c02 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 17 Dec 2024 12:00:39 +0000 Subject: btrfs: use uuid_is_null() to verify if an uuid is empty At btrfs_is_empty_uuid() we have our custom code to check if an uuid is empty, however there a kernel uuid library that has a function named uuid_is_null() which does the same and probably more efficient. So change btrfs_is_empty_uuid() to use uuid_is_null(), which is almost a directly replacement, it just wraps the necessary casting since our uuid types are u8 arrays while the uuid kernel library uses the uuid_t type, which is just a typedef of an u8 array of 16 elements as well. Also since the function is now to trivial, make it a static inline function in fs.h. Suggested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.c | 9 --------- fs/btrfs/fs.h | 5 ++++- 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 06a863252a85..09cfb43580cb 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -55,15 +55,6 @@ size_t __attribute_const__ btrfs_get_num_csums(void) return ARRAY_SIZE(btrfs_csums); } -bool __pure btrfs_is_empty_uuid(const u8 *uuid) -{ - for (int i = 0; i < BTRFS_UUID_SIZE; i++) { - if (uuid[i] != 0) - return false; - } - return true; -} - /* * Start exclusive operation @type, return true on success. */ diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 1113646374f3..58e6b4b953f1 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -996,7 +996,10 @@ const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); size_t __attribute_const__ btrfs_get_num_csums(void); -bool __pure btrfs_is_empty_uuid(const u8 *uuid); +static inline bool btrfs_is_empty_uuid(const u8 *uuid) +{ + return uuid_is_null((const uuid_t *)uuid); +} /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, -- cgit v1.2.3 From 882af9f13e830c0a4ef696bb72cd5998a5067a93 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 19 Dec 2024 15:04:08 +1030 Subject: btrfs: handle free space tree rebuild in multiple transactions During free space tree rebuild, we're holding a transaction handle for the whole rebuild process. This can lead to blocked task warning, e.g. btrfs-transaction kthread (which is already created before btrfs_start_pre_rw_mount()) can be waked up to join and commit the current transaction. But the free space tree rebuild process may need to go through thousands block groups, this will block btrfs-transaction kthread for a long time. Fix the problem by calling btrfs_should_end_transaction() after each block group, so that we won't hold the transaction handle too long. 
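Roughly, the resulting loop looks like the following (simplified sketch, not the exact code in btrfs_rebuild_free_space_tree(); error handling trimmed):

    while (node) {
        block_group = rb_entry(node, struct btrfs_block_group, cache_node);
        ret = populate_free_space_tree(trans, block_group);
        if (ret) {
            btrfs_end_transaction(trans);
            return ret;
        }
        /*
         * Don't hold one handle across thousands of block groups; end it
         * here so the transaction kthread can join and commit in between.
         */
        if (btrfs_should_end_transaction(trans)) {
            btrfs_end_transaction(trans);
            trans = btrfs_start_transaction(free_space_root, 1);
            if (IS_ERR(trans))
                return PTR_ERR(trans);
        }
        node = rb_next(node);
    }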
And since the free-space-tree rebuild can be split into multiple transactions, we need to consider the safety when the rebuild process is interrupted. Thankfully since we only set the FREE_SPACE_TREE compat_ro flag without FREE_SPACE_TREE_VALID flag, even if the rebuild is interrupted, on the next RW mount, we will still go rebuild the free space tree, by deleting any items we have and re-starting the rebuild from scratch. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 7ba50e133921..2400fa5a5be4 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1350,6 +1350,12 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_end_transaction(trans); return ret; } + if (btrfs_should_end_transaction(trans)) { + btrfs_end_transaction(trans); + trans = btrfs_start_transaction(free_space_root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + } node = rb_next(node); } -- cgit v1.2.3 From 57e421867b7aa60783dac3d4caf97af74263f5f5 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 17 Dec 2024 08:05:43 +0100 Subject: btrfs: don't include linux/rwlock_types.h directly The header clearly states that it does not want to be included directly, only via linux/spinlock_types.h. Drop this as we can simply use the spinlock.h which is already included. Signed-off-by: Wolfram Sang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 58e6b4b953f1..be8c32d1a7bb 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 372484f2c27c5ebd2b37d6c7d3f92678fc8c07af Mon Sep 17 00:00:00 2001 From: "Roger L. Beckermeyer III" Date: Wed, 18 Dec 2024 08:28:51 +1030 Subject: btrfs: update btrfs_add_block_group_cache() to use rb helper Update fs/btrfs/block-group.c to use rb_find_add_cached(). Suggested-by: Josef Bacik Signed-off-by: Roger L. 
Beckermeyer III Reviewed-by: Qu Wenruo Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5be029734cfa..39881d66cfa0 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -173,43 +173,41 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) } } +static int btrfs_bg_start_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_block_group *new_bg = + rb_entry(new, struct btrfs_block_group, cache_node); + const struct btrfs_block_group *exist_bg = + rb_entry(exist, struct btrfs_block_group, cache_node); + + if (new_bg->start < exist_bg->start) + return -1; + if (new_bg->start > exist_bg->start) + return 1; + return 0; +} + /* * This adds the block group to the fs_info rb tree for the block group cache */ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, struct btrfs_block_group *block_group) { - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_block_group *cache; - bool leftmost = true; + struct rb_node *exist; + int ret = 0; ASSERT(block_group->length != 0); write_lock(&info->block_group_cache_lock); - p = &info->block_group_cache_tree.rb_root.rb_node; - - while (*p) { - parent = *p; - cache = rb_entry(parent, struct btrfs_block_group, cache_node); - if (block_group->start < cache->start) { - p = &(*p)->rb_left; - } else if (block_group->start > cache->start) { - p = &(*p)->rb_right; - leftmost = false; - } else { - write_unlock(&info->block_group_cache_lock); - return -EEXIST; - } - } - - rb_link_node(&block_group->cache_node, parent, p); - rb_insert_color_cached(&block_group->cache_node, - &info->block_group_cache_tree, leftmost); + exist = rb_find_add_cached(&block_group->cache_node, + &info->block_group_cache_tree, btrfs_bg_start_cmp); + if (exist) + ret = -EEXIST; write_unlock(&info->block_group_cache_lock); - return 0; + return ret; } /* -- cgit v1.2.3 From 14ae60c71221d3ec64482476262e5b9eb4494be4 Mon Sep 17 00:00:00 2001 From: "Roger L. Beckermeyer III" Date: Wed, 18 Dec 2024 08:28:52 +1030 Subject: btrfs: update prelim_ref_insert() to use rb helpers Update prelim_ref_insert() to use rb_find_add_cached(). There is a special change that the existing prelim_ref_compare() is called with the first parameter as the existing ref in the rbtree. But the newer rb_find_add_cached() expects the cmp() function to have the first parameter as the to-be-added node, thus the new helper prelim_ref_rb_add_cmp() need to adapt this new order. Signed-off-by: Roger L. 
Beckermeyer III Reviewed-by: Qu Wenruo Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 79 +++++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 6d9f39c1d89c..3d3923cfc357 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -250,6 +250,21 @@ static int prelim_ref_compare(const struct prelim_ref *ref1, return 0; } +static int prelim_ref_rb_add_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct prelim_ref *ref_new = + rb_entry(new, struct prelim_ref, rbnode); + const struct prelim_ref *ref_exist = + rb_entry(exist, struct prelim_ref, rbnode); + + /* + * prelim_ref_compare() expects the first parameter as the existing one, + * different from the rb_find_add_cached() order. + */ + return prelim_ref_compare(ref_exist, ref_new); +} + static void update_share_count(struct share_check *sc, int oldcount, int newcount, const struct prelim_ref *newref) { @@ -278,55 +293,39 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, struct share_check *sc) { struct rb_root_cached *root; - struct rb_node **p; - struct rb_node *parent = NULL; - struct prelim_ref *ref; - int result; - bool leftmost = true; + struct rb_node *exist; root = &preftree->root; - p = &root->rb_root.rb_node; - - while (*p) { - parent = *p; - ref = rb_entry(parent, struct prelim_ref, rbnode); - result = prelim_ref_compare(ref, newref); - if (result < 0) { - p = &(*p)->rb_left; - } else if (result > 0) { - p = &(*p)->rb_right; - leftmost = false; - } else { - /* Identical refs, merge them and free @newref */ - struct extent_inode_elem *eie = ref->inode_list; + exist = rb_find_add_cached(&newref->rbnode, root, prelim_ref_rb_add_cmp); + if (exist) { + struct prelim_ref *ref = rb_entry(exist, struct prelim_ref, rbnode); + /* Identical refs, merge them and free @newref */ + struct extent_inode_elem *eie = ref->inode_list; - while (eie && eie->next) - eie = eie->next; + while (eie && eie->next) + eie = eie->next; - if (!eie) - ref->inode_list = newref->inode_list; - else - eie->next = newref->inode_list; - trace_btrfs_prelim_ref_merge(fs_info, ref, newref, - preftree->count); - /* - * A delayed ref can have newref->count < 0. - * The ref->count is updated to follow any - * BTRFS_[ADD|DROP]_DELAYED_REF actions. - */ - update_share_count(sc, ref->count, - ref->count + newref->count, newref); - ref->count += newref->count; - free_pref(newref); - return; - } + if (!eie) + ref->inode_list = newref->inode_list; + else + eie->next = newref->inode_list; + trace_btrfs_prelim_ref_merge(fs_info, ref, newref, + preftree->count); + /* + * A delayed ref can have newref->count < 0. + * The ref->count is updated to follow any + * BTRFS_[ADD|DROP]_DELAYED_REF actions. + */ + update_share_count(sc, ref->count, + ref->count + newref->count, newref); + ref->count += newref->count; + free_pref(newref); + return; } update_share_count(sc, 0, newref->count, newref); preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); - rb_link_node(&newref->rbnode, parent, p); - rb_insert_color_cached(&newref->rbnode, root, leftmost); } /* -- cgit v1.2.3 From 0877597dc347169bed19e3e8282bbdce2593d16f Mon Sep 17 00:00:00 2001 From: "Roger L. 
Beckermeyer III" Date: Wed, 18 Dec 2024 08:28:53 +1030 Subject: btrfs: update __btrfs_add_delayed_item() to use rb helper Update __btrfs_add_delayed_item() to use rb_find_add_cached(). Signed-off-by: Roger L. Beckermeyer III Reviewed-by: Qu Wenruo Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 508bdbae29a0..60a6866a6cd9 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -366,40 +366,35 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( return NULL; } +static int btrfs_delayed_item_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_delayed_item *new_item = + rb_entry(new, struct btrfs_delayed_item, rb_node); + const struct btrfs_delayed_item *exist_item = + rb_entry(exist, struct btrfs_delayed_item, rb_node); + + if (new_item->index < exist_item->index) + return -1; + if (new_item->index > exist_item->index) + return 1; + return 0; +} + static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, struct btrfs_delayed_item *ins) { - struct rb_node **p, *node; - struct rb_node *parent_node = NULL; struct rb_root_cached *root; - struct btrfs_delayed_item *item; - bool leftmost = true; + struct rb_node *exist; if (ins->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; else root = &delayed_node->del_root; - p = &root->rb_root.rb_node; - node = &ins->rb_node; - - while (*p) { - parent_node = *p; - item = rb_entry(parent_node, struct btrfs_delayed_item, - rb_node); - - if (item->index < ins->index) { - p = &(*p)->rb_right; - leftmost = false; - } else if (item->index > ins->index) { - p = &(*p)->rb_left; - } else { - return -EEXIST; - } - } - - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); + exist = rb_find_add_cached(&ins->rb_node, root, btrfs_delayed_item_cmp); + if (exist) + return -EEXIST; if (ins->type == BTRFS_DELAYED_INSERTION_ITEM && ins->index >= delayed_node->index_cnt) -- cgit v1.2.3 From 287373c701e6d0ba93a7f21e979ed1bc25a02016 Mon Sep 17 00:00:00 2001 From: "Roger L. Beckermeyer III" Date: Wed, 18 Dec 2024 08:28:54 +1030 Subject: btrfs: update btrfs_add_chunk_map() to use rb helpers Update btrfs_add_chunk_map() to use rb_find_add_cached(). Signed-off-by: Roger L. 
Beckermeyer III Reviewed-by: Qu Wenruo Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d32913c51d69..c8b079ad1dfa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5514,33 +5514,34 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma btrfs_free_chunk_map(map); } +static int btrfs_chunk_map_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_chunk_map *new_map = + rb_entry(new, struct btrfs_chunk_map, rb_node); + const struct btrfs_chunk_map *exist_map = + rb_entry(exist, struct btrfs_chunk_map, rb_node); + + if (new_map->start == exist_map->start) + return 0; + if (new_map->start < exist_map->start) + return -1; + return 1; +} + EXPORT_FOR_TESTS int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) { - struct rb_node **p; - struct rb_node *parent = NULL; - bool leftmost = true; + struct rb_node *exist; write_lock(&fs_info->mapping_tree_lock); - p = &fs_info->mapping_tree.rb_root.rb_node; - while (*p) { - struct btrfs_chunk_map *entry; - - parent = *p; - entry = rb_entry(parent, struct btrfs_chunk_map, rb_node); - - if (map->start < entry->start) { - p = &(*p)->rb_left; - } else if (map->start > entry->start) { - p = &(*p)->rb_right; - leftmost = false; - } else { - write_unlock(&fs_info->mapping_tree_lock); - return -EEXIST; - } + exist = rb_find_add_cached(&map->rb_node, &fs_info->mapping_tree, + btrfs_chunk_map_cmp); + + if (exist) { + write_unlock(&fs_info->mapping_tree_lock); + return -EEXIST; } - rb_link_node(&map->rb_node, parent, p); - rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost); chunk_map_device_set_bits(map, CHUNK_ALLOCATED); chunk_map_device_clear_bits(map, CHUNK_TRIMMED); write_unlock(&fs_info->mapping_tree_lock); -- cgit v1.2.3 From 4e4d058e21294d8062bab0285ed456f711793990 Mon Sep 17 00:00:00 2001 From: "Roger L. Beckermeyer III" Date: Wed, 18 Dec 2024 08:28:55 +1030 Subject: btrfs: update tree_insert() to use rb helpers Update tree_insert() to use rb_find_add_cached(). add cmp_refs_node in rb_find_add_cached() to compare. Since we're here, also make comp_data_refs() and comp_refs() accept both parameters as const. Signed-off-by: Roger L. 
Beckermeyer III Reviewed-by: Qu Wenruo Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 30f7079fa28e..98c5b61dabe8 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -268,8 +268,8 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, /* * compare two delayed data backrefs with same bytenr and type */ -static int comp_data_refs(struct btrfs_delayed_ref_node *ref1, - struct btrfs_delayed_ref_node *ref2) +static int comp_data_refs(const struct btrfs_delayed_ref_node *ref1, + const struct btrfs_delayed_ref_node *ref2) { if (ref1->data_ref.objectid < ref2->data_ref.objectid) return -1; @@ -282,8 +282,8 @@ static int comp_data_refs(struct btrfs_delayed_ref_node *ref1, return 0; } -static int comp_refs(struct btrfs_delayed_ref_node *ref1, - struct btrfs_delayed_ref_node *ref2, +static int comp_refs(const struct btrfs_delayed_ref_node *ref1, + const struct btrfs_delayed_ref_node *ref2, bool check_seq) { int ret = 0; @@ -317,34 +317,25 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, return 0; } +static int cmp_refs_node(const struct rb_node *new, const struct rb_node *exist) +{ + const struct btrfs_delayed_ref_node *new_node = + rb_entry(new, struct btrfs_delayed_ref_node, ref_node); + const struct btrfs_delayed_ref_node *exist_node = + rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); + + return comp_refs(new_node, exist_node, true); +} + static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { - struct rb_node **p = &root->rb_root.rb_node; struct rb_node *node = &ins->ref_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - bool leftmost = true; - - while (*p) { - int comp; - - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - ref_node); - comp = comp_refs(ins, entry, true); - if (comp < 0) { - p = &(*p)->rb_left; - } else if (comp > 0) { - p = &(*p)->rb_right; - leftmost = false; - } else { - return entry; - } - } + struct rb_node *exist; - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); + exist = rb_find_add_cached(node, root, cmp_refs_node); + if (exist) + return rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); return NULL; } -- cgit v1.2.3 From 2a9bb78cfd367fdeff74f15b1e98969912292d9e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 21 Dec 2024 16:15:19 +1030 Subject: btrfs: validate system chunk array at btrfs_validate_super() Currently btrfs_validate_super() only does a very basic check on the array chunk size (not too large than the available space, but not too small to contain no chunk). The more comprehensive checks (the regular chunk checks and size check inside the system chunk array) are all done inside btrfs_read_sys_array(). It's not a big deal, but it also means we do not do any validation on the system chunk array at super block writeback time either. Do the following modification to centralize the system chunk array checks into btrfs_validate_super(): - Make chunk_err() helper accept stack chunk pointer If @leaf parameter is NULL, then the @chunk pointer will be a pointer to the chunk item, other than the offset inside the leaf. And since @leaf can be NULL, add a new @fs_info parameter for that case. 
- Make btrfs_check_chunk_valid() handle stack chunk pointer The same as chunk_err(), a new @fs_info parameter, and if @leaf is NULL, then @chunk will be a pointer to a stack chunk. If @chunk is NULL, then all needed btrfs_chunk members will be read using the stack helper instead of the leaf helper. This means we need to read out all the needed member at the beginning of the function. Furthermore, at super block read time, fs_info->sectorsize is not yet initialized, we need one extra @sectorsize parameter to grab the correct sectorsize. - Introduce a helper validate_sys_chunk_array() * Validate the disk key. * Validate the size before we access the full chunk items. * Do the full chunk item validation. - Call validate_sys_chunk_array() at btrfs_validate_super() - Simplify the checks inside btrfs_read_sys_array() Now the checks will be converted to an ASSERT(). - Simplify the checks inside read_one_chunk() Now that all chunk items inside system chunk array and chunk tree are verified, there is no need to verify them again inside read_one_chunk(). This change has the following advantages: - More comprehensive checks at write time And unlike the sys_chunk_array read routine, this time we do not need to allocate a dummy extent buffer to do the check. All the checks done here require no new memory allocation. - Slightly improved readability when iterating the system chunk array Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 67 ++++++++++++++++++++++++++++++++++ fs/btrfs/tree-checker.c | 96 +++++++++++++++++++++++++++---------------------- fs/btrfs/tree-checker.h | 7 ++-- fs/btrfs/volumes.c | 73 +++++++------------------------------ 4 files changed, 139 insertions(+), 104 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eff0dd1ae62f..04d68f253940 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2327,6 +2327,71 @@ out: return ret; } +static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *sb) +{ + unsigned int cur = 0; /* Offset inside the sys chunk array */ + /* + * At sb read time, fs_info is not fully initialized. Thus we have + * to use super block sectorsize, which should have been validated. 
+ */ + const u32 sectorsize = btrfs_super_sectorsize(sb); + u32 sys_array_size = btrfs_super_sys_array_size(sb); + + if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + btrfs_err(fs_info, "system chunk array too big %u > %u", + sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + return -EUCLEAN; + } + + while (cur < sys_array_size) { + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + struct btrfs_key key; + u64 type; + u16 num_stripes; + u32 len; + int ret; + + disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); + len = sizeof(*disk_key); + + if (cur + len > sys_array_size) + goto short_read; + cur += len; + + btrfs_disk_key_to_cpu(&key, disk_key); + if (key.type != BTRFS_CHUNK_ITEM_KEY) { + btrfs_err(fs_info, + "unexpected item type %u in sys_array at offset %u", + key.type, cur); + return -EUCLEAN; + } + chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size) + goto short_read; + type = btrfs_stack_chunk_type(chunk); + if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + btrfs_err(fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur); + return -EUCLEAN; + } + ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset, + sectorsize); + if (ret < 0) + return ret; + cur += btrfs_chunk_item_size(num_stripes); + } + return 0; +short_read: + btrfs_err(fs_info, + "super block sys chunk array short read, cur=%u sys_array_size=%u", + cur, sys_array_size); + return -EUCLEAN; +} + /* * Real super block validation * NOTE: super csum type and incompat features will not be checked here. @@ -2495,6 +2560,8 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } + ret = validate_sys_chunk_array(fs_info, sb); + /* * Obvious sys_chunk_array corruptions, it must hold at least one key * and one chunk diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index dfeee033f31f..43979891f7c8 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -764,22 +764,19 @@ static int check_block_group_item(struct extent_buffer *leaf, return 0; } -__printf(4, 5) +__printf(5, 6) __cold -static void chunk_err(const struct extent_buffer *leaf, +static void chunk_err(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, const struct btrfs_chunk *chunk, u64 logical, const char *fmt, ...) { - const struct btrfs_fs_info *fs_info = leaf->fs_info; - bool is_sb; + bool is_sb = !leaf; struct va_format vaf; va_list args; int i; int slot = -1; - /* Only superblock eb is able to have such small offset */ - is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET); - if (!is_sb) { /* * Get the slot number by iterating through all slots, this @@ -812,13 +809,17 @@ static void chunk_err(const struct extent_buffer *leaf, /* * The common chunk check which could also work on super block sys chunk array. * + * If @leaf is NULL, then @chunk must be an on-stack chunk item. + * (For superblock sys_chunk array, and fs_info->sectorsize is unreliable) + * * Return -EUCLEAN if anything is corrupted. * Return 0 if everything is OK. 
*/ -int btrfs_check_chunk_valid(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 logical) +int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + u32 sectorsize) { - struct btrfs_fs_info *fs_info = leaf->fs_info; u64 length; u64 chunk_end; u64 stripe_len; @@ -826,63 +827,73 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, u16 sub_stripes; u64 type; u64 features; + u32 chunk_sector_size; bool mixed = false; int raid_index; int nparity; int ncopies; - length = btrfs_chunk_length(leaf, chunk); - stripe_len = btrfs_chunk_stripe_len(leaf, chunk); - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); - type = btrfs_chunk_type(leaf, chunk); + if (leaf) { + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + chunk_sector_size = btrfs_chunk_sector_size(leaf, chunk); + } else { + length = btrfs_stack_chunk_length(chunk); + stripe_len = btrfs_stack_chunk_stripe_len(chunk); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + sub_stripes = btrfs_stack_chunk_sub_stripes(chunk); + type = btrfs_stack_chunk_type(chunk); + chunk_sector_size = btrfs_stack_chunk_sector_size(chunk); + } raid_index = btrfs_bg_flags_to_raid_index(type); ncopies = btrfs_raid_array[raid_index].ncopies; nparity = btrfs_raid_array[raid_index].nparity; if (unlikely(!num_stripes)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); return -EUCLEAN; } if (unlikely(num_stripes < ncopies)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes < ncopies, have %u < %d", num_stripes, ncopies); return -EUCLEAN; } if (unlikely(nparity && num_stripes == nparity)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes == nparity, have %u == %d", num_stripes, nparity); return -EUCLEAN; } - if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) { - chunk_err(leaf, chunk, logical, + if (unlikely(!IS_ALIGNED(logical, sectorsize))) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical, have %llu should aligned to %u", - logical, fs_info->sectorsize); + logical, sectorsize); return -EUCLEAN; } - if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) { - chunk_err(leaf, chunk, logical, + if (unlikely(chunk_sector_size != sectorsize)) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk sectorsize, have %u expect %u", - btrfs_chunk_sector_size(leaf, chunk), - fs_info->sectorsize); + chunk_sector_size, sectorsize); return -EUCLEAN; } - if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) { - chunk_err(leaf, chunk, logical, + if (unlikely(!length || !IS_ALIGNED(length, sectorsize))) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk length, have %llu", length); return -EUCLEAN; } if (unlikely(check_add_overflow(logical, length, &chunk_end))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical start and length, have logical start %llu length %llu", logical, length); return -EUCLEAN; } if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) { - chunk_err(leaf, chunk, logical, + 
chunk_err(fs_info, leaf, chunk, logical, "invalid chunk stripe length: %llu", stripe_len); return -EUCLEAN; @@ -896,30 +907,29 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, * Thus it should be a good way to catch obvious bitflips. */ if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "chunk length too large: have %llu limit %llu", length, btrfs_stripe_nr_to_offset(U32_MAX)); return -EUCLEAN; } if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "unrecognized chunk type: 0x%llx", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)); + BTRFS_BLOCK_GROUP_PROFILE_MASK) & type); return -EUCLEAN; } if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", type & BTRFS_BLOCK_GROUP_PROFILE_MASK); return -EUCLEAN; } if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", type, BTRFS_BLOCK_GROUP_TYPE_MASK); return -EUCLEAN; @@ -928,7 +938,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) && (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA)))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "system chunk with data or metadata type: 0x%llx", type); return -EUCLEAN; @@ -941,7 +951,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, if (!mixed) { if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) && (type & BTRFS_BLOCK_GROUP_DATA))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "mixed chunk type in non-mixed mode: 0x%llx", type); return -EUCLEAN; } @@ -963,7 +973,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, type & BTRFS_BLOCK_GROUP_PROFILE_MASK); @@ -983,14 +993,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, struct btrfs_chunk *chunk, struct btrfs_key *key, int slot) { + struct btrfs_fs_info *fs_info = leaf->fs_info; int num_stripes; if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { - chunk_err(leaf, chunk, key->offset, + chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect [%zu, %u)", btrfs_item_size(leaf, slot), sizeof(struct btrfs_chunk), - BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } @@ -1001,14 +1012,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, if (unlikely(btrfs_chunk_item_size(num_stripes) != btrfs_item_size(leaf, slot))) { - chunk_err(leaf, chunk, key->offset, + chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect %lu", btrfs_item_size(leaf, slot), btrfs_chunk_item_size(num_stripes)); return -EUCLEAN; } out: - return 
btrfs_check_chunk_valid(leaf, chunk, key->offset); + return btrfs_check_chunk_valid(fs_info, leaf, chunk, key->offset, + fs_info->sectorsize); } __printf(3, 4) diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index db67f96cbe4b..eb201f4ec3c7 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -10,6 +10,7 @@ #include struct extent_buffer; +struct btrfs_fs_info; struct btrfs_chunk; struct btrfs_key; @@ -66,8 +67,10 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node); int btrfs_check_leaf(struct extent_buffer *leaf); int btrfs_check_node(struct extent_buffer *node); -int btrfs_check_chunk_valid(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 logical); +int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + u32 sectorsize); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); int btrfs_verify_level_key(struct extent_buffer *eb, const struct btrfs_tree_parent_check *check); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c8b079ad1dfa..a58cf494b3d0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7004,16 +7004,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, warn_32bit_meta_chunk(fs_info, logical, length, type); #endif - /* - * Only need to verify chunk item if we're reading from sys chunk array, - * as chunk item in tree block is already verified by tree-checker. - */ - if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { - ret = btrfs_check_chunk_valid(leaf, chunk, logical); - if (ret) - return ret; - } - map = btrfs_find_chunk_map(fs_info, logical, 1); /* already mapped? */ @@ -7271,16 +7261,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) { struct btrfs_super_block *super_copy = fs_info->super_copy; struct extent_buffer *sb; - struct btrfs_disk_key *disk_key; - struct btrfs_chunk *chunk; u8 *array_ptr; unsigned long sb_array_offset; int ret = 0; - u32 num_stripes; u32 array_size; - u32 len = 0; u32 cur_offset; - u64 type; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); @@ -7303,10 +7288,15 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) cur_offset = 0; while (cur_offset < array_size) { - disk_key = (struct btrfs_disk_key *)array_ptr; - len = sizeof(*disk_key); - if (cur_offset + len > array_size) - goto out_short_read; + struct btrfs_chunk *chunk; + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)array_ptr; + u32 len = sizeof(*disk_key); + + /* + * The sys_chunk_array has been already verified at super block + * read time. Only do ASSERT()s for basic checks. 
+ */ + ASSERT(cur_offset + len <= array_size); btrfs_disk_key_to_cpu(&key, disk_key); @@ -7314,44 +7304,14 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) sb_array_offset += len; cur_offset += len; - if (key.type != BTRFS_CHUNK_ITEM_KEY) { - btrfs_err(fs_info, - "unexpected item type %u in sys_array at offset %u", - (u32)key.type, cur_offset); - ret = -EIO; - break; - } + ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY); chunk = (struct btrfs_chunk *)sb_array_offset; - /* - * At least one btrfs_chunk with one stripe must be present, - * exact stripe count check comes afterwards - */ - len = btrfs_chunk_item_size(1); - if (cur_offset + len > array_size) - goto out_short_read; - - num_stripes = btrfs_chunk_num_stripes(sb, chunk); - if (!num_stripes) { - btrfs_err(fs_info, - "invalid number of stripes %u in sys_array at offset %u", - num_stripes, cur_offset); - ret = -EIO; - break; - } + ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM); - type = btrfs_chunk_type(sb, chunk); - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { - btrfs_err(fs_info, - "invalid chunk type %llu in sys_array at offset %u", - type, cur_offset); - ret = -EIO; - break; - } + len = btrfs_chunk_item_size(btrfs_chunk_num_stripes(sb, chunk)); - len = btrfs_chunk_item_size(num_stripes); - if (cur_offset + len > array_size) - goto out_short_read; + ASSERT(cur_offset + len <= array_size); ret = read_one_chunk(&key, sb, chunk); if (ret) @@ -7364,13 +7324,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) clear_extent_buffer_uptodate(sb); free_extent_buffer_stale(sb); return ret; - -out_short_read: - btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", - len, cur_offset); - clear_extent_buffer_uptodate(sb); - free_extent_buffer_stale(sb); - return -EIO; } /* -- cgit v1.2.3 From 097a7eef61bd0366a822418b2ba074942eb00744 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 6 Jan 2025 12:09:50 +0000 Subject: btrfs: uncollapse transaction aborts during renames During renames we are grouping transaction aborts that can be due to a failure of one of several function calls. While this makes the code less verbose, it makes it harder to debug as we end up not knowing from which function call we got an error. So change this to trigger a transaction abort after each function call failure, so that when we get a transaction abort message we know exactly which function call failed, helping us to debug issues. 
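Schematically the change looks like this (illustrative sketch; 'dir', 'inode', 'name' and 'ctx' stand in for the real argument lists):

    /* Before: one grouped abort, the report cannot tell which call failed. */
    ret = __btrfs_unlink_inode(trans, dir, inode, name, &ctx);
    if (!ret)
        ret = btrfs_update_inode(trans, inode);
    if (ret) {
        btrfs_abort_transaction(trans, ret);
        goto out_fail;
    }

    /*
     * After: each failure aborts at its own call site, so the abort
     * message pinpoints exactly which function call failed.
     */
    ret = __btrfs_unlink_inode(trans, dir, inode, name, &ctx);
    if (ret) {
        btrfs_abort_transaction(trans, ret);
        goto out_fail;
    }
    ret = btrfs_update_inode(trans, inode);
    if (ret) {
        btrfs_abort_transaction(trans, ret);
        goto out_fail;
    }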
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 74 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8a173a24ac05..a7a3d879f2f2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8021,31 +8021,45 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), @@ -8281,16 +8295,23 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } if (new_inode) { @@ -8298,18 +8319,27 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } - if (!ret && new_inode->i_nlink == 0) + if (new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } } -- cgit v1.2.3 From 8787c36c630c3c732f071488aba3dc5fd3594c23 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: 
Wed, 18 Dec 2024 12:02:16 +0000 Subject: btrfs: tree-log: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c8d6587688b3..955d1677e865 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -590,7 +590,6 @@ insert: } } no_copy: - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -3588,7 +3587,6 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, last_offset = max(last_offset, curr_end); } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -4566,7 +4564,6 @@ copy_item: dst_index++; } - btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]); btrfs_release_path(dst_path); out: kfree(ins_data); @@ -4776,7 +4773,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, &fi, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(fi)); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); -- cgit v1.2.3 From 63eb2223872faebf4e9df3fe31f8e02fdcfa5d99 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:10:56 +0000 Subject: btrfs: free-space-tree: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something ike btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. 
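For reference, the pattern this series of cleanups relies on looks roughly like this (illustrative sketch, not a specific call site; 'key' and 'item_size' are placeholders):

    ret = btrfs_insert_empty_item(trans, root, path, &key, item_size);
    if (ret)
        goto out;
    leaf = path->nodes[0];
    /* ... write the item's fields into the leaf ... */

    /*
     * No btrfs_mark_buffer_dirty(trans, leaf) is needed here: the path was
     * set up by btrfs_search_slot() with cow == 1, so the leaf has already
     * been COWed and marked dirty for the running transaction.
     */
    btrfs_release_path(path);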
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 2400fa5a5be4..cae540ec15ed 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -89,7 +89,6 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_free_space_info); btrfs_set_free_space_extent_count(leaf, info, 0); btrfs_set_free_space_flags(leaf, info, 0); - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -287,7 +286,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, flags |= BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); if (extent_count != expected_extent_count) { @@ -324,7 +322,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, bitmap_cursor, ptr, data_size); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); i += extent_size; @@ -430,7 +427,6 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; @@ -495,7 +491,6 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, extent_count += new_extents; btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && -- cgit v1.2.3 From 1440fd2757ff63cf1d457bc640e51a685c4e4c5b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:19:10 +0000 Subject: btrfs: extent-tree: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3dfe651aeaa9..1cb1bd45f7ec 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -570,7 +570,6 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; fail: btrfs_release_path(path); @@ -618,7 +617,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); else if (key.type == BTRFS_SHARED_DATA_REF_KEY) btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); - btrfs_mark_buffer_dirty(trans, leaf); } return ret; } @@ -1050,7 +1048,6 @@ void setup_inline_extent_backref(struct btrfs_trans_handle *trans, } else { btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_mark_buffer_dirty(trans, leaf); } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1195,7 +1192,6 @@ static noinline_for_stack int update_inline_extent_backref( item_size -= size; btrfs_truncate_item(trans, path, item_size, 1); } - btrfs_mark_buffer_dirty(trans, leaf); return 0; } @@ -1527,7 +1523,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* now insert the actual backref */ @@ -1711,8 +1706,6 @@ again: ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -3267,7 +3260,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } else { btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(trans, leaf); } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, @@ -4835,7 +4827,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_free_path(path); return alloc_reserved_extent(trans, ins->objectid, ins->offset); @@ -4910,7 +4901,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); -- cgit v1.2.3 From ca9d907645d361519a2719573c0174517655458a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:21:15 +0000 Subject: btrfs: block-group: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 39881d66cfa0..c0a8f7d92acc 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2668,7 +2668,6 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); btrfs_set_dev_extent_length(leaf, extent, num_bytes); - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -3118,7 +3117,6 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); - btrfs_mark_buffer_dirty(trans, leaf); fail: btrfs_release_path(path); /* -- cgit v1.2.3 From a81ae6c31d6431a7d6a7c9c9c6bfcecaa6b68917 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:26:42 +0000 Subject: btrfs: delayed-inode: remove unnecessary call to btrfs_mark_buffer_dirty() The call to btrfs_mark_buffer_dirty() at __btrfs_update_delayed_inode() is not necessary as we have a path setup for writing with btrfs_search_slot() having a 'cow' argument set to 1. This just makes the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove it. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 60a6866a6cd9..f9f1a972a6f7 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1033,7 +1033,6 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_item); write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, sizeof(struct btrfs_inode_item)); - btrfs_mark_buffer_dirty(trans, leaf); if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) goto out; -- cgit v1.2.3 From 7caa86c44b81c177f51715ea89cfcc0c4bd85622 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:33:50 +0000 Subject: btrfs: dev-replace: remove unnecessary call to btrfs_mark_buffer_dirty() The call to btrfs_mark_buffer_dirty() at btrfs_run_dev_replace() is not necessary as we have a path setup for writing with btrfs_search_slot() having a 'cow' argument set to 1. This just makes the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove it. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ac8e97ed13f7..f86fbea0b3de 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -440,9 +440,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; up_write(&dev_replace->rwsem); - - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); -- cgit v1.2.3 From 4866812020a27dee7cb8e94dc9e13a2793ae966a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:36:21 +0000 Subject: btrfs: dir-item: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/dir-item.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 1ea5d8fcfbf7..ccf91de29f80 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -92,7 +92,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, name, name_ptr, name_len); write_extent_buffer(leaf, data, data_ptr, data_len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); return ret; } @@ -152,7 +151,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name->name, name_ptr, name->len); - btrfs_mark_buffer_dirty(trans, leaf); second_insert: /* FIXME, use some real flag for selecting the extra index */ -- cgit v1.2.3 From 49c318e4f7ebdb6316cf8a504aeb3cf21bb986c7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:41:17 +0000 Subject: btrfs: file: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4775a17c4ee1..36f51c311bb1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -314,7 +314,6 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->start); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) { struct btrfs_ref ref = { @@ -360,7 +359,6 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->end); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += args->end - key.offset; break; @@ -384,7 +382,6 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, args->start - key.offset); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += extent_end - args->start; if (args->end == extent_end) @@ -639,7 +636,6 @@ again: trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -668,7 +664,6 @@ again: other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -702,7 +697,6 @@ again: btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); - btrfs_mark_buffer_dirty(trans, leaf); ref.action = BTRFS_ADD_DELAYED_REF; ref.bytenr = bytenr; @@ -781,7 +775,6 @@ again: btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); @@ -790,7 +783,6 @@ again: btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); - btrfs_mark_buffer_dirty(trans, leaf); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { @@ -2016,7 +2008,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } @@ -2033,7 +2024,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } btrfs_release_path(path); @@ -2181,7 +2171,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); if (extent_info->is_new_extent) btrfs_set_file_extent_generation(leaf, extent, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, -- cgit v1.2.3 From 5e887b5071e9ec2c934bb7839e153483433e5f9f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:42:21 +0000 Subject: btrfs: file-item: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a 
path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 886749b39672..d04a3b47b1fb 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -190,8 +190,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, item, 0); btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -1259,7 +1257,6 @@ found: ins_size /= csum_size; total_bytes += ins_size * fs_info->sectorsize; - btrfs_mark_buffer_dirty(trans, path->nodes[0]); if (total_bytes < sums->len) { btrfs_release_path(path); cond_resched(); -- cgit v1.2.3 From 038d6999ec9fa54c87915f372f98a179197aeb8d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:44:10 +0000 Subject: btrfs: free-space-cache: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 17707c898eae..3048cb38dc80 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -198,7 +198,6 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -216,7 +215,6 @@ static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header); memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); btrfs_set_free_space_key(leaf, header, &disk_key); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -1189,7 +1187,6 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_set_free_space_entries(leaf, header, entries); btrfs_set_free_space_bitmaps(leaf, header, bitmaps); btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; -- cgit v1.2.3 From 5c7763312c17a44b6ff3590ababca31429842bd6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:46:03 +0000 Subject: btrfs: inode: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. 
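To complement the reasoning repeated in these commit messages, here is the call chain that makes the explicit calls redundant, sketched from the current btrfs code (function names assumed from ctree.c, details elided):

/*
 * When a path is set up for writing, i.e. btrfs_search_slot() is called
 * with cow == 1 (directly or via btrfs_insert_empty_item() and friends):
 *
 *   btrfs_search_slot(..., cow = 1)
 *     -> btrfs_cow_block()
 *          -> btrfs_force_cow_block()
 *               -> btrfs_mark_buffer_dirty(trans, cow)
 *
 * so every leaf returned through such a path is already dirty, and the
 * explicit btrfs_mark_buffer_dirty() calls removed by this series were
 * redundant.
 */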
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a7a3d879f2f2..1546f341f9a4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -564,7 +564,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, kunmap_local(kaddr); folio_put(folio); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -2918,7 +2917,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -4082,7 +4080,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: @@ -6377,7 +6374,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); /* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in @@ -8679,7 +8675,6 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); d_instantiate_new(dentry, inode); -- cgit v1.2.3 From 212e5f5cb8e34c93b7d50ed59421a82ad9e53caf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:47:23 +0000 Subject: btrfs: inode-item: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode-item.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 29572dfaf878..448aa1a682d6 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -298,8 +298,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr = (unsigned long)&extref->name; write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - out: btrfs_free_path(path); return ret; @@ -363,8 +361,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ref + 1); } write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - out: btrfs_free_path(path); @@ -590,7 +586,6 @@ search_again: num_dec = (orig_num_bytes - extent_num_bytes); if (extent_start != 0) control->sub_bytes += num_dec; - btrfs_mark_buffer_dirty(trans, leaf); } else { extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); -- cgit v1.2.3 From bd25bf9dcd3c3d57a22a04bd73845911d33e3ac1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:48:39 +0000 Subject: btrfs: ioctl: remove unnecessary call to btrfs_mark_buffer_dirty() The call to btrfs_mark_buffer_dirty() at btrfs_ioctl_default_subvol() is not necessary as we have a path setup for writing with btrfs_search_slot() having a 'cow' argument set to 1. This just makes the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove it. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 415b20801d78..69c0444369b7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2916,7 +2916,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); -- cgit v1.2.3 From d74a36f37e29b6457c8405ed85c5d426c2afd077 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:51:20 +0000 Subject: btrfs: qgroup: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 993b5e803699..b90fabe302e6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -673,9 +673,6 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, key.offset = dst; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - btrfs_free_path(path); return ret; } @@ -752,8 +749,6 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); - btrfs_mark_buffer_dirty(trans, leaf); - btrfs_release_path(path); key.type = BTRFS_QGROUP_LIMIT_KEY; @@ -771,8 +766,6 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); - btrfs_mark_buffer_dirty(trans, leaf); - ret = 0; out: btrfs_free_path(path); @@ -859,9 +852,6 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -905,9 +895,6 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -947,9 +934,6 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -1130,8 +1114,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); - btrfs_mark_buffer_dirty(trans, leaf); - key.objectid = 0; key.type = BTRFS_ROOT_REF_KEY; key.offset = 0; -- cgit v1.2.3 From bdf1660b221abb8660a809ab8de6b6b0e6ff0320 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:52:28 +0000 Subject: btrfs: raid-stripe-tree: remove unnecessary call to btrfs_mark_buffer_dirty() The call to btrfs_mark_buffer_dirty() at update_raid_extent_item() is not necessary as we have a path setup for writing with btrfs_search_slot() having a 'cow' argument set to 1. This just makes the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove it. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 45b823a0913a..0bf3c032d9dc 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -169,7 +169,6 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), item_size); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return ret; -- cgit v1.2.3 From 5a8293a1cc466e2effce6fd3a61880acea37adf2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:56:38 +0000 Subject: btrfs: relocation: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index cdd9a7b15a11..d4100e58172f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -856,7 +856,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, u32 i; int ret = 0; int first = 1; - int dirty = 0; if (rc->stage != UPDATE_DATA_PTRS) return 0; @@ -936,7 +935,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, } btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); - dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); ref.action = BTRFS_ADD_DELAYED_REF; @@ -967,8 +965,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, break; } } - if (dirty) - btrfs_mark_buffer_dirty(trans, leaf); if (inode) btrfs_add_delayed_iput(inode); return ret; @@ -1161,13 +1157,11 @@ again: */ btrfs_set_node_blockptr(parent, slot, new_bytenr); btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); - btrfs_mark_buffer_dirty(trans, parent); btrfs_set_node_blockptr(path->nodes[level], path->slots[level], old_bytenr); btrfs_set_node_ptr_generation(path->nodes[level], path->slots[level], old_ptr_gen); - btrfs_mark_buffer_dirty(trans, path->nodes[level]); ref.action = BTRFS_ADD_DELAYED_REF; ref.bytenr = old_bytenr; @@ -3728,7 +3722,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; -- cgit v1.2.3 From 65733e8d6cc334565891719ed6bc9239695d63d9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:57:56 +0000 Subject: btrfs: root-tree: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() 
with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/root-tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 33962671a96c..e22e6b06927a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -197,7 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); write_extent_buffer(l, item, ptr, sizeof(*item)); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); return ret; @@ -447,7 +446,6 @@ again: btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); write_extent_buffer(leaf, name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { btrfs_release_path(path); -- cgit v1.2.3 From c9a4390707c8b6a6f022526a5618e76d24bc6cb2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:59:51 +0000 Subject: btrfs: uuid-tree: remove unnecessary call to btrfs_mark_buffer_dirty() The call to btrfs_mark_buffer_dirty() at btrfs_uuid_tree_add() is not necessary as we have a path setup for writing with btrfs_search_slot() having a 'cow' argument set to 1 (through btrfs_insert_empty_item()). This just makes the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove it. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/uuid-tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index aca2861f2187..17b5e81123a1 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -140,8 +140,6 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ ret = 0; subid_le = cpu_to_le64(subid_cpu); write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); return ret; -- cgit v1.2.3 From 1ca4e15f41f2e3ad99016b3b13b3713af9f40eb2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 13:02:59 +0000 Subject: btrfs: volumes: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a58cf494b3d0..ccbfea163390 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2046,7 +2046,6 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -2742,11 +2741,9 @@ next_slot: device = btrfs_find_device(fs_info->fs_devices, &args); BUG_ON(!device); /* Logic error */ - if (device->fs_devices->seeding) { + if (device->fs_devices->seeding) btrfs_set_device_generation(leaf, dev_item, device->generation); - btrfs_mark_buffer_dirty(trans, leaf); - } path->slots[0]++; goto next_slot; @@ -3039,8 +3036,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); - btrfs_mark_buffer_dirty(trans, leaf); - out: btrfs_free_path(path); return ret; @@ -3749,10 +3744,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info, btrfs_set_balance_meta(leaf, item, &disk_bargs); btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); btrfs_set_balance_sys(leaf, item, &disk_bargs); - btrfs_set_balance_flags(leaf, item, bctl->flags); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); err = btrfs_commit_transaction(trans); @@ -7700,8 +7692,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_set_dev_stats_value(eb, ptr, i, btrfs_dev_stat_read(device, i)); - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); return ret; -- cgit v1.2.3 From 74973b45a69b8f805e12e50cf85fa2ad500754f0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 13:04:01 +0000 Subject: btrfs: xattr: remove unnecessary call to btrfs_mark_buffer_dirty() The call to btrfs_mark_buffer_dirty() at btrfs_setxattr() is not necessary as we have a path setup for writing with btrfs_search_slot() having a 'cow' argument set to 1. This just makes the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove it. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/xattr.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index bc18710d1dcf..3e0edbcf73e1 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -204,7 +204,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; write_extent_buffer(leaf, value, data_ptr, size); - btrfs_mark_buffer_dirty(trans, leaf); } else { /* * Insert, and we had space for the xattr, so path->slots[0] is -- cgit v1.2.3 From a5019b70704a8cbea4c295ae7a61abd87300ff29 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:30 +0800 Subject: btrfs: initialize fs_devices->fs_info earlier in btrfs_init_devices_late() Currently, fs_devices->fs_info is initialized in btrfs_init_devices_late(), but this occurs too late for find_live_mirror(), which is invoked by load_super_root() much earlier than btrfs_init_devices_late(). 
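To make the ordering issue concrete, a rough sketch of the relevant mount sequence before the change (reconstructed from the description above, not from the patch itself):

/*
 * Simplified mount-time ordering before this patch (assumed sketch):
 *
 * open_ctree()
 *   ...
 *   load_super_root()
 *     -> find_live_mirror()            dereferences fs_devices->fs_info
 *   ...
 *   btrfs_init_devices_late()
 *     fs_devices->fs_info = fs_info;   first assignment, already too late
 */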
Fix this by moving the initialization to open_ctree(), before load_super_root(). Reviewed-by: Naohiro Aota Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + fs/btrfs/volumes.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 04d68f253940..4928bf2cd07f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3388,6 +3388,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; + fs_info->fs_devices->fs_info = fs_info; /* * Handle the space caching options appropriately now that we have the diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ccbfea163390..e5d5cfb2d239 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7515,8 +7515,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) struct btrfs_device *device; int ret = 0; - fs_devices->fs_info = fs_info; - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) device->fs_info = fs_info; -- cgit v1.2.3 From 83be7f8b9c24bd040a348577c7c84fd08911707f Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:31 +0800 Subject: btrfs: sysfs: refactor output formatting in btrfs_read_policy_show() Refactor the logic in btrfs_read_policy_show() for easier extension with more balancing methods. Streamline the space and bracket handling around the active policy without altering the functional output. This is in preparation to add more methods. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 7f09b6c9cc2d..ab18b4e59468 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1316,14 +1316,16 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (policy == i) - ret += sysfs_emit_at(buf, ret, "%s[%s]", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); - else - ret += sysfs_emit_at(buf, ret, "%s%s", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); + if (ret != 0) + ret += sysfs_emit_at(buf, ret, " "); + + if (i == policy) + ret += sysfs_emit_at(buf, ret, "["); + + ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); + + if (i == policy) + ret += sysfs_emit_at(buf, ret, "]"); } ret += sysfs_emit_at(buf, ret, "\n"); -- cgit v1.2.3 From 38cae63137d5e13dc3c2ba88c4f393be4a6bf4bb Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:32 +0800 Subject: btrfs: sysfs: add btrfs_read_policy_to_enum() helper and refactor read policy store Introduce btrfs_read_policy_to_enum() helper to simplify the conversion of a string read policy to its corresponding enum value. This reduces duplication and improves code clarity in btrfs_read_policy_store(). The parameter is copied locally to allow modification, enabling the separation of the method and its value. This prepares for the addition of more functionality in subsequent patches. 
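As an illustration of the accepted input format (a policy name optionally followed by ':value'), here is a small standalone userspace-style sketch of the same split; the kernel helper in the diff below does the equivalent with strncpy(), strchr(), kstrtos64() and sysfs_match_string(). The "round-robin" name used here is only introduced by a later patch in this series.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char buf[32] = { 0 };
	long long value = -1;
	char *sep;

	strncpy(buf, "round-robin:4096", sizeof(buf) - 1);

	/* Separate the value from the policy name, as the kernel helper does. */
	sep = strchr(buf, ':');
	if (sep) {
		*sep = '\0';
		value = strtoll(sep + 1, NULL, 10);
	}
	printf("policy=%s value=%lld\n", buf, value); /* policy=round-robin value=4096 */
	return 0;
}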
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index ab18b4e59468..78b4af72997b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1307,6 +1307,18 @@ BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); static const char * const btrfs_read_policy_name[] = { "pid" }; +static int btrfs_read_policy_to_enum(const char *str) +{ + char param[32] = { 0 }; + + if (!str || strlen(str) == 0) + return 0; + + strncpy(param, str, sizeof(param) - 1); + + return sysfs_match_string(btrfs_read_policy_name, param); +} + static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1338,21 +1350,19 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, const char *buf, size_t len) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); - int i; + int index; - for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (sysfs_streq(buf, btrfs_read_policy_name[i])) { - if (i != READ_ONCE(fs_devices->read_policy)) { - WRITE_ONCE(fs_devices->read_policy, i); - btrfs_info(fs_devices->fs_info, - "read policy set to '%s'", - btrfs_read_policy_name[i]); - } - return len; - } + index = btrfs_read_policy_to_enum(buf); + if (index < 0) + return -EINVAL; + + if (index != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, index); + btrfs_info(fs_devices->fs_info, "read policy set to '%s'", + btrfs_read_policy_name[index]); } - return -EINVAL; + return len; } BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); -- cgit v1.2.3 From b6bed20ed398f71069bfd2cd769bb91fa15859b5 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:33 +0800 Subject: btrfs: sysfs: handle value associated with read balancing policy Enable specifying additional configuration values along the RAID1 balancing read policy in a single input string. Update btrfs_read_policy_to_enum() to parse and handle a value associated with the policy in the format "policy:value", the value part if present is converted to 64-bit integer. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 78b4af72997b..2880407d0dd3 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1307,15 +1307,34 @@ BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); static const char * const btrfs_read_policy_name[] = { "pid" }; -static int btrfs_read_policy_to_enum(const char *str) +static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { char param[32] = { 0 }; + char __maybe_unused *value_str; if (!str || strlen(str) == 0) return 0; strncpy(param, str, sizeof(param) - 1); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Separate value from input in policy:value format. 
*/ + value_str = strchr(param, ':'); + if (value_str) { + int ret; + + *value_str = 0; + value_str++; + if (!value_ret) + return -EINVAL; + ret = kstrtos64(value_str, 10, value_ret); + if (ret) + return -EINVAL; + if (*value_ret < 0) + return -ERANGE; + } +#endif + return sysfs_match_string(btrfs_read_policy_name, param); } @@ -1351,8 +1370,9 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); int index; + s64 value = -1; - index = btrfs_read_policy_to_enum(buf); + index = btrfs_read_policy_to_enum(buf, &value); if (index < 0) return -EINVAL; -- cgit v1.2.3 From 22fb0d99c90583e5b32a2a54e614bce221d31a8a Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:34 +0800 Subject: btrfs: add tracking of read blocks for read policy Track number of read blocks in the whole filesystem. The counter is initialized when devices are opened. The counter is increased at btrfs_submit_dev_bio() if the stats tracking is enabled (depends on the read policy). Stats tracking is disabled by default and is enabled through fs_devices::collect_fs_stats when required. The code is not under the EXPERIMENTAL define, as stats can be expanded to include write counts and other performance counters, with the user interface independent of its internal use. This is an in-memory-only feature, not related to the dev error stats. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 8 ++++++++ fs/btrfs/disk-io.c | 5 +++++ fs/btrfs/fs.h | 3 +++ fs/btrfs/volumes.h | 2 ++ 4 files changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index bc80ee4f95a5..bc2555c44a12 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -453,6 +453,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); + /* + * Track reads if tracking is enabled; ignore I/O operations before the + * filesystem is fully initialized. + */ + if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info) + percpu_counter_add(&dev->fs_info->stats_read_blocks, + bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits); + if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) blkcg_punt_bio_submit(bio); else diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4928bf2cd07f..ef3121b55c50 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1258,6 +1258,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; + percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->ordered_bytes); @@ -2923,6 +2924,10 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; + ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL); + if (ret) + return ret; + fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index be8c32d1a7bb..b572d6b9730b 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -627,6 +627,9 @@ struct btrfs_fs_info { struct kobject *qgroups_kobj; struct kobject *discard_kobj; + /* Track the number of blocks (sectors) read by the filesystem. 
*/ + struct percpu_counter stats_read_blocks; + /* Used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 10bdd731e3fc..77926fdb6b0d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -417,6 +417,8 @@ struct btrfs_fs_devices { bool seeding; /* The mount needs to use a randomly generated fsid. */ bool temp_fsid; + /* Enable/disable the filesystem stats tracking. */ + bool collect_fs_stats; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ -- cgit v1.2.3 From 6d7a9154955e50c0b991063c65f86ab24796754e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:35 +0800 Subject: btrfs: introduce RAID1 round-robin read balancing Add round-robin read policy that balances reads over available devices (all RAID1 block group profiles). Switch to the next devices is done after a number of blocks is read, which is 256K by default and is configurable in sysfs. The format is "round-robin:" and can be set in file /sys/fs/btrfs/FSID/read_policy Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 48 +++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 13 +++++++++++ 3 files changed, 125 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 2880407d0dd3..e155b7ce1ee5 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1305,7 +1305,12 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, } BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); -static const char * const btrfs_read_policy_name[] = { "pid" }; +static const char *btrfs_read_policy_name[] = { + "pid", +#ifdef CONFIG_BTRFS_EXPERIMENTAL + "round-robin", +#endif +}; static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { @@ -1355,6 +1360,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (i == BTRFS_READ_POLICY_RR) + ret += sysfs_emit_at(buf, ret, ":%u", + READ_ONCE(fs_devices->rr_min_contig_read)); +#endif + if (i == policy) ret += sysfs_emit_at(buf, ret, "]"); } @@ -1376,6 +1387,41 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, if (index < 0) return -EINVAL; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* If moving from RR then disable collecting fs stats. 
*/ + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = false; + + if (index == BTRFS_READ_POLICY_RR) { + if (value != -1) { + const u32 sectorsize = fs_devices->fs_info->sectorsize; + + if (!IS_ALIGNED(value, sectorsize)) { + u64 temp_value = round_up(value, sectorsize); + + btrfs_debug(fs_devices->fs_info, +"read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu", + value, sectorsize, temp_value); + value = temp_value; + } + } else { + value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->rr_min_contig_read)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->rr_min_contig_read, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", + btrfs_read_policy_name[index], value); + } + + fs_devices->collect_fs_stats = true; + + return len; + } +#endif if (index != READ_ONCE(fs_devices->read_policy)) { WRITE_ONCE(fs_devices->read_policy, index); btrfs_info(fs_devices->fs_info, "read policy set to '%s'", diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e5d5cfb2d239..cfe1d5ada5f2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1329,6 +1329,9 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; fs_devices->read_policy = BTRFS_READ_POLICY_PID; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; +#endif return 0; } @@ -5953,6 +5956,63 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +struct stripe_mirror { + u64 devid; + int num; +}; + +static int btrfs_cmp_devid(const void *a, const void *b) +{ + const struct stripe_mirror *s1 = (const struct stripe_mirror *)a; + const struct stripe_mirror *s2 = (const struct stripe_mirror *)b; + + if (s1->devid < s2->devid) + return -1; + if (s1->devid > s2->devid) + return 1; + return 0; +} + +/* + * Select a stripe for reading using the round-robin algorithm. + * + * 1. Compute the read cycle as the total sectors read divided by the minimum + * sectors per device. + * 2. Determine the stripe number for the current read by taking the modulus + * of the read cycle with the total number of stripes: + * + * stripe index = (total sectors / min sectors per dev) % num stripes + * + * The calculated stripe index is then used to select the corresponding device + * from the list of devices, which is ordered by devid. 
+ */ +static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes) +{ + struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 }; + struct btrfs_device *device = map->stripes[first].dev; + struct btrfs_fs_info *fs_info = device->fs_devices->fs_info; + unsigned int read_cycle; + unsigned int total_reads; + unsigned int min_reads_per_dev; + + total_reads = percpu_counter_sum(&fs_info->stats_read_blocks); + min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >> + fs_info->sectorsize_bits; + + for (int index = 0, i = first; i < first + num_stripes; i++) { + stripes[index].devid = map->stripes[i].dev->devid; + stripes[index].num = i; + index++; + } + sort(stripes, num_stripes, sizeof(struct stripe_mirror), + btrfs_cmp_devid, NULL); + + read_cycle = total_reads / min_reads_per_dev; + return stripes[read_cycle % num_stripes].num; +} +#endif + static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) @@ -5982,6 +6042,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); break; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + case BTRFS_READ_POLICY_RR: + preferred_mirror = btrfs_read_rr(map, first, num_stripes); + break; +#endif } if (dev_replace_is_ongoing && diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 77926fdb6b0d..f9fe698a9b4b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -296,6 +296,9 @@ enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_ZONED, }; +#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ (SZ_256K) +/* Keep in sync with raid_attr table, current maximum is RAID1C4. */ +#define BTRFS_RAID1_MAX_MIRRORS (4) /* * Read policies for mirrored block group profiles, read picks the stripe based * on these policies. @@ -303,6 +306,10 @@ enum btrfs_chunk_allocation_policy { enum btrfs_read_policy { /* Use process PID to choose the stripe */ BTRFS_READ_POLICY_PID, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Balancing RAID1 reads across all striped devices (round-robin). */ + BTRFS_READ_POLICY_RR, +#endif BTRFS_NR_READ_POLICY, }; @@ -433,6 +440,12 @@ struct btrfs_fs_devices { enum btrfs_read_policy read_policy; #ifdef CONFIG_BTRFS_EXPERIMENTAL + /* + * Minimum contiguous reads before switching to next device, the unit + * is one block/sectorsize. + */ + u32 rr_min_contig_read; + /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif -- cgit v1.2.3 From c86aae73bd5882e4a6b4e1b6ed448ea902551f80 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:36 +0800 Subject: btrfs: add read policy to set a preferred device Add read policy that will force all reads to go to the given device (specified by devid) on the RAID1 profiles. This will be used for testing, e.g. to read from stale device. Users may find other use cases. 
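Looking back at the round-robin policy from the previous patch, a worked example of the stripe selection arithmetic (all numbers hypothetical, assuming 4K sectors and the default 256K rr_min_contig_read):

#include <stdio.h>

int main(void)
{
	/* Hypothetical counters, not taken from the patch. */
	unsigned int total_reads = 1000;                 /* 4K blocks read fs-wide */
	unsigned int min_reads_per_dev = 262144 / 4096;  /* 256K / 4K = 64 blocks  */
	unsigned int num_stripes = 2;                    /* RAID1, two mirrors     */

	unsigned int read_cycle = total_reads / min_reads_per_dev; /* 15 */
	unsigned int stripe = read_cycle % num_stripes;            /* 1  */

	printf("read goes to stripe index %u (in devid order)\n", stripe);
	return 0;
}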
Can be set in sysfs, the value format is "devid:" to the file /sys/fs/btrfs/FSID/read_policy Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 31 ++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 17 +++++++++++++++++ fs/btrfs/volumes.h | 5 +++++ 3 files changed, 52 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e155b7ce1ee5..5211d13d73f8 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1309,6 +1309,7 @@ static const char *btrfs_read_policy_name[] = { "pid", #ifdef CONFIG_BTRFS_EXPERIMENTAL "round-robin", + "devid", #endif }; @@ -1364,8 +1365,11 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, if (i == BTRFS_READ_POLICY_RR) ret += sysfs_emit_at(buf, ret, ":%u", READ_ONCE(fs_devices->rr_min_contig_read)); -#endif + if (i == BTRFS_READ_POLICY_DEVID) + ret += sysfs_emit_at(buf, ret, ":%llu", + READ_ONCE(fs_devices->read_devid)); +#endif if (i == policy) ret += sysfs_emit_at(buf, ret, "]"); } @@ -1421,6 +1425,31 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, return len; } + + if (index == BTRFS_READ_POLICY_DEVID) { + if (value != -1) { + BTRFS_DEV_LOOKUP_ARGS(args); + + /* Validate input devid. */ + args.devid = value; + if (btrfs_find_device(fs_devices, &args) == NULL) + return -EINVAL; + } else { + /* Set default devid to the devid of the latest device. */ + value = fs_devices->latest_dev->devid; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->read_devid)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->read_devid, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'", + btrfs_read_policy_name[index], value); + } + + return len; + } #endif if (index != READ_ONCE(fs_devices->read_policy)) { WRITE_ONCE(fs_devices->read_policy, index); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cfe1d5ada5f2..b5fd1aa45c4c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1331,6 +1331,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->read_policy = BTRFS_READ_POLICY_PID; #ifdef CONFIG_BTRFS_EXPERIMENTAL fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + fs_devices->read_devid = latest_dev->devid; #endif return 0; @@ -5957,6 +5958,19 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, } #ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) +{ + for (int index = first; index < first + num_stripes; index++) { + const struct btrfs_device *device = map->stripes[index].dev; + + if (device->devid == READ_ONCE(device->fs_devices->read_devid)) + return index; + } + + /* If no read-preferred device is set use the first stripe. */ + return first; +} + struct stripe_mirror { u64 devid; int num; @@ -6046,6 +6060,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_RR: preferred_mirror = btrfs_read_rr(map, first, num_stripes); break; + case BTRFS_READ_POLICY_DEVID: + preferred_mirror = btrfs_read_preferred(map, first, num_stripes); + break; #endif } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f9fe698a9b4b..120f65e21eeb 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -309,6 +309,8 @@ enum btrfs_read_policy { #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Balancing RAID1 reads across all striped devices (round-robin). */ BTRFS_READ_POLICY_RR, + /* Read from a specific device. 
*/ + BTRFS_READ_POLICY_DEVID, #endif BTRFS_NR_READ_POLICY, }; @@ -446,6 +448,9 @@ struct btrfs_fs_devices { */ u32 rr_min_contig_read; + /* Device to be used for reading in case of RAID1. */ + u64 read_devid; + /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif -- cgit v1.2.3 From bb4715e967cf3b2eb8550eda73886208f1fc805d Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:37 +0800 Subject: btrfs: print status of experimental mode when loading module Commit c9c49e8f157e ("btrfs: split out CONFIG_BTRFS_EXPERIMENTAL from CONFIG_BTRFS_DEBUG") introduces a way to enable or disable experimental features, print its status during module load, like: Btrfs loaded, experimental=on, debug=on, assert=on, zoned=yes, fsverity=yes Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f6eaaf20229d..5157037a0048 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2446,6 +2446,9 @@ static __cold void btrfs_interface_exit(void) static int __init btrfs_print_mod_info(void) { static const char options[] = "" +#ifdef CONFIG_BTRFS_EXPERIMENTAL + ", experimental=on" +#endif #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif -- cgit v1.2.3 From e426286cfa6f85e51006f6151b309a395ada6540 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:38 +0800 Subject: btrfs: configure read policy via module parameter For testing purposes allow to configure the read policy via module parameter from the beginning. Available only with CONFIG_BTRFS_EXPERIMENTAL Examples: - Set the RAID1 balancing method to round-robin with a custom min_contig_read of 4k: $ modprobe btrfs read_policy=round-robin:4096 - Set the round-robin balancing method with the default min_contiguous_read: $ modprobe btrfs read_policy=round-robin - Set the "devid" balancing method, defaulting to the latest device: $ modprobe btrfs read_policy=devid Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/super.c | 5 +++++ fs/btrfs/sysfs.c | 31 ++++++++++++++++++++++++++++++- fs/btrfs/sysfs.h | 6 ++++++ fs/btrfs/volumes.c | 15 ++++++++++++++- 4 files changed, 55 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5157037a0048..f310cfa0b5b4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2527,6 +2527,11 @@ static const struct init_sequence mod_init_seq[] = { }, { .init_func = extent_map_init, .exit_func = extent_map_exit, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + }, { + .init_func = btrfs_read_policy_init, + .exit_func = NULL, +#endif }, { .init_func = ordered_data_init, .exit_func = ordered_data_exit, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5211d13d73f8..53b846d99ece 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1313,7 +1313,22 @@ static const char *btrfs_read_policy_name[] = { #endif }; -static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) +#ifdef CONFIG_BTRFS_EXPERIMENTAL + +/* Global module configuration parameters. */ +static char *read_policy; +char *btrfs_get_mod_read_policy(void) +{ + return read_policy; +} + +/* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. 
*/ +module_param(read_policy, charp, 0); +MODULE_PARM_DESC(read_policy, +"Global read policy: pid (default), round-robin[:], devid[:]"); +#endif + +int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { char param[32] = { 0 }; char __maybe_unused *value_str; @@ -1344,6 +1359,20 @@ static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) return sysfs_match_string(btrfs_read_policy_name, param); } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void) +{ + s64 value; + + if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) { + btrfs_err(NULL, "invalid read policy or value %s", read_policy); + return -EINVAL; + } + + return 0; +} +#endif + static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index e6a284c59809..3fc5c6f90dc4 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -47,5 +47,11 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info); int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info); void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup); +int btrfs_read_policy_to_enum(const char *str, s64 *value); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void); +char *btrfs_get_mod_read_policy(void); +#endif #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b5fd1aa45c4c..a594f66daedf 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1299,6 +1299,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; + s64 __maybe_unused value = 0; int ret = 0; list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, @@ -1328,10 +1329,22 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; - fs_devices->read_policy = BTRFS_READ_POLICY_PID; #ifdef CONFIG_BTRFS_EXPERIMENTAL fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; fs_devices->read_devid = latest_dev->devid; + fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), + &value); + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = true; + + if (value) { + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->rr_min_contig_read = value; + if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID) + fs_devices->read_devid = value; + } +#else + fs_devices->read_policy = BTRFS_READ_POLICY_PID; #endif return 0; -- cgit v1.2.3 From 3681dbe0afeef3946b71a7af05e31375d6e70b90 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:39 +0800 Subject: btrfs: print read policy on module load Print the read read policy if set as module parameter (with CONFIG_BTRFS_EXPERIMENTAL). 
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f310cfa0b5b4..f809c3200c21 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2469,7 +2469,17 @@ static int __init btrfs_print_mod_info(void) ", fsverity=no" #endif ; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (btrfs_get_mod_read_policy() == NULL) + pr_info("Btrfs loaded%s\n", options); + else + pr_info("Btrfs loaded%s, read_policy=%s\n", + options, btrfs_get_mod_read_policy()); +#else pr_info("Btrfs loaded%s\n", options); +#endif + return 0; } -- cgit v1.2.3 From 2fa07d7a0f0084d9777f076a154ad10e759ba731 Mon Sep 17 00:00:00 2001 From: Jing Xia Date: Tue, 3 Sep 2024 13:40:12 +0800 Subject: btrfs: pass write-hint for buffered IO Commit 449813515d3e ("block, fs: Restore the per-bio/request data lifetime fields") restored write-hint support in btrfs. But that is applicable only for direct IO. This patch supports passing write-hint for buffered IO from btrfs file system to block layer by filling bi_write_hint of struct bio in alloc_new_bio(). There's an ongoing discussion which devices can use that, https://lore.kernel.org/all/20240910150200.6589-6-joshi.k@samsung.com, in SCSI there's support using sd_group_number() and sd_setup_rw32_cmnd(). The hint goes from the application directly to the block device so it's up to the application to set up everything properly to utilize the different hint classes. Link: https://lore.kernel.org/all/20240910150200.6589-6-joshi.k@samsung.com Reviewed-by: Johannes Thumshirn Signed-off-by: Jing Xia [ Add more context and use case. ] Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9725ff7f274d..74f2775db51e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -710,6 +710,7 @@ static void alloc_new_bio(struct btrfs_inode *inode, bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, bio_ctrl->end_io_func, NULL); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; bbio->inode = inode; bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; -- cgit v1.2.3 From 5f14eb12a3be1628809141759e46c381925b5ef1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:23:59 +0100 Subject: btrfs: drop unused parameter fs_info to btrfs_delete_delayed_insertion_item() Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f9f1a972a6f7..0b4933c6a889 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1555,8 +1555,7 @@ release_node: return ret; } -static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_node *node, +static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, u64 index) { struct btrfs_delayed_item *item; @@ -1614,7 +1613,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (IS_ERR(node)) return PTR_ERR(node); - ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index); + ret = btrfs_delete_delayed_insertion_item(node, index); if (!ret) goto end; -- cgit v1.2.3 From 6d67ff1c0be3c04e01c6b20a0c8286e99df05b2d Mon 
Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:01 +0100 Subject: btrfs: remove stray comment about SRCU The subvol_srcu was removed in c75e839414d361 ("btrfs: kill the subvol_srcu") years ago. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a7051e2570c1..587842991b24 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -96,9 +96,6 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); /* * This function is used to grab the root, and avoid it is freed when we * access it. But it doesn't ensure that the tree is not dropped. - * - * If you want to ensure the whole tree is safe, you should use - * fs_info->subvol_srcu */ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) { -- cgit v1.2.3 From 2a1e8378dc3814e37deee2be495600da0c98b175 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:07 +0100 Subject: btrfs: use SECTOR_SIZE defines in btrfs_issue_discard() Use the existing define for single sector size. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1cb1bd45f7ec..3014a1a23efd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1256,12 +1256,12 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, { int j, ret = 0; u64 bytes_left, end; - u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); + u64 aligned_start = ALIGN(start, SECTOR_SIZE); /* Adjust the range to be aligned to 512B sectors if necessary. */ if (start != aligned_start) { len -= aligned_start - start; - len = round_down(len, 1 << SECTOR_SHIFT); + len = round_down(len, SECTOR_SIZE); start = aligned_start; } -- cgit v1.2.3 From 2b41599bff1714df957c82821b8b17113ea44054 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:10 +0100 Subject: btrfs: rename __unlock_for_delalloc() and drop underscores Drop the leading underscores in '__unlock_for_delalloc()' and rename it to 'unlock_delalloc_folio()'. This also ensures naming parity with 'lock_delalloc_folios()'. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 74f2775db51e..bc4d41985c4b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -221,7 +221,7 @@ static void __process_folios_contig(struct address_space *mapping, } } -static noinline void __unlock_for_delalloc(const struct inode *inode, +static noinline void unlock_delalloc_folio(const struct inode *inode, const struct folio *locked_folio, u64 start, u64 end) { @@ -288,8 +288,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, out: folio_batch_release(&fbatch); if (processed_end > start) - __unlock_for_delalloc(inode, locked_folio, start, - processed_end); + unlock_delalloc_folio(inode, locked_folio, start, processed_end); return -EAGAIN; } @@ -390,7 +389,7 @@ again: unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - __unlock_for_delalloc(inode, locked_folio, delalloc_start, + unlock_delalloc_folio(inode, locked_folio, delalloc_start, delalloc_end); cond_resched(); goto again; @@ -1248,7 +1247,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, */ unlock_extent(&inode->io_tree, found_start, found_start + found_len - 1, NULL); - __unlock_for_delalloc(&inode->vfs_inode, folio, + unlock_delalloc_folio(&inode->vfs_inode, folio, found_start, found_start + found_len - 1); } -- cgit v1.2.3 From 3a1c46dbc9856a286808170b58c35ff5f50afa30 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:12 +0100 Subject: btrfs: open code set_page_extent_mapped() The function set_page_extent_mapped() is now a simple wrapper so use the folio helper. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 5 ----- fs/btrfs/extent_io.h | 1 - fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/relocation.c | 2 +- 4 files changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bc4d41985c4b..532a5f09121f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -862,11 +862,6 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, return ret; } -int set_page_extent_mapped(struct page *page) -{ - return set_folio_extent_mapped(page_folio(page)); -} - int set_folio_extent_mapped(struct folio *folio) { struct btrfs_fs_info *fs_info; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 8a36117ed453..d14bda928e7b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -248,7 +248,6 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); void btrfs_readahead(struct readahead_control *rac); int set_folio_extent_mapped(struct folio *folio); -int set_page_extent_mapped(struct page *page); void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 3048cb38dc80..d42b6f882f57 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -461,7 +461,7 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) return -ENOMEM; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(page_folio(page)); if (ret < 0) { unlock_page(page); put_page(page); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d4100e58172f..af0969b70b53 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2870,7 +2870,7 @@ again: /* * We could have lost folio private when we dropped the lock to read the - * folio above, make sure we set_page_extent_mapped here so we have any + * folio above, make sure we set_folio_extent_mapped() here so we have any * of the subpage blocksize stuff we need in place. */ ret = set_folio_extent_mapped(folio); -- cgit v1.2.3 From 06de96faf795b5c276a3be612da6b08c6112e747 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:15 +0100 Subject: btrfs: rename __get_extent_map() and pass btrfs_inode The double underscore naming scheme does not apply here, there's only one get_extent_map(). As the definition is changed, also pass the struct btrfs_inode. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 532a5f09121f..6bbc4fc67858 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -896,9 +896,9 @@ void clear_folio_extent_mapped(struct folio *folio) folio_detach_private(folio); } -static struct extent_map *__get_extent_map(struct inode *inode, - struct folio *folio, u64 start, - u64 len, struct extent_map **em_cached) +static struct extent_map *get_extent_map(struct btrfs_inode *inode, + struct folio *folio, u64 start, + u64 len, struct extent_map **em_cached) { struct extent_map *em; struct extent_state *cached_state = NULL; @@ -917,14 +917,14 @@ static struct extent_map *__get_extent_map(struct inode *inode, *em_cached = NULL; } - btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state); - em = btrfs_get_extent(BTRFS_I(inode), folio, start, len); + btrfs_lock_and_flush_ordered_range(inode, start, start + len - 1, &cached_state); + em = btrfs_get_extent(inode, folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } - unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); + unlock_extent(&inode->io_tree, start, start + len - 1, &cached_state); return em; } @@ -980,8 +980,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, end_folio_read(folio, true, cur, iosize); break; } - em = __get_extent_map(inode, folio, cur, end - cur + 1, - em_cached); + em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); -- cgit v1.2.3 From 011a9a1f244656cc3cbde47edba2b250f794d440 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:17 +0100 Subject: btrfs: use btrfs_inode in extent_writepage() As extent_writepage() is internal helper we should use our inode type, so change it from struct inode. 
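For readers new to the idiom used throughout this series, here is a minimal illustrative sketch (not part of the patch) of how the two inode views convert; struct btrfs_inode embeds the VFS inode, so BTRFS_I() maps one way and the vfs_inode member maps back:

        /* Illustrative only: converting between the VFS and btrfs views of an inode. */
        struct btrfs_inode *bi = BTRFS_I(folio->mapping->host);  /* VFS inode -> btrfs inode */
        struct inode *vfs_ino = &bi->vfs_inode;                  /* btrfs inode -> VFS inode */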
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6bbc4fc67858..4fb59231cbcc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1458,15 +1458,15 @@ out: */ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) { - struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; const u64 page_start = folio_pos(folio); int ret; size_t pg_offset; - loff_t i_size = i_size_read(inode); + loff_t i_size = i_size_read(&inode->vfs_inode); unsigned long end_index = i_size >> PAGE_SHIFT; - trace_extent_writepage(folio, inode, bio_ctrl->wbc); + trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); WARN_ON(!folio_test_locked(folio)); @@ -1490,13 +1490,13 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl if (ret < 0) goto done; - ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl); + ret = writepage_delalloc(inode, folio, bio_ctrl); if (ret == 1) return 0; if (ret) goto done; - ret = extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), + ret = extent_writepage_io(inode, folio, folio_pos(folio), PAGE_SIZE, bio_ctrl, i_size); if (ret == 1) return 0; @@ -1505,7 +1505,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl done: if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, + btrfs_mark_ordered_io_finished(inode, folio, page_start, PAGE_SIZE, !ret); mapping_set_error(folio->mapping, ret); } -- cgit v1.2.3 From 075adeeb9204359e8232aeccf8b3c350ff6d9ff4 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:19 +0100 Subject: btrfs: make wait_on_extent_buffer_writeback() static inline The simple helper can be inlined, no need for the separate function. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 ------ fs/btrfs/extent_io.h | 7 ++++++- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4fb59231cbcc..7487681cbd71 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1519,12 +1519,6 @@ done: return ret; } -void wait_on_extent_buffer_writeback(struct extent_buffer *eb) -{ - wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, - TASK_UNINTERRUPTIBLE); -} - /* * Lock extent buffer status and pages for writeback. 
* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index d14bda928e7b..986f15022fef 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -266,7 +266,12 @@ void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_PAGE_LOCK 2 int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, const struct btrfs_tree_parent_check *parent_check); -void wait_on_extent_buffer_writeback(struct extent_buffer *eb); +static inline void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, + TASK_UNINTERRUPTIBLE); +} + void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); void btrfs_readahead_node_child(struct extent_buffer *node, int slot); -- cgit v1.2.3 From b6160baed37916b6b315b2ab868a265600e03b2a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:22 +0100 Subject: btrfs: drop one time used local variable in end_bbio_meta_write() Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7487681cbd71..5463ff10b705 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1659,11 +1659,10 @@ static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; - bool uptodate = !bbio->bio.bi_status; struct folio_iter fi; u32 bio_offset = 0; - if (!uptodate) + if (bbio->bio.bi_status != BLK_STS_OK) set_btree_ioerr(eb); bio_for_each_folio_all(fi, &bbio->bio) { -- cgit v1.2.3 From a722c72bef93449d9093fa33d9c29eb3b348f164 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:24 +0100 Subject: btrfs: open code __free_extent_buffer() Using the kmem cache freeing directly is clear enough, we don't need to wrap it. All the users are in the same file. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5463ff10b705..617f2bfd46de 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2482,11 +2482,6 @@ next: return try_release_extent_state(io_tree, folio); } -static void __free_extent_buffer(struct extent_buffer *eb) -{ - kmem_cache_free(extent_buffer_cache, eb); -} - static int extent_buffer_under_io(const struct extent_buffer *eb) { return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || @@ -2592,7 +2587,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { btrfs_release_extent_buffer_pages(eb); btrfs_leak_debug_del_eb(eb); - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); } static struct extent_buffer * @@ -2690,7 +2685,7 @@ err: folio_put(eb->folios[i]); } } - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); return NULL; } @@ -3182,7 +3177,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) struct extent_buffer *eb = container_of(head, struct extent_buffer, rcu_head); - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); } static int release_extent_buffer(struct extent_buffer *eb) @@ -3210,7 +3205,7 @@ static int release_extent_buffer(struct extent_buffer *eb) btrfs_release_extent_buffer_pages(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); return 1; } #endif -- cgit v1.2.3 From cc8f51a3550a77427449c2b7a64281c72073a412 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:27 +0100 Subject: btrfs: rename btrfs_release_extent_buffer_pages() to mention folios Continue page to folio updates, sync what the function does with its name. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 617f2bfd46de..e98f4f531ebc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2562,8 +2562,8 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo spin_unlock(&folio->mapping->i_private_lock); } -/* Release all pages attached to the extent buffer */ -static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb) +/* Release all folios attached to the extent buffer */ +static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) { ASSERT(!extent_buffer_under_io(eb)); @@ -2585,7 +2585,7 @@ static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb) */ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { - btrfs_release_extent_buffer_pages(eb); + btrfs_release_extent_buffer_folios(eb); btrfs_leak_debug_del_eb(eb); kmem_cache_free(extent_buffer_cache, eb); } @@ -3201,8 +3201,8 @@ static int release_extent_buffer(struct extent_buffer *eb) } btrfs_leak_debug_del_eb(eb); - /* Should be safe to release our pages at this point */ - btrfs_release_extent_buffer_pages(eb); + /* Should be safe to release folios at this point. 
*/ + btrfs_release_extent_buffer_folios(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { kmem_cache_free(extent_buffer_cache, eb); -- cgit v1.2.3 From a43caf82a103ea9fa8af2630119f1c018db06bb4 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:29 +0100 Subject: btrfs: switch grab_extent_buffer() to folios Use the folio API, remove page_folio/folio_page conversions. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e98f4f531ebc..cfc50ccd2ea6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2812,13 +2812,12 @@ free_eb: } #endif -static struct extent_buffer *grab_extent_buffer( - struct btrfs_fs_info *fs_info, struct page *page) +static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, + struct folio *folio) { - struct folio *folio = page_folio(page); struct extent_buffer *exists; - lockdep_assert_held(&page->mapping->i_private_lock); + lockdep_assert_held(&folio->mapping->i_private_lock); /* * For subpage case, we completely rely on radix tree to ensure we @@ -2833,7 +2832,7 @@ static struct extent_buffer *grab_extent_buffer( return NULL; /* - * We could have already allocated an eb for this page and attached one + * We could have already allocated an eb for this folio and attached one * so lets see if we can get a ref on the existing eb, and if we can we * know it's good and we can just return that one, else we know we can * just overwrite folio private. @@ -2842,7 +2841,7 @@ static struct extent_buffer *grab_extent_buffer( if (atomic_inc_not_zero(&exists->refs)) return exists; - WARN_ON(PageDirty(page)); + WARN_ON(folio_test_dirty(folio)); folio_detach_private(folio); return NULL; } @@ -2933,8 +2932,7 @@ finish: } else if (existing_folio) { struct extent_buffer *existing_eb; - existing_eb = grab_extent_buffer(fs_info, - folio_page(existing_folio, 0)); + existing_eb = grab_extent_buffer(fs_info, existing_folio); if (existing_eb) { /* The extent buffer still exists, we can use it directly. */ *found_eb_ret = existing_eb; -- cgit v1.2.3 From 549a88acbe544cebd41011e56b4ac5ef2ae79e7c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:35 +0100 Subject: btrfs: change return type to bool type of check_eb_alignment() The check function pattern is supposed to return true/false, currently there's only one error code. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cfc50ccd2ea6..9651a7549564 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2846,11 +2846,14 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, return NULL; } -static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) +/* + * Validate alignment constraints of eb at logical address @start. 
+ */ +static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) { if (!IS_ALIGNED(start, fs_info->sectorsize)) { btrfs_err(fs_info, "bad tree block start %llu", start); - return -EINVAL; + return true; } if (fs_info->nodesize < PAGE_SIZE && @@ -2858,14 +2861,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) btrfs_err(fs_info, "tree block crosses page boundary, start %llu nodesize %u", start, fs_info->nodesize); - return -EINVAL; + return true; } if (fs_info->nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start)) { btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u", start, fs_info->nodesize); - return -EINVAL; + return true; } if (!IS_ALIGNED(start, fs_info->nodesize) && !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { @@ -2873,10 +2876,9 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", start, fs_info->nodesize); } - return 0; + return false; } - /* * Return 0 if eb->folios[i] is attached to btree inode successfully. * Return >0 if there is already another extent buffer for the range, -- cgit v1.2.3 From f8e0b8f9c2796474db532c83959993b4ee28c4ef Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:38 +0100 Subject: btrfs: unwrap folio locking helpers Another conversion to folio API, use the folio locking directly instead of back and forth page <-> folio conversions. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9651a7549564..4ccf629d2127 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3131,7 +3131,7 @@ again: * live buffer and won't free them prematurely. */ for (int i = 0; i < num_folios; i++) - unlock_page(folio_page(eb->folios[i], 0)); + folio_unlock(eb->folios[i]); return eb; out: @@ -3155,7 +3155,7 @@ out: for (int i = 0; i < attached; i++) { ASSERT(eb->folios[i]); detach_extent_buffer_folio(eb, eb->folios[i]); - unlock_page(folio_page(eb->folios[i], 0)); + folio_unlock(eb->folios[i]); folio_put(eb->folios[i]); eb->folios[i] = NULL; } @@ -3364,12 +3364,12 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) * the above race. */ if (subpage) - lock_page(folio_page(eb->folios[0], 0)); + folio_lock(eb->folios[0]); for (int i = 0; i < num_folios; i++) btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], eb->start, eb->len); if (subpage) - unlock_page(folio_page(eb->folios[0], 0)); + folio_unlock(eb->folios[0]); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, eb->len, eb->fs_info->dirty_metadata_batch); -- cgit v1.2.3 From db9eef2ea8633714ccdcb224f13ca3f3b5ed62cc Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:40 +0100 Subject: btrfs: remove unused define WAIT_PAGE_LOCK for extent io Last use was in the readahead code that got removed by f26c9238602856 ("btrfs: remove reada infrastructure"). 
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 986f15022fef..ca09fc31e2de 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -263,7 +263,6 @@ void free_extent_buffer(struct extent_buffer *eb); void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 #define WAIT_COMPLETE 1 -#define WAIT_PAGE_LOCK 2 int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, const struct btrfs_tree_parent_check *parent_check); static inline void wait_on_extent_buffer_writeback(struct extent_buffer *eb) -- cgit v1.2.3 From 248c4ff3935252a82504c55cfd3592e413575bd0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:43 +0100 Subject: btrfs: split waiting from read_extent_buffer_pages(), drop parameter wait There are only 2 WAIT_* values left for wait parameter, we can encode this to the function name if the waiting functionality is split. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 27 +++++++++++++++++---------- fs/btrfs/extent_io.h | 7 ++++--- 3 files changed, 22 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ef3121b55c50..f09db62e61a1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -226,7 +226,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); + ret = read_extent_buffer_pages(eb, mirror_num, check); if (!ret) break; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4ccf629d2127..d7a63b2e33b0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3479,8 +3479,8 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) bio_put(&bbio->bio); } -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, - const struct btrfs_tree_parent_check *check) +int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *check) { struct btrfs_bio *bbio; bool ret; @@ -3498,7 +3498,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, /* Someone else is already reading the buffer, just wait for it. 
*/ if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) - goto done; + return 0; /* * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above @@ -3538,14 +3538,21 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, } } btrfs_submit_bbio(bbio, mirror_num); + return 0; +} -done: - if (wait == WAIT_COMPLETE) { - wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); - if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return -EIO; - } +int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *check) +{ + int ret; + ret = read_extent_buffer_pages_nowait(eb, mirror_num, check); + if (ret < 0) + return ret; + + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return -EIO; return 0; } @@ -4276,7 +4283,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, return; } - ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check); + ret = read_extent_buffer_pages_nowait(eb, 0, &check); if (ret < 0) free_extent_buffer_stale(eb); else diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index ca09fc31e2de..6c5328bfabc2 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -261,10 +261,11 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); void free_extent_buffer(struct extent_buffer *eb); void free_extent_buffer_stale(struct extent_buffer *eb); -#define WAIT_NONE 0 -#define WAIT_COMPLETE 1 -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, +int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, const struct btrfs_tree_parent_check *parent_check); +int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *parent_check); + static inline void wait_on_extent_buffer_writeback(struct extent_buffer *eb) { wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, -- cgit v1.2.3 From ef8c0047aac932bd62a86cc3d5d66d328154fffe Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:45 +0100 Subject: btrfs: remove redundant variables from __process_folios_contig() and lock_delalloc_folios() Same pattern in both functions, we really only use index, start_index is redundant. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d7a63b2e33b0..c068a442753c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -198,9 +198,8 @@ static void __process_folios_contig(struct address_space *mapping, u64 end, unsigned long page_ops) { struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); - pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; - pgoff_t index = start_index; struct folio_batch fbatch; int i; @@ -242,9 +241,8 @@ static noinline int lock_delalloc_folios(struct inode *inode, { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct address_space *mapping = inode->i_mapping; - pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; - pgoff_t index = start_index; u64 processed_end = start; struct folio_batch fbatch; -- cgit v1.2.3 From 311473984c56dfa6cadfec9690f0b5c372ea15fc Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:51 +0100 Subject: btrfs: async-thread: rename DFT_THRESHOLD to DEFAULT_THRESHOLD Rename the macro so it's obvious what it means. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/async-thread.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 361a866c1995..a4c51600a408 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -18,7 +18,7 @@ enum { }; #define NO_THRESHOLD (-1) -#define DFT_THRESHOLD (32) +#define DEFAULT_THRESHOLD (32) struct btrfs_workqueue { struct workqueue_struct *normal_wq; @@ -94,9 +94,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, ret->limit_active = limit_active; if (thresh == 0) - thresh = DFT_THRESHOLD; + thresh = DEFAULT_THRESHOLD; /* For low threshold, disabling threshold is a better choice */ - if (thresh < DFT_THRESHOLD) { + if (thresh < DEFAULT_THRESHOLD) { ret->current_active = limit_active; ret->thresh = NO_THRESHOLD; } else { -- cgit v1.2.3 From 23ebf63925989adbe4c4277c8e9b04e0a37f6005 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 10:50:29 +0100 Subject: xfs: mark xfs_dir_isempty static And return bool instead of a boolean condition as int. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_dir2.c | 6 +++--- fs/xfs/libxfs/xfs_dir2.h | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 202468223bf9..81aaef2f495e 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -197,7 +197,7 @@ xfs_da_unmount( /* * Return 1 if directory contains only "." and "..". */ -int +static bool xfs_dir_isempty( xfs_inode_t *dp) { @@ -205,9 +205,9 @@ xfs_dir_isempty( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (dp->i_disk_size == 0) /* might happen during shutdown. 
*/ - return 1; + return true; if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) - return 0; + return false; sfp = dp->i_df.if_data; return !sfp->count; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 576068ed81fa..a6594a5a941d 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -58,7 +58,6 @@ extern void xfs_dir_startup(void); extern int xfs_da_mount(struct xfs_mount *mp); extern void xfs_da_unmount(struct xfs_mount *mp); -extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, -- cgit v1.2.3 From 415dee1e06da431f3d314641ceecb9018bb6fa53 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 10:50:30 +0100 Subject: xfs: remove XFS_ILOG_NONCORE XFS_ILOG_NONCORE is not used in the kernel code or xfsprogs, remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index ec7157eaba5f..a472ac2e45d0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -359,12 +359,6 @@ struct xfs_inode_log_format_32 { */ #define XFS_ILOG_IVERSION 0x8000 -#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ - XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ - XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \ - XFS_ILOG_AOWNER) - #define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT) -- cgit v1.2.3 From 471511d6ef7d00b40e65902ff47acfc194c6a952 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 10:50:31 +0100 Subject: xfs: remove the t_magic field in struct xfs_trans The t_magic field is only ever assigned to, but never read. Remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_trans.c | 2 -- fs/xfs/xfs_trans.h | 1 - 2 files changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index f53f82456288..c6657072361a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -100,7 +100,6 @@ xfs_trans_dup( /* * Initialize the new transaction structure. */ - ntp->t_magic = XFS_TRANS_HEADER_MAGIC; ntp->t_mountp = tp->t_mountp; INIT_LIST_HEAD(&ntp->t_items); INIT_LIST_HEAD(&ntp->t_busy); @@ -275,7 +274,6 @@ retry: ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || xfs_has_lazysbcount(mp)); - tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; tp->t_mountp = mp; INIT_LIST_HEAD(&tp->t_items); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 71c2e82e4dad..2b366851e9a4 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -122,7 +122,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, * This is the structure maintained for every active transaction. 
*/ typedef struct xfs_trans { - unsigned int t_magic; /* magic number */ unsigned int t_log_res; /* amt of log space resvd */ unsigned int t_log_count; /* count for perm log res */ unsigned int t_blk_res; /* # of blocks resvd */ -- cgit v1.2.3 From efebe42d95fbba91dca6e3e32cb9e0612eb56de5 Mon Sep 17 00:00:00 2001 From: Long Li Date: Sat, 11 Jan 2025 15:05:44 +0800 Subject: xfs: fix mount hang during primary superblock recovery failure When mounting an image containing a log with sb modifications that require log replay, the mount process hang all the time and stack as follows: [root@localhost ~]# cat /proc/557/stack [<0>] xfs_buftarg_wait+0x31/0x70 [<0>] xfs_buftarg_drain+0x54/0x350 [<0>] xfs_mountfs+0x66e/0xe80 [<0>] xfs_fs_fill_super+0x7f1/0xec0 [<0>] get_tree_bdev_flags+0x186/0x280 [<0>] get_tree_bdev+0x18/0x30 [<0>] xfs_fs_get_tree+0x1d/0x30 [<0>] vfs_get_tree+0x2d/0x110 [<0>] path_mount+0xb59/0xfc0 [<0>] do_mount+0x92/0xc0 [<0>] __x64_sys_mount+0xc2/0x160 [<0>] x64_sys_call+0x2de4/0x45c0 [<0>] do_syscall_64+0xa7/0x240 [<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e During log recovery, while updating the in-memory superblock from the primary SB buffer, if an error is encountered, such as superblock corruption occurs or some other reasons, we will proceed to out_release and release the xfs_buf. However, this is insufficient because the xfs_buf's log item has already been initialized and the xfs_buf is held by the buffer log item as follows, the xfs_buf will not be released, causing the mount thread to hang. xlog_recover_do_primary_sb_buffer xlog_recover_do_reg_buffer xlog_recover_validate_buf_type xfs_buf_item_init(bp, mp) The solution is straightforward, we simply need to allow it to be handled by the normal buffer write process. The filesystem will be shutdown before the submission of buffer_list in xlog_do_recovery_pass(), ensuring the correct release of the xfs_buf as follows: xlog_do_recovery_pass error = xlog_recover_process xlog_recover_process_data xlog_recover_process_ophdr xlog_recovery_process_trans ... xlog_recover_buf_commit_pass2 error = xlog_recover_do_primary_sb_buffer //Encounter error and return if (error) goto out_writebuf ... out_writebuf: xfs_buf_delwri_queue(bp, buffer_list) //add bp to list return error ... if (!list_empty(&buffer_list)) if (error) xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); //shutdown first xfs_buf_delwri_submit(&buffer_list); //submit buffers in list __xfs_buf_submit if (bp->b_mount->m_log && xlog_is_shutdown(bp->b_mount->m_log)) xfs_buf_ioend_fail(bp) //release bp correctly Fixes: 6a18765b54e2 ("xfs: update the file system geometry after recoverying superblock buffers") Cc: stable@vger.kernel.org # v6.12 Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf_item_recover.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index b05d5b81f642..05a2f6927c12 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -1087,7 +1087,7 @@ xlog_recover_buf_commit_pass2( error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f, current_lsn); if (error) - goto out_release; + goto out_writebuf; /* Update the rt superblock if we have one. 
*/ if (xfs_has_rtsb(mp) && mp->m_rtsb_bp) { @@ -1104,6 +1104,15 @@ xlog_recover_buf_commit_pass2( xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); } + /* + * Buffer held by buf log item during 'normal' buffer recovery must + * be committed through buffer I/O submission path to ensure proper + * release. When error occurs during sb buffer recovery, log shutdown + * will be done before submitting buffer list so that buffers can be + * released correctly through ioend failure path. + */ +out_writebuf: + /* * Perform delayed write on the buffer. Asynchronous writes will be * slower when taking into account all the buffers to be flushed. -- cgit v1.2.3 From 99fc33d16b2405cbc753fd30f93cd413d7d1b5fd Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 9 Dec 2024 19:42:41 +0800 Subject: xfs: clean up xfs_end_ioend() to reuse local variables Use already initialized local variables 'offset' and 'size' instead of accessing ioend members directly in xfs_setfilesize() call. This is just a code cleanup with no functional changes. Signed-off-by: Long Li Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 559a3a577097..67877c36ed11 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -131,7 +131,7 @@ xfs_end_ioend( error = xfs_iomap_write_unwritten(ip, offset, size, false); if (!error && xfs_ioend_is_append(ioend)) - error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); + error = xfs_setfilesize(ip, offset, size); done: iomap_finish_ioends(ioend, error); memalloc_nofs_restore(nofs_flag); -- cgit v1.2.3 From adcaff355bd8abb9b5097cc72339fb9cbf2eefda Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 23 Dec 2024 19:45:10 +0800 Subject: xfs: remove redundant update for ticket->t_curr_res in xfs_log_ticket_regrant The current reservation of the log ticket has already been updated a few lines above in xfs_log_ticket_regrant(), so there is no need to update it again. This is just a code cleanup with no functional changes. Signed-off-by: Long Li Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_log.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 05daad8a8d34..f8851ff835de 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2744,8 +2744,6 @@ xfs_log_ticket_regrant( if (!ticket->t_cnt) { xlog_grant_add_space(&log->l_reserve_head, ticket->t_unit_res); trace_xfs_log_ticket_regrant_exit(log, ticket); - - ticket->t_curr_res = ticket->t_unit_res; } xfs_log_ticket_put(ticket); -- cgit v1.2.3 From 09f7680dea8760d48dd5b8b5288a388fec122275 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 23 Dec 2024 19:45:11 +0800 Subject: xfs: remove bp->b_error check in xfs_attr3_root_inactive The b_error check right after xfs_trans_get_buf() is redundant: 1) If the buffer is found in transaction via xfs_trans_buf_item_match(), any corrupted metadata error would have already been exposed during previous reads like xfs_da3_node_read(). 2) If the buffer is obtained via xfs_buf_get_map(): - It's called without XBF_READ flag, so won't return buffer with b_error set, since xfs_buf_get_map() will clear it anyway. - Buffer found in cache normally won't have error since previous reads had checked it, unless someone corrupts the buffer and the AIL pushes it out to disk while the buffer's unlocked. But in this case, AIL will shut down the log. 
Remove this redundant check to simplify the code, make the code consistent with most other xfs_trans_get_buf() callers in XFS. Signed-off-by: Long Li Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_attr_inactive.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 24fb12986a56..319004bf089f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -305,11 +305,6 @@ xfs_attr3_root_inactive( XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &bp); if (error) return error; - error = bp->b_error; - if (error) { - xfs_trans_brelse(*trans, bp); - return error; - } xfs_trans_binval(*trans, bp); /* remove from cache */ /* * Commit the invalidate and start the next transaction. -- cgit v1.2.3 From f4752daf472b52a376f38243436c67b188a5eddf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 05:26:26 +0100 Subject: xfs: fix the comment above xfs_discard_endio pagb_lock has been replaced with eb_lock. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_discard.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index c4bd145f5ec1..3f2403a7b49c 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -90,7 +90,7 @@ xfs_discard_endio_work( /* * Queue up the actual completion to a thread to avoid IRQ-safe locking for - * pagb_lock. + * eb_lock. */ static void xfs_discard_endio( -- cgit v1.2.3 From 72843ca62417a0587ca98791b172e4c3b3f8d3a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 05:32:58 +0100 Subject: xfs: don't take m_sb_lock in xfs_fs_statfs The only non-constant value read under m_sb_lock in xfs_fs_statfs is sb_dblocks, and it could become stale right after dropping the lock anyway. Remove the thus pointless lock section. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 7c3f996cd39e..20cc00b992a6 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -850,11 +850,13 @@ xfs_fs_statfs( ifree = percpu_counter_sum(&mp->m_ifree); fdblocks = percpu_counter_sum(&mp->m_fdblocks); - spin_lock(&mp->m_sb_lock); statp->f_bsize = sbp->sb_blocksize; lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; + /* + * sb_dblocks can change during growfs, but nothing cares about reporting + * the old or new value during growfs. + */ statp->f_blocks = sbp->sb_dblocks - lsize; - spin_unlock(&mp->m_sb_lock); /* make sure statp->f_bfree does not underflow */ statp->f_bfree = max_t(int64_t, 0, -- cgit v1.2.3 From dd324cb79e54d3a61621f09c346ee050315a4d2e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 05:32:59 +0100 Subject: xfs: refactor xfs_fs_statfs Split out helpers for data, rt data and inode related information, and assigning f_bavail once instead of in three places. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_qm_bhv.c | 1 - fs/xfs/xfs_super.c | 132 +++++++++++++++++++++++++++++++--------------------- 2 files changed, 78 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index db5b8afd9d1b..37f1230e7584 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -40,7 +40,6 @@ xfs_fill_statvfs_from_dquot( statp->f_blocks = min(statp->f_blocks, limit); statp->f_bfree = min(statp->f_bfree, remaining); - statp->f_bavail = min(statp->f_bavail, remaining); } limit = dqp->q_ino.softlimit ? diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 20cc00b992a6..809ac6d1813c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -819,81 +819,105 @@ xfs_fs_sync_fs( return 0; } -STATIC int -xfs_fs_statfs( - struct dentry *dentry, - struct kstatfs *statp) +static xfs_extlen_t +xfs_internal_log_size( + struct xfs_mount *mp) { - struct xfs_mount *mp = XFS_M(dentry->d_sb); - xfs_sb_t *sbp = &mp->m_sb; - struct xfs_inode *ip = XFS_I(d_inode(dentry)); - uint64_t fakeinos, id; - uint64_t icount; - uint64_t ifree; - uint64_t fdblocks; - xfs_extlen_t lsize; - int64_t ffree; - - /* - * Expedite background inodegc but don't wait. We do not want to block - * here waiting hours for a billion extent file to be truncated. - */ - xfs_inodegc_push(mp); - - statp->f_type = XFS_SUPER_MAGIC; - statp->f_namelen = MAXNAMELEN - 1; - - id = huge_encode_dev(mp->m_ddev_targp->bt_dev); - statp->f_fsid = u64_to_fsid(id); + if (!mp->m_sb.sb_logstart) + return 0; + return mp->m_sb.sb_logblocks; +} - icount = percpu_counter_sum(&mp->m_icount); - ifree = percpu_counter_sum(&mp->m_ifree); - fdblocks = percpu_counter_sum(&mp->m_fdblocks); +static void +xfs_statfs_data( + struct xfs_mount *mp, + struct kstatfs *st) +{ + int64_t fdblocks = + percpu_counter_sum(&mp->m_fdblocks); - statp->f_bsize = sbp->sb_blocksize; - lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; + /* make sure st->f_bfree does not underflow */ + st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp)); /* * sb_dblocks can change during growfs, but nothing cares about reporting * the old or new value during growfs. */ - statp->f_blocks = sbp->sb_dblocks - lsize; + st->f_blocks = mp->m_sb.sb_dblocks - xfs_internal_log_size(mp); +} + +/* + * When stat(v)fs is called on a file with the realtime bit set or a directory + * with the rtinherit bit, report freespace information for the RT device + * instead of the main data device. 
+ */ +static void +xfs_statfs_rt( + struct xfs_mount *mp, + struct kstatfs *st) +{ + st->f_bfree = xfs_rtbxlen_to_blen(mp, + percpu_counter_sum_positive(&mp->m_frextents)); + st->f_blocks = mp->m_sb.sb_rblocks; +} - /* make sure statp->f_bfree does not underflow */ - statp->f_bfree = max_t(int64_t, 0, - fdblocks - xfs_fdblocks_unavailable(mp)); - statp->f_bavail = statp->f_bfree; +static void +xfs_statfs_inodes( + struct xfs_mount *mp, + struct kstatfs *st) +{ + uint64_t icount = percpu_counter_sum(&mp->m_icount); + uint64_t ifree = percpu_counter_sum(&mp->m_ifree); + uint64_t fakeinos = XFS_FSB_TO_INO(mp, st->f_bfree); - fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); - statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); + st->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); if (M_IGEO(mp)->maxicount) - statp->f_files = min_t(typeof(statp->f_files), - statp->f_files, + st->f_files = min_t(typeof(st->f_files), st->f_files, M_IGEO(mp)->maxicount); /* If sb_icount overshot maxicount, report actual allocation */ - statp->f_files = max_t(typeof(statp->f_files), - statp->f_files, - sbp->sb_icount); + st->f_files = max_t(typeof(st->f_files), st->f_files, + mp->m_sb.sb_icount); - /* make sure statp->f_ffree does not underflow */ - ffree = statp->f_files - (icount - ifree); - statp->f_ffree = max_t(int64_t, ffree, 0); + /* Make sure st->f_ffree does not underflow */ + st->f_ffree = max_t(int64_t, 0, st->f_files - (icount - ifree)); +} - if (XFS_IS_REALTIME_MOUNT(mp) && - (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) { - s64 freertx; +STATIC int +xfs_fs_statfs( + struct dentry *dentry, + struct kstatfs *st) +{ + struct xfs_mount *mp = XFS_M(dentry->d_sb); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); - statp->f_blocks = sbp->sb_rblocks; - freertx = percpu_counter_sum_positive(&mp->m_frextents); - statp->f_bavail = statp->f_bfree = - xfs_rtbxlen_to_blen(mp, freertx); - } + /* + * Expedite background inodegc but don't wait. We do not want to block + * here waiting hours for a billion extent file to be truncated. + */ + xfs_inodegc_push(mp); + + st->f_type = XFS_SUPER_MAGIC; + st->f_namelen = MAXNAMELEN - 1; + st->f_bsize = mp->m_sb.sb_blocksize; + st->f_fsid = u64_to_fsid(huge_encode_dev(mp->m_ddev_targp->bt_dev)); + + xfs_statfs_data(mp, st); + xfs_statfs_inodes(mp, st); + + if (XFS_IS_REALTIME_MOUNT(mp) && + (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) + xfs_statfs_rt(mp, st); if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) && ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) - xfs_qm_statvfs(ip, statp); + xfs_qm_statvfs(ip, st); + /* + * XFS does not distinguish between blocks available to privileged and + * unprivileged users. + */ + st->f_bavail = st->f_bfree; return 0; } -- cgit v1.2.3 From 183d988ae9e7ada9d7d4333e2289256e74a5ab5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 05:31:20 +0100 Subject: xfs: constify feature checks They will eventually be needed to be const for zoned growfs, but even now having such simpler helpers as const as possible is a good thing. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_rtgroup.c | 2 +- fs/xfs/scrub/scrub.h | 2 +- fs/xfs/xfs_mount.h | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index a6468e591232..d84d32f1b48f 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -338,7 +338,7 @@ struct xfs_rtginode_ops { unsigned int fmt_mask; /* all valid data fork formats */ /* Does the fs have this feature? */ - bool (*enabled)(struct xfs_mount *mp); + bool (*enabled)(const struct xfs_mount *mp); /* Create this rtgroup metadata inode and initialize it. */ int (*create)(struct xfs_rtgroup *rtg, diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index a1086f1f06d0..a3f1abc91390 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -96,7 +96,7 @@ struct xchk_meta_ops { int (*repair_eval)(struct xfs_scrub *sc); /* Decide if we even have this piece of metadata. */ - bool (*has)(struct xfs_mount *); + bool (*has)(const struct xfs_mount *); /* type describing required/allowed inputs */ enum xchk_type type; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 9a1516080e63..fbed172d6770 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -357,7 +357,7 @@ typedef struct xfs_mount { #define XFS_FEAT_NOUUID (1ULL << 63) /* ignore uuid during mount */ #define __XFS_HAS_FEAT(name, NAME) \ -static inline bool xfs_has_ ## name (struct xfs_mount *mp) \ +static inline bool xfs_has_ ## name (const struct xfs_mount *mp) \ { \ return mp->m_features & XFS_FEAT_ ## NAME; \ } @@ -393,25 +393,25 @@ __XFS_HAS_FEAT(large_extent_counts, NREXT64) __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE) __XFS_HAS_FEAT(metadir, METADIR) -static inline bool xfs_has_rtgroups(struct xfs_mount *mp) +static inline bool xfs_has_rtgroups(const struct xfs_mount *mp) { /* all metadir file systems also allow rtgroups */ return xfs_has_metadir(mp); } -static inline bool xfs_has_rtsb(struct xfs_mount *mp) +static inline bool xfs_has_rtsb(const struct xfs_mount *mp) { /* all rtgroups filesystems with an rt section have an rtsb */ return xfs_has_rtgroups(mp) && xfs_has_realtime(mp); } -static inline bool xfs_has_rtrmapbt(struct xfs_mount *mp) +static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp) { return xfs_has_rtgroups(mp) && xfs_has_realtime(mp) && xfs_has_rmapbt(mp); } -static inline bool xfs_has_rtreflink(struct xfs_mount *mp) +static inline bool xfs_has_rtreflink(const struct xfs_mount *mp) { return xfs_has_metadir(mp) && xfs_has_realtime(mp) && xfs_has_reflink(mp); -- cgit v1.2.3 From 9d9b72472631262b35157f1a650f066c0e11c2bb Mon Sep 17 00:00:00 2001 From: Mirsad Todorovac Date: Tue, 17 Dec 2024 23:58:12 +0100 Subject: xfs/libxfs: replace kmalloc() and memcpy() with kmemdup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The source static analysis tool gave the following advice: ./fs/xfs/libxfs/xfs_dir2.c:382:15-22: WARNING opportunity for kmemdup → 382 args->value = kmalloc(len, 383 GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL); 384 if (!args->value) 385 return -ENOMEM; 386 → 387 memcpy(args->value, name, len); 388 args->valuelen = len; 389 return -EEXIST; Replacing kmalloc() + memcpy() with kmemdup() doesn't change semantics. Original code works without fault, so this is not a bug fix but a proposed improvement. 
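For illustration, the generic shape of such a conversion looks like the sketch below (variable names are made up, not taken from the patch); kmemdup() returns NULL on allocation failure just as kmalloc() does, so the error handling is unchanged:

        /* Before: allocate, check, then copy. */
        dst = kmalloc(len, GFP_KERNEL);
        if (!dst)
                return -ENOMEM;
        memcpy(dst, src, len);

        /* After: a single call that allocates and copies. */
        dst = kmemdup(src, len, GFP_KERNEL);
        if (!dst)
                return -ENOMEM;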
Link: https://lwn.net/Articles/198928/ Fixes: 94a69db2367ef ("xfs: use __GFP_NOLOCKDEP instead of GFP_NOFS") Fixes: 384f3ced07efd ("[XFS] Return case-insensitive match for dentry cache") Fixes: 2451337dd0439 ("xfs: global error sign conversion") Cc: Carlos Maiolino Cc: Darrick J. Wong Cc: Chandan Babu R Cc: Dave Chinner Cc: linux-xfs@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: "Darrick J. Wong" Signed-off-by: Mirsad Todorovac Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_dir2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 81aaef2f495e..1775abcfa04d 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -379,12 +379,11 @@ xfs_dir_cilookup_result( !(args->op_flags & XFS_DA_OP_CILOOKUP)) return -EEXIST; - args->value = kmalloc(len, + args->value = kmemdup(name, len, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL); if (!args->value) return -ENOMEM; - memcpy(args->value, name, len); args->valuelen = len; return -EEXIST; } -- cgit v1.2.3 From 72dad8e377afa50435940adfb697e070d3556670 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 12 Dec 2024 16:43:55 +1030 Subject: btrfs: fix double accounting race when btrfs_run_delalloc_range() failed [BUG] When running btrfs with block size (4K) smaller than page size (64K, aarch64), there is a very high chance to crash the kernel at generic/750, with the following messages: (before the call traces, there are 3 extra debug messages added) BTRFS warning (device dm-3): read-write for sector size 4096 with page size 65536 is experimental BTRFS info (device dm-3): checking UUID tree hrtimer: interrupt took 5451385 ns BTRFS error (device dm-3): cow_file_range failed, root=4957 inode=257 start=1605632 len=69632: -28 BTRFS error (device dm-3): run_delalloc_nocow failed, root=4957 inode=257 start=1605632 len=69632: -28 BTRFS error (device dm-3): failed to run delalloc range, root=4957 ino=257 folio=1572864 submit_bitmap=8-15 start=1605632 len=69632: -28 ------------[ cut here ]------------ WARNING: CPU: 2 PID: 3020984 at ordered-data.c:360 can_finish_ordered_extent+0x370/0x3b8 [btrfs] CPU: 2 UID: 0 PID: 3020984 Comm: kworker/u24:1 Tainted: G OE 6.13.0-rc1-custom+ #89 Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Hardware name: QEMU KVM Virtual Machine, BIOS unknown 2/2/2022 Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs] pc : can_finish_ordered_extent+0x370/0x3b8 [btrfs] lr : can_finish_ordered_extent+0x1ec/0x3b8 [btrfs] Call trace: can_finish_ordered_extent+0x370/0x3b8 [btrfs] (P) can_finish_ordered_extent+0x1ec/0x3b8 [btrfs] (L) btrfs_mark_ordered_io_finished+0x130/0x2b8 [btrfs] extent_writepage+0x10c/0x3b8 [btrfs] extent_write_cache_pages+0x21c/0x4e8 [btrfs] btrfs_writepages+0x94/0x160 [btrfs] do_writepages+0x74/0x190 filemap_fdatawrite_wbc+0x74/0xa0 start_delalloc_inodes+0x17c/0x3b0 [btrfs] btrfs_start_delalloc_roots+0x17c/0x288 [btrfs] shrink_delalloc+0x11c/0x280 [btrfs] flush_space+0x288/0x328 [btrfs] btrfs_async_reclaim_data_space+0x180/0x228 [btrfs] process_one_work+0x228/0x680 worker_thread+0x1bc/0x360 kthread+0x100/0x118 ret_from_fork+0x10/0x20 ---[ end trace 0000000000000000 ]--- BTRFS critical (device dm-3): bad ordered extent accounting, root=4957 ino=257 OE offset=1605632 OE len=16384 to_dec=16384 left=0 BTRFS critical (device dm-3): bad ordered extent accounting, root=4957 ino=257 OE offset=1622016 OE len=12288 to_dec=12288 left=0 Unable to handle kernel NULL pointer dereference at virtual 
address 0000000000000008 BTRFS critical (device dm-3): bad ordered extent accounting, root=4957 ino=257 OE offset=1634304 OE len=8192 to_dec=4096 left=0 CPU: 1 UID: 0 PID: 3286940 Comm: kworker/u24:3 Tainted: G W OE 6.13.0-rc1-custom+ #89 Hardware name: QEMU KVM Virtual Machine, BIOS unknown 2/2/2022 Workqueue: btrfs_work_helper [btrfs] (btrfs-endio-write) pstate: 404000c5 (nZcv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : process_one_work+0x110/0x680 lr : worker_thread+0x1bc/0x360 Call trace: process_one_work+0x110/0x680 (P) worker_thread+0x1bc/0x360 (L) worker_thread+0x1bc/0x360 kthread+0x100/0x118 ret_from_fork+0x10/0x20 Code: f84086a1 f9000fe1 53041c21 b9003361 (f9400661) ---[ end trace 0000000000000000 ]--- Kernel panic - not syncing: Oops: Fatal exception SMP: stopping secondary CPUs SMP: failed to stop secondary CPUs 2-3 Dumping ftrace buffer: (ftrace buffer empty) Kernel Offset: 0x275bb9540000 from 0xffff800080000000 PHYS_OFFSET: 0xffff8fbba0000000 CPU features: 0x100,00000070,00801250,8201720b [CAUSE] The above warning is triggered immediately after the delalloc range failure, this happens in the following sequence: - Range [1568K, 1636K) is dirty 1536K 1568K 1600K 1636K 1664K | |/////////|////////| | Where 1536K, 1600K and 1664K are page boundaries (64K page size) - Enter extent_writepage() for page 1536K - Enter run_delalloc_nocow() with locked page 1536K and range [1568K, 1636K) This is due to the inode having preallocated extents. - Enter cow_file_range() with locked page 1536K and range [1568K, 1636K) - btrfs_reserve_extent() only reserved two extents The main loop of cow_file_range() only reserved two data extents, Now we have: 1536K 1568K 1600K 1636K 1664K | |<-->|<--->|/|///////| | 1584K 1596K Range [1568K, 1596K) has an ordered extent reserved. - btrfs_reserve_extent() failed inside cow_file_range() for file offset 1596K This is already a bug in our space reservation code, but for now let's focus on the error handling path. Now cow_file_range() returned -ENOSPC. - btrfs_run_delalloc_range() do error cleanup <<< ROOT CAUSE Call btrfs_cleanup_ordered_extents() with locked folio 1536K and range [1568K, 1636K) Function btrfs_cleanup_ordered_extents() normally needs to skip the ranges inside the folio, as it will normally be cleaned up by extent_writepage(). Such split error handling is already problematic in the first place. What's worse is the folio range skipping itself, which is not taking subpage cases into consideration at all, it will only skip the range if the page start >= the range start. In our case, the page start < the range start, since for subpage cases we can have delalloc ranges inside the folio but not covering the folio. So it doesn't skip the page range at all. This means all the ordered extents, both [1568K, 1584K) and [1584K, 1596K) will be marked as IOERR. And these two ordered extents have no more pending ios, they are marked finished, and *QUEUED* to be deleted from the io tree. - extent_writepage() do error cleanup Call btrfs_mark_ordered_io_finished() for the range [1536K, 1600K). Although ranges [1568K, 1584K) and [1584K, 1596K) are finished, the deletion from io tree is async, it may or may not happen at this time. If the ranges have not yet been removed, we will do double cleaning on those ranges, triggering the above ordered extent warnings. In theory there are other bugs, like the cleanup in extent_writepage() can cause double accounting on ranges that are submitted asynchronously (compression for example). 
But that's much harder to trigger because normally we do not mix regular and compression delalloc ranges. [FIX] The folio range split is already buggy and not subpage compatible, it was introduced a long time ago where subpage support was not even considered. So instead of splitting the ordered extents cleanup into the folio range and out of folio range, do all the cleanup inside writepage_delalloc(). - Pass @NULL as locked_folio for btrfs_cleanup_ordered_extents() in btrfs_run_delalloc_range() - Skip the btrfs_cleanup_ordered_extents() if writepage_delalloc() failed So all ordered extents are only cleaned up by btrfs_run_delalloc_range(). - Handle the ranges that already have ordered extents allocated If part of the folio already has ordered extent allocated, and btrfs_run_delalloc_range() failed, we also need to cleanup that range. Now we have a concentrated error handling for ordered extents during btrfs_run_delalloc_range(). Fixes: d1051d6ebf8e ("btrfs: Fix error handling in btrfs_cleanup_ordered_extents") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 59 ++++++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/inode.c | 3 +-- 2 files changed, 49 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c068a442753c..bc2bd103c8cc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1134,14 +1134,19 @@ static bool find_next_delalloc_bitmap(struct folio *folio, } /* - * helper for extent_writepage(), doing all of the delayed allocation setup. + * Do all of the delayed allocation setup. * - * This returns 1 if btrfs_run_delalloc_range function did all the work required - * to write the page (copy into inline extent). In this case the IO has - * been started and the page is already unlocked. + * Return >0 if all the dirty blocks are submitted async (compression) or inlined. + * The @folio should no longer be touched (treat it as already unlocked). * - * This returns 0 if all went well (page still locked) - * This returns < 0 if there were errors (page still locked) + * Return 0 if there is still dirty block that needs to be submitted through + * extent_writepage_io(). + * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be + * submitted, and @folio is still kept locked. + * + * Return <0 if there is any error hit. + * Any allocated ordered extent range covering this folio will be marked + * finished (IOERR), and @folio is still kept locked. */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct folio *folio, @@ -1159,6 +1164,16 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, * last delalloc end. */ u64 last_delalloc_end = 0; + /* + * The range end (exclusive) of the last successfully finished delalloc + * range. + * Any range covered by ordered extent must either be manually marked + * finished (error handling), or has IO submitted (and finish the + * ordered extent normally). + * + * This records the end of ordered extent cleanup if we hit an error. + */ + u64 last_finished_delalloc_end = page_start; u64 delalloc_start = page_start; u64 delalloc_end = page_end; u64 delalloc_to_write = 0; @@ -1227,11 +1242,19 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, found_len = last_delalloc_end + 1 - found_start; if (ret >= 0) { + /* + * Some delalloc range may be created by previous folios. 
+ * Thus we still need to clean up this range during error + * handling. + */ + last_finished_delalloc_end = found_start; /* No errors hit so far, run the current delalloc range. */ ret = btrfs_run_delalloc_range(inode, folio, found_start, found_start + found_len - 1, wbc); + if (ret >= 0) + last_finished_delalloc_end = found_start + found_len; } else { /* * We've hit an error during previous delalloc range, @@ -1266,8 +1289,22 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, delalloc_start = found_start + found_len; } - if (ret < 0) + /* + * It's possible we had some ordered extents created before we hit + * an error, cleanup non-async successfully created delalloc ranges. + */ + if (unlikely(ret < 0)) { + unsigned int bitmap_size = min( + (last_finished_delalloc_end - page_start) >> + fs_info->sectorsize_bits, + fs_info->sectors_per_page); + + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) + btrfs_mark_ordered_io_finished(inode, folio, + page_start + (bit << fs_info->sectorsize_bits), + fs_info->sectorsize, false); return ret; + } out: if (last_delalloc_end) delalloc_end = last_delalloc_end; @@ -1501,13 +1538,13 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl bio_ctrl->wbc->nr_to_write--; -done: - if (ret) { + if (ret) btrfs_mark_ordered_io_finished(inode, folio, page_start, PAGE_SIZE, !ret); - mapping_set_error(folio->mapping, ret); - } +done: + if (ret < 0) + mapping_set_error(folio->mapping, ret); /* * Only unlock ranges that are submitted. As there can be some async * submitted ranges inside the folio. diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1546f341f9a4..b81afe757f64 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2301,8 +2301,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_folio, start, - end - start + 1); + btrfs_cleanup_ordered_extents(inode, NULL, start, end - start + 1); return ret; } -- cgit v1.2.3 From 8bf334beb3496da3c3fbf3daf3856f7eec70dacc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 12 Dec 2024 16:43:56 +1030 Subject: btrfs: fix double accounting race when extent_writepage_io() failed [BUG] If submit_one_sector() failed inside extent_writepage_io() for sector size < page size cases (e.g. 4K sector size and 64K page size), then we can hit double ordered extent accounting error. This should be very rare, as submit_one_sector() only fails when we failed to grab the extent map, and such extent map should exist inside the memory and has been pinned. [CAUSE] For example we have the following folio layout: 0 4K 32K 48K 60K 64K |//| |//////| |///| Where |///| is the dirty range we need to writeback. The 3 different dirty ranges are submitted for regular COW. Now we hit the following sequence: - submit_one_sector() returned 0 for [0, 4K) - submit_one_sector() returned 0 for [32K, 48K) - submit_one_sector() returned error for [60K, 64K) - btrfs_mark_ordered_io_finished() called for the whole folio This will mark the following ranges as finished: * [0, 4K) * [32K, 48K) Both ranges have their IO already submitted, this cleanup will lead to double accounting. * [60K, 64K) That's the correct cleanup. The only good news is, this error is only theoretical, as the target extent map is always pinned, thus we should directly grab it from memory, other than reading it from the disk. 
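To make the double accounting concrete, the scenario above can be reduced to a tiny user-space model (4K sectors in a 64K folio, ordered extents only on the dirty ranges). This is purely illustrative C, not btrfs code; the names and the boolean "accounting" are stand-ins:

/* gcc -Wall -o oe_double oe_double.c -- toy model of the cleanup race */
#include <stdbool.h>
#include <stdio.h>

#define SECTORS 16                      /* one 64K folio of 4K sectors */

static bool has_oe[SECTORS];            /* sector covered by an ordered extent */
static bool finished[SECTORS];          /* ordered extent accounting already ran */
static bool submitted[SECTORS];         /* a bio was submitted for this sector */

static void finish_sector(int i, const char *who)
{
	if (!has_oe[i])
		return;                 /* nothing to account for here */
	if (finished[i]) {
		printf("WARN: sector %2d accounted twice (second time by %s)\n", i, who);
		return;
	}
	finished[i] = true;
}

int main(void)
{
	/* dirty ranges [0,4K), [32K,48K), [60K,64K) -> sectors 0, 8-11, 15 */
	has_oe[0] = has_oe[15] = true;
	for (int i = 8; i < 12; i++)
		has_oe[i] = true;

	/* submit_one_sector() succeeded for 0 and 8-11, failed for 15 */
	submitted[0] = true;
	for (int i = 8; i < 12; i++)
		submitted[i] = true;

	/* old error handling: finish the ordered extents of the WHOLE folio */
	for (int i = 0; i < SECTORS; i++)
		finish_sector(i, "whole-folio cleanup");

	/* bios of the submitted sectors complete later and account again */
	for (int i = 0; i < SECTORS; i++)
		if (submitted[i])
			finish_sector(i, "bio completion");
	return 0;
}

With the fix below, the error path only finishes sector 15 (the one whose submission failed), so the later bio completions are the first and only accounting for the sectors that were actually submitted.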
[FIX] Instead of calling btrfs_mark_ordered_io_finished() for the whole folio range, which can touch ranges we should not touch, instead move the error handling inside extent_writepage_io(). So that we can cleanup exact sectors that ought to be submitted but failed. This provides much more accurate cleanup, avoiding the double accounting. CC: stable@vger.kernel.org # 5.15+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bc2bd103c8cc..5014134b9aa2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1420,6 +1420,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; bool submitted_io = false; + bool error = false; const u64 folio_start = folio_pos(folio); u64 cur; int bit; @@ -1462,11 +1463,26 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, break; } ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); - if (ret < 0) - goto out; + if (unlikely(ret < 0)) { + /* + * bio_ctrl may contain a bio crossing several folios. + * Submit it immediately so that the bio has a chance + * to finish normally, other than marked as error. + */ + submit_one_bio(bio_ctrl); + /* + * Failed to grab the extent map which should be very rare. + * Since there is no bio submitted to finish the ordered + * extent, we have to manually finish this sector. + */ + btrfs_mark_ordered_io_finished(inode, folio, cur, + fs_info->sectorsize, false); + error = true; + continue; + } submitted_io = true; } -out: + /* * If we didn't submitted any sector (>= i_size), folio dirty get * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared @@ -1474,8 +1490,11 @@ out: * * Here we set writeback and clear for the range. If the full folio * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. + * + * If we hit any error, the corresponding sector will still be dirty + * thus no need to clear PAGECACHE_TAG_DIRTY. 
*/ - if (!submitted_io) { + if (!submitted_io && !error) { btrfs_folio_set_writeback(fs_info, folio, start, len); btrfs_folio_clear_writeback(fs_info, folio, start, len); } @@ -1495,7 +1514,6 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl { struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); struct btrfs_fs_info *fs_info = inode->root->fs_info; - const u64 page_start = folio_pos(folio); int ret; size_t pg_offset; loff_t i_size = i_size_read(&inode->vfs_inode); @@ -1538,10 +1556,6 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl bio_ctrl->wbc->nr_to_write--; - if (ret) - btrfs_mark_ordered_io_finished(inode, folio, - page_start, PAGE_SIZE, !ret); - done: if (ret < 0) mapping_set_error(folio->mapping, ret); @@ -2314,11 +2328,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f if (ret == 1) goto next_page; - if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, - cur, cur_len, !ret); + if (ret) mapping_set_error(mapping, ret); - } btrfs_folio_end_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; -- cgit v1.2.3 From a7858d5c36cae52eaf3048490b05c0b19086073b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 12 Dec 2024 16:43:57 +1030 Subject: btrfs: fix error handling of submit_uncompressed_range() [BUG] If we failed to compress the range, or cannot reserve a large enough data extent (e.g. too fragmented free space), we will fall back to submit_uncompressed_range(). But inside submit_uncompressed_range(), run_delalloc_cow() can also fail due to -ENOSPC or any other error. In that case there are 3 bugs in the error handling: 1) Double freeing for the same ordered extent This can lead to crash due to ordered extent double accounting 2) Start/end writeback without updating the subpage writeback bitmap 3) Unlock the folio without clear the subpage lock bitmap Both bugs 2) and 3) will crash the kernel if the btrfs block size is smaller than folio size, as the next time the folio gets writeback/lock updates, subpage will find the bitmap already have the range set, triggering an ASSERT(). [CAUSE] Bug 1) happens in the following call chain: submit_uncompressed_range() |- run_delalloc_cow() | |- cow_file_range() | |- btrfs_reserve_extent() | Failed with -ENOSPC or whatever error | |- btrfs_clean_up_ordered_extents() | |- btrfs_mark_ordered_io_finished() | Which cleans all the ordered extents in the async_extent range. | |- btrfs_mark_ordered_io_finished() Which cleans the folio range. The finished ordered extents may not be immediately removed from the ordered io tree, as they are removed inside a work queue. So the second btrfs_mark_ordered_io_finished() may find the finished but not-yet-removed ordered extents, and double free them. Furthermore, the second btrfs_mark_ordered_io_finished() is not subpage compatible, as it uses fixed folio_pos() with PAGE_SIZE, which can cover other ordered extents. Bugs 2) and 3) are more straightforward, btrfs just calls folio_unlock(), folio_start_writeback() and folio_end_writeback(), other than the helpers which handle subpage cases. [FIX] For bug 1) since the first btrfs_cleanup_ordered_extents() call is handling the whole range, we should not do the second btrfs_mark_ordered_io_finished() call. And for the first btrfs_cleanup_ordered_extents(), we no longer need to pass the @locked_page parameter, as we are already in the async extent context, thus will never rely on the error handling inside btrfs_run_delalloc_range(). 
So just let the btrfs_clean_up_ordered_extents() handle every folio equally. For bug 2) we should not even call folio_start_writeback()/folio_end_writeback() anymore. As the error handling protocol, cow_file_range() should clear dirty flag and start/finish the writeback for the whole range passed in. For bug 3) just change the folio_unlock() to btrfs_folio_end_lock() helper. Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b81afe757f64..ca50e72608d6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1128,19 +1128,10 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_folio, - start, end - start + 1); - if (locked_folio) { - const u64 page_start = folio_pos(locked_folio); - - folio_start_writeback(locked_folio); - folio_end_writeback(locked_folio); - btrfs_mark_ordered_io_finished(inode, locked_folio, - page_start, PAGE_SIZE, - !ret); - mapping_set_error(locked_folio->mapping, ret); - folio_unlock(locked_folio); - } + btrfs_cleanup_ordered_extents(inode, NULL, start, end - start + 1); + if (locked_folio) + btrfs_folio_end_lock(inode->root->fs_info, locked_folio, + start, async_extent->ram_size); } } -- cgit v1.2.3 From 06f364284794f149d2abc167c11d556cf20c954b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 12 Dec 2024 16:43:58 +1030 Subject: btrfs: do proper folio cleanup when cow_file_range() failed [BUG] When testing with COW fixup marked as BUG_ON() (this is involved with the new pin_user_pages*() change, which should not result new out-of-band dirty pages), I hit a crash triggered by the BUG_ON() from hitting COW fixup path. This BUG_ON() happens just after a failed btrfs_run_delalloc_range(): BTRFS error (device dm-2): failed to run delalloc range, root 348 ino 405 folio 65536 submit_bitmap 6-15 start 90112 len 106496: -28 ------------[ cut here ]------------ kernel BUG at fs/btrfs/extent_io.c:1444! Internal error: Oops - BUG: 00000000f2000800 [#1] SMP CPU: 0 UID: 0 PID: 434621 Comm: kworker/u24:8 Tainted: G OE 6.12.0-rc7-custom+ #86 Hardware name: QEMU KVM Virtual Machine, BIOS unknown 2/2/2022 Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs] pc : extent_writepage_io+0x2d4/0x308 [btrfs] lr : extent_writepage_io+0x2d4/0x308 [btrfs] Call trace: extent_writepage_io+0x2d4/0x308 [btrfs] extent_writepage+0x218/0x330 [btrfs] extent_write_cache_pages+0x1d4/0x4b0 [btrfs] btrfs_writepages+0x94/0x150 [btrfs] do_writepages+0x74/0x190 filemap_fdatawrite_wbc+0x88/0xc8 start_delalloc_inodes+0x180/0x3b0 [btrfs] btrfs_start_delalloc_roots+0x174/0x280 [btrfs] shrink_delalloc+0x114/0x280 [btrfs] flush_space+0x250/0x2f8 [btrfs] btrfs_async_reclaim_data_space+0x180/0x228 [btrfs] process_one_work+0x164/0x408 worker_thread+0x25c/0x388 kthread+0x100/0x118 ret_from_fork+0x10/0x20 Code: aa1403e1 9402f3ef aa1403e0 9402f36f (d4210000) ---[ end trace 0000000000000000 ]--- [CAUSE] That failure is mostly from cow_file_range(), where we can hit -ENOSPC. Although the -ENOSPC is already a bug related to our space reservation code, let's just focus on the error handling. 
For example, we have the following dirty range [0, 64K) of an inode, with 4K sector size and 4K page size: 0 16K 32K 48K 64K |///////////////////////////////////////| |#######################################| Where |///| means page are still dirty, and |###| means the extent io tree has EXTENT_DELALLOC flag. - Enter extent_writepage() for page 0 - Enter btrfs_run_delalloc_range() for range [0, 64K) - Enter cow_file_range() for range [0, 64K) - Function btrfs_reserve_extent() only reserved one 16K extent So we created extent map and ordered extent for range [0, 16K) 0 16K 32K 48K 64K |////////|//////////////////////////////| |<- OE ->|##############################| And range [0, 16K) has its delalloc flag cleared. But since we haven't yet submit any bio, involved 4 pages are still dirty. - Function btrfs_reserve_extent() returns with -ENOSPC Now we have to run error cleanup, which will clear all EXTENT_DELALLOC* flags and clear the dirty flags for the remaining ranges: 0 16K 32K 48K 64K |////////| | | | | Note that range [0, 16K) still has its pages dirty. - Some time later, writeback is triggered again for the range [0, 16K) since the page range still has dirty flags. - btrfs_run_delalloc_range() will do nothing because there is no EXTENT_DELALLOC flag. - extent_writepage_io() finds page 0 has no ordered flag Which falls into the COW fixup path, triggering the BUG_ON(). Unfortunately this error handling bug dates back to the introduction of btrfs. Thankfully with the abuse of COW fixup, at least it won't crash the kernel. [FIX] Instead of immediately unlocking the extent and folios, we keep the extent and folios locked until either erroring out or the whole delalloc range finished. When the whole delalloc range finished without error, we just unlock the whole range with PAGE_SET_ORDERED (and PAGE_UNLOCK for !keep_locked cases), with EXTENT_DELALLOC and EXTENT_LOCKED cleared. And the involved folios will be properly submitted, with their dirty flags cleared during submission. For the error path, it will be a little more complex: - The range with ordered extent allocated (range (1)) We only clear the EXTENT_DELALLOC and EXTENT_LOCKED, as the remaining flags are cleaned up by btrfs_mark_ordered_io_finished()->btrfs_finish_one_ordered(). For folios we finish the IO (clear dirty, start writeback and immediately finish the writeback) and unlock the folios. - The range with reserved extent but no ordered extent (range(2)) - The range we never touched (range(3)) For both range (2) and range(3) the behavior is not changed. Now even if cow_file_range() failed halfway with some successfully reserved extents/ordered extents, we will keep all folios clean, so there will be no future writeback triggered on them. CC: stable@vger.kernel.org Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 63 +++++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ca50e72608d6..9bb8c447cde1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1362,6 +1362,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode, alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); + /* + * We're not doing compressed IO, don't unlock the first page (which + * the caller expects to stay locked), don't clear any dirty bits and + * don't set any writeback bits. 
+ * + * Do set the Ordered (Private2) bit so we know this page was properly + * setup for writepage. + */ + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops |= PAGE_SET_ORDERED; + /* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. Normally writeback for relocation data @@ -1421,6 +1432,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode, file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; + /* + * Locked range will be released either during error clean up or + * after the whole range is finished. + */ lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, &cached); @@ -1466,21 +1481,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* - * We're not doing compressed IO, don't unlock the first page - * (which the caller expects to stay locked), don't clear any - * dirty bits and don't set any writeback bits - * - * Do set the Ordered flag so we know this page was - * properly setup for writepage. - */ - page_ops = (keep_locked ? 0 : PAGE_UNLOCK); - page_ops |= PAGE_SET_ORDERED; - - extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - locked_folio, &cached, - EXTENT_LOCKED | EXTENT_DELALLOC, - page_ops); if (num_bytes < cur_alloc_size) num_bytes = 0; else @@ -1497,6 +1497,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } + extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); done: if (done_offset) *done_offset = end; @@ -1517,35 +1519,30 @@ out_unlock: * We process each region below. */ - clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; - /* * For the range (1). We have already instantiated the ordered extents * for this region. They are cleaned up by * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are - * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | - * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup - * function. + * btrfs_run_delalloc_range(). + * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV + * are also handled by the cleanup function. * - * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_folio) to ensure all the pages are unlocked. + * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and + * finish the writeback of the involved folios, which will be never submitted. */ - if (keep_locked && orig_start < start) { + if (orig_start < start) { + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_folio, NULL, 0, page_ops); + locked_folio, NULL, clear_bits, page_ops); } - /* - * At this point we're unlocked, we want to make sure we're only - * clearing these flags under the extent lock, so lock the rest of the - * range and clear everything up. 
- */ - lock_extent(&inode->io_tree, start, end, NULL); + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * For the range (2). If we reserved an extent for our delalloc range -- cgit v1.2.3 From c2b47df81c8e20a8e8cd94f0d7df211137ae94ed Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 12 Dec 2024 16:43:59 +1030 Subject: btrfs: do proper folio cleanup when run_delalloc_nocow() failed [BUG] With CONFIG_DEBUG_VM set, test case generic/476 has some chance to crash with the following VM_BUG_ON_FOLIO(): BTRFS error (device dm-3): cow_file_range failed, start 1146880 end 1253375 len 106496 ret -28 BTRFS error (device dm-3): run_delalloc_nocow failed, start 1146880 end 1253375 len 106496 ret -28 page: refcount:4 mapcount:0 mapping:00000000592787cc index:0x12 pfn:0x10664 aops:btrfs_aops [btrfs] ino:101 dentry name(?):"f1774" flags: 0x2fffff80004028(uptodate|lru|private|node=0|zone=2|lastcpupid=0xfffff) page dumped because: VM_BUG_ON_FOLIO(!folio_test_locked(folio)) ------------[ cut here ]------------ kernel BUG at mm/page-writeback.c:2992! Internal error: Oops - BUG: 00000000f2000800 [#1] SMP CPU: 2 UID: 0 PID: 3943513 Comm: kworker/u24:15 Tainted: G OE 6.12.0-rc7-custom+ #87 Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Hardware name: QEMU KVM Virtual Machine, BIOS unknown 2/2/2022 Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs] pc : folio_clear_dirty_for_io+0x128/0x258 lr : folio_clear_dirty_for_io+0x128/0x258 Call trace: folio_clear_dirty_for_io+0x128/0x258 btrfs_folio_clamp_clear_dirty+0x80/0xd0 [btrfs] __process_folios_contig+0x154/0x268 [btrfs] extent_clear_unlock_delalloc+0x5c/0x80 [btrfs] run_delalloc_nocow+0x5f8/0x760 [btrfs] btrfs_run_delalloc_range+0xa8/0x220 [btrfs] writepage_delalloc+0x230/0x4c8 [btrfs] extent_writepage+0xb8/0x358 [btrfs] extent_write_cache_pages+0x21c/0x4e8 [btrfs] btrfs_writepages+0x94/0x150 [btrfs] do_writepages+0x74/0x190 filemap_fdatawrite_wbc+0x88/0xc8 start_delalloc_inodes+0x178/0x3a8 [btrfs] btrfs_start_delalloc_roots+0x174/0x280 [btrfs] shrink_delalloc+0x114/0x280 [btrfs] flush_space+0x250/0x2f8 [btrfs] btrfs_async_reclaim_data_space+0x180/0x228 [btrfs] process_one_work+0x164/0x408 worker_thread+0x25c/0x388 kthread+0x100/0x118 ret_from_fork+0x10/0x20 Code: 910a8021 a90363f7 a9046bf9 94012379 (d4210000) ---[ end trace 0000000000000000 ]--- [CAUSE] The first two lines of extra debug messages show the problem is caused by the error handling of run_delalloc_nocow(). E.g. we have the following dirtied range (4K blocksize 4K page size): 0 16K 32K |//////////////////////////////////////| | Pre-allocated | And the range [0, 16K) has a preallocated extent. - Enter run_delalloc_nocow() for range [0, 16K) Which found range [0, 16K) is preallocated, can do the proper NOCOW write. - Enter fallback_to_fow() for range [16K, 32K) Since the range [16K, 32K) is not backed by preallocated extent, we have to go COW. - cow_file_range() failed for range [16K, 32K) So cow_file_range() will do the clean up by clearing folio dirty, unlock the folios. Now the folios in range [16K, 32K) is unlocked. - Enter extent_clear_unlock_delalloc() from run_delalloc_nocow() Which is called with PAGE_START_WRITEBACK to start page writeback. But folios can only be marked writeback when it's properly locked, thus this triggered the VM_BUG_ON_FOLIO(). 
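A stripped-down user-space model of that sequence shows why the second cleanup pass blows up: once cow_file_range()'s error path has unlocked its pages, starting writeback on them violates the "page must be locked" rule. The page states and helpers here are made up for illustration, not kernel code:

/* gcc -Wall -o unlocked_writeback unlocked_writeback.c */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define PAGES 8                         /* the [0, 32K) range as 4K pages */

static bool locked[PAGES];

static void start_writeback(int i)
{
	/* stand-in for VM_BUG_ON_FOLIO(!folio_test_locked(folio)) */
	assert(locked[i] && "writeback started on an unlocked page");
	printf("page %d: writeback started\n", i);
}

int main(void)
{
	for (int i = 0; i < PAGES; i++)
		locked[i] = true;       /* the delalloc range was locked */

	/* cow_file_range() failed for [16K, 32K): its cleanup unlocked pages 4-7 */
	for (int i = 4; i < PAGES; i++)
		locked[i] = false;

	/* run_delalloc_nocow()'s error path then tries PAGE_START_WRITEBACK
	 * on the failed COW range, whose pages are already unlocked */
	for (int i = 4; i < PAGES; i++)
		start_writeback(i);     /* aborts immediately, like the VM_BUG_ON */

	return 0;
}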
Furthermore there is another hidden but common bug that run_delalloc_nocow() is not clearing the folio dirty flags in its error handling path. This is the common bug shared between run_delalloc_nocow() and cow_file_range(). [FIX] - Clear folio dirty for range [@start, @cur_offset) Introduce a helper, cleanup_dirty_folios(), which will find and lock the folio in the range, clear the dirty flag and start/end the writeback, with the extra handling for the @locked_folio. - Introduce a helper to clear folio dirty, start and end writeback - Introduce a helper to record the last failed COW range end This is to trace which range we should skip, to avoid double unlocking. - Skip the failed COW range for the error handling CC: stable@vger.kernel.org Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/subpage.h | 13 ++++++++ 2 files changed, 102 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9bb8c447cde1..7aa178e728cf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1954,6 +1954,53 @@ static int can_nocow_file_extent(struct btrfs_path *path, return ret < 0 ? ret : can_nocow; } +/* + * Cleanup the dirty folios which will never be submitted due to error. + * + * When running a delalloc range, we may need to split the ranges (due to + * fragmentation or NOCOW). If we hit an error in the later part, we will error + * out and previously successfully executed range will never be submitted, thus + * we have to cleanup those folios by clearing their dirty flag, starting and + * finishing the writeback. + */ +static void cleanup_dirty_folios(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 start, u64 end, int error) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + u32 len; + + ASSERT(end + 1 - start < U32_MAX); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + len = end + 1 - start; + + /* + * Handle the locked folio first. + * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. + */ + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + + for (pgoff_t index = start_index; index <= end_index; index++) { + struct folio *folio; + + /* Already handled at the beginning. */ + if (index == locked_folio->index) + continue; + folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); + /* Cache already dropped, no need to do any cleanup. */ + if (IS_ERR(folio)) + continue; + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + folio_unlock(folio); + folio_put(folio); + } + mapping_set_error(mapping, error); +} + /* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. @@ -1969,6 +2016,11 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; + /* + * If not 0, represents the inclusive end of the last fallback_to_cow() + * range. Only for error handling. 
+ */ + u64 cow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; @@ -2129,6 +2181,7 @@ must_cow: found_key.offset - 1); cow_start = (u64)-1; if (ret) { + cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } @@ -2202,24 +2255,54 @@ must_cow: cow_start = cur_offset; if (cow_start != (u64)-1) { - cur_offset = end; ret = fallback_to_cow(inode, locked_folio, cow_start, end); cow_start = (u64)-1; - if (ret) + if (ret) { + cow_end = end; goto error; + } } btrfs_free_path(path); return 0; error: + /* + * There are several error cases: + * + * 1) Failed without falling back to COW + * start cur_offset end + * |/////////////| | + * + * For range [start, cur_offset) the folios are already unlocked (except + * @locked_folio), EXTENT_DELALLOC already removed. + * Only need to clear the dirty flag as they will never be submitted. + * Ordered extent and extent maps are handled by + * btrfs_mark_ordered_io_finished() inside run_delalloc_range(). + * + * 2) Failed with error from fallback_to_cow() + * start cur_offset cow_end end + * |/////////////|-----------| | + * + * For range [start, cur_offset) it's the same as case 1). + * But for range [cur_offset, cow_end), the folios have dirty flag + * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range(). + * + * Thus we should not call extent_clear_unlock_delalloc() on range + * [cur_offset, cow_end), as the folios are already unlocked. + * + * So clear the folio dirty flags for [start, cur_offset) first. + */ + if (cur_offset > start) + cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + /* * If an error happened while a COW region is outstanding, cur_offset - * needs to be reset to cow_start to ensure the COW region is unlocked - * as well. + * needs to be reset to @cow_end + 1 to skip the COW range, as + * cow_file_range() will do the proper cleanup at error. */ - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (cow_end) + cur_offset = cow_end + 1; /* * We need to lock the extent here because we're clearing DELALLOC and diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 428fa9389fd4..44fff1f4eac4 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -137,6 +137,19 @@ DECLARE_BTRFS_SUBPAGE_OPS(writeback); DECLARE_BTRFS_SUBPAGE_OPS(ordered); DECLARE_BTRFS_SUBPAGE_OPS(checked); +/* + * Helper for error cleanup, where a folio will have its dirty flag cleared, + * with writeback started and finished. + */ +static inline void btrfs_folio_clamp_finish_io(struct btrfs_fs_info *fs_info, + struct folio *locked_folio, + u64 start, u32 len) +{ + btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len); + btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len); + btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len); +} + bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); -- cgit v1.2.3 From 396294d1afee65a203d6cabd843d0782e5d7388e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 12 Dec 2024 16:44:00 +1030 Subject: btrfs: subpage: fix the bitmap dump of the locked flags We're dumping the locked bitmap into the @checked_bitmap variable, printing incorrect values during debug. Thankfully even during my development I haven't hit a case where I need to dump the locked bitmap. But for the sake of consistency, fix it by dupping the locked bitmap into @locked_bitmap variable for output. 
Fixes: 75258f20fb70 ("btrfs: subpage: dump extra subpage bitmaps for debug") Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 8c68059ac1b0..03d7bfc042e2 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -716,6 +716,7 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long writeback_bitmap; unsigned long ordered_bitmap; unsigned long checked_bitmap; + unsigned long locked_bitmap; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -728,15 +729,16 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, -"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", +"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), sectors_per_page, &uptodate_bitmap, sectors_per_page, &dirty_bitmap, + sectors_per_page, &locked_bitmap, sectors_per_page, &writeback_bitmap, sectors_per_page, &ordered_bitmap, sectors_per_page, &checked_bitmap); -- cgit v1.2.3 From 61d730731b47eeee42ad11fc71e145d269acab8d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 12 Dec 2024 16:44:01 +1030 Subject: btrfs: subpage: dump the involved bitmap when ASSERT() failed For btrfs_folio_assert_not_dirty() and btrfs_folio_set_lock(), we call bitmap_test_range_all_zero() to ensure the involved range has no dirty/lock bit already set. However with my recent enhanced delalloc range error handling, I was hitting the ASSERT() inside btrfs_folio_set_lock(), and it turns out that some error handling path is not properly updating the folio flags. So add some extra dumping for the ASSERTs to dump the involved bitmap to help debug. 
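The patch boils down to a generic "dump the state before asserting on it" pattern. Here is a self-contained C sketch of that idea; the bitmap helpers are simplified stand-ins, not the btrfs subpage API:

/* gcc -Wall -o dump_then_assert dump_then_assert.c */
#include <assert.h>
#include <stdio.h>

#define NBITS 16                        /* sectors per page in this sketch */

static int range_all_zero(unsigned long bits, int start, int len)
{
	unsigned long mask = ((1UL << len) - 1) << start;

	return (bits & mask) == 0;
}

static void dump_bitmap(const char *name, unsigned long bits)
{
	printf("%s bitmap:", name);
	for (int i = 0; i < NBITS; i++)
		if (bits & (1UL << i))
			printf(" %d", i);
	printf("\n");
}

int main(void)
{
	unsigned long locked = 0x00f0;  /* sectors 4-7 unexpectedly locked */
	int start = 6, len = 4;

	/* target range should not yet be locked */
	if (!range_all_zero(locked, start, len)) {
		dump_bitmap("locked", locked);  /* dump first, so the report is useful */
		assert(range_all_zero(locked, start, len));
	}
	return 0;
}

Printing the involved bitmap before the assertion fires is what turns a bare ASSERT() into something actionable when the problem only reproduces under a full test run.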
Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 03d7bfc042e2..722acf768396 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -635,6 +635,28 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, folio_test_checked); +#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ +{ \ + const int sectors_per_page = fs_info->sectors_per_page; \ + \ + ASSERT(sectors_per_page < BITS_PER_LONG); \ + *dst = bitmap_read(subpage->bitmaps, \ + sectors_per_page * btrfs_bitmap_nr_##name, \ + sectors_per_page); \ +} + +#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ +{ \ + const struct btrfs_subpage *subpage = folio_get_private(folio); \ + unsigned long bitmap; \ + \ + GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \ + btrfs_warn(fs_info, \ + "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ + start, len, folio_pos(folio), \ + fs_info->sectors_per_page, &bitmap); \ +} + /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit * is cleared. @@ -660,6 +682,10 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, subpage = folio_get_private(folio); ASSERT(subpage); spin_lock_irqsave(&subpage->lock, flags); + if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + } ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -689,23 +715,16 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, nbits = len >> fs_info->sectorsize_bits; spin_lock_irqsave(&subpage->lock, flags); /* Target range should not yet be locked. */ - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + } bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->nr_locked); ASSERT(ret <= fs_info->sectors_per_page); spin_unlock_irqrestore(&subpage->lock, flags); } -#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ -{ \ - const int sectors_per_page = fs_info->sectors_per_page; \ - \ - ASSERT(sectors_per_page < BITS_PER_LONG); \ - *dst = bitmap_read(subpage->bitmaps, \ - sectors_per_page * btrfs_bitmap_nr_##name, \ - sectors_per_page); \ -} - void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { -- cgit v1.2.3 From 975a6a8855f45729a0fbfe2a8f2df2d3faef2a97 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 12 Dec 2024 16:44:02 +1030 Subject: btrfs: add extra error messages for delalloc range related errors All the error handling bugs I hit so far are all -ENOSPC from either: - cow_file_range() - run_delalloc_nocow() - submit_uncompressed_range() Previously when those functions failed, there was no error message at all, making the debugging much harder. 
So here we introduce extra error messages for: - cow_file_range() - run_delalloc_nocow() - submit_uncompressed_range() - writepage_delalloc() when btrfs_run_delalloc_range() failed - extent_writepage() when extent_writepage_io() failed One example of the new debug error messages is the following one: run fstests generic/750 at 2024-12-08 12:41:41 BTRFS: device fsid 461b25f5-e240-4543-8deb-e7c2bd01a6d3 devid 1 transid 8 /dev/mapper/test-scratch1 (253:4) scanned by mount (2436600) BTRFS info (device dm-4): first mount of filesystem 461b25f5-e240-4543-8deb-e7c2bd01a6d3 BTRFS info (device dm-4): using crc32c (crc32c-arm64) checksum algorithm BTRFS info (device dm-4): forcing free space tree for sector size 4096 with page size 65536 BTRFS info (device dm-4): using free-space-tree BTRFS warning (device dm-4): read-write for sector size 4096 with page size 65536 is experimental BTRFS info (device dm-4): checking UUID tree BTRFS error (device dm-4): cow_file_range failed, root=363 inode=412 start=503808 len=98304: -28 BTRFS error (device dm-4): run_delalloc_nocow failed, root=363 inode=412 start=503808 len=98304: -28 BTRFS error (device dm-4): failed to run delalloc range, root=363 ino=412 folio=458752 submit_bitmap=11-15 start=503808 len=98304: -28 Which shows an error from cow_file_range() which is called inside a nocow write attempt, along with the extra bitmap from writepage_delalloc(). Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 15 +++++++++++++++ fs/btrfs/inode.c | 12 ++++++++++++ 2 files changed, 27 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5014134b9aa2..d9f856358704 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1255,6 +1255,15 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, wbc); if (ret >= 0) last_finished_delalloc_end = found_start + found_len; + if (unlikely(ret < 0)) + btrfs_err_rl(fs_info, +"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d", + btrfs_root_id(inode->root), + btrfs_ino(inode), + folio_pos(folio), + fs_info->sectors_per_page, + &bio_ctrl->submit_bitmap, + found_start, found_len, ret); } else { /* * We've hit an error during previous delalloc range, @@ -1553,6 +1562,12 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl PAGE_SIZE, bio_ctrl, i_size); if (ret == 1) return 0; + if (ret < 0) + btrfs_err_rl(fs_info, +"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", + btrfs_root_id(inode->root), btrfs_ino(inode), + folio_pos(folio), fs_info->sectors_per_page, + &bio_ctrl->submit_bitmap, ret); bio_ctrl->wbc->nr_to_write--; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7aa178e728cf..57bd601cc736 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1132,6 +1132,10 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, if (locked_folio) btrfs_folio_end_lock(inode->root->fs_info, locked_folio, start, async_extent->ram_size); + btrfs_err_rl(inode->root->fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, async_extent->ram_size, ret); } } @@ -1576,6 +1580,10 @@ out_unlock: btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, end - start - cur_alloc_size + 1, NULL); } + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, 
btrfs_root_id(inode->root), + btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); return ret; } @@ -2322,6 +2330,10 @@ error: btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); } btrfs_free_path(path); + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, end + 1 - start, ret); return ret; } -- cgit v1.2.3 From bf50aca633bb5de5901b831bbac0e6b678d61a3f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 12 Dec 2024 16:44:03 +1030 Subject: btrfs: remove the unused locked_folio parameter from btrfs_cleanup_ordered_extents() The function btrfs_cleanup_ordered_extents() is only called in error handling path, and the last caller with a @locked_folio parameter was removed to fix a bug in the btrfs_run_delalloc_range() error handling. There is no need to pass @locked_folio parameter anymore. Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 42 ++---------------------------------------- 1 file changed, 2 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 57bd601cc736..fe2c810335ff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -393,34 +393,13 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - struct folio *locked_folio, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = 0, page_end = 0; struct folio *folio; - if (locked_folio) { - page_start = folio_pos(locked_folio); - page_end = page_start + folio_size(locked_folio) - 1; - } - while (index <= end_index) { - /* - * For locked page, we will call btrfs_mark_ordered_io_finished - * through btrfs_mark_ordered_io_finished() on it - * in run_delalloc_range() for the error handling, which will - * clear page Ordered and run the ordered extent accounting. - * - * Here we can't just clear the Ordered bit, or - * btrfs_mark_ordered_io_finished() would skip the accounting - * for the page range, and the ordered extent will never finish. 
- */ - if (locked_folio && index == (page_start >> PAGE_SHIFT)) { - index++; - continue; - } folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); index++; if (IS_ERR(folio)) @@ -436,23 +415,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, folio_put(folio); } - if (locked_folio) { - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + folio_size(locked_folio)) - return; - /* - * In case this page belongs to the delalloc range being - * instantiated then skip it, since the first page of a range is - * going to be properly cleaned up by the caller of - * run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - folio_pos(locked_folio) - - folio_size(locked_folio); - offset = folio_pos(locked_folio) + folio_size(locked_folio); - } - } - return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } @@ -1128,7 +1090,7 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, NULL, start, end - start + 1); + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); if (locked_folio) btrfs_folio_end_lock(inode->root->fs_info, locked_folio, start, async_extent->ram_size); @@ -2384,7 +2346,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, NULL, start, end - start + 1); + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } -- cgit v1.2.3 From 344af27715ddbf357cf76978d674428b88f8e92d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 13 Jan 2025 09:37:24 +0100 Subject: select: Fix unbalanced user_access_end() While working on implementing user access validation on powerpc I got the following warnings on a pmac32_defconfig build: CC fs/select.o fs/select.o: warning: objtool: sys_pselect6+0x1bc: redundant UACCESS disable fs/select.o: warning: objtool: sys_pselect6_time32+0x1bc: redundant UACCESS disable On powerpc/32s, user_read_access_begin/end() are no-ops, but the failure path has a user_access_end() instead of user_read_access_end() which means an access end without any prior access begin. Replace that user_access_end() by user_read_access_end(). Fixes: 7e71609f64ec ("pselect6() and friends: take handling the combined 6th/7th args into helper") Signed-off-by: Christophe Leroy Link: https://lore.kernel.org/r/a7139e28d767a13e667ee3c79599a8047222ef36.1736751221.git.christophe.leroy@csgroup.eu Signed-off-by: Christian Brauner --- fs/select.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index e223d1fe9d55..7da531b1cf6b 100644 --- a/fs/select.c +++ b/fs/select.c @@ -786,7 +786,7 @@ static inline int get_sigset_argpack(struct sigset_argpack *to, } return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } @@ -1355,7 +1355,7 @@ static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, } return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } -- cgit v1.2.3 From 49fd4e34751e90e6df009b70cd0659dc839e7ca8 Mon Sep 17 00:00:00 2001 From: Zichen Xie Date: Wed, 18 Dec 2024 00:13:12 +0800 Subject: NFS: Fix potential buffer overflowin nfs_sysfs_link_rpc_client() name is char[64] where the size of clnt->cl_program->name remains unknown. 
Invoking strcat() directly will also lead to potential buffer overflow. Change them to strscpy() and strncat() to fix potential issues. Signed-off-by: Zichen Xie Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index bf378ecd5d9f..7b59a40d40c0 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -280,9 +280,9 @@ void nfs_sysfs_link_rpc_client(struct nfs_server *server, char name[RPC_CLIENT_NAME_SIZE]; int ret; - strcpy(name, clnt->cl_program->name); - strcat(name, uniq ? uniq : ""); - strcat(name, "_client"); + strscpy(name, clnt->cl_program->name, sizeof(name)); + strncat(name, uniq ? uniq : "", sizeof(name) - strlen(name) - 1); + strncat(name, "_client", sizeof(name) - strlen(name) - 1); ret = sysfs_create_link_nowarn(&server->kobj, &clnt->cl_sysfs->kobject, name); -- cgit v1.2.3 From e53c568f4603e997426712146dce0bc194c1db12 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Mon, 23 Dec 2024 16:10:43 +0800 Subject: f2fs: add parameter @len to f2fs_invalidate_blocks() New function can process some consecutive blocks at a time. Function f2fs_invalidate_blocks()->down_write() and up_write() are very time-consuming, so if f2fs_invalidate_blocks() can process consecutive blocks at one time, it will save a lot of time. Signed-off-by: Yi Sun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 4 ++-- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/file.c | 8 ++++---- fs/f2fs/node.c | 4 ++-- fs/f2fs/segment.c | 32 +++++++++++++++++++++++++------- 5 files changed, 35 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index be3e6f4d33a2..c5e42eec8ac9 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1380,7 +1380,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, if (blkaddr == COMPRESS_ADDR) fio.compr_blocks++; if (__is_valid_data_blkaddr(blkaddr)) - f2fs_invalidate_blocks(sbi, blkaddr); + f2fs_invalidate_blocks(sbi, blkaddr, 1); f2fs_update_data_blkaddr(&dn, COMPRESS_ADDR); goto unlock_continue; } @@ -1390,7 +1390,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, if (i > cc->valid_nr_cpages) { if (__is_valid_data_blkaddr(blkaddr)) { - f2fs_invalidate_blocks(sbi, blkaddr); + f2fs_invalidate_blocks(sbi, blkaddr, 1); f2fs_update_data_blkaddr(&dn, NEW_ADDR); } goto unlock_continue; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f265c26d0038..4bfe162eefd3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3724,7 +3724,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi); int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); -void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr, + unsigned int len); bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); int f2fs_start_discard_thread(struct f2fs_sb_info *sbi); void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index aa9679b3d8e4..81764b10840b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -652,7 +652,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) valid_blocks++; } - f2fs_invalidate_blocks(sbi, blkaddr); + f2fs_invalidate_blocks(sbi, blkaddr, 1); if (!released || blkaddr != 
COMPRESS_ADDR) nr_free++; @@ -750,7 +750,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) unsigned int i; for (i = 0; i < ei.len; i++) - f2fs_invalidate_blocks(sbi, ei.blk + i); + f2fs_invalidate_blocks(sbi, ei.blk + i, 1); dec_valid_block_count(sbi, inode, ei.len); f2fs_update_time(sbi, REQ_TIME); @@ -1323,7 +1323,7 @@ static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, ret = f2fs_get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); if (ret) { dec_valid_block_count(sbi, inode, 1); - f2fs_invalidate_blocks(sbi, *blkaddr); + f2fs_invalidate_blocks(sbi, *blkaddr, 1); } else { f2fs_update_data_blkaddr(&dn, *blkaddr); } @@ -1575,7 +1575,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, break; } - f2fs_invalidate_blocks(sbi, dn->data_blkaddr); + f2fs_invalidate_blocks(sbi, dn->data_blkaddr, 1); f2fs_set_data_blkaddr(dn, NEW_ADDR); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 00f340c91fcb..f88392fc4ba9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -917,7 +917,7 @@ static int truncate_node(struct dnode_of_data *dn) } /* Deallocate node address */ - f2fs_invalidate_blocks(sbi, ni.blk_addr); + f2fs_invalidate_blocks(sbi, ni.blk_addr, 1); dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); set_node_addr(sbi, &ni, NULL_ADDR, false); @@ -2765,7 +2765,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page) if (err) return err; - f2fs_invalidate_blocks(sbi, ni.blk_addr); + f2fs_invalidate_blocks(sbi, ni.blk_addr, 1); dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 67d2859831e4..813254dcc00e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -245,7 +245,7 @@ retry: if (!__is_valid_data_blkaddr(new_addr)) { if (new_addr == NULL_ADDR) dec_valid_block_count(sbi, inode, 1); - f2fs_invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr, 1); f2fs_update_data_blkaddr(&dn, new_addr); } else { f2fs_replace_block(sbi, &dn, dn.data_blkaddr, @@ -2567,25 +2567,43 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr, + unsigned int len) { unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); + block_t addr_start = addr, addr_end = addr + len - 1; + unsigned int seg_num = GET_SEGNO(sbi, addr_end) - segno + 1; + unsigned int i = 1, max_blocks = sbi->blocks_per_seg, cnt; f2fs_bug_on(sbi, addr == NULL_ADDR); if (addr == NEW_ADDR || addr == COMPRESS_ADDR) return; - f2fs_invalidate_internal_cache(sbi, addr, 1); + f2fs_invalidate_internal_cache(sbi, addr, len); /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); - update_segment_mtime(sbi, addr, 0); - update_sit_entry(sbi, addr, -1); + if (seg_num == 1) + cnt = len; + else + cnt = max_blocks - GET_BLKOFF_FROM_SEG0(sbi, addr); - /* add it into dirty seglist */ - locate_dirty_segment(sbi, segno); + do { + update_segment_mtime(sbi, addr_start, 0); + update_sit_entry(sbi, addr_start, -cnt); + + /* add it into dirty seglist */ + locate_dirty_segment(sbi, segno); + + /* update @addr_start and @cnt and @segno */ + addr_start = START_BLOCK(sbi, ++segno); + if (++i == seg_num) + cnt = GET_BLKOFF_FROM_SEG0(sbi, addr_end) + 1; + else + cnt = max_blocks; + } while (i <= seg_num); 
up_write(&sit_i->sentry_lock); } -- cgit v1.2.3 From c84c2424932d18cf3888adfe9bd16f95dc5d9fd4 Mon Sep 17 00:00:00 2001 From: zangyangyang1 Date: Mon, 13 Jan 2025 11:05:18 +0800 Subject: f2fs: fix using wrong 'submitted' value in f2fs_write_cache_pages When f2fs_write_single_data_page fails, f2fs_write_cache_pages will use the last 'submitted' value incorrectly, which will cause 'nwritten' and 'wbc->nr_to_write' calculation errors Signed-off-by: zangyangyang1 Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 1 + fs/f2fs/data.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index c5e42eec8ac9..985690d81a82 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1551,6 +1551,7 @@ continue_unlock: if (!clear_page_dirty_for_io(cc->rpages[i])) goto continue_unlock; + submitted = 0; ret = f2fs_write_single_data_page(page_folio(cc->rpages[i]), &submitted, NULL, NULL, wbc, io_type, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 35b9455fb899..e9582e06403b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3154,6 +3154,7 @@ continue_unlock: continue; } #endif + submitted = 0; ret = f2fs_write_single_data_page(folio, &submitted, &bio, &last_block, wbc, io_type, 0, true); -- cgit v1.2.3 From e614a6c52d32c9c7ff545ca842eb2de4aeb1d2d9 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 7 Jan 2025 13:18:40 +0000 Subject: bcachefs: make directory i_size meaningful The isize of directory is 0 in bcachefs if the directory is empty. With more child dirents created, its size ought to change. Many other filesystems changed as that (ie. xfs and btrfs). And many of them changed as the size of child dirent name. Although the directory size may not seem to convey much, we can still give it some meaning. 
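As a rough standalone sketch of the per-dirent footprint the directory size is built from (illustrative only; the helper name here is hypothetical and the constants assume the bkey/dirent layout given by the formula below):

static unsigned int dirent_footprint(unsigned int namelen)
{
	/* fixed key header plus the name area, rounded up to 8-byte units */
	return 40 + round_up(9 + namelen, 8);
}

For a 3-character name this yields 40 + round_up(12, 8) = 56 bytes, which is the amount the parent directory's i_size grows by on create/link and shrinks by on unlink.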
The formula of dentry size as follow: occupied_size = 40 + ALIGN(9 + namelen, 8) Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.h | 5 +++++ fs/bcachefs/fs-common.c | 11 +++++++++++ fs/bcachefs/fs.c | 13 ++++++++----- 3 files changed, 24 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 362b3b2f2f2e..a633f83c1ac7 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -31,6 +31,11 @@ static inline unsigned dirent_val_u64s(unsigned len) sizeof(u64)); } +static inline unsigned int dirent_occupied_size(const struct qstr *name) +{ + return (BKEY_U64s + dirent_val_u64s(name->len)) * sizeof(u64); +} + int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 2c3d46ac70c6..d70d9f634cea 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -152,6 +152,7 @@ int bch2_create_trans(struct btree_trans *trans, if (is_subdir_for_nlink(new_inode)) dir_u->bi_nlink++; dir_u->bi_mtime = dir_u->bi_ctime = now; + dir_u->bi_size += dirent_occupied_size(name); ret = bch2_inode_write(trans, &dir_iter, dir_u); if (ret) @@ -220,6 +221,7 @@ int bch2_link_trans(struct btree_trans *trans, } dir_u->bi_mtime = dir_u->bi_ctime = now; + dir_u->bi_size += dirent_occupied_size(name); dir_hash = bch2_hash_info_init(c, dir_u); @@ -322,6 +324,7 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); + dir_u->bi_size -= dirent_occupied_size(name); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash, &dirent_iter, @@ -460,6 +463,14 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } + if (mode == BCH_RENAME) { + src_dir_u->bi_size -= dirent_occupied_size(src_name); + dst_dir_u->bi_size += dirent_occupied_size(dst_name); + } + + if (mode == BCH_RENAME_OVERWRITE) + src_dir_u->bi_size -= dirent_occupied_size(src_name); + if (src_inode_u->bi_parent_subvol) src_inode_u->bi_parent_subvol = dst_dir.subvol; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 3f83f131d0e8..90ade8f648d9 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -67,6 +67,9 @@ void bch2_inode_update_after_write(struct btree_trans *trans, i_gid_write(&inode->v, bi->bi_gid); inode->v.i_mode = bi->bi_mode; + if (fields & ATTR_SIZE) + i_size_write(&inode->v, bi->bi_size); + if (fields & ATTR_ATIME) inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); if (fields & ATTR_MTIME) @@ -582,7 +585,7 @@ err_before_quota: if (!(flags & BCH_CREATE_TMPFILE)) { bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME); + ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); mutex_unlock(&dir->ei_update_lock); } @@ -739,7 +742,7 @@ static int __bch2_link(struct bch_fs *c, if (likely(!ret)) { bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME); + ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); } @@ -792,7 +795,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, goto err; bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME); + ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_MTIME); @@ -970,11 +973,11 @@ err_tx_restart: dst_inode->v.i_ino != dst_inode_u.bi_inum); bch2_inode_update_after_write(trans, src_dir, &src_dir_u, - ATTR_MTIME|ATTR_CTIME); + 
ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); if (src_dir != dst_dir) bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, - ATTR_MTIME|ATTR_CTIME); + ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); bch2_inode_update_after_write(trans, src_inode, &src_inode_u, ATTR_CTIME); -- cgit v1.2.3 From c72deb03ff0426596c69c294afa1572d2748c589 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 7 Jan 2025 13:18:41 +0000 Subject: bcachefs: bcachefs_metadata_version_directory_size This adds another metadata version for accounting directory size. For the new version of the filesystem, when new subdirectory items are created or deleted, the parent directory's size will change accordingly. For the old version of the existed file system, running fsck will automatically upgrade the metadata version, and it will do the check and recalculationg of the directory size. Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/fsck.c | 41 +++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/sb-downgrade.c | 5 ++++- fs/bcachefs/sb-errors_format.h | 3 ++- 4 files changed, 49 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0680930508a3..f70f0108401f 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -685,7 +685,8 @@ struct bch_sb_field_ext { x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ x(inode_depth, BCH_VERSION(1, 17)) \ x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ - x(autofix_errors, BCH_VERSION(1, 19)) + x(autofix_errors, BCH_VERSION(1, 19)) \ + x(directory_size, BCH_VERSION(1, 20)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 3917d75f3c98..8fcf7c8e5ede 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1116,6 +1116,37 @@ err: return ret; } +static int check_directory_size(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + struct bkey_s_c inode_k, bool *write_inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 new_size = 0; + int ret; + + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, + SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot), + POS(inode_k.k->p.offset, U64_MAX), + 0, k, ret) { + if (k.k->type != KEY_TYPE_dirent) + continue; + + struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k); + struct qstr name = bch2_dirent_get_name(dirent); + + new_size += dirent_occupied_size(&name); + } + bch2_trans_iter_exit(trans, &iter); + + if (!ret && inode_u->bi_size != new_size) { + inode_u->bi_size = new_size; + *write_inode = true; + } + + return ret; +} + static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1304,6 +1335,16 @@ static int check_inode(struct btree_trans *trans, u.bi_journal_seq = journal_cur_seq(&c->journal); do_update = true; } + + if (S_ISDIR(u.bi_mode)) { + ret = check_directory_size(trans, &u, k, &do_update); + + fsck_err_on(ret, + trans, directory_size_mismatch, + "directory inode %llu:%u with the mismatch directory size", + u.bi_inum, k.k->p.snapshot); + ret = 0; + } do_update: if (do_update) { ret = __bch2_fsck_write_inode(trans, &u); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 051214fdc735..14f6b6a5fb38 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -90,7 +90,10 @@ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BCH_FSCK_ERR_accounting_mismatch, \ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - 
BCH_FSCK_ERR_accounting_key_junk_at_end) + BCH_FSCK_ERR_accounting_key_junk_at_end) \ + x(directory_size, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BCH_FSCK_ERR_directory_size_mismatch) \ #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 80b6d589808b..0b4fe899209b 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -313,7 +313,8 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 303, 0) + x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ + x(MAX, 304, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, -- cgit v1.2.3 From e32dcdb0af9f31aab05e20f950c2871378082569 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Fri, 10 Jan 2025 17:23:52 +0000 Subject: btrfs: add io_uring interface for encoded writes Add an io_uring interface for encoded writes, with the same parameters as the BTRFS_IOC_ENCODED_WRITE ioctl. As with the encoded reads code, there's a test program for this at https://github.com/maharmstone/io_uring-encoded, and I'll get this worked into an fstest. How io_uring works is that it initially calls btrfs_uring_cmd with the IO_URING_F_NONBLOCK flag set, and if we return -EAGAIN it tries again in a kthread with the flag cleared. Ideally we'd honour this and call try_lock etc., but there's still a lot of work to be done to create non-blocking versions of all the functions in our write path. Instead, just validate the input in btrfs_uring_encoded_write() on the first pass and return -EAGAIN, with a view to properly optimizing the optimistic path later on. Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 69c0444369b7..ae98269a5e3a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4936,6 +4936,128 @@ out_acct: return ret; } +static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + loff_t pos; + struct kiocb kiocb; + struct file *file; + ssize_t ret; + void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + file = cmd->file; + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); + + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EBADF; + goto out_acct; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; + goto out_acct; + } + + io_uring_cmd_get_async_data(cmd)->op_data = data; + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, sqe_addr, sizeof(args32))) { + ret = -EFAULT; + goto out_acct; + } + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; + data->args.len = args32.len; + data->args.unencoded_len = args32.unencoded_len; + data->args.unencoded_offset = args32.unencoded_offset; + data->args.compression = args32.compression; + data->args.encryption = args32.encryption; + memcpy(data->args.reserved, args32.reserved, + sizeof(data->args.reserved)); +#else + ret = -ENOTTY; + goto 
out_acct; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, sizeof(data->args))) { + ret = -EFAULT; + goto out_acct; + } + } + + ret = -EINVAL; + if (data->args.flags != 0) + goto out_acct; + if (memchr_inv(data->args.reserved, 0, sizeof(data->args.reserved))) + goto out_acct; + if (data->args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && + data->args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) + goto out_acct; + if (data->args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || + data->args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) + goto out_acct; + if (data->args.unencoded_offset > data->args.unencoded_len) + goto out_acct; + if (data->args.len > data->args.unencoded_len - data->args.unencoded_offset) + goto out_acct; + + data->iov = data->iovstack; + ret = import_iovec(ITER_SOURCE, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_iov; + } + } + + if (issue_flags & IO_URING_F_NONBLOCK) { + ret = -EAGAIN; + goto out_acct; + } + + pos = data->args.offset; + ret = rw_verify_area(WRITE, file, &pos, data->args.len); + if (ret < 0) + goto out_iov; + + init_sync_kiocb(&kiocb, file); + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); + if (ret) + goto out_iov; + kiocb.ki_pos = pos; + + file_start_write(file); + + ret = btrfs_do_write_iter(&kiocb, &data->iter, &data->args); + if (ret > 0) + fsnotify_modify(file); + + file_end_write(file); +out_iov: + kfree(data->iov); +out_acct: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { switch (cmd->cmd_op) { @@ -4944,6 +5066,12 @@ int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) case BTRFS_IOC_ENCODED_READ_32: #endif return btrfs_uring_encoded_read(cmd, issue_flags); + + case BTRFS_IOC_ENCODED_WRITE: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_WRITE_32: +#endif + return btrfs_uring_encoded_write(cmd, issue_flags); } return -EINVAL; -- cgit v1.2.3 From fe4de594f7a2e9bc49407de60fbd20809fad4192 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 8 Jan 2025 14:14:04 +1030 Subject: btrfs: add the missing error handling inside get_canonical_dev_path Inside function get_canonical_dev_path(), we call d_path() to get the final device path. But d_path() can return error, and in that case the next strscpy() call will trigger an invalid memory access. Add back the missing error handling for d_path(). 
Reported-by: Boris Burkov Fixes: 7e06de7c83a7 ("btrfs: canonicalize the device path before adding it") Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1cccaf9c2b0d..3d0ac8bdb21f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -797,6 +797,10 @@ static int get_canonical_dev_path(const char *dev_path, char *canonical) if (ret) goto out; resolved_path = d_path(&path, path_buf, PATH_MAX); + if (IS_ERR(resolved_path)) { + ret = PTR_ERR(resolved_path); + goto out; + } ret = strscpy(canonical, resolved_path, PATH_MAX); out: kfree(path_buf); -- cgit v1.2.3 From 7a5714991872f0a4805cc6004a5bff19a71d0459 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Dec 2024 18:05:10 +0000 Subject: mm: abstract get_arg_page() stack expansion and mmap read lock Right now fs/exec.c invokes expand_downwards(), an otherwise internal implementation detail of the VMA logic in order to ensure that an arg page can be obtained by get_user_pages_remote(). In order to be able to move the stack expansion logic into mm/vma.c to make it available to userland testing we need to find an alternative approach here. We do so by providing the mmap_read_lock_maybe_expand() function which also helpfully documents what get_arg_page() is doing here and adds an additional check against VM_GROWSDOWN to make explicit that the stack expansion logic is only invoked when the VMA is indeed a downward-growing stack. This allows expand_downwards() to become a static function. Importantly, the VMA referenced by mmap_read_maybe_expand() must NOT be currently user-visible in any way, that is place within an rmap or VMA tree. It must be a newly allocated VMA. This is the case when exec invokes this function. Link: https://lkml.kernel.org/r/5295d1c70c58e6aa63d14be68d4e1de9fa1c8e6d.1733248985.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Al Viro Cc: Christian Brauner Cc: Eric W. Biederman Cc: Jan Kara Cc: Jann Horn Cc: Kees Cook Cc: Liam R. Howlett Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/exec.c | 14 +++----------- include/linux/mm.h | 5 ++--- mm/mmap.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 58 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 98cb7ba9983c..1e1f79c514de 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -205,18 +205,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, /* * Avoid relying on expanding the stack down in GUP (which * does not work for STACK_GROWSUP anyway), and just do it - * by hand ahead of time. + * ahead of time. */ - if (write && pos < vma->vm_start) { - mmap_write_lock(mm); - ret = expand_downwards(vma, pos); - if (unlikely(ret < 0)) { - mmap_write_unlock(mm); - return NULL; - } - mmap_write_downgrade(mm); - } else - mmap_read_lock(mm); + if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) + return NULL; /* * We are doing an exec(). 
'current' is the process diff --git a/include/linux/mm.h b/include/linux/mm.h index b1c3db9cf355..2e5ef71b8629 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3324,6 +3324,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, @@ -3437,9 +3439,6 @@ extern unsigned long stack_guard_gap; int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); -/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address); - /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, diff --git a/mm/mmap.c b/mm/mmap.c index 7aa372a75326..b29728df7f10 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1009,7 +1009,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) * vma is the first one with address < vma->vm_start. Have to extend vma. * mmap_lock held for writing. */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address) +static int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; @@ -1940,3 +1940,55 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) /* Shrink the vma to just the new range */ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } + +#ifdef CONFIG_MMU +/* + * Obtain a read lock on mm->mmap_lock, if the specified address is below the + * start of the VMA, the intent is to perform a write, and it is a + * downward-growing stack, then attempt to expand the stack to contain it. + * + * This function is intended only for obtaining an argument page from an ELF + * image, and is almost certainly NOT what you want to use for any other + * purpose. + * + * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the + * VMA referenced must not be linked in any user-visible tree, i.e. it must be a + * new VMA being mapped. + * + * The function assumes that addr is either contained within the VMA or below + * it, and makes no attempt to validate this value beyond that. + * + * Returns true if the read lock was obtained and a stack was perhaps expanded, + * false if the stack expansion failed. + * + * On stack expansion the function temporarily acquires an mmap write lock + * before downgrading it. 
+ */ +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, + struct vm_area_struct *new_vma, + unsigned long addr, bool write) +{ + if (!write || addr >= new_vma->vm_start) { + mmap_read_lock(mm); + return true; + } + + if (!(new_vma->vm_flags & VM_GROWSDOWN)) + return false; + + mmap_write_lock(mm); + if (expand_downwards(new_vma, addr)) { + mmap_write_unlock(mm); + return false; + } + + mmap_write_downgrade(mm); + return true; +} +#else +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write) +{ + return false; +} +#endif -- cgit v1.2.3 From fa00b8ef1803fe133b4897c25227aa0d298dd093 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 6 Dec 2024 21:28:46 +0000 Subject: mm: perform all memfd seal checks in a single place We no longer actually need to perform these checks in the f_op->mmap() hook any longer. We already moved the operation which clears VM_MAYWRITE on a read-only mapping of a write-sealed memfd in order to work around the restrictions imposed by commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour"). There is no reason for us not to simply go ahead and additionally check to see if any pre-existing seals are in place here rather than defer this to the f_op->mmap() hook. By doing this we remove more logic from shmem_mmap() which doesn't belong there, as well as doing the same for hugetlbfs_file_mmap(). We also remove dubious shared logic in mm.h which simply does not belong there either. It makes sense to do these checks at the earliest opportunity, we know these are shmem (or hugetlbfs) mappings whose relevant VMA flags will not change from the invoking do_mmap() so there is simply no need to wait. This also means the implementation of further memfd seal flags can be done within mm/memfd.c and also have the opportunity to modify VMA flags as necessary early in the mapping logic. [lorenzo.stoakes@oracle.com: fix typos in !memfd inline stub] Link: https://lkml.kernel.org/r/7dee6c5d-480b-4c24-b98e-6fa47dbd8a23@lucifer.local Link: https://lkml.kernel.org/r/20241206212846.210835-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: Isaac J. Manjarres Cc: Hugh Dickins Cc: Jann Horn Cc: Kalesh Singh Cc: Liam R. Howlett Cc: Muchun Song Cc: Vlastimil Babka Cc: Jeff Xu Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 5 ----- include/linux/memfd.h | 23 +++++++++++---------- include/linux/mm.h | 55 --------------------------------------------------- mm/memfd.c | 44 ++++++++++++++++++++++++++++++++++++++++- mm/mmap.c | 12 ++++++++--- mm/shmem.c | 6 ------ 6 files changed, 63 insertions(+), 82 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index fc1ae5132127..62fb0cbc93ab 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -99,7 +99,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); @@ -116,10 +115,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; - ret = seal_check_write(info->seals, vma); - if (ret) - return ret; - /* * page based offset in vm_pgoff could be sufficiently large to * overflow a loff_t when converted to byte offset. 
This can diff --git a/include/linux/memfd.h b/include/linux/memfd.h index d437e3070850..246daadbfde8 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,7 +7,14 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); -unsigned int *memfd_file_seals_ptr(struct file *file); +/* + * Check for any existing seals on mmap, return an error if access is denied due + * to sealing, or 0 otherwise. + * + * We also update VMA flags if appropriate by manipulating the VMA flags pointed + * to by vm_flags_ptr. + */ +int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -17,19 +24,11 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } - -static inline unsigned int *memfd_file_seals_ptr(struct file *file) +static inline int memfd_check_seals_mmap(struct file *file, + unsigned long *vm_flags_ptr) { - return NULL; + return 0; } #endif -/* Retrieve memfd seals associated with the file, if any. */ -static inline unsigned int memfd_file_seals(struct file *file) -{ - unsigned int *sealsp = memfd_file_seals_ptr(file); - - return sealsp ? *sealsp : 0; -} - #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index a3a50c37603e..e7c54b9aac6d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4102,61 +4102,6 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif -static inline bool is_write_sealed(int seals) -{ - return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); -} - -/** - * is_readonly_sealed - Checks whether write-sealed but mapped read-only, - * in which case writes should be disallowing moving - * forwards. - * @seals: the seals to check - * @vm_flags: the VMA flags to check - * - * Returns whether readonly sealed, in which case writess should be disallowed - * going forward. - */ -static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) -{ - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (is_write_sealed(seals) && - ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) - return true; - - return false; -} - -/** - * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and - * handle them. - * @seals: the seals to check - * @vma: the vma to operate on - * - * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper - * check/handling on the vma flags. Return 0 if check pass, or <0 for errors. - */ -static inline int seal_check_write(int seals, struct vm_area_struct *vma) -{ - if (!is_write_sealed(seals)) - return 0; - - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. 
- */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - return 0; -} - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, diff --git a/mm/memfd.c b/mm/memfd.c index 35a370d75c9a..5f5a23c9051d 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) return error; } -unsigned int *memfd_file_seals_ptr(struct file *file) +static unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; @@ -327,6 +327,48 @@ static int check_sysctl_memfd_noexec(unsigned int *flags) return 0; } +static inline bool is_write_sealed(unsigned int seals) +{ + return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); +} + +static int check_write_seal(unsigned long *vm_flags_ptr) +{ + unsigned long vm_flags = *vm_flags_ptr; + unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE); + + /* If a private matting then writability is irrelevant. */ + if (!(mask & VM_SHARED)) + return 0; + + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * write seals are active. + */ + if (mask & VM_WRITE) + return -EPERM; + + /* + * This is a read-only mapping, disallow mprotect() from making a + * write-sealed mapping writable in future. + */ + *vm_flags_ptr &= ~VM_MAYWRITE; + + return 0; +} + +int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr) +{ + int err = 0; + unsigned int *seals_ptr = memfd_file_seals_ptr(file); + unsigned int seals = seals_ptr ? *seals_ptr : 0; + + if (is_write_sealed(seals)) + err = check_write_seal(vm_flags_ptr); + + return err; +} + SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) diff --git a/mm/mmap.c b/mm/mmap.c index b373486bd1c6..df9154b15ef9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -368,8 +368,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); - unsigned int seals = memfd_file_seals(file); unsigned long flags_mask; + int err; if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; @@ -409,8 +409,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); - else if (is_readonly_sealed(seals, vm_flags)) - vm_flags &= ~VM_MAYWRITE; fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) @@ -430,6 +428,14 @@ unsigned long do_mmap(struct file *file, unsigned long addr, default: return -EINVAL; } + + /* + * Check to see if we are violating any seals and update VMA + * flags if necessary to avoid future seal violations. 
+ */ + err = memfd_check_seals_mmap(file, &vm_flags); + if (err) + return (unsigned long)err; } else { switch (flags & MAP_TYPE) { case MAP_SHARED: diff --git a/mm/shmem.c b/mm/shmem.c index 0892889744a2..95b80c24f6f9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2816,12 +2816,6 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - struct shmem_inode_info *info = SHMEM_I(inode); - int ret; - - ret = seal_check_write(info->seals, vma); - if (ret) - return ret; file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ -- cgit v1.2.3 From cbd6883ed8662073031a32f1294cdf53c8ec24a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:05 +0100 Subject: xfs: fix a double completion for buffers on in-memory targets __xfs_buf_submit calls xfs_buf_ioend when b_io_remaining hits zero. For in-memory buftargs b_io_remaining is never incremented from it's initial value of 1, so this always happens. Thus the extra call to xfs_buf_ioend in _xfs_buf_ioapply causes a double completion. Fortunately __xfs_buf_submit is only used for synchronous reads on in-memory buftargs due to the peculiarities of how they work, so this is mostly harmless and just causes a little extra work to be done. Fixes: 5076a6040ca1 ("xfs: support in-memory buffer cache targets") Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 6f313fbf7669..da38c18acbba 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1657,10 +1657,8 @@ _xfs_buf_ioapply( op |= REQ_META; /* in-memory targets are directly mapped, no IO required. */ - if (xfs_buftarg_is_mem(bp->b_target)) { - xfs_buf_ioend(bp); + if (xfs_buftarg_is_mem(bp->b_target)) return; - } /* * Walk all the vectors issuing IO on them. Set up the initial offset -- cgit v1.2.3 From 83e9c69dcf18bff12ed205423b91e1e1ae316998 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:06 +0100 Subject: xfs: remove the incorrect comment above xfs_buf_free_maps The comment above xfs_buf_free_maps talks about fields not even used in the function and also doesn't add any other value. Remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index da38c18acbba..5702cad9ccc9 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -203,9 +203,6 @@ xfs_buf_get_maps( return 0; } -/* - * Frees b_pages if it was allocated. - */ static void xfs_buf_free_maps( struct xfs_buf *bp) -- cgit v1.2.3 From 411ff3f7386a93d6170dbc067e3965ad472f11c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:07 +0100 Subject: xfs: remove the incorrect comment about the b_pag field The rbtree root is long gone. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 3d56bc7a35cc..da80399c7457 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -187,7 +187,7 @@ struct xfs_buf { int b_io_error; /* internal IO error state */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; - struct xfs_perag *b_pag; /* contains rbtree root */ + struct xfs_perag *b_pag; struct xfs_mount *b_mount; struct xfs_buftarg *b_target; /* buffer target (device) */ void *b_addr; /* virtual address of buffer */ -- cgit v1.2.3 From 05b5968f33a9fccabc5cb6672afd3ce2367db99b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:08 +0100 Subject: xfs: move xfs_buf_iowait out of (__)xfs_buf_submit There is no good reason to pass a bool argument to wait for a buffer when the callers that want that can easily just wait themselves. This means the wait moves out of the extra hold of the buffer, but as the callers of synchronous buffer I/O need to hold a reference anyway that is perfectly fine. Because all async buffer submitters ignore the error return value, and the synchronous ones catch the error condition through b_error and xfs_buf_iowait this also means the new xfs_buf_submit doesn't have to return an error code. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 42 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 5702cad9ccc9..5abada2b4a4a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -53,14 +53,8 @@ struct kmem_cache *xfs_buf_cache; * b_lock (trylock due to inversion) */ -static int __xfs_buf_submit(struct xfs_buf *bp, bool wait); - -static inline int -xfs_buf_submit( - struct xfs_buf *bp) -{ - return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC)); -} +static void xfs_buf_submit(struct xfs_buf *bp); +static int xfs_buf_iowait(struct xfs_buf *bp); static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) { @@ -804,7 +798,10 @@ _xfs_buf_read( bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); - return xfs_buf_submit(bp); + xfs_buf_submit(bp); + if (flags & XBF_ASYNC) + return 0; + return xfs_buf_iowait(bp); } /* @@ -980,8 +977,8 @@ xfs_buf_read_uncached( bp->b_ops = ops; xfs_buf_submit(bp); - if (bp->b_error) { - error = bp->b_error; + error = xfs_buf_iowait(bp); + if (error) { xfs_buf_relse(bp); return error; } @@ -1483,7 +1480,8 @@ xfs_bwrite( bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_DONE); - error = xfs_buf_submit(bp); + xfs_buf_submit(bp); + error = xfs_buf_iowait(bp); if (error) xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); return error; @@ -1698,13 +1696,10 @@ xfs_buf_iowait( * safe to reference the buffer after a call to this function unless the caller * holds an additional reference itself. */ -static int -__xfs_buf_submit( - struct xfs_buf *bp, - bool wait) +static void +xfs_buf_submit( + struct xfs_buf *bp) { - int error = 0; - trace_xfs_buf_submit(bp, _RET_IP_); ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); @@ -1724,10 +1719,9 @@ __xfs_buf_submit( * state here rather than mount state to avoid corrupting the log tail * on shutdown. 
*/ - if (bp->b_mount->m_log && - xlog_is_shutdown(bp->b_mount->m_log)) { + if (bp->b_mount->m_log && xlog_is_shutdown(bp->b_mount->m_log)) { xfs_buf_ioend_fail(bp); - return -EIO; + return; } /* @@ -1765,16 +1759,12 @@ __xfs_buf_submit( xfs_buf_ioend_async(bp); } - if (wait) - error = xfs_buf_iowait(bp); - /* * Release the hold that keeps the buffer referenced for the entire * I/O. Note that if the buffer is async, it is not safe to reference * after this release. */ xfs_buf_rele(bp); - return error; } void * @@ -2323,7 +2313,7 @@ xfs_buf_delwri_submit_buffers( bp->b_flags |= XBF_ASYNC; xfs_buf_list_del(bp); } - __xfs_buf_submit(bp, false); + xfs_buf_submit(bp); } blk_finish_plug(&plug); -- cgit v1.2.3 From eb43b0b5cab885a9a76f5edb57020ad03eaf82b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:09 +0100 Subject: xfs: simplify xfs_buf_delwri_pushbuf xfs_buf_delwri_pushbuf synchronously writes a buffer that is on a delwri list already. Instead of doing a complicated dance with the delwri and wait list, just leave them alone and open code the actual buffer write. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 5abada2b4a4a..fba494b9b9da 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2384,14 +2384,9 @@ xfs_buf_delwri_submit( * Push a single buffer on a delwri queue. * * The purpose of this function is to submit a single buffer of a delwri queue - * and return with the buffer still on the original queue. The waiting delwri - * buffer submission infrastructure guarantees transfer of the delwri queue - * buffer reference to a temporary wait list. We reuse this infrastructure to - * transfer the buffer back to the original queue. + * and return with the buffer still on the original queue. * - * Note the buffer transitions from the queued state, to the submitted and wait - * listed state and back to the queued state during this call. The buffer - * locking and queue management logic between _delwri_pushbuf() and + * The buffer locking and queue management logic between _delwri_pushbuf() and * _delwri_queue() guarantee that the buffer cannot be queued to another list * before returning. */ @@ -2400,33 +2395,21 @@ xfs_buf_delwri_pushbuf( struct xfs_buf *bp, struct list_head *buffer_list) { - LIST_HEAD (submit_list); int error; ASSERT(bp->b_flags & _XBF_DELWRI_Q); trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); - /* - * Isolate the buffer to a new local list so we can submit it for I/O - * independently from the rest of the original list. - */ xfs_buf_lock(bp); - list_move(&bp->b_list, &submit_list); - xfs_buf_unlock(bp); - - /* - * Delwri submission clears the DELWRI_Q buffer flag and returns with - * the buffer on the wait list with the original reference. Rather than - * bounce the buffer from a local wait list back to the original list - * after I/O completion, reuse the original list as the wait list. - */ - xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); + bp->b_flags |= XBF_WRITE; + xfs_buf_submit(bp); /* - * The buffer is now locked, under I/O and wait listed on the original - * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and - * return with the buffer unlocked and on the original queue. 
+ * The buffer is now locked, under I/O but still on the original delwri + * queue. Wait for I/O completion, restore the DELWRI_Q flag and + * return with the buffer unlocked and still on the original queue. */ error = xfs_buf_iowait(bp); bp->b_flags |= _XBF_DELWRI_Q; -- cgit v1.2.3 From 72842dbc2b81c4a43203b47b1d4c1ec2aa508020 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:10 +0100 Subject: xfs: remove xfs_buf_delwri_submit_buffers xfs_buf_delwri_submit_buffers has two callers for synchronous and asynchronous writes that share very little logic. Split out a helper for the shared per-buffer loop and otherwise open code the submission in the two callers. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 121 +++++++++++++++++++++++++------------------------------ 1 file changed, 55 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index fba494b9b9da..37318201db9a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2252,72 +2252,26 @@ xfs_buf_cmp( return 0; } -/* - * Submit buffers for write. If wait_list is specified, the buffers are - * submitted using sync I/O and placed on the wait list such that the caller can - * iowait each buffer. Otherwise async I/O is used and the buffers are released - * at I/O completion time. In either case, buffers remain locked until I/O - * completes and the buffer is released from the queue. - */ -static int -xfs_buf_delwri_submit_buffers( - struct list_head *buffer_list, - struct list_head *wait_list) +static bool +xfs_buf_delwri_submit_prep( + struct xfs_buf *bp) { - struct xfs_buf *bp, *n; - int pinned = 0; - struct blk_plug plug; - - list_sort(NULL, buffer_list, xfs_buf_cmp); - - blk_start_plug(&plug); - list_for_each_entry_safe(bp, n, buffer_list, b_list) { - if (!wait_list) { - if (!xfs_buf_trylock(bp)) - continue; - if (xfs_buf_ispinned(bp)) { - xfs_buf_unlock(bp); - pinned++; - continue; - } - } else { - xfs_buf_lock(bp); - } - - /* - * Someone else might have written the buffer synchronously or - * marked it stale in the meantime. In that case only the - * _XBF_DELWRI_Q flag got cleared, and we have to drop the - * reference and remove it from the list here. - */ - if (!(bp->b_flags & _XBF_DELWRI_Q)) { - xfs_buf_list_del(bp); - xfs_buf_relse(bp); - continue; - } - - trace_xfs_buf_delwri_split(bp, _RET_IP_); - - /* - * If we have a wait list, each buffer (and associated delwri - * queue reference) transfers to it and is submitted - * synchronously. Otherwise, drop the buffer from the delwri - * queue and submit async. - */ - bp->b_flags &= ~_XBF_DELWRI_Q; - bp->b_flags |= XBF_WRITE; - if (wait_list) { - bp->b_flags &= ~XBF_ASYNC; - list_move_tail(&bp->b_list, wait_list); - } else { - bp->b_flags |= XBF_ASYNC; - xfs_buf_list_del(bp); - } - xfs_buf_submit(bp); + /* + * Someone else might have written the buffer synchronously or marked it + * stale in the meantime. In that case only the _XBF_DELWRI_Q flag got + * cleared, and we have to drop the reference and remove it from the + * list here. 
+ */ + if (!(bp->b_flags & _XBF_DELWRI_Q)) { + xfs_buf_list_del(bp); + xfs_buf_relse(bp); + return false; } - blk_finish_plug(&plug); - return pinned; + trace_xfs_buf_delwri_split(bp, _RET_IP_); + bp->b_flags &= ~_XBF_DELWRI_Q; + bp->b_flags |= XBF_WRITE; + return true; } /* @@ -2340,7 +2294,30 @@ int xfs_buf_delwri_submit_nowait( struct list_head *buffer_list) { - return xfs_buf_delwri_submit_buffers(buffer_list, NULL); + struct xfs_buf *bp, *n; + int pinned = 0; + struct blk_plug plug; + + list_sort(NULL, buffer_list, xfs_buf_cmp); + + blk_start_plug(&plug); + list_for_each_entry_safe(bp, n, buffer_list, b_list) { + if (!xfs_buf_trylock(bp)) + continue; + if (xfs_buf_ispinned(bp)) { + xfs_buf_unlock(bp); + pinned++; + continue; + } + if (!xfs_buf_delwri_submit_prep(bp)) + continue; + bp->b_flags |= XBF_ASYNC; + xfs_buf_list_del(bp); + xfs_buf_submit(bp); + } + blk_finish_plug(&plug); + + return pinned; } /* @@ -2357,9 +2334,21 @@ xfs_buf_delwri_submit( { LIST_HEAD (wait_list); int error = 0, error2; - struct xfs_buf *bp; + struct xfs_buf *bp, *n; + struct blk_plug plug; - xfs_buf_delwri_submit_buffers(buffer_list, &wait_list); + list_sort(NULL, buffer_list, xfs_buf_cmp); + + blk_start_plug(&plug); + list_for_each_entry_safe(bp, n, buffer_list, b_list) { + xfs_buf_lock(bp); + if (!xfs_buf_delwri_submit_prep(bp)) + continue; + bp->b_flags &= ~XBF_ASYNC; + list_move_tail(&bp->b_list, &wait_list); + xfs_buf_submit(bp); + } + blk_finish_plug(&plug); /* Wait for IO to complete. */ while (!list_empty(&wait_list)) { -- cgit v1.2.3 From 0195647abaac92f5dbd2799f64d19f316fd97b7a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:11 +0100 Subject: xfs: move write verification out of _xfs_buf_ioapply Split the write verification logic out of _xfs_buf_ioapply into a new xfs_buf_verify_write helper called by xfs_buf_submit given that it isn't about applying the I/O and doesn't really fit in with the rest of _xfs_buf_ioapply. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 67 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 37318201db9a..02df4fde35b5 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1612,36 +1612,6 @@ _xfs_buf_ioapply( if (bp->b_flags & XBF_WRITE) { op = REQ_OP_WRITE; - - /* - * Run the write verifier callback function if it exists. If - * this function fails it will mark the buffer with an error and - * the IO should not be dispatched. - */ - if (bp->b_ops) { - bp->b_ops->verify_write(bp); - if (bp->b_error) { - xfs_force_shutdown(bp->b_mount, - SHUTDOWN_CORRUPT_INCORE); - return; - } - } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { - struct xfs_mount *mp = bp->b_mount; - - /* - * non-crc filesystems don't attach verifiers during - * log recovery, so don't warn for such filesystems. - */ - if (xfs_has_crc(mp)) { - xfs_warn(mp, - "%s: no buf ops on daddr 0x%llx len %d", - __func__, xfs_buf_daddr(bp), - bp->b_length); - xfs_hex_dump(bp->b_addr, - XFS_CORRUPTION_DUMP_LEN); - dump_stack(); - } - } } else { op = REQ_OP_READ; if (bp->b_flags & XBF_READ_AHEAD) @@ -1690,6 +1660,36 @@ xfs_buf_iowait( return bp->b_error; } +/* + * Run the write verifier callback function if it exists. If this fails, mark + * the buffer with an error and do not dispatch the I/O. 
+ */ +static bool +xfs_buf_verify_write( + struct xfs_buf *bp) +{ + if (bp->b_ops) { + bp->b_ops->verify_write(bp); + if (bp->b_error) + return false; + } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { + /* + * Non-crc filesystems don't attach verifiers during log + * recovery, so don't warn for such filesystems. + */ + if (xfs_has_crc(bp->b_mount)) { + xfs_warn(bp->b_mount, + "%s: no buf ops on daddr 0x%llx len %d", + __func__, xfs_buf_daddr(bp), + bp->b_length); + xfs_hex_dump(bp->b_addr, XFS_CORRUPTION_DUMP_LEN); + dump_stack(); + } + } + + return true; +} + /* * Buffer I/O submission path, read or write. Asynchronous submission transfers * the buffer lock ownership and the current reference to the IO. It is not @@ -1745,8 +1745,15 @@ xfs_buf_submit( atomic_set(&bp->b_io_remaining, 1); if (bp->b_flags & XBF_ASYNC) xfs_buf_ioacct_inc(bp); + + if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) { + xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE); + goto done; + } + _xfs_buf_ioapply(bp); +done: /* * If _xfs_buf_ioapply failed, we can get back here with only the IO * reference we took above. If we drop it to zero, run completion so -- cgit v1.2.3 From 8db65d312b5757fd70591382a800336dcbf091af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:12 +0100 Subject: xfs: move in-memory buftarg handling out of _xfs_buf_ioapply No I/O to apply for in-memory buffers, so skip the function call entirely. Clean up the b_io_error initialization logic to allow for this. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 02df4fde35b5..1e98fa812ba9 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1604,12 +1604,6 @@ _xfs_buf_ioapply( int size; int i; - /* - * Make sure we capture only current IO errors rather than stale errors - * left over from previous use of the buffer (e.g. failed readahead). - */ - bp->b_error = 0; - if (bp->b_flags & XBF_WRITE) { op = REQ_OP_WRITE; } else { @@ -1621,10 +1615,6 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ op |= REQ_META; - /* in-memory targets are directly mapped, no IO required. */ - if (xfs_buftarg_is_mem(bp->b_target)) - return; - /* * Walk all the vectors issuing IO on them. Set up the initial offset * into the buffer and the desired IO size before we start - @@ -1734,7 +1724,11 @@ xfs_buf_submit( if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); - /* clear the internal error state to avoid spurious errors */ + /* + * Make sure we capture only current IO errors rather than stale errors + * left over from previous use of the buffer (e.g. failed readahead). + */ + bp->b_error = 0; bp->b_io_error = 0; /* @@ -1751,6 +1745,10 @@ xfs_buf_submit( goto done; } + /* In-memory targets are directly mapped, no I/O required. */ + if (xfs_buftarg_is_mem(bp->b_target)) + goto done; + _xfs_buf_ioapply(bp); done: -- cgit v1.2.3 From fac69ec8cd743f509129deb5feae9e3f9ebc2cc8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:13 +0100 Subject: xfs: simplify buffer I/O submission The code in _xfs_buf_ioapply is unnecessarily complicated because it doesn't take advantage of modern bio features. 
Simplify it by making use of bio splitting and chaining, that is build a single bio for the pages in the buffer using a simple loop, and then split that bio on the map boundaries for discontiguous multi-FSB buffers and chain the split bios to the main one so that there is only a single I/O completion. This not only simplifies the code to build the buffer, but also removes the need for the b_io_remaining field as buffer ownership is granted to the bio on submit of the final bio with no chance for a completion before that as well as the b_io_error field that is now superfluous because there always is exactly one completion. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 204 ++++++++++++++++--------------------------------------- fs/xfs/xfs_buf.h | 2 - 2 files changed, 60 insertions(+), 146 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1e98fa812ba9..541e56b13869 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1362,13 +1362,6 @@ xfs_buf_ioend( { trace_xfs_buf_iodone(bp, _RET_IP_); - /* - * Pull in IO completion errors now. We are guaranteed to be running - * single threaded, so we don't need the lock to read b_io_error. - */ - if (!bp->b_error && bp->b_io_error) - xfs_buf_ioerror(bp, bp->b_io_error); - if (bp->b_flags & XBF_READ) { if (!bp->b_error && bp->b_ops) bp->b_ops->verify_read(bp); @@ -1491,118 +1484,26 @@ static void xfs_buf_bio_end_io( struct bio *bio) { - struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private; + struct xfs_buf *bp = bio->bi_private; - if (!bio->bi_status && - (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && - XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) - bio->bi_status = BLK_STS_IOERR; - - /* - * don't overwrite existing errors - otherwise we can lose errors on - * buffers that require multiple bios to complete. - */ - if (bio->bi_status) { - int error = blk_status_to_errno(bio->bi_status); - - cmpxchg(&bp->b_io_error, 0, error); - } + if (bio->bi_status) + xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); + else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && + XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) + xfs_buf_ioerror(bp, -EIO); if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) - xfs_buf_ioend_async(bp); + xfs_buf_ioend_async(bp); bio_put(bio); } -static void -xfs_buf_ioapply_map( - struct xfs_buf *bp, - int map, - int *buf_offset, - int *count, - blk_opf_t op) -{ - int page_index; - unsigned int total_nr_pages = bp->b_page_count; - int nr_pages; - struct bio *bio; - sector_t sector = bp->b_maps[map].bm_bn; - int size; - int offset; - - /* skip the pages in the buffer before the start offset */ - page_index = 0; - offset = *buf_offset; - while (offset >= PAGE_SIZE) { - page_index++; - offset -= PAGE_SIZE; - } - - /* - * Limit the IO size to the length of the current vector, and update the - * remaining IO count for the next time around. 
- */ - size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count); - *count -= size; - *buf_offset += size; - -next_chunk: - atomic_inc(&bp->b_io_remaining); - nr_pages = bio_max_segs(total_nr_pages); - - bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO); - bio->bi_iter.bi_sector = sector; - bio->bi_end_io = xfs_buf_bio_end_io; - bio->bi_private = bp; - - for (; size && nr_pages; nr_pages--, page_index++) { - int rbytes, nbytes = PAGE_SIZE - offset; - - if (nbytes > size) - nbytes = size; - - rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes, - offset); - if (rbytes < nbytes) - break; - - offset = 0; - sector += BTOBB(nbytes); - size -= nbytes; - total_nr_pages--; - } - - if (likely(bio->bi_iter.bi_size)) { - if (xfs_buf_is_vmapped(bp)) { - flush_kernel_vmap_range(bp->b_addr, - xfs_buf_vmap_len(bp)); - } - submit_bio(bio); - if (size) - goto next_chunk; - } else { - /* - * This is guaranteed not to be the last io reference count - * because the caller (xfs_buf_submit) holds a count itself. - */ - atomic_dec(&bp->b_io_remaining); - xfs_buf_ioerror(bp, -EIO); - bio_put(bio); - } - -} - -STATIC void -_xfs_buf_ioapply( - struct xfs_buf *bp) +static inline blk_opf_t +xfs_buf_bio_op( + struct xfs_buf *bp) { - struct blk_plug plug; - blk_opf_t op; - int offset; - int size; - int i; + blk_opf_t op; if (bp->b_flags & XBF_WRITE) { op = REQ_OP_WRITE; @@ -1612,25 +1513,53 @@ _xfs_buf_ioapply( op |= REQ_RAHEAD; } - /* we only use the buffer cache for meta-data */ - op |= REQ_META; + return op | REQ_META; +} + +static void +xfs_buf_submit_bio( + struct xfs_buf *bp) +{ + unsigned int size = BBTOB(bp->b_length); + unsigned int map = 0, p; + struct blk_plug plug; + struct bio *bio; + + bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count, + xfs_buf_bio_op(bp), GFP_NOIO); + bio->bi_private = bp; + bio->bi_end_io = xfs_buf_bio_end_io; + + if (bp->b_flags & _XBF_KMEM) { + __bio_add_page(bio, virt_to_page(bp->b_addr), size, + bp->b_offset); + } else { + for (p = 0; p < bp->b_page_count; p++) + __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0); + bio->bi_iter.bi_size = size; /* limit to the actual size used */ + + if (xfs_buf_is_vmapped(bp)) + flush_kernel_vmap_range(bp->b_addr, + xfs_buf_vmap_len(bp)); + } /* - * Walk all the vectors issuing IO on them. Set up the initial offset - * into the buffer and the desired IO size before we start - - * _xfs_buf_ioapply_vec() will modify them appropriately for each - * subsequent call. + * If there is more than one map segment, split out a new bio for each + * map except of the last one. The last map is handled by the + * remainder of the original bio outside the loop. */ - offset = bp->b_offset; - size = BBTOB(bp->b_length); blk_start_plug(&plug); - for (i = 0; i < bp->b_map_count; i++) { - xfs_buf_ioapply_map(bp, i, &offset, &size, op); - if (bp->b_error) - break; - if (size <= 0) - break; /* all done */ + for (map = 0; map < bp->b_map_count - 1; map++) { + struct bio *split; + + split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS, + &fs_bio_set); + split->bi_iter.bi_sector = bp->b_maps[map].bm_bn; + bio_chain(split, bio); + submit_bio(split); } + bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn; + submit_bio(bio); blk_finish_plug(&plug); } @@ -1729,14 +1658,7 @@ xfs_buf_submit( * left over from previous use of the buffer (e.g. failed readahead). 
*/ bp->b_error = 0; - bp->b_io_error = 0; - /* - * Set the count to 1 initially, this will stop an I/O completion - * callout which happens before we have started all the I/O from calling - * xfs_buf_ioend too early. - */ - atomic_set(&bp->b_io_remaining, 1); if (bp->b_flags & XBF_ASYNC) xfs_buf_ioacct_inc(bp); @@ -1749,21 +1671,15 @@ xfs_buf_submit( if (xfs_buftarg_is_mem(bp->b_target)) goto done; - _xfs_buf_ioapply(bp); + xfs_buf_submit_bio(bp); + goto rele; done: - /* - * If _xfs_buf_ioapply failed, we can get back here with only the IO - * reference we took above. If we drop it to zero, run completion so - * that we don't return to the caller with completion still pending. - */ - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { - if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) - xfs_buf_ioend(bp); - else - xfs_buf_ioend_async(bp); - } - + if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) + xfs_buf_ioend(bp); + else + xfs_buf_ioend_async(bp); +rele: /* * Release the hold that keeps the buffer referenced for the entire * I/O. Note that if the buffer is async, it is not safe to reference diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index da80399c7457..c53d27439ff2 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -184,7 +184,6 @@ struct xfs_buf { struct list_head b_lru; /* lru list */ spinlock_t b_lock; /* internal state lock */ unsigned int b_state; /* internal state flags */ - int b_io_error; /* internal IO error state */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; @@ -202,7 +201,6 @@ struct xfs_buf { struct xfs_buf_map __b_map; /* inline compound buffer map */ int b_map_count; atomic_t b_pin_count; /* pin count */ - atomic_t b_io_remaining; /* #outstanding I/O requests */ unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset of b_addr, only for _XBF_KMEM buffers */ -- cgit v1.2.3 From 5c82a471c2b71357f6319f6ec34d20691969a6ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:14 +0100 Subject: xfs: move invalidate_kernel_vmap_range to xfs_buf_ioend Invalidating cache lines can be fairly expensive, so don't do it in interrupt context. Note that in practice very few setup will actually do anything here as virtually indexed caches are rather uncommon, but we might as well move the call to the proper place while touching this area. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 541e56b13869..e0a34c2aaaaf 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1363,6 +1363,9 @@ xfs_buf_ioend( trace_xfs_buf_iodone(bp, _RET_IP_); if (bp->b_flags & XBF_READ) { + if (!bp->b_error && xfs_buf_is_vmapped(bp)) + invalidate_kernel_vmap_range(bp->b_addr, + xfs_buf_vmap_len(bp)); if (!bp->b_error && bp->b_ops) bp->b_ops->verify_read(bp); if (!bp->b_error) @@ -1492,9 +1495,6 @@ xfs_buf_bio_end_io( XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) xfs_buf_ioerror(bp, -EIO); - if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) - invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); - xfs_buf_ioend_async(bp); bio_put(bio); } -- cgit v1.2.3 From 6dca5abb3d10e27e919e5344ac07e057f443c318 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:15 +0100 Subject: xfs: remove the extra buffer reference in xfs_buf_submit Nothing touches the buffer after it has been submitted now, so the need for the extra transient reference went away as well. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e0a34c2aaaaf..49d087d9ba48 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1643,13 +1643,6 @@ xfs_buf_submit( return; } - /* - * Grab a reference so the buffer does not go away underneath us. For - * async buffers, I/O completion drops the callers reference, which - * could occur before submission returns. - */ - xfs_buf_hold(bp); - if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); @@ -1672,20 +1665,13 @@ xfs_buf_submit( goto done; xfs_buf_submit_bio(bp); - goto rele; + return; done: if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) xfs_buf_ioend(bp); else xfs_buf_ioend_async(bp); -rele: - /* - * Release the hold that keeps the buffer referenced for the entire - * I/O. Note that if the buffer is async, it is not safe to reference - * after this release. - */ - xfs_buf_rele(bp); } void * -- cgit v1.2.3 From 819f29cc7be6a9d949e017ca3f5ccc772a80daef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:16 +0100 Subject: xfs: always complete the buffer inline in xfs_buf_submit xfs_buf_submit now only completes a buffer on error, or for in-memory buftargs. There is no point in using a workqueue for the latter as the completion will just wake up the caller. Optimize this case by avoiding the workqueue roundtrip. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 49d087d9ba48..8e795ccd57d6 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1657,21 +1657,17 @@ xfs_buf_submit( if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) { xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE); - goto done; + xfs_buf_ioend(bp); + return; } /* In-memory targets are directly mapped, no I/O required. 
*/ - if (xfs_buftarg_is_mem(bp->b_target)) - goto done; + if (xfs_buftarg_is_mem(bp->b_target)) { + xfs_buf_ioend(bp); + return; + } xfs_buf_submit_bio(bp); - return; - -done: - if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) - xfs_buf_ioend(bp); - else - xfs_buf_ioend_async(bp); } void * -- cgit v1.2.3 From 46eba93d4f582dce63dfdf506a6f2edf8f1787c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:17 +0100 Subject: xfs: simplify xfsaild_resubmit_item Since commit acc8f8628c37 ("xfs: attach dquot buffer to dquot log item buffer") all buf items that use bp->b_li_list are explicitly checked for in the branch to just clears XFS_LI_FAILED. Remove the dead arm that calls xfs_clear_li_failed. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_trans_ail.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index f56d62dced97..0fcb1828e598 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -359,13 +359,8 @@ xfsaild_resubmit_item( } /* protected by ail_lock */ - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (bp->b_flags & (_XBF_INODES | _XBF_DQUOTS)) - clear_bit(XFS_LI_FAILED, &lip->li_flags); - else - xfs_clear_li_failed(lip); - } - + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) + clear_bit(XFS_LI_FAILED, &lip->li_flags); xfs_buf_unlock(bp); return XFS_ITEM_SUCCESS; } -- cgit v1.2.3 From 4f1aefd13e94bbf027f87befb2e2206ca73a5e7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:18 +0100 Subject: xfs: move b_li_list based retry handling to common code The dquot and inode version are very similar, which is expected given the overall b_li_list logic. The differences are that the inode version also clears the XFS_LI_FLUSHING which is defined in common but only ever set by the inode item, and that the dquot version takes the ail_lock over the list iteration. While this seems sensible given that additions and removals from b_li_list are protected by the ail_lock, log items are only added before buffer submission, and are only removed when completing the buffer, so nothing can change the list when retrying a buffer. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 12 ++++++------ fs/xfs/xfs_buf_item.h | 5 ----- fs/xfs/xfs_dquot.c | 12 ------------ fs/xfs/xfs_inode_item.c | 12 ------------ 4 files changed, 6 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8e795ccd57d6..d5e23c4c7674 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1286,6 +1286,7 @@ xfs_buf_ioend_handle_error( { struct xfs_mount *mp = bp->b_mount; struct xfs_error_cfg *cfg; + struct xfs_log_item *lip; /* * If we've already shutdown the journal because of I/O errors, there's @@ -1333,12 +1334,11 @@ xfs_buf_ioend_handle_error( } /* Still considered a transient error. Caller will schedule retries. 
*/ - if (bp->b_flags & _XBF_INODES) - xfs_buf_inode_io_fail(bp); - else if (bp->b_flags & _XBF_DQUOTS) - xfs_buf_dquot_io_fail(bp); - else - ASSERT(list_empty(&bp->b_li_list)); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { + set_bit(XFS_LI_FAILED, &lip->li_flags); + clear_bit(XFS_LI_FLUSHING, &lip->li_flags); + } + xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return true; diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 4d8a6aece995..8cde85259a58 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -54,17 +54,12 @@ bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_inode_iodone(struct xfs_buf *); -void xfs_buf_inode_io_fail(struct xfs_buf *bp); #ifdef CONFIG_XFS_QUOTA void xfs_buf_dquot_iodone(struct xfs_buf *); -void xfs_buf_dquot_io_fail(struct xfs_buf *bp); #else static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp) { } -static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp) -{ -} #endif /* CONFIG_XFS_QUOTA */ void xfs_buf_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 84b69f686ba8..1082c5d980c8 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1230,18 +1230,6 @@ xfs_buf_dquot_iodone( } } -void -xfs_buf_dquot_io_fail( - struct xfs_buf *bp) -{ - struct xfs_log_item *lip; - - spin_lock(&bp->b_mount->m_ail->ail_lock); - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) - set_bit(XFS_LI_FAILED, &lip->li_flags); - spin_unlock(&bp->b_mount->m_ail->ail_lock); -} - /* Check incore dquot for errors before we flush. */ static xfs_failaddr_t xfs_qm_dqflush_check( diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 70283c6419fd..9b3dac5b2a33 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -1039,18 +1039,6 @@ xfs_buf_inode_iodone( list_splice_tail(&flushed_inodes, &bp->b_li_list); } -void -xfs_buf_inode_io_fail( - struct xfs_buf *bp) -{ - struct xfs_log_item *lip; - - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - set_bit(XFS_LI_FAILED, &lip->li_flags); - clear_bit(XFS_LI_FLUSHING, &lip->li_flags); - } -} - /* * Clear the inode logging fields so no more flushes are attempted. If we are * on a buffer list, it is now safe to remove it because the buffer is -- cgit v1.2.3 From 4e35be63c4ad880c3dba12a287a0ea196541258e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jan 2025 15:12:19 +0100 Subject: xfs: add a b_iodone callback to struct xfs_buf Stop open coding the log item completions and instead add a callback into back into the submitter. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Acked-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 7 ++----- fs/xfs/xfs_buf.h | 5 +---- fs/xfs/xfs_dquot.c | 2 +- fs/xfs/xfs_inode_item.c | 2 +- fs/xfs/xfs_trans_buf.c | 8 ++++---- 5 files changed, 9 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d5e23c4c7674..d9636bff16ce 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1392,11 +1392,8 @@ xfs_buf_ioend( if (bp->b_log_item) xfs_buf_item_done(bp); - if (bp->b_flags & _XBF_INODES) - xfs_buf_inode_iodone(bp); - else if (bp->b_flags & _XBF_DQUOTS) - xfs_buf_dquot_iodone(bp); - + if (bp->b_iodone) + bp->b_iodone(bp); } bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c53d27439ff2..10bf66e074a0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -34,8 +34,6 @@ struct xfs_buf; #define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */ /* buffer type flags for write callbacks */ -#define _XBF_INODES (1u << 16)/* inode buffer */ -#define _XBF_DQUOTS (1u << 17)/* dquot buffer */ #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ /* flags used only internally */ @@ -65,8 +63,6 @@ typedef unsigned int xfs_buf_flags_t; { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ - { _XBF_INODES, "INODES" }, \ - { _XBF_DQUOTS, "DQUOTS" }, \ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ @@ -205,6 +201,7 @@ struct xfs_buf { unsigned int b_offset; /* page offset of b_addr, only for _XBF_KMEM buffers */ int b_error; /* error code on I/O */ + void (*b_iodone)(struct xfs_buf *bp); /* * async write failure retry count. Initialised to zero on the first diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1082c5d980c8..edbc521870a1 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1451,7 +1451,7 @@ xfs_qm_dqflush( * Attach the dquot to the buffer so that we can remove this dquot from * the AIL and release the flush lock once the dquot is synced to disk. 
*/ - bp->b_flags |= _XBF_DQUOTS; + bp->b_iodone = xfs_buf_dquot_iodone; list_add_tail(&lip->li_bio_list, &bp->b_li_list); /* diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 9b3dac5b2a33..35803fcf0beb 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -199,7 +199,7 @@ xfs_inode_item_precommit( xfs_buf_hold(bp); spin_lock(&iip->ili_lock); iip->ili_item.li_buf = bp; - bp->b_flags |= _XBF_INODES; + bp->b_iodone = xfs_buf_inode_iodone; list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); xfs_trans_brelse(tp, bp); } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 8e886ecfd69a..53af546c0b23 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -659,7 +659,7 @@ xfs_trans_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_BUF; - bp->b_flags |= _XBF_INODES; + bp->b_iodone = xfs_buf_inode_iodone; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } @@ -684,7 +684,7 @@ xfs_trans_stale_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_STALE_INODE; - bp->b_flags |= _XBF_INODES; + bp->b_iodone = xfs_buf_inode_iodone; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } @@ -709,7 +709,7 @@ xfs_trans_inode_alloc_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; - bp->b_flags |= _XBF_INODES; + bp->b_iodone = xfs_buf_inode_iodone; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } @@ -820,6 +820,6 @@ xfs_trans_dquot_buf( break; } - bp->b_flags |= _XBF_DQUOTS; + bp->b_iodone = xfs_buf_dquot_iodone; xfs_trans_buf_set_type(tp, bp, type); } -- cgit v1.2.3 From c221a9a29d419a456503d8e930be0b3cba14d5db Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:42 +0100 Subject: btrfs: selftests: correct RAID stripe-tree feature flag setting RAID stripe-tree is an incompatible feature not a read-only compatible, so set the incompat flag not a compat_ro one in the selftest code. Subsequent changes in btrfs_delete_raid_extent() will start checking for this flag. Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index 30f17eb7b6a8..5801142ba7c3 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -478,8 +478,8 @@ static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) ret = PTR_ERR(root); goto out; } - btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, - BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); + btrfs_set_super_incompat_flags(root->fs_info->super_copy, + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; -- cgit v1.2.3 From 9257d8632a36d02f02a94e674238bcc1b16db8b3 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:43 +0100 Subject: btrfs: don't try to delete RAID stripe-extents if we don't need to Even if the RAID stripe-tree is not enabled in the filesystem, do_free_extent_accounting() still calls into btrfs_delete_raid_extent(). Check if the extent in question is on a block-group that has a profile which is used by RAID stripe-tree before attempting to delete a stripe extent. Return early if it doesn't, otherwise we're doing a unnecessary search. 
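Condensed from the hunk below, the guard added at the top of btrfs_delete_raid_extent() is roughly:

	if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE) || !stripe_root)
		return 0;

	if (!btrfs_is_testing(fs_info)) {
		struct btrfs_chunk_map *map;
		bool use_rst;

		/* Does the chunk backing this extent use a profile that
		 * records RAID stripe extents at all? */
		map = btrfs_find_chunk_map(fs_info, start, length);
		if (!map)
			return -EINVAL;
		use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);
		btrfs_free_chunk_map(map);
		if (!use_rst)
			return 0;
	}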
Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 0bf3c032d9dc..be923144cc85 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -59,9 +59,22 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le int slot; int ret; - if (!stripe_root) + if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE) || !stripe_root) return 0; + if (!btrfs_is_testing(fs_info)) { + struct btrfs_chunk_map *map; + bool use_rst; + + map = btrfs_find_chunk_map(fs_info, start, length); + if (!map) + return -EINVAL; + use_rst = btrfs_need_stripe_tree_update(fs_info, map->type); + btrfs_free_chunk_map(map); + if (!use_rst) + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; -- cgit v1.2.3 From 5a0e38eab76991562e0754a93c2c4160819efb03 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:44 +0100 Subject: btrfs: assert RAID stripe-extent length is always greater than 0 When modifying a RAID stripe-extent, ASSERT() that the length of the new RAID stripe-extent is always greater than 0. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index be923144cc85..0c351eda3551 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -28,6 +28,7 @@ static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, .offset = newlen, }; + ASSERT(newlen > 0); ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); leaf = path->nodes[0]; -- cgit v1.2.3 From a678543e609dfb145f0498f895bee05bbc7994a5 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:45 +0100 Subject: btrfs: fix front delete range calculation for RAID stripe extents When deleting the front of a RAID stripe-extent the delete code miscalculates the size on how much to pad the remaining extent part in the front. Fix the calculation so we're always having the sizes we expect. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 0c351eda3551..9e559ad48810 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -136,10 +136,12 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le * length to the new size and then re-insert the item. */ if (found_end > end) { - u64 diff = found_end - end; + u64 diff_end = found_end - end; btrfs_partially_delete_raid_extent(trans, path, &key, - diff, diff); + key.offset - length, + length); + ASSERT(key.offset - diff_end == length); break; } -- cgit v1.2.3 From 50cae2ca69561cbd9a90308ad2a14a442d230662 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:46 +0100 Subject: btrfs: fix tail delete of RAID stripe-extents Fix tail delete of RAID stripe-extents, if there is a range to be deleted as well after the tail delete of the extent. 
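The case being fixed, sketched in the style of the other patches in this series (an illustration, not part of the patch itself):

  |--- RAID Stripe Extent ---|--- RAID Stripe Extent ---|
  |---- keep ----|--------------- drop ---------------|

The first extent starts before the delete range, so only its tail is dropped. Previously the loop stopped after that truncation even though part of the delete range remained; with this change the remaining length is carried into the next iteration so the following extent is processed as well (see the hunk below).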
Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 9e559ad48810..ef76202c3a38 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -119,11 +119,18 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le * length to the new size and then re-insert the item. */ if (found_start < start) { - u64 diff = start - found_start; + u64 diff_start = start - found_start; btrfs_partially_delete_raid_extent(trans, path, &key, - diff, 0); - break; + diff_start, 0); + + start += (key.offset - diff_start); + length -= (key.offset - diff_start); + if (length == 0) + break; + + btrfs_release_path(path); + continue; } /* -- cgit v1.2.3 From 76643119045eed639a3334370cba30c54c4074c1 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:47 +0100 Subject: btrfs: fix deletion of a range spanning parts two RAID stripe extents When a user requests the deletion of a range that spans multiple stripe extents and btrfs_search_slot() returns us the second RAID stripe extent, we need to pick the previous item and truncate it, if there's still a range to delete left, move on to the next item. The following diagram illustrates the operation: |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---| |--- keep ---|--- drop ---| While at it, comment the trivial case of a whole item delete as well. Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index ef76202c3a38..bf665fdef18b 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -99,6 +99,37 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le found_end = found_start + key.offset; ret = 0; + /* + * The stripe extent starts before the range we want to delete, + * but the range spans more than one stripe extent: + * + * |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---| + * |--- keep ---|--- drop ---| + * + * This means we have to get the previous item, truncate its + * length and then restart the search. + */ + if (found_start > start) { + if (slot == 0) { + ret = btrfs_previous_item(stripe_root, path, start, + BTRFS_RAID_STRIPE_KEY); + if (ret) { + if (ret > 0) + ret = -ENOENT; + break; + } + } else { + path->slots[0]--; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + found_start = key.objectid; + found_end = found_start + key.offset; + ASSERT(found_start <= start); + } + if (key.type != BTRFS_RAID_STRIPE_KEY) break; @@ -152,6 +183,7 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le break; } + /* Finally we can delete the whole item, no more special cases. 
*/ ret = btrfs_del_item(trans, stripe_root, path); if (ret) break; -- cgit v1.2.3 From 6aa0e7cc569eb24a7a99c70ad7477d454b3ac0ca Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:48 +0100 Subject: btrfs: implement hole punching for RAID stripe extents If the stripe extent we want to delete starts before the range we want to delete and ends after the range we want to delete we're punching a hole in the stripe extent: |--- RAID Stripe Extent ---| | keep |--- drop ---| keep | This means we need to a) truncate the existing item and b) create a second item for the remaining range. Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 1 + fs/btrfs/raid-stripe-tree.c | 48 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c93f52a30a16..92071ca0655f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3833,6 +3833,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_RAID_STRIPE_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); if (btrfs_leaf_free_space(leaf) >= ins_len) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index bf665fdef18b..858abf518e9b 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -140,6 +140,54 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le trace_btrfs_raid_extent_delete(fs_info, start, end, found_start, found_end); + /* + * The stripe extent starts before the range we want to delete + * and ends after the range we want to delete, i.e. we're + * punching a hole in the stripe extent: + * + * |--- RAID Stripe Extent ---| + * | keep |--- drop ---| keep | + * + * This means we need to a) truncate the existing item and b) + * create a second item for the remaining range. + */ + if (found_start < start && found_end > end) { + size_t item_size; + u64 diff_start = start - found_start; + u64 diff_end = found_end - end; + struct btrfs_stripe_extent *extent; + struct btrfs_key newkey = { + .objectid = end, + .type = BTRFS_RAID_STRIPE_KEY, + .offset = diff_end, + }; + + /* The "right" item. */ + ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey); + if (ret) + break; + + item_size = btrfs_item_size(leaf, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_stripe_extent); + + for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { + struct btrfs_raid_stride *stride = &extent->strides[i]; + u64 phys; + + phys = btrfs_raid_stride_physical(leaf, stride); + phys += diff_start + length; + btrfs_set_raid_stride_physical(leaf, stride, phys); + } + + /* The "left" item. 
*/ + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_partially_delete_raid_extent(trans, path, &key, + diff_start, 0); + break; + } + /* * The stripe extent starts before the range we want to delete: * -- cgit v1.2.3 From dc14ba10781bd2629835696b7cc1febf914768e9 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:49 +0100 Subject: btrfs: don't use btrfs_set_item_key_safe on RAID stripe-extents Don't use btrfs_set_item_key_safe() to modify the keys in the RAID stripe-tree, as this can lead to corruption of the tree, which is caught by the checks in btrfs_set_item_key_safe(): BTRFS info (device nvme1n1): leaf 49168384 gen 15 total ptrs 194 free space 8329 owner 12 BTRFS info (device nvme1n1): refs 2 lock_owner 1030 current 1030 [ snip ] item 105 key (354549760 230 20480) itemoff 14587 itemsize 16 stride 0 devid 5 physical 67502080 item 106 key (354631680 230 4096) itemoff 14571 itemsize 16 stride 0 devid 1 physical 88559616 item 107 key (354631680 230 32768) itemoff 14555 itemsize 16 stride 0 devid 1 physical 88555520 item 108 key (354717696 230 28672) itemoff 14539 itemsize 16 stride 0 devid 2 physical 67604480 [ snip ] BTRFS critical (device nvme1n1): slot 106 key (354631680 230 32768) new key (354635776 230 4096) ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.c:2602! Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 1 UID: 0 PID: 1055 Comm: fsstress Not tainted 6.13.0-rc1+ #1464 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 RIP: 0010:btrfs_set_item_key_safe+0xf7/0x270 Code: RSP: 0018:ffffc90001337ab0 EFLAGS: 00010287 RAX: 0000000000000000 RBX: ffff8881115fd000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000001 RDI: 00000000ffffffff RBP: ffff888110ed6f50 R08: 00000000ffffefff R09: ffffffff8244c500 R10: 00000000ffffefff R11: 00000000ffffffff R12: ffff888100586000 R13: 00000000000000c9 R14: ffffc90001337b1f R15: ffff888110f23b58 FS: 00007f7d75c72740(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa811652c60 CR3: 0000000111398001 CR4: 0000000000370eb0 Call Trace: ? __die_body.cold+0x14/0x1a ? die+0x2e/0x50 ? do_trap+0xca/0x110 ? do_error_trap+0x65/0x80 ? btrfs_set_item_key_safe+0xf7/0x270 ? exc_invalid_op+0x50/0x70 ? btrfs_set_item_key_safe+0xf7/0x270 ? asm_exc_invalid_op+0x1a/0x20 ? btrfs_set_item_key_safe+0xf7/0x270 btrfs_partially_delete_raid_extent+0xc4/0xe0 btrfs_delete_raid_extent+0x227/0x240 __btrfs_free_extent.isra.0+0x57f/0x9c0 ? exc_coproc_segment_overrun+0x40/0x40 __btrfs_run_delayed_refs+0x2fa/0xe80 btrfs_run_delayed_refs+0x81/0xe0 btrfs_commit_transaction+0x2dd/0xbe0 ? 
preempt_count_add+0x52/0xb0 btrfs_sync_file+0x375/0x4c0 do_fsync+0x39/0x70 __x64_sys_fsync+0x13/0x20 do_syscall_64+0x54/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f7d7550ef90 Code: RSP: 002b:00007ffd70237248 EFLAGS: 00000202 ORIG_RAX: 000000000000004a RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f7d7550ef90 RDX: 000000000000013a RSI: 000000000040eb28 RDI: 0000000000000004 RBP: 000000000000001b R08: 0000000000000078 R09: 00007ffd7023725c R10: 00007f7d75400390 R11: 0000000000000202 R12: 028f5c28f5c28f5c R13: 8f5c28f5c28f5c29 R14: 000000000040b520 R15: 00007f7d75c726c8 While the root cause of the tree order corruption isn't clear, using btrfs_duplicate_item() to copy the item and then adjusting both the key and the per-device physical addresses is a safe way to counter this problem. Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 858abf518e9b..1834011ccc49 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -13,12 +13,13 @@ #include "volumes.h" #include "print-tree.h" -static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, +static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, const struct btrfs_key *oldkey, u64 newlen, u64 frontpad) { - struct btrfs_stripe_extent *extent; + struct btrfs_root *stripe_root = trans->fs_info->stripe_root; + struct btrfs_stripe_extent *extent, *newitem; struct extent_buffer *leaf; int slot; size_t item_size; @@ -27,6 +28,7 @@ static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, .type = BTRFS_RAID_STRIPE_KEY, .offset = newlen, }; + int ret; ASSERT(newlen > 0); ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); @@ -34,17 +36,31 @@ static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; slot = path->slots[0]; item_size = btrfs_item_size(leaf, slot); + + newitem = kzalloc(item_size, GFP_NOFS); + if (!newitem) + return -ENOMEM; + extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { struct btrfs_raid_stride *stride = &extent->strides[i]; u64 phys; - phys = btrfs_raid_stride_physical(leaf, stride); - btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad); + phys = btrfs_raid_stride_physical(leaf, stride) + frontpad; + btrfs_set_stack_raid_stride_physical(&newitem->strides[i], phys); } - btrfs_set_item_key_safe(trans, path, &newkey); + ret = btrfs_del_item(trans, stripe_root, path); + if (ret) + goto out; + + btrfs_release_path(path); + ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); + +out: + kfree(newitem); + return ret; } int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) -- cgit v1.2.3 From d44d3d724bb24701546c92ed5f341736bc9d832e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:50 +0100 Subject: btrfs: selftests: check for correct return value of failed lookup Commit 5e72aabc1fff ("btrfs: return ENODATA in case RST lookup fails") changed btrfs_get_raid_extent_offset()'s return value to ENODATA in case the RAID stripe-tree lookup failed. Adjust the test cases which check for absence of a given range to check for ENODATA as return value in this case. 
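A range that is expected to be absent is now probed roughly like this (condensed from the hunk below):

	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type,
					   0, &io_stripe);
	if (ret != -ENODATA) {
		ret = -EINVAL;
		test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail",
			 logical, logical + len);
		goto out;
	}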
Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index 5801142ba7c3..446c46d89152 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -125,7 +125,7 @@ static int test_front_delete(struct btrfs_trans_handle *trans) } ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); - if (!ret) { + if (ret != -ENODATA) { ret = -EINVAL; test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", logical, logical + SZ_32K); -- cgit v1.2.3 From a0afdec2552cf1ae059d58c3ffaa83aae7ddbfe1 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:51 +0100 Subject: btrfs: selftests: don't split RAID extents in half The selftests for partially deleting the start or tail of RAID stripe-extents split these extents in half. This can hide errors in the calculation, so don't split the RAID stripe-extents in half but delete the first or last 16K of the 64K extents. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 44 +++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index 446c46d89152..da73369a79b4 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -14,6 +14,8 @@ #define RST_TEST_NUM_DEVICES (2) #define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) +#define SZ_48K (SZ_32K + SZ_16K) + typedef int (*test_func_t)(struct btrfs_trans_handle *trans); static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices, @@ -94,32 +96,32 @@ static int test_front_delete(struct btrfs_trans_handle *trans) goto out; } - ret = btrfs_delete_raid_extent(trans, logical, SZ_32K); + ret = btrfs_delete_raid_extent(trans, logical, SZ_16K); if (ret) { test_err("deleting RAID extent [%llu, %llu] failed", logical, - logical + SZ_32K); + logical + SZ_16K); goto out; } - len = SZ_32K; - ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len, + len -= SZ_16K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_16K, &len, map_type, 0, &io_stripe); if (ret) { test_err("lookup of RAID extent [%llu, %llu] failed", - logical + SZ_32K, logical + SZ_32K + len); + logical + SZ_16K, logical + SZ_64K); goto out; } - if (io_stripe.physical != logical + SZ_32K) { + if (io_stripe.physical != logical + SZ_16K) { test_err("invalid physical address, expected %llu, got %llu", - logical + SZ_32K, io_stripe.physical); + logical + SZ_16K, io_stripe.physical); ret = -EINVAL; goto out; } - if (len != SZ_32K) { + if (len != SZ_48K) { test_err("invalid stripe length, expected %llu, got %llu", - (u64)SZ_32K, len); + (u64)SZ_48K, len); ret = -EINVAL; goto out; } @@ -128,11 +130,11 @@ static int test_front_delete(struct btrfs_trans_handle *trans) if (ret != -ENODATA) { ret = -EINVAL; test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", - logical, logical + SZ_32K); + logical, logical + SZ_16K); goto out; } - ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + ret = btrfs_delete_raid_extent(trans, logical + SZ_16K, SZ_48K); out: 
btrfs_put_bioc(bioc); return ret; @@ -209,14 +211,14 @@ static int test_tail_delete(struct btrfs_trans_handle *trans) goto out; } - ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + ret = btrfs_delete_raid_extent(trans, logical + SZ_48K, SZ_16K); if (ret) { test_err("deleting RAID extent [%llu, %llu] failed", - logical + SZ_32K, logical + SZ_64K); + logical + SZ_48K, logical + SZ_64K); goto out; } - len = SZ_32K; + len = SZ_48K; ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); if (ret) { test_err("lookup of RAID extent [%llu, %llu] failed", logical, @@ -231,9 +233,19 @@ static int test_tail_delete(struct btrfs_trans_handle *trans) goto out; } - if (len != SZ_32K) { + if (len != SZ_48K) { test_err("invalid stripe length, expected %llu, got %llu", - (u64)SZ_32K, len); + (u64)SZ_48K, len); + ret = -EINVAL; + goto out; + } + + len = SZ_16K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_48K, &len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", + logical + SZ_48K, logical + SZ_64K); ret = -EINVAL; goto out; } -- cgit v1.2.3 From 1d395c3926d8996918ca29a67fe194e7088491d9 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:52 +0100 Subject: btrfs: selftests: test RAID stripe-tree deletion spanning two items Add a selftest for RAID stripe-tree deletion with a delete range spanning two items, so that we're punching a hole into two adjacent RAID stripe extents truncating the first and "moving" the second to the right. The following diagram illustrates the operation: |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---| |----- keep -----|--- drop ---|----- keep ----| Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 144 ++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index da73369a79b4..8dd609c66e95 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -31,6 +31,149 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_de return NULL; } +/* + * Test a 1M RST write that spans two adjacent RST items on disk and then + * delete a portion starting in the first item and spanning into the second + * item. This is similar to test_front_delete(), but spanning multiple items. + */ +static int test_front_delete_prev_item(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 logical2 = SZ_2M; + u64 len = SZ_1M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + /* Insert RAID extent 1. 
*/ + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + /* Insert RAID extent 2, directly adjacent to it. */ + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1 + SZ_512K, SZ_1M); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1 + SZ_512K, (u64)SZ_1M); + goto out; + } + + /* Verify item 1 is truncated to 512K. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical1, + logical1 + len); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_512K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_512K, len); + ret = -EINVAL; + goto out; + } + + /* Verify item 2's start is moved by 512K. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical2 + SZ_512K, &len, + map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical2 + SZ_512K, logical2 + len); + goto out; + } + + if (io_stripe.physical != logical2 + SZ_512K) { + test_err("invalid physical address, expected %llu got %llu", + logical2 + SZ_512K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_512K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_512K, len); + ret = -EINVAL; + goto out; + } + + /* Verify there's a hole at [1M+512K, 2M+512K] . */ + len = SZ_1M; + ret = btrfs_get_raid_extent_offset(fs_info, logical1 + SZ_512K, &len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID [%llu, %llu] succeeded, should fail", + logical1 + SZ_512K, logical1 + SZ_512K + len); + goto out; + } + + /* Clean up after us. */ + ret = btrfs_delete_raid_extent(trans, logical1, SZ_512K); + if (ret) + goto out; + + ret = btrfs_delete_raid_extent(trans, logical2 + SZ_512K, SZ_512K); + +out: + btrfs_put_bioc(bioc); + return ret; +} + /* * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then * delete the 1st 32K, making the new start address 1M+32K. 
@@ -468,6 +611,7 @@ static const test_func_t tests[] = { test_create_update_delete, test_tail_delete, test_front_delete, + test_front_delete_prev_item, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) -- cgit v1.2.3 From 27ae15b25b6e892a4161bc33c7f2b8a356318a2c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:53 +0100 Subject: btrfs: selftests: add selftest for punching holes into the RAID stripe extents Add a selftest for punching a hole into a RAID stripe extent. The test create an 1M extent and punches a 64k bytes long hole at offset of 32k from the start of the extent. Afterwards it verifies the start and length of both resulting new extents "left" and "right" as well as the absence of the hole. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 140 ++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index 8dd609c66e95..d28a48470c26 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -31,6 +31,145 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_de return NULL; } +/* Test punching a hole into a single RAID stripe-extent. */ +static int test_punch_hole(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 hole_start = logical1 + SZ_32K; + u64 hole_len = SZ_64K; + u64 logical2 = hole_start + hole_len; + u64 len = SZ_1M; + u64 len1 = SZ_32K; + u64 len2 = len - len1 - hole_len; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical1, + logical1 + len); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_1M) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_1M, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + hole_start, hole_start + hole_len); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical 
address, expected %llu, got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len1 != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len1); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical2, + logical2 + len2); + goto out; + } + + if (io_stripe.physical != logical2) { + test_err("invalid physical address, expected %llu, got %llu", + logical2, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len2 != len - len1 - hole_len) { + test_err("invalid length, expected %llu, got %llu", + len - len1 - hole_len, len2); + ret = -EINVAL; + goto out; + } + + /* Check for the absence of the hole. */ + ret = btrfs_get_raid_extent_offset(fs_info, hole_start, &hole_len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + ret = -EINVAL; + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + hole_start, hole_start + SZ_64K); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1, len1); + if (ret) + goto out; + + ret = btrfs_delete_raid_extent(trans, logical2, len2); +out: + btrfs_put_bioc(bioc); + return ret; +} + /* * Test a 1M RST write that spans two adjacent RST items on disk and then * delete a portion starting in the first item and spanning into the second @@ -612,6 +751,7 @@ static const test_func_t tests[] = { test_tail_delete, test_front_delete, test_front_delete_prev_item, + test_punch_hole, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) -- cgit v1.2.3 From cfda28fb706d53b332d5183d6091224289e96863 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:54 +0100 Subject: btrfs: selftests: add test for punching a hole into 3 RAID stripe-extents Test creating a range of three RAID stripe-extents and then punch a hole in the middle, deleting all of the middle extents and partially deleting the "book ends". Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 183 ++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index d28a48470c26..482e10b0d7ed 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -31,6 +31,188 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_de return NULL; } +/* + * Test creating a range of three extents and then punch a hole in the middle, + * deleting all of the middle extents and partially deleting the "book ends". + */ +static int test_punch_hole_3extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + u64 hole_start = logical1 + SZ_256K; + u64 hole_len = SZ_2M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents. 
*/ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 + 256K and 2M in length. Extent + * 1 is truncated to 256k length, extent 2 is completely dropped and + * extent 3 is moved 256K to the right. + */ + ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + hole_start, hole_start + hole_len); + goto out; + } + + /* Get the first extent and check its size. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu, got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len1 != SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_256K, len1); + ret = -EINVAL; + goto out; + } + + /* Get the second extent and check it's absent. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", + logical2, logical2 + len2); + ret = -EINVAL; + goto out; + } + + /* Get the third extent and check its size. 
*/ + logical3 += SZ_256K; + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, logical3 + len3); + goto out; + } + + if (io_stripe.physical != logical3) { + test_err("invalid physical address, expected %llu, got %llu", + logical3 + SZ_256K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M - SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M - SZ_256K, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1, len1); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + +out: + btrfs_put_bioc(bioc); + return ret; +} + /* Test punching a hole into a single RAID stripe-extent. */ static int test_punch_hole(struct btrfs_trans_handle *trans) { @@ -752,6 +934,7 @@ static const test_func_t tests[] = { test_front_delete, test_front_delete_prev_item, test_punch_hole, + test_punch_hole_3extents, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) -- cgit v1.2.3 From 9d0c23db26cb58c9fc6ee8817e8f9ebeb25776e5 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:55 +0100 Subject: btrfs: selftests: add a selftest for deleting two out of three extents Add a selftest creating three extents and then deleting two out of the three extents. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 144 ++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index 482e10b0d7ed..a7bc58a5c1e2 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -213,6 +213,149 @@ out: return ret; } +static int test_delete_two_extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents. 
*/ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 and 2M in length. Extents 1 + * and 2 are dropped and extent 3 is kept as is. + */ + ret = btrfs_delete_raid_extent(trans, logical1, len1 + len2); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1 + len2); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical1, len1); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical2, len2); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, len3); + goto out; + } + + if (io_stripe.physical != logical3) { + test_err("invalid physical address, expected %llu, got %llu", + logical3, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); +out: + btrfs_put_bioc(bioc); + return ret; +} + /* Test punching a hole into a single RAID stripe-extent. */ static int test_punch_hole(struct btrfs_trans_handle *trans) { @@ -935,6 +1078,7 @@ static const test_func_t tests[] = { test_front_delete_prev_item, test_punch_hole, test_punch_hole_3extents, + test_delete_two_extents, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) -- cgit v1.2.3 From 9c13cc9c7dd7852ab5080f735f896b4e2ac7e9d4 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Thu, 26 Sep 2024 00:46:02 +0800 Subject: Revert "bcachefs: Fix bch2_btree_node_upgrade()" This reverts commit 62448afee714354a26db8a0f3c644f58628f0792. 
six_lock_tryupgrade fails only if there is an intent lock held; it won't fail no matter how many read locks are held. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 8503931463d1..c1a0f8f78eb4 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -512,7 +512,6 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned level) { struct btree *b = path->l[level].b; - struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level); if (!is_btree_node(path, level)) return false; @@ -536,24 +535,11 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, if (race_fault()) return false; - if (btree_node_locked(path, level)) { - bool ret; - - six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]); - ret = six_lock_tryupgrade(&b->c.lock); - six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]); - - if (ret) - goto success; - } else { - if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) - goto success; - } + if (btree_node_locked(path, level) + ? six_lock_tryupgrade(&b->c.lock) + : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) + goto success; - /* - * Do we already have an intent lock via another path? If so, just bump - * lock count: - */ if (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { btree_node_unlock(trans, path, level); -- cgit v1.2.3 From cdc419dbf286eeaabb361d5cf5cee2cb566b6f5f Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Thu, 26 Sep 2024 00:45:00 +0800 Subject: bcachefs: Merge the condition to avoid additional invocation If the lock has been acquired and unlocked, we don't have to do the clear and wakeup again, though it is harmless since we hold the intent lock. Merging the conditions might be clearer. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 537bf049618f..7e7c66a1e1a6 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -491,8 +491,12 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, list_del(&wait->list); raw_spin_unlock(&lock->wait_lock); - if (unlikely(acquired)) + if (unlikely(acquired)) { do_six_unlock_type(lock, type); + } else if (type == SIX_LOCK_write) { + six_clear_bitmask(lock, SIX_LOCK_HELD_write); + six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); + } break; } @@ -501,10 +505,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, __set_current_state(TASK_RUNNING); out: - if (ret && type == SIX_LOCK_write) { - six_clear_bitmask(lock, SIX_LOCK_HELD_write); - six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); - } trace_contention_end(lock, 0); return ret; -- cgit v1.2.3 From 0ef9ab34f442f7236b5a84a6691959c67639f8b9 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Thu, 10 Oct 2024 21:21:50 +0800 Subject: bcachefs: Do not allow no fail lock request to fail If the transaction chose itself as a victim before and restarted, it might make a no-fail lock request this time. But it might be added to others' lock graphs and be chosen as the victim again, which is no longer safe without an additional check.
We can also convert the cycle detector to be fully RCU-based to solve that unsoundness, but the latency added to trans_put and the additional memory required may not be worth it. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index c1a0f8f78eb4..771d531d6533 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -281,7 +281,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) g.nr = 0; - if (trans->lock_must_abort) { + if (trans->lock_must_abort && !trans->lock_may_not_fail) { if (cycle) return -1; -- cgit v1.2.3 From b5c3dcd0db0452b13a4799b9c2be751e75b541f9 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Thu, 3 Oct 2024 03:06:30 +0800 Subject: bcachefs: Convert open-coded lock_graph_pop_all to helper Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 771d531d6533..8174799dd4b3 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -229,8 +229,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) ret = abort_lock(g, abort); out: if (ret) - while (g->nr) - lock_graph_up(g); + lock_graph_pop_all(g); return ret; } @@ -252,8 +251,7 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, if (orig_trans->lock_may_not_fail) return 0; - while (g->nr) - lock_graph_up(g); + lock_graph_pop_all(g); if (cycle) return 0; -- cgit v1.2.3 From 6853a5e5d48b54127166291b1fc2cc7030fbcc38 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Thu, 3 Oct 2024 03:06:31 +0800 Subject: bcachefs: Introduce lock_graph_pop_from This patch introduces a helper function called lock_graph_pop_from, which pops the graph entries from i onward. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 8174799dd4b3..ec72a8adf713 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -109,6 +109,12 @@ static noinline void lock_graph_pop_all(struct lock_graph *g) lock_graph_up(g); } +static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i) +{ + while (g->g + g->nr > i) + lock_graph_up(g); +} + static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) { g->g[g->nr++] = (struct trans_waiting_for_lock) { @@ -131,8 +137,7 @@ static bool lock_graph_remove_non_waiters(struct lock_graph *g) for (i = g->g + 1; i < g->g + g->nr; i++) if (i->trans->locking != i->node_want || i->trans->locking_wait.start_time != i[-1].lock_start_time) { - while (g->g + g->nr > i) - lock_graph_up(g); + lock_graph_pop_from(g, i); return true; } -- cgit v1.2.3 From b169138d482980d31207c2d1706a2be2529978cf Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Thu, 3 Oct 2024 03:06:32 +0800 Subject: bcachefs: Only abort the transactions in the cycle When the cycle doesn't involve the initiator of the cycle detection, we might choose a transaction that is not involved in the cycle to abort. That is wrong, since aborting it won't break the cycle; this patch therefore chooses the transaction to abort from within the cycle.
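For illustration (hypothetical transactions, not from the patch): if T0 initiates deadlock detection and walks into the cycle T1 -> T2 -> T1, aborting T0 cannot break that cycle, so only the graph entries from the point where the cycle was found onward are valid victims. A minimal sketch of that restricted scan, using the break_cycle() names from the diff below:

	struct trans_waiting_for_lock *i, *abort = NULL;
	unsigned best = 0, pref;

	/*
	 * "from" is the first graph entry that is part of the detected
	 * cycle; earlier entries merely led us here and are not candidates.
	 */
	for (i = from; i < g->g + g->nr; i++) {
		pref = btree_trans_abort_preference(i->trans);
		if (pref > best) {
			abort = i;
			best = pref;
		}
	}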
Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index ec72a8adf713..15ef9f71ca43 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -130,11 +130,17 @@ static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) __lock_graph_down(g, trans); } -static bool lock_graph_remove_non_waiters(struct lock_graph *g) +static bool lock_graph_remove_non_waiters(struct lock_graph *g, + struct trans_waiting_for_lock *from) { struct trans_waiting_for_lock *i; - for (i = g->g + 1; i < g->g + g->nr; i++) + if (from->trans->locking != from->node_want) { + lock_graph_pop_from(g, from); + return true; + } + + for (i = from + 1; i < g->g + g->nr; i++) if (i->trans->locking != i->node_want || i->trans->locking_wait.start_time != i[-1].lock_start_time) { lock_graph_pop_from(g, i); @@ -184,13 +190,14 @@ static int btree_trans_abort_preference(struct btree_trans *trans) return 3; } -static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) +static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, + struct trans_waiting_for_lock *from) { struct trans_waiting_for_lock *i, *abort = NULL; unsigned best = 0, pref; int ret; - if (lock_graph_remove_non_waiters(g)) + if (lock_graph_remove_non_waiters(g, from)) return 0; /* Only checking, for debugfs: */ @@ -200,7 +207,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) goto out; } - for (i = g->g; i < g->g + g->nr; i++) { + for (i = from; i < g->g + g->nr; i++) { pref = btree_trans_abort_preference(i->trans); if (pref > best) { abort = i; @@ -247,7 +254,7 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, for (i = g->g; i < g->g + g->nr; i++) if (i->trans == trans) { closure_put(&trans->ref); - return break_cycle(g, cycle); + return break_cycle(g, cycle, i); } if (g->nr == ARRAY_SIZE(g->g)) { @@ -339,7 +346,7 @@ next: * structures - which means it can't be blocked * waiting on a lock: */ - if (!lock_graph_remove_non_waiters(&g)) { + if (!lock_graph_remove_non_waiters(&g, g.g)) { /* * If lock_graph_remove_non_waiters() * didn't do anything, it must be -- cgit v1.2.3 From 5dd21b27121985cef0642d0cb89cfd749393b727 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Thu, 3 Oct 2024 03:06:33 +0800 Subject: bcachefs: Pop all the transactions from the abort one The transaction is going to abort, so there will be no cycle involving this transaction anymore. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 15ef9f71ca43..10b805a60f52 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -242,6 +242,8 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, out: if (ret) lock_graph_pop_all(g); + else + lock_graph_pop_from(g, abort); return ret; } -- cgit v1.2.3 From 78423deb51ed5acd2ae5e2b7212a14a0da01b39a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 31 Dec 2024 18:42:48 -0500 Subject: bcachefs: Fix self healing on read error We were incorrectly checking if there'd been an io error. 
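The point of the change below is that a struct bch_io_failures can presumably be passed in with no failures recorded yet, so testing the pointer alone does not tell us whether an io error actually happened. A minimal sketch of the helper the diff introduces:

	static inline bool have_io_error(struct bch_io_failures *failed)
	{
		/*
		 * NULL: not an error-recovery read at all.
		 * nr == 0: a failure list was supplied, but nothing has
		 * actually failed yet.
		 */
		return failed && failed->nr;
	}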
Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 34a3569d085a..8c7b2d3d779d 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -91,13 +91,18 @@ static const struct rhashtable_params bch_promote_params = { .automatic_shrinking = true, }; +static inline bool have_io_error(struct bch_io_failures *failed) +{ + return failed && failed->nr; +} + static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, struct bpos pos, struct bch_io_opts opts, unsigned flags, struct bch_io_failures *failed) { - if (!failed) { + if (!have_io_error(failed)) { BUG_ON(!opts.promote_target); if (!(flags & BCH_READ_MAY_PROMOTE)) @@ -224,7 +229,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, struct data_update_opts update_opts = {}; - if (!failed) { + if (!have_io_error(failed)) { update_opts.target = opts.promote_target; update_opts.extra_replicas = 1; update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; @@ -286,7 +291,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, * if failed != NULL we're not actually doing a promote, we're * recovering from an io/checksum error */ - bool promote_full = (failed || + bool promote_full = (have_io_error(failed) || *read_full || READ_ONCE(c->opts.promote_whole_extents)); /* data might have to be decompressed in the write path: */ @@ -989,7 +994,7 @@ retry_pick: bounce = true; } - if (orig->opts.promote_target)// || failed) + if (orig->opts.promote_target || have_io_error(failed)) promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, &rbio, &bounce, &read_full, failed); -- cgit v1.2.3 From a4e11cea277eee080d421b62e457968fd8c92a53 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Jan 2025 15:41:50 -0500 Subject: bcachefs: Document issue with bch_stripe layout We've got a problem with bch_stripe that is going to take an on disk format rev to fix - we can't access the block sector counts if the checksum type is unknown. Document it for now, there are a few other things to fix as well. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec_format.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h index 64ef52e00078..b9770f24f213 100644 --- a/fs/bcachefs/ec_format.h +++ b/fs/bcachefs/ec_format.h @@ -20,6 +20,23 @@ struct bch_stripe { */ __u8 disk_label; + /* + * Variable length sections: + * - Pointers + * - Checksums + * 2D array of [stripe block/device][csum block], with checksum block + * size given by csum_granularity_bits + * - Block sector counts: per-block array of u16s + * + * XXX: + * Either checksums should have come last, or we should have included a + * checksum_size field (the size in bytes of the checksum itself, not + * the blocksize the checksum covers). + * + * Currently we aren't able to access the block sector counts if the + * checksum type is unknown. 
+ */ + struct bch_extent_ptr ptrs[]; } __packed __aligned(8); -- cgit v1.2.3 From 7c9d9223802fbed4dee1ae301661bf346964c9d2 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 13 Jan 2025 19:31:28 +0100 Subject: gfs2: Truncate address space when flipping GFS2_DIF_JDATA flag Truncate an inode's address space when flipping the GFS2_DIF_JDATA flag: depending on that flag, the pages in the address space will either use buffer heads or iomap_folio_state structs, and we cannot mix the two. Reported-by: Kun Hu , Jiaji Qin Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 1e73cf87ff88..c9bb3be21d2b 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -251,6 +251,7 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask) error = filemap_fdatawait(inode->i_mapping); if (error) goto out; + truncate_inode_pages(inode->i_mapping, 0); if (new_flags & GFS2_DIF_JDATA) gfs2_ordered_del_inode(ip); } -- cgit v1.2.3 From 3feec68563dda59517f83d19123aa287a1dfd068 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:53 -0500 Subject: nfs/localio: add direct IO enablement with sync and async IO support This commit simply adds the required O_DIRECT plumbing. It doesn't address the fact that NFS doesn't ensure all writes are page aligned (nor device logical block size aligned as required by O_DIRECT). Because NFS will read-modify-write for IO that isn't aligned, LOCALIO will not use O_DIRECT semantics by default if/when an application requests the use of O_DIRECT. Allow the use of O_DIRECT semantics by: 1: Adding a flag to the nfs_pgio_header struct to allow the NFS O_DIRECT layer to signal that O_DIRECT was used by the application 2: Adding a 'localio_O_DIRECT_semantics' NFS module parameter that when enabled will cause LOCALIO to use O_DIRECT semantics (this may cause IO to fail if applications do not properly align their IO). This commit is derived from code developed by Weston Andros Adamson. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- Documentation/filesystems/nfs/localio.rst | 13 +++++ fs/nfs/direct.c | 1 + fs/nfs/localio.c | 93 +++++++++++++++++++++++++++---- include/linux/nfs_xdr.h | 1 + 4 files changed, 98 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/nfs/localio.rst b/Documentation/filesystems/nfs/localio.rst index bd1967e2eab3..20fc901a08f4 100644 --- a/Documentation/filesystems/nfs/localio.rst +++ b/Documentation/filesystems/nfs/localio.rst @@ -306,6 +306,19 @@ is issuing IO to the underlying local filesystem that it is sharing with the NFS server. See: fs/nfs/localio.c:nfs_local_doio() and fs/nfs/localio.c:nfs_local_commit(). +With normal NFS that makes use of RPC to issue IO to the server, if an +application uses O_DIRECT the NFS client will bypass the pagecache but +the NFS server will not. Because the NFS server's use of buffered IO +affords applications to be less precise with their alignment when +issuing IO to the NFS client. LOCALIO can be configured to use O_DIRECT +semantics by setting the 'localio_O_DIRECT_semantics' nfs module +parameter to Y, e.g.: + + echo Y > /sys/module/nfs/parameters/localio_O_DIRECT_semantics + +Once enabled, it will cause LOCALIO to use O_DIRECT semantics (this may +cause IO to fail if applications do not properly align their IO). 
+ Security ======== diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b08dbe96bc57..f45beea92d03 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -303,6 +303,7 @@ static void nfs_read_sync_pgio_error(struct list_head *head, int error) static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr) { get_dreq(hdr->dreq); + set_bit(NFS_IOHDR_ODIRECT, &hdr->flags); } static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 4b8618cf114c..7637786c5cb7 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -35,6 +35,7 @@ struct nfs_local_kiocb { struct bio_vec *bvec; struct nfs_pgio_header *hdr; struct work_struct work; + void (*aio_complete_work)(struct work_struct *); struct nfsd_file *localio; }; @@ -48,6 +49,11 @@ struct nfs_local_fsync_ctx { static bool localio_enabled __read_mostly = true; module_param(localio_enabled, bool, 0644); +static bool localio_O_DIRECT_semantics __read_mostly = false; +module_param(localio_O_DIRECT_semantics, bool, 0644); +MODULE_PARM_DESC(localio_O_DIRECT_semantics, + "LOCALIO will use O_DIRECT semantics to filesystem."); + static inline bool nfs_client_is_local(const struct nfs_client *clp) { return !!test_bit(NFS_CS_LOCAL_IO, &clp->cl_flags); @@ -285,10 +291,19 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, kfree(iocb); return NULL; } - init_sync_kiocb(&iocb->kiocb, file); + + if (localio_O_DIRECT_semantics && + test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { + iocb->kiocb.ki_filp = file; + iocb->kiocb.ki_flags = IOCB_DIRECT; + } else + init_sync_kiocb(&iocb->kiocb, file); + iocb->kiocb.ki_pos = hdr->args.offset; iocb->hdr = hdr; iocb->kiocb.ki_flags &= ~IOCB_APPEND; + iocb->aio_complete_work = NULL; + return iocb; } @@ -343,6 +358,18 @@ nfs_local_pgio_release(struct nfs_local_kiocb *iocb) nfs_local_hdr_release(hdr, hdr->task.tk_ops); } +/* + * Complete the I/O from iocb->kiocb.ki_complete() + * + * Note that this function can be called from a bottom half context, + * hence we need to queue the rpc_call_done() etc to a workqueue + */ +static inline void nfs_local_pgio_aio_complete(struct nfs_local_kiocb *iocb) +{ + INIT_WORK(&iocb->work, iocb->aio_complete_work); + queue_work(nfsiod_workqueue, &iocb->work); +} + static void nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) { @@ -365,6 +392,23 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) status > 0 ? 
status : 0, hdr->res.eof); } +static void nfs_local_read_aio_complete_work(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + + nfs_local_pgio_release(iocb); +} + +static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) +{ + struct nfs_local_kiocb *iocb = + container_of(kiocb, struct nfs_local_kiocb, kiocb); + + nfs_local_read_done(iocb, ret); + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ +} + static void nfs_local_call_read(struct work_struct *work) { struct nfs_local_kiocb *iocb = @@ -379,10 +423,10 @@ static void nfs_local_call_read(struct work_struct *work) nfs_local_iter_init(&iter, iocb, READ); status = filp->f_op->read_iter(&iocb->kiocb, &iter); - WARN_ON_ONCE(status == -EIOCBQUEUED); - - nfs_local_read_done(iocb, status); - nfs_local_pgio_release(iocb); + if (status != -EIOCBQUEUED) { + nfs_local_read_done(iocb, status); + nfs_local_pgio_release(iocb); + } revert_creds(save_cred); } @@ -410,6 +454,11 @@ nfs_do_local_read(struct nfs_pgio_header *hdr, nfs_local_pgio_init(hdr, call_ops); hdr->res.eof = false; + if (iocb->kiocb.ki_flags & IOCB_DIRECT) { + iocb->kiocb.ki_complete = nfs_local_read_aio_complete; + iocb->aio_complete_work = nfs_local_read_aio_complete_work; + } + INIT_WORK(&iocb->work, nfs_local_call_read); queue_work(nfslocaliod_workqueue, &iocb->work); @@ -534,6 +583,24 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) nfs_local_pgio_done(hdr, status); } +static void nfs_local_write_aio_complete_work(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + + nfs_local_vfs_getattr(iocb); + nfs_local_pgio_release(iocb); +} + +static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) +{ + struct nfs_local_kiocb *iocb = + container_of(kiocb, struct nfs_local_kiocb, kiocb); + + nfs_local_write_done(iocb, ret); + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ +} + static void nfs_local_call_write(struct work_struct *work) { struct nfs_local_kiocb *iocb = @@ -552,11 +619,11 @@ static void nfs_local_call_write(struct work_struct *work) file_start_write(filp); status = filp->f_op->write_iter(&iocb->kiocb, &iter); file_end_write(filp); - WARN_ON_ONCE(status == -EIOCBQUEUED); - - nfs_local_write_done(iocb, status); - nfs_local_vfs_getattr(iocb); - nfs_local_pgio_release(iocb); + if (status != -EIOCBQUEUED) { + nfs_local_write_done(iocb, status); + nfs_local_vfs_getattr(iocb); + nfs_local_pgio_release(iocb); + } revert_creds(save_cred); current->flags = old_flags; @@ -592,10 +659,16 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, case NFS_FILE_SYNC: iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC; } + nfs_local_pgio_init(hdr, call_ops); nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable); + if (iocb->kiocb.ki_flags & IOCB_DIRECT) { + iocb->kiocb.ki_complete = nfs_local_write_aio_complete; + iocb->aio_complete_work = nfs_local_write_aio_complete_work; + } + INIT_WORK(&iocb->work, nfs_local_call_write); queue_work(nfslocaliod_workqueue, &iocb->work); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 559273a0f16d..80766ff0a47c 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1637,6 +1637,7 @@ enum { NFS_IOHDR_RESEND_PNFS, NFS_IOHDR_RESEND_MDS, NFS_IOHDR_UNSTABLE_WRITES, + NFS_IOHDR_ODIRECT, }; struct nfs_io_completion; -- cgit v1.2.3 From a61466315d7afca032342183a57e62d5e3a3157c Mon Sep 17 
00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:54 -0500 Subject: nfsd: add nfsd_file_{get,put} to 'nfs_to' nfsd_localio_operations In a later commit LOCALIO must call both nfsd_file_get and nfsd_file_put to manage extra nfsd_file references. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfsd/localio.c | 2 ++ include/linux/nfslocalio.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index f441cb9f74d5..8beda4c85111 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -29,6 +29,8 @@ static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_serv_put = nfsd_serv_put, .nfsd_open_local_fh = nfsd_open_local_fh, .nfsd_file_put_local = nfsd_file_put_local, + .nfsd_file_get = nfsd_file_get, + .nfsd_file_put = nfsd_file_put, .nfsd_file_file = nfsd_file_file, }; diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 9202f4b24343..ab6a2a53f505 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -56,6 +56,8 @@ struct nfsd_localio_operations { const struct nfs_fh *, const fmode_t); struct net *(*nfsd_file_put_local)(struct nfsd_file *); + struct nfsd_file *(*nfsd_file_get)(struct nfsd_file *); + void (*nfsd_file_put)(struct nfsd_file *); struct file *(*nfsd_file_file)(struct nfsd_file *); } ____cacheline_aligned; -- cgit v1.2.3 From b49f049a22227df701bfbca083d6cc859496e615 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:55 -0500 Subject: nfs_common: rename functions that invalidate LOCALIO nfs_clients Rename nfs_uuid_invalidate_one_client to nfs_localio_disable_client. Rename nfs_uuid_invalidate_clients to nfs_localio_invalidate_clients. Signed-off-by: Mike Snitzer Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/localio.c | 2 +- fs/nfs_common/nfslocalio.c | 8 ++++---- fs/nfsd/nfsctl.c | 4 ++-- include/linux/nfslocalio.h | 5 +++-- 4 files changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 7637786c5cb7..94af5d89bb99 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -140,7 +140,7 @@ void nfs_local_disable(struct nfs_client *clp) spin_lock(&clp->cl_localio_lock); if (test_and_clear_bit(NFS_CS_LOCAL_IO, &clp->cl_flags)) { trace_nfs_local_disable(clp); - nfs_uuid_invalidate_one_client(&clp->cl_uuid); + nfs_localio_disable_client(&clp->cl_uuid); } spin_unlock(&clp->cl_localio_lock); } diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index a74ec08f6c96..904439e4bb85 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -107,7 +107,7 @@ static void nfs_uuid_put_locked(nfs_uuid_t *nfs_uuid) list_del_init(&nfs_uuid->list); } -void nfs_uuid_invalidate_clients(struct list_head *list) +void nfs_localio_invalidate_clients(struct list_head *list) { nfs_uuid_t *nfs_uuid, *tmp; @@ -116,9 +116,9 @@ void nfs_uuid_invalidate_clients(struct list_head *list) nfs_uuid_put_locked(nfs_uuid); spin_unlock(&nfs_uuid_lock); } -EXPORT_SYMBOL_GPL(nfs_uuid_invalidate_clients); +EXPORT_SYMBOL_GPL(nfs_localio_invalidate_clients); -void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid) +void nfs_localio_disable_client(nfs_uuid_t *nfs_uuid) { if (nfs_uuid->net) { spin_lock(&nfs_uuid_lock); @@ -126,7 +126,7 @@ void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid) spin_unlock(&nfs_uuid_lock); } } -EXPORT_SYMBOL_GPL(nfs_uuid_invalidate_one_client);
+EXPORT_SYMBOL_GPL(nfs_localio_disable_client); struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, struct rpc_clnt *rpc_clnt, const struct cred *cred, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3adbc05ebaac..727904d8a4d0 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2276,14 +2276,14 @@ out_export_error: * nfsd_net_pre_exit - Disconnect localio clients from net namespace * @net: a network namespace that is about to be destroyed * - * This invalidated ->net pointers held by localio clients + * This invalidates ->net pointers held by localio clients * while they can still safely access nn->counter. */ static __net_exit void nfsd_net_pre_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - nfs_uuid_invalidate_clients(&nn->local_clients); + nfs_localio_invalidate_clients(&nn->local_clients); } #endif diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index ab6a2a53f505..a05d1043f2b0 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -37,8 +37,9 @@ bool nfs_uuid_begin(nfs_uuid_t *); void nfs_uuid_end(nfs_uuid_t *); void nfs_uuid_is_local(const uuid_t *, struct list_head *, struct net *, struct auth_domain *, struct module *); -void nfs_uuid_invalidate_clients(struct list_head *list); -void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid); + +void nfs_localio_disable_client(nfs_uuid_t *nfs_uuid); +void nfs_localio_invalidate_clients(struct list_head *list); /* localio needs to map filehandle -> struct nfsd_file */ extern struct nfsd_file * -- cgit v1.2.3 From 4ee7ba40007357a48447a8cbc667480acf9a006a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:56 -0500 Subject: nfs_common: move localio_lock to new lock member of nfs_uuid_t Remove cl_localio_lock from 'struct nfs_client' in favor of adding a lock to the nfs_uuid_t struct (which is embedded in each nfs_client). Push nfs_local_{enable,disable} implementation down to nfs_common. Those methods now call nfs_localio_{enable,disable}_client. This allows implementing nfs_localio_invalidate_clients in terms of nfs_localio_disable_client. 
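Condensed from the diff below, the resulting shape of the disable path: the new per-uuid spinlock serializes enable/disable for a single client, while the global nfs_uuid_lock still protects the shared uuid list.

	void nfs_localio_disable_client(struct nfs_client *clp)
	{
		nfs_uuid_t *nfs_uuid = &clp->cl_uuid;

		spin_lock(&nfs_uuid->lock);		/* per-client lock, now in nfs_uuid_t */
		if (test_and_clear_bit(NFS_CS_LOCAL_IO, &clp->cl_flags)) {
			spin_lock(&nfs_uuid_lock);	/* global uuid list lock */
			nfs_uuid_put_locked(nfs_uuid);
			spin_unlock(&nfs_uuid_lock);
		}
		spin_unlock(&nfs_uuid->lock);
	}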
Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 1 - fs/nfs/localio.c | 14 ++++-------- fs/nfs_common/nfslocalio.c | 55 ++++++++++++++++++++++++++++++++-------------- include/linux/nfs_fs_sb.h | 1 - include/linux/nfslocalio.h | 8 ++++++- 5 files changed, 50 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 550ca934c9cf..e83e1ce04613 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -186,7 +186,6 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) seqlock_init(&clp->cl_boot_lock); ktime_get_real_ts64(&clp->cl_nfssvc_boot); nfs_uuid_init(&clp->cl_uuid); - spin_lock_init(&clp->cl_localio_lock); #endif /* CONFIG_NFS_LOCALIO */ clp->cl_principal = "*"; diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 94af5d89bb99..7191135b47a4 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -126,10 +126,8 @@ const struct rpc_program nfslocalio_program = { */ static void nfs_local_enable(struct nfs_client *clp) { - spin_lock(&clp->cl_localio_lock); - set_bit(NFS_CS_LOCAL_IO, &clp->cl_flags); trace_nfs_local_enable(clp); - spin_unlock(&clp->cl_localio_lock); + nfs_localio_enable_client(clp); } /* @@ -137,12 +135,8 @@ static void nfs_local_enable(struct nfs_client *clp) */ void nfs_local_disable(struct nfs_client *clp) { - spin_lock(&clp->cl_localio_lock); - if (test_and_clear_bit(NFS_CS_LOCAL_IO, &clp->cl_flags)) { - trace_nfs_local_disable(clp); - nfs_localio_disable_client(&clp->cl_uuid); - } - spin_unlock(&clp->cl_localio_lock); + trace_nfs_local_disable(clp); + nfs_localio_disable_client(clp); } /* @@ -184,7 +178,7 @@ static bool nfs_server_uuid_is_local(struct nfs_client *clp) rpc_shutdown_client(rpcclient_localio); /* Server is only local if it initialized required struct members */ - if (status || !clp->cl_uuid.net || !clp->cl_uuid.dom) + if (status || !rcu_access_pointer(clp->cl_uuid.net) || !clp->cl_uuid.dom) return false; return true; diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index 904439e4bb85..abc132166742 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -7,6 +7,9 @@ #include #include #include +#include +#include +#include #include MODULE_LICENSE("GPL"); @@ -25,6 +28,7 @@ void nfs_uuid_init(nfs_uuid_t *nfs_uuid) nfs_uuid->net = NULL; nfs_uuid->dom = NULL; INIT_LIST_HEAD(&nfs_uuid->list); + spin_lock_init(&nfs_uuid->lock); } EXPORT_SYMBOL_GPL(nfs_uuid_init); @@ -94,12 +98,23 @@ void nfs_uuid_is_local(const uuid_t *uuid, struct list_head *list, } EXPORT_SYMBOL_GPL(nfs_uuid_is_local); +void nfs_localio_enable_client(struct nfs_client *clp) +{ + nfs_uuid_t *nfs_uuid = &clp->cl_uuid; + + spin_lock(&nfs_uuid->lock); + set_bit(NFS_CS_LOCAL_IO, &clp->cl_flags); + spin_unlock(&nfs_uuid->lock); +} +EXPORT_SYMBOL_GPL(nfs_localio_enable_client); + static void nfs_uuid_put_locked(nfs_uuid_t *nfs_uuid) { - if (nfs_uuid->net) { - module_put(nfsd_mod); - nfs_uuid->net = NULL; - } + if (!nfs_uuid->net) + return; + module_put(nfsd_mod); + RCU_INIT_POINTER(nfs_uuid->net, NULL); + if (nfs_uuid->dom) { auth_domain_put(nfs_uuid->dom); nfs_uuid->dom = NULL; @@ -107,27 +122,35 @@ static void nfs_uuid_put_locked(nfs_uuid_t *nfs_uuid) list_del_init(&nfs_uuid->list); } -void nfs_localio_invalidate_clients(struct list_head *list) +void nfs_localio_disable_client(struct nfs_client *clp) { - nfs_uuid_t *nfs_uuid, *tmp; - - spin_lock(&nfs_uuid_lock); - list_for_each_entry_safe(nfs_uuid, tmp, list, list) - 
nfs_uuid_put_locked(nfs_uuid); - spin_unlock(&nfs_uuid_lock); -} -EXPORT_SYMBOL_GPL(nfs_localio_invalidate_clients); + nfs_uuid_t *nfs_uuid = &clp->cl_uuid; -void nfs_localio_disable_client(nfs_uuid_t *nfs_uuid) -{ - if (nfs_uuid->net) { + spin_lock(&nfs_uuid->lock); + if (test_and_clear_bit(NFS_CS_LOCAL_IO, &clp->cl_flags)) { spin_lock(&nfs_uuid_lock); nfs_uuid_put_locked(nfs_uuid); spin_unlock(&nfs_uuid_lock); } + spin_unlock(&nfs_uuid->lock); } EXPORT_SYMBOL_GPL(nfs_localio_disable_client); +void nfs_localio_invalidate_clients(struct list_head *cl_uuid_list) +{ + nfs_uuid_t *nfs_uuid, *tmp; + + spin_lock(&nfs_uuid_lock); + list_for_each_entry_safe(nfs_uuid, tmp, cl_uuid_list, list) { + struct nfs_client *clp = + container_of(nfs_uuid, struct nfs_client, cl_uuid); + + nfs_localio_disable_client(clp); + } + spin_unlock(&nfs_uuid_lock); +} +EXPORT_SYMBOL_GPL(nfs_localio_invalidate_clients); + struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, struct rpc_clnt *rpc_clnt, const struct cred *cred, const struct nfs_fh *nfs_fh, const fmode_t fmode) diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b804346a9741..239d86ef166c 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -132,7 +132,6 @@ struct nfs_client { struct timespec64 cl_nfssvc_boot; seqlock_t cl_boot_lock; nfs_uuid_t cl_uuid; - spinlock_t cl_localio_lock; #endif /* CONFIG_NFS_LOCALIO */ }; diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index a05d1043f2b0..4d5583873f41 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -6,6 +6,7 @@ #ifndef __LINUX_NFSLOCALIO_H #define __LINUX_NFSLOCALIO_H + /* nfsd_file structure is purposely kept opaque to NFS client */ struct nfsd_file; @@ -19,6 +20,8 @@ struct nfsd_file; #include #include +struct nfs_client; + /* * Useful to allow a client to negotiate if localio * possible with its server. @@ -27,6 +30,8 @@ struct nfsd_file; */ typedef struct { uuid_t uuid; + /* sadly this struct is just over a cacheline, avoid bouncing */ + spinlock_t ____cacheline_aligned lock; struct list_head list; struct net __rcu *net; /* nfsd's network namespace */ struct auth_domain *dom; /* auth_domain for localio */ @@ -38,7 +43,8 @@ void nfs_uuid_end(nfs_uuid_t *); void nfs_uuid_is_local(const uuid_t *, struct list_head *, struct net *, struct auth_domain *, struct module *); -void nfs_localio_disable_client(nfs_uuid_t *nfs_uuid); +void nfs_localio_enable_client(struct nfs_client *clp); +void nfs_localio_disable_client(struct nfs_client *clp); void nfs_localio_invalidate_clients(struct list_head *list); /* localio needs to map filehandle -> struct nfsd_file */ -- cgit v1.2.3 From 86e00412254a717ffd5d38dc5ec0ee1cce6281b3 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:57 -0500 Subject: nfs: cache all open LOCALIO nfsd_file(s) in client This commit switches from leaning heavily on NFSD's filecache (in terms of GC'd nfsd_files) back to caching nfsd_files in the client. A later commit will add the callback mechanism needed to allow NFSD to force the NFS client to cleanup all cached nfsd_files. Add nfs_fh_localio_init() and 'struct nfs_fh_localio' to cache opened nfsd_file(s) (both a RO and RW nfsd_file is able to be opened and cached for a given nfs_fh). Update nfs_local_open_fh() to cache the nfsd_file once it is opened using __nfs_local_open_fh(). Introduce nfs_close_local_fh() to clear the cached open nfsd_files and call nfs_to_nfsd_file_put_local(). 
Refcounting is such that: - nfs_local_open_fh() is paired with nfs_close_local_fh(). - __nfs_local_open_fh() is paired with nfs_to_nfsd_file_put_local(). - nfs_local_file_get() is paired with nfs_local_file_put(). Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/flexfilelayout/flexfilelayout.c | 25 +++++----- fs/nfs/flexfilelayout/flexfilelayout.h | 1 + fs/nfs/inode.c | 3 ++ fs/nfs/internal.h | 4 +- fs/nfs/localio.c | 89 +++++++++++++++++++++++++++------- fs/nfs/pagelist.c | 5 +- fs/nfs/write.c | 3 +- fs/nfs_common/nfslocalio.c | 52 +++++++++++++++++++- include/linux/nfs_fs.h | 22 ++++++++- include/linux/nfslocalio.h | 16 +++--- 10 files changed, 176 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index f78115c6c2c1..ce61bf1ada6c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -164,18 +164,17 @@ decode_name(struct xdr_stream *xdr, u32 *id) } static struct nfsd_file * -ff_local_open_fh(struct nfs_client *clp, const struct cred *cred, +ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, + struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, fmode_t mode) { - if (mode & FMODE_WRITE) { - /* - * Always request read and write access since this corresponds - * to a rw layout. - */ - mode |= FMODE_READ; - } +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - return nfs_local_open_fh(clp, cred, fh, mode); + return nfs_local_open_fh(clp, cred, fh, &mirror->nfl, mode); +#else + return NULL; +#endif } static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, @@ -247,6 +246,7 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) spin_lock_init(&mirror->lock); refcount_set(&mirror->ref, 1); INIT_LIST_HEAD(&mirror->mirrors); + nfs_localio_file_init(&mirror->nfl); } return mirror; } @@ -257,6 +257,7 @@ static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) ff_layout_remove_mirror(mirror); kfree(mirror->fh_versions); + nfs_close_local_fh(&mirror->nfl); cred = rcu_access_pointer(mirror->ro_cred); put_cred(cred); cred = rcu_access_pointer(mirror->rw_cred); @@ -1820,7 +1821,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->mds_offset = offset; /* Start IO accounting for local read */ - localio = ff_local_open_fh(ds->ds_clp, ds_cred, fh, FMODE_READ); + localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, FMODE_READ); if (localio) { hdr->task.tk_start = ktime_get(); ff_layout_read_record_layoutstats_start(&hdr->task, hdr); @@ -1896,7 +1897,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) hdr->args.offset = offset; /* Start IO accounting for local write */ - localio = ff_local_open_fh(ds->ds_clp, ds_cred, fh, + localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, FMODE_READ|FMODE_WRITE); if (localio) { hdr->task.tk_start = ktime_get(); @@ -1981,7 +1982,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) data->args.fh = fh; /* Start IO accounting for local commit */ - localio = ff_local_open_fh(ds->ds_clp, ds_cred, fh, + localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, FMODE_READ|FMODE_WRITE); if (localio) { data->task.tk_start = ktime_get(); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index f84b3fb0dddd..095df09017a5 100644 --- 
a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -83,6 +83,7 @@ struct nfs4_ff_layout_mirror { nfs4_stateid stateid; const struct cred __rcu *ro_cred; const struct cred __rcu *rw_cred; + struct nfs_file_localio nfl; refcount_t ref; spinlock_t lock; unsigned long flags; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 596f35170137..1aa67fca69b2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1137,6 +1137,8 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, ctx->lock_context.open_context = ctx; INIT_LIST_HEAD(&ctx->list); ctx->mdsthreshold = NULL; + nfs_localio_file_init(&ctx->nfl); + return ctx; } EXPORT_SYMBOL_GPL(alloc_nfs_open_context); @@ -1168,6 +1170,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) nfs_sb_deactive(sb); put_rpccred(rcu_dereference_protected(ctx->ll_cred, 1)); kfree(ctx->mdsthreshold); + nfs_close_local_fh(&ctx->nfl); kfree_rcu(ctx, rcu_head); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e564bd11ba60..febf289a9e4c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -460,6 +460,7 @@ extern void nfs_local_probe(struct nfs_client *); extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, const struct cred *, struct nfs_fh *, + struct nfs_file_localio *, const fmode_t); extern int nfs_local_doio(struct nfs_client *, struct nfsd_file *, @@ -475,7 +476,8 @@ static inline void nfs_local_disable(struct nfs_client *clp) {} static inline void nfs_local_probe(struct nfs_client *clp) {} static inline struct nfsd_file * nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, - struct nfs_fh *fh, const fmode_t mode) + struct nfs_fh *fh, struct nfs_file_localio *nfl, + const fmode_t mode) { return NULL; } diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 7191135b47a4..7e432057c3a1 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -211,27 +211,33 @@ void nfs_local_probe(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs_local_probe); +static inline struct nfsd_file *nfs_local_file_get(struct nfsd_file *nf) +{ + return nfs_to->nfsd_file_get(nf); +} + +static inline void nfs_local_file_put(struct nfsd_file *nf) +{ + nfs_to->nfsd_file_put(nf); +} + /* - * nfs_local_open_fh - open a local filehandle in terms of nfsd_file + * __nfs_local_open_fh - open a local filehandle in terms of nfsd_file. * - * Returns a pointer to a struct nfsd_file or NULL + * Returns a pointer to a struct nfsd_file or ERR_PTR. + * Caller must release returned nfsd_file with nfs_to_nfsd_file_put_local(). 
*/ -struct nfsd_file * -nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, - struct nfs_fh *fh, const fmode_t mode) +static struct nfsd_file * +__nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, + struct nfs_fh *fh, struct nfs_file_localio *nfl, + const fmode_t mode) { struct nfsd_file *localio; - int status; - - if (!nfs_server_is_local(clp)) - return NULL; - if (mode & ~(FMODE_READ | FMODE_WRITE)) - return NULL; localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient, - cred, fh, mode); + cred, fh, nfl, mode); if (IS_ERR(localio)) { - status = PTR_ERR(localio); + int status = PTR_ERR(localio); trace_nfs_local_open_fh(fh, mode, status); switch (status) { case -ENOMEM: @@ -240,10 +246,59 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, /* Revalidate localio, will disable if unsupported */ nfs_local_probe(clp); } - return NULL; } return localio; } + +/* + * nfs_local_open_fh - open a local filehandle in terms of nfsd_file. + * First checking if the open nfsd_file is already cached, otherwise + * must __nfs_local_open_fh and insert the nfsd_file in nfs_file_localio. + * + * Returns a pointer to a struct nfsd_file or NULL. + */ +struct nfsd_file * +nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, + struct nfs_fh *fh, struct nfs_file_localio *nfl, + const fmode_t mode) +{ + struct nfsd_file *nf, *new, __rcu **pnf; + + if (!nfs_server_is_local(clp)) + return NULL; + if (mode & ~(FMODE_READ | FMODE_WRITE)) + return NULL; + + if (mode & FMODE_WRITE) + pnf = &nfl->rw_file; + else + pnf = &nfl->ro_file; + + new = NULL; + rcu_read_lock(); + nf = rcu_dereference(*pnf); + if (!nf) { + rcu_read_unlock(); + new = __nfs_local_open_fh(clp, cred, fh, nfl, mode); + if (IS_ERR(new)) + return NULL; + /* try to swap in the pointer */ + spin_lock(&clp->cl_uuid.lock); + nf = rcu_dereference_protected(*pnf, 1); + if (!nf) { + nf = new; + new = NULL; + rcu_assign_pointer(*pnf, nf); + } + spin_unlock(&clp->cl_uuid.lock); + rcu_read_lock(); + } + nf = nfs_local_file_get(nf); + rcu_read_unlock(); + if (new) + nfs_to_nfsd_file_put_local(new); + return nf; +} EXPORT_SYMBOL_GPL(nfs_local_open_fh); static struct bio_vec * @@ -347,7 +402,7 @@ nfs_local_pgio_release(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; - nfs_to_nfsd_file_put_local(iocb->localio); + nfs_local_file_put(iocb->localio); nfs_local_iocb_free(iocb); nfs_local_hdr_release(hdr, hdr->task.tk_ops); } @@ -694,7 +749,7 @@ int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, if (status != 0) { if (status == -EAGAIN) nfs_local_disable(clp); - nfs_to_nfsd_file_put_local(localio); + nfs_local_file_put(localio); hdr->task.tk_status = status; nfs_local_hdr_release(hdr, call_ops); } @@ -745,7 +800,7 @@ nfs_local_release_commit_data(struct nfsd_file *localio, struct nfs_commit_data *data, const struct rpc_call_ops *call_ops) { - nfs_to_nfsd_file_put_local(localio); + nfs_local_file_put(localio); call_ops->rpc_call_done(&data->task, data); call_ops->rpc_release(data); } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e27c07bd8929..11968dcb7243 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -961,8 +961,9 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) struct nfs_client *clp = NFS_SERVER(hdr->inode)->nfs_client; struct nfsd_file *localio = - nfs_local_open_fh(clp, hdr->cred, - hdr->args.fh, hdr->args.context->mode); + nfs_local_open_fh(clp, hdr->cred, hdr->args.fh, + &hdr->args.context->nfl, + 
hdr->args.context->mode); if (NFS_SERVER(hdr->inode)->nfs_client->cl_minorversion) task_flags = RPC_TASK_MOVEABLE; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 50fa539611f5..aa3d8bea3ec0 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1826,7 +1826,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, task_flags = RPC_TASK_MOVEABLE; localio = nfs_local_open_fh(NFS_SERVER(inode)->nfs_client, data->cred, - data->args.fh, data->context->mode); + data->args.fh, &data->context->nfl, + data->context->mode); return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), data->mds_ops, how, RPC_TASK_CRED_NOREF | task_flags, localio); diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index abc132166742..35a2e48731df 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include MODULE_LICENSE("GPL"); @@ -151,9 +151,18 @@ void nfs_localio_invalidate_clients(struct list_head *cl_uuid_list) } EXPORT_SYMBOL_GPL(nfs_localio_invalidate_clients); +static void nfs_uuid_add_file(nfs_uuid_t *nfs_uuid, struct nfs_file_localio *nfl) +{ + spin_lock(&nfs_uuid_lock); + if (!nfl->nfs_uuid) + rcu_assign_pointer(nfl->nfs_uuid, nfs_uuid); + spin_unlock(&nfs_uuid_lock); +} + struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, struct rpc_clnt *rpc_clnt, const struct cred *cred, - const struct nfs_fh *nfs_fh, const fmode_t fmode) + const struct nfs_fh *nfs_fh, struct nfs_file_localio *nfl, + const fmode_t fmode) { struct net *net; struct nfsd_file *localio; @@ -180,11 +189,50 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, cred, nfs_fh, fmode); if (IS_ERR(localio)) nfs_to_nfsd_net_put(net); + else + nfs_uuid_add_file(uuid, nfl); return localio; } EXPORT_SYMBOL_GPL(nfs_open_local_fh); +void nfs_close_local_fh(struct nfs_file_localio *nfl) +{ + struct nfsd_file *ro_nf = NULL; + struct nfsd_file *rw_nf = NULL; + nfs_uuid_t *nfs_uuid; + + rcu_read_lock(); + nfs_uuid = rcu_dereference(nfl->nfs_uuid); + if (!nfs_uuid) { + /* regular (non-LOCALIO) NFS will hammer this */ + rcu_read_unlock(); + return; + } + + ro_nf = rcu_access_pointer(nfl->ro_file); + rw_nf = rcu_access_pointer(nfl->rw_file); + if (ro_nf || rw_nf) { + spin_lock(&nfs_uuid_lock); + if (ro_nf) + ro_nf = rcu_dereference_protected(xchg(&nfl->ro_file, NULL), 1); + if (rw_nf) + rw_nf = rcu_dereference_protected(xchg(&nfl->rw_file, NULL), 1); + + rcu_assign_pointer(nfl->nfs_uuid, NULL); + spin_unlock(&nfs_uuid_lock); + rcu_read_unlock(); + + if (ro_nf) + nfs_to_nfsd_file_put_local(ro_nf); + if (rw_nf) + nfs_to_nfsd_file_put_local(rw_nf); + return; + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nfs_close_local_fh); + /* * The NFS LOCALIO code needs to call into NFSD using various symbols, * but cannot be statically linked, because that will make the NFS diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 039898d70954..67ae2c3f41d2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -77,6 +77,23 @@ struct nfs_lock_context { struct rcu_head rcu_head; }; +struct nfs_file_localio { + struct nfsd_file __rcu *ro_file; + struct nfsd_file __rcu *rw_file; + struct list_head list; + void __rcu *nfs_uuid; /* opaque pointer to 'nfs_uuid_t' */ +}; + +static inline void nfs_localio_file_init(struct nfs_file_localio *nfl) +{ +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + nfl->ro_file = NULL; + nfl->rw_file = NULL; + INIT_LIST_HEAD(&nfl->list); + nfl->nfs_uuid = NULL; +#endif +} + struct nfs4_state; struct 
nfs_open_context { struct nfs_lock_context lock_context; @@ -87,15 +104,16 @@ struct nfs_open_context { struct nfs4_state *state; fmode_t mode; + int error; unsigned long flags; #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) #define NFS_CONTEXT_FILE_OPEN (4) - int error; - struct list_head list; struct nfs4_threshold *mdsthreshold; + struct list_head list; struct rcu_head rcu_head; + struct nfs_file_localio nfl; }; struct nfs_open_dir_context { diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 4d5583873f41..7cfc6720ed26 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -6,10 +6,6 @@ #ifndef __LINUX_NFSLOCALIO_H #define __LINUX_NFSLOCALIO_H - -/* nfsd_file structure is purposely kept opaque to NFS client */ -struct nfsd_file; - #if IS_ENABLED(CONFIG_NFS_LOCALIO) #include @@ -21,6 +17,7 @@ struct nfsd_file; #include struct nfs_client; +struct nfs_file_localio; /* * Useful to allow a client to negotiate if localio @@ -52,6 +49,7 @@ extern struct nfsd_file * nfsd_open_local_fh(struct net *, struct auth_domain *, struct rpc_clnt *, const struct cred *, const struct nfs_fh *, const fmode_t) __must_hold(rcu); +void nfs_close_local_fh(struct nfs_file_localio *); struct nfsd_localio_operations { bool (*nfsd_serv_try_get)(struct net *); @@ -73,7 +71,8 @@ extern const struct nfsd_localio_operations *nfs_to; struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *, struct rpc_clnt *, const struct cred *, - const struct nfs_fh *, const fmode_t); + const struct nfs_fh *, struct nfs_file_localio *, + const fmode_t); static inline void nfs_to_nfsd_net_put(struct net *net) { @@ -100,12 +99,15 @@ static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio) } #else /* CONFIG_NFS_LOCALIO */ -static inline void nfsd_localio_ops_init(void) + +struct nfs_file_localio; +static inline void nfs_close_local_fh(struct nfs_file_localio *nfl) { } -static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio) +static inline void nfsd_localio_ops_init(void) { } + #endif /* CONFIG_NFS_LOCALIO */ #endif /* __LINUX_NFSLOCALIO_H */ -- cgit v1.2.3 From 39972494e318a21b3059287909fc090186dbe60a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:58 -0500 Subject: nfsd: update percpu_ref to manage references on nfsd_net Holding a reference on nfsd_net is what is required, it was never actually about ensuring nn->nfsd_serv available. Move waiting for outstanding percpu references from nfsd_destroy_serv() to nfsd_shutdown_net(). By moving it later it will be possible to invalidate localio clients during nfsd_file_cache_shutdown_net() via __nfsd_file_cache_purge(). 
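In outline, the shutdown ordering after this change (condensed from the nfssvc.c diff below, with unrelated teardown steps elided): the percpu ref is killed and its confirmation awaited before any per-net state is torn down, and it is fully drained before percpu_ref_exit().

	static void nfsd_shutdown_net(struct net *net)
	{
		struct nfsd_net *nn = net_generic(net, nfsd_net_id);

		if (!nn->nfsd_net_up)
			return;

		/* stop new localio users and wait for the kill to be confirmed */
		percpu_ref_kill_and_confirm(&nn->nfsd_serv_ref, nfsd_serv_done);
		wait_for_completion(&nn->nfsd_serv_confirm_done);

		/* ... export flush, nfs4 state, reply cache, lockd teardown ... */

		/* all outstanding references are dropped before the ref is freed */
		wait_for_completion(&nn->nfsd_serv_free_done);
		percpu_ref_exit(&nn->nfsd_serv_ref);

		nn->nfsd_net_up = false;
		nfsd_shutdown_generic();
	}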
Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfsd/nfssvc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 49e2f32102ab..6ca554042426 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -436,6 +436,10 @@ static void nfsd_shutdown_net(struct net *net) if (!nn->nfsd_net_up) return; + + percpu_ref_kill_and_confirm(&nn->nfsd_serv_ref, nfsd_serv_done); + wait_for_completion(&nn->nfsd_serv_confirm_done); + nfsd_export_flush(net); nfs4_state_shutdown_net(net); nfsd_reply_cache_shutdown(nn); @@ -444,7 +448,10 @@ static void nfsd_shutdown_net(struct net *net) lockd_down(net); nn->lockd_up = false; } + + wait_for_completion(&nn->nfsd_serv_free_done); percpu_ref_exit(&nn->nfsd_serv_ref); + nn->nfsd_net_up = false; nfsd_shutdown_generic(); } @@ -526,11 +533,6 @@ void nfsd_destroy_serv(struct net *net) lockdep_assert_held(&nfsd_mutex); - percpu_ref_kill_and_confirm(&nn->nfsd_serv_ref, nfsd_serv_done); - wait_for_completion(&nn->nfsd_serv_confirm_done); - wait_for_completion(&nn->nfsd_serv_free_done); - /* percpu_ref_exit is called in nfsd_shutdown_net */ - spin_lock(&nfsd_notifier_lock); nn->nfsd_serv = NULL; spin_unlock(&nfsd_notifier_lock); -- cgit v1.2.3 From b33f7dec3a67216123312c7bb752b8f6faa1c465 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:59 -0500 Subject: nfsd: rename nfsd_serv_ prefixed methods and variables with nfsd_net_ Also update Documentation/filesystems/nfs/localio.rst accordingly and reduce the technical documentation debt that was previously captured in that document. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- Documentation/filesystems/nfs/localio.rst | 85 ++++++++++--------------------- fs/nfs_common/nfslocalio.c | 10 ++-- fs/nfsd/filecache.c | 2 +- fs/nfsd/localio.c | 4 +- fs/nfsd/netns.h | 11 ++-- fs/nfsd/nfssvc.c | 34 ++++++------- include/linux/nfslocalio.h | 12 ++--- 7 files changed, 66 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/nfs/localio.rst b/Documentation/filesystems/nfs/localio.rst index 20fc901a08f4..7d2dbf75e96d 100644 --- a/Documentation/filesystems/nfs/localio.rst +++ b/Documentation/filesystems/nfs/localio.rst @@ -218,64 +218,30 @@ NFS Client and Server Interlock =============================== LOCALIO provides the nfs_uuid_t object and associated interfaces to -allow proper network namespace (net-ns) and NFSD object refcounting: - - We don't want to keep a long-term counted reference on each NFSD's - net-ns in the client because that prevents a server container from - completely shutting down. - - So we avoid taking a reference at all and rely on the per-cpu - reference to the server (detailed below) being sufficient to keep - the net-ns active. This involves allowing the NFSD's net-ns exit - code to iterate all active clients and clear their ->net pointers - (which are needed to find the per-cpu-refcount for the nfsd_serv). - - Details: - - - Embed nfs_uuid_t in nfs_client. nfs_uuid_t provides a list_head - that can be used to find the client. It does add the 16-byte - uuid_t to nfs_client so it is bigger than needed (given that - uuid_t is only used during the initial NFS client and server - LOCALIO handshake to determine if they are local to each other). - If that is really a problem we can find a fix. 
- - - When the nfs server confirms that the uuid_t is local, it moves - the nfs_uuid_t onto a per-net-ns list in NFSD's nfsd_net. - - - When each server's net-ns is shutting down - in a "pre_exit" - handler, all these nfs_uuid_t have their ->net cleared. There is - an rcu_synchronize() call between pre_exit() handlers and exit() - handlers so any caller that sees nfs_uuid_t ->net as not NULL can - safely manage the per-cpu-refcount for nfsd_serv. - - - The client's nfs_uuid_t is passed to nfsd_open_local_fh() so it - can safely dereference ->net in a private rcu_read_lock() section - to allow safe access to the associated nfsd_net and nfsd_serv. - -So LOCALIO required the introduction and use of NFSD's percpu_ref to -interlock nfsd_destroy_serv() and nfsd_open_local_fh(), to ensure each -nn->nfsd_serv is not destroyed while in use by nfsd_open_local_fh(), and +allow proper network namespace (net-ns) and NFSD object refcounting. + +LOCALIO required the introduction and use of NFSD's percpu nfsd_net_ref +to interlock nfsd_shutdown_net() and nfsd_open_local_fh(), to ensure +each net-ns is not destroyed while in use by nfsd_open_local_fh(), and warrants a more detailed explanation: - nfsd_open_local_fh() uses nfsd_serv_try_get() before opening its + nfsd_open_local_fh() uses nfsd_net_try_get() before opening its nfsd_file handle and then the caller (NFS client) must drop the - reference for the nfsd_file and associated nn->nfsd_serv using - nfs_file_put_local() once it has completed its IO. + reference for the nfsd_file and associated net-ns using + nfsd_file_put_local() once it has completed its IO. This interlock working relies heavily on nfsd_open_local_fh() being afforded the ability to safely deal with the possibility that the NFSD's net-ns (and nfsd_net by association) may have been destroyed - by nfsd_destroy_serv() via nfsd_shutdown_net() -- which is only - possible given the nfs_uuid_t ->net pointer managemenet detailed - above. - -All told, this elaborate interlock of the NFS client and server has been -verified to fix an easy to hit crash that would occur if an NFSD -instance running in a container, with a LOCALIO client mounted, is -shutdown. Upon restart of the container and associated NFSD the client -would go on to crash due to NULL pointer dereference that occurred due -to the LOCALIO client's attempting to nfsd_open_local_fh(), using -nn->nfsd_serv, without having a proper reference on nn->nfsd_serv. + by nfsd_destroy_serv() via nfsd_shutdown_net(). + +This interlock of the NFS client and server has been verified to fix an +easy to hit crash that would occur if an NFSD instance running in a +container, with a LOCALIO client mounted, is shutdown. Upon restart of +the container and associated NFSD, the client would go on to crash due +to NULL pointer dereference that occurred due to the LOCALIO client's +attempting to nfsd_open_local_fh() without having a proper reference on +NFSD's net-ns. NFS Client issues IO instead of Server ====================================== @@ -308,16 +274,19 @@ fs/nfs/localio.c:nfs_local_commit(). With normal NFS that makes use of RPC to issue IO to the server, if an application uses O_DIRECT the NFS client will bypass the pagecache but -the NFS server will not. Because the NFS server's use of buffered IO -affords applications to be less precise with their alignment when -issuing IO to the NFS client. LOCALIO can be configured to use O_DIRECT -semantics by setting the 'localio_O_DIRECT_semantics' nfs module +the NFS server will not. 
The NFS server's use of buffered IO affords +applications to be less precise with their alignment when issuing IO to +the NFS client. But if all applications properly align their IO, LOCALIO +can be configured to use end-to-end O_DIRECT semantics from the NFS +client to the underlying local filesystem, that it is sharing with +the NFS server, by setting the 'localio_O_DIRECT_semantics' nfs module parameter to Y, e.g.: - echo Y > /sys/module/nfs/parameters/localio_O_DIRECT_semantics + echo Y > /sys/module/nfs/parameters/localio_O_DIRECT_semantics -Once enabled, it will cause LOCALIO to use O_DIRECT semantics (this may -cause IO to fail if applications do not properly align their IO). +Once enabled, it will cause LOCALIO to use end-to-end O_DIRECT semantics +(but again, this may cause IO to fail if applications do not properly +align their IO). Security ======== diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index 35a2e48731df..e75cd21f4c8b 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -159,6 +159,10 @@ static void nfs_uuid_add_file(nfs_uuid_t *nfs_uuid, struct nfs_file_localio *nfl spin_unlock(&nfs_uuid_lock); } +/* + * Caller is responsible for calling nfsd_net_put and + * nfsd_file_put (via nfs_to_nfsd_file_put_local). + */ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, struct rpc_clnt *rpc_clnt, const struct cred *cred, const struct nfs_fh *nfs_fh, struct nfs_file_localio *nfl, @@ -171,7 +175,7 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, * Not running in nfsd context, so must safely get reference on nfsd_serv. * But the server may already be shutting down, if so disallow new localio. * uuid->net is NOT a counted reference, but rcu_read_lock() ensures that - * if uuid->net is not NULL, then calling nfsd_serv_try_get() is safe + * if uuid->net is not NULL, then calling nfsd_net_try_get() is safe * and if it succeeds we will have an implied reference to the net. 
* * Otherwise NFS may not have ref on NFSD and therefore cannot safely @@ -179,12 +183,12 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, */ rcu_read_lock(); net = rcu_dereference(uuid->net); - if (!net || !nfs_to->nfsd_serv_try_get(net)) { + if (!net || !nfs_to->nfsd_net_try_get(net)) { rcu_read_unlock(); return ERR_PTR(-ENXIO); } rcu_read_unlock(); - /* We have an implied reference to net thanks to nfsd_serv_try_get */ + /* We have an implied reference to net thanks to nfsd_net_try_get */ localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt, cred, nfs_fh, fmode); if (IS_ERR(localio)) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index a1cdba42c4fa..1c3baa74e2df 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -391,7 +391,7 @@ nfsd_file_put(struct nfsd_file *nf) } /** - * nfsd_file_put_local - put nfsd_file reference and arm nfsd_serv_put in caller + * nfsd_file_put_local - put nfsd_file reference and arm nfsd_net_put in caller * @nf: nfsd_file of which to put the reference * * First save the associated net to return to caller, then put diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index 8beda4c85111..f9a91cd3b5ec 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -25,8 +25,8 @@ #include "cache.h" static const struct nfsd_localio_operations nfsd_localio_ops = { - .nfsd_serv_try_get = nfsd_serv_try_get, - .nfsd_serv_put = nfsd_serv_put, + .nfsd_net_try_get = nfsd_net_try_get, + .nfsd_net_put = nfsd_net_put, .nfsd_open_local_fh = nfsd_open_local_fh, .nfsd_file_put_local = nfsd_file_put_local, .nfsd_file_get = nfsd_file_get, diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 26f7b34d1a03..8faef59d7122 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -140,9 +140,10 @@ struct nfsd_net { struct svc_info nfsd_info; #define nfsd_serv nfsd_info.serv - struct percpu_ref nfsd_serv_ref; - struct completion nfsd_serv_confirm_done; - struct completion nfsd_serv_free_done; + + struct percpu_ref nfsd_net_ref; + struct completion nfsd_net_confirm_done; + struct completion nfsd_net_free_done; /* * clientid and stateid data for construction of net unique COPY @@ -229,8 +230,8 @@ struct nfsd_net { extern bool nfsd_support_version(int vers); extern unsigned int nfsd_net_id; -bool nfsd_serv_try_get(struct net *net); -void nfsd_serv_put(struct net *net); +bool nfsd_net_try_get(struct net *net); +void nfsd_net_put(struct net *net); void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn); void nfsd_reset_write_verifier(struct nfsd_net *nn); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 6ca554042426..e937e2d0ce62 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -214,32 +214,32 @@ int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change return 0; } -bool nfsd_serv_try_get(struct net *net) __must_hold(rcu) +bool nfsd_net_try_get(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - return (nn && percpu_ref_tryget_live(&nn->nfsd_serv_ref)); + return (nn && percpu_ref_tryget_live(&nn->nfsd_net_ref)); } -void nfsd_serv_put(struct net *net) __must_hold(rcu) +void nfsd_net_put(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - percpu_ref_put(&nn->nfsd_serv_ref); + percpu_ref_put(&nn->nfsd_net_ref); } -static void nfsd_serv_done(struct percpu_ref *ref) +static void nfsd_net_done(struct percpu_ref *ref) { - struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_serv_ref); + struct nfsd_net *nn = container_of(ref, struct nfsd_net, 
nfsd_net_ref); - complete(&nn->nfsd_serv_confirm_done); + complete(&nn->nfsd_net_confirm_done); } -static void nfsd_serv_free(struct percpu_ref *ref) +static void nfsd_net_free(struct percpu_ref *ref) { - struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_serv_ref); + struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_net_ref); - complete(&nn->nfsd_serv_free_done); + complete(&nn->nfsd_net_free_done); } /* @@ -437,8 +437,8 @@ static void nfsd_shutdown_net(struct net *net) if (!nn->nfsd_net_up) return; - percpu_ref_kill_and_confirm(&nn->nfsd_serv_ref, nfsd_serv_done); - wait_for_completion(&nn->nfsd_serv_confirm_done); + percpu_ref_kill_and_confirm(&nn->nfsd_net_ref, nfsd_net_done); + wait_for_completion(&nn->nfsd_net_confirm_done); nfsd_export_flush(net); nfs4_state_shutdown_net(net); @@ -449,8 +449,8 @@ static void nfsd_shutdown_net(struct net *net) nn->lockd_up = false; } - wait_for_completion(&nn->nfsd_serv_free_done); - percpu_ref_exit(&nn->nfsd_serv_ref); + wait_for_completion(&nn->nfsd_net_free_done); + percpu_ref_exit(&nn->nfsd_net_ref); nn->nfsd_net_up = false; nfsd_shutdown_generic(); @@ -654,12 +654,12 @@ int nfsd_create_serv(struct net *net) if (nn->nfsd_serv) return 0; - error = percpu_ref_init(&nn->nfsd_serv_ref, nfsd_serv_free, + error = percpu_ref_init(&nn->nfsd_net_ref, nfsd_net_free, 0, GFP_KERNEL); if (error) return error; - init_completion(&nn->nfsd_serv_free_done); - init_completion(&nn->nfsd_serv_confirm_done); + init_completion(&nn->nfsd_net_free_done); + init_completion(&nn->nfsd_net_confirm_done); if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 7cfc6720ed26..aa2b5c6561ab 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -52,8 +52,8 @@ nfsd_open_local_fh(struct net *, struct auth_domain *, struct rpc_clnt *, void nfs_close_local_fh(struct nfs_file_localio *); struct nfsd_localio_operations { - bool (*nfsd_serv_try_get)(struct net *); - void (*nfsd_serv_put)(struct net *); + bool (*nfsd_net_try_get)(struct net *); + void (*nfsd_net_put)(struct net *); struct nfsd_file *(*nfsd_open_local_fh)(struct net *, struct auth_domain *, struct rpc_clnt *, @@ -77,12 +77,12 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *, static inline void nfs_to_nfsd_net_put(struct net *net) { /* - * Once reference to nfsd_serv is dropped, NFSD could be - * unloaded, so ensure safe return from nfsd_file_put_local() - * by always taking RCU. + * Once reference to net (and associated nfsd_serv) is dropped, NFSD + * could be unloaded, so ensure safe return from nfsd_net_put() by + * always taking RCU. */ rcu_read_lock(); - nfs_to->nfsd_serv_put(net); + nfs_to->nfsd_net_put(net); rcu_read_unlock(); } -- cgit v1.2.3 From f9c3e1ba072d0e21e75f7033d75a134549e8ed43 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:00 -0500 Subject: nfsd: nfsd_file_acquire_local no longer returns GC'd nfsd_file Now that LOCALIO no longer leans on NFSD's filecache for caching open files (and instead uses NFS client-side open nfsd_file caching) there is no need to use NFSD filecache's GC feature. Avoiding GC will speed up nfsd_file initial opens. 
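The nfsd_net_try_get()/nfsd_net_put() interlock introduced by the two patches above reduces to a standard percpu_ref pattern: the IO path takes a tryget that fails once shutdown has begun, and shutdown kills the ref, waits for the confirmation callback, tears down state, then waits for the final drop before percpu_ref_exit(). A minimal kernel-style sketch of that pattern, using illustrative placeholder names rather than the real nfsd symbols:

#include <linux/percpu-refcount.h>
#include <linux/completion.h>
#include <linux/gfp.h>

struct my_net {
	struct percpu_ref ref;
	struct completion confirm_done;
	struct completion free_done;
};

static void my_net_confirm(struct percpu_ref *ref)
{
	struct my_net *nn = container_of(ref, struct my_net, ref);

	complete(&nn->confirm_done);	/* no new tryget_live() can succeed now */
}

static void my_net_release(struct percpu_ref *ref)
{
	struct my_net *nn = container_of(ref, struct my_net, ref);

	complete(&nn->free_done);	/* last in-flight user dropped its ref */
}

static int my_net_setup(struct my_net *nn)
{
	init_completion(&nn->confirm_done);
	init_completion(&nn->free_done);
	return percpu_ref_init(&nn->ref, my_net_release, 0, GFP_KERNEL);
}

static bool my_net_try_get(struct my_net *nn)	/* IO submission path */
{
	return percpu_ref_tryget_live(&nn->ref);
}

static void my_net_put(struct my_net *nn)	/* IO completion path */
{
	percpu_ref_put(&nn->ref);
}

static void my_net_shutdown(struct my_net *nn)
{
	percpu_ref_kill_and_confirm(&nn->ref, my_net_confirm);
	wait_for_completion(&nn->confirm_done);
	/* ... tear down state that new IO can no longer reach ... */
	wait_for_completion(&nn->free_done);
	percpu_ref_exit(&nn->ref);
}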
Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfsd/filecache.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 1c3baa74e2df..09dd708584c1 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1222,10 +1222,9 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, * a file. The security implications of this should be carefully * considered before use. * - * The nfsd_file object returned by this API is reference-counted - * and garbage-collected. The object is retained for a few - * seconds after the final nfsd_file_put() in case the caller - * wants to re-use it. + * The nfsd_file_object returned by this API is reference-counted + * but not garbage-collected. The object is unhashed after the + * final nfsd_file_put(). * * Return values: * %nfs_ok - @pnf points to an nfsd_file with its reference @@ -1247,7 +1246,7 @@ nfsd_file_acquire_local(struct net *net, struct svc_cred *cred, __be32 beres; beres = nfsd_file_do_acquire(NULL, net, cred, client, - fhp, may_flags, NULL, pnf, true); + fhp, may_flags, NULL, pnf, false); revert_creds(save_cred); return beres; } -- cgit v1.2.3 From e1943f4eb8d531b0b65b22d4181c6a205226b006 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:01 -0500 Subject: nfs_common: rename nfslocalio nfs_uuid_lock to nfs_uuids_lock This global spinlock protects all nfs_uuid_t relative to the global nfs_uuids list. A later commit will split this global spinlock so prepare by renaming this lock to reflect its intended narrow scope. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs_common/nfslocalio.c | 34 +++++++++++++++++----------------- fs/nfsd/localio.c | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index e75cd21f4c8b..5fa3f47b442e 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -15,11 +15,11 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("NFS localio protocol bypass support"); -static DEFINE_SPINLOCK(nfs_uuid_lock); +static DEFINE_SPINLOCK(nfs_uuids_lock); /* * Global list of nfs_uuid_t instances - * that is protected by nfs_uuid_lock. + * that is protected by nfs_uuids_lock. */ static LIST_HEAD(nfs_uuids); @@ -34,15 +34,15 @@ EXPORT_SYMBOL_GPL(nfs_uuid_init); bool nfs_uuid_begin(nfs_uuid_t *nfs_uuid) { - spin_lock(&nfs_uuid_lock); + spin_lock(&nfs_uuids_lock); /* Is this nfs_uuid already in use? 
*/ if (!list_empty(&nfs_uuid->list)) { - spin_unlock(&nfs_uuid_lock); + spin_unlock(&nfs_uuids_lock); return false; } uuid_gen(&nfs_uuid->uuid); list_add_tail(&nfs_uuid->list, &nfs_uuids); - spin_unlock(&nfs_uuid_lock); + spin_unlock(&nfs_uuids_lock); return true; } @@ -51,10 +51,10 @@ EXPORT_SYMBOL_GPL(nfs_uuid_begin); void nfs_uuid_end(nfs_uuid_t *nfs_uuid) { if (nfs_uuid->net == NULL) { - spin_lock(&nfs_uuid_lock); + spin_lock(&nfs_uuids_lock); if (nfs_uuid->net == NULL) list_del_init(&nfs_uuid->list); - spin_unlock(&nfs_uuid_lock); + spin_unlock(&nfs_uuids_lock); } } EXPORT_SYMBOL_GPL(nfs_uuid_end); @@ -78,7 +78,7 @@ void nfs_uuid_is_local(const uuid_t *uuid, struct list_head *list, { nfs_uuid_t *nfs_uuid; - spin_lock(&nfs_uuid_lock); + spin_lock(&nfs_uuids_lock); nfs_uuid = nfs_uuid_lookup_locked(uuid); if (nfs_uuid) { kref_get(&dom->ref); @@ -94,7 +94,7 @@ void nfs_uuid_is_local(const uuid_t *uuid, struct list_head *list, __module_get(mod); nfsd_mod = mod; } - spin_unlock(&nfs_uuid_lock); + spin_unlock(&nfs_uuids_lock); } EXPORT_SYMBOL_GPL(nfs_uuid_is_local); @@ -128,9 +128,9 @@ void nfs_localio_disable_client(struct nfs_client *clp) spin_lock(&nfs_uuid->lock); if (test_and_clear_bit(NFS_CS_LOCAL_IO, &clp->cl_flags)) { - spin_lock(&nfs_uuid_lock); + spin_lock(&nfs_uuids_lock); nfs_uuid_put_locked(nfs_uuid); - spin_unlock(&nfs_uuid_lock); + spin_unlock(&nfs_uuids_lock); } spin_unlock(&nfs_uuid->lock); } @@ -140,23 +140,23 @@ void nfs_localio_invalidate_clients(struct list_head *cl_uuid_list) { nfs_uuid_t *nfs_uuid, *tmp; - spin_lock(&nfs_uuid_lock); + spin_lock(&nfs_uuids_lock); list_for_each_entry_safe(nfs_uuid, tmp, cl_uuid_list, list) { struct nfs_client *clp = container_of(nfs_uuid, struct nfs_client, cl_uuid); nfs_localio_disable_client(clp); } - spin_unlock(&nfs_uuid_lock); + spin_unlock(&nfs_uuids_lock); } EXPORT_SYMBOL_GPL(nfs_localio_invalidate_clients); static void nfs_uuid_add_file(nfs_uuid_t *nfs_uuid, struct nfs_file_localio *nfl) { - spin_lock(&nfs_uuid_lock); + spin_lock(&nfs_uuids_lock); if (!nfl->nfs_uuid) rcu_assign_pointer(nfl->nfs_uuid, nfs_uuid); - spin_unlock(&nfs_uuid_lock); + spin_unlock(&nfs_uuids_lock); } /* @@ -217,14 +217,14 @@ void nfs_close_local_fh(struct nfs_file_localio *nfl) ro_nf = rcu_access_pointer(nfl->ro_file); rw_nf = rcu_access_pointer(nfl->rw_file); if (ro_nf || rw_nf) { - spin_lock(&nfs_uuid_lock); + spin_lock(&nfs_uuids_lock); if (ro_nf) ro_nf = rcu_dereference_protected(xchg(&nfl->ro_file, NULL), 1); if (rw_nf) rw_nf = rcu_dereference_protected(xchg(&nfl->rw_file, NULL), 1); rcu_assign_pointer(nfl->nfs_uuid, NULL); - spin_unlock(&nfs_uuid_lock); + spin_unlock(&nfs_uuids_lock); rcu_read_unlock(); if (ro_nf) diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index f9a91cd3b5ec..2ae07161b919 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -54,7 +54,7 @@ void nfsd_localio_ops_init(void) * avoid all the NFS overhead with reads, writes and commits. * * On successful return, returned nfsd_file will have its nf_net member - * set. Caller (NFS client) is responsible for calling nfsd_serv_put and + * set. Caller (NFS client) is responsible for calling nfsd_net_put and * nfsd_file_put (via nfs_to_nfsd_file_put_local). 
*/ struct nfsd_file * -- cgit v1.2.3 From 085804110aa13eac7f763d8d5cfe3a8220e35222 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:02 -0500 Subject: nfs_common: track all open nfsd_files per LOCALIO nfs_client This tracking enables __nfsd_file_cache_purge() to call nfs_localio_invalidate_clients(), upon shutdown or export change, to nfs_close_local_fh() all open nfsd_files that are still cached by the LOCALIO nfs clients associated with nfsd_net that is being shutdown. Now that the client must track all open nfsd_files there was more work than necessary being done with the global nfs_uuids_lock contended. This manifested in various RCU issues, e.g.: hrtimer: interrupt took 47969440 ns rcu: INFO: rcu_sched detected stalls on CPUs/tasks: Use nfs_uuid->lock to protect all nfs_uuid_t members, instead of nfs_uuids_lock, once nfs_uuid_is_local() adds the client to nn->local_clients. Also add 'local_clients_lock' to 'struct nfsd_net' to protect nn->local_clients. And store a pointer to spinlock in the 'list_lock' member of nfs_uuid_t so nfs_localio_disable_client() can use it to avoid taking the global nfs_uuids_lock. In combination, these split out locks eliminate the use of the single nfslocalio.c global nfs_uuids_lock in the IO paths (open and close). Also refactored associated fs/nfs_common/nfslocalio.c methods' locking to reduce work performed with spinlocks held in general. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs_common/nfslocalio.c | 170 +++++++++++++++++++++++++++++++++------------ fs/nfsd/filecache.c | 9 +++ fs/nfsd/localio.c | 1 + fs/nfsd/netns.h | 1 + fs/nfsd/nfsctl.c | 4 +- include/linux/nfslocalio.h | 8 ++- 6 files changed, 145 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index 5fa3f47b442e..cfb61871fb1e 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -23,27 +23,49 @@ static DEFINE_SPINLOCK(nfs_uuids_lock); */ static LIST_HEAD(nfs_uuids); +/* + * Lock ordering: + * 1: nfs_uuid->lock + * 2: nfs_uuids_lock + * 3: nfs_uuid->list_lock (aka nn->local_clients_lock) + * + * May skip locks in select cases, but never hold multiple + * locks out of order. + */ + void nfs_uuid_init(nfs_uuid_t *nfs_uuid) { nfs_uuid->net = NULL; nfs_uuid->dom = NULL; + nfs_uuid->list_lock = NULL; INIT_LIST_HEAD(&nfs_uuid->list); + INIT_LIST_HEAD(&nfs_uuid->files); spin_lock_init(&nfs_uuid->lock); } EXPORT_SYMBOL_GPL(nfs_uuid_init); bool nfs_uuid_begin(nfs_uuid_t *nfs_uuid) { + spin_lock(&nfs_uuid->lock); + if (nfs_uuid->net) { + /* This nfs_uuid is already in use */ + spin_unlock(&nfs_uuid->lock); + return false; + } + spin_lock(&nfs_uuids_lock); - /* Is this nfs_uuid already in use? 
*/ if (!list_empty(&nfs_uuid->list)) { + /* This nfs_uuid is already in use */ spin_unlock(&nfs_uuids_lock); + spin_unlock(&nfs_uuid->lock); return false; } - uuid_gen(&nfs_uuid->uuid); list_add_tail(&nfs_uuid->list, &nfs_uuids); spin_unlock(&nfs_uuids_lock); + uuid_gen(&nfs_uuid->uuid); + spin_unlock(&nfs_uuid->lock); + return true; } EXPORT_SYMBOL_GPL(nfs_uuid_begin); @@ -51,11 +73,15 @@ EXPORT_SYMBOL_GPL(nfs_uuid_begin); void nfs_uuid_end(nfs_uuid_t *nfs_uuid) { if (nfs_uuid->net == NULL) { - spin_lock(&nfs_uuids_lock); - if (nfs_uuid->net == NULL) + spin_lock(&nfs_uuid->lock); + if (nfs_uuid->net == NULL) { + /* Not local, remove from nfs_uuids */ + spin_lock(&nfs_uuids_lock); list_del_init(&nfs_uuid->list); - spin_unlock(&nfs_uuids_lock); - } + spin_unlock(&nfs_uuids_lock); + } + spin_unlock(&nfs_uuid->lock); + } } EXPORT_SYMBOL_GPL(nfs_uuid_end); @@ -73,28 +99,39 @@ static nfs_uuid_t * nfs_uuid_lookup_locked(const uuid_t *uuid) static struct module *nfsd_mod; void nfs_uuid_is_local(const uuid_t *uuid, struct list_head *list, - struct net *net, struct auth_domain *dom, - struct module *mod) + spinlock_t *list_lock, struct net *net, + struct auth_domain *dom, struct module *mod) { nfs_uuid_t *nfs_uuid; spin_lock(&nfs_uuids_lock); nfs_uuid = nfs_uuid_lookup_locked(uuid); - if (nfs_uuid) { - kref_get(&dom->ref); - nfs_uuid->dom = dom; - /* - * We don't hold a ref on the net, but instead put - * ourselves on a list so the net pointer can be - * invalidated. - */ - list_move(&nfs_uuid->list, list); - rcu_assign_pointer(nfs_uuid->net, net); - - __module_get(mod); - nfsd_mod = mod; + if (!nfs_uuid) { + spin_unlock(&nfs_uuids_lock); + return; } + + /* + * We don't hold a ref on the net, but instead put + * ourselves on @list (nn->local_clients) so the net + * pointer can be invalidated. + */ + spin_lock(list_lock); /* list_lock is nn->local_clients_lock */ + list_move(&nfs_uuid->list, list); + spin_unlock(list_lock); + spin_unlock(&nfs_uuids_lock); + /* Once nfs_uuid is parented to @list, avoid global nfs_uuids_lock */ + spin_lock(&nfs_uuid->lock); + + __module_get(mod); + nfsd_mod = mod; + + nfs_uuid->list_lock = list_lock; + kref_get(&dom->ref); + nfs_uuid->dom = dom; + rcu_assign_pointer(nfs_uuid->net, net); + spin_unlock(&nfs_uuid->lock); } EXPORT_SYMBOL_GPL(nfs_uuid_is_local); @@ -108,55 +145,96 @@ void nfs_localio_enable_client(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs_localio_enable_client); -static void nfs_uuid_put_locked(nfs_uuid_t *nfs_uuid) +/* + * Cleanup the nfs_uuid_t embedded in an nfs_client. + * This is the long-form of nfs_uuid_init(). 
+ */ +static void nfs_uuid_put(nfs_uuid_t *nfs_uuid) { - if (!nfs_uuid->net) + LIST_HEAD(local_files); + struct nfs_file_localio *nfl, *tmp; + + spin_lock(&nfs_uuid->lock); + if (unlikely(!nfs_uuid->net)) { + spin_unlock(&nfs_uuid->lock); return; - module_put(nfsd_mod); + } RCU_INIT_POINTER(nfs_uuid->net, NULL); if (nfs_uuid->dom) { auth_domain_put(nfs_uuid->dom); nfs_uuid->dom = NULL; } - list_del_init(&nfs_uuid->list); + + list_splice_init(&nfs_uuid->files, &local_files); + spin_unlock(&nfs_uuid->lock); + + /* Walk list of files and ensure their last references dropped */ + list_for_each_entry_safe(nfl, tmp, &local_files, list) { + nfs_close_local_fh(nfl); + cond_resched(); + } + + spin_lock(&nfs_uuid->lock); + BUG_ON(!list_empty(&nfs_uuid->files)); + + /* Remove client from nn->local_clients */ + if (nfs_uuid->list_lock) { + spin_lock(nfs_uuid->list_lock); + BUG_ON(list_empty(&nfs_uuid->list)); + list_del_init(&nfs_uuid->list); + spin_unlock(nfs_uuid->list_lock); + nfs_uuid->list_lock = NULL; + } + + module_put(nfsd_mod); + spin_unlock(&nfs_uuid->lock); } void nfs_localio_disable_client(struct nfs_client *clp) { - nfs_uuid_t *nfs_uuid = &clp->cl_uuid; + nfs_uuid_t *nfs_uuid = NULL; - spin_lock(&nfs_uuid->lock); + spin_lock(&clp->cl_uuid.lock); /* aka &nfs_uuid->lock */ if (test_and_clear_bit(NFS_CS_LOCAL_IO, &clp->cl_flags)) { - spin_lock(&nfs_uuids_lock); - nfs_uuid_put_locked(nfs_uuid); - spin_unlock(&nfs_uuids_lock); + /* &clp->cl_uuid is always not NULL, using as bool here */ + nfs_uuid = &clp->cl_uuid; } - spin_unlock(&nfs_uuid->lock); + spin_unlock(&clp->cl_uuid.lock); + + if (nfs_uuid) + nfs_uuid_put(nfs_uuid); } EXPORT_SYMBOL_GPL(nfs_localio_disable_client); -void nfs_localio_invalidate_clients(struct list_head *cl_uuid_list) +void nfs_localio_invalidate_clients(struct list_head *nn_local_clients, + spinlock_t *nn_local_clients_lock) { + LIST_HEAD(local_clients); nfs_uuid_t *nfs_uuid, *tmp; - - spin_lock(&nfs_uuids_lock); - list_for_each_entry_safe(nfs_uuid, tmp, cl_uuid_list, list) { - struct nfs_client *clp = - container_of(nfs_uuid, struct nfs_client, cl_uuid); - + struct nfs_client *clp; + + spin_lock(nn_local_clients_lock); + list_splice_init(nn_local_clients, &local_clients); + spin_unlock(nn_local_clients_lock); + list_for_each_entry_safe(nfs_uuid, tmp, &local_clients, list) { + if (WARN_ON(nfs_uuid->list_lock != nn_local_clients_lock)) + break; + clp = container_of(nfs_uuid, struct nfs_client, cl_uuid); nfs_localio_disable_client(clp); } - spin_unlock(&nfs_uuids_lock); } EXPORT_SYMBOL_GPL(nfs_localio_invalidate_clients); static void nfs_uuid_add_file(nfs_uuid_t *nfs_uuid, struct nfs_file_localio *nfl) { - spin_lock(&nfs_uuids_lock); - if (!nfl->nfs_uuid) + /* Add nfl to nfs_uuid->files if it isn't already */ + spin_lock(&nfs_uuid->lock); + if (list_empty(&nfl->list)) { rcu_assign_pointer(nfl->nfs_uuid, nfs_uuid); - spin_unlock(&nfs_uuids_lock); + list_add_tail(&nfl->list, &nfs_uuid->files); + } + spin_unlock(&nfs_uuid->lock); } /* @@ -217,14 +295,16 @@ void nfs_close_local_fh(struct nfs_file_localio *nfl) ro_nf = rcu_access_pointer(nfl->ro_file); rw_nf = rcu_access_pointer(nfl->rw_file); if (ro_nf || rw_nf) { - spin_lock(&nfs_uuids_lock); + spin_lock(&nfs_uuid->lock); if (ro_nf) ro_nf = rcu_dereference_protected(xchg(&nfl->ro_file, NULL), 1); if (rw_nf) rw_nf = rcu_dereference_protected(xchg(&nfl->rw_file, NULL), 1); - rcu_assign_pointer(nfl->nfs_uuid, NULL); - spin_unlock(&nfs_uuids_lock); + /* Remove nfl from nfs_uuid->files list */ + RCU_INIT_POINTER(nfl->nfs_uuid, 
NULL); + list_del_init(&nfl->list); + spin_unlock(&nfs_uuid->lock); rcu_read_unlock(); if (ro_nf) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 09dd708584c1..2adf95e2b379 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "vfs.h" #include "nfsd.h" @@ -833,6 +834,14 @@ __nfsd_file_cache_purge(struct net *net) struct nfsd_file *nf; LIST_HEAD(dispose); +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + if (net) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + nfs_localio_invalidate_clients(&nn->local_clients, + &nn->local_clients_lock); + } +#endif + rhltable_walk_enter(&nfsd_file_rhltable, &iter); do { rhashtable_walk_start(&iter); diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index 2ae07161b919..238647fa379e 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -116,6 +116,7 @@ static __be32 localio_proc_uuid_is_local(struct svc_rqst *rqstp) struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs_uuid_is_local(&argp->uuid, &nn->local_clients, + &nn->local_clients_lock, net, rqstp->rq_client, THIS_MODULE); return rpc_success; diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 8faef59d7122..187c4140b191 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -220,6 +220,7 @@ struct nfsd_net { #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* Local clients to be invalidated when net is shut down */ + spinlock_t local_clients_lock; struct list_head local_clients; #endif }; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 727904d8a4d0..70347b0ecdc4 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2259,6 +2259,7 @@ static __net_init int nfsd_net_init(struct net *net) seqlock_init(&nn->writeverf_lock); nfsd_proc_stat_init(net); #if IS_ENABLED(CONFIG_NFS_LOCALIO) + spin_lock_init(&nn->local_clients_lock); INIT_LIST_HEAD(&nn->local_clients); #endif return 0; @@ -2283,7 +2284,8 @@ static __net_exit void nfsd_net_pre_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - nfs_localio_invalidate_clients(&nn->local_clients); + nfs_localio_invalidate_clients(&nn->local_clients, + &nn->local_clients_lock); } #endif diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index aa2b5c6561ab..c68a529230c1 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -30,19 +30,23 @@ typedef struct { /* sadly this struct is just over a cacheline, avoid bouncing */ spinlock_t ____cacheline_aligned lock; struct list_head list; + spinlock_t *list_lock; /* nn->local_clients_lock */ struct net __rcu *net; /* nfsd's network namespace */ struct auth_domain *dom; /* auth_domain for localio */ + /* Local files to close when net is shut down or exports change */ + struct list_head files; } nfs_uuid_t; void nfs_uuid_init(nfs_uuid_t *); bool nfs_uuid_begin(nfs_uuid_t *); void nfs_uuid_end(nfs_uuid_t *); -void nfs_uuid_is_local(const uuid_t *, struct list_head *, +void nfs_uuid_is_local(const uuid_t *, struct list_head *, spinlock_t *, struct net *, struct auth_domain *, struct module *); void nfs_localio_enable_client(struct nfs_client *clp); void nfs_localio_disable_client(struct nfs_client *clp); -void nfs_localio_invalidate_clients(struct list_head *list); +void nfs_localio_invalidate_clients(struct list_head *nn_local_clients, + spinlock_t *nn_local_clients_lock); /* localio needs to map filehandle -> struct nfsd_file */ extern struct nfsd_file * -- cgit v1.2.3 From 0dc73141524403f979a9655b4c55d763dbcb2b88 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 
20:41:03 -0500 Subject: nfs_common: add nfs_localio trace events The nfs_localio.ko now exposes /sys/kernel/tracing/events/nfs_localio with nfs_localio_enable_client and nfs_localio_disable_client events. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs_common/Makefile | 3 ++- fs/nfs_common/localio_trace.c | 10 ++++++++ fs/nfs_common/localio_trace.h | 56 +++++++++++++++++++++++++++++++++++++++++++ fs/nfs_common/nfslocalio.c | 4 ++++ 4 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 fs/nfs_common/localio_trace.c create mode 100644 fs/nfs_common/localio_trace.h (limited to 'fs') diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile index a5e54809701e..c10ead273ff2 100644 --- a/fs/nfs_common/Makefile +++ b/fs/nfs_common/Makefile @@ -6,8 +6,9 @@ obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o nfs_acl-objs := nfsacl.o +CFLAGS_localio_trace.o += -I$(src) obj-$(CONFIG_NFS_COMMON_LOCALIO_SUPPORT) += nfs_localio.o -nfs_localio-objs := nfslocalio.o +nfs_localio-objs := nfslocalio.o localio_trace.o obj-$(CONFIG_GRACE_PERIOD) += grace.o obj-$(CONFIG_NFS_V4_2_SSC_HELPER) += nfs_ssc.o diff --git a/fs/nfs_common/localio_trace.c b/fs/nfs_common/localio_trace.c new file mode 100644 index 000000000000..7decfe57abeb --- /dev/null +++ b/fs/nfs_common/localio_trace.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024 Trond Myklebust + * Copyright (C) 2024 Mike Snitzer + */ +#include +#include + +#define CREATE_TRACE_POINTS +#include "localio_trace.h" diff --git a/fs/nfs_common/localio_trace.h b/fs/nfs_common/localio_trace.h new file mode 100644 index 000000000000..4055aec9ff8d --- /dev/null +++ b/fs/nfs_common/localio_trace.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Trond Myklebust + * Copyright (C) 2024 Mike Snitzer + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfs_localio + +#if !defined(_TRACE_NFS_COMMON_LOCALIO_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NFS_COMMON_LOCALIO_H + +#include + +#include +#include +#include + +DECLARE_EVENT_CLASS(nfs_local_client_event, + TP_PROTO( + const struct nfs_client *clp + ), + + TP_ARGS(clp), + + TP_STRUCT__entry( + __field(unsigned int, protocol) + __string(server, clp->cl_hostname) + ), + + TP_fast_assign( + __entry->protocol = clp->rpc_ops->version; + __assign_str(server); + ), + + TP_printk( + "server=%s NFSv%u", __get_str(server), __entry->protocol + ) +); + +#define DEFINE_NFS_LOCAL_CLIENT_EVENT(name) \ + DEFINE_EVENT(nfs_local_client_event, name, \ + TP_PROTO( \ + const struct nfs_client *clp \ + ), \ + TP_ARGS(clp)) + +DEFINE_NFS_LOCAL_CLIENT_EVENT(nfs_localio_enable_client); +DEFINE_NFS_LOCAL_CLIENT_EVENT(nfs_localio_disable_client); + +#endif /* _TRACE_NFS_COMMON_LOCALIO_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE localio_trace +/* This part must be outside protection */ +#include diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index cfb61871fb1e..0decc2fe154c 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -12,6 +12,8 @@ #include #include +#include "localio_trace.h" + MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("NFS localio protocol bypass support"); @@ -141,6 +143,7 @@ void nfs_localio_enable_client(struct nfs_client *clp) spin_lock(&nfs_uuid->lock); set_bit(NFS_CS_LOCAL_IO, &clp->cl_flags); + trace_nfs_localio_enable_client(clp); spin_unlock(&nfs_uuid->lock); } EXPORT_SYMBOL_GPL(nfs_localio_enable_client); @@ -199,6 +202,7 @@ void nfs_localio_disable_client(struct nfs_client *clp) if (test_and_clear_bit(NFS_CS_LOCAL_IO, &clp->cl_flags)) { /* &clp->cl_uuid is always not NULL, using as bool here */ nfs_uuid = &clp->cl_uuid; + trace_nfs_localio_disable_client(clp); } spin_unlock(&clp->cl_uuid.lock); -- cgit v1.2.3 From 779a395189c692eec0246e7df63e2a3c0f0c8508 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:04 -0500 Subject: nfs/localio: remove redundant code and simplify LOCALIO enablement Remove nfs_local_enable and nfs_local_disable, instead use nfs_localio_enable_client and nfs_localio_disable_client. Discontinue use of the NFS_CS_LOCAL_IO bit in the nfs_client struct's cl_flags to reflect that LOCALIO is enabled; instead just test if the net member of the nfs_uuid_t struct is set. Also remove NFS_CS_LOCAL_IO. Lastly, remove trace_nfs_local_enable and trace_nfs_local_disable because comparable traces are available from nfs_localio.ko. Suggested-by: NeilBrown Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 4 ++-- fs/nfs/internal.h | 2 -- fs/nfs/localio.c | 28 +++++----------------------- fs/nfs/nfstrace.h | 32 -------------------------------- fs/nfs_common/nfslocalio.c | 34 +++++++++++----------------------- include/linux/nfs_fs_sb.h | 1 - include/linux/nfslocalio.h | 4 ++++ 7 files changed, 22 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index e83e1ce04613..16530c71fd15 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -38,7 +38,7 @@ #include #include #include - +#include #include "nfs4_fs.h" #include "callback.h" @@ -243,7 +243,7 @@ static void pnfs_init_server(struct nfs_server *server) */ void nfs_free_client(struct nfs_client *clp) { - nfs_local_disable(clp); + nfs_localio_disable_client(clp); /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index febf289a9e4c..aea8bcfb221b 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -455,7 +455,6 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* localio.c */ -extern void nfs_local_disable(struct nfs_client *); extern void nfs_local_probe(struct nfs_client *); extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, const struct cred *, @@ -472,7 +471,6 @@ extern int nfs_local_commit(struct nfsd_file *, extern bool nfs_server_is_local(const struct nfs_client *clp); #else /* CONFIG_NFS_LOCALIO */ -static inline void nfs_local_disable(struct nfs_client *clp) {} static inline void nfs_local_probe(struct nfs_client *clp) {} static inline struct nfsd_file * nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 7e432057c3a1..4b6bf4ea7d7f 100644 --- 
a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -56,7 +56,7 @@ MODULE_PARM_DESC(localio_O_DIRECT_semantics, static inline bool nfs_client_is_local(const struct nfs_client *clp) { - return !!test_bit(NFS_CS_LOCAL_IO, &clp->cl_flags); + return !!rcu_access_pointer(clp->cl_uuid.net); } bool nfs_server_is_local(const struct nfs_client *clp) @@ -121,24 +121,6 @@ const struct rpc_program nfslocalio_program = { .stats = &nfslocalio_rpcstat, }; -/* - * nfs_local_enable - enable local i/o for an nfs_client - */ -static void nfs_local_enable(struct nfs_client *clp) -{ - trace_nfs_local_enable(clp); - nfs_localio_enable_client(clp); -} - -/* - * nfs_local_disable - disable local i/o for an nfs_client - */ -void nfs_local_disable(struct nfs_client *clp) -{ - trace_nfs_local_disable(clp); - nfs_localio_disable_client(clp); -} - /* * nfs_init_localioclient - Initialise an NFS localio client connection */ @@ -194,19 +176,19 @@ void nfs_local_probe(struct nfs_client *clp) /* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */ if (!localio_enabled || clp->cl_rpcclient->cl_auth->au_flavor != RPC_AUTH_UNIX) { - nfs_local_disable(clp); + nfs_localio_disable_client(clp); return; } if (nfs_client_is_local(clp)) { /* If already enabled, disable and re-enable */ - nfs_local_disable(clp); + nfs_localio_disable_client(clp); } if (!nfs_uuid_begin(&clp->cl_uuid)) return; if (nfs_server_uuid_is_local(clp)) - nfs_local_enable(clp); + nfs_localio_enable_client(clp); nfs_uuid_end(&clp->cl_uuid); } EXPORT_SYMBOL_GPL(nfs_local_probe); @@ -748,7 +730,7 @@ int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, if (status != 0) { if (status == -EAGAIN) - nfs_local_disable(clp); + nfs_localio_disable_client(clp); nfs_local_file_put(localio); hdr->task.tk_status = status; nfs_local_hdr_release(hdr, call_ops); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 1eab98c277fa..7a058bd8c566 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1714,38 +1714,6 @@ TRACE_EVENT(nfs_local_open_fh, ) ); -DECLARE_EVENT_CLASS(nfs_local_client_event, - TP_PROTO( - const struct nfs_client *clp - ), - - TP_ARGS(clp), - - TP_STRUCT__entry( - __field(unsigned int, protocol) - __string(server, clp->cl_hostname) - ), - - TP_fast_assign( - __entry->protocol = clp->rpc_ops->version; - __assign_str(server); - ), - - TP_printk( - "server=%s NFSv%u", __get_str(server), __entry->protocol - ) -); - -#define DEFINE_NFS_LOCAL_CLIENT_EVENT(name) \ - DEFINE_EVENT(nfs_local_client_event, name, \ - TP_PROTO( \ - const struct nfs_client *clp \ - ), \ - TP_ARGS(clp)) - -DEFINE_NFS_LOCAL_CLIENT_EVENT(nfs_local_enable); -DEFINE_NFS_LOCAL_CLIENT_EVENT(nfs_local_disable); - DECLARE_EVENT_CLASS(nfs_xdr_event, TP_PROTO( const struct xdr_stream *xdr, diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index 0decc2fe154c..bad7691e32b9 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -37,7 +37,7 @@ static LIST_HEAD(nfs_uuids); void nfs_uuid_init(nfs_uuid_t *nfs_uuid) { - nfs_uuid->net = NULL; + RCU_INIT_POINTER(nfs_uuid->net, NULL); nfs_uuid->dom = NULL; nfs_uuid->list_lock = NULL; INIT_LIST_HEAD(&nfs_uuid->list); @@ -49,7 +49,7 @@ EXPORT_SYMBOL_GPL(nfs_uuid_init); bool nfs_uuid_begin(nfs_uuid_t *nfs_uuid) { spin_lock(&nfs_uuid->lock); - if (nfs_uuid->net) { + if (rcu_access_pointer(nfs_uuid->net)) { /* This nfs_uuid is already in use */ spin_unlock(&nfs_uuid->lock); return false; @@ -74,9 +74,9 @@ EXPORT_SYMBOL_GPL(nfs_uuid_begin); void nfs_uuid_end(nfs_uuid_t *nfs_uuid) { - if 
(nfs_uuid->net == NULL) { + if (!rcu_access_pointer(nfs_uuid->net)) { spin_lock(&nfs_uuid->lock); - if (nfs_uuid->net == NULL) { + if (!rcu_access_pointer(nfs_uuid->net)) { /* Not local, remove from nfs_uuids */ spin_lock(&nfs_uuids_lock); list_del_init(&nfs_uuid->list); @@ -139,12 +139,8 @@ EXPORT_SYMBOL_GPL(nfs_uuid_is_local); void nfs_localio_enable_client(struct nfs_client *clp) { - nfs_uuid_t *nfs_uuid = &clp->cl_uuid; - - spin_lock(&nfs_uuid->lock); - set_bit(NFS_CS_LOCAL_IO, &clp->cl_flags); + /* nfs_uuid_is_local() does the actual enablement */ trace_nfs_localio_enable_client(clp); - spin_unlock(&nfs_uuid->lock); } EXPORT_SYMBOL_GPL(nfs_localio_enable_client); @@ -152,15 +148,15 @@ EXPORT_SYMBOL_GPL(nfs_localio_enable_client); * Cleanup the nfs_uuid_t embedded in an nfs_client. * This is the long-form of nfs_uuid_init(). */ -static void nfs_uuid_put(nfs_uuid_t *nfs_uuid) +static bool nfs_uuid_put(nfs_uuid_t *nfs_uuid) { LIST_HEAD(local_files); struct nfs_file_localio *nfl, *tmp; spin_lock(&nfs_uuid->lock); - if (unlikely(!nfs_uuid->net)) { + if (unlikely(!rcu_access_pointer(nfs_uuid->net))) { spin_unlock(&nfs_uuid->lock); - return; + return false; } RCU_INIT_POINTER(nfs_uuid->net, NULL); @@ -192,22 +188,14 @@ static void nfs_uuid_put(nfs_uuid_t *nfs_uuid) module_put(nfsd_mod); spin_unlock(&nfs_uuid->lock); + + return true; } void nfs_localio_disable_client(struct nfs_client *clp) { - nfs_uuid_t *nfs_uuid = NULL; - - spin_lock(&clp->cl_uuid.lock); /* aka &nfs_uuid->lock */ - if (test_and_clear_bit(NFS_CS_LOCAL_IO, &clp->cl_flags)) { - /* &clp->cl_uuid is always not NULL, using as bool here */ - nfs_uuid = &clp->cl_uuid; + if (nfs_uuid_put(&clp->cl_uuid)) trace_nfs_localio_disable_client(clp); - } - spin_unlock(&clp->cl_uuid.lock); - - if (nfs_uuid) - nfs_uuid_put(nfs_uuid); } EXPORT_SYMBOL_GPL(nfs_localio_disable_client); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 239d86ef166c..ed66df1093e8 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -50,7 +50,6 @@ struct nfs_client { #define NFS_CS_DS 7 /* - Server is a DS */ #define NFS_CS_REUSEPORT 8 /* - reuse src port on reconnect */ #define NFS_CS_PNFS 9 /* - Server used for pnfs */ -#define NFS_CS_LOCAL_IO 10 /* - client is local */ struct sockaddr_storage cl_addr; /* server identifier */ size_t cl_addrlen; char * cl_hostname; /* hostname of server */ diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index c68a529230c1..05817d6ef3d1 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -111,6 +111,10 @@ static inline void nfs_close_local_fh(struct nfs_file_localio *nfl) static inline void nfsd_localio_ops_init(void) { } +struct nfs_client; +static inline void nfs_localio_disable_client(struct nfs_client *clp) +{ +} #endif /* CONFIG_NFS_LOCALIO */ -- cgit v1.2.3 From 76d4cb6345da0f2cd505e552157258325bcc8bcd Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:05 -0500 Subject: nfs: probe for LOCALIO when v4 client reconnects to server Introduce nfs_local_probe_async() for the NFS client to initiate if/when it reconnects with server. For NFSv4 it is a simple matter to call nfs_local_probe_async() from nfs4_do_reclaim (during NFSv4 grace). 
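The deferral itself is the stock work_struct pattern: embed a work item in the long-lived object, bind it to a handler with INIT_WORK(), and queue it from the trigger point so the potentially blocking probe runs later in process context. A kernel-style sketch of that shape, with illustrative placeholder names (the real hook points are in the diff below):

#include <linux/workqueue.h>

struct my_client {
	/* ... client state ... */
	struct work_struct probe_work;
};

static void my_probe(struct my_client *clp)
{
	/* ... the actual (blocking) probe would go here ... */
}

static void my_probe_work_fn(struct work_struct *work)
{
	struct my_client *clp = container_of(work, struct my_client, probe_work);

	my_probe(clp);		/* runs later, in workqueue process context */
}

static void my_client_init(struct my_client *clp)
{
	INIT_WORK(&clp->probe_work, my_probe_work_fn);
}

static void my_probe_async(struct my_client *clp)
{
	/* cheap to call from a reconnect/recovery path */
	queue_work(system_wq, &clp->probe_work);
}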
Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 1 + fs/nfs/internal.h | 3 +++ fs/nfs/localio.c | 14 ++++++++++++++ fs/nfs/nfs4state.c | 1 + include/linux/nfs_fs_sb.h | 1 + 5 files changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 16530c71fd15..3b0918ade53c 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -186,6 +186,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) seqlock_init(&clp->cl_boot_lock); ktime_get_real_ts64(&clp->cl_nfssvc_boot); nfs_uuid_init(&clp->cl_uuid); + INIT_WORK(&clp->cl_local_probe_work, nfs_local_probe_async_work); #endif /* CONFIG_NFS_LOCALIO */ clp->cl_principal = "*"; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index aea8bcfb221b..fae2c7ae4acc 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -456,6 +456,8 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* localio.c */ extern void nfs_local_probe(struct nfs_client *); +extern void nfs_local_probe_async(struct nfs_client *); +extern void nfs_local_probe_async_work(struct work_struct *); extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, const struct cred *, struct nfs_fh *, @@ -472,6 +474,7 @@ extern bool nfs_server_is_local(const struct nfs_client *clp); #else /* CONFIG_NFS_LOCALIO */ static inline void nfs_local_probe(struct nfs_client *clp) {} +static inline void nfs_local_probe_async(struct nfs_client *clp) {} static inline struct nfsd_file * nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, struct nfs_file_localio *nfl, diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 4b6bf4ea7d7f..1eee5aac2884 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -193,6 +193,20 @@ void nfs_local_probe(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs_local_probe); +void nfs_local_probe_async_work(struct work_struct *work) +{ + struct nfs_client *clp = + container_of(work, struct nfs_client, cl_local_probe_work); + + nfs_local_probe(clp); +} + +void nfs_local_probe_async(struct nfs_client *clp) +{ + queue_work(nfsiod_workqueue, &clp->cl_local_probe_work); +} +EXPORT_SYMBOL_GPL(nfs_local_probe_async); + static inline struct nfsd_file *nfs_local_file_get(struct nfsd_file *nf) { return nfs_to->nfsd_file_get(nf); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9a9f60a2291b..542cdf71229f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1955,6 +1955,7 @@ restart: } rcu_read_unlock(); nfs4_free_state_owners(&freeme); + nfs_local_probe_async(clp); if (lost_locks) pr_warn("NFS: %s: lost %d locks\n", clp->cl_hostname, lost_locks); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index ed66df1093e8..f00bfcee7120 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -131,6 +131,7 @@ struct nfs_client { struct timespec64 cl_nfssvc_boot; seqlock_t cl_boot_lock; nfs_uuid_t cl_uuid; + struct work_struct cl_local_probe_work; #endif /* CONFIG_NFS_LOCALIO */ }; -- cgit v1.2.3 From 4a489220aa8c9daf5f02396c28cebade9f9ab563 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:06 -0500 Subject: nfs: probe for LOCALIO when v3 client reconnects to server Re-enabling NFSv3 LOCALIO is made more complex (than NFSv4) because v3 is stateless. 
As such, the hueristic used to identify a LOCALIO probe point is more adhoc by nature: if/when NFSv3 client IO begins to complete again in terms of normal RPC-based NFSv3 server IO, attempt nfs_local_probe_async(). Care is taken to throttle the frequency of nfs_local_probe_async(), otherwise there could be a flood of repeat calls to nfs_local_probe_async(). The throttle is admin controlled using a new module parameter for nfsv3, e.g.: echo 512 > /sys/module/nfsv3/parameters/nfs3_localio_probe_throttle Probe for NFSv3 LOCALIO every N IO requests (512 in this case). Must be power-of-2, defaults to 0 (probing disabled). On systems that expect to use LOCALIO with NFSv3 the admin should configure the 'nfs3_localio_probe_throttle' module parameter. This commit backfills module parameter documentation in localio.rst Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- Documentation/filesystems/nfs/localio.rst | 20 +++++++++++++- fs/nfs/nfs3proc.c | 46 +++++++++++++++++++++++++++++-- fs/nfs_common/nfslocalio.c | 1 + include/linux/nfslocalio.h | 3 +- 4 files changed, 65 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/nfs/localio.rst b/Documentation/filesystems/nfs/localio.rst index 7d2dbf75e96d..79808b37d745 100644 --- a/Documentation/filesystems/nfs/localio.rst +++ b/Documentation/filesystems/nfs/localio.rst @@ -291,7 +291,7 @@ align their IO). Security ======== -Localio is only supported when UNIX-style authentication (AUTH_UNIX, aka +LOCALIO is only supported when UNIX-style authentication (AUTH_UNIX, aka AUTH_SYS) is used. Care is taken to ensure the same NFS security mechanisms are used @@ -306,6 +306,24 @@ client is afforded this same level of access (albeit in terms of the NFS protocol via SUNRPC). No other namespaces (user, mount, etc) have been altered or purposely extended from the server to the client. +Module Parameters +================= + +/sys/module/nfs/parameters/localio_enabled (bool) +controls if LOCALIO is enabled, defaults to Y. If client and server are +local but 'localio_enabled' is set to N then LOCALIO will not be used. + +/sys/module/nfs/parameters/localio_O_DIRECT_semantics (bool) +controls if O_DIRECT extends down to the underlying filesystem, defaults +to N. Application IO must be logical blocksize aligned, otherwise +O_DIRECT will fail. + +/sys/module/nfsv3/parameters/nfs3_localio_probe_throttle (uint) +controls if NFSv3 read and write IOs will trigger (re)enabling of +LOCALIO every N (nfs3_localio_probe_throttle) IOs, defaults to 0 +(disabled). Must be power-of-2, admin keeps all the pieces if they +misconfigure (too low a value or non-power-of-2). + Testing ======= diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1566163c6d85..7359e1a3bd84 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -844,6 +844,41 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return status; } +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +static unsigned nfs3_localio_probe_throttle __read_mostly = 0; +module_param(nfs3_localio_probe_throttle, uint, 0644); +MODULE_PARM_DESC(nfs3_localio_probe_throttle, + "Probe for NFSv3 LOCALIO every N IO requests. 
Must be power-of-2, defaults to 0 (probing disabled)."); + +static void nfs3_localio_probe(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + /* Throttled to reduce nfs_local_probe_async() frequency */ + if (!nfs3_localio_probe_throttle || nfs_server_is_local(clp)) + return; + + /* + * Try (re)enabling LOCALIO if isn't enabled -- admin deems + * it worthwhile to periodically check if LOCALIO possible by + * setting the 'nfs3_localio_probe_throttle' module parameter. + * + * This is useful if LOCALIO was previously enabled, but was + * disabled due to server restart, and IO has successfully + * completed in terms of normal RPC. + */ + if ((clp->cl_uuid.nfs3_localio_probe_count++ & + (nfs3_localio_probe_throttle - 1)) == 0) { + if (!nfs_server_is_local(clp)) + nfs_local_probe_async(clp); + } +} + +#else +static void nfs3_localio_probe(struct nfs_server *server) {} +#endif + static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct inode *inode = hdr->inode; @@ -855,8 +890,11 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; - if (task->tk_status >= 0 && !server->read_hdrsize) - cmpxchg(&server->read_hdrsize, 0, hdr->res.replen); + if (task->tk_status >= 0) { + if (!server->read_hdrsize) + cmpxchg(&server->read_hdrsize, 0, hdr->res.replen); + nfs3_localio_probe(server); + } nfs_invalidate_atime(inode); nfs_refresh_inode(inode, &hdr->fattr); @@ -886,8 +924,10 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; - if (task->tk_status >= 0) + if (task->tk_status >= 0) { nfs_writeback_update_inode(hdr); + nfs3_localio_probe(NFS_SERVER(inode)); + } return 0; } diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index bad7691e32b9..6a0bdea6d644 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -43,6 +43,7 @@ void nfs_uuid_init(nfs_uuid_t *nfs_uuid) INIT_LIST_HEAD(&nfs_uuid->list); INIT_LIST_HEAD(&nfs_uuid->files); spin_lock_init(&nfs_uuid->lock); + nfs_uuid->nfs3_localio_probe_count = 0; } EXPORT_SYMBOL_GPL(nfs_uuid_init); diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 05817d6ef3d1..9aa8a43843d7 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -27,7 +27,8 @@ struct nfs_file_localio; */ typedef struct { uuid_t uuid; - /* sadly this struct is just over a cacheline, avoid bouncing */ + unsigned nfs3_localio_probe_count; + /* this struct is over a cacheline, avoid bouncing */ spinlock_t ____cacheline_aligned lock; struct list_head list; spinlock_t *list_lock; /* nn->local_clients_lock */ -- cgit v1.2.3 From e30458d690f35abb01de8b3cbc09285deb725d00 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2025 14:46:03 +0000 Subject: afs: Fix the fallback handling for the YFS.RemoveFile2 RPC call Fix a pair of bugs in the fallback handling for the YFS.RemoveFile2 RPC call: (1) Fix the abort code check to also look for RXGEN_OPCODE. The lack of this masks the second bug. (2) call->server is now not used for ordinary filesystem RPC calls that have an operation descriptor. Fix to use call->op->server instead. 
Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept") Signed-off-by: David Howells Link: https://lore.kernel.org/r/109541.1736865963@warthog.procyon.org.uk cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/yfsclient.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 024227aba4cd..362845f9aaae 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -666,8 +666,9 @@ static int yfs_deliver_fs_remove_file2(struct afs_call *call) static void yfs_done_fs_remove_file2(struct afs_call *call) { if (call->error == -ECONNABORTED && - call->abort_code == RX_INVALID_OPERATION) { - set_bit(AFS_SERVER_FL_NO_RM2, &call->server->flags); + (call->abort_code == RX_INVALID_OPERATION || + call->abort_code == RXGEN_OPCODE)) { + set_bit(AFS_SERVER_FL_NO_RM2, &call->op->server->flags); call->op->flags |= AFS_OPERATION_DOWNGRADE; } } -- cgit v1.2.3 From 268b36116f2f048d2fb0cd13aa3036705fcac068 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2025 08:06:45 +0000 Subject: debugfs: separate cache for debugfs inodes Embed them into container (struct debugfs_inode_info, with nothing else in it at the moment), set the cache up, etc. Just the infrastructure changes letting us augment debugfs inodes here; adding stuff will come at the next step. Signed-off-by: Al Viro Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20250112080705.141166-1-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 40 ++++++++++++++++++++++++++++++++++------ fs/debugfs/internal.h | 9 +++++++++ 2 files changed, 43 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e752009de929..5d423bd92f93 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -208,16 +208,34 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root) return 0; } +static struct kmem_cache *debugfs_inode_cachep __ro_after_init; + +static void init_once(void *foo) +{ + struct debugfs_inode_info *info = foo; + inode_init_once(&info->vfs_inode); +} + +static struct inode *debugfs_alloc_inode(struct super_block *sb) +{ + struct debugfs_inode_info *info; + info = alloc_inode_sb(sb, debugfs_inode_cachep, GFP_KERNEL); + if (!info) + return NULL; + return &info->vfs_inode; +} + static void debugfs_free_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); - free_inode_nonrcu(inode); + kmem_cache_free(debugfs_inode_cachep, DEBUGFS_I(inode)); } static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .show_options = debugfs_show_options, + .alloc_inode = debugfs_alloc_inode, .free_inode = debugfs_free_inode, }; @@ -939,12 +957,22 @@ static int __init debugfs_init(void) if (retval) return retval; - retval = register_filesystem(&debug_fs_type); - if (retval) + debugfs_inode_cachep = kmem_cache_create("debugfs_inode_cache", + sizeof(struct debugfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, + init_once); + if (debugfs_inode_cachep == NULL) { sysfs_remove_mount_point(kernel_kobj, "debug"); - else - debugfs_registered = true; + return -ENOMEM; + } - return retval; + retval = register_filesystem(&debug_fs_type); + if (retval) { // Really not going to happen + sysfs_remove_mount_point(kernel_kobj, "debug"); + kmem_cache_destroy(debugfs_inode_cachep); + return retval; + } + debugfs_registered = true; + return 0; } core_initcall(debugfs_init); 
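The DEBUGFS_I() helper added in internal.h just below relies on the usual embedded-object idiom: the filesystem allocates its own info struct from a dedicated kmem_cache, hands the VFS only the embedded struct inode, and recovers the container later with container_of(). A small, compilable userspace illustration of that recovery step, using simplified stand-in structs rather than the kernel definitions:

#include <stddef.h>
#include <stdio.h>

/* same pointer arithmetic the kernel's container_of() performs */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct inode_stub {			/* stand-in for struct inode */
	unsigned long i_ino;
};

struct debugfs_inode_stub {		/* stand-in for debugfs_inode_info */
	void *automount;		/* fs-private field */
	struct inode_stub vfs_inode;	/* embedded generic inode */
};

static struct debugfs_inode_stub *DEBUGFS_I_stub(struct inode_stub *inode)
{
	return container_of(inode, struct debugfs_inode_stub, vfs_inode);
}

int main(void)
{
	struct debugfs_inode_stub info = {
		.automount = NULL,
		.vfs_inode = { .i_ino = 7 },
	};
	struct inode_stub *inode = &info.vfs_inode;	/* what generic code sees */

	/* recover the containing object from the embedded member */
	printf("ino=%lu automount=%p\n",
	       DEBUGFS_I_stub(inode)->vfs_inode.i_ino,
	       DEBUGFS_I_stub(inode)->automount);
	return 0;
}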
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index bbae4a228ef4..5cb940b0b8f6 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -11,6 +11,15 @@ struct file_operations; +struct debugfs_inode_info { + struct inode vfs_inode; +}; + +static inline struct debugfs_inode_info *DEBUGFS_I(struct inode *inode) +{ + return container_of(inode, struct debugfs_inode_info, vfs_inode); +} + /* declared over in file.c */ extern const struct file_operations debugfs_noop_file_operations; extern const struct file_operations debugfs_open_proxy_file_operations; -- cgit v1.2.3 From bacaaf833e964933c36ee571d8c0e9a87fae1a3e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2025 08:06:46 +0000 Subject: debugfs: move ->automount into debugfs_inode_info ... and don't bother with debugfs_fsdata for those. Life's simpler that way... Signed-off-by: Al Viro Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20250112080705.141166-2-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 21 +++++---------------- fs/debugfs/internal.h | 19 +++++++++---------- 2 files changed, 14 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 5d423bd92f93..2f5afd7b1b94 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -246,8 +246,8 @@ static void debugfs_release_dentry(struct dentry *dentry) if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) return; - /* check it wasn't a dir (no fsdata) or automount (no real_fops) */ - if (fsd && (fsd->real_fops || fsd->short_fops)) { + /* check it wasn't a dir or automount (no fsdata) */ + if (fsd) { WARN_ON(!list_empty(&fsd->cancellations)); mutex_destroy(&fsd->cancellations_mtx); } @@ -257,9 +257,9 @@ static void debugfs_release_dentry(struct dentry *dentry) static struct vfsmount *debugfs_automount(struct path *path) { - struct debugfs_fsdata *fsd = path->dentry->d_fsdata; + struct inode *inode = path->dentry->d_inode; - return fsd->automount(path->dentry, d_inode(path->dentry)->i_private); + return DEBUGFS_I(inode)->automount(path->dentry, inode->i_private); } static const struct dentry_operations debugfs_dops = { @@ -642,23 +642,13 @@ struct dentry *debugfs_create_automount(const char *name, void *data) { struct dentry *dentry = start_creating(name, parent); - struct debugfs_fsdata *fsd; struct inode *inode; if (IS_ERR(dentry)) return dentry; - fsd = kzalloc(sizeof(*fsd), GFP_KERNEL); - if (!fsd) { - failed_creating(dentry); - return ERR_PTR(-ENOMEM); - } - - fsd->automount = f; - if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); - kfree(fsd); return ERR_PTR(-EPERM); } @@ -666,14 +656,13 @@ struct dentry *debugfs_create_automount(const char *name, if (unlikely(!inode)) { pr_err("out of free dentries, can not create automount '%s'\n", name); - kfree(fsd); return failed_creating(dentry); } make_empty_dir_inode(inode); inode->i_flags |= S_AUTOMOUNT; inode->i_private = data; - dentry->d_fsdata = fsd; + DEBUGFS_I(inode)->automount = f; /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ inc_nlink(inode); d_instantiate(dentry, inode); diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index 5cb940b0b8f6..a644e44a0ee4 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -13,6 +13,9 @@ struct file_operations; struct debugfs_inode_info { struct inode vfs_inode; + union { + debugfs_automount_t automount; + }; }; static inline struct debugfs_inode_info *DEBUGFS_I(struct inode *inode) @@ -29,17 +32,13 @@ extern const struct file_operations debugfs_full_short_proxy_file_operations; struct debugfs_fsdata { const struct file_operations *real_fops; const struct debugfs_short_fops *short_fops; - union { - /* automount_fn is used when real_fops is NULL */ - debugfs_automount_t automount; - struct { - refcount_t active_users; - struct completion active_users_drained; + struct { + refcount_t active_users; + struct completion active_users_drained; - /* protect cancellations */ - struct mutex cancellations_mtx; - struct list_head cancellations; - }; + /* protect cancellations */ + struct mutex cancellations_mtx; + struct list_head cancellations; }; }; -- cgit v1.2.3 From 41a0ecc0997cd40d913cce18867efd1c34c64e28 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2025 08:06:47 +0000 Subject: debugfs: get rid of dynamically allocation proxy_ops All it takes is having full_proxy_open() collect the information about available methods and store it in debugfs_fsdata. Wrappers are called only after full_proxy_open() has succeeded calling debugfs_get_file(), so they are guaranteed to have ->d_fsdata already pointing to debugfs_fsdata. As the result, they can check if method is absent and bugger off early, without any atomic operations, etc. - same effect as what we'd have from NULL method. Which makes the entire proxy_fops contents unconditional, making it completely pointless - we can just put those methods (unconditionally) into debugfs_full_proxy_file_operations and forget about dynamic allocation, replace_fops, etc. 
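The mechanism described above, collecting the available methods once at open time into a bitmask so that static wrappers can bail out early when a bit is clear, can be shown with a small, compilable userspace sketch (the bit names mirror the diff below; everything else is simplified and not the debugfs code):

#include <stdio.h>
#include <errno.h>

enum { HAS_READ = 1u << 0, HAS_WRITE = 1u << 1 };

struct real_ops {
	int (*read)(void);
	int (*write)(void);
};

struct fsdata {
	const struct real_ops *real;
	unsigned int methods;		/* filled in once, at "open" time */
};

static void fsdata_init(struct fsdata *fsd, const struct real_ops *real)
{
	fsd->real = real;
	fsd->methods = 0;
	if (real->read)
		fsd->methods |= HAS_READ;
	if (real->write)
		fsd->methods |= HAS_WRITE;
}

/* static proxy wrapper: no dynamic fops needed, absent methods fail fast */
static int proxy_read(struct fsdata *fsd)
{
	if (!(fsd->methods & HAS_READ))
		return -EINVAL;
	return fsd->real->read();
}

static int real_read(void)
{
	return 42;
}

int main(void)
{
	const struct real_ops ops = { .read = real_read };	/* no ->write */
	struct fsdata fsd;

	fsdata_init(&fsd, &ops);
	printf("proxy_read() -> %d, HAS_WRITE set? %d\n",
	       proxy_read(&fsd), !!(fsd.methods & HAS_WRITE));
	return 0;
}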
Signed-off-by: Al Viro Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20250112080705.141166-3-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 113 +++++++++++++++++++++++--------------------------- fs/debugfs/internal.h | 9 ++++ 2 files changed, 61 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 16e198a26339..eb59b01f5f25 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -95,13 +95,31 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) return -ENOMEM; if (mode == DBGFS_GET_SHORT) { - fsd->real_fops = NULL; - fsd->short_fops = (void *)((unsigned long)d_fsd & - ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); + const struct debugfs_short_fops *ops; + ops = (void *)((unsigned long)d_fsd & + ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); + fsd->short_fops = ops; + if (ops->llseek) + fsd->methods |= HAS_LSEEK; + if (ops->read) + fsd->methods |= HAS_READ; + if (ops->write) + fsd->methods |= HAS_WRITE; } else { - fsd->real_fops = (void *)((unsigned long)d_fsd & - ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); - fsd->short_fops = NULL; + const struct file_operations *ops; + ops = (void *)((unsigned long)d_fsd & + ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); + fsd->real_fops = ops; + if (ops->llseek) + fsd->methods |= HAS_LSEEK; + if (ops->read) + fsd->methods |= HAS_READ; + if (ops->write) + fsd->methods |= HAS_WRITE; + if (ops->unlocked_ioctl) + fsd->methods |= HAS_IOCTL; + if (ops->poll) + fsd->methods |= HAS_POLL; } refcount_set(&fsd->active_users, 1); init_completion(&fsd->active_users_drained); @@ -322,13 +340,16 @@ const struct file_operations debugfs_open_proxy_file_operations = { #define PROTO(args...) args #define ARGS(args...) args -#define FULL_PROXY_FUNC(name, ret_type, filp, proto, args) \ +#define FULL_PROXY_FUNC(name, ret_type, filp, proto, args, bit, ret) \ static ret_type full_proxy_ ## name(proto) \ { \ - struct dentry *dentry = F_DENTRY(filp); \ + struct dentry *dentry = F_DENTRY(filp); \ + struct debugfs_fsdata *fsd = dentry->d_fsdata; \ const struct file_operations *real_fops; \ ret_type r; \ \ + if (!(fsd->methods & bit)) \ + return ret; \ r = debugfs_file_get(dentry); \ if (unlikely(r)) \ return r; \ @@ -338,17 +359,18 @@ static ret_type full_proxy_ ## name(proto) \ return r; \ } -#define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args) \ +#define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args, bit, ret) \ static ret_type full_proxy_ ## name(proto) \ { \ struct dentry *dentry = F_DENTRY(filp); \ - struct debugfs_fsdata *fsd; \ + struct debugfs_fsdata *fsd = dentry->d_fsdata; \ ret_type r; \ \ + if (!(fsd->methods & bit)) \ + return ret; \ r = debugfs_file_get(dentry); \ if (unlikely(r)) \ return r; \ - fsd = dentry->d_fsdata; \ if (fsd->real_fops) \ r = fsd->real_fops->name(args); \ else \ @@ -359,29 +381,32 @@ static ret_type full_proxy_ ## name(proto) \ FULL_PROXY_FUNC_BOTH(llseek, loff_t, filp, PROTO(struct file *filp, loff_t offset, int whence), - ARGS(filp, offset, whence)); + ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE); FULL_PROXY_FUNC_BOTH(read, ssize_t, filp, PROTO(struct file *filp, char __user *buf, size_t size, loff_t *ppos), - ARGS(filp, buf, size, ppos)); + ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL); FULL_PROXY_FUNC_BOTH(write, ssize_t, filp, PROTO(struct file *filp, const char __user *buf, size_t size, loff_t *ppos), - ARGS(filp, buf, size, ppos)); + ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL); FULL_PROXY_FUNC(unlocked_ioctl, long, filp, 
PROTO(struct file *filp, unsigned int cmd, unsigned long arg), - ARGS(filp, cmd, arg)); + ARGS(filp, cmd, arg), HAS_IOCTL, -ENOTTY); static __poll_t full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { struct dentry *dentry = F_DENTRY(filp); + struct debugfs_fsdata *fsd = dentry->d_fsdata; __poll_t r = 0; const struct file_operations *real_fops; + if (!(fsd->methods & HAS_POLL)) + return DEFAULT_POLLMASK; if (debugfs_file_get(dentry)) return EPOLLHUP; @@ -393,9 +418,7 @@ static __poll_t full_proxy_poll(struct file *filp, static int full_proxy_release(struct inode *inode, struct file *filp) { - const struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = debugfs_real_fops(filp); - const struct file_operations *proxy_fops = filp->f_op; int r = 0; /* @@ -407,42 +430,15 @@ static int full_proxy_release(struct inode *inode, struct file *filp) if (real_fops && real_fops->release) r = real_fops->release(inode, filp); - replace_fops(filp, d_inode(dentry)->i_fop); - kfree(proxy_fops); fops_put(real_fops); return r; } -static void __full_proxy_fops_init(struct file_operations *proxy_fops, - struct debugfs_fsdata *fsd) -{ - proxy_fops->release = full_proxy_release; - - if ((fsd->real_fops && fsd->real_fops->llseek) || - (fsd->short_fops && fsd->short_fops->llseek)) - proxy_fops->llseek = full_proxy_llseek; - - if ((fsd->real_fops && fsd->real_fops->read) || - (fsd->short_fops && fsd->short_fops->read)) - proxy_fops->read = full_proxy_read; - - if ((fsd->real_fops && fsd->real_fops->write) || - (fsd->short_fops && fsd->short_fops->write)) - proxy_fops->write = full_proxy_write; - - if (fsd->real_fops && fsd->real_fops->poll) - proxy_fops->poll = full_proxy_poll; - - if (fsd->real_fops && fsd->real_fops->unlocked_ioctl) - proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl; -} - static int full_proxy_open(struct inode *inode, struct file *filp, enum dbgfs_get_mode mode) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops; - struct file_operations *proxy_fops = NULL; struct debugfs_fsdata *fsd; int r; @@ -472,34 +468,20 @@ static int full_proxy_open(struct inode *inode, struct file *filp, goto out; } - proxy_fops = kzalloc(sizeof(*proxy_fops), GFP_KERNEL); - if (!proxy_fops) { - r = -ENOMEM; - goto free_proxy; - } - __full_proxy_fops_init(proxy_fops, fsd); - replace_fops(filp, proxy_fops); - if (!real_fops || real_fops->open) { if (real_fops) r = real_fops->open(inode, filp); else r = simple_open(inode, filp); if (r) { - replace_fops(filp, d_inode(dentry)->i_fop); - goto free_proxy; - } else if (filp->f_op != proxy_fops) { + fops_put(real_fops); + } else if (filp->f_op != &debugfs_full_proxy_file_operations) { /* No protection against file removal anymore. 
*/ WARN(1, "debugfs file owner replaced proxy fops: %pd", dentry); - goto free_proxy; + fops_put(real_fops); } } - - goto out; -free_proxy: - kfree(proxy_fops); - fops_put(real_fops); out: debugfs_file_put(dentry); return r; @@ -512,6 +494,12 @@ static int full_proxy_open_regular(struct inode *inode, struct file *filp) const struct file_operations debugfs_full_proxy_file_operations = { .open = full_proxy_open_regular, + .release = full_proxy_release, + .llseek = full_proxy_llseek, + .read = full_proxy_read, + .write = full_proxy_write, + .poll = full_proxy_poll, + .unlocked_ioctl = full_proxy_unlocked_ioctl }; static int full_proxy_open_short(struct inode *inode, struct file *filp) @@ -521,6 +509,9 @@ static int full_proxy_open_short(struct inode *inode, struct file *filp) const struct file_operations debugfs_full_short_proxy_file_operations = { .open = full_proxy_open_short, + .llseek = full_proxy_llseek, + .read = full_proxy_read, + .write = full_proxy_write, }; ssize_t debugfs_attr_read(struct file *file, char __user *buf, diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index a644e44a0ee4..011ef8b1a99a 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -39,9 +39,18 @@ struct debugfs_fsdata { /* protect cancellations */ struct mutex cancellations_mtx; struct list_head cancellations; + unsigned int methods; }; }; +enum { + HAS_READ = 1, + HAS_WRITE = 2, + HAS_LSEEK = 4, + HAS_POLL = 8, + HAS_IOCTL = 16 +}; + /* * A dentry's ->d_fsdata either points to the real fops or to a * dynamically allocated debugfs_fsdata instance. -- cgit v1.2.3 From 95688800eefe28240204c2a0dd2bca5bf5f7f1d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2025 08:06:48 +0000 Subject: debugfs: don't mess with bits in ->d_fsdata The reason we need that crap is the dual use ->d_fsdata has there - it's both holding a debugfs_fsdata reference after the first debugfs_file_get() (actually, after the call of proxy ->open()) *and* it serves as a place to stash a reference to real file_operations from object creation to the first open. Oh, and it's triple use, actually - that stashed reference might be to debugfs_short_fops. Bugger that for a game of solidiers - just put the operations reference into debugfs-private augmentation of inode. And split debugfs_full_file_operations into full and short cases, so that debugfs_get_file() could tell one from another. Voila - ->d_fsdata holds NULL until the first (successful) debugfs_get_file() and a reference to struct debugfs_fsdata afterwards. Signed-off-by: Al Viro Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20250112080705.141166-4-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 58 +++++++++++++++++++++++++-------------------------- fs/debugfs/inode.c | 29 +++++++------------------- fs/debugfs/internal.h | 12 +++-------- 3 files changed, 38 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index eb59b01f5f25..ae014bd36a6f 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -51,7 +51,7 @@ const struct file_operations *debugfs_real_fops(const struct file *filp) { struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata; - if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) { + if (!fsd) { /* * Urgh, we've been called w/o a protecting * debugfs_file_get(). 
@@ -84,9 +84,11 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) return -EINVAL; d_fsd = READ_ONCE(dentry->d_fsdata); - if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) { + if (d_fsd) { fsd = d_fsd; } else { + struct inode *inode = dentry->d_inode; + if (WARN_ON(mode == DBGFS_GET_ALREADY)) return -EINVAL; @@ -96,9 +98,7 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) if (mode == DBGFS_GET_SHORT) { const struct debugfs_short_fops *ops; - ops = (void *)((unsigned long)d_fsd & - ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); - fsd->short_fops = ops; + ops = fsd->short_fops = DEBUGFS_I(inode)->short_fops; if (ops->llseek) fsd->methods |= HAS_LSEEK; if (ops->read) @@ -107,9 +107,7 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) fsd->methods |= HAS_WRITE; } else { const struct file_operations *ops; - ops = (void *)((unsigned long)d_fsd & - ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); - fsd->real_fops = ops; + ops = fsd->real_fops = DEBUGFS_I(inode)->real_fops; if (ops->llseek) fsd->methods |= HAS_LSEEK; if (ops->read) @@ -126,10 +124,11 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) INIT_LIST_HEAD(&fsd->cancellations); mutex_init(&fsd->cancellations_mtx); - if (cmpxchg(&dentry->d_fsdata, d_fsd, fsd) != d_fsd) { + d_fsd = cmpxchg(&dentry->d_fsdata, NULL, fsd); + if (d_fsd) { mutex_destroy(&fsd->cancellations_mtx); kfree(fsd); - fsd = READ_ONCE(dentry->d_fsdata); + fsd = d_fsd; } } @@ -226,8 +225,7 @@ void debugfs_enter_cancellation(struct file *file, return; fsd = READ_ONCE(dentry->d_fsdata); - if (WARN_ON(!fsd || - ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) + if (WARN_ON(!fsd)) return; mutex_lock(&fsd->cancellations_mtx); @@ -258,8 +256,7 @@ void debugfs_leave_cancellation(struct file *file, return; fsd = READ_ONCE(dentry->d_fsdata); - if (WARN_ON(!fsd || - ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) + if (WARN_ON(!fsd)) return; mutex_lock(&fsd->cancellations_mtx); @@ -427,22 +424,21 @@ static int full_proxy_release(struct inode *inode, struct file *filp) * not to leak any resources. Releasers must not assume that * ->i_private is still being meaningful here. */ - if (real_fops && real_fops->release) + if (real_fops->release) r = real_fops->release(inode, filp); fops_put(real_fops); return r; } -static int full_proxy_open(struct inode *inode, struct file *filp, - enum dbgfs_get_mode mode) +static int full_proxy_open_regular(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops; struct debugfs_fsdata *fsd; int r; - r = __debugfs_file_get(dentry, mode); + r = __debugfs_file_get(dentry, DBGFS_GET_REGULAR); if (r) return r == -EIO ? 
-ENOENT : r; @@ -452,7 +448,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp, if (r) goto out; - if (real_fops && !fops_get(real_fops)) { + if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) { @@ -468,11 +464,8 @@ static int full_proxy_open(struct inode *inode, struct file *filp, goto out; } - if (!real_fops || real_fops->open) { - if (real_fops) - r = real_fops->open(inode, filp); - else - r = simple_open(inode, filp); + if (real_fops->open) { + r = real_fops->open(inode, filp); if (r) { fops_put(real_fops); } else if (filp->f_op != &debugfs_full_proxy_file_operations) { @@ -487,11 +480,6 @@ out: return r; } -static int full_proxy_open_regular(struct inode *inode, struct file *filp) -{ - return full_proxy_open(inode, filp, DBGFS_GET_REGULAR); -} - const struct file_operations debugfs_full_proxy_file_operations = { .open = full_proxy_open_regular, .release = full_proxy_release, @@ -504,7 +492,17 @@ const struct file_operations debugfs_full_proxy_file_operations = { static int full_proxy_open_short(struct inode *inode, struct file *filp) { - return full_proxy_open(inode, filp, DBGFS_GET_SHORT); + struct dentry *dentry = F_DENTRY(filp); + int r; + + r = __debugfs_file_get(dentry, DBGFS_GET_SHORT); + if (r) + return r == -EIO ? -ENOENT : r; + r = debugfs_locked_down(inode, filp, NULL); + if (!r) + r = simple_open(inode, filp); + debugfs_file_put(dentry); + return r; } const struct file_operations debugfs_full_short_proxy_file_operations = { diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 2f5afd7b1b94..c4e8b7f758e0 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -243,15 +243,10 @@ static void debugfs_release_dentry(struct dentry *dentry) { struct debugfs_fsdata *fsd = dentry->d_fsdata; - if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) - return; - - /* check it wasn't a dir or automount (no fsdata) */ if (fsd) { WARN_ON(!list_empty(&fsd->cancellations)); mutex_destroy(&fsd->cancellations_mtx); } - kfree(fsd); } @@ -459,9 +454,10 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, inode->i_private = data; inode->i_op = &debugfs_file_inode_operations; + if (!real_fops) + proxy_fops = &debugfs_noop_file_operations; inode->i_fop = proxy_fops; - dentry->d_fsdata = (void *)((unsigned long)real_fops | - DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); + DEBUGFS_I(inode)->raw = real_fops; d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); @@ -472,13 +468,8 @@ struct dentry *debugfs_create_file_full(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { - if (WARN_ON((unsigned long)fops & - DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) - return ERR_PTR(-EINVAL); - return __debugfs_create_file(name, mode, parent, data, - fops ? &debugfs_full_proxy_file_operations : - &debugfs_noop_file_operations, + &debugfs_full_proxy_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_full); @@ -487,13 +478,8 @@ struct dentry *debugfs_create_file_short(const char *name, umode_t mode, struct dentry *parent, void *data, const struct debugfs_short_fops *fops) { - if (WARN_ON((unsigned long)fops & - DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) - return ERR_PTR(-EINVAL); - return __debugfs_create_file(name, mode, parent, data, - fops ? 
&debugfs_full_short_proxy_file_operations : - &debugfs_noop_file_operations, + &debugfs_full_short_proxy_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_short); @@ -531,8 +517,7 @@ struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, { return __debugfs_create_file(name, mode, parent, data, - fops ? &debugfs_open_proxy_file_operations : - &debugfs_noop_file_operations, + &debugfs_open_proxy_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); @@ -737,7 +722,7 @@ static void __debugfs_file_removed(struct dentry *dentry) */ smp_mb(); fsd = READ_ONCE(dentry->d_fsdata); - if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) + if (!fsd) return; /* if this was the last reference, we're done */ diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index 011ef8b1a99a..8d2de647b42c 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -14,6 +14,9 @@ struct file_operations; struct debugfs_inode_info { struct inode vfs_inode; union { + const void *raw; + const struct file_operations *real_fops; + const struct debugfs_short_fops *short_fops; debugfs_automount_t automount; }; }; @@ -51,15 +54,6 @@ enum { HAS_IOCTL = 16 }; -/* - * A dentry's ->d_fsdata either points to the real fops or to a - * dynamically allocated debugfs_fsdata instance. - * In order to distinguish between these two cases, a real fops - * pointer gets its lowest bit set. - */ -#define DEBUGFS_FSDATA_IS_REAL_FOPS_BIT BIT(0) - -/* Access BITS */ #define DEBUGFS_ALLOW_API BIT(0) #define DEBUGFS_ALLOW_MOUNT BIT(1) -- cgit v1.2.3 From 12c92098932b4bbf38396e9aed0a343d35437a21 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2025 08:06:49 +0000 Subject: debugfs: allow to store an additional opaque pointer at file creation Set by debugfs_create_file_aux(name, mode, parent, data, aux, fops). Plain debugfs_create_file() has it set to NULL. Accessed by debugfs_get_aux(file). Convenience macros for numeric opaque data - debugfs_create_file_aux_num and debugfs_get_aux_num, resp. 
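As a usage sketch (hypothetical names, assuming a kernel build environment; the orangefs conversion further below is the in-tree user): a driver can create two files that share one set of fops and tell them apart through the stashed number instead of inspecting the dentry name.

/* Hypothetical consumer of the new helpers; "demo" is not a real interface. */
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
    /* recover the opaque number stashed at file-creation time */
    unsigned long which = debugfs_get_aux_num(m->file);

    seq_printf(m, "instance %lu\n", which);
    return 0;
}
DEFINE_SHOW_ATTRIBUTE(demo);

static void demo_debugfs_init(struct dentry *parent, void *data)
{
    debugfs_create_file_aux_num("demo0", 0444, parent, data, 0, &demo_fops);
    debugfs_create_file_aux_num("demo1", 0444, parent, data, 1, &demo_fops);
}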
Signed-off-by: Al Viro Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20250112080705.141166-5-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 6 ++++++ fs/debugfs/inode.c | 14 +++++++++----- fs/debugfs/internal.h | 1 + include/linux/debugfs.h | 27 ++++++++++++++++++++++++++- 4 files changed, 42 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index ae014bd36a6f..e33cc77699cd 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -47,6 +47,12 @@ const struct file_operations debugfs_noop_file_operations = { #define F_DENTRY(filp) ((filp)->f_path.dentry) +const void *debugfs_get_aux(const struct file *file) +{ + return DEBUGFS_I(file_inode(file))->aux; +} +EXPORT_SYMBOL_GPL(debugfs_get_aux); + const struct file_operations *debugfs_real_fops(const struct file *filp) { struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index c4e8b7f758e0..51d4c3e9d422 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -424,6 +424,7 @@ static struct dentry *end_creating(struct dentry *dentry) static struct dentry *__debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, + const void *aux, const struct file_operations *proxy_fops, const void *real_fops) { @@ -458,6 +459,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, proxy_fops = &debugfs_noop_file_operations; inode->i_fop = proxy_fops; DEBUGFS_I(inode)->raw = real_fops; + DEBUGFS_I(inode)->aux = aux; d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); @@ -466,19 +468,21 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, struct dentry *debugfs_create_file_full(const char *name, umode_t mode, struct dentry *parent, void *data, + const void *aux, const struct file_operations *fops) { - return __debugfs_create_file(name, mode, parent, data, + return __debugfs_create_file(name, mode, parent, data, aux, &debugfs_full_proxy_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_full); struct dentry *debugfs_create_file_short(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct debugfs_short_fops *fops) + struct dentry *parent, void *data, + const void *aux, + const struct debugfs_short_fops *fops) { - return __debugfs_create_file(name, mode, parent, data, + return __debugfs_create_file(name, mode, parent, data, aux, &debugfs_full_short_proxy_file_operations, fops); } @@ -516,7 +520,7 @@ struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, const struct file_operations *fops) { - return __debugfs_create_file(name, mode, parent, data, + return __debugfs_create_file(name, mode, parent, data, NULL, &debugfs_open_proxy_file_operations, fops); } diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index 8d2de647b42c..93483fe84425 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -19,6 +19,7 @@ struct debugfs_inode_info { const struct debugfs_short_fops *short_fops; debugfs_automount_t automount; }; + const void *aux; }; static inline struct debugfs_inode_info *DEBUGFS_I(struct inode *inode) diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 59444b495d49..7c97417d73b5 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -79,9 +79,11 @@ struct debugfs_short_fops { struct dentry *debugfs_create_file_full(const char *name, umode_t mode, struct dentry *parent, void *data, + const 
void *aux, const struct file_operations *fops); struct dentry *debugfs_create_file_short(const char *name, umode_t mode, struct dentry *parent, void *data, + const void *aux, const struct debugfs_short_fops *fops); /** @@ -126,7 +128,15 @@ struct dentry *debugfs_create_file_short(const char *name, umode_t mode, const struct debugfs_short_fops *: debugfs_create_file_short, \ struct file_operations *: debugfs_create_file_full, \ struct debugfs_short_fops *: debugfs_create_file_short) \ - (name, mode, parent, data, fops) + (name, mode, parent, data, NULL, fops) + +#define debugfs_create_file_aux(name, mode, parent, data, aux, fops) \ + _Generic(fops, \ + const struct file_operations *: debugfs_create_file_full, \ + const struct debugfs_short_fops *: debugfs_create_file_short, \ + struct file_operations *: debugfs_create_file_full, \ + struct debugfs_short_fops *: debugfs_create_file_short) \ + (name, mode, parent, data, aux, fops) struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, struct dentry *parent, void *data, @@ -153,6 +163,7 @@ void debugfs_remove(struct dentry *dentry); void debugfs_lookup_and_remove(const char *name, struct dentry *parent); const struct file_operations *debugfs_real_fops(const struct file *filp); +const void *debugfs_get_aux(const struct file *file); int debugfs_file_get(struct dentry *dentry); void debugfs_file_put(struct dentry *dentry); @@ -259,6 +270,14 @@ static inline struct dentry *debugfs_lookup(const char *name, return ERR_PTR(-ENODEV); } +static inline struct dentry *debugfs_create_file_aux(const char *name, + umode_t mode, struct dentry *parent, + void *data, void *aux, + const void *fops) +{ + return ERR_PTR(-ENODEV); +} + static inline struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const void *fops) @@ -312,6 +331,7 @@ static inline void debugfs_lookup_and_remove(const char *name, { } const struct file_operations *debugfs_real_fops(const struct file *filp); +void *debugfs_get_aux(const struct file *file); static inline int debugfs_file_get(struct dentry *dentry) { @@ -452,6 +472,11 @@ static inline ssize_t debugfs_read_file_str(struct file *file, #endif +#define debugfs_create_file_aux_num(name, mode, parent, data, n, fops) \ + debugfs_create_file_aux(name, mode, parent, data, \ + (void *)(unsigned long)n, fops) +#define debugfs_get_aux_num(f) (unsigned long)debugfs_get_aux(f) + /** * debugfs_create_xul - create a debugfs file that is used to read and write an * unsigned long value, formatted in hexadecimal -- cgit v1.2.3 From c2a3a216c7e9ad790ba3b5972ee93eee8fe337b1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2025 08:07:04 +0000 Subject: orangefs-debugfs: don't mess with ->d_name Signed-off-by: Al Viro Link: https://lore.kernel.org/r/20250112080705.141166-20-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- fs/orangefs/orangefs-debugfs.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 1b508f543384..9729f071c5aa 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -206,8 +206,8 @@ static void orangefs_kernel_debug_init(void) pr_info("%s: overflow 1!\n", __func__); } - debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer, - &kernel_debug_fops); + debugfs_create_file_aux_num(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer, + 0, &kernel_debug_fops); } @@ -306,11 +306,10 @@ static 
void orangefs_client_debug_init(void) pr_info("%s: overflow! 2\n", __func__); } - client_debug_dentry = debugfs_create_file(ORANGEFS_CLIENT_DEBUG_FILE, - 0444, - debug_dir, - c_buffer, - &kernel_debug_fops); + client_debug_dentry = debugfs_create_file_aux_num( + ORANGEFS_CLIENT_DEBUG_FILE, + 0444, debug_dir, c_buffer, 1, + &kernel_debug_fops); } /* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/ @@ -418,8 +417,7 @@ static ssize_t orangefs_debug_write(struct file *file, * A service operation is required to set a new client-side * debug mask. */ - if (!strcmp(file->f_path.dentry->d_name.name, - ORANGEFS_KMOD_DEBUG_FILE)) { + if (!debugfs_get_aux_num(file)) { // kernel-debug debug_string_to_mask(buf, &orangefs_gossip_debug_mask, 0); debug_mask_to_string(&orangefs_gossip_debug_mask, 0); debug_string = kernel_debug_string; -- cgit v1.2.3 From f7862dfef6612b87b2ad8352c4d73886f09456d6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2025 08:07:05 +0000 Subject: saner replacement for debugfs_rename() Existing primitive has several problems: 1) calling conventions are clumsy - it returns a dentry reference that is either identical to its second argument or is an ERR_PTR(-E...); in both cases no refcount changes happen. Inconvenient for users and bug-prone; it would be better to have it return 0 on success and -E... on failure. 2) it allows cross-directory moves; however, no such caller have ever materialized and considering the way debugfs is used, it's unlikely to happen in the future. What's more, any such caller would have fun issues to deal with wrt interplay with recursive removal. It also makes the calling conventions clumsier... 3) tautological rename fails; the callers have no race-free way to deal with that. 4) new name must have been formed by the caller; quite a few callers have it done by sprintf/kasprintf/etc., ending up with considerable boilerplate. Proposed replacement: int debugfs_change_name(dentry, fmt, ...). All callers convert to that easily, and it's simpler internally. IMO debugfs_rename() should go; if we ever get a real-world use case for cross-directory moves in debugfs, we can always look into the right way to handle that. Signed-off-by: Al Viro Link: https://lore.kernel.org/r/20250112080705.141166-21-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- Documentation/filesystems/debugfs.rst | 12 +-- drivers/net/bonding/bond_debugfs.c | 9 +- drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c | 19 +--- drivers/net/ethernet/marvell/skge.c | 5 +- drivers/net/ethernet/marvell/sky2.c | 5 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 +- drivers/opp/debugfs.c | 10 +- fs/debugfs/inode.c | 106 ++++++++++------------ include/linux/debugfs.h | 9 +- mm/shrinker_debug.c | 16 +--- net/hsr/hsr_debugfs.c | 9 +- net/mac80211/debugfs_netdev.c | 11 +-- net/wireless/core.c | 5 +- 13 files changed, 77 insertions(+), 145 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/debugfs.rst b/Documentation/filesystems/debugfs.rst index dc35da8b8792..f7f977ffbf8d 100644 --- a/Documentation/filesystems/debugfs.rst +++ b/Documentation/filesystems/debugfs.rst @@ -211,18 +211,16 @@ seq_file content. 
There are a couple of other directory-oriented helper functions:: - struct dentry *debugfs_rename(struct dentry *old_dir, - struct dentry *old_dentry, - struct dentry *new_dir, - const char *new_name); + struct dentry *debugfs_change_name(struct dentry *dentry, + const char *fmt, ...); struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *target); -A call to debugfs_rename() will give a new name to an existing debugfs -file, possibly in a different directory. The new_name must not exist prior -to the call; the return value is old_dentry with updated information. +A call to debugfs_change_name() will give a new name to an existing debugfs +file, always in the same directory. The new_name must not exist prior +to the call; the return value is 0 on success and -E... on failuer. Symbolic links can be created with debugfs_create_symlink(). There is one important thing that all debugfs users must take into account: diff --git a/drivers/net/bonding/bond_debugfs.c b/drivers/net/bonding/bond_debugfs.c index b19492a7f6ad..8adbec7c5084 100644 --- a/drivers/net/bonding/bond_debugfs.c +++ b/drivers/net/bonding/bond_debugfs.c @@ -63,13 +63,8 @@ void bond_debug_unregister(struct bonding *bond) void bond_debug_reregister(struct bonding *bond) { - struct dentry *d; - - d = debugfs_rename(bonding_debug_root, bond->debug_dir, - bonding_debug_root, bond->dev->name); - if (!IS_ERR(d)) { - bond->debug_dir = d; - } else { + int err = debugfs_change_name(bond->debug_dir, "%s", bond->dev->name); + if (err) { netdev_warn(bond->dev, "failed to reregister, so just unregister old one\n"); bond_debug_unregister(bond); } diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c index b0a6c96b6ef4..b35808d3d07f 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c @@ -505,21 +505,6 @@ void xgbe_debugfs_exit(struct xgbe_prv_data *pdata) void xgbe_debugfs_rename(struct xgbe_prv_data *pdata) { - char *buf; - - if (!pdata->xgbe_debugfs) - return; - - buf = kasprintf(GFP_KERNEL, "amd-xgbe-%s", pdata->netdev->name); - if (!buf) - return; - - if (!strcmp(pdata->xgbe_debugfs->d_name.name, buf)) - goto out; - - debugfs_rename(pdata->xgbe_debugfs->d_parent, pdata->xgbe_debugfs, - pdata->xgbe_debugfs->d_parent, buf); - -out: - kfree(buf); + debugfs_change_name(pdata->xgbe_debugfs, + "amd-xgbe-%s", pdata->netdev->name); } diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c index 25bf6ec44289..a1bada9eaaf6 100644 --- a/drivers/net/ethernet/marvell/skge.c +++ b/drivers/net/ethernet/marvell/skge.c @@ -3742,10 +3742,7 @@ static int skge_device_event(struct notifier_block *unused, skge = netdev_priv(dev); switch (event) { case NETDEV_CHANGENAME: - if (skge->debugfs) - skge->debugfs = debugfs_rename(skge_debug, - skge->debugfs, - skge_debug, dev->name); + debugfs_change_name(skge->debugfs, "%s", dev->name); break; case NETDEV_GOING_DOWN: diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 988fa28cfb5f..d7121c836508 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -4494,10 +4494,7 @@ static int sky2_device_event(struct notifier_block *unused, switch (event) { case NETDEV_CHANGENAME: - if (sky2->debugfs) { - sky2->debugfs = debugfs_rename(sky2_debug, sky2->debugfs, - sky2_debug, dev->name); - } + debugfs_change_name(sky2->debugfs, "%s", dev->name); break; case NETDEV_GOING_DOWN: diff 
--git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index c81ea8cdfe6e..82e2908016bd 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -6489,11 +6489,7 @@ static int stmmac_device_event(struct notifier_block *unused, switch (event) { case NETDEV_CHANGENAME: - if (priv->dbgfs_dir) - priv->dbgfs_dir = debugfs_rename(stmmac_fs_dir, - priv->dbgfs_dir, - stmmac_fs_dir, - dev->name); + debugfs_change_name(priv->dbgfs_dir, "%s", dev->name); break; } done: diff --git a/drivers/opp/debugfs.c b/drivers/opp/debugfs.c index 105de7c3274a..8fc6238b1728 100644 --- a/drivers/opp/debugfs.c +++ b/drivers/opp/debugfs.c @@ -217,7 +217,7 @@ static void opp_migrate_dentry(struct opp_device *opp_dev, { struct opp_device *new_dev = NULL, *iter; const struct device *dev; - struct dentry *dentry; + int err; /* Look for next opp-dev */ list_for_each_entry(iter, &opp_table->dev_list, node) @@ -234,16 +234,14 @@ static void opp_migrate_dentry(struct opp_device *opp_dev, opp_set_dev_name(dev, opp_table->dentry_name); - dentry = debugfs_rename(rootdir, opp_dev->dentry, rootdir, - opp_table->dentry_name); - if (IS_ERR(dentry)) { + err = debugfs_change_name(opp_dev->dentry, "%s", opp_table->dentry_name); + if (err) { dev_err(dev, "%s: Failed to rename link from: %s to %s\n", __func__, dev_name(opp_dev->dev), dev_name(dev)); return; } - new_dev->dentry = dentry; - opp_table->dentry = dentry; + new_dev->dentry = opp_table->dentry = opp_dev->dentry; } /** diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 51d4c3e9d422..75715d8877ee 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -830,76 +830,70 @@ void debugfs_lookup_and_remove(const char *name, struct dentry *parent) EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove); /** - * debugfs_rename - rename a file/directory in the debugfs filesystem - * @old_dir: a pointer to the parent dentry for the renamed object. This - * should be a directory dentry. - * @old_dentry: dentry of an object to be renamed. - * @new_dir: a pointer to the parent dentry where the object should be - * moved. This should be a directory dentry. - * @new_name: a pointer to a string containing the target name. + * debugfs_change_name - rename a file/directory in the debugfs filesystem + * @dentry: dentry of an object to be renamed. + * @fmt: format for new name * * This function renames a file/directory in debugfs. The target must not * exist for rename to succeed. * - * This function will return a pointer to old_dentry (which is updated to - * reflect renaming) if it succeeds. If an error occurs, ERR_PTR(-ERROR) - * will be returned. + * This function will return 0 on success and -E... on failure. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ -struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, - struct dentry *new_dir, const char *new_name) +int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, ...) { - int error; - struct dentry *dentry = NULL, *trap; + int error = 0; + const char *new_name; struct name_snapshot old_name; + struct dentry *parent, *target; + struct inode *dir; + va_list ap; - if (IS_ERR(old_dir)) - return old_dir; - if (IS_ERR(new_dir)) - return new_dir; - if (IS_ERR_OR_NULL(old_dentry)) - return old_dentry; - - trap = lock_rename(new_dir, old_dir); - /* Source or destination directories don't exist? 
*/ - if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir)) - goto exit; - /* Source does not exist, cyclic rename, or mountpoint? */ - if (d_really_is_negative(old_dentry) || old_dentry == trap || - d_mountpoint(old_dentry)) - goto exit; - dentry = lookup_one_len(new_name, new_dir, strlen(new_name)); - /* Lookup failed, cyclic rename or target exists? */ - if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry)) - goto exit; - - take_dentry_name_snapshot(&old_name, old_dentry); - - error = simple_rename(&nop_mnt_idmap, d_inode(old_dir), old_dentry, - d_inode(new_dir), dentry, 0); - if (error) { - release_dentry_name_snapshot(&old_name); - goto exit; + if (IS_ERR_OR_NULL(dentry)) + return 0; + + va_start(ap, fmt); + new_name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!new_name) + return -ENOMEM; + + parent = dget_parent(dentry); + dir = d_inode(parent); + inode_lock(dir); + + take_dentry_name_snapshot(&old_name, dentry); + + if (WARN_ON_ONCE(dentry->d_parent != parent)) { + error = -EINVAL; + goto out; + } + if (strcmp(old_name.name.name, new_name) == 0) + goto out; + target = lookup_one_len(new_name, parent, strlen(new_name)); + if (IS_ERR(target)) { + error = PTR_ERR(target); + goto out; } - d_move(old_dentry, dentry); - fsnotify_move(d_inode(old_dir), d_inode(new_dir), &old_name.name, - d_is_dir(old_dentry), - NULL, old_dentry); + if (d_really_is_positive(target)) { + dput(target); + error = -EINVAL; + goto out; + } + simple_rename_timestamp(dir, dentry, dir, target); + d_move(dentry, target); + dput(target); + fsnotify_move(dir, dir, &old_name.name, d_is_dir(dentry), NULL, dentry); +out: release_dentry_name_snapshot(&old_name); - unlock_rename(new_dir, old_dir); - dput(dentry); - return old_dentry; -exit: - if (dentry && !IS_ERR(dentry)) - dput(dentry); - unlock_rename(new_dir, old_dir); - if (IS_ERR(dentry)) - return dentry; - return ERR_PTR(-EINVAL); + inode_unlock(dir); + dput(parent); + kfree_const(new_name); + return error; } -EXPORT_SYMBOL_GPL(debugfs_rename); +EXPORT_SYMBOL_GPL(debugfs_change_name); /** * debugfs_initialized - Tells whether debugfs has been registered diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 68e9c6cbd835..fa2568b4380d 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -175,8 +175,7 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf, ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos); -struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, - struct dentry *new_dir, const char *new_name); +int debugfs_change_name(struct dentry *dentry, const char *fmt, ...) __printf(2, 3); void debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value); @@ -361,10 +360,10 @@ static inline ssize_t debugfs_attr_write_signed(struct file *file, return -ENODEV; } -static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, - struct dentry *new_dir, char *new_name) +static inline int __printf(2, 3) debugfs_change_name(struct dentry *dentry, + const char *fmt, ...) 
{ - return ERR_PTR(-ENODEV); + return -ENODEV; } static inline void debugfs_create_u8(const char *name, umode_t mode, diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 4a85b94d12ce..794bd433cce0 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -195,8 +195,6 @@ int shrinker_debugfs_add(struct shrinker *shrinker) int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) { - struct dentry *entry; - char buf[128]; const char *new, *old; va_list ap; int ret = 0; @@ -213,18 +211,8 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) old = shrinker->name; shrinker->name = new; - if (shrinker->debugfs_entry) { - snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, - shrinker->debugfs_id); - - entry = debugfs_rename(shrinker_debugfs_root, - shrinker->debugfs_entry, - shrinker_debugfs_root, buf); - if (IS_ERR(entry)) - ret = PTR_ERR(entry); - else - shrinker->debugfs_entry = entry; - } + ret = debugfs_change_name(shrinker->debugfs_entry, "%s-%d", + shrinker->name, shrinker->debugfs_id); mutex_unlock(&shrinker_mutex); diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 1a195efc79cd..5b2cfac3b2ba 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -57,14 +57,11 @@ DEFINE_SHOW_ATTRIBUTE(hsr_node_table); void hsr_debugfs_rename(struct net_device *dev) { struct hsr_priv *priv = netdev_priv(dev); - struct dentry *d; + int err; - d = debugfs_rename(hsr_debugfs_root_dir, priv->node_tbl_root, - hsr_debugfs_root_dir, dev->name); - if (IS_ERR(d)) + err = debugfs_change_name(priv->node_tbl_root, "%s", dev->name); + if (err) netdev_warn(dev, "failed to rename\n"); - else - priv->node_tbl_root = d; } /* hsr_debugfs_init - create hsr node_table file for dumping diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index a9bc2fd59f55..9fa38c489edc 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -1025,16 +1025,7 @@ void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata) void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata) { - struct dentry *dir; - char buf[10 + IFNAMSIZ]; - - dir = sdata->vif.debugfs_dir; - - if (IS_ERR_OR_NULL(dir)) - return; - - sprintf(buf, "netdev:%s", sdata->name); - debugfs_rename(dir->d_parent, dir, dir->d_parent, buf); + debugfs_change_name(sdata->vif.debugfs_dir, "netdev:%s", sdata->name); } void ieee80211_debugfs_recreate_netdev(struct ieee80211_sub_if_data *sdata, diff --git a/net/wireless/core.c b/net/wireless/core.c index afbdc549fb4a..9130cb872ed3 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -143,10 +143,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, if (result) return result; - if (!IS_ERR_OR_NULL(rdev->wiphy.debugfsdir)) - debugfs_rename(rdev->wiphy.debugfsdir->d_parent, - rdev->wiphy.debugfsdir, - rdev->wiphy.debugfsdir->d_parent, newname); + debugfs_change_name(rdev->wiphy.debugfsdir, "%s", newname); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); -- cgit v1.2.3 From 3b7d93db450e9d8ead80d75e2a303248f1528c35 Mon Sep 17 00:00:00 2001 From: Antoine Viallon Date: Tue, 14 Jan 2025 23:45:14 +0100 Subject: ceph: fix memory leak in ceph_mds_auth_match() We now free the temporary target path substring allocation on every possible branch, instead of omitting the default branch. In some cases, a memory leak occured, which could rapidly crash the system (depending on how many file accesses were attempted). 
This was detected in production because it caused a continuous memory growth, eventually triggering kernel OOM and completely hard-locking the kernel. Relevant kmemleak stacktrace: unreferenced object 0xffff888131e69900 (size 128): comm "git", pid 66104, jiffies 4295435999 hex dump (first 32 bytes): 76 6f 6c 75 6d 65 73 2f 63 6f 6e 74 61 69 6e 65 volumes/containe 72 73 2f 67 69 74 65 61 2f 67 69 74 65 61 2f 67 rs/gitea/gitea/g backtrace (crc 2f3bb450): [] __kmalloc_noprof+0x359/0x510 [] ceph_mds_check_access+0x5bf/0x14e0 [ceph] [] ceph_open+0x312/0xd80 [ceph] [] do_dentry_open+0x456/0x1120 [] vfs_open+0x79/0x360 [] path_openat+0x1de5/0x4390 [] do_filp_open+0x19c/0x3c0 [] do_sys_openat2+0x141/0x180 [] __x64_sys_open+0xe5/0x1a0 [] do_syscall_64+0xb7/0x210 [] entry_SYSCALL_64_after_hwframe+0x77/0x7f It can be triggered by mouting a subdirectory of a CephFS filesystem, and then trying to access files on this subdirectory with an auth token using a path-scoped capability: $ ceph auth get client.services [client.services] key = REDACTED caps mds = "allow rw fsname=cephfs path=/volumes/" caps mon = "allow r fsname=cephfs" caps osd = "allow rw tag cephfs data=cephfs" $ cat /proc/self/mounts services@[REDACTED].cephfs=/volumes/containers /ceph/containers ceph rw,noatime,name=services,secret=,ms_mode=prefer-crc,mount_timeout=300,acl,mon_addr=[REDACTED]:3300,recover_session=clean 0 0 $ seq 1 1000000 | xargs -P32 --replace={} touch /ceph/containers/file-{} && \ seq 1 1000000 | xargs -P32 --replace={} cat /ceph/containers/file-{} [ idryomov: combine if statements, rename rc to path_matched and make it a bool, formatting ] Cc: stable@vger.kernel.org Fixes: 596afb0b8933 ("ceph: add ceph_mds_check_access() helper") Signed-off-by: Antoine Viallon Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 785fe489ef4b..ae37f0e24c99 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5690,18 +5690,18 @@ static int ceph_mds_auth_match(struct ceph_mds_client *mdsc, * * All the other cases --> mismatch */ + bool path_matched = true; char *first = strstr(_tpath, auth->match.path); - if (first != _tpath) { - if (free_tpath) - kfree(_tpath); - return 0; + if (first != _tpath || + (tlen > len && _tpath[len] != '/')) { + path_matched = false; } - if (tlen > len && _tpath[len] != '/') { - if (free_tpath) - kfree(_tpath); + if (free_tpath) + kfree(_tpath); + + if (!path_matched) return 0; - } } } -- cgit v1.2.3 From ff0b7ed607e779f0e109f7f24388e0ce07af2ebe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Jan 2025 12:17:28 -0500 Subject: bcachefs: Fix check_inode_hash_info_matches_root() Can't use memcmp() when the struct contains padding. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/str_hash.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index f5977c5c6743..8c2c5539de2e 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -167,10 +167,19 @@ found:; goto err; struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); - if (memcmp(hash_info, &hash2, sizeof(hash2))) { + if (hash_info->type != hash2.type || + memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) { ret = repair_inode_hash_info(trans, &inode); if (!ret) { - bch_err(c, "inode hash info mismatch with root, but mismatch not found"); + bch_err(c, "inode hash info mismatch with root, but mismatch not found\n" + "%u %llx %llx\n" + "%u %llx %llx", + hash_info->type, + hash_info->siphash_key.k0, + hash_info->siphash_key.k1, + hash2.type, + hash2.siphash_key.k0, + hash2.siphash_key.k1); ret = -BCH_ERR_fsck_repair_unimplemented; } } -- cgit v1.2.3 From fa2f9906a7b333ba757a7dbae0713d8a5396186e Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 14 Jan 2025 12:48:48 -0300 Subject: smb: client: fix double free of TCP_Server_Info::hostname When shutting down the server in cifs_put_tcp_session(), cifsd thread might be reconnecting to multiple DFS targets before it realizes it should exit the loop, so @server->hostname can't be freed as long as cifsd thread isn't done. Otherwise the following can happen: RIP: 0010:__slab_free+0x223/0x3c0 Code: 5e 41 5f c3 cc cc cc cc 4c 89 de 4c 89 cf 44 89 44 24 08 4c 89 1c 24 e8 fb cf 8e 00 44 8b 44 24 08 4c 8b 1c 24 e9 5f fe ff ff <0f> 0b 41 f7 45 08 00 0d 21 00 0f 85 2d ff ff ff e9 1f ff ff ff 80 RSP: 0018:ffffb26180dbfd08 EFLAGS: 00010246 RAX: ffff8ea34728e510 RBX: ffff8ea34728e500 RCX: 0000000000800068 RDX: 0000000000800068 RSI: 0000000000000000 RDI: ffff8ea340042400 RBP: ffffe112041ca380 R08: 0000000000000001 R09: 0000000000000000 R10: 6170732e31303000 R11: 70726f632e786563 R12: ffff8ea34728e500 R13: ffff8ea340042400 R14: ffff8ea34728e500 R15: 0000000000800068 FS: 0000000000000000(0000) GS:ffff8ea66fd80000(0000) 000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffc25376080 CR3: 000000012a2ba001 CR4: PKRU: 55555554 Call Trace: ? show_trace_log_lvl+0x1c4/0x2df ? show_trace_log_lvl+0x1c4/0x2df ? __reconnect_target_unlocked+0x3e/0x160 [cifs] ? __die_body.cold+0x8/0xd ? die+0x2b/0x50 ? do_trap+0xce/0x120 ? __slab_free+0x223/0x3c0 ? do_error_trap+0x65/0x80 ? __slab_free+0x223/0x3c0 ? exc_invalid_op+0x4e/0x70 ? __slab_free+0x223/0x3c0 ? asm_exc_invalid_op+0x16/0x20 ? __slab_free+0x223/0x3c0 ? extract_hostname+0x5c/0xa0 [cifs] ? extract_hostname+0x5c/0xa0 [cifs] ? __kmalloc+0x4b/0x140 __reconnect_target_unlocked+0x3e/0x160 [cifs] reconnect_dfs_server+0x145/0x430 [cifs] cifs_handle_standard+0x1ad/0x1d0 [cifs] cifs_demultiplex_thread+0x592/0x730 [cifs] ? __pfx_cifs_demultiplex_thread+0x10/0x10 [cifs] kthread+0xdd/0x100 ? 
__pfx_kthread+0x10/0x10 ret_from_fork+0x29/0x50 Fixes: 7be3248f3139 ("cifs: To match file servers, make sure the server hostname matches") Reported-by: Jay Shin Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index ddcc9e514a0e..eaa6be4456d0 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1044,6 +1044,7 @@ clean_demultiplex_info(struct TCP_Server_Info *server) /* Release netns reference for this server. */ put_net(cifs_net_ns(server)); kfree(server->leaf_fullpath); + kfree(server->hostname); kfree(server); length = atomic_dec_return(&tcpSesAllocCount); @@ -1670,8 +1671,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) kfree_sensitive(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; - kfree(server->hostname); - server->hostname = NULL; task = xchg(&server->tsk, NULL); if (task) -- cgit v1.2.3 From b0fce54b8c0d8e5f2b4c243c803c5996e73baee8 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Mon, 6 Jan 2025 22:06:40 +0800 Subject: ocfs2: check dir i_size in ocfs2_find_entry syz reports an out of bounds read: ================================================================== BUG: KASAN: slab-out-of-bounds in ocfs2_match fs/ocfs2/dir.c:334 [inline] BUG: KASAN: slab-out-of-bounds in ocfs2_search_dirblock+0x283/0x6e0 fs/ocfs2/dir.c:367 Read of size 1 at addr ffff88804d8b9982 by task syz-executor.2/14802 CPU: 0 UID: 0 PID: 14802 Comm: syz-executor.2 Not tainted 6.13.0-rc4 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Sched_ext: serialise (enabled+all), task: runnable_at=-10ms Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x229/0x350 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x164/0x530 mm/kasan/report.c:489 kasan_report+0x147/0x180 mm/kasan/report.c:602 ocfs2_match fs/ocfs2/dir.c:334 [inline] ocfs2_search_dirblock+0x283/0x6e0 fs/ocfs2/dir.c:367 ocfs2_find_entry_id fs/ocfs2/dir.c:414 [inline] ocfs2_find_entry+0x1143/0x2db0 fs/ocfs2/dir.c:1078 ocfs2_find_files_on_disk+0x18e/0x530 fs/ocfs2/dir.c:1981 ocfs2_lookup_ino_from_name+0xb6/0x110 fs/ocfs2/dir.c:2003 ocfs2_lookup+0x30a/0xd40 fs/ocfs2/namei.c:122 lookup_open fs/namei.c:3627 [inline] open_last_lookups fs/namei.c:3748 [inline] path_openat+0x145a/0x3870 fs/namei.c:3984 do_filp_open+0xe9/0x1c0 fs/namei.c:4014 do_sys_openat2+0x135/0x1d0 fs/open.c:1402 do_sys_open fs/open.c:1417 [inline] __do_sys_openat fs/open.c:1433 [inline] __se_sys_openat fs/open.c:1428 [inline] __x64_sys_openat+0x15d/0x1c0 fs/open.c:1428 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf6/0x210 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f01076903ad Code: c3 e8 a7 2b 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f01084acfc8 EFLAGS: 00000246 ORIG_RAX: 0000000000000101 RAX: ffffffffffffffda RBX: 00007f01077cbf80 RCX: 00007f01076903ad RDX: 0000000000105042 RSI: 0000000020000080 RDI: ffffffffffffff9c RBP: 00007f01077cbf80 R08: 0000000000000000 R09: 0000000000000000 R10: 00000000000001ff R11: 0000000000000246 R12: 0000000000000000 R13: 00007f01077cbf80 R14: 00007f010764fc90 R15: 00007f010848d000 
================================================================== And a general protection fault in ocfs2_prepare_dir_for_insert: ================================================================== loop0: detected capacity change from 0 to 32768 JBD2: Ignoring recovery information on journal ocfs2: Mounting device (7,0) on (node local, slot 0) with ordered data mode. Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 0 UID: 0 PID: 5096 Comm: syz-executor792 Not tainted 6.11.0-rc4-syzkaller-00002-gb0da640826ba #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:ocfs2_find_dir_space_id fs/ocfs2/dir.c:3406 [inline] RIP: 0010:ocfs2_prepare_dir_for_insert+0x3309/0x5c70 fs/ocfs2/dir.c:4280 Code: 00 00 e8 2a 25 13 fe e9 ba 06 00 00 e8 20 25 13 fe e9 4f 01 00 00 e8 16 25 13 fe 49 8d 7f 08 49 8d 5f 09 48 89 f8 48 c1 e8 03 <42> 0f b6 04 20 84 c0 0f 85 bd 23 00 00 48 89 d8 48 c1 e8 03 42 0f RSP: 0018:ffffc9000af9f020 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000009 RCX: ffff88801e27a440 RDX: 0000000000000000 RSI: 0000000000000400 RDI: 0000000000000008 RBP: ffffc9000af9f830 R08: ffffffff8380395b R09: ffffffff838090a7 R10: 0000000000000002 R11: ffff88801e27a440 R12: dffffc0000000000 R13: ffff88803c660878 R14: f700000000000088 R15: 0000000000000000 FS: 000055555a677380(0000) GS:ffff888020800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000560bce569178 CR3: 000000001de5a000 CR4: 0000000000350ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ocfs2_mknod+0xcaf/0x2b40 fs/ocfs2/namei.c:292 vfs_mknod+0x36d/0x3b0 fs/namei.c:4088 do_mknodat+0x3ec/0x5b0 __do_sys_mknodat fs/namei.c:4166 [inline] __se_sys_mknodat fs/namei.c:4163 [inline] __x64_sys_mknodat+0xa7/0xc0 fs/namei.c:4163 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f2dafda3a99 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 f1 17 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffe336a6658 EFLAGS: 00000246 ORIG_RAX: 0000000000000103 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f2dafda3a99 RDX: 00000000000021c0 RSI: 0000000020000040 RDI: 00000000ffffff9c RBP: 00007f2dafe1b5f0 R08: 0000000000004480 R09: 000055555a6784c0 R10: 0000000000000103 R11: 0000000000000246 R12: 00007ffe336a6680 R13: 00007ffe336a68a8 R14: 431bde82d7b634db R15: 00007f2dafdec03b ================================================================== The two reports are both caused by an invalid negative i_size of a dir inode. For ocfs2, the i_size of a dir inode can't be negative or zero. Here add a check in ocfs2_find_entry(), which is called by ocfs2_check_dir_for_entry(). It fixes the second report, as ocfs2_check_dir_for_entry() must be called before ocfs2_prepare_dir_for_insert(). Also set an upper limit for dirs with OCFS2_INLINE_DATA_FL: their i_size can't be greater than the blocksize.
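Condensed to its core (a simplified restatement of the diff that follows, with the error logging and goto-based unwinding omitted), the added validation in the unindexed lookup path amounts to:

        /* sketch of the checks added to ocfs2_find_entry(); see the diff below */
        if (i_size_read(dir) <= 0)
                return -EFSCORRUPTED;           /* dir i_size must be positive */

        if ((OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
            i_size_read(dir) > dir->i_sb->s_blocksize)
                return -EFSCORRUPTED;           /* an inline dir must fit in one block */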
Link: https://lkml.kernel.org/r/20250106140640.92260-1-glass.su@suse.com Reported-by: Jiacheng Xu Link: https://lore.kernel.org/ocfs2-devel/17a04f01.1ae74.19436d003fc.Coremail.stitch@zju.edu.cn/T/#u Reported-by: syzbot+5a64828fcc4c2ad9b04f@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000005894f3062018caf1@google.com/T/ Signed-off-by: Su Yue Reviewed-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dir.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 213206ebdd58..7799f4d16ce9 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1065,26 +1065,39 @@ int ocfs2_find_entry(const char *name, int namelen, { struct buffer_head *bh; struct ocfs2_dir_entry *res_dir = NULL; + int ret = 0; if (ocfs2_dir_indexed(dir)) return ocfs2_find_entry_dx(name, namelen, dir, lookup); + if (unlikely(i_size_read(dir) <= 0)) { + ret = -EFSCORRUPTED; + mlog_errno(ret); + goto out; + } /* * The unindexed dir code only uses part of the lookup * structure, so there's no reason to push it down further * than this. */ - if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (unlikely(i_size_read(dir) > dir->i_sb->s_blocksize)) { + ret = -EFSCORRUPTED; + mlog_errno(ret); + goto out; + } bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); - else + } else { bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir); + } if (bh == NULL) return -ENOENT; lookup->dl_leaf_bh = bh; lookup->dl_entry = res_dir; - return 0; +out: + return ret; } /* @@ -2010,6 +2023,7 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, * * Return 0 if the name does not exist * Return -EEXIST if the directory contains the name + * Return -EFSCORRUPTED if found corruption * * Callers should have i_rwsem + a cluster lock on dir */ @@ -2023,9 +2037,12 @@ int ocfs2_check_dir_for_entry(struct inode *dir, trace_ocfs2_check_dir_for_entry( (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); - if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) { + ret = ocfs2_find_entry(name, namelen, dir, &lookup); + if (ret == 0) { ret = -EEXIST; mlog_errno(ret); + } else if (ret == -ENOENT) { + ret = 0; } ocfs2_free_dir_lookup_result(&lookup); -- cgit v1.2.3 From fe4b4188ebc29df1f082e79739a0acd0f5646b7f Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 12 Jan 2025 15:16:37 +0000 Subject: ksmbd: Remove unused functions ksmbd_rpc_rap() was added in 2021 as part of commit 0626e6641f6b ("cifsd: add server handler for central processing and tranport layers") ksmbd_vfs_posix_lock_wait_timeout() was added in 2021 as part of commit f44158485826 ("cifsd: add file operations") both have remained unused. Remove them. Signed-off-by: Dr. 
David Alan Gilbert Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_ipc.c | 25 ------------------------- fs/smb/server/transport_ipc.h | 2 -- fs/smb/server/vfs.c | 7 ------- fs/smb/server/vfs.h | 1 - 4 files changed, 35 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index befaf42b84cc..e5049ea1d8b2 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -871,31 +871,6 @@ struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle return resp; } -struct ksmbd_rpc_command *ksmbd_rpc_rap(struct ksmbd_session *sess, void *payload, - size_t payload_sz) -{ - struct ksmbd_ipc_msg *msg; - struct ksmbd_rpc_command *req; - struct ksmbd_rpc_command *resp; - - msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command) + payload_sz + 1); - if (!msg) - return NULL; - - msg->type = KSMBD_EVENT_RPC_REQUEST; - req = (struct ksmbd_rpc_command *)msg->payload; - req->handle = ksmbd_acquire_id(&ipc_ida); - req->flags = rpc_context_flags(sess); - req->flags |= KSMBD_RPC_RAP_METHOD; - req->payload_sz = payload_sz; - memcpy(req->payload, payload, payload_sz); - - resp = ipc_msg_send_request(msg, req->handle); - ipc_msg_handle_free(req->handle); - ipc_msg_free(msg); - return resp; -} - static int __ipc_heartbeat(void) { unsigned long delta; diff --git a/fs/smb/server/transport_ipc.h b/fs/smb/server/transport_ipc.h index d9b6737f8cd0..e51850f1423b 100644 --- a/fs/smb/server/transport_ipc.h +++ b/fs/smb/server/transport_ipc.h @@ -41,8 +41,6 @@ struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle); struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle, void *payload, size_t payload_sz); -struct ksmbd_rpc_command *ksmbd_rpc_rap(struct ksmbd_session *sess, void *payload, - size_t payload_sz); void ksmbd_ipc_release(void); void ksmbd_ipc_soft_reset(void); int ksmbd_ipc_init(void); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 40f08eac519c..6890016e1923 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -1856,13 +1856,6 @@ void ksmbd_vfs_posix_lock_wait(struct file_lock *flock) wait_event(flock->c.flc_wait, !flock->c.flc_blocker); } -int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout) -{ - return wait_event_interruptible_timeout(flock->c.flc_wait, - !flock->c.flc_blocker, - timeout); -} - void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) { locks_delete_block(flock); diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index 06903024a2d8..2893f59803a6 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -140,7 +140,6 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, struct dentry *dentry, struct ksmbd_kstat *ksmbd_kstat); void ksmbd_vfs_posix_lock_wait(struct file_lock *flock); -int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout); void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock); int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, const struct path *path); -- cgit v1.2.3 From b2d99376c5d61eb60ffdb6c503e4b6c8f9712ddd Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 10 Jan 2025 13:37:05 +0900 Subject: ksmbd: browse interfaces list on FSCTL_QUERY_INTERFACE_INFO IOCTL ksmbd.mount will give each interfaces list and bind_interfaces_only flags to ksmbd server. Previously, the interfaces list was sent only when bind_interfaces_only was enabled. 
ksmbd server browse only interfaces list given from ksmbd.conf on FSCTL_QUERY_INTERFACE_INFO IOCTL. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/ksmbd_netlink.h | 3 +- fs/smb/server/server.h | 1 + fs/smb/server/smb2pdu.c | 4 +++ fs/smb/server/transport_ipc.c | 1 + fs/smb/server/transport_tcp.c | 73 +++++++++++++++++++------------------------ fs/smb/server/transport_tcp.h | 1 + 6 files changed, 41 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index 3d01d9d15293..3f07a612c05b 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -111,7 +111,8 @@ struct ksmbd_startup_request { __u32 smb2_max_credits; /* MAX credits */ __u32 smbd_max_io_size; /* smbd read write size */ __u32 max_connections; /* Number of maximum simultaneous connections */ - __u32 reserved[126]; /* Reserved room */ + __s8 bind_interfaces_only; + __s8 reserved[503]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; diff --git a/fs/smb/server/server.h b/fs/smb/server/server.h index 94187628ff08..995555febe7d 100644 --- a/fs/smb/server/server.h +++ b/fs/smb/server/server.h @@ -46,6 +46,7 @@ struct ksmbd_server_config { char *conf[SERVER_CONF_WORK_GROUP + 1]; struct task_struct *dh_task; + bool bind_interfaces_only; }; extern struct ksmbd_server_config server_conf; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 772deec5b90f..f1efcd027475 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -38,6 +38,7 @@ #include "mgmt/user_session.h" #include "mgmt/ksmbd_ida.h" #include "ndr.h" +#include "transport_tcp.h" static void __wbuf(struct ksmbd_work *work, void **req, void **rsp) { @@ -7759,6 +7760,9 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, if (netdev->type == ARPHRD_LOOPBACK) continue; + if (!ksmbd_find_netdev_name_iface_list(netdev->name)) + continue; + flags = dev_get_flags(netdev); if (!(flags & IFF_RUNNING)) continue; diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index e5049ea1d8b2..c0bb8c7722d7 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -333,6 +333,7 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); ret |= ksmbd_set_work_group(req->work_group); + server_conf.bind_interfaces_only = req->bind_interfaces_only; ret |= ksmbd_tcp_set_interfaces(KSMBD_STARTUP_CONFIG_INTERFACES(req), req->ifc_list_sz); if (ret) { diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 0d9007285e30..7f38a3c3f5bd 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -504,32 +504,37 @@ out_clear: return ret; } +struct interface *ksmbd_find_netdev_name_iface_list(char *netdev_name) +{ + struct interface *iface; + + list_for_each_entry(iface, &iface_list, entry) + if (!strcmp(iface->name, netdev_name)) + return iface; + return NULL; +} + static int ksmbd_netdev_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct interface *iface; - int ret, found = 0; + int ret; switch (event) { case NETDEV_UP: if (netif_is_bridge_port(netdev)) return NOTIFY_OK; - list_for_each_entry(iface, &iface_list, entry) { - if (!strcmp(iface->name, netdev->name)) { - found = 1; - if (iface->state != IFACE_STATE_DOWN) - break; 
- ksmbd_debug(CONN, "netdev-up event: netdev(%s) is going up\n", - iface->name); - ret = create_socket(iface); - if (ret) - return NOTIFY_OK; - break; - } + iface = ksmbd_find_netdev_name_iface_list(netdev->name); + if (iface && iface->state == IFACE_STATE_DOWN) { + ksmbd_debug(CONN, "netdev-up event: netdev(%s) is going up\n", + iface->name); + ret = create_socket(iface); + if (ret) + return NOTIFY_OK; } - if (!found && bind_additional_ifaces) { + if (!iface && bind_additional_ifaces) { iface = alloc_iface(kstrdup(netdev->name, KSMBD_DEFAULT_GFP)); if (!iface) return NOTIFY_OK; @@ -541,21 +546,19 @@ static int ksmbd_netdev_event(struct notifier_block *nb, unsigned long event, } break; case NETDEV_DOWN: - list_for_each_entry(iface, &iface_list, entry) { - if (!strcmp(iface->name, netdev->name) && - iface->state == IFACE_STATE_CONFIGURED) { - ksmbd_debug(CONN, "netdev-down event: netdev(%s) is going down\n", - iface->name); - tcp_stop_kthread(iface->ksmbd_kthread); - iface->ksmbd_kthread = NULL; - mutex_lock(&iface->sock_release_lock); - tcp_destroy_socket(iface->ksmbd_socket); - iface->ksmbd_socket = NULL; - mutex_unlock(&iface->sock_release_lock); - - iface->state = IFACE_STATE_DOWN; - break; - } + iface = ksmbd_find_netdev_name_iface_list(netdev->name); + if (iface && iface->state == IFACE_STATE_CONFIGURED) { + ksmbd_debug(CONN, "netdev-down event: netdev(%s) is going down\n", + iface->name); + tcp_stop_kthread(iface->ksmbd_kthread); + iface->ksmbd_kthread = NULL; + mutex_lock(&iface->sock_release_lock); + tcp_destroy_socket(iface->ksmbd_socket); + iface->ksmbd_socket = NULL; + mutex_unlock(&iface->sock_release_lock); + + iface->state = IFACE_STATE_DOWN; + break; } break; } @@ -624,18 +627,6 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz) int sz = 0; if (!ifc_list_sz) { - struct net_device *netdev; - - rtnl_lock(); - for_each_netdev(&init_net, netdev) { - if (netif_is_bridge_port(netdev)) - continue; - if (!alloc_iface(kstrdup(netdev->name, KSMBD_DEFAULT_GFP))) { - rtnl_unlock(); - return -ENOMEM; - } - } - rtnl_unlock(); bind_additional_ifaces = 1; return 0; } diff --git a/fs/smb/server/transport_tcp.h b/fs/smb/server/transport_tcp.h index e338bebe322f..8c9aa624cfe3 100644 --- a/fs/smb/server/transport_tcp.h +++ b/fs/smb/server/transport_tcp.h @@ -7,6 +7,7 @@ #define __KSMBD_TRANSPORT_TCP_H__ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz); +struct interface *ksmbd_find_netdev_name_iface_list(char *netdev_name); int ksmbd_tcp_init(void); void ksmbd_tcp_destroy(void); -- cgit v1.2.3 From aab98e2dbd648510f8f51b83fbf4721206ccae45 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 15 Jan 2025 09:28:35 +0900 Subject: ksmbd: fix integer overflows on 32 bit systems On 32bit systems the addition operations in ipc_msg_alloc() can potentially overflow leading to memory corruption. Add bounds checking using KSMBD_IPC_MAX_PAYLOAD to avoid overflow. 
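To make the failure mode concrete, here is a minimal userspace sketch (not ksmbd code: HDR_SIZE and MAX_PAYLOAD are made-up stand-ins for the real struct size and KSMBD_IPC_MAX_PAYLOAD, and uint32_t models a 32-bit size_t). The sum header + payload_sz + 1 wraps to a tiny value, so the allocation "succeeds" and the later copy of payload_sz bytes would corrupt memory; rejecting payload_sz against a fixed cap before the addition makes the wrap impossible.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define HDR_SIZE    48u            /* stand-in for sizeof(struct ksmbd_rpc_command) */
#define MAX_PAYLOAD (64u * 1024u)  /* stand-in for KSMBD_IPC_MAX_PAYLOAD */

static void *msg_alloc(uint32_t payload_sz)
{
    /* Bound the attacker-controlled size *before* the addition, so the
     * 32-bit sum below can never wrap. */
    if (payload_sz > MAX_PAYLOAD)
        return NULL;
    return calloc(1, HDR_SIZE + payload_sz + 1);
}

int main(void)
{
    uint32_t evil = UINT32_MAX - 10;   /* attacker-controlled payload size */

    /* Without the check, the 32-bit sum wraps: 48 + (2^32 - 11) + 1 == 38. */
    printf("wrapped size = %u\n", (unsigned)(HDR_SIZE + evil + 1));
    printf("allocation rejected: %s\n", msg_alloc(evil) ? "no" : "yes");
    return 0;
}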
Fixes: 0626e6641f6b ("cifsd: add server handler for central processing and tranport layers") Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_ipc.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index c0bb8c7722d7..0460ebea6ff0 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -627,6 +627,9 @@ ksmbd_ipc_spnego_authen_request(const char *spnego_blob, int blob_len) struct ksmbd_spnego_authen_request *req; struct ksmbd_spnego_authen_response *resp; + if (blob_len > KSMBD_IPC_MAX_PAYLOAD) + return NULL; + msg = ipc_msg_alloc(sizeof(struct ksmbd_spnego_authen_request) + blob_len + 1); if (!msg) @@ -806,6 +809,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle struct ksmbd_rpc_command *req; struct ksmbd_rpc_command *resp; + if (payload_sz > KSMBD_IPC_MAX_PAYLOAD) + return NULL; + msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command) + payload_sz + 1); if (!msg) return NULL; @@ -854,6 +860,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle struct ksmbd_rpc_command *req; struct ksmbd_rpc_command *resp; + if (payload_sz > KSMBD_IPC_MAX_PAYLOAD) + return NULL; + msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command) + payload_sz + 1); if (!msg) return NULL; -- cgit v1.2.3 From 07eae0fa67ca4bbb199ad85645e0f9dfaef931cd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Jan 2025 07:01:41 +0100 Subject: xfs: check for dead buffers in xfs_buf_find_insert Commit 32dd4f9c506b ("xfs: remove a superflous hash lookup when inserting new buffers") converted xfs_buf_find_insert to use rhashtable_lookup_get_insert_fast and thus an operation that returns the existing buffer when an insert would duplicate the hash key. But this code path misses the check for a buffer with a reference count of zero, which could lead to reusing an about to be freed buffer. Fix this by using the same atomic_inc_not_zero pattern as xfs_buf_insert. Fixes: 32dd4f9c506b ("xfs: remove a superflous hash lookup when inserting new buffers") Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Cc: stable@vger.kernel.org # v6.0 Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d9636bff16ce..183428fbc607 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -655,9 +655,8 @@ xfs_buf_find_insert( spin_unlock(&bch->bc_lock); goto out_free_buf; } - if (bp) { + if (bp && atomic_inc_not_zero(&bp->b_hold)) { /* found an existing buffer */ - atomic_inc(&bp->b_hold); spin_unlock(&bch->bc_lock); error = xfs_buf_find_lock(bp, flags); if (error) -- cgit v1.2.3 From ee10f6fcdb961e810d7b16be1285319c15c78ef6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Jan 2025 07:01:42 +0100 Subject: xfs: fix buffer lookup vs release race Since commit 298f34224506 ("xfs: lockless buffer lookup") the buffer lookup fastpath is done without a hash-wide lock (then pag_buf_lock, now bc_lock) and only under RCU protection. But this means that nothing serializes lookups against the temporary 0 reference count for buffers that are added to the LRU after dropping the last regular reference, and a concurrent lookup would fail to find them. Fix this by doing all b_hold modifications under b_lock. 
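The shape of that fix, reduced to a runnable userspace sketch (plain pthreads, not XFS code): a lookup may take a reference only while holding the object's lock, and must treat a zero count as an object already on its way to being freed, which is what the xfs_buf_try_hold() helper added below does for b_hold.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* o->lock stands in for b_lock, o->hold for b_hold. */
struct obj {
    pthread_mutex_t lock;
    unsigned int hold;
};

/* Lookup side: fail if the last reference is already gone. */
static bool obj_try_hold(struct obj *o)
{
    bool ok;

    pthread_mutex_lock(&o->lock);
    ok = o->hold != 0;
    if (ok)
        o->hold++;
    pthread_mutex_unlock(&o->lock);
    return ok;
}

/* Release side: the 1 -> 0 transition happens under the same lock, so a
 * concurrent obj_try_hold() can never resurrect a dying object. */
static bool obj_put(struct obj *o)
{
    bool last;

    pthread_mutex_lock(&o->lock);
    last = --o->hold == 0;
    pthread_mutex_unlock(&o->lock);
    return last;   /* caller would free the object when true */
}

int main(void)
{
    static struct obj o = { .lock = PTHREAD_MUTEX_INITIALIZER, .hold = 1 };

    printf("hold: %s\n", obj_try_hold(&o) ? "ok" : "dead");  /* ok, count 2 */
    obj_put(&o);
    if (obj_put(&o))                                          /* drops to 0 */
        printf("last reference dropped\n");
    printf("hold: %s\n", obj_try_hold(&o) ? "ok" : "dead");  /* dead */
    return 0;
}

(Build with cc -pthread.)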
We're already doing this for release so this "only" ~ doubles the b_lock round trips. We'll later look into the lockref infrastructure to optimize the number of lock round trips again. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 93 ++++++++++++++++++++++++++++-------------------------- fs/xfs/xfs_buf.h | 2 +- fs/xfs/xfs_trace.h | 10 +++--- 3 files changed, 54 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 183428fbc607..7fbdd4b30676 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -127,15 +127,6 @@ __xfs_buf_ioacct_dec( } } -static inline void -xfs_buf_ioacct_dec( - struct xfs_buf *bp) -{ - spin_lock(&bp->b_lock); - __xfs_buf_ioacct_dec(bp); - spin_unlock(&bp->b_lock); -} - /* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer @@ -171,9 +162,9 @@ xfs_buf_stale( atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) - atomic_dec(&bp->b_hold); + bp->b_hold--; - ASSERT(atomic_read(&bp->b_hold) >= 1); + ASSERT(bp->b_hold >= 1); spin_unlock(&bp->b_lock); } @@ -229,14 +220,14 @@ _xfs_buf_alloc( */ flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); - atomic_set(&bp->b_hold, 1); + spin_lock_init(&bp->b_lock); + bp->b_hold = 1; atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); INIT_LIST_HEAD(&bp->b_li_list); sema_init(&bp->b_sema, 0); /* held, no waiters */ - spin_lock_init(&bp->b_lock); bp->b_target = target; bp->b_mount = target->bt_mount; bp->b_flags = flags; @@ -580,6 +571,20 @@ xfs_buf_find_lock( return 0; } +static bool +xfs_buf_try_hold( + struct xfs_buf *bp) +{ + spin_lock(&bp->b_lock); + if (bp->b_hold == 0) { + spin_unlock(&bp->b_lock); + return false; + } + bp->b_hold++; + spin_unlock(&bp->b_lock); + return true; +} + static inline int xfs_buf_lookup( struct xfs_buf_cache *bch, @@ -592,7 +597,7 @@ xfs_buf_lookup( rcu_read_lock(); bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params); - if (!bp || !atomic_inc_not_zero(&bp->b_hold)) { + if (!bp || !xfs_buf_try_hold(bp)) { rcu_read_unlock(); return -ENOENT; } @@ -655,7 +660,7 @@ xfs_buf_find_insert( spin_unlock(&bch->bc_lock); goto out_free_buf; } - if (bp && atomic_inc_not_zero(&bp->b_hold)) { + if (bp && xfs_buf_try_hold(bp)) { /* found an existing buffer */ spin_unlock(&bch->bc_lock); error = xfs_buf_find_lock(bp, flags); @@ -1037,7 +1042,10 @@ xfs_buf_hold( struct xfs_buf *bp) { trace_xfs_buf_hold(bp, _RET_IP_); - atomic_inc(&bp->b_hold); + + spin_lock(&bp->b_lock); + bp->b_hold++; + spin_unlock(&bp->b_lock); } static void @@ -1045,10 +1053,15 @@ xfs_buf_rele_uncached( struct xfs_buf *bp) { ASSERT(list_empty(&bp->b_lru)); - if (atomic_dec_and_test(&bp->b_hold)) { - xfs_buf_ioacct_dec(bp); - xfs_buf_free(bp); + + spin_lock(&bp->b_lock); + if (--bp->b_hold) { + spin_unlock(&bp->b_lock); + return; } + __xfs_buf_ioacct_dec(bp); + spin_unlock(&bp->b_lock); + xfs_buf_free(bp); } static void @@ -1058,51 +1071,40 @@ xfs_buf_rele_cached( struct xfs_buftarg *btp = bp->b_target; struct xfs_perag *pag = bp->b_pag; struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag); - bool release; bool freebuf = false; trace_xfs_buf_rele(bp, _RET_IP_); - ASSERT(atomic_read(&bp->b_hold) > 0); - - /* - * We grab the b_lock here first to serialise racing 
xfs_buf_rele() - * calls. The pag_buf_lock being taken on the last reference only - * serialises against racing lookups in xfs_buf_find(). IOWs, the second - * to last reference we drop here is not serialised against the last - * reference until we take bp->b_lock. Hence if we don't grab b_lock - * first, the last "release" reference can win the race to the lock and - * free the buffer before the second-to-last reference is processed, - * leading to a use-after-free scenario. - */ spin_lock(&bp->b_lock); - release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock); - if (!release) { + ASSERT(bp->b_hold >= 1); + if (bp->b_hold > 1) { /* * Drop the in-flight state if the buffer is already on the LRU * and it holds the only reference. This is racy because we * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT * ensures the decrement occurs only once per-buf. */ - if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) + if (--bp->b_hold == 1 && !list_empty(&bp->b_lru)) __xfs_buf_ioacct_dec(bp); goto out_unlock; } - /* the last reference has been dropped ... */ + /* we are asked to drop the last reference */ + spin_lock(&bch->bc_lock); __xfs_buf_ioacct_dec(bp); if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { /* - * If the buffer is added to the LRU take a new reference to the + * If the buffer is added to the LRU, keep the reference to the * buffer for the LRU and clear the (now stale) dispose list - * state flag + * state flag, else drop the reference. */ - if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) { + if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) bp->b_state &= ~XFS_BSTATE_DISPOSE; - atomic_inc(&bp->b_hold); - } + else + bp->b_hold--; spin_unlock(&bch->bc_lock); } else { + bp->b_hold--; /* * most of the time buffers will already be removed from the * LRU, so optimise that case by checking for the @@ -1748,13 +1750,14 @@ xfs_buftarg_drain_rele( struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); struct list_head *dispose = arg; - if (atomic_read(&bp->b_hold) > 1) { + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + if (bp->b_hold > 1) { /* need to wait, so skip it this pass */ + spin_unlock(&bp->b_lock); trace_xfs_buf_drain_buftarg(bp, _RET_IP_); return LRU_SKIP; } - if (!spin_trylock(&bp->b_lock)) - return LRU_SKIP; /* * clear the LRU reference count so the buffer doesn't get @@ -2093,7 +2096,7 @@ xfs_buf_delwri_queue( */ bp->b_flags |= _XBF_DELWRI_Q; if (list_empty(&bp->b_list)) { - atomic_inc(&bp->b_hold); + xfs_buf_hold(bp); list_add_tail(&bp->b_list, list); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 10bf66e074a0..7e73663c5d4a 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -168,7 +168,7 @@ struct xfs_buf { xfs_daddr_t b_rhash_key; /* buffer cache index */ int b_length; /* size of buffer in BBs */ - atomic_t b_hold; /* reference count */ + unsigned int b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4fe689410eb6..b29462363b81 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -498,7 +498,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->dev = bp->b_target->bt_dev; __entry->bno = xfs_buf_daddr(bp); __entry->nblks = bp->b_length; - __entry->hold = atomic_read(&bp->b_hold); + __entry->hold = bp->b_hold; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; 
@@ -569,7 +569,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, __entry->bno = xfs_buf_daddr(bp); __entry->length = bp->b_length; __entry->flags = flags; - __entry->hold = atomic_read(&bp->b_hold); + __entry->hold = bp->b_hold; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->caller_ip = caller_ip; @@ -612,7 +612,7 @@ TRACE_EVENT(xfs_buf_ioerror, __entry->dev = bp->b_target->bt_dev; __entry->bno = xfs_buf_daddr(bp); __entry->length = bp->b_length; - __entry->hold = atomic_read(&bp->b_hold); + __entry->hold = bp->b_hold; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->error = error; @@ -656,7 +656,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->buf_bno = xfs_buf_daddr(bip->bli_buf); __entry->buf_len = bip->bli_buf->b_length; __entry->buf_flags = bip->bli_buf->b_flags; - __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); + __entry->buf_hold = bip->bli_buf->b_hold; __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); __entry->buf_lockval = bip->bli_buf->b_sema.count; __entry->li_flags = bip->bli_item.li_flags; @@ -4978,7 +4978,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class, __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; __entry->bno = xfs_buf_daddr(bp); __entry->nblks = bp->b_length; - __entry->hold = atomic_read(&bp->b_hold); + __entry->hold = bp->b_hold; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; -- cgit v1.2.3 From 4f3b63e8a8a28e3dcdcf3ff260f57a732a20b92b Mon Sep 17 00:00:00 2001 From: Sentaro Onizuka Date: Tue, 14 Jan 2025 00:14:00 +0900 Subject: fs: Fix return type of do_mount() from long to int Fix the return type of do_mount() function from long to int to match its actual behavior. The function only returns int values, and all callers, including those in fs/namespace.c and arch/alpha/kernel/osf_sys.c, already treat the return value as int. This change improves type consistency across the filesystem code and aligns the function signature with its existing implementation and usage.
Signed-off-by: Sentaro Onizuka Link: https://lore.kernel.org/r/20250113151400.55512-1-sentaro@amazon.com Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- include/linux/mount.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 23e81c2a1e3f..5d808778a3ae 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3835,7 +3835,7 @@ int path_mount(const char *dev_name, struct path *path, data_page); } -long do_mount(const char *dev_name, const char __user *dir_name, +int do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path; diff --git a/include/linux/mount.h b/include/linux/mount.h index 33f17b6e8732..a7b472faec2c 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -114,7 +114,7 @@ extern struct vfsmount *kern_mount(struct file_system_type *); extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); -extern long do_mount(const char *, const char __user *, +int do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern struct vfsmount *collect_mounts(const struct path *); extern void drop_collected_mounts(struct vfsmount *); -- cgit v1.2.3 From 8c32b87c4f885fab3c9b2378a3f855dbf280fbca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:42 +0100 Subject: dcache: use lockref_init for d_lockref Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-7-hch@lst.de Signed-off-by: Christian Brauner --- fs/dcache.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index b4d5e9e1e43d..1a01d7a6a7a9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1681,9 +1681,8 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) /* Make sure we always see the terminating NUL character */ smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ - dentry->d_lockref.count = 1; dentry->d_flags = 0; - spin_lock_init(&dentry->d_lock); + lockref_init(&dentry->d_lockref, 1); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; -- cgit v1.2.3 From 6f86f1465b595864d4e4c58179f2ebcc3dbf5b62 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:43 +0100 Subject: erofs: use lockref_init for pcl->lockref Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-8-hch@lst.de Reviewed-by: Gao Xiang Signed-off-by: Christian Brauner --- fs/erofs/zdata.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 01f147505487..59f143d9744f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -747,8 +747,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (IS_ERR(pcl)) return PTR_ERR(pcl); - spin_lock_init(&pcl->lockref.lock); - pcl->lockref.count = 1; /* one ref for this request */ + lockref_init(&pcl->lockref, 1); /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; -- cgit v1.2.3 From 3e652eba244c222b0ba95a3f6fd79315eb020f73 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:44 +0100 Subject: gfs2: use lockref_init for qd_lockref Signed-off-by: Christoph Hellwig Link: 
https://lore.kernel.org/r/20250115094702.504610-9-hch@lst.de Signed-off-by: Christian Brauner --- fs/gfs2/quota.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 72b48f6f5561..58bc5013ca49 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -236,8 +236,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str return NULL; qd->qd_sbd = sdp; - qd->qd_lockref.count = 0; - spin_lock_init(&qd->qd_lockref.lock); + lockref_init(&qd->qd_lockref, 0); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru); -- cgit v1.2.3 From 120ac1dc322f402544423582234f441d98ea4a6e Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Wed, 15 Jan 2025 13:39:43 +0800 Subject: f2fs: Optimize f2fs_truncate_data_blocks_range() Function f2fs_invalidate_blocks() can process consecutive blocks at a time, so f2fs_truncate_data_blocks_range() is optimized to use the new functionality of f2fs_invalidate_blocks(). Add two variables @blkstart and @blklen, @blkstart records the first address of the consecutive blocks, and @blkstart records the number of consecutive blocks. Signed-off-by: Yi Sun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 81764b10840b..3e06fea9795c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -621,8 +621,11 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) int cluster_index = 0, valid_blocks = 0; int cluster_size = F2FS_I(dn->inode)->i_cluster_size; bool released = !atomic_read(&F2FS_I(dn->inode)->i_compr_blocks); + block_t blkstart; + int blklen = 0; addr = get_dnode_addr(dn->inode, dn->node_page) + ofs; + blkstart = le32_to_cpu(*addr); /* Assumption: truncation starts with cluster */ for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { @@ -638,26 +641,44 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) } if (blkaddr == NULL_ADDR) - continue; + goto next; f2fs_set_data_blkaddr(dn, NULL_ADDR); if (__is_valid_data_blkaddr(blkaddr)) { if (time_to_inject(sbi, FAULT_BLKADDR_CONSISTENCE)) - continue; + goto next; if (!f2fs_is_valid_blkaddr_raw(sbi, blkaddr, DATA_GENERIC_ENHANCE)) - continue; + goto next; if (compressed_cluster) valid_blocks++; } - f2fs_invalidate_blocks(sbi, blkaddr, 1); + if (blkstart + blklen == blkaddr) { + blklen++; + } else { + f2fs_invalidate_blocks(sbi, blkstart, blklen); + blkstart = blkaddr; + blklen = 1; + } if (!released || blkaddr != COMPRESS_ADDR) nr_free++; + + continue; + +next: + if (blklen) + f2fs_invalidate_blocks(sbi, blkstart, blklen); + + blkstart = le32_to_cpu(*(addr + 1)); + blklen = 0; } + if (blklen) + f2fs_invalidate_blocks(sbi, blkstart, blklen); + if (compressed_cluster) f2fs_i_compr_blocks_update(dn->inode, valid_blocks, false); -- cgit v1.2.3 From 4811fee8283f3f7416dd66e2303a55f6761187fc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 8 Jan 2025 18:18:32 +0000 Subject: f2fs: remove blk_finish_plug Let's remove unclear blk_finish_plug. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e9582e06403b..008ecd8fd386 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -511,10 +511,6 @@ static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { WARN_ON_ONCE(is_read_io(bio_op(bio))); - - if (f2fs_lfs_mode(sbi) && current->plug && PAGE_TYPE_ON_MAIN(type)) - blk_finish_plug(current->plug); - trace_f2fs_submit_write_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); submit_bio(bio); -- cgit v1.2.3 From 5c1768b6725049e1fcfc841924d65f2872413000 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 14 Jan 2025 20:34:10 +0800 Subject: f2fs: fix to do sanity check correctly on i_inline_xattr_size syzbot reported an out-of-range access issue as below: UBSAN: array-index-out-of-bounds in fs/f2fs/f2fs.h:3292:19 index 18446744073709550491 is out of range for type '__le32[923]' (aka 'unsigned int[923]') CPU: 0 UID: 0 PID: 5338 Comm: syz.0.0 Not tainted 6.12.0-syzkaller-10689-g7af08b57bcb9 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 ubsan_epilogue lib/ubsan.c:231 [inline] __ubsan_handle_out_of_bounds+0x121/0x150 lib/ubsan.c:429 read_inline_xattr+0x273/0x280 lookup_all_xattrs fs/f2fs/xattr.c:341 [inline] f2fs_getxattr+0x57b/0x13b0 fs/f2fs/xattr.c:533 vfs_getxattr_alloc+0x472/0x5c0 fs/xattr.c:393 ima_read_xattr+0x38/0x60 security/integrity/ima/ima_appraise.c:229 process_measurement+0x117a/0x1fb0 security/integrity/ima/ima_main.c:353 ima_file_check+0xd9/0x120 security/integrity/ima/ima_main.c:572 security_file_post_open+0xb9/0x280 security/security.c:3121 do_open fs/namei.c:3830 [inline] path_openat+0x2ccd/0x3590 fs/namei.c:3987 do_file_open_root+0x3a7/0x720 fs/namei.c:4039 file_open_root+0x247/0x2a0 fs/open.c:1382 do_handle_open+0x85b/0x9d0 fs/fhandle.c:414 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f index: 18446744073709550491 (decimal, unsigned long long) = 0xfffffffffffffb9b (hexadecimal) = -1125 (decimal, long long) UBSAN detects that inline_xattr_addr() tries to access .i_addr[-1125]. w/ below testcase, it can reproduce this bug easily: - mkfs.f2fs -f -O extra_attr,flexible_inline_xattr /dev/sdb - mount -o inline_xattr_size=512 /dev/sdb /mnt/f2fs - touch /mnt/f2fs/file - umount /mnt/f2fs - inject.f2fs --node --mb i_inline --nid 4 --val 0x1 /dev/sdb - inject.f2fs --node --mb i_inline_xattr_size --nid 4 --val 2048 /dev/sdb - mount /dev/sdb /mnt/f2fs - getfattr /mnt/f2fs/file The root cause is if metadata of filesystem and inode were fuzzed as below: - extra_attr feature is enabled - flexible_inline_xattr feature is enabled - ri.i_inline_xattr_size = 2048 - F2FS_EXTRA_ATTR bit in ri.i_inline was not set sanity_check_inode() will skip doing sanity check on fi->i_inline_xattr_size, result in using invalid inline_xattr_size later incorrectly, fix it. Meanwhile, let's fix to check lower boundary for .i_inline_xattr_size w/ MIN_INLINE_XATTR_SIZE like we did in parse_options(). 
There is a related issue reported by syzbot, Qasim Ijaz has anlyzed and fixed it w/ very similar way [1], as discussed, we all agree that it will be better to do sanity check in sanity_check_inode() for fix, so finally, let's fix these two related bugs w/ current patch. Including commit message from Qasim's patch as below, thanks a lot for his contribution. "In f2fs_getxattr(), the function lookup_all_xattrs() allocates a 12-byte (base_size) buffer for an inline extended attribute. However, when __find_inline_xattr() calls __find_xattr(), it uses the macro "list_for_each_xattr(entry, addr)", which starts by calling XATTR_FIRST_ENTRY(addr). This skips a 24-byte struct f2fs_xattr_header at the beginning of the buffer, causing an immediate out-of-bounds read in a 12-byte allocation. The subsequent !IS_XATTR_LAST_ENTRY(entry) check then dereferences memory outside the allocated region, triggering the slab-out-of bounds read. This patch prevents the out-of-bounds read by adding a check to bail out early if inline_size is too small and does not account for the header plus the 4-byte value that IS_XATTR_LAST_ENTRY reads." [1]: https://lore.kernel.org/linux-f2fs-devel/Z32y1rfBY9Qb5ZjM@qasdev.system/ Fixes: 6afc662e68b5 ("f2fs: support flexible inline xattr size") Reported-by: syzbot+69f5379a1717a0b982a1@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-f2fs-devel/674f4e7d.050a0220.17bd51.004f.GAE@google.com Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=f5e74075e096e757bdbf Tested-by: syzbot Tested-by: Qasim Ijaz Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 282fd320bdb3..7de33da8b3ea 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -302,15 +302,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) F2FS_TOTAL_EXTRA_ATTR_SIZE); return false; } - if (f2fs_sb_has_flexible_inline_xattr(sbi) && - f2fs_has_inline_xattr(inode) && - (!fi->i_inline_xattr_size || - fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %lu", - __func__, inode->i_ino, fi->i_inline_xattr_size, - MAX_INLINE_XATTR_SIZE); - return false; - } if (f2fs_sb_has_compression(sbi) && fi->i_flags & F2FS_COMPR_FL && F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, @@ -320,6 +311,16 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } } + if (f2fs_sb_has_flexible_inline_xattr(sbi) && + f2fs_has_inline_xattr(inode) && + (fi->i_inline_xattr_size < MIN_INLINE_XATTR_SIZE || + fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, min: %lu, max: %lu", + __func__, inode->i_ino, fi->i_inline_xattr_size, + MIN_INLINE_XATTR_SIZE, MAX_INLINE_XATTR_SIZE); + return false; + } + if (!f2fs_sb_has_extra_attr(sbi)) { if (f2fs_sb_has_project_quota(sbi)) { f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", -- cgit v1.2.3 From a78c5c2269e39544fab4140105f55fca4502e915 Mon Sep 17 00:00:00 2001 From: Ethan Carter Edwards Date: Sun, 22 Dec 2024 21:23:56 -0500 Subject: fs: erofs: xattr.c change kzalloc to kcalloc Refactor xattr.c to use kcalloc instead of kzalloc when multiplying allocation size by count. This refactor prevents unintentional memory overflows. Discovered by checkpatch.pl. 
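A userspace analogue of why the two-argument form matters (calloc versus malloc here, not the kernel allocators): the count-and-size interface must fail when count * size cannot be represented, while the open-coded multiplication wraps silently and hands back an undersized buffer.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    size_t count = SIZE_MAX / 8 + 2;   /* count * 16 overflows size_t */
    size_t size  = 16;

    void *unsafe = malloc(count * size);   /* product wraps to 16, may "succeed" */
    void *safe   = calloc(count, size);    /* overflow detected, returns NULL */

    printf("malloc(count * size): %s\n", unsafe ? "allocated (far too small)" : "NULL");
    printf("calloc(count, size):  %s\n", safe ? "allocated" : "NULL (overflow rejected)");
    free(unsafe);
    free(safe);
    return 0;
}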
Signed-off-by: Ethan Carter Edwards Link: https://lore.kernel.org/r/i3CLJhMELKzBJr3DaRyv-hP_4m-3Twx0sgBWXW6naZlMtHrIeWr93xOFshX8qZHDrJeSjHMTiUOh8JmBZ9v0AB-S1lIYM_d-vasSRlsF_s4=@ethancedwards.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index a90d7d649739..7940241d9355 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -478,7 +478,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb) if (!sbi->xattr_prefix_count) return 0; - pfs = kzalloc(sbi->xattr_prefix_count * sizeof(*pfs), GFP_KERNEL); + pfs = kcalloc(sbi->xattr_prefix_count, sizeof(*pfs), GFP_KERNEL); if (!pfs) return -ENOMEM; -- cgit v1.2.3 From 54ab25d03eab652a321c3d1566edc2b050d83f2f Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 12 Dec 2024 10:39:48 +0800 Subject: erofs: micro-optimize superblock checksum Just verify the remaining unknown on-disk data instead of allocating a temporary buffer for the whole superblock and zeroing out the checksum field since .magic(EROFS_SUPER_MAGIC_V1) is verified and .checksum(0) is fixed. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212023948.1143038-1-hsiangkao@linux.alibaba.com --- fs/erofs/erofs_fs.h | 3 ++- fs/erofs/super.c | 30 +++++++++++------------------- 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index c8f2ae845bd2..199395ed1c1f 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -9,6 +9,7 @@ #ifndef __EROFS_FS_H #define __EROFS_FS_H +/* to allow for x86 boot sectors and other oddities. */ #define EROFS_SUPER_OFFSET 1024 #define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 @@ -54,7 +55,7 @@ struct erofs_deviceslot { /* erofs on-disk super block (currently 128 bytes) */ struct erofs_super_block { __le32 magic; /* file system magic number */ - __le32 checksum; /* crc32c(super_block) */ + __le32 checksum; /* crc32c to avoid unexpected on-disk overlap */ __le32 feature_compat; __u8 blkszbits; /* filesystem block size in bit shift */ __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */ diff --git a/fs/erofs/super.c b/fs/erofs/super.c index f5956474bfde..1fc5623c3a4d 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -39,29 +39,21 @@ void _erofs_printk(struct super_block *sb, const char *fmt, ...) static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata) { - size_t len = 1 << EROFS_SB(sb)->blkszbits; - struct erofs_super_block *dsb; - u32 expected_crc, crc; + struct erofs_super_block *dsb = sbdata + EROFS_SUPER_OFFSET; + u32 len = 1 << EROFS_SB(sb)->blkszbits, crc; if (len > EROFS_SUPER_OFFSET) len -= EROFS_SUPER_OFFSET; + len -= offsetof(struct erofs_super_block, checksum) + + sizeof(dsb->checksum); - dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, len, GFP_KERNEL); - if (!dsb) - return -ENOMEM; - - expected_crc = le32_to_cpu(dsb->checksum); - dsb->checksum = 0; - /* to allow for x86 boot sectors and other oddities. 
*/ - crc = crc32c(~0, dsb, len); - kfree(dsb); - - if (crc != expected_crc) { - erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected", - crc, expected_crc); - return -EBADMSG; - } - return 0; + /* skip .magic(pre-verified) and .checksum(0) fields */ + crc = crc32c(0x5045B54A, (&dsb->checksum) + 1, len); + if (crc == le32_to_cpu(dsb->checksum)) + return 0; + erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected", + crc, le32_to_cpu(dsb->checksum)); + return -EBADMSG; } static void erofs_inode_init_once(void *ptr) -- cgit v1.2.3 From 9f74ae8c9ac97a79f9d45c92bd8ac8598e17f21f Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 7 Jan 2025 16:28:25 +0800 Subject: erofs: shorten bvecs[] for file-backed mounts BIO_MAX_VECS is too large for __GFP_NOFAIL allocation. We could use a mempool (since BIOs can always proceed), but it seems overly complicated for now. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250107082825.74242-1-hsiangkao@linux.alibaba.com --- fs/erofs/fileio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 33f8539dda4a..0ffd1c63beeb 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -6,7 +6,7 @@ #include struct erofs_fileio_rq { - struct bio_vec bvecs[BIO_MAX_VECS]; + struct bio_vec bvecs[16]; struct bio bio; struct kiocb iocb; struct super_block *sb; @@ -68,7 +68,7 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq), GFP_KERNEL | __GFP_NOFAIL); - bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); + bio_init(&rq->bio, NULL, rq->bvecs, ARRAY_SIZE(rq->bvecs), REQ_OP_READ); rq->iocb.ki_filp = mdev->m_dif->file; rq->sb = mdev->m_sb; return rq; -- cgit v1.2.3 From db902986dee453bfb5835cbc8efa67154ab34caf Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 14 Jan 2025 12:00:58 +0800 Subject: erofs: fix potential return value overflow of z_erofs_shrink_scan() z_erofs_shrink_scan() could return small numbers due to the mistyped `freed`. Although I don't think it has any visible impact. Fixes: 3883a79abd02 ("staging: erofs: introduce VLE decompression support") Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250114040058.459981-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 19ef4ff2a134..f416b73e0ca3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -927,8 +927,7 @@ unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr_shrink) { struct z_erofs_pcluster *pcl; - unsigned int freed = 0; - unsigned long index; + unsigned long index, freed = 0; xa_lock(&sbi->managed_pslots); xa_for_each(&sbi->managed_pslots, index, pcl) { -- cgit v1.2.3 From 2a810ea79cd7a6d5f134ea69ca2ba726e600cbc4 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 14 Jan 2025 11:44:26 +0800 Subject: erofs: simplify z_erofs_load_compact_lcluster() - Get rid of unpack_compacted_index() and fold it into z_erofs_load_compact_lcluster(); - Avoid a goto. 
Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250114034429.431408-1-hsiangkao@linux.alibaba.com --- fs/erofs/zmap.c | 89 +++++++++++++++++++++++---------------------------------- 1 file changed, 36 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 4535f2f0a014..b9e35089c9b8 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -97,17 +97,48 @@ static int get_compacted_la_distance(unsigned int lobits, return d1; } -static int unpack_compacted_index(struct z_erofs_maprecorder *m, - unsigned int amortizedshift, - erofs_off_t pos, bool lookahead) +static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, + unsigned long lcn, bool lookahead) { - struct erofs_inode *const vi = EROFS_I(m->inode); + struct inode *const inode = m->inode; + struct erofs_inode *const vi = EROFS_I(inode); + const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); const unsigned int lclusterbits = vi->z_logical_clusterbits; + const unsigned int totalidx = erofs_iblks(inode); + unsigned int compacted_4b_initial, compacted_2b, amortizedshift; unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; - bool big_pcluster; + bool big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + erofs_off_t pos; u8 *in, type; int i; + if (lcn >= totalidx || lclusterbits > 14) + return -EINVAL; + + m->lcn = lcn; + /* used to align to 32-byte (compacted_2b) alignment */ + compacted_4b_initial = ((32 - ebase % 32) / 4) & 7; + compacted_2b = 0; + if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) && + compacted_4b_initial < totalidx) + compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); + + pos = ebase; + amortizedshift = 2; /* compact_4b */ + if (lcn >= compacted_4b_initial) { + pos += compacted_4b_initial * 4; + lcn -= compacted_4b_initial; + if (lcn < compacted_2b) { + amortizedshift = 1; + } else { + pos += compacted_2b * 2; + lcn -= compacted_2b; + } + } + pos += lcn * (1 << amortizedshift); + + /* figure out the lcluster count in this pack */ if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; else if (1 << amortizedshift == 2 && lclusterbits <= 12) @@ -122,7 +153,6 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, /* it doesn't equal to round_up(..) 
*/ m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); - big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; bytes = pos & ((vcnt << amortizedshift) - 1); @@ -207,53 +237,6 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, return 0; } -static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, - unsigned long lcn, bool lookahead) -{ - struct inode *const inode = m->inode; - struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + - ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - unsigned int totalidx = erofs_iblks(inode); - unsigned int compacted_4b_initial, compacted_2b; - unsigned int amortizedshift; - erofs_off_t pos; - - if (lcn >= totalidx || vi->z_logical_clusterbits > 14) - return -EINVAL; - - m->lcn = lcn; - /* used to align to 32-byte (compacted_2b) alignment */ - compacted_4b_initial = (32 - ebase % 32) / 4; - if (compacted_4b_initial == 32 / 4) - compacted_4b_initial = 0; - - if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) && - compacted_4b_initial < totalidx) - compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); - else - compacted_2b = 0; - - pos = ebase; - if (lcn < compacted_4b_initial) { - amortizedshift = 2; - goto out; - } - pos += compacted_4b_initial * 4; - lcn -= compacted_4b_initial; - - if (lcn < compacted_2b) { - amortizedshift = 1; - goto out; - } - pos += compacted_2b * 2; - lcn -= compacted_2b; - amortizedshift = 2; -out: - pos += lcn * (1 << amortizedshift); - return unpack_compacted_index(m, amortizedshift, pos, lookahead); -} - static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m, unsigned int lcn, bool lookahead) { -- cgit v1.2.3 From 5514d8478b8ef3f0ba1b77beaa65f05c12825143 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 14 Jan 2025 11:44:27 +0800 Subject: erofs: get rid of `z_erofs_next_pcluster_t` It was originally intended for tagged pointer reservation. Now all encoded data can be represented uniformally with `struct z_erofs_pcluster` as described in commit bf1aa03980f4 ("erofs: sunset `struct erofs_workgroup`"), let's drop it too. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250114034429.431408-2-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 80 ++++++++++++++++++++------------------------------------ 1 file changed, 29 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f416b73e0ca3..057d68eaa5d2 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -12,12 +12,6 @@ #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_INLINE_BVECS 2 -/* - * let's leave a type here in case of introducing - * another tagged pointer later. 
- */ -typedef void *z_erofs_next_pcluster_t; - struct z_erofs_bvec { struct page *page; int offset; @@ -48,7 +42,7 @@ struct z_erofs_pcluster { struct lockref lockref; /* A: point to next chained pcluster or TAILs */ - z_erofs_next_pcluster_t next; + struct z_erofs_pcluster *next; /* I: start block address of this pcluster */ erofs_off_t index; @@ -94,12 +88,11 @@ struct z_erofs_pcluster { /* the end of a chain of pclusters */ #define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA) -#define Z_EROFS_PCLUSTER_NIL (NULL) struct z_erofs_decompressqueue { struct super_block *sb; + struct z_erofs_pcluster *head; atomic_t pending_bios; - z_erofs_next_pcluster_t head; union { struct completion done; @@ -493,8 +486,7 @@ struct z_erofs_decompress_frontend { struct page *pagepool; struct page *candidate_bvpage; - struct z_erofs_pcluster *pcl; - z_erofs_next_pcluster_t owned_head; + struct z_erofs_pcluster *pcl, *head; enum z_erofs_pclustermode mode; erofs_off_t headoffset; @@ -504,7 +496,7 @@ struct z_erofs_decompress_frontend { }; #define DECOMPRESS_FRONTEND_INIT(__i) { \ - .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ + .inode = __i, .head = Z_EROFS_PCLUSTER_TAIL, \ .mode = Z_EROFS_PCLUSTER_FOLLOWED } static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) @@ -752,9 +744,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; - - /* new pclusters should be claimed as type 1, primary and followed */ - pcl->next = fe->owned_head; + pcl->next = fe->head; pcl->pageofs_out = map->m_la & ~PAGE_MASK; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; @@ -790,8 +780,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) goto err_out; } } - fe->owned_head = &pcl->next; - fe->pcl = pcl; + fe->head = fe->pcl = pcl; return 0; err_out: @@ -810,7 +799,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) DBG_BUGON(fe->pcl); /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ - DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); + DBG_BUGON(!fe->head); if (!(map->m_flags & EROFS_MAP_META)) { while (1) { @@ -838,10 +827,9 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) if (ret == -EEXIST) { mutex_lock(&fe->pcl->lock); /* check if this pcluster hasn't been linked into any chain. */ - if (cmpxchg(&fe->pcl->next, Z_EROFS_PCLUSTER_NIL, - fe->owned_head) == Z_EROFS_PCLUSTER_NIL) { + if (!cmpxchg(&fe->pcl->next, NULL, fe->head)) { /* .. so it can be attached to our submission chain */ - fe->owned_head = &fe->pcl->next; + fe->head = fe->pcl; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; } else { /* otherwise, it belongs to an inflight chain */ fe->mode = Z_EROFS_PCLUSTER_INFLIGHT; @@ -1393,7 +1381,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, pcl->vcnt = 0; /* pcluster lock MUST be taken before the following line */ - WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); + WRITE_ONCE(pcl->next, NULL); mutex_unlock(&pcl->lock); if (z_erofs_is_inline_pcluster(pcl)) @@ -1411,16 +1399,14 @@ static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, .pagepool = pagepool, .decompressed_secondary_bvecs = LIST_HEAD_INIT(be.decompressed_secondary_bvecs), + .pcl = io->head, }; - z_erofs_next_pcluster_t owned = io->head; + struct z_erofs_pcluster *next; int err = io->eio ? 
-EIO : 0; - while (owned != Z_EROFS_PCLUSTER_TAIL) { - DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); - - be.pcl = container_of(owned, struct z_erofs_pcluster, next); - owned = READ_ONCE(be.pcl->next); - + for (; be.pcl != Z_EROFS_PCLUSTER_TAIL; be.pcl = next) { + DBG_BUGON(!be.pcl); + next = READ_ONCE(be.pcl->next); err = z_erofs_decompress_pcluster(&be, err) ?: err; } return err; @@ -1630,18 +1616,13 @@ enum { NR_JOBQUEUES, }; -static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, - z_erofs_next_pcluster_t qtail[], - z_erofs_next_pcluster_t owned_head) +static void z_erofs_move_to_bypass_queue(struct z_erofs_pcluster *pcl, + struct z_erofs_pcluster *next, + struct z_erofs_pcluster **qtail[]) { - z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; - z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; - WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL); - - WRITE_ONCE(*submit_qtail, owned_head); - WRITE_ONCE(*bypass_qtail, &pcl->next); - + WRITE_ONCE(*qtail[JQ_SUBMIT], next); + WRITE_ONCE(*qtail[JQ_BYPASS], pcl); qtail[JQ_BYPASS] = &pcl->next; } @@ -1676,9 +1657,9 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, { struct super_block *sb = f->inode->i_sb; struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); - z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; + struct z_erofs_pcluster **qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; - z_erofs_next_pcluster_t owned_head = f->owned_head; + struct z_erofs_pcluster *pcl, *next; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ erofs_off_t last_pa; unsigned int nr_bios = 0; @@ -1694,22 +1675,19 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; /* by default, all need io submission */ - q[JQ_SUBMIT]->head = owned_head; + q[JQ_SUBMIT]->head = next = f->head; do { struct erofs_map_dev mdev; - struct z_erofs_pcluster *pcl; erofs_off_t cur, end; struct bio_vec bvec; unsigned int i = 0; bool bypass = true; - DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); - pcl = container_of(owned_head, struct z_erofs_pcluster, next); - owned_head = READ_ONCE(pcl->next); - + pcl = next; + next = READ_ONCE(pcl->next); if (z_erofs_is_inline_pcluster(pcl)) { - move_to_bypass_jobqueue(pcl, qtail, owned_head); + z_erofs_move_to_bypass_queue(pcl, next, qtail); continue; } @@ -1781,8 +1759,8 @@ drain_io: if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; else - move_to_bypass_jobqueue(pcl, qtail, owned_head); - } while (owned_head != Z_EROFS_PCLUSTER_TAIL); + z_erofs_move_to_bypass_queue(pcl, next, qtail); + } while (next != Z_EROFS_PCLUSTER_TAIL); if (bio) { if (erofs_is_fileio_mode(EROFS_SB(sb))) @@ -1814,7 +1792,7 @@ static int z_erofs_runqueue(struct z_erofs_decompress_frontend *f, bool force_fg = z_erofs_is_sync_decompress(sbi, ra_folios); int err; - if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) + if (f->head == Z_EROFS_PCLUSTER_TAIL) return 0; z_erofs_submit_queue(f, io, &force_fg, !!ra_folios); -- cgit v1.2.3 From 6f435e94a19ad25b372bc61443afd0839b8a521c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 14 Jan 2025 11:44:28 +0800 Subject: erofs: tidy up zdata.c All small code style adjustments, no logic changes: - z_erofs_decompress_frontend => z_erofs_frontend; - z_erofs_decompress_backend => z_erofs_backend; - Use Z_EROFS_DEFINE_FRONTEND() to replace DECOMPRESS_FRONTEND_INIT(); - `nr_folios` should be `nrpages` in z_erofs_readahead(); - Refine in-line comments. 
Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250114034429.431408-3-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 111 +++++++++++++++++++++---------------------------------- 1 file changed, 43 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 057d68eaa5d2..72c1d5d27c3a 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -455,31 +455,25 @@ err_decompressor: } enum z_erofs_pclustermode { + /* It has previously been linked into another processing chain */ Z_EROFS_PCLUSTER_INFLIGHT, /* - * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it - * could be dispatched into bypass queue later due to uptodated managed - * pages. All related online pages cannot be reused for inplace I/O (or - * bvpage) since it can be directly decoded without I/O submission. + * A weaker form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it + * may be dispatched to the bypass queue later due to uptodated managed + * folios. All file-backed folios related to this pcluster cannot be + * reused for in-place I/O (or bvpage) since the pcluster may be decoded + * in a separate queue (and thus out of order). */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* - * The pcluster was just linked to a decompression chain by us. It can - * also be linked with the remaining pclusters, which means if the - * processing page is the tail page of a pcluster, this pcluster can - * safely use the whole page (since the previous pcluster is within the - * same chain) for in-place I/O, as illustrated below: - * ___________________________________________________ - * | tail (partial) page | head (partial) page | - * | (of the current pcl) | (of the previous pcl) | - * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| - * - * [ (*) the page above can be used as inplace I/O. ] + * The pcluster has just been linked to our processing chain. + * File-backed folios (except for the head page) related to it can be + * used for in-place I/O (or bvpage). 
*/ Z_EROFS_PCLUSTER_FOLLOWED, }; -struct z_erofs_decompress_frontend { +struct z_erofs_frontend { struct inode *const inode; struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; @@ -495,11 +489,11 @@ struct z_erofs_decompress_frontend { unsigned int icur; }; -#define DECOMPRESS_FRONTEND_INIT(__i) { \ - .inode = __i, .head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = Z_EROFS_PCLUSTER_FOLLOWED } +#define Z_EROFS_DEFINE_FRONTEND(fe, i, ho) struct z_erofs_frontend fe = { \ + .inode = i, .head = Z_EROFS_PCLUSTER_TAIL, \ + .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho } -static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) +static bool z_erofs_should_alloc_cache(struct z_erofs_frontend *fe) { unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; @@ -516,7 +510,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) return false; } -static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) +static void z_erofs_bind_cache(struct z_erofs_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; @@ -673,7 +667,7 @@ int erofs_init_managed_cache(struct super_block *sb) } /* callers must be with pcluster lock held */ -static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, +static int z_erofs_attach_page(struct z_erofs_frontend *fe, struct z_erofs_bvec *bvec, bool exclusive) { struct z_erofs_pcluster *pcl = fe->pcl; @@ -719,7 +713,7 @@ static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl) return true; } -static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) +static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; @@ -789,7 +783,7 @@ err_out: return err; } -static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) +static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; @@ -862,14 +856,9 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) return 0; } -/* - * keep in mind that no referenced pclusters will be freed - * only after a RCU grace period. 
- */ static void z_erofs_rcu_callback(struct rcu_head *head) { - z_erofs_free_pcluster(container_of(head, - struct z_erofs_pcluster, rcu)); + z_erofs_free_pcluster(container_of(head, struct z_erofs_pcluster, rcu)); } static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, @@ -911,8 +900,7 @@ static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, return free; } -unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, - unsigned long nr_shrink) +unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr) { struct z_erofs_pcluster *pcl; unsigned long index, freed = 0; @@ -925,7 +913,7 @@ unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, xa_unlock(&sbi->managed_pslots); ++freed; - if (!--nr_shrink) + if (!--nr) return freed; xa_lock(&sbi->managed_pslots); } @@ -954,7 +942,7 @@ static void z_erofs_put_pcluster(struct erofs_sb_info *sbi, call_rcu(&pcl->rcu, z_erofs_rcu_callback); } -static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) +static void z_erofs_pcluster_end(struct z_erofs_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; @@ -967,13 +955,9 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) if (fe->candidate_bvpage) fe->candidate_bvpage = NULL; - /* - * if all pending pages are added, don't hold its reference - * any longer if the pcluster isn't hosted by ourselves. - */ + /* Drop refcount if it doesn't belong to our processing chain */ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false); - fe->pcl = NULL; } @@ -1002,7 +986,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio, return 0; } -static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f, +static int z_erofs_scan_folio(struct z_erofs_frontend *f, struct folio *folio, bool ra) { struct inode *const inode = f->inode; @@ -1117,7 +1101,7 @@ static bool z_erofs_page_is_invalidated(struct page *page) return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page); } -struct z_erofs_decompress_backend { +struct z_erofs_backend { struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; struct super_block *sb; struct z_erofs_pcluster *pcl; @@ -1137,7 +1121,7 @@ struct z_erofs_bvec_item { struct list_head list; }; -static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, +static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be, struct z_erofs_bvec *bvec) { struct z_erofs_bvec_item *item; @@ -1160,8 +1144,7 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, list_add(&item->list, &be->decompressed_secondary_bvecs); } -static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, - int err) +static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err) { unsigned int off0 = be->pcl->pageofs_out; struct list_head *p, *n; @@ -1202,7 +1185,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, } } -static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) +static void z_erofs_parse_out_bvecs(struct z_erofs_backend *be) { struct z_erofs_pcluster *pcl = be->pcl; struct z_erofs_bvec_iter biter; @@ -1227,8 +1210,7 @@ static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); } -static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, - bool *overlapped) +static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool 
*overlapped) { struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); @@ -1263,8 +1245,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, return err; } -static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, - int err) +static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) { struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; @@ -1394,7 +1375,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, struct page **pagepool) { - struct z_erofs_decompress_backend be = { + struct z_erofs_backend be = { .sb = io->sb, .pagepool = pagepool, .decompressed_secondary_bvecs = @@ -1472,7 +1453,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, } static void z_erofs_fill_bio_vec(struct bio_vec *bvec, - struct z_erofs_decompress_frontend *f, + struct z_erofs_frontend *f, struct z_erofs_pcluster *pcl, unsigned int nr, struct address_space *mc) @@ -1651,7 +1632,7 @@ static void z_erofs_endio(struct bio *bio) bio_put(bio); } -static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, +static void z_erofs_submit_queue(struct z_erofs_frontend *f, struct z_erofs_decompressqueue *fgq, bool *force_fg, bool readahead) { @@ -1784,17 +1765,16 @@ drain_io: z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } -static int z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - unsigned int ra_folios) +static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rapages) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; struct erofs_sb_info *sbi = EROFS_I_SB(f->inode); - bool force_fg = z_erofs_is_sync_decompress(sbi, ra_folios); + bool force_fg = z_erofs_is_sync_decompress(sbi, rapages); int err; if (f->head == Z_EROFS_PCLUSTER_TAIL) return 0; - z_erofs_submit_queue(f, io, &force_fg, !!ra_folios); + z_erofs_submit_queue(f, io, &force_fg, !!rapages); /* handle bypass queue (no i/o pclusters) immediately */ err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); @@ -1812,7 +1792,7 @@ static int z_erofs_runqueue(struct z_erofs_decompress_frontend *f, * Since partial uptodate is still unimplemented for now, we have to use * approximate readmore strategies as a start. 
*/ -static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, +static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, struct readahead_control *rac, bool backmost) { struct inode *inode = f->inode; @@ -1867,12 +1847,10 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, static int z_erofs_read_folio(struct file *file, struct folio *folio) { struct inode *const inode = folio->mapping->host; - struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio)); int err; trace_erofs_read_folio(folio, false); - f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; - z_erofs_pcluster_readmore(&f, NULL, true); err = z_erofs_scan_folio(&f, folio, false); z_erofs_pcluster_readmore(&f, NULL, false); @@ -1892,17 +1870,14 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; - struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac)); struct folio *head = NULL, *folio; - unsigned int nr_folios; + unsigned int nrpages = readahead_count(rac); int err; - f.headoffset = readahead_pos(rac); - z_erofs_pcluster_readmore(&f, rac, true); - nr_folios = readahead_count(rac); - trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false); - + nrpages = readahead_count(rac); + trace_erofs_readpages(inode, readahead_index(rac), nrpages, false); while ((folio = readahead_folio(rac))) { folio->private = head; head = folio; @@ -1921,7 +1896,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, false); z_erofs_pcluster_end(&f); - (void)z_erofs_runqueue(&f, nr_folios); + (void)z_erofs_runqueue(&f, nrpages); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); } -- cgit v1.2.3 From e180b8c4c23beb876927a999c49c18b361fa7975 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 14 Jan 2025 11:44:29 +0800 Subject: erofs: convert z_erofs_bind_cache() to folios The managed cache uses a pseudo inode to keep (necessary) compressed data. Currently, it still uses zero-order folios, so this is just a trivial conversion, except that the use of the pagepool is temporarily dropped. Drop some obsoleted comments too. 
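For illustration only, a minimal sketch (not the erofs code itself; mc, idx and gfp are placeholders) of the lookup-or-allocate pattern this conversion relies on: filemap_get_folio() returns an ERR_PTR() on a cache miss, and filemap_alloc_folio(gfp, 0) supplies the order-0 fallback, while the real z_erofs_bind_cache() additionally tags the new folio and publishes it under the pcluster lock:

	#include <linux/err.h>
	#include <linux/pagemap.h>

	static struct folio *get_or_alloc_cached_folio(struct address_space *mc,
						       pgoff_t idx, gfp_t gfp)
	{
		struct folio *folio = filemap_get_folio(mc, idx);

		if (!IS_ERR(folio))
			return folio;		/* already in the managed cache */

		/* cache miss: optimistically allocate an order-0 folio */
		return filemap_alloc_folio(gfp, 0); /* may be NULL; caller falls back */
	}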
Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250114034429.431408-4-hsiangkao@linux.alibaba.com --- fs/erofs/compress.h | 23 +---------------------- fs/erofs/zdata.c | 51 ++++++++++++++++++++------------------------------- 2 files changed, 21 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 7bfe251680ec..65ff39401020 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -29,29 +29,8 @@ struct z_erofs_decompressor { char *name; }; -/* some special page->private (unsigned long, see below) */ #define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2) -#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2) - -/* - * For all pages in a pcluster, page->private should be one of - * Type Last 2bits page->private - * short-lived page 00 Z_EROFS_SHORTLIVED_PAGE - * preallocated page (tryalloc) 00 Z_EROFS_PREALLOCATED_PAGE - * cached/managed page 00 pointer to z_erofs_pcluster - * online page (file-backed, 01/10/11 sub-index << 2 | count - * some pages can be used for inplace I/O) - * - * page->mapping should be one of - * Type page->mapping - * short-lived page NULL - * preallocated page NULL - * cached/managed page non-NULL or NULL (invalidated/truncated page) - * online page non-NULL - * - * For all managed pages, PG_private should be set with 1 extra refcount, - * which is used for page reclaim / migration. - */ +#define Z_EROFS_PREALLOCATED_FOLIO ((void *)(-2UL << 2)) /* * Currently, short-lived pages are pages directly from buddy system diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 72c1d5d27c3a..aff09f94afb2 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -516,13 +516,11 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe) struct z_erofs_pcluster *pcl = fe->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool shouldalloc = z_erofs_should_alloc_cache(fe); - bool standalone = true; - /* - * optimistic allocation without direct reclaim since inplace I/O - * can be used if low memory otherwise. - */ + bool may_bypass = true; + /* Optimistic allocation, as in-place I/O can be used as a fallback */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + struct folio *folio, *newfolio; unsigned int i; if (i_blocksize(fe->inode) != PAGE_SIZE || @@ -530,47 +528,42 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe) return; for (i = 0; i < pclusterpages; ++i) { - struct page *page, *newpage; - /* Inaccurate check w/o locking to avoid unneeded lookups */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - page = find_get_page(mc, pcl->index + i); - if (!page) { - /* I/O is needed, no possible to decompress directly */ - standalone = false; + folio = filemap_get_folio(mc, pcl->index + i); + if (IS_ERR(folio)) { + may_bypass = false; if (!shouldalloc) continue; /* - * Try cached I/O if allocation succeeds or fallback to - * in-place I/O instead to avoid any direct reclaim. + * Allocate a managed folio for cached I/O, or it may be + * then filled with a file-backed folio for in-place I/O */ - newpage = erofs_allocpage(&fe->pagepool, gfp); - if (!newpage) + newfolio = filemap_alloc_folio(gfp, 0); + if (!newfolio) continue; - set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); + newfolio->private = Z_EROFS_PREALLOCATED_FOLIO; + folio = NULL; } spin_lock(&pcl->lockref.lock); if (!pcl->compressed_bvecs[i].page) { - pcl->compressed_bvecs[i].page = page ? 
page : newpage; + pcl->compressed_bvecs[i].page = + folio_page(folio ?: newfolio, 0); spin_unlock(&pcl->lockref.lock); continue; } spin_unlock(&pcl->lockref.lock); - - if (page) - put_page(page); - else if (newpage) - erofs_pagepool_add(&fe->pagepool, newpage); + folio_put(folio ?: newfolio); } /* - * don't do inplace I/O if all compressed pages are available in - * managed cache since it can be moved to the bypass queue instead. + * Don't perform in-place I/O if all compressed pages are available in + * the managed cache, as the pcluster can be moved to the bypass queue. */ - if (standalone) + if (may_bypass) fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } @@ -1480,12 +1473,8 @@ repeat: DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page)); folio = page_folio(zbv.page); - /* - * Handle preallocated cached folios. We tried to allocate such folios - * without triggering direct reclaim. If allocation failed, inplace - * file-backed folios will be used instead. - */ - if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) { + /* For preallocated managed folios, add them to page cache here */ + if (folio->private == Z_EROFS_PREALLOCATED_FOLIO) { tocache = true; goto out_tocache; } -- cgit v1.2.3 From 41fb0fabc40113769ce53ea85ffd1f4bc87ae03a Mon Sep 17 00:00:00 2001 From: Chen Linxuan Date: Thu, 16 Jan 2025 16:33:02 +0800 Subject: erofs: return SHRINK_EMPTY if no objects to free Comments in file include/linux/shrinker.h says that `count_objects` of `struct shrinker` should return SHRINK_EMPTY when there are no objects to free. > If there are no objects to free, it should return SHRINK_EMPTY, > while 0 is returned in cases of the number of freeable items cannot > be determined or shrinker should skip this cache for this time > (e.g., their number is below shrinkable limit). Signed-off-by: Chen Linxuan Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/149E6E64B5B6B5E8+20250116083303.199817-1-chenlinxuan@uniontech.com [ Gao Xiang: should have no impact since it's not memcg-aware. ] Signed-off-by: Gao Xiang --- fs/erofs/zutil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 0dd65cefce33..83fbcab70a92 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -243,7 +243,7 @@ void erofs_shrinker_unregister(struct super_block *sb) static unsigned long erofs_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - return atomic_long_read(&erofs_global_shrink_cnt); + return atomic_long_read(&erofs_global_shrink_cnt) ?: SHRINK_EMPTY; } static unsigned long erofs_shrink_scan(struct shrinker *shrink, -- cgit v1.2.3 From 5f4e6f7f8b77a3b1fb0005f6e1692475785ae05f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 12 Nov 2024 16:20:45 -0500 Subject: fs/overlayfs/namei.c: get rid of include ../internal.h Added for the sake of vfs_path_lookup(), which is in linux/namei.h these days. Signed-off-by: Al Viro --- fs/overlayfs/namei.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 7e27b7d4adee..600046ebc2f3 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -14,8 +14,6 @@ #include #include "overlayfs.h" -#include "../internal.h" /* for vfs_path_lookup */ - struct ovl_lookup_data { struct super_block *sb; const struct ovl_layer *layer; -- cgit v1.2.3 From 61bc24ac974a4873e3040765e640f62fe99d6226 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Dec 2024 19:15:58 -0500 Subject: make sure that DNAME_INLINE_LEN is a multiple of word size ... 
calling the number of words DNAME_INLINE_WORDS. The next step will be to have a structure to hold inline name arrays (both in dentry and in name_snapshot) and use that to alias the existing arrays of unsigned char there. That will allow both full-structure copies and convenient word-by-word accesses. Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/dcache.c | 4 +--- include/linux/dcache.h | 8 +++++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index b4d5e9e1e43d..ea0f0bea511b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2748,9 +2748,7 @@ static void swap_names(struct dentry *dentry, struct dentry *target) /* * Both are internal. */ - unsigned int i; - BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); - for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { + for (int i = 0; i < DNAME_INLINE_WORDS; i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bff956f7b2b9..42dd89beaf4e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -68,15 +68,17 @@ extern const struct qstr dotdot_name; * large memory footprint increase). */ #ifdef CONFIG_64BIT -# define DNAME_INLINE_LEN 40 /* 192 bytes */ +# define DNAME_INLINE_WORDS 5 /* 192 bytes */ #else # ifdef CONFIG_SMP -# define DNAME_INLINE_LEN 36 /* 128 bytes */ +# define DNAME_INLINE_WORDS 9 /* 128 bytes */ # else -# define DNAME_INLINE_LEN 44 /* 128 bytes */ +# define DNAME_INLINE_WORDS 11 /* 128 bytes */ # endif #endif +#define DNAME_INLINE_LEN (DNAME_INLINE_WORDS*sizeof(unsigned long)) + #define d_lock d_lockref.lock struct dentry { -- cgit v1.2.3 From 58cf9c383c5c686668082f83f7e0f3e0bd5cc2e3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Dec 2024 19:35:36 -0500 Subject: dcache: back inline names with a struct-wrapped array of unsigned long ... so that they can be copied with struct assignment (which generates better code) and accessed word-by-word. The type is union shortname_storage; it's a union of arrays of unsigned char and unsigned long. struct name_snapshot.inline_name turned into union shortname_storage; users (all in fs/dcache.c) adjusted. struct dentry.d_iname has some users outside of fs/dcache.c; to reduce the amount of noise in commit, it is replaced with union shortname_storage d_shortname and d_iname is turned into a macro that expands to d_shortname.string (similar to d_lock handling). That compat macro is temporary - most of the remaining instances will be taken out by debugfs series, and once that is merged and few others are taken care of this will go away. 
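A minimal, self-contained sketch of the idea (names and the word count are illustrative, not the kernel's): wrapping the inline byte array in a union with an unsigned long array lets a name be copied with one struct assignment and swapped or compared word by word without casts:

	/* Illustrative only; the kernel's definitions and swap() helper differ. */
	#define NAME_WORDS	5
	#define NAME_BYTES	(NAME_WORDS * sizeof(unsigned long))

	union short_name {
		unsigned char string[NAME_BYTES];
		unsigned long words[NAME_WORDS];
	};

	static void copy_short_name(union short_name *dst, const union short_name *src)
	{
		*dst = *src;				/* full-structure copy */
	}

	static void swap_short_names(union short_name *a, union short_name *b)
	{
		for (int i = 0; i < NAME_WORDS; i++) {	/* word-by-word access */
			unsigned long tmp = a->words[i];

			a->words[i] = b->words[i];
			b->words[i] = tmp;
		}
	}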
Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/dcache.c | 43 ++++++++++++---------------- include/linux/dcache.h | 10 +++++-- tools/testing/selftests/bpf/progs/find_vma.c | 2 +- 3 files changed, 28 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index ea0f0bea511b..52662a5d08e4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -324,7 +324,7 @@ static void __d_free_external(struct rcu_head *head) static inline int dname_external(const struct dentry *dentry) { - return dentry->d_name.name != dentry->d_iname; + return dentry->d_name.name != dentry->d_shortname.string; } void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) @@ -334,9 +334,8 @@ void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry if (unlikely(dname_external(dentry))) { atomic_inc(&external_name(dentry)->u.count); } else { - memcpy(name->inline_name, dentry->d_iname, - dentry->d_name.len + 1); - name->name.name = name->inline_name; + name->inline_name = dentry->d_shortname; + name->name.name = name->inline_name.string; } spin_unlock(&dentry->d_lock); } @@ -344,7 +343,7 @@ EXPORT_SYMBOL(take_dentry_name_snapshot); void release_dentry_name_snapshot(struct name_snapshot *name) { - if (unlikely(name->name.name != name->inline_name)) { + if (unlikely(name->name.name != name->inline_name.string)) { struct external_name *p; p = container_of(name->name.name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) @@ -1654,10 +1653,10 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) * will still always have a NUL at the end, even if we might * be overwriting an internal NUL character */ - dentry->d_iname[DNAME_INLINE_LEN-1] = 0; + dentry->d_shortname.string[DNAME_INLINE_LEN-1] = 0; if (unlikely(!name)) { name = &slash_name; - dname = dentry->d_iname; + dname = dentry->d_shortname.string; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); struct external_name *p = kmalloc(size + name->len, @@ -1670,7 +1669,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) atomic_set(&p->u.count, 1); dname = p->name; } else { - dname = dentry->d_iname; + dname = dentry->d_shortname.string; } dentry->d_name.len = name->len; @@ -2729,10 +2728,9 @@ static void swap_names(struct dentry *dentry, struct dentry *target) * dentry:internal, target:external. Steal target's * storage and make target internal. */ - memcpy(target->d_iname, dentry->d_name.name, - dentry->d_name.len + 1); dentry->d_name.name = target->d_name.name; - target->d_name.name = target->d_iname; + target->d_shortname = dentry->d_shortname; + target->d_name.name = target->d_shortname.string; } } else { if (unlikely(dname_external(dentry))) { @@ -2740,18 +2738,16 @@ static void swap_names(struct dentry *dentry, struct dentry *target) * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ - memcpy(dentry->d_iname, target->d_name.name, - target->d_name.len + 1); target->d_name.name = dentry->d_name.name; - dentry->d_name.name = dentry->d_iname; + dentry->d_shortname = target->d_shortname; + dentry->d_name.name = dentry->d_shortname.string; } else { /* * Both are internal. 
*/ - for (int i = 0; i < DNAME_INLINE_WORDS; i++) { - swap(((long *) &dentry->d_iname)[i], - ((long *) &target->d_iname)[i]); - } + for (int i = 0; i < DNAME_INLINE_WORDS; i++) + swap(dentry->d_shortname.words[i], + target->d_shortname.words[i]); } } swap(dentry->d_name.hash_len, target->d_name.hash_len); @@ -2766,9 +2762,8 @@ static void copy_name(struct dentry *dentry, struct dentry *target) atomic_inc(&external_name(target)->u.count); dentry->d_name = target->d_name; } else { - memcpy(dentry->d_iname, target->d_name.name, - target->d_name.len + 1); - dentry->d_name.name = dentry->d_iname; + dentry->d_shortname = target->d_shortname; + dentry->d_name.name = dentry->d_shortname.string; dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) @@ -3101,12 +3096,12 @@ void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; - BUG_ON(dentry->d_name.name != dentry->d_iname || + BUG_ON(dname_external(dentry) || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", + dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); @@ -3194,7 +3189,7 @@ static void __init dcache_init(void) */ dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, - d_iname); + d_shortname.string); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 42dd89beaf4e..8bc567a35718 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -79,7 +79,13 @@ extern const struct qstr dotdot_name; #define DNAME_INLINE_LEN (DNAME_INLINE_WORDS*sizeof(unsigned long)) +union shortname_store { + unsigned char string[DNAME_INLINE_LEN]; + unsigned long words[DNAME_INLINE_WORDS]; +}; + #define d_lock d_lockref.lock +#define d_iname d_shortname.string struct dentry { /* RCU lookup touched fields */ @@ -90,7 +96,7 @@ struct dentry { struct qstr d_name; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ - unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ + union shortname_store d_shortname; /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */ /* Ref lookup also touches following */ @@ -591,7 +597,7 @@ static inline struct inode *d_real_inode(const struct dentry *dentry) struct name_snapshot { struct qstr name; - unsigned char inline_name[DNAME_INLINE_LEN]; + union shortname_store inline_name; }; void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *); void release_dentry_name_snapshot(struct name_snapshot *); diff --git a/tools/testing/selftests/bpf/progs/find_vma.c b/tools/testing/selftests/bpf/progs/find_vma.c index 38034fb82530..02b82774469c 100644 --- a/tools/testing/selftests/bpf/progs/find_vma.c +++ b/tools/testing/selftests/bpf/progs/find_vma.c @@ -25,7 +25,7 @@ static long check_vma(struct task_struct *task, struct vm_area_struct *vma, { if (vma->vm_file) bpf_probe_read_kernel_str(d_iname, DNAME_INLINE_LEN - 1, - vma->vm_file->f_path.dentry->d_iname); + vma->vm_file->f_path.dentry->d_shortname.string); /* check for VM_EXEC */ if (vma->vm_flags & VM_EXEC) -- cgit v1.2.3 From 1c9be84a9074b2b91c0db6b88174145504acf0ef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Dec 2024 
22:27:51 -0500 Subject: make take_dentry_name_snapshot() lockless Use ->d_seq instead of grabbing ->d_lock; in case of shortname dentries that avoids any stores to shared data objects and in case of long names we are down to (unavoidable) atomic_inc on the external_name refcount. Makes the thing safer as well - the areas where ->d_seq is held odd are all nested inside the areas where ->d_lock is held, and the latter are much more numerous. NOTE: now that there is a lockless path where we might try to grab a reference to an already doomed external_name instance, it is no longer possible for external_name.u.count and external_name.u.head to share space (kudos to Linus for spotting that). To reduce the noise this commit just make external_name.u a struct (instead of union); the next commit will dissolve it. Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/dcache.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 52662a5d08e4..f387dc97df86 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -296,9 +296,9 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c } struct external_name { - union { - atomic_t count; - struct rcu_head head; + struct { + atomic_t count; // ->count and ->head can't be combined + struct rcu_head head; // see take_dentry_name_snapshot() } u; unsigned char name[]; }; @@ -329,15 +329,30 @@ static inline int dname_external(const struct dentry *dentry) void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { - spin_lock(&dentry->d_lock); - name->name = dentry->d_name; - if (unlikely(dname_external(dentry))) { - atomic_inc(&external_name(dentry)->u.count); - } else { + unsigned seq; + const unsigned char *s; + + rcu_read_lock(); +retry: + seq = read_seqcount_begin(&dentry->d_seq); + s = READ_ONCE(dentry->d_name.name); + name->name.hash_len = dentry->d_name.hash_len; + name->name.name = name->inline_name.string; + if (likely(s == dentry->d_shortname.string)) { name->inline_name = dentry->d_shortname; - name->name.name = name->inline_name.string; + } else { + struct external_name *p; + p = container_of(s, struct external_name, name[0]); + // get a valid reference + if (unlikely(!atomic_inc_not_zero(&p->u.count))) + goto retry; + name->name.name = s; } - spin_unlock(&dentry->d_lock); + if (read_seqcount_retry(&dentry->d_seq, seq)) { + release_dentry_name_snapshot(name); + goto retry; + } + rcu_read_unlock(); } EXPORT_SYMBOL(take_dentry_name_snapshot); -- cgit v1.2.3 From 923d3583ead133da742b42d9debbb7d5c5a56587 Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Mon, 9 Dec 2024 19:29:36 +0530 Subject: ubifs: dump_lpt_leb: remove return at end of void function Noticed that there is a useless return statement at the end of void function dump_lpt_leb(). Just removing it. 
Signed-off-by: Pintu Kumar Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/lpt_commit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index aa8837e6247c..f2cb214581fd 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1932,7 +1932,6 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum); out: vfree(buf); - return; } /** -- cgit v1.2.3 From 404de7abc05758254ad57d9501d30427d9c57417 Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Mon, 9 Dec 2024 21:51:04 +0530 Subject: ubifs: ubifs_dump_leb: remove return from end of void function Noticed that there is a useless return statement at the end of void function ubifs_dump_leb(). Just removed it. Signed-off-by: Pintu Kumar Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/debug.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 5cc69beaa62e..987eb5b6782a 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -863,7 +863,6 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) out: vfree(buf); - return; } void ubifs_dump_znode(const struct ubifs_info *c, -- cgit v1.2.3 From bdb0ca39e0acccf6771db49c3f94ed787d05f2d7 Mon Sep 17 00:00:00 2001 From: pangliyuan Date: Tue, 24 Dec 2024 16:18:23 +0800 Subject: ubifs: skip dumping tnc tree when zroot is null Clearing slab cache will free all znode in memory and make c->zroot.znode = NULL, then dumping tnc tree will access c->zroot.znode which cause null pointer dereference. Link: https://bugzilla.kernel.org/show_bug.cgi?id=219624#c0 Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Signed-off-by: pangliyuan Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/debug.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 987eb5b6782a..b01f382ce8db 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -945,16 +945,20 @@ void ubifs_dump_tnc(struct ubifs_info *c) pr_err("\n"); pr_err("(pid %d) start dumping TNC tree\n", current->pid); - znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, NULL); - level = znode->level; - pr_err("== Level %d ==\n", level); - while (znode) { - if (level != znode->level) { - level = znode->level; - pr_err("== Level %d ==\n", level); + if (c->zroot.znode) { + znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, NULL); + level = znode->level; + pr_err("== Level %d ==\n", level); + while (znode) { + if (level != znode->level) { + level = znode->level; + pr_err("== Level %d ==\n", level); + } + ubifs_dump_znode(c, znode); + znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, znode); } - ubifs_dump_znode(c, znode); - znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, znode); + } else { + pr_err("empty TNC tree in memory\n"); } pr_err("(pid %d) finish dumping TNC tree\n", current->pid); } -- cgit v1.2.3 From 6e2c2342e2b608264cf763a27410295b05995191 Mon Sep 17 00:00:00 2001 From: Chen Linxuan Date: Fri, 17 Jan 2025 18:06:36 +0800 Subject: erofs: remove dead code in erofs_fc_parse_param If an option is unknown to erofs, which means that option is not in `erofs_fs_parameters`, `fs_parse` will return -ENOPARAM, which makes `erofs_fc_parse_param` returns earlier. 
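A hedged sketch (hypothetical "foofs" names, not the erofs code) of why such a default case is unreachable: fs_parse() only returns the indices of options listed in the parameter spec and returns -ENOPARAM for anything else, so the handler can simply propagate negative values before the switch:

	#include <linux/fs_context.h>
	#include <linux/fs_parser.h>

	enum { Opt_mode };

	static const struct fs_parameter_spec foofs_parameters[] = {
		fsparam_u32("mode", Opt_mode),
		{}
	};

	static int foofs_parse_param(struct fs_context *fc, struct fs_parameter *param)
	{
		struct fs_parse_result result;
		int opt;

		opt = fs_parse(fc, foofs_parameters, param, &result);
		if (opt < 0)
			return opt;	/* unknown options already come back as -ENOPARAM */

		switch (opt) {
		case Opt_mode:
			/* result.uint_32 holds the parsed value */
			break;
		}
		return 0;
	}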
Signed-off-by: Chen Linxuan Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/DB86A4E2BB2BB44E+20250117100635.335963-2-chenlinxuan@uniontech.com Signed-off-by: Gao Xiang --- fs/erofs/super.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 1fc5623c3a4d..827b62665649 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -508,8 +508,6 @@ static int erofs_fc_parse_param(struct fs_context *fc, errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); #endif break; - default: - return -ENOPARAM; } return 0; } -- cgit v1.2.3 From 8b4bc207f981bd87728d2e1c0cf6f4304f9d0159 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 19 Jan 2025 09:59:40 -0500 Subject: efivarfs: prevent setting of zero size on the inodes in the cache Current efivarfs uses simple_setattr which allows the setting of any size in the inode cache. This is wrong because a zero size file is used to indicate an "uncommitted" variable, so by simple means of truncating the file (as root) any variable may be turned to look like it's uncommitted. Fix by adding an efivarfs_setattr routine which does not allow updating of the cached inode size (which now only comes from the underlying variable). Signed-off-by: James Bottomley Signed-off-by: Ard Biesheuvel --- fs/efivarfs/inode.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index ec23da8405ff..a4a6587ecd2e 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -187,7 +187,24 @@ efivarfs_fileattr_set(struct mnt_idmap *idmap, return 0; } +/* copy of simple_setattr except that it doesn't do i_size updates */ +static int efivarfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + int error; + + error = setattr_prepare(idmap, dentry, iattr); + if (error) + return error; + + setattr_copy(idmap, inode, iattr); + mark_inode_dirty(inode); + return 0; +} + static const struct inode_operations efivarfs_file_inode_operations = { .fileattr_get = efivarfs_fileattr_get, .fileattr_set = efivarfs_fileattr_set, + .setattr = efivarfs_setattr, }; -- cgit v1.2.3 From fddca52766e276cb2fdc0be940ad75e0f01e214e Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 19 Jan 2025 10:12:10 -0500 Subject: efivarfs: move variable lifetime management into the inodes Make the inodes the default management vehicle for struct efivar_entry, so they are now all freed automatically if the file is removed and on unmount in kill_litter_super(). Remove the now superfluous iterator to free the entries after kill_litter_super(). Also fixes a bug where some entry freeing was missing causing efivarfs to leak memory. 
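A generic sketch of the pattern the commit adopts (hypothetical "foo" names, not the efivarfs code): embed the VFS inode in a filesystem-private object, recover the object with container_of(), and let ->alloc_inode()/->free_inode() own its lifetime so it disappears automatically with the inode:

	#include <linux/fs.h>
	#include <linux/slab.h>

	struct foo_entry {			/* hypothetical per-file object */
		int private_state;
		struct inode vfs_inode;		/* embedded VFS inode */
	};

	static inline struct foo_entry *FOO_ENTRY(struct inode *inode)
	{
		return container_of(inode, struct foo_entry, vfs_inode);
	}

	static struct inode *foo_alloc_inode(struct super_block *sb)
	{
		struct foo_entry *e = kzalloc(sizeof(*e), GFP_KERNEL);

		if (!e)
			return NULL;
		inode_init_once(&e->vfs_inode);
		return &e->vfs_inode;		/* the VFS now owns the lifetime */
	}

	static void foo_free_inode(struct inode *inode)
	{
		kfree(FOO_ENTRY(inode));	/* invoked after an RCU grace period */
	}

	static const struct super_operations foo_sops = {
		.alloc_inode	= foo_alloc_inode,
		.free_inode	= foo_free_inode,
	};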
Signed-off-by: James Bottomley Signed-off-by: Ard Biesheuvel --- fs/efivarfs/inode.c | 23 ++++++++++------------- fs/efivarfs/internal.h | 7 ++++++- fs/efivarfs/super.c | 51 ++++++++++++++++++++++++++++---------------------- fs/efivarfs/vars.c | 39 +++----------------------------------- 4 files changed, 48 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index ec23da8405ff..8f5994dea204 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -82,26 +82,23 @@ static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, struct efivar_entry *var; int namelen, i = 0, err = 0; bool is_removable = false; + efi_guid_t vendor; if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len)) return -EINVAL; - var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL); - if (!var) - return -ENOMEM; - /* length of the variable name itself: remove GUID and separator */ namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1; - err = guid_parse(dentry->d_name.name + namelen + 1, &var->var.VendorGuid); + err = guid_parse(dentry->d_name.name + namelen + 1, &vendor); if (err) goto out; - if (guid_equal(&var->var.VendorGuid, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) { + if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) { err = -EPERM; goto out; } - if (efivar_variable_is_removable(var->var.VendorGuid, + if (efivar_variable_is_removable(vendor, dentry->d_name.name, namelen)) is_removable = true; @@ -110,6 +107,9 @@ static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, err = -ENOMEM; goto out; } + var = efivar_entry(inode); + + var->var.VendorGuid = vendor; for (i = 0; i < namelen; i++) var->var.VariableName[i] = dentry->d_name.name[i]; @@ -117,7 +117,6 @@ static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, var->var.VariableName[i] = '\0'; inode->i_private = var; - kmemleak_ignore(var); err = efivar_entry_add(var, &info->efivarfs_list); if (err) @@ -126,11 +125,9 @@ static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); dget(dentry); out: - if (err) { - kfree(var); - if (inode) - iput(inode); - } + if (err && inode) + iput(inode); + return err; } diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index 597ccaa60d37..fce7d5e5c763 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -29,15 +29,20 @@ struct efi_variable { struct efivar_entry { struct efi_variable var; struct list_head list; + struct inode vfs_inode; }; +static inline struct efivar_entry *efivar_entry(struct inode *inode) +{ + return container_of(inode, struct efivar_entry, vfs_inode); +} + int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, struct list_head *), void *data, struct list_head *head); int efivar_entry_add(struct efivar_entry *entry, struct list_head *head); void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head); -void efivar_entry_remove(struct efivar_entry *entry); int efivar_entry_delete(struct efivar_entry *entry); int efivar_entry_size(struct efivar_entry *entry, unsigned long *size); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 9e90823f8009..85ab3af3f1e9 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -39,9 +39,25 @@ static int efivarfs_ops_notifier(struct notifier_block *nb, unsigned long event, return NOTIFY_OK; } -static void efivarfs_evict_inode(struct inode *inode) +static struct inode *efivarfs_alloc_inode(struct super_block *sb) { - clear_inode(inode); + struct 
efivar_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL); + + if (!entry) + return NULL; + + inode_init_once(&entry->vfs_inode); + + return &entry->vfs_inode; +} + +static void efivarfs_free_inode(struct inode *inode) +{ + struct efivar_entry *entry = efivar_entry(inode); + + if (inode->i_private) + list_del(&entry->list); + kfree(entry); } static int efivarfs_show_options(struct seq_file *m, struct dentry *root) @@ -106,7 +122,8 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf) static const struct super_operations efivarfs_ops = { .statfs = efivarfs_statfs, .drop_inode = generic_delete_inode, - .evict_inode = efivarfs_evict_inode, + .alloc_inode = efivarfs_alloc_inode, + .free_inode = efivarfs_free_inode, .show_options = efivarfs_show_options, }; @@ -227,21 +244,14 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) return 0; - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return err; - - memcpy(entry->var.VariableName, name16, name_size); - memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t)); - name = efivar_get_utf8name(name16, &vendor); if (!name) - goto fail; + return err; /* length of the variable name itself: remove GUID and separator */ len = strlen(name) - EFI_VARIABLE_GUID_LEN - 1; - if (efivar_variable_is_removable(entry->var.VendorGuid, name, len)) + if (efivar_variable_is_removable(vendor, name, len)) is_removable = true; inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0, @@ -249,6 +259,11 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, if (!inode) goto fail_name; + entry = efivar_entry(inode); + + memcpy(entry->var.VariableName, name16, name_size); + memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t)); + dentry = efivarfs_alloc_dentry(root, name); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -273,16 +288,8 @@ fail_inode: iput(inode); fail_name: kfree(name); -fail: - kfree(entry); - return err; -} -static int efivarfs_destroy(struct efivar_entry *entry, void *data) -{ - efivar_entry_remove(entry); - kfree(entry); - return 0; + return err; } enum { @@ -407,7 +414,7 @@ static void efivarfs_kill_sb(struct super_block *sb) kill_litter_super(sb); /* Remove all entries and destroy */ - efivar_entry_iter(efivarfs_destroy, &sfi->efivarfs_list, NULL); + WARN_ON(!list_empty(&sfi->efivarfs_list)); kfree(sfi); } diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index b2fc5bdc759a..bb9406e03a10 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -485,34 +485,6 @@ void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head) list_add(&entry->list, head); } -/** - * efivar_entry_remove - remove entry from variable list - * @entry: entry to remove from list - * - * Returns 0 on success, or a kernel error code on failure. - */ -void efivar_entry_remove(struct efivar_entry *entry) -{ - list_del(&entry->list); -} - -/* - * efivar_entry_list_del_unlock - remove entry from variable list - * @entry: entry to remove - * - * Remove @entry from the variable list and release the list lock. - * - * NOTE: slightly weird locking semantics here - we expect to be - * called with the efivars lock already held, and we release it before - * returning. This is because this function is usually called after - * set_variable() while the lock is still held. 
- */ -static void efivar_entry_list_del_unlock(struct efivar_entry *entry) -{ - list_del(&entry->list); - efivar_unlock(); -} - /** * efivar_entry_delete - delete variable and remove entry from list * @entry: entry containing variable to delete @@ -536,12 +508,10 @@ int efivar_entry_delete(struct efivar_entry *entry) status = efivar_set_variable_locked(entry->var.VariableName, &entry->var.VendorGuid, 0, 0, NULL, false); - if (!(status == EFI_SUCCESS || status == EFI_NOT_FOUND)) { - efivar_unlock(); + efivar_unlock(); + if (!(status == EFI_SUCCESS || status == EFI_NOT_FOUND)) return efi_status_to_err(status); - } - efivar_entry_list_del_unlock(entry); return 0; } @@ -679,10 +649,7 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, &entry->var.VendorGuid, NULL, size, NULL); - if (status == EFI_NOT_FOUND) - efivar_entry_list_del_unlock(entry); - else - efivar_unlock(); + efivar_unlock(); if (status && status != EFI_BUFFER_TOO_SMALL) return efi_status_to_err(status); -- cgit v1.2.3 From a58e954464db477307ac879d772a535deca8efb7 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 19 Jan 2025 10:12:11 -0500 Subject: efivarfs: remove unused efivarfs_list Remove all function helpers and mentions of the efivarfs_list now that all consumers of the list have been removed and entry management goes exclusively through the inode. Signed-off-by: James Bottomley Signed-off-by: Ard Biesheuvel --- fs/efivarfs/inode.c | 24 ++++---------- fs/efivarfs/internal.h | 12 ++----- fs/efivarfs/super.c | 12 ++----- fs/efivarfs/vars.c | 89 +++++++------------------------------------------- 4 files changed, 21 insertions(+), 116 deletions(-) (limited to 'fs') diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 8f5994dea204..4ce330fbfdf4 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -77,7 +77,6 @@ static bool efivarfs_valid_name(const char *str, int len) static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct efivarfs_fs_info *info = dir->i_sb->s_fs_info; struct inode *inode = NULL; struct efivar_entry *var; int namelen, i = 0, err = 0; @@ -92,21 +91,17 @@ static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, err = guid_parse(dentry->d_name.name + namelen + 1, &vendor); if (err) - goto out; - if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) { - err = -EPERM; - goto out; - } + return err; + if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) + return -EPERM; if (efivar_variable_is_removable(vendor, dentry->d_name.name, namelen)) is_removable = true; inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0, is_removable); - if (!inode) { - err = -ENOMEM; - goto out; - } + if (!inode) + return -ENOMEM; var = efivar_entry(inode); var->var.VendorGuid = vendor; @@ -118,17 +113,10 @@ static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, inode->i_private = var; - err = efivar_entry_add(var, &info->efivarfs_list); - if (err) - goto out; - d_instantiate(dentry, inode); dget(dentry); -out: - if (err && inode) - iput(inode); - return err; + return 0; } static int efivarfs_unlink(struct inode *dir, struct dentry *dentry) diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index fce7d5e5c763..4366f7949614 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -6,7 +6,6 @@ #ifndef EFIVAR_FS_INTERNAL_H #define EFIVAR_FS_INTERNAL_H -#include #include struct efivarfs_mount_opts { @@ -16,7 +15,6 @@ struct efivarfs_mount_opts { struct 
efivarfs_fs_info { struct efivarfs_mount_opts mount_opts; - struct list_head efivarfs_list; struct super_block *sb; struct notifier_block nb; }; @@ -28,7 +26,6 @@ struct efi_variable { struct efivar_entry { struct efi_variable var; - struct list_head list; struct inode vfs_inode; }; @@ -37,12 +34,9 @@ static inline struct efivar_entry *efivar_entry(struct inode *inode) return container_of(inode, struct efivar_entry, vfs_inode); } -int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, - struct list_head *), - void *data, struct list_head *head); +int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), + void *data); -int efivar_entry_add(struct efivar_entry *entry, struct list_head *head); -void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head); int efivar_entry_delete(struct efivar_entry *entry); int efivar_entry_size(struct efivar_entry *entry, unsigned long *size); @@ -53,8 +47,6 @@ int efivar_entry_get(struct efivar_entry *entry, u32 *attributes, int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, unsigned long *size, void *data, bool *set); -int efivar_entry_iter(int (*func)(struct efivar_entry *, void *), - struct list_head *head, void *data); bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data, unsigned long data_size); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 85ab3af3f1e9..7b3650c97e60 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -55,8 +55,6 @@ static void efivarfs_free_inode(struct inode *inode) { struct efivar_entry *entry = efivar_entry(inode); - if (inode->i_private) - list_del(&entry->list); kfree(entry); } @@ -228,8 +226,7 @@ bool efivarfs_variable_is_present(efi_char16_t *variable_name, } static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, - unsigned long name_size, void *data, - struct list_head *list) + unsigned long name_size, void *data) { struct super_block *sb = (struct super_block *)data; struct efivar_entry *entry; @@ -271,7 +268,6 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, } __efivar_entry_get(entry, NULL, &size, NULL); - __efivar_entry_add(entry, list); /* copied by the above to local storage in the dentry. */ kfree(name); @@ -361,7 +357,7 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - return efivar_init(efivarfs_callback, sb, &sfi->efivarfs_list); + return efivar_init(efivarfs_callback, sb); } static int efivarfs_get_tree(struct fs_context *fc) @@ -396,8 +392,6 @@ static int efivarfs_init_fs_context(struct fs_context *fc) if (!sfi) return -ENOMEM; - INIT_LIST_HEAD(&sfi->efivarfs_list); - sfi->mount_opts.uid = GLOBAL_ROOT_UID; sfi->mount_opts.gid = GLOBAL_ROOT_GID; @@ -413,8 +407,6 @@ static void efivarfs_kill_sb(struct super_block *sb) blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb); kill_litter_super(sb); - /* Remove all entries and destroy */ - WARN_ON(!list_empty(&sfi->efivarfs_list)); kfree(sfi); } diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index bb9406e03a10..d0beecbf9441 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -364,16 +364,14 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, * efivar_init - build the initial list of EFI variables * @func: callback function to invoke for every variable * @data: function-specific data to pass to @func - * @head: initialised head of variable list * * Get every EFI variable from the firmware and invoke @func. 
@func - * should call efivar_entry_add() to build the list of variables. + * should populate the initial dentry and inode tree. * * Returns 0 on success, or a kernel error code on failure. */ -int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, - struct list_head *), - void *data, struct list_head *head) +int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), + void *data) { unsigned long variable_name_size = 512; efi_char16_t *variable_name; @@ -424,7 +422,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, status = EFI_NOT_FOUND; } else { err = func(variable_name, vendor_guid, - variable_name_size, data, head); + variable_name_size, data); if (err) status = EFI_NOT_FOUND; } @@ -456,42 +454,12 @@ free: } /** - * efivar_entry_add - add entry to variable list - * @entry: entry to add to list - * @head: list head - * - * Returns 0 on success, or a kernel error code on failure. - */ -int efivar_entry_add(struct efivar_entry *entry, struct list_head *head) -{ - int err; - - err = efivar_lock(); - if (err) - return err; - list_add(&entry->list, head); - efivar_unlock(); - - return 0; -} - -/** - * __efivar_entry_add - add entry to variable list - * @entry: entry to add to list - * @head: list head - */ -void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head) -{ - list_add(&entry->list, head); -} - -/** - * efivar_entry_delete - delete variable and remove entry from list + * efivar_entry_delete - delete variable * @entry: entry containing variable to delete * - * Delete the variable from the firmware and remove @entry from the - * variable list. It is the caller's responsibility to free @entry - * once we return. + * Delete the variable from the firmware. It is the caller's + * responsibility to free @entry (by deleting the dentry/inode) once + * we return. * * Returns 0 on success, -EINTR if we can't grab the semaphore, * converted EFI status code if set_variable() fails. @@ -605,7 +573,7 @@ int efivar_entry_get(struct efivar_entry *entry, u32 *attributes, * get_variable() fail. * * If the EFI variable does not exist when calling set_variable() - * (EFI_NOT_FOUND), @entry is removed from the variable list. + * (EFI_NOT_FOUND). */ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, unsigned long *size, void *data, bool *set) @@ -621,9 +589,8 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, return -EINVAL; /* - * The lock here protects the get_variable call, the conditional - * set_variable call, and removal of the variable from the efivars - * list (in the case of an authenticated delete). + * The lock here protects the get_variable call and the + * conditional set_variable call */ err = efivar_lock(); if (err) @@ -661,37 +628,3 @@ out: return err; } - -/** - * efivar_entry_iter - iterate over variable list - * @func: callback function - * @head: head of variable list - * @data: function-specific data to pass to callback - * - * Iterate over the list of EFI variables and call @func with every - * entry on the list. It is safe for @func to remove entries in the - * list via efivar_entry_delete() while iterating. 
- * - * Some notes for the callback function: - * - a non-zero return value indicates an error and terminates the loop - * - @func is called from atomic context - */ -int efivar_entry_iter(int (*func)(struct efivar_entry *, void *), - struct list_head *head, void *data) -{ - struct efivar_entry *entry, *n; - int err = 0; - - err = efivar_lock(); - if (err) - return err; - - list_for_each_entry_safe(entry, n, head, list) { - err = func(entry, data); - if (err) - break; - } - efivar_unlock(); - - return err; -} -- cgit v1.2.3 From 908af31f4896f2c0645031f8b74a89d3a8beb5b9 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 19 Jan 2025 10:12:12 -0500 Subject: efivarfs: fix error on write to new variable leaving remnants Make variable cleanup go through the fops release mechanism and use zero inode size as the indicator to delete the file. Since all EFI variables must have an initial u32 attribute, zero size occurs either because the update deleted the variable or because an unsuccessful write after create caused the size never to be set in the first place. In the case of multiple racing opens and closes, the open is counted to ensure that the zero size check is done on the last close. Even though this fixes the bug that a create either not followed by a write or followed by a write that errored would leave a remnant file for the variable, the file will appear momentarily globally visible until the last close of the fd deletes it. This is safe because the normal filesystem operations will mediate any races; however, it is still possible for a directory listing at that instant between create and close contain a zero size variable that doesn't exist in the EFI table. Signed-off-by: James Bottomley Signed-off-by: Ard Biesheuvel --- fs/efivarfs/file.c | 59 ++++++++++++++++++++++++++++++++++++++++++-------- fs/efivarfs/internal.h | 2 ++ fs/efivarfs/super.c | 1 + 3 files changed, 53 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 23c51d62f902..cb1b6d0c3454 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -36,28 +36,41 @@ static ssize_t efivarfs_file_write(struct file *file, if (IS_ERR(data)) return PTR_ERR(data); + inode_lock(inode); + if (var->removed) { + /* + * file got removed; don't allow a set. Caused by an + * unsuccessful create or successful delete write + * racing with us. 
+ */ + bytes = -EIO; + goto out; + } + bytes = efivar_entry_set_get_size(var, attributes, &datasize, data, &set); - if (!set && bytes) { + if (!set) { if (bytes == -ENOENT) bytes = -EIO; goto out; } if (bytes == -ENOENT) { - drop_nlink(inode); - d_delete(file->f_path.dentry); - dput(file->f_path.dentry); + /* + * zero size signals to release that the write deleted + * the variable + */ + i_size_write(inode, 0); } else { - inode_lock(inode); i_size_write(inode, datasize + sizeof(attributes)); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - inode_unlock(inode); } bytes = count; out: + inode_unlock(inode); + kfree(data); return bytes; @@ -106,8 +119,36 @@ out_free: return size; } +static int efivarfs_file_release(struct inode *inode, struct file *file) +{ + struct efivar_entry *var = inode->i_private; + + inode_lock(inode); + var->removed = (--var->open_count == 0 && i_size_read(inode) == 0); + inode_unlock(inode); + + if (var->removed) + simple_recursive_removal(file->f_path.dentry, NULL); + + return 0; +} + +static int efivarfs_file_open(struct inode *inode, struct file *file) +{ + struct efivar_entry *entry = inode->i_private; + + file->private_data = entry; + + inode_lock(inode); + entry->open_count++; + inode_unlock(inode); + + return 0; +} + const struct file_operations efivarfs_file_operations = { - .open = simple_open, - .read = efivarfs_file_read, - .write = efivarfs_file_write, + .open = efivarfs_file_open, + .read = efivarfs_file_read, + .write = efivarfs_file_write, + .release = efivarfs_file_release, }; diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index 4366f7949614..1f84844fe032 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -27,6 +27,8 @@ struct efi_variable { struct efivar_entry { struct efi_variable var; struct inode vfs_inode; + unsigned long open_count; + bool removed; }; static inline struct efivar_entry *efivar_entry(struct inode *inode) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 7b3650c97e60..89010c5878ce 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -47,6 +47,7 @@ static struct inode *efivarfs_alloc_inode(struct super_block *sb) return NULL; inode_init_once(&entry->vfs_inode); + entry->removed = false; return &entry->vfs_inode; } -- cgit v1.2.3 From 62eecd8aac6c880a4a09ccb5d890949f2a09a730 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 3 Jan 2025 16:50:27 -0300 Subject: smb: client: introduce av_for_each_entry() helper Use new helper in find_domain_name() and find_timestamp() to avoid duplicating code. 
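A minimal, self-contained sketch of the iterator shape behind a helper like av_for_each_entry() (placeholder names, host byte order; the real helper additionally validates the payload length, stops at the EOL entry and converts little-endian fields): a find-next function that takes the previous entry, or NULL for the first one, wrapped in a for-loop macro. A caller then filters on the type field inside the loop body, as find_domain_name() does after this change.

	#include <stddef.h>

	struct av_blob {
		const unsigned char *data;
		size_t len;
	};

	struct av_pair {
		unsigned short type;
		unsigned short length;
		unsigned char value[];
	};

	/* Return the entry after @prev, or the first entry when @prev is NULL. */
	static const struct av_pair *next_av(const struct av_blob *b,
					     const struct av_pair *prev)
	{
		const unsigned char *end = b->data + b->len;
		const unsigned char *p;

		p = prev ? (const unsigned char *)prev + sizeof(*prev) + prev->length
			 : b->data;
		if (p + sizeof(struct av_pair) > end)
			return NULL;
		return (const struct av_pair *)p;
	}

	#define for_each_av(b, av) \
		for ((av) = NULL; ((av) = next_av((b), (av))) != NULL;)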
Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifsencrypt.c | 125 +++++++++++++++++++------------------------- fs/smb/client/cifspdu.h | 2 +- 2 files changed, 55 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index 7a43daacc815..981897ec4dcd 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -315,6 +315,39 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) return 0; } +#define AV_TYPE(av) (le16_to_cpu(av->type)) +#define AV_LEN(av) (le16_to_cpu(av->length)) +#define AV_DATA_PTR(av) ((void *)av->data) + +#define av_for_each_entry(ses, av) \ + for (av = NULL; (av = find_next_av(ses, av));) + +static struct ntlmssp2_name *find_next_av(struct cifs_ses *ses, + struct ntlmssp2_name *av) +{ + u16 len; + u8 *end; + + end = (u8 *)ses->auth_key.response + ses->auth_key.len; + if (!av) { + if (unlikely(!ses->auth_key.response || !ses->auth_key.len)) + return NULL; + av = (void *)ses->auth_key.response; + } else { + av = (void *)((u8 *)av + sizeof(*av) + AV_LEN(av)); + } + + if ((u8 *)av + sizeof(*av) > end) + return NULL; + + len = AV_LEN(av); + if (AV_TYPE(av) == NTLMSSP_AV_EOL) + return NULL; + if (!len || (u8 *)av + sizeof(*av) + len > end) + return NULL; + return av; +} + /* Server has provided av pairs/target info in the type 2 challenge * packet and we have plucked it and stored within smb session. * We parse that blob here to find netbios domain name to be used @@ -325,49 +358,23 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) * may not fail against other (those who are not very particular * about target string i.e. for some, just user name might suffice. */ -static int -find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp) +static int find_domain_name(struct cifs_ses *ses) { - unsigned int attrsize; - unsigned int type; - unsigned int onesize = sizeof(struct ntlmssp2_name); - unsigned char *blobptr; - unsigned char *blobend; - struct ntlmssp2_name *attrptr; - - if (!ses->auth_key.len || !ses->auth_key.response) - return 0; - - blobptr = ses->auth_key.response; - blobend = blobptr + ses->auth_key.len; - - while (blobptr + onesize < blobend) { - attrptr = (struct ntlmssp2_name *) blobptr; - type = le16_to_cpu(attrptr->type); - if (type == NTLMSSP_AV_EOL) - break; - blobptr += 2; /* advance attr type */ - attrsize = le16_to_cpu(attrptr->length); - blobptr += 2; /* advance attr size */ - if (blobptr + attrsize > blobend) - break; - if (type == NTLMSSP_AV_NB_DOMAIN_NAME) { - if (!attrsize || attrsize >= CIFS_MAX_DOMAINNAME_LEN) - break; - if (!ses->domainName) { - ses->domainName = - kmalloc(attrsize + 1, GFP_KERNEL); - if (!ses->domainName) - return -ENOMEM; - cifs_from_utf16(ses->domainName, - (__le16 *)blobptr, attrsize, attrsize, - nls_cp, NO_MAP_UNI_RSVD); - break; - } + const struct nls_table *nlsc = ses->local_nls; + struct ntlmssp2_name *av; + u16 len; + + av_for_each_entry(ses, av) { + len = AV_LEN(av); + if (AV_TYPE(av) == NTLMSSP_AV_NB_DOMAIN_NAME && + len < CIFS_MAX_DOMAINNAME_LEN && !ses->domainName) { + ses->domainName = kmalloc(len + 1, GFP_KERNEL); + if (!ses->domainName) + return -ENOMEM; + cifs_from_utf16(ses->domainName, AV_DATA_PTR(av), + len, len, nlsc, NO_MAP_UNI_RSVD); } - blobptr += attrsize; /* advance attr value */ } - return 0; } @@ -377,40 +384,16 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp) * as part of ntlmv2 authentication (or local 
current time as * default in case of failure) */ -static __le64 -find_timestamp(struct cifs_ses *ses) +static __le64 find_timestamp(struct cifs_ses *ses) { - unsigned int attrsize; - unsigned int type; - unsigned int onesize = sizeof(struct ntlmssp2_name); - unsigned char *blobptr; - unsigned char *blobend; - struct ntlmssp2_name *attrptr; + struct ntlmssp2_name *av; struct timespec64 ts; - if (!ses->auth_key.len || !ses->auth_key.response) - return 0; - - blobptr = ses->auth_key.response; - blobend = blobptr + ses->auth_key.len; - - while (blobptr + onesize < blobend) { - attrptr = (struct ntlmssp2_name *) blobptr; - type = le16_to_cpu(attrptr->type); - if (type == NTLMSSP_AV_EOL) - break; - blobptr += 2; /* advance attr type */ - attrsize = le16_to_cpu(attrptr->length); - blobptr += 2; /* advance attr size */ - if (blobptr + attrsize > blobend) - break; - if (type == NTLMSSP_AV_TIMESTAMP) { - if (attrsize == sizeof(u64)) - return *((__le64 *)blobptr); - } - blobptr += attrsize; /* advance attr value */ + av_for_each_entry(ses, av) { + if (AV_TYPE(av) == NTLMSSP_AV_TIMESTAMP && + AV_LEN(av) == sizeof(u64)) + return *((__le64 *)AV_DATA_PTR(av)); } - ktime_get_real_ts64(&ts); return cpu_to_le64(cifs_UnixTimeToNT(ts)); } @@ -563,7 +546,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) { if (!ses->domainName) { if (ses->domainAuto) { - rc = find_domain_name(ses, nls_cp); + rc = find_domain_name(ses); if (rc) { cifs_dbg(VFS, "error %d finding domain name\n", rc); diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index ee78bb6741d6..17202754e6d0 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -649,7 +649,7 @@ typedef union smb_com_session_setup_andx { struct ntlmssp2_name { __le16 type; __le16 length; -/* char name[length]; */ + __u8 data[]; } __attribute__((packed)); struct ntlmv2_resp { -- cgit v1.2.3 From 0e8ae9b953bc2c12aebd21e1e552e5deb1a0ff1e Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 3 Jan 2025 16:52:25 -0300 Subject: smb: client: parse av pair type 4 in CHALLENGE_MESSAGE Parse FQDN of the domain in CHALLENGE_MESSAGE message as it's gonna be useful when mounting DFS shares against old Windows Servers (2012 R2 or earlier) that return not fully qualified hostnames for DFS targets by default. Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifsencrypt.c | 63 +++++++++++++++++++++++++++++---------------- fs/smb/client/cifsglob.h | 1 + fs/smb/client/misc.c | 1 + 3 files changed, 43 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index 981897ec4dcd..e69968e88fe7 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -348,31 +348,37 @@ static struct ntlmssp2_name *find_next_av(struct cifs_ses *ses, return av; } -/* Server has provided av pairs/target info in the type 2 challenge - * packet and we have plucked it and stored within smb session. - * We parse that blob here to find netbios domain name to be used - * as part of ntlmv2 authentication (in Target String), if not already - * specified on the command line. - * If this function returns without any error but without fetching - * domain name, authentication may fail against some server but - * may not fail against other (those who are not very particular - * about target string i.e. for some, just user name might suffice. 
+/* + * Check if server has provided av pair of @type in the NTLMSSP + * CHALLENGE_MESSAGE blob. */ -static int find_domain_name(struct cifs_ses *ses) +static int find_av_name(struct cifs_ses *ses, u16 type, char **name, u16 maxlen) { const struct nls_table *nlsc = ses->local_nls; struct ntlmssp2_name *av; - u16 len; + u16 len, nlen; + + if (*name) + return 0; av_for_each_entry(ses, av) { len = AV_LEN(av); - if (AV_TYPE(av) == NTLMSSP_AV_NB_DOMAIN_NAME && - len < CIFS_MAX_DOMAINNAME_LEN && !ses->domainName) { - ses->domainName = kmalloc(len + 1, GFP_KERNEL); - if (!ses->domainName) + if (AV_TYPE(av) != type) + continue; + if (!IS_ALIGNED(len, sizeof(__le16))) { + cifs_dbg(VFS | ONCE, "%s: bad length(%u) for type %u\n", + __func__, len, type); + continue; + } + nlen = len / sizeof(__le16); + if (nlen <= maxlen) { + ++nlen; + *name = kmalloc(nlen, GFP_KERNEL); + if (!*name) return -ENOMEM; - cifs_from_utf16(ses->domainName, AV_DATA_PTR(av), - len, len, nlsc, NO_MAP_UNI_RSVD); + cifs_from_utf16(*name, AV_DATA_PTR(av), nlen, + len, nlsc, NO_MAP_UNI_RSVD); + break; } } return 0; @@ -546,16 +552,29 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) { if (!ses->domainName) { if (ses->domainAuto) { - rc = find_domain_name(ses); - if (rc) { - cifs_dbg(VFS, "error %d finding domain name\n", - rc); + /* + * Domain (workgroup) hasn't been specified in + * mount options, so try to find it in + * CHALLENGE_MESSAGE message and then use it as + * part of NTLMv2 authentication. + */ + rc = find_av_name(ses, NTLMSSP_AV_NB_DOMAIN_NAME, + &ses->domainName, + CIFS_MAX_DOMAINNAME_LEN); + if (rc) goto setup_ntlmv2_rsp_ret; - } } else { ses->domainName = kstrdup("", GFP_KERNEL); + if (!ses->domainName) { + rc = -ENOMEM; + goto setup_ntlmv2_rsp_ret; + } } } + rc = find_av_name(ses, NTLMSSP_AV_DNS_DOMAIN_NAME, + &ses->dns_dom, CIFS_MAX_DOMAINNAME_LEN); + if (rc) + goto setup_ntlmv2_rsp_ret; } else { rc = build_avpair_blob(ses, nls_cp); if (rc) { diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 6e63abe461fd..e5982136e66f 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1154,6 +1154,7 @@ struct cifs_ses { /* ========= end: protected by chan_lock ======== */ struct cifs_ses *dfs_root_ses; struct nls_table *local_nls; + char *dns_dom; /* FQDN of the domain */ }; static inline bool diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 4373dd64b66d..c23d5ba44cae 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -101,6 +101,7 @@ sesInfoFree(struct cifs_ses *buf_to_free) kfree_sensitive(buf_to_free->password2); kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); + kfree(buf_to_free->dns_dom); kfree_sensitive(buf_to_free->auth_key.response); spin_lock(&buf_to_free->iface_lock); list_for_each_entry_safe(iface, niface, &buf_to_free->iface_list, -- cgit v1.2.3 From ad46faff1afcc8db72f9954a6ea2296d31259217 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 3 Jan 2025 16:58:12 -0300 Subject: smb: client: fix DFS mount against old servers with NTLMSSP Old Windows servers will return not fully qualified DFS targets by default as specified in MS-DFSC 3.2.5.5 Receiving a Root Referral Request or Link Referral Request | Servers SHOULD<30> return fully qualified DNS host names of | targets in responses to root referral requests and link referral | requests. | ... 
| <30> Section 3.2.5.5: By default, Windows Server 2003, Windows | Server 2008, Windows Server 2008 R2, Windows Server 2012, and | Windows Server 2012 R2 return DNS host names that are not fully | qualified for targets. Fix this by converting all NetBIOS host names from DFS targets to FQDNs and try resolving them first if DNS domain name was provided in NTLMSSP CHALLENGE_MESSAGE message from previous SMB2_SESSION_SETUP. This also prevents the client from translating the DFS target hostnames to another domain depending on the network domain search order. Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 21 +++++++++ fs/smb/client/connect.c | 5 ++- fs/smb/client/dfs.c | 13 ++++-- fs/smb/client/dfs_cache.c | 3 +- fs/smb/client/dns_resolve.c | 102 ++++++++++++++++++++++++++------------------ fs/smb/client/dns_resolve.h | 3 +- fs/smb/client/fs_context.c | 4 ++ fs/smb/client/fs_context.h | 1 + fs/smb/client/misc.c | 3 +- 9 files changed, 105 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index e5982136e66f..c747b6f9baca 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -828,6 +828,7 @@ struct TCP_Server_Info { */ char *leaf_fullpath; bool dfs_conn:1; + char dns_dom[CIFS_MAX_DOMAINNAME_LEN + 1]; }; static inline bool is_smb1(struct TCP_Server_Info *server) @@ -2312,4 +2313,24 @@ static inline bool cifs_ses_exiting(struct cifs_ses *ses) return ret; } +static inline bool cifs_netbios_name(const char *name, size_t namelen) +{ + bool ret = false; + size_t i; + + if (namelen >= 1 && namelen <= RFC1001_NAME_LEN) { + for (i = 0; i < namelen; i++) { + const unsigned char c = name[i]; + + if (c == '\\' || c == '/' || c == ':' || c == '*' || + c == '?' || c == '"' || c == '<' || c == '>' || + c == '|' || c == '.') + return false; + if (!ret && isalpha(c)) + ret = true; + } + } + return ret; +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index eaa6be4456d0..9e16edf5c883 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -97,7 +97,8 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) ss = server->dstaddr; spin_unlock(&server->srv_lock); - rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, NULL); + rc = dns_resolve_server_name_to_ip(server->dns_dom, unc, + (struct sockaddr *)&ss, NULL); kfree(unc); if (rc < 0) { @@ -1710,6 +1711,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, goto out_err; } } + if (ctx->dns_dom) + strscpy(tcp_ses->dns_dom, ctx->dns_dom); if (ctx->nosharesock) tcp_ses->nosharesock = true; diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index 4647df9e1e3b..09d8808cd2e0 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -9,6 +9,8 @@ #include "fs_context.h" #include "dfs.h" +#define DFS_DOM(ctx) (ctx->dfs_root_ses ? 
ctx->dfs_root_ses->dns_dom : NULL) + /** * dfs_parse_target_referral - set fs context for dfs target referral * @@ -46,8 +48,9 @@ int dfs_parse_target_referral(const char *full_path, const struct dfs_info3_para if (rc) goto out; - rc = dns_resolve_server_name_to_ip(path, (struct sockaddr *)&ctx->dstaddr, NULL); - + rc = dns_resolve_server_name_to_ip(DFS_DOM(ctx), path, + (struct sockaddr *)&ctx->dstaddr, + NULL); out: kfree(path); return rc; @@ -59,8 +62,9 @@ static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path) int rc; ctx->leaf_fullpath = (char *)full_path; + ctx->dns_dom = DFS_DOM(ctx); rc = cifs_mount_get_session(mnt_ctx); - ctx->leaf_fullpath = NULL; + ctx->leaf_fullpath = ctx->dns_dom = NULL; return rc; } @@ -264,7 +268,8 @@ static int update_fs_context_dstaddr(struct smb3_fs_context *ctx) int rc = 0; if (!ctx->nodfs && ctx->dfs_automount) { - rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL); + rc = dns_resolve_server_name_to_ip(NULL, ctx->source, + addr, NULL); if (!rc) cifs_set_port(addr, ctx->port); ctx->dfs_automount = false; diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 541608b0267e..c0a167c869fb 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1114,7 +1114,8 @@ static bool target_share_equal(struct cifs_tcon *tcon, const char *s1) extract_unc_hostname(s1, &host, &hostlen); scnprintf(unc, sizeof(unc), "\\\\%.*s", (int)hostlen, host); - rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, NULL); + rc = dns_resolve_server_name_to_ip(server->dns_dom, unc, + (struct sockaddr *)&ss, NULL); if (rc < 0) { cifs_dbg(FYI, "%s: could not resolve %.*s. assuming server address matches.\n", __func__, (int)hostlen, host); diff --git a/fs/smb/client/dns_resolve.c b/fs/smb/client/dns_resolve.c index 8bf8978bc5d6..83db27f9c8f1 100644 --- a/fs/smb/client/dns_resolve.c +++ b/fs/smb/client/dns_resolve.c @@ -20,69 +20,87 @@ #include "cifsproto.h" #include "cifs_debug.h" +static int resolve_name(const char *name, size_t namelen, + struct sockaddr *addr, time64_t *expiry) +{ + char *ip; + int rc; + + rc = dns_query(current->nsproxy->net_ns, NULL, name, + namelen, NULL, &ip, expiry, false); + if (rc < 0) { + cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n", + __func__, (int)namelen, (int)namelen, name); + } else { + cifs_dbg(FYI, "%s: resolved: %*.*s to %s expiry %llu\n", + __func__, (int)namelen, (int)namelen, name, ip, + expiry ? (*expiry) : 0); + + rc = cifs_convert_address(addr, ip, strlen(ip)); + kfree(ip); + if (!rc) { + cifs_dbg(FYI, "%s: unable to determine ip address\n", + __func__); + rc = -EHOSTUNREACH; + } else { + rc = 0; + } + } + return rc; +} + /** * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address. + * @dom: optional DNS domain name * @unc: UNC path specifying the server (with '/' as delimiter) * @ip_addr: Where to return the IP address. * @expiry: Where to return the expiry time for the dns record. * * Returns zero success, -ve on error. 
*/ -int -dns_resolve_server_name_to_ip(const char *unc, struct sockaddr *ip_addr, time64_t *expiry) +int dns_resolve_server_name_to_ip(const char *dom, const char *unc, + struct sockaddr *ip_addr, time64_t *expiry) { - const char *hostname, *sep; - char *ip; - int len, rc; + const char *name; + size_t namelen, len; + char *s; + int rc; if (!ip_addr || !unc) return -EINVAL; - len = strlen(unc); - if (len < 3) { - cifs_dbg(FYI, "%s: unc is too short: %s\n", __func__, unc); + cifs_dbg(FYI, "%s: dom=%s unc=%s\n", __func__, dom, unc); + if (strlen(unc) < 3) return -EINVAL; - } - - /* Discount leading slashes for cifs */ - len -= 2; - hostname = unc + 2; - /* Search for server name delimiter */ - sep = memchr(hostname, '/', len); - if (sep) - len = sep - hostname; - else - cifs_dbg(FYI, "%s: probably server name is whole unc: %s\n", - __func__, unc); + extract_unc_hostname(unc, &name, &namelen); + if (!namelen) + return -EINVAL; + cifs_dbg(FYI, "%s: hostname=%.*s\n", __func__, (int)namelen, name); /* Try to interpret hostname as an IPv4 or IPv6 address */ - rc = cifs_convert_address(ip_addr, hostname, len); + rc = cifs_convert_address(ip_addr, name, namelen); if (rc > 0) { - cifs_dbg(FYI, "%s: unc is IP, skipping dns upcall: %*.*s\n", __func__, len, len, - hostname); + cifs_dbg(FYI, "%s: unc is IP, skipping dns upcall: %*.*s\n", + __func__, (int)namelen, (int)namelen, name); return 0; } - /* Perform the upcall */ - rc = dns_query(current->nsproxy->net_ns, NULL, hostname, len, - NULL, &ip, expiry, false); - if (rc < 0) { - cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n", - __func__, len, len, hostname); - } else { - cifs_dbg(FYI, "%s: resolved: %*.*s to %s expiry %llu\n", - __func__, len, len, hostname, ip, - expiry ? (*expiry) : 0); - - rc = cifs_convert_address(ip_addr, ip, strlen(ip)); - kfree(ip); + /* + * If @name contains a NetBIOS name and @dom has been specified, then + * convert @name to an FQDN and try resolving it first. 
+ */ + if (dom && *dom && cifs_netbios_name(name, namelen)) { + len = strnlen(dom, CIFS_MAX_DOMAINNAME_LEN) + namelen + 2; + s = kmalloc(len, GFP_KERNEL); + if (!s) + return -ENOMEM; - if (!rc) { - cifs_dbg(FYI, "%s: unable to determine ip address\n", __func__); - rc = -EHOSTUNREACH; - } else - rc = 0; + scnprintf(s, len, "%.*s.%s", (int)namelen, name, dom); + rc = resolve_name(s, len - 1, ip_addr, expiry); + kfree(s); + if (!rc) + return 0; } - return rc; + return resolve_name(name, namelen, ip_addr, expiry); } diff --git a/fs/smb/client/dns_resolve.h b/fs/smb/client/dns_resolve.h index 6eb0c15a2440..64c1dd2ad39b 100644 --- a/fs/smb/client/dns_resolve.h +++ b/fs/smb/client/dns_resolve.h @@ -14,7 +14,8 @@ #include #ifdef __KERNEL__ -int dns_resolve_server_name_to_ip(const char *unc, struct sockaddr *ip_addr, time64_t *expiry); +int dns_resolve_server_name_to_ip(const char *dom, const char *unc, + struct sockaddr *ip_addr, time64_t *expiry); #endif /* KERNEL */ #endif /* _DNS_RESOLVE_H */ diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 49123f458d0c..5381f05420bc 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -385,6 +385,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx new_ctx->source = NULL; new_ctx->iocharset = NULL; new_ctx->leaf_fullpath = NULL; + new_ctx->dns_dom = NULL; /* * Make sure to stay in sync with smb3_cleanup_fs_context_contents() */ @@ -399,6 +400,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(nodename); DUP_CTX_STR(iocharset); DUP_CTX_STR(leaf_fullpath); + DUP_CTX_STR(dns_dom); return 0; } @@ -1863,6 +1865,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) ctx->prepath = NULL; kfree(ctx->leaf_fullpath); ctx->leaf_fullpath = NULL; + kfree(ctx->dns_dom); + ctx->dns_dom = NULL; } void diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index ac6baa774ad3..8813533345ee 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -295,6 +295,7 @@ struct smb3_fs_context { bool dfs_automount:1; /* set for dfs automount only */ enum cifs_reparse_type reparse_type; bool dfs_conn:1; /* set for dfs mounts */ + char *dns_dom; }; extern const struct fs_parameter_spec smb3_fs_parameters[]; diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index c23d5ba44cae..0d483cd08354 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -1189,7 +1189,8 @@ int match_target_ip(struct TCP_Server_Info *server, cifs_dbg(FYI, "%s: target name: %s\n", __func__, target + 2); - rc = dns_resolve_server_name_to_ip(target, (struct sockaddr *)&ss, NULL); + rc = dns_resolve_server_name_to_ip(server->dns_dom, target, + (struct sockaddr *)&ss, NULL); kfree(target); if (rc < 0) -- cgit v1.2.3 From 489d1523105e14e7a2c6e68205e05397e3b714fb Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 3 Jan 2025 16:38:51 -0300 Subject: smb: client: parse DNS domain name from domain= option If the user specified a DNS domain name in domain= mount option, then use it instead of parsing it in NTLMSSP CHALLENGE_MESSAGE message. 
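For example, with an illustrative mount such as

    mount.cifs //srv/share /mnt -o username=user,domain=corp.example.com

the value "corp.example.com" cannot be a NetBIOS name (it contains dots), so it is also used as the DNS domain, while a short name such as domain=CORP is still treated as a NetBIOS domain only and the DNS domain keeps coming from the CHALLENGE_MESSAGE av pair. The server, share and domain names above are made up for the example.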
Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 9e16edf5c883..99c018aec2eb 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2279,12 +2279,13 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx __attribute__((unused)), struct cifs_ses * cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) { - int rc = 0; - int retries = 0; - unsigned int xid; - struct cifs_ses *ses; - struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + struct cifs_ses *ses; + unsigned int xid; + int retries = 0; + size_t len; + int rc = 0; xid = get_xid(); @@ -2374,6 +2375,14 @@ retry_old_session: ses->domainName = kstrdup(ctx->domainname, GFP_KERNEL); if (!ses->domainName) goto get_ses_fail; + + len = strnlen(ctx->domainname, CIFS_MAX_DOMAINNAME_LEN); + if (!cifs_netbios_name(ctx->domainname, len)) { + ses->dns_dom = kstrndup(ctx->domainname, + len, GFP_KERNEL); + if (!ses->dns_dom) + goto get_ses_fail; + } } strscpy(ses->workstation_name, ctx->workstation_name, sizeof(ses->workstation_name)); -- cgit v1.2.3 From 4b1b4c8be9dee9aa1a751cfa3954b2fcfdfe9c3d Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 7 Jan 2025 12:22:50 -0300 Subject: smb: client: provide dns_resolve_{unc,name} helpers Some places pass hostnames rather than UNC paths to resolve them to ip addresses, so provide helpers to handle both cases and then stop converting hostnames to UNC paths by inserting path delimiters into them. Also kill @expiry parameter as it's not used anywhere. 
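For illustration, callers end up with one of two shapes; the variables below are only for the example (the real call sites are in the diff):

    struct sockaddr_storage ss;
    int rc;

    /* resolve a bare hostname */
    rc = dns_resolve_name(server->dns_dom, server->hostname,
                          strlen(server->hostname),
                          (struct sockaddr *)&ss);

    /* resolve the server part of a UNC path such as \\srv\share */
    rc = dns_resolve_unc(server->dns_dom, unc,
                         (struct sockaddr *)&ss);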
Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifsproto.h | 2 +- fs/smb/client/connect.c | 27 +++++---------------------- fs/smb/client/dfs.c | 8 +++----- fs/smb/client/dfs_cache.c | 17 ++++------------- fs/smb/client/dns_resolve.c | 44 +++++++++++++++++--------------------------- fs/smb/client/dns_resolve.h | 24 ++++++++++++++++++++++-- fs/smb/client/misc.c | 23 +++++++---------------- 7 files changed, 59 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index d26f9bbb5382..c6c6acb9e206 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -680,7 +680,7 @@ static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, } int match_target_ip(struct TCP_Server_Info *server, - const char *share, size_t share_len, + const char *host, size_t hostlen, bool *result); int cifs_inval_name_dfs_link_error(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 99c018aec2eb..004fec33efab 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -72,10 +72,8 @@ static void cifs_prune_tlinks(struct work_struct *work); */ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) { - int rc; - int len; - char *unc; struct sockaddr_storage ss; + int rc; if (!server->hostname) return -EINVAL; @@ -84,33 +82,18 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) if (server->hostname[0] == '\0') return 0; - len = strlen(server->hostname) + 3; - - unc = kmalloc(len, GFP_KERNEL); - if (!unc) { - cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__); - return -ENOMEM; - } - scnprintf(unc, len, "\\\\%s", server->hostname); - spin_lock(&server->srv_lock); ss = server->dstaddr; spin_unlock(&server->srv_lock); - rc = dns_resolve_server_name_to_ip(server->dns_dom, unc, - (struct sockaddr *)&ss, NULL); - kfree(unc); - - if (rc < 0) { - cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n", - __func__, server->hostname, rc); - } else { + rc = dns_resolve_name(server->dns_dom, server->hostname, + strlen(server->hostname), + (struct sockaddr *)&ss); + if (!rc) { spin_lock(&server->srv_lock); memcpy(&server->dstaddr, &ss, sizeof(server->dstaddr)); spin_unlock(&server->srv_lock); - rc = 0; } - return rc; } diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index 09d8808cd2e0..b7de814e39bd 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -48,9 +48,8 @@ int dfs_parse_target_referral(const char *full_path, const struct dfs_info3_para if (rc) goto out; - rc = dns_resolve_server_name_to_ip(DFS_DOM(ctx), path, - (struct sockaddr *)&ctx->dstaddr, - NULL); + rc = dns_resolve_unc(DFS_DOM(ctx), path, + (struct sockaddr *)&ctx->dstaddr); out: kfree(path); return rc; @@ -268,8 +267,7 @@ static int update_fs_context_dstaddr(struct smb3_fs_context *ctx) int rc = 0; if (!ctx->nodfs && ctx->dfs_automount) { - rc = dns_resolve_server_name_to_ip(NULL, ctx->source, - addr, NULL); + rc = dns_resolve_unc(NULL, ctx->source, addr); if (!rc) cifs_set_port(addr, ctx->port); ctx->dfs_automount = false; diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index c0a167c869fb..72527623f433 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1096,11 +1096,8 @@ int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, static bool target_share_equal(struct cifs_tcon *tcon, const char *s1) { struct 
TCP_Server_Info *server = tcon->ses->server; - struct sockaddr_storage ss; - const char *host; const char *s2 = &tcon->tree_name[1]; - size_t hostlen; - char unc[sizeof("\\\\") + SERVER_NAME_LENGTH] = {0}; + struct sockaddr_storage ss; bool match; int rc; @@ -1111,19 +1108,13 @@ static bool target_share_equal(struct cifs_tcon *tcon, const char *s1) * Resolve share's hostname and check if server address matches. Otherwise just ignore it * as we could not have upcall to resolve hostname or failed to convert ip address. */ - extract_unc_hostname(s1, &host, &hostlen); - scnprintf(unc, sizeof(unc), "\\\\%.*s", (int)hostlen, host); - - rc = dns_resolve_server_name_to_ip(server->dns_dom, unc, - (struct sockaddr *)&ss, NULL); - if (rc < 0) { - cifs_dbg(FYI, "%s: could not resolve %.*s. assuming server address matches.\n", - __func__, (int)hostlen, host); + rc = dns_resolve_unc(server->dns_dom, s1, (struct sockaddr *)&ss); + if (rc < 0) return true; - } cifs_server_lock(server); match = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, (struct sockaddr *)&ss); + cifs_dbg(FYI, "%s: [share=%s] ipaddr matched: %s\n", __func__, s1, str_yes_no(match)); cifs_server_unlock(server); return match; diff --git a/fs/smb/client/dns_resolve.c b/fs/smb/client/dns_resolve.c index 83db27f9c8f1..de7f4b384718 100644 --- a/fs/smb/client/dns_resolve.c +++ b/fs/smb/client/dns_resolve.c @@ -20,21 +20,19 @@ #include "cifsproto.h" #include "cifs_debug.h" -static int resolve_name(const char *name, size_t namelen, - struct sockaddr *addr, time64_t *expiry) +static int resolve_name(const char *name, size_t namelen, struct sockaddr *addr) { char *ip; int rc; rc = dns_query(current->nsproxy->net_ns, NULL, name, - namelen, NULL, &ip, expiry, false); + namelen, NULL, &ip, NULL, false); if (rc < 0) { cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n", __func__, (int)namelen, (int)namelen, name); } else { - cifs_dbg(FYI, "%s: resolved: %*.*s to %s expiry %llu\n", - __func__, (int)namelen, (int)namelen, name, ip, - expiry ? (*expiry) : 0); + cifs_dbg(FYI, "%s: resolved: %*.*s to %s\n", + __func__, (int)namelen, (int)namelen, name, ip); rc = cifs_convert_address(addr, ip, strlen(ip)); kfree(ip); @@ -50,31 +48,23 @@ static int resolve_name(const char *name, size_t namelen, } /** - * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address. - * @dom: optional DNS domain name - * @unc: UNC path specifying the server (with '/' as delimiter) - * @ip_addr: Where to return the IP address. - * @expiry: Where to return the expiry time for the dns record. + * dns_resolve_name - Perform an upcall to resolve hostname to an ip address. + * @dom: DNS domain name (or NULL) + * @name: Name to look up + * @namelen: Length of name + * @ip_addr: Where to return the IP address * - * Returns zero success, -ve on error. + * Returns zero on success, -ve code otherwise. 
*/ -int dns_resolve_server_name_to_ip(const char *dom, const char *unc, - struct sockaddr *ip_addr, time64_t *expiry) +int dns_resolve_name(const char *dom, const char *name, + size_t namelen, struct sockaddr *ip_addr) { - const char *name; - size_t namelen, len; + size_t len; char *s; int rc; - if (!ip_addr || !unc) - return -EINVAL; - - cifs_dbg(FYI, "%s: dom=%s unc=%s\n", __func__, dom, unc); - if (strlen(unc) < 3) - return -EINVAL; - - extract_unc_hostname(unc, &name, &namelen); - if (!namelen) + cifs_dbg(FYI, "%s: dom=%s name=%.*s\n", __func__, dom, (int)namelen, name); + if (!ip_addr || !name || !*name || !namelen) return -EINVAL; cifs_dbg(FYI, "%s: hostname=%.*s\n", __func__, (int)namelen, name); @@ -97,10 +87,10 @@ int dns_resolve_server_name_to_ip(const char *dom, const char *unc, return -ENOMEM; scnprintf(s, len, "%.*s.%s", (int)namelen, name, dom); - rc = resolve_name(s, len - 1, ip_addr, expiry); + rc = resolve_name(s, len - 1, ip_addr); kfree(s); if (!rc) return 0; } - return resolve_name(name, namelen, ip_addr, expiry); + return resolve_name(name, namelen, ip_addr); } diff --git a/fs/smb/client/dns_resolve.h b/fs/smb/client/dns_resolve.h index 64c1dd2ad39b..0dc706f2c422 100644 --- a/fs/smb/client/dns_resolve.h +++ b/fs/smb/client/dns_resolve.h @@ -12,10 +12,30 @@ #define _DNS_RESOLVE_H #include +#include "cifsglob.h" +#include "cifsproto.h" #ifdef __KERNEL__ -int dns_resolve_server_name_to_ip(const char *dom, const char *unc, - struct sockaddr *ip_addr, time64_t *expiry); + +int dns_resolve_name(const char *dom, const char *name, + size_t namelen, struct sockaddr *ip_addr); + +static inline int dns_resolve_unc(const char *dom, const char *unc, + struct sockaddr *ip_addr) +{ + const char *name; + size_t namelen; + + if (!unc || strlen(unc) < 3) + return -EINVAL; + + extract_unc_hostname(unc, &name, &namelen); + if (!namelen) + return -EINVAL; + + return dns_resolve_name(dom, name, namelen, ip_addr); +} + #endif /* KERNEL */ #endif /* _DNS_RESOLVE_H */ diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 0d483cd08354..a6efd3644e6a 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -1172,34 +1172,25 @@ void cifs_put_tcp_super(struct super_block *sb) #ifdef CONFIG_CIFS_DFS_UPCALL int match_target_ip(struct TCP_Server_Info *server, - const char *share, size_t share_len, + const char *host, size_t hostlen, bool *result) { - int rc; - char *target; struct sockaddr_storage ss; + int rc; - *result = false; - - target = kzalloc(share_len + 3, GFP_KERNEL); - if (!target) - return -ENOMEM; - - scnprintf(target, share_len + 3, "\\\\%.*s", (int)share_len, share); - - cifs_dbg(FYI, "%s: target name: %s\n", __func__, target + 2); + cifs_dbg(FYI, "%s: hostname=%.*s\n", __func__, (int)hostlen, host); - rc = dns_resolve_server_name_to_ip(server->dns_dom, target, - (struct sockaddr *)&ss, NULL); - kfree(target); + *result = false; + rc = dns_resolve_name(server->dns_dom, host, hostlen, + (struct sockaddr *)&ss); if (rc < 0) return rc; spin_lock(&server->srv_lock); *result = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, (struct sockaddr *)&ss); spin_unlock(&server->srv_lock); - cifs_dbg(FYI, "%s: ip addresses match: %u\n", __func__, *result); + cifs_dbg(FYI, "%s: ip addresses matched: %s\n", __func__, str_yes_no(*result)); return 0; } -- cgit v1.2.3 From 5433c629e8d4eee77233a2bc88075886dc6e37ef Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 10 Jan 2025 15:58:08 -0300 Subject: smb: client: optimize referral walk on failed link targets If a link referral 
request sent to root server was successful but client failed to connect to all link targets, there is no need to retry same link referral on a different root server. Set an end marker for the DFS root referral so the client will not attempt to re-send link referrals to different root servers on failures. Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/dfs.c | 69 ++++++++++++++++++++--------------------------------- fs/smb/client/dfs.h | 44 +++++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index b7de814e39bd..dad521336b5e 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -98,15 +98,16 @@ static inline int parse_dfs_target(struct smb3_fs_context *ctx, return rc; } -static int setup_dfs_ref(struct cifs_mount_ctx *mnt_ctx, - struct dfs_info3_param *tgt, - struct dfs_ref_walk *rw) +static int setup_dfs_ref(struct dfs_info3_param *tgt, struct dfs_ref_walk *rw) { - struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; - struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb; + struct cifs_sb_info *cifs_sb = rw->mnt_ctx->cifs_sb; + struct smb3_fs_context *ctx = rw->mnt_ctx->fs_ctx; char *ref_path, *full_path; int rc; + set_root_smb_session(rw->mnt_ctx); + ref_walk_ses(rw) = ctx->dfs_root_ses; + full_path = smb3_fs_context_fullpath(ctx, CIFS_DIR_SEP(cifs_sb)); if (IS_ERR(full_path)) return PTR_ERR(full_path); @@ -123,35 +124,22 @@ static int setup_dfs_ref(struct cifs_mount_ctx *mnt_ctx, } ref_walk_path(rw) = ref_path; ref_walk_fpath(rw) = full_path; - ref_walk_ses(rw) = ctx->dfs_root_ses; - return 0; + + return dfs_get_referral(rw->mnt_ctx, + ref_walk_path(rw) + 1, + ref_walk_tl(rw)); } -static int __dfs_referral_walk(struct cifs_mount_ctx *mnt_ctx, - struct dfs_ref_walk *rw) +static int __dfs_referral_walk(struct dfs_ref_walk *rw) { - struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; + struct smb3_fs_context *ctx = rw->mnt_ctx->fs_ctx; + struct cifs_mount_ctx *mnt_ctx = rw->mnt_ctx; struct dfs_info3_param tgt = {}; int rc = -ENOENT; again: do { ctx->dfs_root_ses = ref_walk_ses(rw); - if (ref_walk_empty(rw)) { - rc = dfs_get_referral(mnt_ctx, ref_walk_path(rw) + 1, - NULL, ref_walk_tl(rw)); - if (rc) { - rc = cifs_mount_get_tcon(mnt_ctx); - if (!rc) - rc = cifs_is_path_remote(mnt_ctx); - continue; - } - if (!ref_walk_num_tgts(rw)) { - rc = -ENOENT; - continue; - } - } - while (ref_walk_next_tgt(rw)) { rc = parse_dfs_target(ctx, rw, &tgt); if (rc) @@ -162,32 +150,29 @@ again: if (rc) continue; - ref_walk_set_tgt_hint(rw); if (tgt.flags & DFSREF_STORAGE_SERVER) { rc = cifs_mount_get_tcon(mnt_ctx); if (!rc) rc = cifs_is_path_remote(mnt_ctx); - if (!rc) + if (!rc) { + ref_walk_set_tgt_hint(rw); break; + } if (rc != -EREMOTE) continue; } - set_root_smb_session(mnt_ctx); rc = ref_walk_advance(rw); if (!rc) { - rc = setup_dfs_ref(mnt_ctx, &tgt, rw); - if (!rc) { - rc = -EREMOTE; - goto again; - } + rc = setup_dfs_ref(&tgt, rw); + if (rc) + break; + ref_walk_mark_end(rw); + goto again; } - if (rc != -ELOOP) - goto out; } } while (rc && ref_walk_descend(rw)); -out: free_dfs_info_param(&tgt); return rc; } @@ -204,10 +189,10 @@ static int dfs_referral_walk(struct cifs_mount_ctx *mnt_ctx, return rc; } - ref_walk_init(*rw); - rc = setup_dfs_ref(mnt_ctx, NULL, *rw); + ref_walk_init(*rw, mnt_ctx); + rc = setup_dfs_ref(NULL, *rw); if (!rc) - rc = __dfs_referral_walk(mnt_ctx, *rw); + rc = __dfs_referral_walk(*rw); return rc; } @@ -297,7 +282,7 @@ int dfs_mount_share(struct 
cifs_mount_ctx *mnt_ctx) * to respond with PATH_NOT_COVERED to requests that include the prefix. */ if (!nodfs) { - rc = dfs_get_referral(mnt_ctx, ctx->UNC + 1, NULL, NULL); + rc = dfs_get_referral(mnt_ctx, ctx->UNC + 1, NULL); if (rc) { cifs_dbg(FYI, "%s: no dfs referral for %s: %d\n", __func__, ctx->UNC + 1, rc); @@ -317,10 +302,8 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) cifs_mount_put_conns(mnt_ctx); rc = get_session(mnt_ctx, NULL); } - if (!rc) { - set_root_smb_session(mnt_ctx); + if (!rc) rc = __dfs_mount_share(mnt_ctx); - } return rc; } diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h index 1aa2bc65b3bc..ed4cd7cf1ec6 100644 --- a/fs/smb/client/dfs.h +++ b/fs/smb/client/dfs.h @@ -12,6 +12,7 @@ #include "dfs_cache.h" #include "cifs_unicode.h" #include +#include #define DFS_INTERLINK(v) \ (((v) & DFSREF_REFERRAL_SERVER) && !((v) & DFSREF_STORAGE_SERVER)) @@ -25,8 +26,9 @@ struct dfs_ref { }; struct dfs_ref_walk { - struct dfs_ref *ref; - struct dfs_ref refs[MAX_NESTED_LINKS]; + struct cifs_mount_ctx *mnt_ctx; + struct dfs_ref *ref; + struct dfs_ref refs[MAX_NESTED_LINKS]; }; #define ref_walk_start(w) ((w)->refs) @@ -35,7 +37,6 @@ struct dfs_ref_walk { #define ref_walk_descend(w) (--ref_walk_cur(w) >= ref_walk_start(w)) #define ref_walk_tit(w) (ref_walk_cur(w)->tit) -#define ref_walk_empty(w) (!ref_walk_tit(w)) #define ref_walk_path(w) (ref_walk_cur(w)->path) #define ref_walk_fpath(w) (ref_walk_cur(w)->full_path) #define ref_walk_tl(w) (&ref_walk_cur(w)->tl) @@ -51,9 +52,11 @@ static inline struct dfs_ref_walk *ref_walk_alloc(void) return rw; } -static inline void ref_walk_init(struct dfs_ref_walk *rw) +static inline void ref_walk_init(struct dfs_ref_walk *rw, + struct cifs_mount_ctx *mnt_ctx) { memset(rw, 0, sizeof(*rw)); + rw->mnt_ctx = mnt_ctx; ref_walk_cur(rw) = ref_walk_start(rw); } @@ -93,15 +96,23 @@ static inline int ref_walk_advance(struct dfs_ref_walk *rw) static inline struct dfs_cache_tgt_iterator * ref_walk_next_tgt(struct dfs_ref_walk *rw) { - struct dfs_cache_tgt_iterator *tit; struct dfs_ref *ref = ref_walk_cur(rw); + struct dfs_cache_tgt_iterator *tit; + + if (IS_ERR(ref->tit)) + return NULL; if (!ref->tit) tit = dfs_cache_get_tgt_iterator(&ref->tl); else tit = dfs_cache_get_next_tgt(&ref->tl, ref->tit); + + if (!tit) { + ref->tit = ERR_PTR(-ENOENT); + return NULL; + } ref->tit = tit; - return tit; + return ref->tit; } static inline int ref_walk_get_tgt(struct dfs_ref_walk *rw, @@ -112,11 +123,6 @@ static inline int ref_walk_get_tgt(struct dfs_ref_walk *rw, ref_walk_tit(rw), tgt); } -static inline int ref_walk_num_tgts(struct dfs_ref_walk *rw) -{ - return dfs_cache_get_nr_tgts(ref_walk_tl(rw)); -} - static inline void ref_walk_set_tgt_hint(struct dfs_ref_walk *rw) { dfs_cache_noreq_update_tgthint(ref_walk_path(rw) + 1, @@ -136,6 +142,15 @@ static inline void ref_walk_set_tcon(struct dfs_ref_walk *rw, } } +static inline void ref_walk_mark_end(struct dfs_ref_walk *rw) +{ + struct dfs_ref *ref = ref_walk_cur(rw) - 1; + + WARN_ON_ONCE(ref < ref_walk_start(rw)); + dfs_cache_noreq_update_tgthint(ref->path + 1, ref->tit); + ref->tit = ERR_PTR(-ENOENT); /* end marker */ +} + int dfs_parse_target_referral(const char *full_path, const struct dfs_info3_param *ref, struct smb3_fs_context *ctx); int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx); @@ -145,15 +160,16 @@ static inline char *dfs_get_path(struct cifs_sb_info *cifs_sb, const char *path) return dfs_cache_canonical_path(path, cifs_sb->local_nls, cifs_remap(cifs_sb)); } -static inline int 
dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, const char *path, - struct dfs_info3_param *ref, struct dfs_cache_tgt_list *tl) +static inline int dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, + const char *path, + struct dfs_cache_tgt_list *tl) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb; struct cifs_ses *rses = ctx->dfs_root_ses ?: mnt_ctx->ses; return dfs_cache_find(mnt_ctx->xid, rses, cifs_sb->local_nls, - cifs_remap(cifs_sb), path, ref, tl); + cifs_remap(cifs_sb), path, NULL, tl); } /* -- cgit v1.2.3 From bfc1155030151912e7c5426449c7703dad45a890 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 10 Jan 2025 16:45:44 -0300 Subject: smb: client: fix return value of parse_dfs_referrals() Return -ENOENT in parse_dfs_referrals() when server returns no targets for a referral request as specified in MS-DFSC 3.1.5.4.3 Receiving a Root Referral Response or Link Referral Response: > If the referral request is successful, but the NumberOfReferrals > field in the referral header (as specified in section 2.2.4) is > 0, the DFS server could not find suitable targets to return to > the client. In this case, the client MUST fail the original I/O > operation with STATUS_OBJECT_PATH_NOT_FOUND. Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/misc.c | 6 +++--- fs/smb/client/smb2ops.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index a6efd3644e6a..b328dc5c7988 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -909,9 +909,9 @@ parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size, *num_of_nodes = le16_to_cpu(rsp->NumberOfReferrals); if (*num_of_nodes < 1) { - cifs_dbg(VFS, "num_referrals: must be at least > 0, but we get num_referrals = %d\n", - *num_of_nodes); - rc = -EINVAL; + cifs_dbg(VFS | ONCE, "%s: [path=%s] num_referrals must be at least > 0, but we got %d\n", + __func__, searchName, *num_of_nodes); + rc = -ENOENT; goto parse_DFS_referrals_exit; } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 87cb1872db28..0af4abf78a8e 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -3007,9 +3007,9 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, num_of_nodes, target_nodes, nls_codepage, remap, search_name, true /* is_unicode */); - if (rc) { - cifs_tcon_dbg(VFS, "parse error in %s rc=%d\n", __func__, rc); - goto out; + if (rc && rc != -ENOENT) { + cifs_tcon_dbg(VFS, "%s: failed to parse DFS referral %s: %d\n", + __func__, search_name, rc); } out: -- cgit v1.2.3 From 48aa99523e88e5792edc007e0c7f89faffacc5f7 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 13 Jan 2025 19:00:29 -0300 Subject: smb: client: don't retry DFS targets on server shutdown If TCP Server is about to be destroyed (e.g. CifsExiting was set) and it is reconnecting, stop retrying DFS targets from cached DFS referral as this would potentially delay server shutdown in several seconds. 
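The heart of the change is an early-exit check in the loop over DFS targets, roughly as below (see the diff for the exact context):

    spin_lock(&server->srv_lock);
    if (server->tcpStatus != CifsNeedReconnect) {
        spin_unlock(&server->srv_lock);
        return -ECONNRESET;
    }
    spin_unlock(&server->srv_lock);

so once the server has left CifsNeedReconnect (e.g. because it is exiting), no further targets are attempted.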
Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 004fec33efab..73d07d95d435 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -422,7 +422,8 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, } #ifdef CONFIG_CIFS_DFS_UPCALL -static int __reconnect_target_unlocked(struct TCP_Server_Info *server, const char *target) +static int __reconnect_target_locked(struct TCP_Server_Info *server, + const char *target) { int rc; char *hostname; @@ -455,27 +456,36 @@ static int __reconnect_target_unlocked(struct TCP_Server_Info *server, const cha return rc; } -static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_cache_tgt_list *tl, - struct dfs_cache_tgt_iterator **target_hint) +static int reconnect_target_locked(struct TCP_Server_Info *server, + struct dfs_cache_tgt_list *tl, + struct dfs_cache_tgt_iterator **target_hint) { - int rc; struct dfs_cache_tgt_iterator *tit; + int rc; *target_hint = NULL; /* If dfs target list is empty, then reconnect to last server */ tit = dfs_cache_get_tgt_iterator(tl); if (!tit) - return __reconnect_target_unlocked(server, server->hostname); + return __reconnect_target_locked(server, server->hostname); /* Otherwise, try every dfs target in @tl */ - for (; tit; tit = dfs_cache_get_next_tgt(tl, tit)) { - rc = __reconnect_target_unlocked(server, dfs_cache_get_tgt_name(tit)); + do { + const char *target = dfs_cache_get_tgt_name(tit); + + spin_lock(&server->srv_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&server->srv_lock); + return -ECONNRESET; + } + spin_unlock(&server->srv_lock); + rc = __reconnect_target_locked(server, target); if (!rc) { *target_hint = tit; break; } - } + } while ((tit = dfs_cache_get_next_tgt(tl, tit))); return rc; } @@ -518,7 +528,7 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) try_to_freeze(); cifs_server_lock(server); - rc = reconnect_target_unlocked(server, &tl, &target_hint); + rc = reconnect_target_locked(server, &tl, &target_hint); if (rc) { /* Failed to reconnect socket */ cifs_server_unlock(server); -- cgit v1.2.3 From 215b7f9ecb8d7c14d56febdcdd246f3579c32aba Mon Sep 17 00:00:00 2001 From: Liang Jie Date: Sat, 18 Jan 2025 20:35:28 +0800 Subject: smb: client: correctly handle ErrorContextData as a flexible array The `smb2_symlink_err_rsp` structure was previously defined with `ErrorContextData` as a single `__u8` byte. However, the `ErrorContextData` field is intended to be a variable-length array based on `ErrorDataLength`. This mismatch leads to incorrect pointer arithmetic and potential memory access issues when processing error contexts. Updates the `ErrorContextData` field to be a flexible array (`__u8 ErrorContextData[]`). Additionally, it modifies the corresponding casts in the `symlink_data()` function to properly handle the flexible array, ensuring correct memory calculations and data handling. These changes improve the robustness of SMB2 symlink error processing. 
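The effect on the pointer arithmetic can be seen in symlink_data(): with the old single-byte member the code had to take the address of the field, whereas the flexible array decays to a pointer by itself (both lines taken from the diff below):

    /* before: __u8 ErrorContextData; */
    p = (struct smb2_error_context_rsp *)((u8 *)&p->ErrorContextData + len);

    /* after: __u8 ErrorContextData[]; */
    p = (struct smb2_error_context_rsp *)(p->ErrorContextData + len);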
Signed-off-by: Liang Jie Suggested-by: Tom Talpey Signed-off-by: Steve French --- fs/smb/client/smb2file.c | 4 ++-- fs/smb/client/smb2pdu.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index e836bc2193dd..9ec44eab8dbc 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -42,14 +42,14 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) end = (struct smb2_error_context_rsp *)((u8 *)err + iov->iov_len); do { if (le32_to_cpu(p->ErrorId) == SMB2_ERROR_ID_DEFAULT) { - sym = (struct smb2_symlink_err_rsp *)&p->ErrorContextData; + sym = (struct smb2_symlink_err_rsp *)p->ErrorContextData; break; } cifs_dbg(FYI, "%s: skipping unhandled error context: 0x%x\n", __func__, le32_to_cpu(p->ErrorId)); len = ALIGN(le32_to_cpu(p->ErrorDataLength), 8); - p = (struct smb2_error_context_rsp *)((u8 *)&p->ErrorContextData + len); + p = (struct smb2_error_context_rsp *)(p->ErrorContextData + len); } while (p < end); } else if (le32_to_cpu(err->ByteCount) >= sizeof(*sym) && iov->iov_len >= SMB2_SYMLINK_STRUCT_SIZE) { diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h index 076d9e83e1a0..3c09a58dfd07 100644 --- a/fs/smb/client/smb2pdu.h +++ b/fs/smb/client/smb2pdu.h @@ -79,7 +79,7 @@ struct smb2_symlink_err_rsp { struct smb2_error_context_rsp { __le32 ErrorDataLength; __le32 ErrorId; - __u8 ErrorContextData; /* ErrorDataLength long array */ + __u8 ErrorContextData[] __counted_by_le(ErrorDataLength); } __packed; /* ErrorId values */ -- cgit v1.2.3 From be7a6a77669588bfa5022a470989702bbbb11e7f Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Thu, 16 Jan 2025 14:29:03 -0300 Subject: smb: client: fix oops due to unset link speed It isn't guaranteed that NETWORK_INTERFACE_INFO::LinkSpeed will always be set by the server, so the client must handle any values and then prevent oopses like below from happening: Oops: divide error: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 0 UID: 0 PID: 1323 Comm: cat Not tainted 6.13.0-rc7 #2 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-3.fc41 04/01/2014 RIP: 0010:cifs_debug_data_proc_show+0xa45/0x1460 [cifs] Code: 00 00 48 89 df e8 3b cd 1b c1 41 f6 44 24 2c 04 0f 84 50 01 00 00 48 89 ef e8 e7 d0 1b c1 49 8b 44 24 18 31 d2 49 8d 7c 24 28 <48> f7 74 24 18 48 89 c3 e8 6e cf 1b c1 41 8b 6c 24 28 49 8d 7c 24 RSP: 0018:ffffc90001817be0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff88811230022c RCX: ffffffffc041bd99 RDX: 0000000000000000 RSI: 0000000000000567 RDI: ffff888112300228 RBP: ffff888112300218 R08: fffff52000302f5f R09: ffffed1022fa58ac R10: ffff888117d2c566 R11: 00000000fffffffe R12: ffff888112300200 R13: 000000012a15343f R14: 0000000000000001 R15: ffff888113f2db58 FS: 00007fe27119e740(0000) GS:ffff888148600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe2633c5000 CR3: 0000000124da0000 CR4: 0000000000750ef0 PKRU: 55555554 Call Trace: ? __die_body.cold+0x19/0x27 ? die+0x2e/0x50 ? do_trap+0x159/0x1b0 ? cifs_debug_data_proc_show+0xa45/0x1460 [cifs] ? do_error_trap+0x90/0x130 ? cifs_debug_data_proc_show+0xa45/0x1460 [cifs] ? exc_divide_error+0x39/0x50 ? cifs_debug_data_proc_show+0xa45/0x1460 [cifs] ? asm_exc_divide_error+0x1a/0x20 ? cifs_debug_data_proc_show+0xa39/0x1460 [cifs] ? cifs_debug_data_proc_show+0xa45/0x1460 [cifs] ? seq_read_iter+0x42e/0x790 seq_read_iter+0x19a/0x790 proc_reg_read_iter+0xbe/0x110 ? __pfx_proc_reg_read_iter+0x10/0x10 vfs_read+0x469/0x570 ? 
do_user_addr_fault+0x398/0x760 ? __pfx_vfs_read+0x10/0x10 ? find_held_lock+0x8a/0xa0 ? __pfx_lock_release+0x10/0x10 ksys_read+0xd3/0x170 ? __pfx_ksys_read+0x10/0x10 ? __rcu_read_unlock+0x50/0x270 ? mark_held_locks+0x1a/0x90 do_syscall_64+0xbb/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fe271288911 Code: 00 48 8b 15 01 25 10 00 f7 d8 64 89 02 b8 ff ff ff ff eb bd e8 20 ad 01 00 f3 0f 1e fa 80 3d b5 a7 10 00 00 74 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 4f c3 66 0f 1f 44 00 00 55 48 89 e5 48 83 ec RSP: 002b:00007ffe87c079d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000040000 RCX: 00007fe271288911 RDX: 0000000000040000 RSI: 00007fe2633c6000 RDI: 0000000000000003 RBP: 00007ffe87c07a00 R08: 0000000000000000 R09: 00007fe2713e6380 R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000040000 R13: 00007fe2633c6000 R14: 0000000000000003 R15: 0000000000000000 Fix this by setting cifs_server_iface::speed to a sane value (1Gbps) by default when link speed is unset. Cc: Shyam Prasad N Cc: Tom Talpey Fixes: a6d8fb54a515 ("cifs: distribute channels across interfaces based on speed") Reported-by: Frank Sorenson Reported-by: Jay Shin Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 0af4abf78a8e..7d9a209fe6e0 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -658,7 +658,8 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, while (bytes_left >= (ssize_t)sizeof(*p)) { memset(&tmp_iface, 0, sizeof(tmp_iface)); - tmp_iface.speed = le64_to_cpu(p->LinkSpeed); + /* default to 1Gbps when link speed is unset */ + tmp_iface.speed = le64_to_cpu(p->LinkSpeed) ?: 1000000000; tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; tmp_iface.rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0; -- cgit v1.2.3 From 11f8b80ab9f99291dc88d09855b9f8f43b772335 Mon Sep 17 00:00:00 2001 From: Ruben Devos Date: Sat, 18 Jan 2025 21:03:30 +0100 Subject: smb: client: fix order of arguments of tracepoints The tracepoints based on smb3_inf_compound_*_class have tcon id and session id swapped around. This results in incorrect output in `trace-cmd report`. Fix the order of arguments to resolve this issue. The trace-cmd output below shows the before and after of the smb3_delete_enter and smb3_delete_done events as an example. The smb3_cmd_* events show the correct session and tcon id for reference. Also fix tracepoint set -> get in the SMB2_OP_GET_REPARSE case. BEFORE: rm-2211 [001] ..... 1839.550888: smb3_delete_enter: xid=281 sid=0x5 tid=0x3d path=\hello2.txt rm-2211 [001] ..... 1839.550894: smb3_cmd_enter: sid=0x1ac000000003d tid=0x5 cmd=5 mid=61 rm-2211 [001] ..... 1839.550896: smb3_cmd_enter: sid=0x1ac000000003d tid=0x5 cmd=6 mid=62 rm-2211 [001] ..... 1839.552091: smb3_cmd_done: sid=0x1ac000000003d tid=0x5 cmd=5 mid=61 rm-2211 [001] ..... 1839.552093: smb3_cmd_done: sid=0x1ac000000003d tid=0x5 cmd=6 mid=62 rm-2211 [001] ..... 1839.552103: smb3_delete_done: xid=281 sid=0x5 tid=0x3d AFTER: rm-2501 [001] ..... 3237.656110: smb3_delete_enter: xid=88 sid=0x1ac0000000041 tid=0x5 path=\hello2.txt rm-2501 [001] ..... 3237.656122: smb3_cmd_enter: sid=0x1ac0000000041 tid=0x5 cmd=5 mid=84 rm-2501 [001] ..... 3237.656123: smb3_cmd_enter: sid=0x1ac0000000041 tid=0x5 cmd=6 mid=85 rm-2501 [001] ..... 
3237.657909: smb3_cmd_done: sid=0x1ac0000000041 tid=0x5 cmd=5 mid=84 rm-2501 [001] ..... 3237.657909: smb3_cmd_done: sid=0x1ac0000000041 tid=0x5 cmd=6 mid=85 rm-2501 [001] ..... 3237.657922: smb3_delete_done: xid=88 sid=0x1ac0000000041 tid=0x5 Cc: stable@vger.kernel.org Signed-off-by: Ruben Devos Signed-off-by: Steve French --- fs/smb/client/dir.c | 6 +-- fs/smb/client/smb2inode.c | 108 +++++++++++++++++++++++----------------------- 2 files changed, 57 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index 864b194dbaa0..1822493dd084 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -627,7 +627,7 @@ int cifs_mknod(struct mnt_idmap *idmap, struct inode *inode, goto mknod_out; } - trace_smb3_mknod_enter(xid, tcon->ses->Suid, tcon->tid, full_path); + trace_smb3_mknod_enter(xid, tcon->tid, tcon->ses->Suid, full_path); rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon, full_path, mode, @@ -635,9 +635,9 @@ int cifs_mknod(struct mnt_idmap *idmap, struct inode *inode, mknod_out: if (rc) - trace_smb3_mknod_err(xid, tcon->ses->Suid, tcon->tid, rc); + trace_smb3_mknod_err(xid, tcon->tid, tcon->ses->Suid, rc); else - trace_smb3_mknod_done(xid, tcon->ses->Suid, tcon->tid); + trace_smb3_mknod_done(xid, tcon->tid, tcon->ses->Suid); free_dentry_path(page); free_xid(xid); diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index a55f0044d30b..274672755c19 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -298,8 +298,8 @@ replay_again: goto finished; } num_rqst++; - trace_smb3_query_info_compound_enter(xid, ses->Suid, - tcon->tid, full_path); + trace_smb3_query_info_compound_enter(xid, tcon->tid, + ses->Suid, full_path); break; case SMB2_OP_POSIX_QUERY_INFO: rqst[num_rqst].rq_iov = &vars->qi_iov; @@ -334,18 +334,18 @@ replay_again: goto finished; } num_rqst++; - trace_smb3_posix_query_info_compound_enter(xid, ses->Suid, - tcon->tid, full_path); + trace_smb3_posix_query_info_compound_enter(xid, tcon->tid, + ses->Suid, full_path); break; case SMB2_OP_DELETE: - trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path); + trace_smb3_delete_enter(xid, tcon->tid, ses->Suid, full_path); break; case SMB2_OP_MKDIR: /* * Directories are created through parameters in the * SMB2_open() call. 
*/ - trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path); + trace_smb3_mkdir_enter(xid, tcon->tid, ses->Suid, full_path); break; case SMB2_OP_RMDIR: rqst[num_rqst].rq_iov = &vars->si_iov[0]; @@ -363,7 +363,7 @@ replay_again: goto finished; smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); - trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path); + trace_smb3_rmdir_enter(xid, tcon->tid, ses->Suid, full_path); break; case SMB2_OP_SET_EOF: rqst[num_rqst].rq_iov = &vars->si_iov[0]; @@ -398,7 +398,7 @@ replay_again: goto finished; } num_rqst++; - trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); + trace_smb3_set_eof_enter(xid, tcon->tid, ses->Suid, full_path); break; case SMB2_OP_SET_INFO: rqst[num_rqst].rq_iov = &vars->si_iov[0]; @@ -429,8 +429,8 @@ replay_again: goto finished; } num_rqst++; - trace_smb3_set_info_compound_enter(xid, ses->Suid, - tcon->tid, full_path); + trace_smb3_set_info_compound_enter(xid, tcon->tid, + ses->Suid, full_path); break; case SMB2_OP_RENAME: rqst[num_rqst].rq_iov = &vars->si_iov[0]; @@ -469,7 +469,7 @@ replay_again: goto finished; } num_rqst++; - trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path); + trace_smb3_rename_enter(xid, tcon->tid, ses->Suid, full_path); break; case SMB2_OP_HARDLINK: rqst[num_rqst].rq_iov = &vars->si_iov[0]; @@ -496,7 +496,7 @@ replay_again: goto finished; smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); - trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path); + trace_smb3_hardlink_enter(xid, tcon->tid, ses->Suid, full_path); break; case SMB2_OP_SET_REPARSE: rqst[num_rqst].rq_iov = vars->io_iov; @@ -523,8 +523,8 @@ replay_again: goto finished; } num_rqst++; - trace_smb3_set_reparse_compound_enter(xid, ses->Suid, - tcon->tid, full_path); + trace_smb3_set_reparse_compound_enter(xid, tcon->tid, + ses->Suid, full_path); break; case SMB2_OP_GET_REPARSE: rqst[num_rqst].rq_iov = vars->io_iov; @@ -549,8 +549,8 @@ replay_again: goto finished; } num_rqst++; - trace_smb3_get_reparse_compound_enter(xid, ses->Suid, - tcon->tid, full_path); + trace_smb3_get_reparse_compound_enter(xid, tcon->tid, + ses->Suid, full_path); break; case SMB2_OP_QUERY_WSL_EA: rqst[num_rqst].rq_iov = &vars->ea_iov; @@ -656,11 +656,11 @@ finished: } SMB2_query_info_free(&rqst[num_rqst++]); if (rc) - trace_smb3_query_info_compound_err(xid, ses->Suid, - tcon->tid, rc); + trace_smb3_query_info_compound_err(xid, tcon->tid, + ses->Suid, rc); else - trace_smb3_query_info_compound_done(xid, ses->Suid, - tcon->tid); + trace_smb3_query_info_compound_done(xid, tcon->tid, + ses->Suid); break; case SMB2_OP_POSIX_QUERY_INFO: idata = in_iov[i].iov_base; @@ -683,15 +683,15 @@ finished: SMB2_query_info_free(&rqst[num_rqst++]); if (rc) - trace_smb3_posix_query_info_compound_err(xid, ses->Suid, - tcon->tid, rc); + trace_smb3_posix_query_info_compound_err(xid, tcon->tid, + ses->Suid, rc); else - trace_smb3_posix_query_info_compound_done(xid, ses->Suid, - tcon->tid); + trace_smb3_posix_query_info_compound_done(xid, tcon->tid, + ses->Suid); break; case SMB2_OP_DELETE: if (rc) - trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc); + trace_smb3_delete_err(xid, tcon->tid, ses->Suid, rc); else { /* * If dentry (hence, inode) is NULL, lease break is going to @@ -699,59 +699,59 @@ finished: */ if (inode) cifs_mark_open_handles_for_deleted_file(inode, full_path); - trace_smb3_delete_done(xid, ses->Suid, tcon->tid); + trace_smb3_delete_done(xid, tcon->tid, ses->Suid); } break; case 
SMB2_OP_MKDIR: if (rc) - trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc); + trace_smb3_mkdir_err(xid, tcon->tid, ses->Suid, rc); else - trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid); + trace_smb3_mkdir_done(xid, tcon->tid, ses->Suid); break; case SMB2_OP_HARDLINK: if (rc) - trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc); + trace_smb3_hardlink_err(xid, tcon->tid, ses->Suid, rc); else - trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid); + trace_smb3_hardlink_done(xid, tcon->tid, ses->Suid); SMB2_set_info_free(&rqst[num_rqst++]); break; case SMB2_OP_RENAME: if (rc) - trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc); + trace_smb3_rename_err(xid, tcon->tid, ses->Suid, rc); else - trace_smb3_rename_done(xid, ses->Suid, tcon->tid); + trace_smb3_rename_done(xid, tcon->tid, ses->Suid); SMB2_set_info_free(&rqst[num_rqst++]); break; case SMB2_OP_RMDIR: if (rc) - trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc); + trace_smb3_rmdir_err(xid, tcon->tid, ses->Suid, rc); else - trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid); + trace_smb3_rmdir_done(xid, tcon->tid, ses->Suid); SMB2_set_info_free(&rqst[num_rqst++]); break; case SMB2_OP_SET_EOF: if (rc) - trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc); + trace_smb3_set_eof_err(xid, tcon->tid, ses->Suid, rc); else - trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid); + trace_smb3_set_eof_done(xid, tcon->tid, ses->Suid); SMB2_set_info_free(&rqst[num_rqst++]); break; case SMB2_OP_SET_INFO: if (rc) - trace_smb3_set_info_compound_err(xid, ses->Suid, - tcon->tid, rc); + trace_smb3_set_info_compound_err(xid, tcon->tid, + ses->Suid, rc); else - trace_smb3_set_info_compound_done(xid, ses->Suid, - tcon->tid); + trace_smb3_set_info_compound_done(xid, tcon->tid, + ses->Suid); SMB2_set_info_free(&rqst[num_rqst++]); break; case SMB2_OP_SET_REPARSE: if (rc) { - trace_smb3_set_reparse_compound_err(xid, ses->Suid, - tcon->tid, rc); + trace_smb3_set_reparse_compound_err(xid, tcon->tid, + ses->Suid, rc); } else { - trace_smb3_set_reparse_compound_done(xid, ses->Suid, - tcon->tid); + trace_smb3_set_reparse_compound_done(xid, tcon->tid, + ses->Suid); } SMB2_ioctl_free(&rqst[num_rqst++]); break; @@ -764,18 +764,18 @@ finished: rbuf = reparse_buf_ptr(iov); if (IS_ERR(rbuf)) { rc = PTR_ERR(rbuf); - trace_smb3_set_reparse_compound_err(xid, ses->Suid, - tcon->tid, rc); + trace_smb3_get_reparse_compound_err(xid, tcon->tid, + ses->Suid, rc); } else { idata->reparse.tag = le32_to_cpu(rbuf->ReparseTag); - trace_smb3_set_reparse_compound_done(xid, ses->Suid, - tcon->tid); + trace_smb3_get_reparse_compound_done(xid, tcon->tid, + ses->Suid); } memset(iov, 0, sizeof(*iov)); resp_buftype[i + 1] = CIFS_NO_BUFFER; } else { - trace_smb3_set_reparse_compound_err(xid, ses->Suid, - tcon->tid, rc); + trace_smb3_get_reparse_compound_err(xid, tcon->tid, + ses->Suid, rc); } SMB2_ioctl_free(&rqst[num_rqst++]); break; @@ -792,11 +792,11 @@ finished: } } if (!rc) { - trace_smb3_query_wsl_ea_compound_done(xid, ses->Suid, - tcon->tid); + trace_smb3_query_wsl_ea_compound_done(xid, tcon->tid, + ses->Suid); } else { - trace_smb3_query_wsl_ea_compound_err(xid, ses->Suid, - tcon->tid, rc); + trace_smb3_query_wsl_ea_compound_err(xid, tcon->tid, + ses->Suid, rc); } SMB2_query_info_free(&rqst[num_rqst++]); break; -- cgit v1.2.3 From 72cf9e94f32fc18096be3cc21216a78f50220e03 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 19 Jan 2025 01:02:06 -0600 Subject: smb3: add missing tracepoint for querying wsl EAs We had tracepoints for the return code for querying WSL EAs 
(trace_smb3_query_wsl_ea_compound_err and trace_smb3_query_wsl_ea_compound_done) but were missing one for trace_smb3_query_wsl_ea_compound_enter. Fixes: ea41367b2a60 ("smb: client: introduce SMB2_OP_QUERY_WSL_EA") Signed-off-by: Steve French --- fs/smb/client/smb2inode.c | 2 ++ fs/smb/client/trace.h | 1 + 2 files changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 274672755c19..7d3685dd655a 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -584,6 +584,8 @@ replay_again: goto finished; } num_rqst++; + trace_smb3_query_wsl_ea_compound_enter(xid, tcon->tid, + ses->Suid, full_path); break; default: cifs_dbg(VFS, "Invalid command\n"); diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 12cbd3428a6d..52bcb55d9952 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -674,6 +674,7 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_wsl_ea_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter); -- cgit v1.2.3 From 015683d4ed0d23698c71f2194f09bd17dbfad044 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Thu, 2 Jan 2025 21:38:48 +0100 Subject: cifs: Use cifs_autodisable_serverino() for disabling CIFS_MOUNT_SERVER_INUM in readdir.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Everywhere else the cifs_autodisable_serverino() function is used for disabling the CIFS_MOUNT_SERVER_INUM mount flag. So use it also in the readdir.c _initiate_cifs_search() function. The benefit of cifs_autodisable_serverino() is that it also prints a dmesg message that server inode numbers are being disabled. Fixes: ec06aedd4454 ("cifs: clean up handling when server doesn't consistently support inode numbers") Fixes: f534dc994397 ("cifs: clear server inode number flag while autodisabling") Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/readdir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 273358d20a46..50f96259d9ad 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -413,7 +413,7 @@ ffirst_retry: cifsFile->invalidHandle = false; } else if ((rc == -EOPNOTSUPP) && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; + cifs_autodisable_serverino(cifs_sb); goto ffirst_retry; } error_exit: -- cgit v1.2.3 From 4e2ee328291585b7224ac8cc025625841ccfbf03 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Wed, 25 Dec 2024 15:54:22 +0100 Subject: cifs: Fix endian types in struct rfc1002_session_packet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All fields in struct rfc1002_session_packet are big endian. This is because all NetBIOS packet headers are big endian, as opposed to SMB structures, which are little endian. Therefore use __be16 and __be32 types instead of __u16 and __u32 in struct rfc1002_session_packet.
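For illustration only (not part of the patch): once the field is typed as __be16, sparse (make C=1) flags any access that skips the byte-order conversion, while the converted form below is accepted. The helper name is hypothetical and assumes the struct definition from fs/smb/client/rfc1002pdu.h.

	#include <linux/types.h>
	#include <asm/byteorder.h>

	/* Hypothetical helper: return the NetBIOS session packet length in
	 * host byte order.  Returning pkt->length directly would now draw a
	 * sparse warning about a restricted __be16 used as a plain integer. */
	static u16 rfc1002_session_length(const struct rfc1002_session_packet *pkt)
	{
		return be16_to_cpu(pkt->length);
	}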
Reported-by: kernel test robot Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/rfc1002pdu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/rfc1002pdu.h b/fs/smb/client/rfc1002pdu.h index ae1d025da294..ac82c2f3a4a2 100644 --- a/fs/smb/client/rfc1002pdu.h +++ b/fs/smb/client/rfc1002pdu.h @@ -24,7 +24,7 @@ struct rfc1002_session_packet { __u8 type; __u8 flags; - __u16 length; + __be16 length; union { struct { __u8 called_len; @@ -35,8 +35,8 @@ struct rfc1002_session_packet { __u8 scope2; /* null */ } __attribute__((packed)) session_req; struct { - __u32 retarget_ip_addr; - __u16 port; + __be32 retarget_ip_addr; + __be16 port; } __attribute__((packed)) retarget_resp; __u8 neg_ses_resp_error_code; /* POSITIVE_SESSION_RESPONSE packet does not include trailer. -- cgit v1.2.3 From 014fdae602c1dd23f967d539c8174d6aca689763 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Thu, 26 Dec 2024 17:47:58 +0100 Subject: cifs: Add missing NT_STATUS_* codes from nterr.h to nterr.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows cifs_print_status() to show a string representation for these error codes as well. Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/nterr.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/smb/client/nterr.c b/fs/smb/client/nterr.c index 358a766375b4..d396a8e98a81 100644 --- a/fs/smb/client/nterr.c +++ b/fs/smb/client/nterr.c @@ -13,6 +13,13 @@ const struct nt_err_code_struct nt_errs[] = { {"NT_STATUS_OK", NT_STATUS_OK}, + {"NT_STATUS_MEDIA_CHANGED", NT_STATUS_MEDIA_CHANGED}, + {"NT_STATUS_END_OF_MEDIA", NT_STATUS_END_OF_MEDIA}, + {"NT_STATUS_MEDIA_CHECK", NT_STATUS_MEDIA_CHECK}, + {"NT_STATUS_NO_DATA_DETECTED", NT_STATUS_NO_DATA_DETECTED}, + {"NT_STATUS_STOPPED_ON_SYMLINK", NT_STATUS_STOPPED_ON_SYMLINK}, + {"NT_STATUS_DEVICE_REQUIRES_CLEANING", NT_STATUS_DEVICE_REQUIRES_CLEANING}, + {"NT_STATUS_DEVICE_DOOR_OPEN", NT_STATUS_DEVICE_DOOR_OPEN}, {"NT_STATUS_UNSUCCESSFUL", NT_STATUS_UNSUCCESSFUL}, {"NT_STATUS_NOT_IMPLEMENTED", NT_STATUS_NOT_IMPLEMENTED}, {"NT_STATUS_INVALID_INFO_CLASS", NT_STATUS_INVALID_INFO_CLASS}, @@ -670,5 +677,6 @@ const struct nt_err_code_struct nt_errs[] = { {"NT_STATUS_NO_MORE_ENTRIES", NT_STATUS_NO_MORE_ENTRIES}, {"NT_STATUS_MORE_ENTRIES", NT_STATUS_MORE_ENTRIES}, {"NT_STATUS_SOME_UNMAPPED", NT_STATUS_SOME_UNMAPPED}, + {"NT_STATUS_NO_SUCH_JOB", NT_STATUS_NO_SUCH_JOB}, {NULL, 0} }; -- cgit v1.2.3 From 6d08851c601328025615d779c08753067ecefa34 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Thu, 26 Dec 2024 14:27:16 +0100 Subject: cifs: Fix printing Status code into dmesg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An NT Status code is a 32-bit number, so comparing two NT Status codes requires checking all 32 bits, not just the low 24 bits. Before this change the kernel printed the message: "Status code returned 0x8000002d NT_STATUS_NOT_COMMITTED" It was incorrect because NT_STATUS_NOT_COMMITTED is defined as 0xC000002d, while 0x8000002d has the defined name NT_STATUS_STOPPED_ON_SYMLINK.
With this change the kernel prints the message: "Status code returned 0x8000002d NT_STATUS_STOPPED_ON_SYMLINK" Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/netmisc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/netmisc.c b/fs/smb/client/netmisc.c index 2a8d71221e5e..17b3e21ea868 100644 --- a/fs/smb/client/netmisc.c +++ b/fs/smb/client/netmisc.c @@ -775,10 +775,10 @@ cifs_print_status(__u32 status_code) int idx = 0; while (nt_errs[idx].nt_errstr != NULL) { - if (((nt_errs[idx].nt_errcode) & 0xFFFFFF) == - (status_code & 0xFFFFFF)) { + if (nt_errs[idx].nt_errcode == status_code) { pr_notice("Status code returned 0x%08x %s\n", status_code, nt_errs[idx].nt_errstr); + return; } idx++; } -- cgit v1.2.3 From 4bda5f4de03883913e9d0a4ce9a495aa918051ac Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Wed, 25 Dec 2024 18:04:55 +0100 Subject: cifs: Remove declaration of dead CIFSSMBQuerySymLink function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function CIFSSMBQuerySymLink() was renamed to cifs_query_reparse_point() in commit ed3e0a149b58 ("smb: client: implement ->query_reparse_point() for SMB1"). Remove its dead declaration from the header file too. Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/cifsproto.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index c6c6acb9e206..223e5e231f42 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -474,9 +474,6 @@ extern int cifs_query_reparse_point(const unsigned int xid, const char *full_path, u32 *tag, struct kvec *rsp, int *rsp_buftype); -extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, - __u16 fid, char **symlinkinfo, - const struct nls_table *nls_codepage); extern int CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid); extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, -- cgit v1.2.3 From e20a405fe4f240d94f1baaa0eaf816cbf16ffb60 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 27 Dec 2024 16:04:38 +0100 Subject: cifs: Do not attempt to call CIFSSMBRenameOpenFile() without CAP_INFOLEVEL_PASSTHRU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CIFSSMBRenameOpenFile() uses the SMB_SET_FILE_RENAME_INFORMATION (0x3f2) level, which is an SMB PASSTHROUGH level (>= 0x03e8). SMB PASSTHROUGH levels are supported only when the server announces CAP_INFOLEVEL_PASSTHRU. Every use of CIFSSMBRenameOpenFile() except one is already guarded by checks which prevent calling it against servers without support for CAP_INFOLEVEL_PASSTHRU. The remaining unguarded use is in the cifs_do_rename() function, so add the missing guard here. Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index f146e06c97eb..93e9188b2632 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2392,6 +2392,13 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; + /* + * CIFSSMBRenameOpenFile() uses SMB_SET_FILE_RENAME_INFORMATION + * which is SMB PASSTHROUGH level.
+ */ + if (!(tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)) + goto do_rename_exit; + oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, -- cgit v1.2.3 From 10e6fe53d90b550876bffc5847107c5372c01084 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Sat, 28 Dec 2024 21:21:56 +0100 Subject: cifs: Do not attempt to call CIFSGetSrvInodeNumber() without CAP_INFOLEVEL_PASSTHRU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CIFSGetSrvInodeNumber() uses the SMB_QUERY_FILE_INTERNAL_INFO (0x3ee) level, which is an SMB PASSTHROUGH level (>= 0x03e8). SMB PASSTHROUGH levels are supported only when the server announces CAP_INFOLEVEL_PASSTHRU. So add a guard in the cifs_query_file_info() function, which is the only user of CIFSGetSrvInodeNumber(), and return -EOPNOTSUPP when the server does not announce CAP_INFOLEVEL_PASSTHRU. Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/smb1ops.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index db3695eddcf9..749a83cd0deb 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -614,7 +614,13 @@ static int cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, * There may be higher info levels that work but are there Windows * server or network appliances for which IndexNumber field is not * guaranteed unique? + * + * CIFSGetSrvInodeNumber() uses SMB_QUERY_FILE_INTERNAL_INFO + * which is SMB PASSTHROUGH level therefore check for capability. + * Note that this function can be called with tcon == NULL. */ + if (tcon && !(tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)) + return -EOPNOTSUPP; return CIFSGetSrvInodeNumber(xid, tcon, full_path, uniqueid, cifs_sb->local_nls, cifs_remap(cifs_sb)); -- cgit v1.2.3 From 2948f0d4db40d839db2c1d141121c2e4e12a6faa Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Sat, 28 Sep 2024 14:16:31 +0200 Subject: cifs: Remove duplicate struct reparse_symlink_data and SYMLINK_FLAG_RELATIVE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The file common/smb2pdu.h defines struct reparse_symlink_data_buffer, which is the same as struct reparse_symlink_data and is the one used throughout the code. So remove the duplicate struct reparse_symlink_data from client/cifspdu.h. The file common/smb2pdu.h also defines the SYMLINK_FLAG_RELATIVE constant, so remove that duplication from client/cifspdu.h.
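For illustration only (not part of the patch), a minimal sketch using the surviving definitions; the helper name is hypothetical and assumes the field layout of struct reparse_symlink_data_buffer and the SYMLINK_FLAG_RELATIVE constant as declared in common/smb2pdu.h.

	/* Hypothetical helper: does a symlink reparse buffer describe a
	 * relative target?  The Flags field is little endian on the wire. */
	static bool reparse_symlink_is_relative(const struct reparse_symlink_data_buffer *buf)
	{
		return (le32_to_cpu(buf->Flags) & SYMLINK_FLAG_RELATIVE) != 0;
	}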
Signed-off-by: Pali Rohár Acked-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifspdu.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index 17202754e6d0..5c047b00516f 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -1484,22 +1484,6 @@ struct file_notify_information { __u8 FileName[]; } __attribute__((packed)); -/* For IO_REPARSE_TAG_SYMLINK */ -struct reparse_symlink_data { - __le32 ReparseTag; - __le16 ReparseDataLength; - __u16 Reserved; - __le16 SubstituteNameOffset; - __le16 SubstituteNameLength; - __le16 PrintNameOffset; - __le16 PrintNameLength; - __le32 Flags; - char PathBuffer[]; -} __attribute__((packed)); - -/* Flag above */ -#define SYMLINK_FLAG_RELATIVE 0x00000001 - /* For IO_REPARSE_TAG_NFS */ #define NFS_SPECFILE_LNK 0x00000000014B4E4C #define NFS_SPECFILE_CHR 0x0000000000524843 -- cgit v1.2.3 From e02938613eb206ebf788e2d3d4fccf534e4ea12e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 17 Jan 2025 21:38:22 +0000 Subject: f2fs: avoid trying to get invalid block address In f2fs_new_inode(), if we fail to get a new inode, we call iput(), followed by f2fs_evict_inode(). If the inode is not marked as bad, it'll try to call f2fs_remove_inode_page() which tries to read the inode block for the given node id. But there's no block address allocated yet, which gives a chance to access a wrong block address if the block device has some garbage data in the NAT table. We need to make sure the NAT table has zero data for all the unallocated node ids, but it would also be better not to take this unnecessary path at all. Let's mark the failed inode as bad. Fixes: 0abd675e97e6 ("f2fs: support plain user/group quota") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 57d46e1439de..a278c7da8177 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -341,6 +341,7 @@ fail_drop: trace_f2fs_new_inode(inode, err); dquot_drop(inode); inode->i_flags |= S_NOQUOTA; + make_bad_inode(inode); if (nid_free) set_inode_flag(inode, FI_FREE_NID); clear_nlink(inode); -- cgit v1.2.3 From ead11ac50ad4b8ef1b64806e962ea984862d96ad Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 13 Jan 2025 17:29:03 -0500 Subject: nfs: fix incorrect error handling in LOCALIO nfs4_stat_to_errno() expects an NFSv4 error code as an argument and returns a POSIX errno. The problem is LOCALIO is passing nfs4_stat_to_errno() the POSIX errno return values from filp->f_op->read_iter(), filp->f_op->write_iter() and vfs_fsync_range(). So the POSIX errno that nfs_local_pgio_done() and nfs_local_commit_done() are passing to nfs4_stat_to_errno() is failing to match any NFSv4 error code, which results in nfs4_stat_to_errno() defaulting to returning -EREMOTEIO. This causes assertions in upper layers due to -EREMOTEIO not being a valid NFSv4 error code. Fix this by updating nfs_local_pgio_done() and nfs_local_commit_done() to use the new nfs_localio_errno_to_nfs4_stat() to map a POSIX errno to an NFSv4 error code. Care was taken to factor out nfs4_errtbl_common[] to avoid duplicating the same NFS-error-to-errno table. nfs4_errtbl_common[] is checked first by both nfs4_stat_to_errno and nfs_localio_errno_to_nfs4_stat before they check their own more specialized tables (nfs4_errtbl[] and nfs4_errtbl_localio[] respectively).
While auditing the associated error mapping tables, the (ab)use of -1 for the last table entry was removed in favor of using ARRAY_SIZE to iterate the nfs_errtbl[] and nfs4_errtbl[]. And 'errno_NFSERR_IO' was removed because it caused needless obfuscation. Fixes: 70ba381e1a431 ("nfs: add LOCALIO support") Reported-by: Trond Myklebust Signed-off-by: Mike Snitzer Signed-off-by: Anna Schumaker --- fs/nfs/localio.c | 4 +-- fs/nfs_common/common.c | 89 ++++++++++++++++++++++++++++++++++++++++------ include/linux/nfs_common.h | 3 +- 3 files changed, 82 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 1eee5aac2884..5c21caeae075 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -388,7 +388,7 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) hdr->res.op_status = NFS4_OK; hdr->task.tk_status = 0; } else { - hdr->res.op_status = nfs4_stat_to_errno(status); + hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status); hdr->task.tk_status = status; } } @@ -786,7 +786,7 @@ nfs_local_commit_done(struct nfs_commit_data *data, int status) data->task.tk_status = 0; } else { nfs_reset_boot_verifier(data->inode); - data->res.op_status = nfs4_stat_to_errno(status); + data->res.op_status = nfs_localio_errno_to_nfs4_stat(status); data->task.tk_status = status; } } diff --git a/fs/nfs_common/common.c b/fs/nfs_common/common.c index 34a115176f97..af09aed09fd2 100644 --- a/fs/nfs_common/common.c +++ b/fs/nfs_common/common.c @@ -15,7 +15,7 @@ static const struct { { NFS_OK, 0 }, { NFSERR_PERM, -EPERM }, { NFSERR_NOENT, -ENOENT }, - { NFSERR_IO, -errno_NFSERR_IO}, + { NFSERR_IO, -EIO }, { NFSERR_NXIO, -ENXIO }, /* { NFSERR_EAGAIN, -EAGAIN }, */ { NFSERR_ACCES, -EACCES }, @@ -45,7 +45,6 @@ static const struct { { NFSERR_SERVERFAULT, -EREMOTEIO }, { NFSERR_BADTYPE, -EBADTYPE }, { NFSERR_JUKEBOX, -EJUKEBOX }, - { -1, -EIO } }; /** @@ -59,26 +58,29 @@ int nfs_stat_to_errno(enum nfs_stat status) { int i; - for (i = 0; nfs_errtbl[i].stat != -1; i++) { + for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { if (nfs_errtbl[i].stat == (int)status) return nfs_errtbl[i].errno; } - return nfs_errtbl[i].errno; + return -EIO; } EXPORT_SYMBOL_GPL(nfs_stat_to_errno); /* * We need to translate between nfs v4 status return values and * the local errno values which may not be the same. + * + * nfs4_errtbl_common[] is used before more specialized mappings + * available in nfs4_errtbl[] or nfs4_errtbl_localio[]. 
*/ static const struct { int stat; int errno; -} nfs4_errtbl[] = { +} nfs4_errtbl_common[] = { { NFS4_OK, 0 }, { NFS4ERR_PERM, -EPERM }, { NFS4ERR_NOENT, -ENOENT }, - { NFS4ERR_IO, -errno_NFSERR_IO}, + { NFS4ERR_IO, -EIO }, { NFS4ERR_NXIO, -ENXIO }, { NFS4ERR_ACCESS, -EACCES }, { NFS4ERR_EXIST, -EEXIST }, @@ -98,15 +100,20 @@ static const struct { { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, { NFS4ERR_NOTSUPP, -ENOTSUPP }, { NFS4ERR_TOOSMALL, -ETOOSMALL }, - { NFS4ERR_SERVERFAULT, -EREMOTEIO }, { NFS4ERR_BADTYPE, -EBADTYPE }, - { NFS4ERR_LOCKED, -EAGAIN }, { NFS4ERR_SYMLINK, -ELOOP }, - { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, { NFS4ERR_DEADLOCK, -EDEADLK }, +}; + +static const struct { + int stat; + int errno; +} nfs4_errtbl[] = { + { NFS4ERR_SERVERFAULT, -EREMOTEIO }, + { NFS4ERR_LOCKED, -EAGAIN }, + { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, { NFS4ERR_NOXATTR, -ENODATA }, { NFS4ERR_XATTR2BIG, -E2BIG }, - { -1, -EIO } }; /* @@ -116,7 +123,14 @@ static const struct { int nfs4_stat_to_errno(int stat) { int i; - for (i = 0; nfs4_errtbl[i].stat != -1; i++) { + + /* First check nfs4_errtbl_common */ + for (i = 0; i < ARRAY_SIZE(nfs4_errtbl_common); i++) { + if (nfs4_errtbl_common[i].stat == stat) + return nfs4_errtbl_common[i].errno; + } + /* Then check nfs4_errtbl */ + for (i = 0; i < ARRAY_SIZE(nfs4_errtbl); i++) { if (nfs4_errtbl[i].stat == stat) return nfs4_errtbl[i].errno; } @@ -132,3 +146,56 @@ int nfs4_stat_to_errno(int stat) return -stat; } EXPORT_SYMBOL_GPL(nfs4_stat_to_errno); + +/* + * This table is useful for conversion from local errno to NFS error. + * It provides more logically correct mappings for use with LOCALIO + * (which is focused on converting from errno to NFS status). + */ +static const struct { + int stat; + int errno; +} nfs4_errtbl_localio[] = { + /* Map errors differently than nfs4_errtbl */ + { NFS4ERR_IO, -EREMOTEIO }, + { NFS4ERR_DELAY, -EAGAIN }, + { NFS4ERR_FBIG, -E2BIG }, + /* Map errors not handled by nfs4_errtbl */ + { NFS4ERR_STALE, -EBADF }, + { NFS4ERR_STALE, -EOPENSTALE }, + { NFS4ERR_DELAY, -ETIMEDOUT }, + { NFS4ERR_DELAY, -ERESTARTSYS }, + { NFS4ERR_DELAY, -ENOMEM }, + { NFS4ERR_IO, -ETXTBSY }, + { NFS4ERR_IO, -EBUSY }, + { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, + { NFS4ERR_SERVERFAULT, -ENFILE }, + { NFS4ERR_IO, -EUCLEAN }, + { NFS4ERR_PERM, -ENOKEY }, +}; + +/* + * Convert an errno to an NFS error code for LOCALIO. + */ +__u32 nfs_localio_errno_to_nfs4_stat(int errno) +{ + int i; + + /* First check nfs4_errtbl_common */ + for (i = 0; i < ARRAY_SIZE(nfs4_errtbl_common); i++) { + if (nfs4_errtbl_common[i].errno == errno) + return nfs4_errtbl_common[i].stat; + } + /* Then check nfs4_errtbl_localio */ + for (i = 0; i < ARRAY_SIZE(nfs4_errtbl_localio); i++) { + if (nfs4_errtbl_localio[i].errno == errno) + return nfs4_errtbl_localio[i].stat; + } + /* If we cannot translate the error, the recovery routines should + * handle it. + * Note: remaining NFSv4 error codes have values > 10000, so should + * not conflict with native Linux error codes. + */ + return NFS4ERR_SERVERFAULT; +} +EXPORT_SYMBOL_GPL(nfs_localio_errno_to_nfs4_stat); diff --git a/include/linux/nfs_common.h b/include/linux/nfs_common.h index 5fc02df88252..a541c3a02887 100644 --- a/include/linux/nfs_common.h +++ b/include/linux/nfs_common.h @@ -9,9 +9,10 @@ #include /* Mapping from NFS error code to "errno" error code. 
*/ -#define errno_NFSERR_IO EIO int nfs_stat_to_errno(enum nfs_stat status); int nfs4_stat_to_errno(int stat); +__u32 nfs_localio_errno_to_nfs4_stat(int errno); + #endif /* _LINUX_NFS_COMMON_H */ -- cgit v1.2.3 From 90190ba1c3b11687e2c251fda1f5d9893b4bab17 Mon Sep 17 00:00:00 2001 From: Dragan Simic Date: Fri, 27 Dec 2024 20:17:58 +0100 Subject: nfs: Make NFS_FSCACHE select NETFS_SUPPORT instead of depending on it Having the NFS_FSCACHE option depend on the NETFS_SUPPORT options makes selecting NFS_FSCACHE impossible unless another option that additionally selects NETFS_SUPPORT is already selected. As a result, for example, being able to reach and select the NFS_FSCACHE option requires the CEPH_FS or CIFS option to be selected beforehand, which obviously doesn't make much sense. Let's correct this by making the NFS_FSCACHE option actually select the NETFS_SUPPORT option, instead of depending on it. Fixes: 915cd30cdea8 ("netfs, fscache: Combine fscache with netfs") Cc: stable@vger.kernel.org Reported-by: Diederik de Haas Signed-off-by: Dragan Simic Signed-off-by: Anna Schumaker --- fs/nfs/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 0eb20012792f..d3f76101ad4b 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -170,7 +170,8 @@ config ROOT_NFS config NFS_FSCACHE bool "Provide NFS client caching support" - depends on NFS_FS=m && NETFS_SUPPORT || NFS_FS=y && NETFS_SUPPORT=y + depends on NFS_FS + select NETFS_SUPPORT select FSCACHE help Say Y here if you want NFS data to be cached locally on disc through -- cgit v1.2.3 From d2fc83c5df63f1391006ff7f734bfb794c76badd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 13 Jan 2025 10:32:36 -0500 Subject: NFS: CB_OFFLOAD can return NFS4ERR_DELAY RFC 7862 permits the callback service to respond to a CB_OFFLOAD operation with NFS4ERR_DELAY. Use that instead of NFS4ERR_SERVERFAULT for temporary memory allocation failure, as that is more consistent with how other operations report memory allocation failure. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/callback_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 7832fb0369a1..8397c43358bd 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -718,7 +718,7 @@ __be32 nfs4_callback_offload(void *data, void *dummy, copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL); if (!copy) - return htonl(NFS4ERR_SERVERFAULT); + return cpu_to_be32(NFS4ERR_DELAY); spin_lock(&cps->clp->cl_lock); rcu_read_lock(); -- cgit v1.2.3 From 36f4e9ef84bb334392628bcc21557ddad7dc8e5f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 13 Jan 2025 10:32:37 -0500 Subject: NFS: Fix typo in OFFLOAD_CANCEL comment Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/nfs42xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 9e3ae53e2205..ef5730c5e704 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -549,7 +549,7 @@ static void nfs4_xdr_enc_copy(struct rpc_rqst *req, } /* - * Encode OFFLOAD_CANEL request + * Encode OFFLOAD_CANCEL request */ static void nfs4_xdr_enc_offload_cancel(struct rpc_rqst *req, struct xdr_stream *xdr, -- cgit v1.2.3 From aa9e4ad0fd87f44748af84616accbd42cfb875aa Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 13 Jan 2025 10:32:38 -0500 Subject: NFS: Rename struct nfs4_offloadcancel_data Refactor: This struct can be used unchanged for the new OFFLOAD_STATUS implementation, so give it a more generic name. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/nfs42proc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 531c9c20ef1d..9d716907cf30 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -498,15 +498,15 @@ out_put_src_lock: return err; } -struct nfs42_offloadcancel_data { +struct nfs42_offload_data { struct nfs_server *seq_server; struct nfs42_offload_status_args args; struct nfs42_offload_status_res res; }; -static void nfs42_offload_cancel_prepare(struct rpc_task *task, void *calldata) +static void nfs42_offload_prepare(struct rpc_task *task, void *calldata) { - struct nfs42_offloadcancel_data *data = calldata; + struct nfs42_offload_data *data = calldata; nfs4_setup_sequence(data->seq_server->nfs_client, &data->args.osa_seq_args, @@ -515,7 +515,7 @@ static void nfs42_offload_cancel_prepare(struct rpc_task *task, void *calldata) static void nfs42_offload_cancel_done(struct rpc_task *task, void *calldata) { - struct nfs42_offloadcancel_data *data = calldata; + struct nfs42_offload_data *data = calldata; trace_nfs4_offload_cancel(&data->args, task->tk_status); nfs41_sequence_done(task, &data->res.osr_seq_res); @@ -525,22 +525,22 @@ static void nfs42_offload_cancel_done(struct rpc_task *task, void *calldata) rpc_restart_call_prepare(task); } -static void nfs42_free_offloadcancel_data(void *data) +static void nfs42_offload_release(void *data) { kfree(data); } static const struct rpc_call_ops nfs42_offload_cancel_ops = { - .rpc_call_prepare = nfs42_offload_cancel_prepare, + .rpc_call_prepare = nfs42_offload_prepare, .rpc_call_done = nfs42_offload_cancel_done, - .rpc_release = nfs42_free_offloadcancel_data, + 
.rpc_release = nfs42_offload_release, }; static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *stateid) { struct nfs_server *dst_server = NFS_SERVER(file_inode(dst)); - struct nfs42_offloadcancel_data *data = NULL; + struct nfs42_offload_data *data = NULL; struct nfs_open_context *ctx = nfs_file_open_context(dst); struct rpc_task *task; struct rpc_message msg = { @@ -559,7 +559,7 @@ static int nfs42_do_offload_cancel_async(struct file *dst, if (!(dst_server->caps & NFS_CAP_OFFLOAD_CANCEL)) return -EOPNOTSUPP; - data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_KERNEL); + data = kzalloc(sizeof(struct nfs42_offload_data), GFP_KERNEL); if (data == NULL) return -ENOMEM; -- cgit v1.2.3 From e8380c2d06055665b3df6c03964911375d7f9290 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 13 Dec 2024 11:52:00 -0500 Subject: NFSv4.2: fix COPY_NOTIFY xdr buf size calculation We need to include sequence size in the compound. Fixes: 0491567b51ef ("NFS: add COPY_NOTIFY operation") Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs42xdr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index ef5730c5e704..5072d7ea72e9 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -144,9 +144,11 @@ decode_putfh_maxsz + \ decode_offload_cancel_maxsz) #define NFS4_enc_copy_notify_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_copy_notify_maxsz) #define NFS4_dec_copy_notify_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_copy_notify_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ -- cgit v1.2.3 From 668135b9348c53fd205f5e07d11e82b10f31b55b Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 13 Dec 2024 11:52:01 -0500 Subject: NFSv4.2: mark OFFLOAD_CANCEL MOVEABLE OFFLOAD_CANCEL should be marked MOVEABLE for when we need to move tasks off a non-functional transport. Fixes: c975c2092657 ("NFS send OFFLOAD_CANCEL when COPY killed") Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs42proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 9d716907cf30..f01a42965254 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -552,7 +552,7 @@ static int nfs42_do_offload_cancel_async(struct file *dst, .rpc_message = &msg, .callback_ops = &nfs42_offload_cancel_ops, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE, }; int status; -- cgit v1.2.3 From 0b96c75d86560c077d29ce629e9d1ca0258c9cc1 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 13 Dec 2024 11:52:02 -0500 Subject: NFSv4.2: make LAYOUTSTATS and LAYOUTERROR MOVEABLE LAYOUTSTATS and LAYOUTERROR should be marked MOVEABLE for when we need to move tasks off a non-functional transport. 
Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs42proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index f01a42965254..1924c4a2077b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -861,7 +861,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, .rpc_message = &msg, .callback_ops = &nfs42_layoutstat_ops, .callback_data = data, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE, }; struct rpc_task *task; @@ -1016,7 +1016,7 @@ int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, struct rpc_task_setup task_setup = { .rpc_message = &msg, .callback_ops = &nfs42_layouterror_ops, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE, }; unsigned int i; -- cgit v1.2.3 From a8581752278ef9d6ea73bb10eef2657bfb73e116 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Jan 2025 02:26:13 -0500 Subject: bcachefs: Fix btree_trans_peek_key_cache() BTREE_ITER_cached_nofill has some tricky corner cases; it's used internally for iterators that aren't walking the key cache, but need to be coherent with the key cache. It tells traverse to look up and lock the key cache entry if present, but don't create one if it doesn't exist. That means we have to have a BTREE_ITER_UPTODATE path (because after traverse the path has to be UPTODATE, or we pop assertions) that doesn't point to anything (which is the less bad option, taken by the previous fix). The previous fix for this path missed an issue that can happen in bch2_trans_peek_key_cache(): we can't set should_be_locked on a path that doesn't point to anything and doesn't hold locks. Fixes: bd5b09727f3d ("bcachefs: Don't set btree_path to updtodate if we don't fill") Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 +-- fs/bcachefs/btree_key_cache.c | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 367231ab1980..5988219c6908 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2239,8 +2239,6 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (unlikely(ret)) return bkey_s_c_err(ret); - btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); - k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); if (!k.k) return k; @@ -2251,6 +2249,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos iter->k = u; k.k = &iter->k; + btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); return k; } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 3b62296c3100..c378b97ebeca 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -291,8 +291,10 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, unsigned flags) { - if (flags & BTREE_ITER_cached_nofill) + if (flags & BTREE_ITER_cached_nofill) { + ck_path->l[0].b = NULL; return 0; + } struct bch_fs *c = trans->c; struct btree_iter iter; -- cgit v1.2.3 From f917016f69ba5a0c42a3a3327792707315f2a937 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Jan 2025 12:56:00 -0500 Subject: bcachefs: Reduce stack frame size of __bch2_str_hash_check_key() We don't need all the helpers inlined here. 
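As background, a generic sketch of the technique (not bcachefs code, all names hypothetical): marking a helper with large locals as noinline keeps its buffer in the helper's own stack frame for the duration of the call, instead of letting the compiler inline it and fold those bytes into every caller's frame.

	#include <linux/types.h>
	#include <linux/kernel.h>
	#include <linux/string.h>

	/* Illustration only: the 256-byte scratch buffer is accounted to this
	 * helper's frame, not to the (possibly already deep) caller's frame. */
	static noinline ssize_t format_key(char *out, size_t outlen, u64 keyval)
	{
		char scratch[256];

		snprintf(scratch, sizeof(scratch), "key=%llu",
			 (unsigned long long)keyval);
		return strscpy(out, scratch, outlen);
	}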
Signed-off-by: Kent Overstreet --- fs/bcachefs/str_hash.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index 8c2c5539de2e..d78451c2a0c6 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -31,11 +31,11 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir } } -static int fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old) +static noinline int fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old) { struct qstr old_name = bch2_dirent_get_name(old); struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); @@ -71,11 +71,11 @@ static int fsck_rename_dirent(struct btree_trans *trans, return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); } -static int hash_pick_winner(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c k1, - struct bkey_s_c k2) +static noinline int hash_pick_winner(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c k1, + struct bkey_s_c k2) { if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) @@ -142,8 +142,8 @@ fsck_err: * All versions of the same inode in different snapshots must have the same hash * seed/type: verify that the hash info we're using matches the root */ -static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, - struct bch_hash_info *hash_info) +static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, + struct bch_hash_info *hash_info) { struct bch_fs *c = trans->c; struct btree_iter iter; -- cgit v1.2.3 From 531503054e8fe9f4502fff0dceba20dfaa9920d5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:13:53 -0500 Subject: nfsd: fix handling of delegated change attr in CB_GETATTR RFC8881, section 10.4.3 has some specific guidance as to how the delegated change attribute should be handled. We currently don't follow that guidance properly. In particular, when the file is modified, the server always reports the initial change attribute + 1. Section 10.4.3 however indicates that it should be incremented on every GETATTR request from other clients. Only request the change attribute until the file has been modified. If there is an outstanding delegation, then increment the cached change attribute on every GETATTR. 
Fixes: 6487a13b5c6b ("NFSD: add support for CB_GETATTR callback") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 8 +++++--- fs/nfsd/nfs4xdr.c | 15 +++++++++------ 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index a8e33f561bb3..1457f61ae051 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -361,12 +361,14 @@ static void encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr, struct nfs4_cb_fattr *fattr) { - struct nfs4_delegation *dp = - container_of(fattr, struct nfs4_delegation, dl_cb_fattr); + struct nfs4_delegation *dp = container_of(fattr, struct nfs4_delegation, dl_cb_fattr); struct knfsd_fh *fh = &dp->dl_stid.sc_file->fi_fhandle; + struct nfs4_cb_fattr *ncf = &dp->dl_cb_fattr; u32 bmap[1]; - bmap[0] = FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE; + bmap[0] = FATTR4_WORD0_SIZE; + if (!ncf->ncf_file_modified) + bmap[0] |= FATTR4_WORD0_CHANGE; encode_nfs_cb_opnum4(xdr, OP_CB_GETATTR); encode_nfs_fh4(xdr, fh); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 8d373d2edff1..31270e1a40ed 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2920,6 +2920,7 @@ struct nfsd4_fattr_args { struct kstat stat; struct kstatfs statfs; struct nfs4_acl *acl; + u64 change_attr; #ifdef CONFIG_NFSD_V4_SECURITY_LABEL void *context; int contextlen; @@ -3019,7 +3020,6 @@ static __be32 nfsd4_encode_fattr4_change(struct xdr_stream *xdr, const struct nfsd4_fattr_args *args) { const struct svc_export *exp = args->exp; - u64 c; if (unlikely(exp->ex_flags & NFSEXP_V4ROOT)) { u32 flush_time = convert_to_wallclock(exp->cd->flush_time); @@ -3030,9 +3030,7 @@ static __be32 nfsd4_encode_fattr4_change(struct xdr_stream *xdr, return nfserr_resource; return nfs_ok; } - - c = nfsd4_change_attribute(&args->stat); - return nfsd4_encode_changeid4(xdr, c); + return nfsd4_encode_changeid4(xdr, args->change_attr); } static __be32 nfsd4_encode_fattr4_size(struct xdr_stream *xdr, @@ -3557,11 +3555,16 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (dp) { struct nfs4_cb_fattr *ncf = &dp->dl_cb_fattr; - if (ncf->ncf_file_modified) + if (ncf->ncf_file_modified) { + ++ncf->ncf_initial_cinfo; args.stat.size = ncf->ncf_cur_fsize; - + } + args.change_attr = ncf->ncf_initial_cinfo; nfs4_put_stid(&dp->dl_stid); + } else { + args.change_attr = nfsd4_change_attribute(&args.stat); } + if (err) goto out_nfserr; -- cgit v1.2.3 From 8e1d32273ab7d06b6f78771e05824bfab01141f4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:13:54 -0500 Subject: nfs_common: make include/linux/nfs4.h include generated nfs4_1.h In the long run, the NFS development community intends to autogenerate a lot of the XDR handling code. Both the NFS client and server include "include/linux/nfs4.hi". That file was hand-rolled, and some of the symbols in it conflict with the autogenerated symbols. Add a small nfs4_1.x to Documentation that currently just has the necessary definitions for the delstid draft, and generate the relevant header and source files. Make include/linux/nfs4.h include the generated include/linux/sunrpc/xdrgen/nfs4_1.h and remove the conflicting definitions from it and nfs_xdr.h. 
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- Documentation/sunrpc/xdr/nfs4_1.x | 166 ++++++++++++++++++++++++ fs/nfsd/Makefile | 16 ++- fs/nfsd/nfs4xdr_gen.c | 239 +++++++++++++++++++++++++++++++++++ fs/nfsd/nfs4xdr_gen.h | 25 ++++ include/linux/nfs4.h | 7 +- include/linux/nfs_xdr.h | 5 - include/linux/sunrpc/xdrgen/nfs4_1.h | 124 ++++++++++++++++++ 7 files changed, 570 insertions(+), 12 deletions(-) create mode 100644 Documentation/sunrpc/xdr/nfs4_1.x create mode 100644 fs/nfsd/nfs4xdr_gen.c create mode 100644 fs/nfsd/nfs4xdr_gen.h create mode 100644 include/linux/sunrpc/xdrgen/nfs4_1.h (limited to 'fs') diff --git a/Documentation/sunrpc/xdr/nfs4_1.x b/Documentation/sunrpc/xdr/nfs4_1.x new file mode 100644 index 000000000000..fc37d1ecba0f --- /dev/null +++ b/Documentation/sunrpc/xdr/nfs4_1.x @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2010 IETF Trust and the persons identified + * as the document authors. All rights reserved. + * + * The document authors are identified in RFC 3530 and + * RFC 5661. + * + * Redistribution and use in source and binary forms, with + * or without modification, are permitted provided that the + * following conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the + * following disclaimer in the documentation and/or other + * materials provided with the distribution. + * + * - Neither the name of Internet Society, IETF or IETF + * Trust, nor the names of specific contributors, may be + * used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS + * AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +pragma header nfs4; + +/* + * Basic typedefs for RFC 1832 data type definitions + */ +typedef hyper int64_t; +typedef unsigned int uint32_t; + +/* + * Basic data types + */ +typedef uint32_t bitmap4<>; + +/* + * Timeval + */ +struct nfstime4 { + int64_t seconds; + uint32_t nseconds; +}; + + +/* + * The following content was extracted from draft-ietf-nfsv4-delstid + */ + +typedef bool fattr4_offline; + + +const FATTR4_OFFLINE = 83; + + +struct open_arguments4 { + bitmap4 oa_share_access; + bitmap4 oa_share_deny; + bitmap4 oa_share_access_want; + bitmap4 oa_open_claim; + bitmap4 oa_create_mode; +}; + + +enum open_args_share_access4 { + OPEN_ARGS_SHARE_ACCESS_READ = 1, + OPEN_ARGS_SHARE_ACCESS_WRITE = 2, + OPEN_ARGS_SHARE_ACCESS_BOTH = 3 +}; + + +enum open_args_share_deny4 { + OPEN_ARGS_SHARE_DENY_NONE = 0, + OPEN_ARGS_SHARE_DENY_READ = 1, + OPEN_ARGS_SHARE_DENY_WRITE = 2, + OPEN_ARGS_SHARE_DENY_BOTH = 3 +}; + + +enum open_args_share_access_want4 { + OPEN_ARGS_SHARE_ACCESS_WANT_ANY_DELEG = 3, + OPEN_ARGS_SHARE_ACCESS_WANT_NO_DELEG = 4, + OPEN_ARGS_SHARE_ACCESS_WANT_CANCEL = 5, + OPEN_ARGS_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL + = 17, + OPEN_ARGS_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED + = 18, + OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 20, + OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 21 +}; + + +enum open_args_open_claim4 { + OPEN_ARGS_OPEN_CLAIM_NULL = 0, + OPEN_ARGS_OPEN_CLAIM_PREVIOUS = 1, + OPEN_ARGS_OPEN_CLAIM_DELEGATE_CUR = 2, + OPEN_ARGS_OPEN_CLAIM_DELEGATE_PREV = 3, + OPEN_ARGS_OPEN_CLAIM_FH = 4, + OPEN_ARGS_OPEN_CLAIM_DELEG_CUR_FH = 5, + OPEN_ARGS_OPEN_CLAIM_DELEG_PREV_FH = 6 +}; + + +enum open_args_createmode4 { + OPEN_ARGS_CREATEMODE_UNCHECKED4 = 0, + OPEN_ARGS_CREATE_MODE_GUARDED = 1, + OPEN_ARGS_CREATEMODE_EXCLUSIVE4 = 2, + OPEN_ARGS_CREATE_MODE_EXCLUSIVE4_1 = 3 +}; + + +typedef open_arguments4 fattr4_open_arguments; +pragma public fattr4_open_arguments; + + +%/* +% * Determine what OPEN supports. 
+% */ +const FATTR4_OPEN_ARGUMENTS = 86; + + +const OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 0x200000; + + +const OPEN4_RESULT_NO_OPEN_STATEID = 0x00000010; + + +/* + * attributes for the delegation times being + * cached and served by the "client" + */ +typedef nfstime4 fattr4_time_deleg_access; +typedef nfstime4 fattr4_time_deleg_modify; +pragma public fattr4_time_deleg_access; +pragma public fattr4_time_deleg_modify; + + +%/* +% * New RECOMMENDED Attribute for +% * delegation caching of times +% */ +const FATTR4_TIME_DELEG_ACCESS = 84; +const FATTR4_TIME_DELEG_MODIFY = 85; + + +const OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000; + diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 18cbd3fa7691..2f687619f65b 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -18,9 +18,23 @@ nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ - nfs4acl.o nfs4callback.o nfs4recover.o + nfs4acl.o nfs4callback.o nfs4recover.o nfs4xdr_gen.o nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o nfsd-$(CONFIG_NFS_LOCALIO) += localio.o + + +.PHONY: xdrgen + +xdrgen: ../../include/linux/sunrpc/xdrgen/nfs4_1.h nfs4xdr_gen.h nfs4xdr_gen.c + +../../include/linux/sunrpc/xdrgen/nfs4_1.h: ../../Documentation/sunrpc/xdr/nfs4_1.x + ../../tools/net/sunrpc/xdrgen/xdrgen definitions $< > $@ + +nfs4xdr_gen.h: ../../Documentation/sunrpc/xdr/nfs4_1.x + ../../tools/net/sunrpc/xdrgen/xdrgen declarations $< > $@ + +nfs4xdr_gen.c: ../../Documentation/sunrpc/xdr/nfs4_1.x + ../../tools/net/sunrpc/xdrgen/xdrgen source $< > $@ diff --git a/fs/nfsd/nfs4xdr_gen.c b/fs/nfsd/nfs4xdr_gen.c new file mode 100644 index 000000000000..e5d34f9a3147 --- /dev/null +++ b/fs/nfsd/nfs4xdr_gen.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0 +// Generated by xdrgen. Manual edits will be lost. 
+// XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x +// XDR specification modification time: Thu Oct 3 11:30:59 2024 + +#include + +#include "nfs4xdr_gen.h" + +static bool __maybe_unused +xdrgen_decode_int64_t(struct xdr_stream *xdr, int64_t *ptr) +{ + return xdrgen_decode_hyper(xdr, ptr); +}; + +static bool __maybe_unused +xdrgen_decode_uint32_t(struct xdr_stream *xdr, uint32_t *ptr) +{ + return xdrgen_decode_unsigned_int(xdr, ptr); +}; + +static bool __maybe_unused +xdrgen_decode_bitmap4(struct xdr_stream *xdr, bitmap4 *ptr) +{ + if (xdr_stream_decode_u32(xdr, &ptr->count) < 0) + return false; + for (u32 i = 0; i < ptr->count; i++) + if (!xdrgen_decode_uint32_t(xdr, &ptr->element[i])) + return false; + return true; +}; + +static bool __maybe_unused +xdrgen_decode_nfstime4(struct xdr_stream *xdr, struct nfstime4 *ptr) +{ + if (!xdrgen_decode_int64_t(xdr, &ptr->seconds)) + return false; + if (!xdrgen_decode_uint32_t(xdr, &ptr->nseconds)) + return false; + return true; +}; + +static bool __maybe_unused +xdrgen_decode_fattr4_offline(struct xdr_stream *xdr, fattr4_offline *ptr) +{ + return xdrgen_decode_bool(xdr, ptr); +}; + +static bool __maybe_unused +xdrgen_decode_open_arguments4(struct xdr_stream *xdr, struct open_arguments4 *ptr) +{ + if (!xdrgen_decode_bitmap4(xdr, &ptr->oa_share_access)) + return false; + if (!xdrgen_decode_bitmap4(xdr, &ptr->oa_share_deny)) + return false; + if (!xdrgen_decode_bitmap4(xdr, &ptr->oa_share_access_want)) + return false; + if (!xdrgen_decode_bitmap4(xdr, &ptr->oa_open_claim)) + return false; + if (!xdrgen_decode_bitmap4(xdr, &ptr->oa_create_mode)) + return false; + return true; +}; + +static bool __maybe_unused +xdrgen_decode_open_args_share_access4(struct xdr_stream *xdr, open_args_share_access4 *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_open_args_share_deny4(struct xdr_stream *xdr, open_args_share_deny4 *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_open_args_share_access_want4(struct xdr_stream *xdr, open_args_share_access_want4 *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_open_args_open_claim4(struct xdr_stream *xdr, open_args_open_claim4 *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_open_args_createmode4(struct xdr_stream *xdr, open_args_createmode4 *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +bool +xdrgen_decode_fattr4_open_arguments(struct xdr_stream *xdr, fattr4_open_arguments *ptr) +{ + return xdrgen_decode_open_arguments4(xdr, ptr); +}; + +bool +xdrgen_decode_fattr4_time_deleg_access(struct xdr_stream *xdr, fattr4_time_deleg_access *ptr) +{ + return xdrgen_decode_nfstime4(xdr, ptr); +}; + +bool +xdrgen_decode_fattr4_time_deleg_modify(struct xdr_stream *xdr, fattr4_time_deleg_modify *ptr) +{ + return xdrgen_decode_nfstime4(xdr, ptr); +}; + +static bool __maybe_unused +xdrgen_encode_int64_t(struct xdr_stream *xdr, const int64_t value) +{ + return xdrgen_encode_hyper(xdr, value); +}; + +static bool __maybe_unused +xdrgen_encode_uint32_t(struct xdr_stream *xdr, const uint32_t value) +{ + return 
xdrgen_encode_unsigned_int(xdr, value); +}; + +static bool __maybe_unused +xdrgen_encode_bitmap4(struct xdr_stream *xdr, const bitmap4 value) +{ + if (xdr_stream_encode_u32(xdr, value.count) != XDR_UNIT) + return false; + for (u32 i = 0; i < value.count; i++) + if (!xdrgen_encode_uint32_t(xdr, value.element[i])) + return false; + return true; +}; + +static bool __maybe_unused +xdrgen_encode_nfstime4(struct xdr_stream *xdr, const struct nfstime4 *value) +{ + if (!xdrgen_encode_int64_t(xdr, value->seconds)) + return false; + if (!xdrgen_encode_uint32_t(xdr, value->nseconds)) + return false; + return true; +}; + +static bool __maybe_unused +xdrgen_encode_fattr4_offline(struct xdr_stream *xdr, const fattr4_offline value) +{ + return xdrgen_encode_bool(xdr, value); +}; + +static bool __maybe_unused +xdrgen_encode_open_arguments4(struct xdr_stream *xdr, const struct open_arguments4 *value) +{ + if (!xdrgen_encode_bitmap4(xdr, value->oa_share_access)) + return false; + if (!xdrgen_encode_bitmap4(xdr, value->oa_share_deny)) + return false; + if (!xdrgen_encode_bitmap4(xdr, value->oa_share_access_want)) + return false; + if (!xdrgen_encode_bitmap4(xdr, value->oa_open_claim)) + return false; + if (!xdrgen_encode_bitmap4(xdr, value->oa_create_mode)) + return false; + return true; +}; + +static bool __maybe_unused +xdrgen_encode_open_args_share_access4(struct xdr_stream *xdr, open_args_share_access4 value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_open_args_share_deny4(struct xdr_stream *xdr, open_args_share_deny4 value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_open_args_share_access_want4(struct xdr_stream *xdr, open_args_share_access_want4 value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_open_args_open_claim4(struct xdr_stream *xdr, open_args_open_claim4 value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_open_args_createmode4(struct xdr_stream *xdr, open_args_createmode4 value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +bool +xdrgen_encode_fattr4_open_arguments(struct xdr_stream *xdr, const fattr4_open_arguments *value) +{ + return xdrgen_encode_open_arguments4(xdr, value); +}; + +bool +xdrgen_encode_fattr4_time_deleg_access(struct xdr_stream *xdr, const fattr4_time_deleg_access *value) +{ + return xdrgen_encode_nfstime4(xdr, value); +}; + +bool +xdrgen_encode_fattr4_time_deleg_modify(struct xdr_stream *xdr, const fattr4_time_deleg_modify *value) +{ + return xdrgen_encode_nfstime4(xdr, value); +}; diff --git a/fs/nfsd/nfs4xdr_gen.h b/fs/nfsd/nfs4xdr_gen.h new file mode 100644 index 000000000000..c4c6a5075b17 --- /dev/null +++ b/fs/nfsd/nfs4xdr_gen.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Generated by xdrgen. Manual edits will be lost. 
*/ +/* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ +/* XDR specification modification time: Thu Oct 3 11:30:59 2024 */ + +#ifndef _LINUX_XDRGEN_NFS4_1_DECL_H +#define _LINUX_XDRGEN_NFS4_1_DECL_H + +#include + +#include +#include +#include +#include + +bool xdrgen_decode_fattr4_open_arguments(struct xdr_stream *xdr, fattr4_open_arguments *ptr); +bool xdrgen_encode_fattr4_open_arguments(struct xdr_stream *xdr, const fattr4_open_arguments *value); + +bool xdrgen_decode_fattr4_time_deleg_access(struct xdr_stream *xdr, fattr4_time_deleg_access *ptr); +bool xdrgen_encode_fattr4_time_deleg_access(struct xdr_stream *xdr, const fattr4_time_deleg_access *value); + +bool xdrgen_decode_fattr4_time_deleg_modify(struct xdr_stream *xdr, fattr4_time_deleg_modify *ptr); +bool xdrgen_encode_fattr4_time_deleg_modify(struct xdr_stream *xdr, const fattr4_time_deleg_modify *value); + +#endif /* _LINUX_XDRGEN_NFS4_1_DECL_H */ diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 8d7430d9f218..b90719244775 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -17,6 +17,7 @@ #include #include #include +#include enum nfs4_acl_whotype { NFS4_ACL_WHO_NAMED = 0, @@ -512,12 +513,6 @@ enum { FATTR4_XATTR_SUPPORT = 82, }; -enum { - FATTR4_TIME_DELEG_ACCESS = 84, - FATTR4_TIME_DELEG_MODIFY = 85, - FATTR4_OPEN_ARGUMENTS = 86, -}; - /* * The following internal definitions enable processing the above * attribute bits within 32-bit word boundaries. diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 559273a0f16d..e74a87bb18a4 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1315,11 +1315,6 @@ struct nfs4_fsid_present_res { #endif /* CONFIG_NFS_V4 */ -struct nfstime4 { - u64 seconds; - u32 nseconds; -}; - #ifdef CONFIG_NFS_V4_1 struct pnfs_commit_bucket { diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h new file mode 100644 index 000000000000..6025ab6b7398 --- /dev/null +++ b/include/linux/sunrpc/xdrgen/nfs4_1.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Generated by xdrgen. Manual edits will be lost. 
*/ +/* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ +/* XDR specification modification time: Thu Oct 3 11:30:59 2024 */ + +#ifndef _LINUX_XDRGEN_NFS4_1_DEF_H +#define _LINUX_XDRGEN_NFS4_1_DEF_H + +#include +#include + +typedef s64 int64_t; + +typedef u32 uint32_t; + +typedef struct { + u32 count; + uint32_t *element; +} bitmap4; + +struct nfstime4 { + int64_t seconds; + uint32_t nseconds; +}; + +typedef bool fattr4_offline; + +enum { FATTR4_OFFLINE = 83 }; + +struct open_arguments4 { + bitmap4 oa_share_access; + bitmap4 oa_share_deny; + bitmap4 oa_share_access_want; + bitmap4 oa_open_claim; + bitmap4 oa_create_mode; +}; + +enum open_args_share_access4 { + OPEN_ARGS_SHARE_ACCESS_READ = 1, + OPEN_ARGS_SHARE_ACCESS_WRITE = 2, + OPEN_ARGS_SHARE_ACCESS_BOTH = 3, +}; +typedef enum open_args_share_access4 open_args_share_access4; + +enum open_args_share_deny4 { + OPEN_ARGS_SHARE_DENY_NONE = 0, + OPEN_ARGS_SHARE_DENY_READ = 1, + OPEN_ARGS_SHARE_DENY_WRITE = 2, + OPEN_ARGS_SHARE_DENY_BOTH = 3, +}; +typedef enum open_args_share_deny4 open_args_share_deny4; + +enum open_args_share_access_want4 { + OPEN_ARGS_SHARE_ACCESS_WANT_ANY_DELEG = 3, + OPEN_ARGS_SHARE_ACCESS_WANT_NO_DELEG = 4, + OPEN_ARGS_SHARE_ACCESS_WANT_CANCEL = 5, + OPEN_ARGS_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL = 17, + OPEN_ARGS_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED = 18, + OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 20, + OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 21, +}; +typedef enum open_args_share_access_want4 open_args_share_access_want4; + +enum open_args_open_claim4 { + OPEN_ARGS_OPEN_CLAIM_NULL = 0, + OPEN_ARGS_OPEN_CLAIM_PREVIOUS = 1, + OPEN_ARGS_OPEN_CLAIM_DELEGATE_CUR = 2, + OPEN_ARGS_OPEN_CLAIM_DELEGATE_PREV = 3, + OPEN_ARGS_OPEN_CLAIM_FH = 4, + OPEN_ARGS_OPEN_CLAIM_DELEG_CUR_FH = 5, + OPEN_ARGS_OPEN_CLAIM_DELEG_PREV_FH = 6, +}; +typedef enum open_args_open_claim4 open_args_open_claim4; + +enum open_args_createmode4 { + OPEN_ARGS_CREATEMODE_UNCHECKED4 = 0, + OPEN_ARGS_CREATE_MODE_GUARDED = 1, + OPEN_ARGS_CREATEMODE_EXCLUSIVE4 = 2, + OPEN_ARGS_CREATE_MODE_EXCLUSIVE4_1 = 3, +}; +typedef enum open_args_createmode4 open_args_createmode4; + +typedef struct open_arguments4 fattr4_open_arguments; + +enum { FATTR4_OPEN_ARGUMENTS = 86 }; + +enum { OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 0x200000 }; + +enum { OPEN4_RESULT_NO_OPEN_STATEID = 0x00000010 }; + +typedef struct nfstime4 fattr4_time_deleg_access; + +typedef struct nfstime4 fattr4_time_deleg_modify; + +enum { FATTR4_TIME_DELEG_ACCESS = 84 }; + +enum { FATTR4_TIME_DELEG_MODIFY = 85 }; + +enum { OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000 }; + +#define NFS4_int64_t_sz \ + (XDR_hyper) +#define NFS4_uint32_t_sz \ + (XDR_unsigned_int) +#define NFS4_bitmap4_sz (XDR_unsigned_int) +#define NFS4_nfstime4_sz \ + (NFS4_int64_t_sz + NFS4_uint32_t_sz) +#define NFS4_fattr4_offline_sz \ + (XDR_bool) +#define NFS4_open_arguments4_sz \ + (NFS4_bitmap4_sz + NFS4_bitmap4_sz + NFS4_bitmap4_sz + NFS4_bitmap4_sz + NFS4_bitmap4_sz) +#define NFS4_open_args_share_access4_sz (XDR_int) +#define NFS4_open_args_share_deny4_sz (XDR_int) +#define NFS4_open_args_share_access_want4_sz (XDR_int) +#define NFS4_open_args_open_claim4_sz (XDR_int) +#define NFS4_open_args_createmode4_sz (XDR_int) +#define NFS4_fattr4_open_arguments_sz \ + (NFS4_open_arguments4_sz) +#define NFS4_fattr4_time_deleg_access_sz \ + (NFS4_nfstime4_sz) +#define NFS4_fattr4_time_deleg_modify_sz \ + (NFS4_nfstime4_sz) + +#endif /* _LINUX_XDRGEN_NFS4_1_DEF_H */ -- cgit v1.2.3 From 
8dfbea8bde6e976136948421325b24b5bdb76ad3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:13:55 -0500 Subject: nfsd: switch to autogenerated definitions for open_delegation_type4 Rename the enum with the same name in include/linux/nfs4.h, add the proper enum to nfs4_1.x and regenerate the headers and source files. Do a mass rename of all NFS4_OPEN_DELEGATE_* to OPEN_DELEGATE_* in the nfsd directory. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- Documentation/sunrpc/xdr/nfs4_1.x | 9 ++++++++- fs/nfsd/nfs4state.c | 34 +++++++++++++++++----------------- fs/nfsd/nfs4xdr.c | 8 ++++---- fs/nfsd/nfs4xdr_gen.c | 19 ++++++++++++++++++- fs/nfsd/nfs4xdr_gen.h | 2 +- include/linux/nfs4.h | 2 +- include/linux/sunrpc/xdrgen/nfs4_1.h | 13 ++++++++++++- 7 files changed, 61 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/Documentation/sunrpc/xdr/nfs4_1.x b/Documentation/sunrpc/xdr/nfs4_1.x index fc37d1ecba0f..ee9f8f249f1e 100644 --- a/Documentation/sunrpc/xdr/nfs4_1.x +++ b/Documentation/sunrpc/xdr/nfs4_1.x @@ -161,6 +161,13 @@ pragma public fattr4_time_deleg_modify; const FATTR4_TIME_DELEG_ACCESS = 84; const FATTR4_TIME_DELEG_MODIFY = 85; - const OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000; +enum open_delegation_type4 { + OPEN_DELEGATE_NONE = 0, + OPEN_DELEGATE_READ = 1, + OPEN_DELEGATE_WRITE = 2, + OPEN_DELEGATE_NONE_EXT = 3, /* new to v4.1 */ + OPEN_DELEGATE_READ_ATTRS_DELEG = 4, + OPEN_DELEGATE_WRITE_ATTRS_DELEG = 5 +}; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b38b3a1c0307..a39c480da327 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2965,7 +2965,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) seq_puts(s, ": { type: deleg, "); seq_printf(s, "access: %s", - ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w"); + ds->dl_type == OPEN_DELEGATE_READ ? "r" : "w"); /* XXX: lease time, whether it's being recalled. */ @@ -5581,7 +5581,7 @@ retry: static inline __be32 nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) { - if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ)) + if ((flags & WR_STATE) && (dp->dl_type == OPEN_DELEGATE_READ)) return nfserr_openmode; else return nfs_ok; @@ -5823,7 +5823,7 @@ static struct file_lease *nfs4_alloc_init_lease(struct nfs4_delegation *dp, return NULL; fl->fl_lmops = &nfsd_lease_mng_ops; fl->c.flc_flags = FL_DELEG; - fl->c.flc_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; + fl->c.flc_type = flag == OPEN_DELEGATE_READ ? 
F_RDLCK : F_WRLCK; fl->c.flc_owner = (fl_owner_t)dp; fl->c.flc_pid = current->tgid; fl->c.flc_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; @@ -5969,7 +5969,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, */ if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == NFS4_SHARE_ACCESS_BOTH) { nf = find_rw_file(fp); - dl_type = NFS4_OPEN_DELEGATE_WRITE; + dl_type = OPEN_DELEGATE_WRITE; } /* @@ -5978,7 +5978,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, */ if (!nf && (open->op_share_access & NFS4_SHARE_ACCESS_READ)) { nf = find_readable_file(fp); - dl_type = NFS4_OPEN_DELEGATE_READ; + dl_type = OPEN_DELEGATE_READ; } if (!nf) @@ -6067,7 +6067,7 @@ out_delegees: static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) { - open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; if (status == -EAGAIN) open->op_why_no_deleg = WND4_CONTENTION; else { @@ -6183,20 +6183,20 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, destroy_delegation(dp); goto out_no_deleg; } - open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE; + open->op_delegate_type = OPEN_DELEGATE_WRITE; dp->dl_cb_fattr.ncf_cur_fsize = stat.size; dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat); trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); } else { - open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + open->op_delegate_type = OPEN_DELEGATE_READ; trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); } nfs4_put_stid(&dp->dl_stid); return; out_no_deleg: - open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; + open->op_delegate_type = OPEN_DELEGATE_NONE; if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && - open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) { + open->op_delegate_type != OPEN_DELEGATE_NONE) { dprintk("NFSD: WARNING: refusing delegation reclaim\n"); open->op_recall = true; } @@ -6211,17 +6211,17 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, struct nfs4_delegation *dp) { if (open->op_deleg_want == NFS4_SHARE_WANT_READ_DELEG && - dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) { - open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + dp->dl_type == OPEN_DELEGATE_WRITE) { + open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE; } else if (open->op_deleg_want == NFS4_SHARE_WANT_WRITE_DELEG && - dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) { - open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + dp->dl_type == OPEN_DELEGATE_WRITE) { + open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE; } /* Otherwise the client must be confused wanting a delegation * it already has, therefore we don't return - * NFS4_OPEN_DELEGATE_NONE_EXT and reason. + * OPEN_DELEGATE_NONE_EXT and reason. */ } @@ -6311,7 +6311,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (nfsd4_has_session(&resp->cstate)) { if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { - open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_WANTED; goto nodeleg; } @@ -6327,7 +6327,7 @@ nodeleg: trace_nfsd_open(&stp->st_stid.sc_stateid); out: /* 4.1 client trying to upgrade/downgrade delegation? 
*/ - if (open->op_delegate_type == NFS4_OPEN_DELEGATE_NONE && dp && + if (open->op_delegate_type == OPEN_DELEGATE_NONE && dp && open->op_deleg_want) nfsd4_deleg_xgrade_none_ext(open, dp); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 31270e1a40ed..b6966acddcb6 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4233,18 +4233,18 @@ nfsd4_encode_open_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open) if (xdr_stream_encode_u32(xdr, open->op_delegate_type) != XDR_UNIT) return nfserr_resource; switch (open->op_delegate_type) { - case NFS4_OPEN_DELEGATE_NONE: + case OPEN_DELEGATE_NONE: status = nfs_ok; break; - case NFS4_OPEN_DELEGATE_READ: + case OPEN_DELEGATE_READ: /* read */ status = nfsd4_encode_open_read_delegation4(xdr, open); break; - case NFS4_OPEN_DELEGATE_WRITE: + case OPEN_DELEGATE_WRITE: /* write */ status = nfsd4_encode_open_write_delegation4(xdr, open); break; - case NFS4_OPEN_DELEGATE_NONE_EXT: + case OPEN_DELEGATE_NONE_EXT: /* od_whynone */ status = nfsd4_encode_open_none_delegation4(xdr, open); break; diff --git a/fs/nfsd/nfs4xdr_gen.c b/fs/nfsd/nfs4xdr_gen.c index e5d34f9a3147..a0e01f50a28d 100644 --- a/fs/nfsd/nfs4xdr_gen.c +++ b/fs/nfsd/nfs4xdr_gen.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Generated by xdrgen. Manual edits will be lost. // XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x -// XDR specification modification time: Thu Oct 3 11:30:59 2024 +// XDR specification modification time: Sat Oct 12 08:10:54 2024 #include @@ -135,6 +135,17 @@ xdrgen_decode_fattr4_time_deleg_modify(struct xdr_stream *xdr, fattr4_time_deleg return xdrgen_decode_nfstime4(xdr, ptr); }; +static bool __maybe_unused +xdrgen_decode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type4 *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + static bool __maybe_unused xdrgen_encode_int64_t(struct xdr_stream *xdr, const int64_t value) { @@ -237,3 +248,9 @@ xdrgen_encode_fattr4_time_deleg_modify(struct xdr_stream *xdr, const fattr4_time { return xdrgen_encode_nfstime4(xdr, value); }; + +static bool __maybe_unused +xdrgen_encode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type4 value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} diff --git a/fs/nfsd/nfs4xdr_gen.h b/fs/nfsd/nfs4xdr_gen.h index c4c6a5075b17..3fc8bde2b3b5 100644 --- a/fs/nfsd/nfs4xdr_gen.h +++ b/fs/nfsd/nfs4xdr_gen.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by xdrgen. Manual edits will be lost. */ /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ -/* XDR specification modification time: Thu Oct 3 11:30:59 2024 */ +/* XDR specification modification time: Sat Oct 12 08:10:54 2024 */ #ifndef _LINUX_XDRGEN_NFS4_1_DECL_H #define _LINUX_XDRGEN_NFS4_1_DECL_H diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index b90719244775..71fbebfa43c7 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -366,7 +366,7 @@ enum limit_by4 { NFS4_LIMIT_BLOCKS = 2 }; -enum open_delegation_type4 { +enum nfs4_open_delegation_type4 { NFS4_OPEN_DELEGATE_NONE = 0, NFS4_OPEN_DELEGATE_READ = 1, NFS4_OPEN_DELEGATE_WRITE = 2, diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h index 6025ab6b7398..9ca83a4a04cf 100644 --- a/include/linux/sunrpc/xdrgen/nfs4_1.h +++ b/include/linux/sunrpc/xdrgen/nfs4_1.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by xdrgen. Manual edits will be lost. 
*/ /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ -/* XDR specification modification time: Thu Oct 3 11:30:59 2024 */ +/* XDR specification modification time: Sat Oct 12 08:10:54 2024 */ #ifndef _LINUX_XDRGEN_NFS4_1_DEF_H #define _LINUX_XDRGEN_NFS4_1_DEF_H @@ -98,6 +98,16 @@ enum { FATTR4_TIME_DELEG_MODIFY = 85 }; enum { OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000 }; +enum open_delegation_type4 { + OPEN_DELEGATE_NONE = 0, + OPEN_DELEGATE_READ = 1, + OPEN_DELEGATE_WRITE = 2, + OPEN_DELEGATE_NONE_EXT = 3, + OPEN_DELEGATE_READ_ATTRS_DELEG = 4, + OPEN_DELEGATE_WRITE_ATTRS_DELEG = 5, +}; +typedef enum open_delegation_type4 open_delegation_type4; + #define NFS4_int64_t_sz \ (XDR_hyper) #define NFS4_uint32_t_sz \ @@ -120,5 +130,6 @@ enum { OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000 }; (NFS4_nfstime4_sz) #define NFS4_fattr4_time_deleg_modify_sz \ (NFS4_nfstime4_sz) +#define NFS4_open_delegation_type4_sz (XDR_int) #endif /* _LINUX_XDRGEN_NFS4_1_DEF_H */ -- cgit v1.2.3 From c9c99a33e2b0083c83a2c29eebfad92c78e16791 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:13:56 -0500 Subject: nfsd: rename NFS4_SHARE_WANT_* constants to OPEN4_SHARE_ACCESS_WANT_* Add the OPEN4_SHARE_ACCESS_WANT constants from the nfs4.1 and delstid draft into the nfs4_1.x file, and regenerate the headers and source files. Do a mass renaming of NFS4_SHARE_WANT_* to OPEN4_SHARE_ACCESS_WANT_* in the nfsd directory. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- Documentation/sunrpc/xdr/nfs4_1.x | 15 ++++++++++++++- fs/nfsd/nfs4state.c | 16 ++++++++-------- fs/nfsd/nfs4xdr.c | 12 ++++++------ fs/nfsd/nfs4xdr_gen.c | 2 +- fs/nfsd/nfs4xdr_gen.h | 2 +- include/linux/sunrpc/xdrgen/nfs4_1.h | 24 +++++++++++++++++++++--- 6 files changed, 51 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/Documentation/sunrpc/xdr/nfs4_1.x b/Documentation/sunrpc/xdr/nfs4_1.x index ee9f8f249f1e..ca95150a3a29 100644 --- a/Documentation/sunrpc/xdr/nfs4_1.x +++ b/Documentation/sunrpc/xdr/nfs4_1.x @@ -138,7 +138,6 @@ pragma public fattr4_open_arguments; const FATTR4_OPEN_ARGUMENTS = 86; -const OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 0x200000; const OPEN4_RESULT_NO_OPEN_STATEID = 0x00000010; @@ -161,7 +160,21 @@ pragma public fattr4_time_deleg_modify; const FATTR4_TIME_DELEG_ACCESS = 84; const FATTR4_TIME_DELEG_MODIFY = 85; + + +/* new flags for share_access field of OPEN4args */ +const OPEN4_SHARE_ACCESS_WANT_DELEG_MASK = 0xFF00; +const OPEN4_SHARE_ACCESS_WANT_NO_PREFERENCE = 0x0000; +const OPEN4_SHARE_ACCESS_WANT_READ_DELEG = 0x0100; +const OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG = 0x0200; +const OPEN4_SHARE_ACCESS_WANT_ANY_DELEG = 0x0300; +const OPEN4_SHARE_ACCESS_WANT_NO_DELEG = 0x0400; +const OPEN4_SHARE_ACCESS_WANT_CANCEL = 0x0500; + +const OPEN4_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL = 0x10000; +const OPEN4_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED = 0x20000; const OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000; +const OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 0x200000; enum open_delegation_type4 { OPEN_DELEGATE_NONE = 0, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a39c480da327..0c73cb8fae19 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6073,14 +6073,14 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) else { open->op_why_no_deleg = WND4_RESOURCE; switch (open->op_deleg_want) { - case NFS4_SHARE_WANT_READ_DELEG: - case NFS4_SHARE_WANT_WRITE_DELEG: - case NFS4_SHARE_WANT_ANY_DELEG: + case 
OPEN4_SHARE_ACCESS_WANT_READ_DELEG: + case OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG: + case OPEN4_SHARE_ACCESS_WANT_ANY_DELEG: break; - case NFS4_SHARE_WANT_CANCEL: + case OPEN4_SHARE_ACCESS_WANT_CANCEL: open->op_why_no_deleg = WND4_CANCELLED; break; - case NFS4_SHARE_WANT_NO_DELEG: + case OPEN4_SHARE_ACCESS_WANT_NO_DELEG: WARN_ON_ONCE(1); } } @@ -6210,11 +6210,11 @@ out_no_deleg: static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, struct nfs4_delegation *dp) { - if (open->op_deleg_want == NFS4_SHARE_WANT_READ_DELEG && + if (open->op_deleg_want == OPEN4_SHARE_ACCESS_WANT_READ_DELEG && dp->dl_type == OPEN_DELEGATE_WRITE) { open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE; - } else if (open->op_deleg_want == NFS4_SHARE_WANT_WRITE_DELEG && + } else if (open->op_deleg_want == OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG && dp->dl_type == OPEN_DELEGATE_WRITE) { open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE; @@ -6310,7 +6310,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf mutex_unlock(&stp->st_mutex); if (nfsd4_has_session(&resp->cstate)) { - if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { + if (open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_NO_DELEG) { open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_WANTED; goto nodeleg; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b6966acddcb6..e3f1cb59e118 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1067,12 +1067,12 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *sh if (!argp->minorversion) return nfserr_bad_xdr; switch (w & NFS4_SHARE_WANT_MASK) { - case NFS4_SHARE_WANT_NO_PREFERENCE: - case NFS4_SHARE_WANT_READ_DELEG: - case NFS4_SHARE_WANT_WRITE_DELEG: - case NFS4_SHARE_WANT_ANY_DELEG: - case NFS4_SHARE_WANT_NO_DELEG: - case NFS4_SHARE_WANT_CANCEL: + case OPEN4_SHARE_ACCESS_WANT_NO_PREFERENCE: + case OPEN4_SHARE_ACCESS_WANT_READ_DELEG: + case OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG: + case OPEN4_SHARE_ACCESS_WANT_ANY_DELEG: + case OPEN4_SHARE_ACCESS_WANT_NO_DELEG: + case OPEN4_SHARE_ACCESS_WANT_CANCEL: break; default: return nfserr_bad_xdr; diff --git a/fs/nfsd/nfs4xdr_gen.c b/fs/nfsd/nfs4xdr_gen.c index a0e01f50a28d..a17b5d8e60b3 100644 --- a/fs/nfsd/nfs4xdr_gen.c +++ b/fs/nfsd/nfs4xdr_gen.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Generated by xdrgen. Manual edits will be lost. // XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x -// XDR specification modification time: Sat Oct 12 08:10:54 2024 +// XDR specification modification time: Mon Oct 14 09:10:13 2024 #include diff --git a/fs/nfsd/nfs4xdr_gen.h b/fs/nfsd/nfs4xdr_gen.h index 3fc8bde2b3b5..41a0033b7256 100644 --- a/fs/nfsd/nfs4xdr_gen.h +++ b/fs/nfsd/nfs4xdr_gen.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by xdrgen. Manual edits will be lost. */ /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ -/* XDR specification modification time: Sat Oct 12 08:10:54 2024 */ +/* XDR specification modification time: Mon Oct 14 09:10:13 2024 */ #ifndef _LINUX_XDRGEN_NFS4_1_DECL_H #define _LINUX_XDRGEN_NFS4_1_DECL_H diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h index 9ca83a4a04cf..cf21a14aa885 100644 --- a/include/linux/sunrpc/xdrgen/nfs4_1.h +++ b/include/linux/sunrpc/xdrgen/nfs4_1.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by xdrgen. 
Manual edits will be lost. */ /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ -/* XDR specification modification time: Sat Oct 12 08:10:54 2024 */ +/* XDR specification modification time: Mon Oct 14 09:10:13 2024 */ #ifndef _LINUX_XDRGEN_NFS4_1_DEF_H #define _LINUX_XDRGEN_NFS4_1_DEF_H @@ -84,8 +84,6 @@ typedef struct open_arguments4 fattr4_open_arguments; enum { FATTR4_OPEN_ARGUMENTS = 86 }; -enum { OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 0x200000 }; - enum { OPEN4_RESULT_NO_OPEN_STATEID = 0x00000010 }; typedef struct nfstime4 fattr4_time_deleg_access; @@ -96,8 +94,28 @@ enum { FATTR4_TIME_DELEG_ACCESS = 84 }; enum { FATTR4_TIME_DELEG_MODIFY = 85 }; +enum { OPEN4_SHARE_ACCESS_WANT_DELEG_MASK = 0xFF00 }; + +enum { OPEN4_SHARE_ACCESS_WANT_NO_PREFERENCE = 0x0000 }; + +enum { OPEN4_SHARE_ACCESS_WANT_READ_DELEG = 0x0100 }; + +enum { OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG = 0x0200 }; + +enum { OPEN4_SHARE_ACCESS_WANT_ANY_DELEG = 0x0300 }; + +enum { OPEN4_SHARE_ACCESS_WANT_NO_DELEG = 0x0400 }; + +enum { OPEN4_SHARE_ACCESS_WANT_CANCEL = 0x0500 }; + +enum { OPEN4_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL = 0x10000 }; + +enum { OPEN4_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED = 0x20000 }; + enum { OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000 }; +enum { OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 0x200000 }; + enum open_delegation_type4 { OPEN_DELEGATE_NONE = 0, OPEN_DELEGATE_READ = 1, -- cgit v1.2.3 From fbd5573d0deda145fe173431f1f3ca444940de18 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:13:57 -0500 Subject: nfsd: prepare delegation code for handing out *_ATTRS_DELEG delegations Add some preparatory code to various functions that handle delegation types to allow them to handle the OPEN_DELEGATE_*_ATTRS_DELEG constants. Add helpers for detecting whether it's a read or write deleg, and whether the attributes are delegated. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 43 ++++++++++++++++++++++++++++--------------- fs/nfsd/nfs4xdr.c | 2 ++ fs/nfsd/state.h | 16 ++++++++++++++++ 3 files changed, 46 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0c73cb8fae19..e7b26806797a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2951,6 +2951,21 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) return 0; } +static char *nfs4_show_deleg_type(u32 dl_type) +{ + switch (dl_type) { + case OPEN_DELEGATE_READ: + return "r"; + case OPEN_DELEGATE_WRITE: + return "w"; + case OPEN_DELEGATE_READ_ATTRS_DELEG: + return "ra"; + case OPEN_DELEGATE_WRITE_ATTRS_DELEG: + return "wa"; + } + return "?"; +} + static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_delegation *ds; @@ -2964,8 +2979,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) nfs4_show_stateid(s, &st->sc_stateid); seq_puts(s, ": { type: deleg, "); - seq_printf(s, "access: %s", - ds->dl_type == OPEN_DELEGATE_READ ? "r" : "w"); + seq_printf(s, "access: %s", nfs4_show_deleg_type(ds->dl_type)); /* XXX: lease time, whether it's being recalled. 
*/ @@ -5581,7 +5595,7 @@ retry: static inline __be32 nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) { - if ((flags & WR_STATE) && (dp->dl_type == OPEN_DELEGATE_READ)) + if ((flags & WR_STATE) && deleg_is_read(dp->dl_type)) return nfserr_openmode; else return nfs_ok; @@ -5813,8 +5827,7 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; } -static struct file_lease *nfs4_alloc_init_lease(struct nfs4_delegation *dp, - int flag) +static struct file_lease *nfs4_alloc_init_lease(struct nfs4_delegation *dp) { struct file_lease *fl; @@ -5823,7 +5836,7 @@ static struct file_lease *nfs4_alloc_init_lease(struct nfs4_delegation *dp, return NULL; fl->fl_lmops = &nfsd_lease_mng_ops; fl->c.flc_flags = FL_DELEG; - fl->c.flc_type = flag == OPEN_DELEGATE_READ ? F_RDLCK : F_WRLCK; + fl->c.flc_type = deleg_is_read(dp->dl_type) ? F_RDLCK : F_WRLCK; fl->c.flc_owner = (fl_owner_t)dp; fl->c.flc_pid = current->tgid; fl->c.flc_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; @@ -6010,7 +6023,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (!dp) goto out_delegees; - fl = nfs4_alloc_init_lease(dp, dl_type); + fl = nfs4_alloc_init_lease(dp); if (!fl) goto out_clnt_odstate; @@ -6210,14 +6223,14 @@ out_no_deleg: static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, struct nfs4_delegation *dp) { - if (open->op_deleg_want == OPEN4_SHARE_ACCESS_WANT_READ_DELEG && - dp->dl_type == OPEN_DELEGATE_WRITE) { - open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; - open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE; - } else if (open->op_deleg_want == OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG && - dp->dl_type == OPEN_DELEGATE_WRITE) { - open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; - open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE; + if (deleg_is_write(dp->dl_type)) { + if (open->op_deleg_want == OPEN4_SHARE_ACCESS_WANT_READ_DELEG) { + open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; + open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE; + } else if (open->op_deleg_want == OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG) { + open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; + open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE; + } } /* Otherwise the client must be confused wanting a delegation * it already has, therefore we don't return diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e3f1cb59e118..66b1e2c224ff 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4237,10 +4237,12 @@ nfsd4_encode_open_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open) status = nfs_ok; break; case OPEN_DELEGATE_READ: + case OPEN_DELEGATE_READ_ATTRS_DELEG: /* read */ status = nfsd4_encode_open_read_delegation4(xdr, open); break; case OPEN_DELEGATE_WRITE: + case OPEN_DELEGATE_WRITE_ATTRS_DELEG: /* write */ status = nfsd4_encode_open_write_delegation4(xdr, open); break; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index b31a8523c8e5..17e98a2521ff 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -207,6 +207,22 @@ struct nfs4_delegation { struct nfs4_cb_fattr dl_cb_fattr; }; +static inline bool deleg_is_read(u32 dl_type) +{ + return (dl_type == OPEN_DELEGATE_READ || dl_type == OPEN_DELEGATE_READ_ATTRS_DELEG); +} + +static inline bool deleg_is_write(u32 dl_type) +{ + return (dl_type == OPEN_DELEGATE_WRITE || dl_type == OPEN_DELEGATE_WRITE_ATTRS_DELEG); +} + +static inline bool deleg_attrs_deleg(u32 dl_type) +{ + return dl_type == OPEN_DELEGATE_READ_ATTRS_DELEG || + dl_type == OPEN_DELEGATE_WRITE_ATTRS_DELEG; +} + 
#define cb_to_delegation(cb) \ container_of(cb, struct nfs4_delegation, dl_recall) -- cgit v1.2.3 From 51c0d4f7e317d3cb4a3001e502bd8ca2d57f2a4b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:13:58 -0500 Subject: nfsd: add support for FATTR4_OPEN_ARGUMENTS Add support for FATTR4_OPEN_ARGUMENTS. This a new mechanism for the client to discover what OPEN features the server supports. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/nfsd.h | 3 ++- 2 files changed, 52 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 66b1e2c224ff..147564a74b46 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -55,6 +55,7 @@ #include "netns.h" #include "pnfs.h" #include "filecache.h" +#include "nfs4xdr_gen.h" #include "trace.h" @@ -3388,6 +3389,54 @@ static __be32 nfsd4_encode_fattr4_xattr_support(struct xdr_stream *xdr, return nfsd4_encode_bool(xdr, err == 0); } +#define NFSD_OA_SHARE_ACCESS (BIT(OPEN_ARGS_SHARE_ACCESS_READ) | \ + BIT(OPEN_ARGS_SHARE_ACCESS_WRITE) | \ + BIT(OPEN_ARGS_SHARE_ACCESS_BOTH)) + +#define NFSD_OA_SHARE_DENY (BIT(OPEN_ARGS_SHARE_DENY_NONE) | \ + BIT(OPEN_ARGS_SHARE_DENY_READ) | \ + BIT(OPEN_ARGS_SHARE_DENY_WRITE) | \ + BIT(OPEN_ARGS_SHARE_DENY_BOTH)) + +#define NFSD_OA_SHARE_ACCESS_WANT (BIT(OPEN_ARGS_SHARE_ACCESS_WANT_ANY_DELEG) | \ + BIT(OPEN_ARGS_SHARE_ACCESS_WANT_NO_DELEG) | \ + BIT(OPEN_ARGS_SHARE_ACCESS_WANT_CANCEL)) + +#define NFSD_OA_OPEN_CLAIM (BIT(OPEN_ARGS_OPEN_CLAIM_NULL) | \ + BIT(OPEN_ARGS_OPEN_CLAIM_PREVIOUS) | \ + BIT(OPEN_ARGS_OPEN_CLAIM_DELEGATE_CUR) | \ + BIT(OPEN_ARGS_OPEN_CLAIM_DELEGATE_PREV)| \ + BIT(OPEN_ARGS_OPEN_CLAIM_FH) | \ + BIT(OPEN_ARGS_OPEN_CLAIM_DELEG_CUR_FH) | \ + BIT(OPEN_ARGS_OPEN_CLAIM_DELEG_PREV_FH)) + +#define NFSD_OA_CREATE_MODE (BIT(OPEN_ARGS_CREATEMODE_UNCHECKED4) | \ + BIT(OPEN_ARGS_CREATE_MODE_GUARDED) | \ + BIT(OPEN_ARGS_CREATEMODE_EXCLUSIVE4) | \ + BIT(OPEN_ARGS_CREATE_MODE_EXCLUSIVE4_1)) + +static uint32_t oa_share_access = NFSD_OA_SHARE_ACCESS; +static uint32_t oa_share_deny = NFSD_OA_SHARE_DENY; +static uint32_t oa_share_access_want = NFSD_OA_SHARE_ACCESS_WANT; +static uint32_t oa_open_claim = NFSD_OA_OPEN_CLAIM; +static uint32_t oa_create_mode = NFSD_OA_CREATE_MODE; + +static const struct open_arguments4 nfsd_open_arguments = { + .oa_share_access = { .count = 1, .element = &oa_share_access }, + .oa_share_deny = { .count = 1, .element = &oa_share_deny }, + .oa_share_access_want = { .count = 1, .element = &oa_share_access_want }, + .oa_open_claim = { .count = 1, .element = &oa_open_claim }, + .oa_create_mode = { .count = 1, .element = &oa_create_mode }, +}; + +static __be32 nfsd4_encode_fattr4_open_arguments(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + if (!xdrgen_encode_fattr4_open_arguments(xdr, &nfsd_open_arguments)) + return nfserr_resource; + return nfs_ok; +} + static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = { [FATTR4_SUPPORTED_ATTRS] = nfsd4_encode_fattr4_supported_attrs, [FATTR4_TYPE] = nfsd4_encode_fattr4_type, @@ -3488,6 +3537,7 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = { [FATTR4_MODE_UMASK] = nfsd4_encode_fattr4__noop, [FATTR4_XATTR_SUPPORT] = nfsd4_encode_fattr4_xattr_support, + [FATTR4_OPEN_ARGUMENTS] = nfsd4_encode_fattr4_open_arguments, }; /* diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 3eb21e63b921..988df43f6b07 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -455,7 +455,8 @@ enum { 
(NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ FATTR4_WORD2_MODE_UMASK | \ NFSD4_2_SECURITY_ATTRS | \ - FATTR4_WORD2_XATTR_SUPPORT) + FATTR4_WORD2_XATTR_SUPPORT | \ + FATTR4_WORD2_OPEN_ARGUMENTS) extern const u32 nfsd_suppattrs[3][3]; -- cgit v1.2.3 From cee9b4ef42512a6e57562460a15f18a022c84dda Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:13:59 -0500 Subject: nfsd: rework NFS4_SHARE_WANT_* flag handling The delstid draft adds new NFS4_SHARE_WANT_TYPE_MASK values that don't fit neatly into the existing WANT_MASK or WHEN_MASK. Add a new NFS4_SHARE_WANT_MOD_MASK value and redefine NFS4_SHARE_WANT_MASK to include it. Also fix the checks in nfsd4_deleg_xgrade_none_ext() to check for the flags instead of equality, since there may be modifier flags in the value. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 4 ++-- fs/nfsd/nfs4xdr.c | 2 +- include/uapi/linux/nfs4.h | 7 +++++-- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e7b26806797a..780db0992c9f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6224,10 +6224,10 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, struct nfs4_delegation *dp) { if (deleg_is_write(dp->dl_type)) { - if (open->op_deleg_want == OPEN4_SHARE_ACCESS_WANT_READ_DELEG) { + if (open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_READ_DELEG) { open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE; - } else if (open->op_deleg_want == OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG) { + } else if (open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG) { open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 147564a74b46..5407fd0d464c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1067,7 +1067,7 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *sh return nfs_ok; if (!argp->minorversion) return nfserr_bad_xdr; - switch (w & NFS4_SHARE_WANT_MASK) { + switch (w & NFS4_SHARE_WANT_TYPE_MASK) { case OPEN4_SHARE_ACCESS_WANT_NO_PREFERENCE: case OPEN4_SHARE_ACCESS_WANT_READ_DELEG: case OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG: diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index caf4db2fcbb9..4273e0249fcb 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -58,7 +58,7 @@ #define NFS4_SHARE_DENY_BOTH 0x0003 /* nfs41 */ -#define NFS4_SHARE_WANT_MASK 0xFF00 +#define NFS4_SHARE_WANT_TYPE_MASK 0xFF00 #define NFS4_SHARE_WANT_NO_PREFERENCE 0x0000 #define NFS4_SHARE_WANT_READ_DELEG 0x0100 #define NFS4_SHARE_WANT_WRITE_DELEG 0x0200 @@ -66,13 +66,16 @@ #define NFS4_SHARE_WANT_NO_DELEG 0x0400 #define NFS4_SHARE_WANT_CANCEL 0x0500 -#define NFS4_SHARE_WHEN_MASK 0xF0000 +#define NFS4_SHARE_WHEN_MASK 0xF0000 #define NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL 0x10000 #define NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED 0x20000 +#define NFS4_SHARE_WANT_MOD_MASK 0xF00000 #define NFS4_SHARE_WANT_DELEG_TIMESTAMPS 0x100000 #define NFS4_SHARE_WANT_OPEN_XOR_DELEGATION 0x200000 +#define NFS4_SHARE_WANT_MASK (NFS4_SHARE_WANT_TYPE_MASK | NFS4_SHARE_WANT_MOD_MASK) + #define NFS4_CDFC4_FORE 0x1 #define NFS4_CDFC4_BACK 0x2 #define NFS4_CDFC4_BOTH 0x3 -- cgit v1.2.3 From 6ae30d6eb26bce02c48c60074b4306270e2434c1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:14:00 -0500 Subject: nfsd: add support for delegated timestamps Add support for the 
delegated timestamps on write delegations. This allows the server to proxy timestamps from the delegation holder to other clients that are doing GETATTRs vs. the same inode. When OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS bit is set in the OPEN call, set the dl_type to the *_ATTRS_DELEG flavor of delegation. Add timespec64 fields to nfs4_cb_fattr and decode the timestamps into those. Vet those timestamps according to the delstid spec and update the inode attrs if necessary. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 42 +++++++++++++++++++-- fs/nfsd/nfs4state.c | 99 +++++++++++++++++++++++++++++++++++++++++++------- fs/nfsd/nfs4xdr.c | 15 +++++++- fs/nfsd/nfsd.h | 2 + fs/nfsd/state.h | 2 + fs/nfsd/xdr4cb.h | 10 +++-- include/linux/time64.h | 5 +++ 7 files changed, 152 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 1457f61ae051..50e468bdb8d4 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -42,6 +42,7 @@ #include "trace.h" #include "xdr4cb.h" #include "xdr4.h" +#include "nfs4xdr_gen.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -93,12 +94,35 @@ static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap, { fattr->ncf_cb_change = 0; fattr->ncf_cb_fsize = 0; + fattr->ncf_cb_atime.tv_sec = 0; + fattr->ncf_cb_atime.tv_nsec = 0; + fattr->ncf_cb_mtime.tv_sec = 0; + fattr->ncf_cb_mtime.tv_nsec = 0; + if (bitmap[0] & FATTR4_WORD0_CHANGE) if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0) return -NFSERR_BAD_XDR; if (bitmap[0] & FATTR4_WORD0_SIZE) if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0) return -NFSERR_BAD_XDR; + if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) { + fattr4_time_deleg_access access; + + if (!xdrgen_decode_fattr4_time_deleg_access(xdr, &access)) + return -NFSERR_BAD_XDR; + fattr->ncf_cb_atime.tv_sec = access.seconds; + fattr->ncf_cb_atime.tv_nsec = access.nseconds; + + } + if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_MODIFY) { + fattr4_time_deleg_modify modify; + + if (!xdrgen_decode_fattr4_time_deleg_modify(xdr, &modify)) + return -NFSERR_BAD_XDR; + fattr->ncf_cb_mtime.tv_sec = modify.seconds; + fattr->ncf_cb_mtime.tv_nsec = modify.nseconds; + + } return 0; } @@ -364,15 +388,21 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr, struct nfs4_delegation *dp = container_of(fattr, struct nfs4_delegation, dl_cb_fattr); struct knfsd_fh *fh = &dp->dl_stid.sc_file->fi_fhandle; struct nfs4_cb_fattr *ncf = &dp->dl_cb_fattr; - u32 bmap[1]; + u32 bmap_size = 1; + u32 bmap[3]; bmap[0] = FATTR4_WORD0_SIZE; if (!ncf->ncf_file_modified) bmap[0] |= FATTR4_WORD0_CHANGE; + if (deleg_attrs_deleg(dp->dl_type)) { + bmap[1] = 0; + bmap[2] = FATTR4_WORD2_TIME_DELEG_ACCESS | FATTR4_WORD2_TIME_DELEG_MODIFY; + bmap_size = 3; + } encode_nfs_cb_opnum4(xdr, OP_CB_GETATTR); encode_nfs_fh4(xdr, fh); - encode_bitmap4(xdr, bmap, ARRAY_SIZE(bmap)); + encode_bitmap4(xdr, bmap, bmap_size); hdr->nops++; } @@ -636,7 +666,7 @@ static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, struct nfs4_cb_compound_hdr hdr; int status; u32 bitmap[3] = {0}; - u32 attrlen; + u32 attrlen, maxlen; struct nfs4_cb_fattr *ncf = container_of(cb, struct nfs4_cb_fattr, ncf_getattr); @@ -655,7 +685,11 @@ static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, return -NFSERR_BAD_XDR; if (xdr_stream_decode_u32(xdr, &attrlen) < 0) return -NFSERR_BAD_XDR; - if (attrlen > (sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize))) + maxlen = 
sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize); + if (bitmap[2] != 0) + maxlen += (sizeof(ncf->ncf_cb_mtime.tv_sec) + + sizeof(ncf->ncf_cb_mtime.tv_nsec)) * 2; + if (attrlen > maxlen) return -NFSERR_BAD_XDR; status = decode_cb_fattr4(xdr, bitmap, ncf); return status; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 780db0992c9f..5e1d30b91738 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5951,13 +5951,14 @@ static struct nfs4_delegation * nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *parent) { - int status = 0; + bool deleg_ts = open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; struct nfs4_client *clp = stp->st_stid.sc_client; struct nfs4_file *fp = stp->st_stid.sc_file; struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; struct nfs4_delegation *dp; struct nfsd_file *nf = NULL; struct file_lease *fl; + int status = 0; u32 dl_type; /* @@ -5982,7 +5983,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, */ if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == NFS4_SHARE_ACCESS_BOTH) { nf = find_rw_file(fp); - dl_type = OPEN_DELEGATE_WRITE; + dl_type = deleg_ts ? OPEN_DELEGATE_WRITE_ATTRS_DELEG : OPEN_DELEGATE_WRITE; } /* @@ -5991,7 +5992,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, */ if (!nf && (open->op_share_access & NFS4_SHARE_ACCESS_READ)) { nf = find_readable_file(fp); - dl_type = OPEN_DELEGATE_READ; + dl_type = deleg_ts ? OPEN_DELEGATE_READ_ATTRS_DELEG : OPEN_DELEGATE_READ; } if (!nf) @@ -6149,13 +6150,14 @@ static void nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *currentfh) { - struct nfs4_delegation *dp; + bool deleg_ts = open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; struct nfs4_openowner *oo = openowner(stp->st_stateowner); struct nfs4_client *clp = stp->st_stid.sc_client; struct svc_fh *parent = NULL; - int cb_up; - int status = 0; + struct nfs4_delegation *dp; struct kstat stat; + int status = 0; + int cb_up; cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); open->op_recall = false; @@ -6196,12 +6198,14 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, destroy_delegation(dp); goto out_no_deleg; } - open->op_delegate_type = OPEN_DELEGATE_WRITE; + open->op_delegate_type = deleg_ts ? OPEN_DELEGATE_WRITE_ATTRS_DELEG : + OPEN_DELEGATE_WRITE; dp->dl_cb_fattr.ncf_cur_fsize = stat.size; dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat); trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); } else { - open->op_delegate_type = OPEN_DELEGATE_READ; + open->op_delegate_type = deleg_ts ? OPEN_DELEGATE_READ_ATTRS_DELEG : + OPEN_DELEGATE_READ; trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); } nfs4_put_stid(&dp->dl_stid); @@ -9017,6 +9021,78 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, get_stateid(cstate, &u->write.wr_stateid); } +/** + * set_cb_time - vet and set the timespec for a cb_getattr update + * @cb: timestamp from the CB_GETATTR response + * @orig: original timestamp in the inode + * @now: current time + * + * Given a timestamp in a CB_GETATTR response, check it against the + * current timestamp in the inode and the current time. Returns true + * if the inode's timestamp needs to be updated, and false otherwise. + * @cb may also be changed if the timestamp needs to be clamped. 
+ */ +static bool set_cb_time(struct timespec64 *cb, const struct timespec64 *orig, + const struct timespec64 *now) +{ + + /* + * "When the time presented is before the original time, then the + * update is ignored." Also no need to update if there is no change. + */ + if (timespec64_compare(cb, orig) <= 0) + return false; + + /* + * "When the time presented is in the future, the server can either + * clamp the new time to the current time, or it may + * return NFS4ERR_DELAY to the client, allowing it to retry." + */ + if (timespec64_compare(cb, now) > 0) { + /* clamp it */ + *cb = *now; + } + + return true; +} + +static int cb_getattr_update_times(struct dentry *dentry, struct nfs4_delegation *dp) +{ + struct inode *inode = d_inode(dentry); + struct timespec64 now = current_time(inode); + struct nfs4_cb_fattr *ncf = &dp->dl_cb_fattr; + struct iattr attrs = { }; + int ret; + + if (deleg_attrs_deleg(dp->dl_type)) { + struct timespec64 atime = inode_get_atime(inode); + struct timespec64 mtime = inode_get_mtime(inode); + + attrs.ia_atime = ncf->ncf_cb_atime; + attrs.ia_mtime = ncf->ncf_cb_mtime; + + if (set_cb_time(&attrs.ia_atime, &atime, &now)) + attrs.ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; + + if (set_cb_time(&attrs.ia_mtime, &mtime, &now)) { + attrs.ia_valid |= ATTR_CTIME | ATTR_MTIME | ATTR_MTIME_SET; + attrs.ia_ctime = attrs.ia_mtime; + } + } else { + attrs.ia_valid |= ATTR_MTIME | ATTR_CTIME; + attrs.ia_mtime = attrs.ia_ctime = now; + } + + if (!attrs.ia_valid) + return 0; + + attrs.ia_valid |= ATTR_DELEG; + inode_lock(inode); + ret = notify_change(&nop_mnt_idmap, dentry, &attrs, NULL); + inode_unlock(inode); + return ret; +} + /** * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict * @rqstp: RPC transaction context @@ -9043,7 +9119,6 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, struct file_lock_context *ctx; struct nfs4_delegation *dp = NULL; struct file_lease *fl; - struct iattr attrs; struct nfs4_cb_fattr *ncf; struct inode *inode = d_inode(dentry); @@ -9105,11 +9180,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, * not update the file's metadata with the client's * modified size */ - attrs.ia_mtime = attrs.ia_ctime = current_time(inode); - attrs.ia_valid = ATTR_MTIME | ATTR_CTIME | ATTR_DELEG; - inode_lock(inode); - err = notify_change(&nop_mnt_idmap, dentry, &attrs, NULL); - inode_unlock(inode); + err = cb_getattr_update_times(dentry, dp); if (err) { status = nfserrno(err); goto out_status; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5407fd0d464c..2c5c288064b0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3400,7 +3400,8 @@ static __be32 nfsd4_encode_fattr4_xattr_support(struct xdr_stream *xdr, #define NFSD_OA_SHARE_ACCESS_WANT (BIT(OPEN_ARGS_SHARE_ACCESS_WANT_ANY_DELEG) | \ BIT(OPEN_ARGS_SHARE_ACCESS_WANT_NO_DELEG) | \ - BIT(OPEN_ARGS_SHARE_ACCESS_WANT_CANCEL)) + BIT(OPEN_ARGS_SHARE_ACCESS_WANT_CANCEL) | \ + BIT(OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS)) #define NFSD_OA_OPEN_CLAIM (BIT(OPEN_ARGS_OPEN_CLAIM_NULL) | \ BIT(OPEN_ARGS_OPEN_CLAIM_PREVIOUS) | \ @@ -3593,7 +3594,11 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; } - if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { + if ((attrmask[0] & (FATTR4_WORD0_CHANGE | + FATTR4_WORD0_SIZE)) || + (attrmask[1] & (FATTR4_WORD1_TIME_ACCESS | + FATTR4_WORD1_TIME_MODIFY | + FATTR4_WORD1_TIME_METADATA))) { status = nfsd4_deleg_getattr_conflict(rqstp, dentry, &dp); if 
(status) goto out; @@ -3608,8 +3613,14 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (ncf->ncf_file_modified) { ++ncf->ncf_initial_cinfo; args.stat.size = ncf->ncf_cur_fsize; + if (!timespec64_is_epoch(&ncf->ncf_cb_mtime)) + args.stat.mtime = ncf->ncf_cb_mtime; } args.change_attr = ncf->ncf_initial_cinfo; + + if (!timespec64_is_epoch(&ncf->ncf_cb_atime)) + args.stat.atime = ncf->ncf_cb_atime; + nfs4_put_stid(&dp->dl_stid); } else { args.change_attr = nfsd4_change_attribute(&args.stat); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 988df43f6b07..4c101c8be95f 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -456,6 +456,8 @@ enum { FATTR4_WORD2_MODE_UMASK | \ NFSD4_2_SECURITY_ATTRS | \ FATTR4_WORD2_XATTR_SUPPORT | \ + FATTR4_WORD2_TIME_DELEG_ACCESS | \ + FATTR4_WORD2_TIME_DELEG_MODIFY | \ FATTR4_WORD2_OPEN_ARGUMENTS) extern const u32 nfsd_suppattrs[3][3]; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 17e98a2521ff..74d2d7b42676 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -159,6 +159,8 @@ struct nfs4_cb_fattr { /* from CB_GETATTR reply */ u64 ncf_cb_change; u64 ncf_cb_fsize; + struct timespec64 ncf_cb_mtime; + struct timespec64 ncf_cb_atime; unsigned long ncf_cb_flags; bool ncf_file_modified; diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index e8b00309c449..f1a315cd31b7 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -59,16 +59,20 @@ * 1: CB_GETATTR opcode (32-bit) * N: file_handle * 1: number of entry in attribute array (32-bit) - * 1: entry 0 in attribute array (32-bit) + * 3: entry 0-2 in attribute array (32-bit * 3) */ #define NFS4_enc_cb_getattr_sz (cb_compound_enc_hdr_sz + \ cb_sequence_enc_sz + \ - 1 + enc_nfs4_fh_sz + 1 + 1) + 1 + enc_nfs4_fh_sz + 1 + 3) /* * 4: fattr_bitmap_maxsz * 1: attribute array len * 2: change attr (64-bit) * 2: size (64-bit) + * 2: atime.seconds (64-bit) + * 1: atime.nanoseconds (32-bit) + * 2: mtime.seconds (64-bit) + * 1: mtime.nanoseconds (32-bit) */ #define NFS4_dec_cb_getattr_sz (cb_compound_dec_hdr_sz + \ - cb_sequence_dec_sz + 4 + 1 + 2 + 2 + op_dec_sz) + cb_sequence_dec_sz + 4 + 1 + 2 + 2 + 2 + 1 + 2 + 1 + op_dec_sz) diff --git a/include/linux/time64.h b/include/linux/time64.h index f1bcea8c124a..9934331c7b86 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -49,6 +49,11 @@ static inline int timespec64_equal(const struct timespec64 *a, return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } +static inline bool timespec64_is_epoch(const struct timespec64 *ts) +{ + return ts->tv_sec == 0 && ts->tv_nsec == 0; +} + /* * lhs < rhs: return <0 * lhs == rhs: return 0 -- cgit v1.2.3 From 7e13f4f8d27dc02fb88666f603c53ca749d56f92 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:14:01 -0500 Subject: nfsd: handle delegated timestamps in SETATTR Allow SETATTR to handle delegated timestamps. This patch assumes that only the delegation holder has the ability to set the timestamps in this way, so we allow this only if the SETATTR stateid refers to a *_ATTRS_DELEG delegation. 
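Condensed to its essentials, the gate being added can be read as the following sketch (illustrative only, not a hunk from this patch; it reuses the deleg_attrs_deleg() helper introduced earlier in this series together with the SC_TYPE_DELEG flag and delegstateid() helper from nfsd's state.h):

	/* Sketch: may this stateid set delegated timestamps via SETATTR? */
	static bool can_set_delegated_timestamps(struct nfs4_stid *st)
	{
		struct nfs4_delegation *dp;

		/* only a delegation stateid may carry delegated timestamps */
		if (!(st->sc_type & SC_TYPE_DELEG))
			return false;

		dp = delegstateid(st);
		/* ...and only the *_ATTRS_DELEG flavors grant that right */
		return deleg_attrs_deleg(dp->dl_type);
	}

Anything that fails this test decodes to nfserr_bad_stateid, which is what the hunk in nfsd4_setattr() below implements.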
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 31 ++++++++++++++++++++++++++++--- fs/nfsd/nfs4state.c | 2 +- fs/nfsd/nfs4xdr.c | 20 ++++++++++++++++++++ fs/nfsd/nfsd.h | 5 ++++- 4 files changed, 53 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ad44ad49274f..f6e06c779d09 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1135,18 +1135,43 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, .na_iattr = &setattr->sa_iattr, .na_seclabel = &setattr->sa_label, }; + bool save_no_wcc, deleg_attrs; + struct nfs4_stid *st = NULL; struct inode *inode; __be32 status = nfs_ok; - bool save_no_wcc; int err; - if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { + deleg_attrs = setattr->sa_bmval[2] & (FATTR4_WORD2_TIME_DELEG_ACCESS | + FATTR4_WORD2_TIME_DELEG_MODIFY); + + if (deleg_attrs || (setattr->sa_iattr.ia_valid & ATTR_SIZE)) { + int flags = WR_STATE; + + if (setattr->sa_bmval[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) + flags |= RD_STATE; + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &setattr->sa_stateid, - WR_STATE, NULL, NULL); + flags, NULL, &st); if (status) return status; } + + if (deleg_attrs) { + status = nfserr_bad_stateid; + if (st->sc_type & SC_TYPE_DELEG) { + struct nfs4_delegation *dp = delegstateid(st); + + /* Only for *_ATTRS_DELEG flavors */ + if (deleg_attrs_deleg(dp->dl_type)) + status = nfs_ok; + } + } + if (st) + nfs4_put_stid(st); + if (status) + return status; + err = fh_want_write(&cstate->current_fh); if (err) return nfserrno(err); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5e1d30b91738..ab19ac7b2329 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5595,7 +5595,7 @@ retry: static inline __be32 nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) { - if ((flags & WR_STATE) && deleg_is_read(dp->dl_type)) + if (!(flags & RD_STATE) && deleg_is_read(dp->dl_type)) return nfserr_openmode; else return nfs_ok; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2c5c288064b0..178325fcb6cb 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -521,6 +521,26 @@ nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen, *umask = mask & S_IRWXUGO; iattr->ia_valid |= ATTR_MODE; } + if (bmval[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) { + fattr4_time_deleg_access access; + + if (!xdrgen_decode_fattr4_time_deleg_access(argp->xdr, &access)) + return nfserr_bad_xdr; + iattr->ia_atime.tv_sec = access.seconds; + iattr->ia_atime.tv_nsec = access.nseconds; + iattr->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET | ATTR_DELEG; + } + if (bmval[2] & FATTR4_WORD2_TIME_DELEG_MODIFY) { + fattr4_time_deleg_modify modify; + + if (!xdrgen_decode_fattr4_time_deleg_modify(argp->xdr, &modify)) + return nfserr_bad_xdr; + iattr->ia_mtime.tv_sec = modify.seconds; + iattr->ia_mtime.tv_nsec = modify.nseconds; + iattr->ia_ctime.tv_sec = modify.seconds; + iattr->ia_ctime.tv_nsec = modify.seconds; + iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME | ATTR_MTIME_SET | ATTR_DELEG; + } /* request sanity: did attrlist4 contain the expected number of words? 
*/ if (attrlist4_count != xdr_stream_pos(argp->xdr) - starting_pos) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 4c101c8be95f..e2997f0ffbc5 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -528,7 +528,10 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval) #endif #define NFSD_WRITEABLE_ATTRS_WORD2 \ (FATTR4_WORD2_MODE_UMASK \ - | MAYBE_FATTR4_WORD2_SECURITY_LABEL) + | MAYBE_FATTR4_WORD2_SECURITY_LABEL \ + | FATTR4_WORD2_TIME_DELEG_ACCESS \ + | FATTR4_WORD2_TIME_DELEG_MODIFY \ + ) #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ NFSD_WRITEABLE_ATTRS_WORD0 -- cgit v1.2.3 From d3edfd9ed17cb3bc754b3064051fb5df7863fda3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:14:02 -0500 Subject: nfsd: implement OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION Allow clients to request getting a delegation xor an open stateid if a delegation isn't available. This allows the client to avoid sending a final CLOSE for the (useless) open stateid, when it is granted a delegation. If this flag is requested by the client and there isn't already a new open stateid, discard the new open stateid before replying. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 24 +++++++++++++++++++++++- fs/nfsd/nfs4xdr.c | 3 ++- 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ab19ac7b2329..b7a0cfd05401 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6242,6 +6242,17 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, */ } +/* Are we returning only a delegation stateid? */ +static bool open_xor_delegation(struct nfsd4_open *open) +{ + if (!(open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION)) + return false; + /* Did we actually get a delegation? */ + if (!deleg_is_read(open->op_delegate_type) && !deleg_is_write(open->op_delegate_type)) + return false; + return true; +} + /** * nfsd4_process_open2 - finish open processing * @rqstp: the RPC transaction being executed @@ -6339,6 +6350,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * OPEN succeeds even if we fail. */ nfs4_open_delegation(open, stp, &resp->cstate.current_fh); + + /* + * If there is an existing open stateid, it must be updated and + * returned. Only respect WANT_OPEN_XOR_DELEGATION when a new + * open stateid would have to be created. + */ + if (new_stp && open_xor_delegation(open)) { + memcpy(&open->op_stateid, &zero_stateid, sizeof(open->op_stateid)); + open->op_rflags |= OPEN4_RESULT_NO_OPEN_STATEID; + release_open_stateid(stp); + } nodeleg: status = nfs_ok; trace_nfsd_open(&stp->st_stid.sc_stateid); @@ -6355,7 +6377,7 @@ out: /* * To finish the open response, we just need to set the rflags. 
*/ - open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; + open->op_rflags |= NFS4_OPEN_RESULT_LOCKTYPE_POSIX; if (nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK; else if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED)) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 178325fcb6cb..5d413e751c74 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3421,7 +3421,8 @@ static __be32 nfsd4_encode_fattr4_xattr_support(struct xdr_stream *xdr, #define NFSD_OA_SHARE_ACCESS_WANT (BIT(OPEN_ARGS_SHARE_ACCESS_WANT_ANY_DELEG) | \ BIT(OPEN_ARGS_SHARE_ACCESS_WANT_NO_DELEG) | \ BIT(OPEN_ARGS_SHARE_ACCESS_WANT_CANCEL) | \ - BIT(OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS)) + BIT(OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS) | \ + BIT(OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION)) #define NFSD_OA_OPEN_CLAIM (BIT(OPEN_ARGS_OPEN_CLAIM_NULL) | \ BIT(OPEN_ARGS_OPEN_CLAIM_PREVIOUS) | \ -- cgit v1.2.3 From 2c5d8a8347a8ea8c243245e2ed8d87dfbf9f54fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Jan 2025 17:42:25 -0500 Subject: bcachefs: "Journal stuck" timeout now takes into account device latency If a block device (e.g. your typical consumer SSD) is taking multiple seconds for IOs (typically flushes), we don't want to emit the "journal stuck" message prematurely. Also, make sure to drop the btree_trans srcu lock if we're blocking for more than a second. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 2 +- fs/bcachefs/journal.c | 32 ++++++++++++++++++++++++++++---- fs/bcachefs/journal.h | 7 ++++--- 3 files changed, 33 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 6b79b672e0b1..2760dd9569ed 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -348,7 +348,7 @@ static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, - trans->journal_u64s, flags); + trans->journal_u64s, flags, trans); } #define JSET_ENTRY_LOG_U64s 4 diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2cd20114b74b..46d53d3ba018 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -601,6 +601,16 @@ out: : -BCH_ERR_journal_res_get_blocked; } +static unsigned max_dev_latency(struct bch_fs *c) +{ + u64 nsecs = 0; + + for_each_rw_member(c, ca) + nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); + + return nsecs_to_jiffies(nsecs); +} + /* * Essentially the entry function to the journaling code. When bcachefs is doing * a btree insert, it calls this function to get the current journal write. @@ -612,17 +622,31 @@ out: * btree node write locks. 
*/ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned flags) + unsigned flags, + struct btree_trans *trans) { int ret; if (closure_wait_event_timeout(&j->async_wait, (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || (flags & JOURNAL_RES_GET_NONBLOCK), - HZ * 10)) + HZ)) return ret; + if (trans) + bch2_trans_unlock_long(trans); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10); + + remaining_wait = max(0, remaining_wait - HZ); + + if (closure_wait_event_timeout(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK), + remaining_wait)) + return ret; + struct printbuf buf = PRINTBUF; bch2_journal_debug_to_text(&buf, j); bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s", @@ -727,7 +751,7 @@ recheck_need_open: * livelock: */ sched_annotate_sleep(); - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); if (ret) return ret; @@ -848,7 +872,7 @@ out: static int __bch2_journal_meta(struct journal *j) { struct journal_res res = {}; - int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); if (ret) return ret; diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index cb0df0663946..a01dae1a57e3 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -312,7 +312,7 @@ static inline void bch2_journal_res_put(struct journal *j, } int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, - unsigned); + unsigned, struct btree_trans *); /* First bits for BCH_WATERMARK: */ enum journal_res_flags { @@ -368,7 +368,8 @@ static inline int journal_res_get_fast(struct journal *j, } static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s, unsigned flags) + unsigned u64s, unsigned flags, + struct btree_trans *trans) { int ret; @@ -380,7 +381,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re if (journal_res_get_fast(j, res, flags)) goto out; - ret = bch2_journal_res_get_slowpath(j, res, flags); + ret = bch2_journal_res_get_slowpath(j, res, flags, trans); if (ret) return ret; out: -- cgit v1.2.3 From c9c8a17f7aa65b713418626573c5695c2a5d8ecd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 21 Jan 2025 23:03:08 -0500 Subject: bcachefs: bset_blacklisted_journal_seq is now AUTOFIX Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 0b4fe899209b..ea0a18364751 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -57,7 +57,7 @@ enum bch_fsck_flags { x(bset_wrong_sector_offset, 44, 0) \ x(bset_empty, 45, 0) \ x(bset_bad_seq, 46, 0) \ - x(bset_blacklisted_journal_seq, 47, 0) \ + x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \ x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ x(btree_node_bad_btree, 49, 0) \ x(btree_node_bad_level, 50, 0) \ -- cgit v1.2.3 From 0e2f98da2071bae0f07135adb9b3efdb737aaee6 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Tue, 7 Jan 2025 13:31:28 -0800 Subject: efivarfs: abstract initial variable creation routine Reuse later for variable creation after hibernation Signed-off-by: James Bottomley Signed-off-by: Ard 
Biesheuvel --- fs/efivarfs/super.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 89010c5878ce..0ddfda4c484b 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -226,26 +226,18 @@ bool efivarfs_variable_is_present(efi_char16_t *variable_name, return dentry != NULL; } -static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, - unsigned long name_size, void *data) +static int efivarfs_create_dentry(struct super_block *sb, efi_char16_t *name16, + unsigned long name_size, efi_guid_t vendor, + char *name) { - struct super_block *sb = (struct super_block *)data; struct efivar_entry *entry; - struct inode *inode = NULL; + struct inode *inode; struct dentry *dentry, *root = sb->s_root; unsigned long size = 0; - char *name; int len; int err = -ENOMEM; bool is_removable = false; - if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) - return 0; - - name = efivar_get_utf8name(name16, &vendor); - if (!name) - return err; - /* length of the variable name itself: remove GUID and separator */ len = strlen(name) - EFI_VARIABLE_GUID_LEN - 1; @@ -289,6 +281,22 @@ fail_name: return err; } +static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, + unsigned long name_size, void *data) +{ + struct super_block *sb = (struct super_block *)data; + char *name; + + if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) + return 0; + + name = efivar_get_utf8name(name16, &vendor); + if (!name) + return -ENOMEM; + + return efivarfs_create_dentry(sb, name16, name_size, vendor, name); +} + enum { Opt_uid, Opt_gid, }; -- cgit v1.2.3 From b5d1e6ee761a109400e97ac6a1b91c57d0f6a43a Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Tue, 7 Jan 2025 13:31:29 -0800 Subject: efivarfs: add variable resync after hibernation Hibernation allows other OSs to boot and thus the variable state might be altered by the time the hibernation image is resumed. Resync the variable state by looping over all the dentries and update the size (in case of alteration) delete any which no-longer exist. Finally, loop over all efi variables creating any which don't have corresponding dentries. 
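At its core this is a power-management notifier with two phases. A stripped-down sketch of the shape (illustrative only; the function name here is made up, and the real work lives in efivarfs_pm_notify() in the diff below):

	static int resync_pm_notify(struct notifier_block *nb,
				    unsigned long action, void *ptr)
	{
		if (action == PM_HIBERNATION_PREPARE) {
			/* another OS may run while we are frozen: flag a rescan */
			return NOTIFY_OK;
		}
		if (action != PM_POST_HIBERNATION)
			return NOTIFY_DONE;

		/* 1) walk the existing dentries: refresh sizes and prune entries
		 *    the firmware no longer reports */
		/* 2) enumerate firmware variables again and create dentries for
		 *    anything that appeared while we were hibernated */
		return NOTIFY_OK;
	}

The notifier is registered with register_pm_notifier() when the filesystem context is set up and unregistered when the superblock is killed.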
Signed-off-by: James Bottomley [ardb: - apply error pointer fixup from Dan Carpenter - rebase onto latest version of James's efivarfs rework] Signed-off-by: Ard Biesheuvel --- fs/efivarfs/internal.h | 3 +- fs/efivarfs/super.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/efivarfs/vars.c | 6 +- 3 files changed, 156 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index 1f84844fe032..ac6a1dd0a6a5 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -17,6 +17,7 @@ struct efivarfs_fs_info { struct efivarfs_mount_opts mount_opts; struct super_block *sb; struct notifier_block nb; + struct notifier_block pm_nb; }; struct efi_variable { @@ -37,7 +38,7 @@ static inline struct efivar_entry *efivar_entry(struct inode *inode) } int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), - void *data); + void *data, bool duplicate_check); int efivar_entry_delete(struct efivar_entry *entry); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 0ddfda4c484b..09fcf731e65d 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -366,7 +367,7 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - return efivar_init(efivarfs_callback, sb); + return efivar_init(efivarfs_callback, sb, true); } static int efivarfs_get_tree(struct fs_context *fc) @@ -390,6 +391,148 @@ static const struct fs_context_operations efivarfs_context_ops = { .reconfigure = efivarfs_reconfigure, }; +struct efivarfs_ctx { + struct dir_context ctx; + struct super_block *sb; + struct dentry *dentry; +}; + +static bool efivarfs_actor(struct dir_context *ctx, const char *name, int len, + loff_t offset, u64 ino, unsigned mode) +{ + unsigned long size; + struct efivarfs_ctx *ectx = container_of(ctx, struct efivarfs_ctx, ctx); + struct qstr qstr = { .name = name, .len = len }; + struct dentry *dentry = d_hash_and_lookup(ectx->sb->s_root, &qstr); + struct inode *inode; + struct efivar_entry *entry; + int err; + + if (IS_ERR_OR_NULL(dentry)) + return true; + + inode = d_inode(dentry); + entry = efivar_entry(inode); + + err = efivar_entry_size(entry, &size); + size += sizeof(__u32); /* attributes */ + if (err) + size = 0; + + inode_lock(inode); + i_size_write(inode, size); + inode_unlock(inode); + + if (!size) { + ectx->dentry = dentry; + return false; + } + + dput(dentry); + + return true; +} + +static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor, + unsigned long name_size, void *data) +{ + char *name; + struct super_block *sb = data; + struct dentry *dentry; + struct qstr qstr; + int err; + + if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) + return 0; + + name = efivar_get_utf8name(name16, &vendor); + if (!name) + return -ENOMEM; + + qstr.name = name; + qstr.len = strlen(name); + dentry = d_hash_and_lookup(sb->s_root, &qstr); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + + if (!dentry) { + /* found missing entry */ + pr_info("efivarfs: creating variable %s\n", name); + return efivarfs_create_dentry(sb, name16, name_size, vendor, name); + } + + dput(dentry); + err = 0; + + out: + kfree(name); + + return err; +} + +static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action, + void *ptr) +{ + struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info, + pm_nb); + struct path path = { .mnt = NULL, .dentry = 
sfi->sb->s_root, }; + struct efivarfs_ctx ectx = { + .ctx = { + .actor = efivarfs_actor, + }, + .sb = sfi->sb, + }; + struct file *file; + static bool rescan_done = true; + + if (action == PM_HIBERNATION_PREPARE) { + rescan_done = false; + return NOTIFY_OK; + } else if (action != PM_POST_HIBERNATION) { + return NOTIFY_DONE; + } + + if (rescan_done) + return NOTIFY_DONE; + + pr_info("efivarfs: resyncing variable state\n"); + + /* O_NOATIME is required to prevent oops on NULL mnt */ + file = kernel_file_open(&path, O_RDONLY | O_DIRECTORY | O_NOATIME, + current_cred()); + if (IS_ERR(file)) + return NOTIFY_DONE; + + rescan_done = true; + + /* + * First loop over the directory and verify each entry exists, + * removing it if it doesn't + */ + file->f_pos = 2; /* skip . and .. */ + do { + ectx.dentry = NULL; + iterate_dir(file, &ectx.ctx); + if (ectx.dentry) { + pr_info("efivarfs: removing variable %pd\n", + ectx.dentry); + simple_recursive_removal(ectx.dentry, NULL); + dput(ectx.dentry); + } + } while (ectx.dentry); + fput(file); + + /* + * then loop over variables, creating them if there's no matching + * dentry + */ + efivar_init(efivarfs_check_missing, sfi->sb, false); + + return NOTIFY_OK; +} + static int efivarfs_init_fs_context(struct fs_context *fc) { struct efivarfs_fs_info *sfi; @@ -406,6 +549,11 @@ static int efivarfs_init_fs_context(struct fs_context *fc) fc->s_fs_info = sfi; fc->ops = &efivarfs_context_ops; + + sfi->pm_nb.notifier_call = efivarfs_pm_notify; + sfi->pm_nb.priority = 0; + register_pm_notifier(&sfi->pm_nb); + return 0; } @@ -415,6 +563,7 @@ static void efivarfs_kill_sb(struct super_block *sb) blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb); kill_litter_super(sb); + unregister_pm_notifier(&sfi->pm_nb); kfree(sfi); } diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index d0beecbf9441..6edc10958ecf 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -364,6 +364,7 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, * efivar_init - build the initial list of EFI variables * @func: callback function to invoke for every variable * @data: function-specific data to pass to @func + * @duplicate_check: fail if a duplicate variable is found * * Get every EFI variable from the firmware and invoke @func. @func * should populate the initial dentry and inode tree. @@ -371,7 +372,7 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, * Returns 0 on success, or a kernel error code on failure. */ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), - void *data) + void *data, bool duplicate_check) { unsigned long variable_name_size = 512; efi_char16_t *variable_name; @@ -415,7 +416,8 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), * we'll ever see a different variable name, * and may end up looping here forever. */ - if (efivarfs_variable_is_present(variable_name, + if (duplicate_check && + efivarfs_variable_is_present(variable_name, &vendor_guid, data)) { dup_variable_bug(variable_name, &vendor_guid, variable_name_size); -- cgit v1.2.3 From eb3fabde15bccdf34f1c9b35a83aa4c0dacbb4ca Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 16 Jan 2025 20:05:39 -0500 Subject: pnfs/flexfiles: retry getting layout segment for reads If ff_layout_pg_get_read()'s attempt to get a layout segment results in -EAGAIN have ff_layout_pg_init_read() retry it after sleeping. 
If "softerr" mount is used, use 'io_maxretrans' to limit the number of attempts to get a layout segment. This fixes a long-standing issue of O_DIRECT reads failing with -EAGAIN (11) when using flexfiles Client Side Mirroring (CSM). Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer Signed-off-by: Anna Schumaker --- fs/nfs/flexfilelayout/flexfilelayout.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index ce61bf1ada6c..98b45b636be3 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -848,6 +848,9 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs4_pnfs_ds *ds; u32 ds_idx; + if (NFS_SERVER(pgio->pg_inode)->flags & + (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) + pgio->pg_maxretrans = io_maxretrans; retry: pnfs_generic_pg_check_layout(pgio, req); /* Use full layout for now */ @@ -861,6 +864,8 @@ retry: if (!pgio->pg_lseg) goto out_nolseg; } + /* Reset wb_nio, since getting layout segment was successful */ + req->wb_nio = 0; ds = ff_layout_get_ds_for_read(pgio, &ds_idx); if (!ds) { @@ -877,14 +882,24 @@ retry: pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; pgio->pg_mirror_idx = ds_idx; - - if (NFS_SERVER(pgio->pg_inode)->flags & - (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) - pgio->pg_maxretrans = io_maxretrans; return; out_nolseg: - if (pgio->pg_error < 0) - return; + if (pgio->pg_error < 0) { + if (pgio->pg_error != -EAGAIN) + return; + /* Retry getting layout segment if lower layer returned -EAGAIN */ + if (pgio->pg_maxretrans && req->wb_nio++ > pgio->pg_maxretrans) { + if (NFS_SERVER(pgio->pg_inode)->flags & NFS_MOUNT_SOFTERR) + pgio->pg_error = -ETIMEDOUT; + else + pgio->pg_error = -EIO; + return; + } + pgio->pg_error = 0; + /* Sleep for 1 second before retrying */ + ssleep(1); + goto retry; + } out_mds: trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode, 0, NFS4_MAX_UINT64, IOMODE_READ, -- cgit v1.2.3 From a68905d48a4efe5ca45bb4ceab5cd7d6ac47c300 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 20 Jan 2025 05:59:44 -0700 Subject: f2fs: Fix format specifier in sanity_check_inode() When building for 32-bit platforms, for which 'size_t' is 'unsigned int', there is a warning due to an incorrect format specifier: fs/f2fs/inode.c:320:6: error: format specifies type 'unsigned long' but the argument has type 'unsigned int' [-Werror,-Wformat] 318 | f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, min: %lu, max: %lu", | ~~~ | %u 319 | __func__, inode->i_ino, fi->i_inline_xattr_size, 320 | MIN_INLINE_XATTR_SIZE, MAX_INLINE_XATTR_SIZE); | ^~~~~~~~~~~~~~~~~~~~~ fs/f2fs/f2fs.h:1855:46: note: expanded from macro 'f2fs_warn' 1855 | f2fs_printk(sbi, false, KERN_WARNING fmt, ##__VA_ARGS__) | ~~~ ^~~~~~~~~~~ fs/f2fs/xattr.h:86:31: note: expanded from macro 'MIN_INLINE_XATTR_SIZE' 86 | #define MIN_INLINE_XATTR_SIZE (sizeof(struct f2fs_xattr_header) / sizeof(__le32)) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Use the format specifier for 'size_t', '%zu', to resolve the warning. 
Fixes: 5c1768b67250 ("f2fs: fix to do sanity check correctly on i_inline_xattr_size") Signed-off-by: Nathan Chancellor Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 7de33da8b3ea..3dd25f64d6f1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -315,7 +315,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) f2fs_has_inline_xattr(inode) && (fi->i_inline_xattr_size < MIN_INLINE_XATTR_SIZE || fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, min: %lu, max: %lu", + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, min: %zu, max: %lu", __func__, inode->i_ino, fi->i_inline_xattr_size, MIN_INLINE_XATTR_SIZE, MAX_INLINE_XATTR_SIZE); return false; -- cgit v1.2.3 From 207764e5d6f19de483d7f0e43243d1a1fce4fb32 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 20 Jan 2025 19:19:40 +0800 Subject: f2fs: fix to avoid return invalid mtime from f2fs_get_section_mtime() syzbot reported a f2fs bug as below: ------------[ cut here ]------------ kernel BUG at fs/f2fs/gc.c:373! CPU: 0 UID: 0 PID: 5316 Comm: syz.0.0 Not tainted 6.13.0-rc3-syzkaller-00044-gaef25be35d23 #0 RIP: 0010:get_cb_cost fs/f2fs/gc.c:373 [inline] RIP: 0010:get_gc_cost fs/f2fs/gc.c:406 [inline] RIP: 0010:f2fs_get_victim+0x68b1/0x6aa0 fs/f2fs/gc.c:912 Call Trace: __get_victim fs/f2fs/gc.c:1707 [inline] f2fs_gc+0xc89/0x2f60 fs/f2fs/gc.c:1915 f2fs_ioc_gc fs/f2fs/file.c:2624 [inline] __f2fs_ioctl+0x4cc9/0xb8b0 fs/f2fs/file.c:4482 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f w/ below testcase, it can reproduce directly: - dd if=/dev/zero of=/tmp/file bs=1M count=64 - mkfs.f2fs /tmp/file - mount -t f2fs -o loop,mode=fragment:block /tmp/file /mnt/f2fs - echo 0 > /sys/fs/f2fs/loop0/min_ssr_sections - dd if=/dev/zero of=/mnt/f2fs/file bs=1M count=5 - umount /mnt/f2fs - for((i=4096;i<16384;i+=512)) do inject.f2fs --sit 0 --blk $i --mb mtime --val -1 /tmp/file; done - mount -o loop /tmp/file /mnt/f2fs - f2fs_io gc 0 /mnt/f2fs/file static unsigned int get_cb_cost() { ... mtime = f2fs_get_section_mtime(sbi, segno); f2fs_bug_on(sbi, mtime == INVALID_MTIME); ... } The root cause is: mtime in f2fs_sit_entry can be fuzzed to INVALID_MTIME, then it will trigger BUG_ON in get_cb_cost() during GC. Let's change behavior of f2fs_get_section_mtime() as below for fix: - return INVALID_MTIME only if total valid blocks is zero. - return INVALID_MTIME - 1 if average mtime calculated is INVALID_MTIME. 
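In sketch form (the actual hunk follows below), the average is computed first and only then nudged off the reserved value:

    mtime = div_u64(mtime, total_valid_blocks);
    if (unlikely(mtime == INVALID_MTIME))
            mtime -= 1;     /* keep INVALID_MTIME reserved for the zero-valid-blocks case */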
Fixes: b19ee7272208 ("f2fs: introduce f2fs_get_section_mtime") Reported-by: syzbot+b9972806adbe20a910eb@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-f2fs-devel/6768c82e.050a0220.226966.0035.GAE@google.com Cc: liuderong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 813254dcc00e..b3a82a8cdc5f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -5549,8 +5549,10 @@ unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, secno = GET_SEC_FROM_SEG(sbi, segno); start = GET_SEG_FROM_SEC(sbi, secno); - if (!__is_large_section(sbi)) - return get_seg_entry(sbi, start + i)->mtime; + if (!__is_large_section(sbi)) { + mtime = get_seg_entry(sbi, start + i)->mtime; + goto out; + } for (i = 0; i < usable_segs_per_sec; i++) { /* for large section, only check the mtime of valid segments */ @@ -5563,7 +5565,11 @@ unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, if (total_valid_blocks == 0) return INVALID_MTIME; - return div_u64(mtime, total_valid_blocks); + mtime = div_u64(mtime, total_valid_blocks); +out: + if (unlikely(mtime == INVALID_MTIME)) + mtime -= 1; + return mtime; } /* -- cgit v1.2.3 From f6370a360d46e4cf10f5dde3c857747994fe1fd5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 20 Jan 2025 19:19:41 +0800 Subject: f2fs: procfs: show mtime in segment_bits Show mtime in segment_bits for debug. cat /proc/fs//f2fs/loop0/segment_bits format: segment_type|valid_blocks|bitmaps|mtime segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN) 0 3|1 | 04 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00| ffffffffffffffff 1 4|3 | 00 d0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00| ffffffffffffffff 2 5|0 | 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00| ffffffffffffffff 3 0|1 | 40 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00| ffffffffffffffff Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 6b99dc49f776..d15c68b28952 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1472,7 +1472,7 @@ static int __maybe_unused segment_bits_seq_show(struct seq_file *seq, le32_to_cpu(sbi->raw_super->segment_count_main); int i, j; - seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps|mtime\n" "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); for (i = 0; i < total_segs; i++) { @@ -1482,6 +1482,7 @@ static int __maybe_unused segment_bits_seq_show(struct seq_file *seq, seq_printf(seq, "%d|%-3u|", se->type, se->valid_blocks); for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_printf(seq, "| %llx", se->mtime); seq_putc(seq, '\n'); } return 0; -- cgit v1.2.3 From 6d4008dc4a8e0949afe1d9ff27c93fe68672ea06 Mon 
Sep 17 00:00:00 2001 From: Yi Sun Date: Mon, 13 Jan 2025 09:47:02 +0800 Subject: f2fs: Clean up the loop outside of f2fs_invalidate_blocks() Now f2fs_invalidate_blocks() supports a continuous range of addresses, so the for loop can be omitted. Signed-off-by: Yi Sun Signed-off-by: Zhiguo Niu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3e06fea9795c..f92a9fba9991 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -768,10 +768,8 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) if (IS_DEVICE_ALIASING(inode)) { struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; struct extent_info ei = et->largest; - unsigned int i; - for (i = 0; i < ei.len; i++) - f2fs_invalidate_blocks(sbi, ei.blk + i, 1); + f2fs_invalidate_blocks(sbi, ei.blk, ei.len); dec_valid_block_count(sbi, inode, ei.len); f2fs_update_time(sbi, REQ_TIME); -- cgit v1.2.3 From edf3c0860060f8171d60dc5a6c28cb6702559f32 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Tue, 21 Jan 2025 10:15:41 +0800 Subject: f2fs: fix to avoid changing 'check only' behavior of recovery The following two 'check only recovery' processes are very dependent on the return value of f2fs_recover_fsync_data, especially when the return value is greater than 0. 1. when the device has readonly mode, shown as commit 23738e74472f ("f2fs: fix to restrict mount condition on readonly block device") 2. mount option NORECOVERY or DISABLE_ROLL_FORWARD is set, shown as commit 6781eabba1bd ("f2fs: give -EINVAL for norecovery and rw mount") However, commit c426d99127b1 ("f2fs: Check write pointer consistency of open zones") will change the return value unexpectedly, thereby changing the caller's behavior. This patch lets f2fs_recover_fsync_data() return the correct value, and does not call f2fs_check_and_fix_write_pointer() when the device is read-only. Fixes: c426d99127b1 ("f2fs: Check write pointer consistency of open zones") Signed-off-by: Zhiguo Niu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 +--- fs/f2fs/segment.c | 3 ++- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index f35be2c48e3c..69a2027e3ebc 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -899,10 +899,8 @@ skip: * and the f2fs is not read only, check and fix zoned block devices' * write pointer consistency. */ - if (!err) { + if (!err) err = f2fs_check_and_fix_write_pointer(sbi); - ret = err; - } if (!err) clear_sbi_flag(sbi, SBI_POR_DOING); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b3a82a8cdc5f..dc1b47f9269a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -5462,7 +5462,8 @@ int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) { int ret; - if (!f2fs_sb_has_blkzoned(sbi) || f2fs_readonly(sbi->sb)) + if (!f2fs_sb_has_blkzoned(sbi) || f2fs_readonly(sbi->sb) || + f2fs_hw_is_readonly(sbi)) return 0; f2fs_notice(sbi, "Checking entire write pointers"); -- cgit v1.2.3 From 0a9b00e5e5c5fc3c77cbfd01e6ffbe77fc7fe74a Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 17 Jan 2025 17:38:56 -0300 Subject: smb: client: get rid of TCP_Server_Info::refpath_lock TCP_Server_Info::leaf_fullpath is allocated in cifs_get_tcp_session() and never changed afterwards, so there is no need to serialize its access.
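With the lock gone, readers simply test the pointer and copy the string, as in the updated get_ses_refpath() shown below:

    if (server->leaf_fullpath)
            path = kstrdup(server->leaf_fullpath + 1, GFP_KERNEL);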
Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 13 ++----------- fs/smb/client/connect.c | 20 ++++---------------- fs/smb/client/dfs_cache.c | 4 +--- 3 files changed, 7 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index c747b6f9baca..49ffc040f736 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -811,18 +811,9 @@ struct TCP_Server_Info { bool use_swn_dstaddr; struct sockaddr_storage swn_dstaddr; #endif - struct mutex refpath_lock; /* protects leaf_fullpath */ /* - * leaf_fullpath: Canonical DFS referral path related to this - * connection. - * It is used in DFS cache refresher, reconnect and may - * change due to nested DFS links. - * - * Protected by @refpath_lock and @srv_lock. The @refpath_lock is - * mostly used for not requiring a copy of @leaf_fullpath when getting - * cached or new DFS referrals (which might also sleep during I/O). - * While @srv_lock is held for making string and NULL comparisons against - * both fields as in mount(2) and cache refresh. + * Canonical DFS referral path used in cifs_reconnect() for failover as + * well as in DFS cache refresher. * * format: \\HOST\SHARE[\OPTIONAL PATH] */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 73d07d95d435..1053ef8915fb 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -492,7 +492,7 @@ static int reconnect_target_locked(struct TCP_Server_Info *server, static int reconnect_dfs_server(struct TCP_Server_Info *server) { struct dfs_cache_tgt_iterator *target_hint = NULL; - + const char *ref_path = server->leaf_fullpath + 1; DFS_CACHE_TGT_LIST(tl); int num_targets = 0; int rc = 0; @@ -505,10 +505,8 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) * through /proc/fs/cifs/dfscache or the target list is empty due to server settings after * refreshing the referral, so, in this case, default it to 1. 
*/ - mutex_lock(&server->refpath_lock); - if (!dfs_cache_noreq_find(server->leaf_fullpath + 1, NULL, &tl)) + if (!dfs_cache_noreq_find(ref_path, NULL, &tl)) num_targets = dfs_cache_get_nr_tgts(&tl); - mutex_unlock(&server->refpath_lock); if (!num_targets) num_targets = 1; @@ -552,9 +550,7 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } while (server->tcpStatus == CifsNeedReconnect); - mutex_lock(&server->refpath_lock); - dfs_cache_noreq_update_tgthint(server->leaf_fullpath + 1, target_hint); - mutex_unlock(&server->refpath_lock); + dfs_cache_noreq_update_tgthint(ref_path, target_hint); dfs_cache_free_tgts(&tl); /* Need to set up echo worker again once connection has been established */ @@ -569,13 +565,8 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { - mutex_lock(&server->refpath_lock); - if (!server->leaf_fullpath) { - mutex_unlock(&server->refpath_lock); + if (!server->leaf_fullpath) return __cifs_reconnect(server, mark_smb_session); - } - mutex_unlock(&server->refpath_lock); - return reconnect_dfs_server(server); } #else @@ -1754,9 +1745,6 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server); mutex_init(&tcp_ses->reconnect_mutex); -#ifdef CONFIG_CIFS_DFS_UPCALL - mutex_init(&tcp_ses->refpath_lock); -#endif memcpy(&tcp_ses->srcaddr, &ctx->srcaddr, sizeof(tcp_ses->srcaddr)); memcpy(&tcp_ses->dstaddr, &ctx->dstaddr, diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 72527623f433..5022bb1f122a 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1141,13 +1141,11 @@ static char *get_ses_refpath(struct cifs_ses *ses) struct TCP_Server_Info *server = ses->server; char *path = ERR_PTR(-ENOENT); - mutex_lock(&server->refpath_lock); if (server->leaf_fullpath) { - path = kstrdup(server->leaf_fullpath + 1, GFP_ATOMIC); + path = kstrdup(server->leaf_fullpath + 1, GFP_KERNEL); if (!path) path = ERR_PTR(-ENOMEM); } - mutex_unlock(&server->refpath_lock); return path; } -- cgit v1.2.3 From 056e91cbc9804f15704b5bc2f02f91c23b1abea1 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 17 Jan 2025 17:52:15 -0300 Subject: smb: client: don't check for @leaf_fullpath in match_server() The matching of DFS connections is already handled by @dfs_conn, so remove @leaf_fullpath matching altogether. Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 38 +++----------------------------------- 1 file changed, 3 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 1053ef8915fb..880d7cf8b730 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1526,42 +1526,10 @@ static int match_server(struct TCP_Server_Info *server, if (!cifs_match_ipaddr((struct sockaddr *)&ctx->srcaddr, (struct sockaddr *)&server->srcaddr)) return 0; - /* - * When matching cifs.ko superblocks (@match_super == true), we can't - * really match either @server->leaf_fullpath or @server->dstaddr - * directly since this @server might belong to a completely different - * server -- in case of domain-based DFS referrals or DFS links -- as - * provided earlier by mount(2) through 'source' and 'ip' options. 
- * - * Otherwise, match the DFS referral in @server->leaf_fullpath or the - * destination address in @server->dstaddr. - * - * When using 'nodfs' mount option, we avoid sharing it with DFS - * connections as they might failover. - */ - if (!match_super) { - if (!ctx->nodfs) { - if (server->leaf_fullpath) { - if (!ctx->leaf_fullpath || - strcasecmp(server->leaf_fullpath, - ctx->leaf_fullpath)) - return 0; - } else if (ctx->leaf_fullpath) { - return 0; - } - } else if (server->leaf_fullpath) { - return 0; - } - } - /* - * Match for a regular connection (address/hostname/port) which has no - * DFS referrals set. - */ - if (!server->leaf_fullpath && - (strcasecmp(server->hostname, ctx->server_hostname) || - !match_server_address(server, addr) || - !match_port(server, addr))) + if (strcasecmp(server->hostname, ctx->server_hostname) || + !match_server_address(server, addr) || + !match_port(server, addr)) return 0; if (!match_security(server, ctx)) -- cgit v1.2.3 From 3681c74d342db75b0d641ba60de27bf73e16e66b Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 21 Jan 2025 15:25:36 -0300 Subject: smb: client: handle lack of EA support in smb2_query_path_info() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the server doesn't support both EAs and reparse point in a file, the SMB2_QUERY_INFO request will fail with either STATUS_NO_EAS_ON_FILE or STATUS_EAS_NOT_SUPPORT in the compound chain, so ignore it as long as reparse point isn't IO_REPARSE_TAG_LX_(CHR|BLK), which would require the EAs to know about major/minor numbers. Reported-by: Pali Rohár Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2inode.c | 92 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 7d3685dd655a..c97f14757c27 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -176,27 +176,27 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct kvec *out_iov, int *out_buftype, struct dentry *dentry) { - struct reparse_data_buffer *rbuf; + struct smb2_query_info_rsp *qi_rsp = NULL; struct smb2_compound_vars *vars = NULL; - struct kvec *rsp_iov, *iov; - struct smb_rqst *rqst; - int rc; - __le16 *utf16_path = NULL; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; - struct cifs_fid fid; + struct cifs_open_info_data *idata; struct cifs_ses *ses = tcon->ses; + struct reparse_data_buffer *rbuf; struct TCP_Server_Info *server; - int num_rqst = 0, i; int resp_buftype[MAX_COMPOUND]; - struct smb2_query_info_rsp *qi_rsp = NULL; - struct cifs_open_info_data *idata; + int retries = 0, cur_sleep = 1; + __u8 delete_pending[8] = {1,}; + struct kvec *rsp_iov, *iov; struct inode *inode = NULL; - int flags = 0; - __u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0}; + __le16 *utf16_path = NULL; + struct smb_rqst *rqst; unsigned int size[2]; - void *data[2]; + struct cifs_fid fid; + int num_rqst = 0, i; unsigned int len; - int retries = 0, cur_sleep = 1; + int tmp_rc, rc; + int flags = 0; + void *data[2]; replay_again: /* reinitialize for possible replay */ @@ -639,7 +639,14 @@ finished: tcon->need_reconnect = true; } + tmp_rc = rc; for (i = 0; i < num_cmds; i++) { + char *buf = rsp_iov[i + i].iov_base; + + if (buf && resp_buftype[i + 1] != CIFS_NO_BUFFER) + rc = server->ops->map_error(buf, false); + else + rc = tmp_rc; switch (cmds[i]) { case SMB2_OP_QUERY_INFO: idata = in_iov[i].iov_base; 
@@ -805,6 +812,7 @@ finished: } } SMB2_close_free(&rqst[num_rqst]); + rc = tmp_rc; num_cmds += 2; if (out_iov && out_buftype) { @@ -860,22 +868,52 @@ static int parse_create_response(struct cifs_open_info_data *data, return rc; } +/* Check only if SMB2_OP_QUERY_WSL_EA command failed in the compound chain */ +static bool ea_unsupported(int *cmds, int num_cmds, + struct kvec *out_iov, int *out_buftype) +{ + int i; + + if (cmds[num_cmds - 1] != SMB2_OP_QUERY_WSL_EA) + return false; + + for (i = 1; i < num_cmds - 1; i++) { + struct smb2_hdr *hdr = out_iov[i].iov_base; + + if (out_buftype[i] == CIFS_NO_BUFFER || !hdr || + hdr->Status != STATUS_SUCCESS) + return false; + } + return true; +} + +static inline void free_rsp_iov(struct kvec *iovs, int *buftype, int count) +{ + int i; + + for (i = 0; i < count; i++) { + free_rsp_buf(buftype[i], iovs[i].iov_base); + memset(&iovs[i], 0, sizeof(*iovs)); + buftype[i] = CIFS_NO_BUFFER; + } +} + int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, struct cifs_open_info_data *data) { + struct kvec in_iov[3], out_iov[5] = {}; + struct cached_fid *cfid = NULL; struct cifs_open_parms oparms; - __u32 create_options = 0; struct cifsFileInfo *cfile; - struct cached_fid *cfid = NULL; + __u32 create_options = 0; + int out_buftype[5] = {}; struct smb2_hdr *hdr; - struct kvec in_iov[3], out_iov[3] = {}; - int out_buftype[3] = {}; + int num_cmds = 0; int cmds[3]; bool islink; - int i, num_cmds = 0; int rc, rc2; data->adjust_tz = false; @@ -945,14 +983,14 @@ int smb2_query_path_info(const unsigned int xid, if (rc || !data->reparse_point) goto out; - if (!tcon->posix_extensions) - cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA; /* * Skip SMB2_OP_GET_REPARSE if symlink already parsed in create * response. */ if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK) cmds[num_cmds++] = SMB2_OP_GET_REPARSE; + if (!tcon->posix_extensions) + cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA; oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES | @@ -960,9 +998,18 @@ int smb2_query_path_info(const unsigned int xid, FILE_OPEN, create_options | OPEN_REPARSE_POINT, ACL_NO_MODE); cifs_get_readable_path(tcon, full_path, &cfile); + free_rsp_iov(out_iov, out_buftype, ARRAY_SIZE(out_iov)); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, in_iov, cmds, num_cmds, - cfile, NULL, NULL, NULL); + cfile, out_iov, out_buftype, NULL); + if (rc && ea_unsupported(cmds, num_cmds, + out_iov, out_buftype)) { + if (data->reparse.tag != IO_REPARSE_TAG_LX_BLK && + data->reparse.tag != IO_REPARSE_TAG_LX_CHR) + rc = 0; + else + rc = -EOPNOTSUPP; + } break; case -EREMOTE: break; @@ -980,8 +1027,7 @@ int smb2_query_path_info(const unsigned int xid, } out: - for (i = 0; i < ARRAY_SIZE(out_buftype); i++) - free_rsp_buf(out_buftype[i], out_iov[i].iov_base); + free_rsp_iov(out_iov, out_buftype, ARRAY_SIZE(out_iov)); return rc; } -- cgit v1.2.3 From 8f9530aeeb4f756bdfa70510b40e5d28ea3c742e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 23 Jan 2025 17:01:09 +0800 Subject: erofs: refine z_erofs_get_extent_compressedlen() - Set `compressedblks = 1` directly for non-bigpcluster cases. This simplifies the logic a bit since lcluster sizes larger than one block are unsupported and the details remain unclear. - For Z_EROFS_LCLUSTER_TYPE_PLAIN pclusters, avoid assuming `compressedblks = 1` by default. Instead, check if Z_EROFS_ADVISE_BIG_PCLUSTER_2 is set. 
It basically has no impact to existing valid images, but it's useful to find the gap to prepare for large PLAIN pclusters. Link: https://lore.kernel.org/r/20250123090109.973463-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/zmap.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index b9e35089c9b8..689437e99a5a 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -294,27 +294,23 @@ err_bogus: static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, unsigned int initial_lcn) { - struct super_block *sb = m->inode->i_sb; - struct erofs_inode *const vi = EROFS_I(m->inode); - struct erofs_map_blocks *const map = m->map; - const unsigned int lclusterbits = vi->z_logical_clusterbits; - unsigned long lcn; + struct inode *inode = m->inode; + struct super_block *sb = inode->i_sb; + struct erofs_inode *vi = EROFS_I(inode); + bool bigpcl1 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + bool bigpcl2 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2; + unsigned long lcn = m->lcn + 1; int err; - DBG_BUGON(m->type != Z_EROFS_LCLUSTER_TYPE_PLAIN && - m->type != Z_EROFS_LCLUSTER_TYPE_HEAD1 && - m->type != Z_EROFS_LCLUSTER_TYPE_HEAD2); + DBG_BUGON(m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); DBG_BUGON(m->type != m->headtype); - if (m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN || - ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1) && - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) || - ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { - map->m_plen = 1ULL << lclusterbits; - return 0; - } - lcn = m->lcn + 1; + if ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) || + ((m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN || + m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) || + (lcn << vi->z_logical_clusterbits) >= inode->i_size) + m->compressedblks = 1; + if (m->compressedblks) goto out; @@ -339,9 +335,9 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, case Z_EROFS_LCLUSTER_TYPE_HEAD2: /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type - * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. + * rather than CBLKCNT, it's a 1 block-sized pcluster. */ - m->compressedblks = 1 << (lclusterbits - sb->s_blocksize_bits); + m->compressedblks = 1; break; case Z_EROFS_LCLUSTER_TYPE_NONHEAD: if (m->delta[0] != 1) @@ -356,7 +352,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, return -EFSCORRUPTED; } out: - map->m_plen = erofs_pos(sb, m->compressedblks); + m->map->m_plen = erofs_pos(sb, m->compressedblks); return 0; err_bonus_cblkcnt: erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); -- cgit v1.2.3 From 92270d076115ae81b83f0605703602d2d5a866e2 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:28:54 +0100 Subject: fuse: rename to fuse_dev_end_requests and make non-static This function is needed by fuse_uring.c to clean ring queues, so make it non static. 
Especially in non-static mode the function name 'end_requests' should be prefixed with fuse_ Signed-off-by: Bernd Schubert Reviewed-by: Josef Bacik Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 11 +++++------ fs/fuse/fuse_dev_i.h | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 6 deletions(-) create mode 100644 fs/fuse/fuse_dev_i.h (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 27ccae63495d..757f2c797d68 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dev_i.h" #include #include @@ -34,8 +35,6 @@ MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; -static void end_requests(struct list_head *head); - static struct fuse_dev *fuse_get_dev(struct file *file) { /* @@ -1885,7 +1884,7 @@ static void fuse_resend(struct fuse_conn *fc) spin_unlock(&fiq->lock); list_for_each_entry(req, &to_queue, list) clear_bit(FR_PENDING, &req->flags); - end_requests(&to_queue); + fuse_dev_end_requests(&to_queue); return; } /* iq and pq requests are both oldest to newest */ @@ -2204,7 +2203,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) } /* Abort all requests on the given list (pending or processing) */ -static void end_requests(struct list_head *head) +void fuse_dev_end_requests(struct list_head *head) { while (!list_empty(head)) { struct fuse_req *req; @@ -2307,7 +2306,7 @@ void fuse_abort_conn(struct fuse_conn *fc) wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); - end_requests(&to_end); + fuse_dev_end_requests(&to_end); } else { spin_unlock(&fc->lock); } @@ -2337,7 +2336,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) list_splice_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); - end_requests(&to_end); + fuse_dev_end_requests(&to_end); /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h new file mode 100644 index 000000000000..4fcff2223fa6 --- /dev/null +++ b/fs/fuse/fuse_dev_i.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2008 Miklos Szeredi + */ +#ifndef _FS_FUSE_DEV_I_H +#define _FS_FUSE_DEV_I_H + +#include + +void fuse_dev_end_requests(struct list_head *head); + +#endif + -- cgit v1.2.3 From 867d93dcdede5ea453543085d1ed0bf43cde60de Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:28:55 +0100 Subject: fuse: Move fuse_get_dev to header file Another preparation patch, as this function will be needed by fuse/dev.c and fuse/dev_uring.c. Signed-off-by: Bernd Schubert Reviewed-by: Josef Bacik Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 9 --------- fs/fuse/fuse_dev_i.h | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 757f2c797d68..3db3282bdac4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -35,15 +35,6 @@ MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; -static struct fuse_dev *fuse_get_dev(struct file *file) -{ - /* - * Lockless access is OK, because file->private data is set - * once during mount and is valid until the file is released. 
- */ - return READ_ONCE(file->private_data); -} - static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) { INIT_LIST_HEAD(&req->list); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 4fcff2223fa6..e7ea1b21c182 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -8,6 +8,15 @@ #include +static inline struct fuse_dev *fuse_get_dev(struct file *file) +{ + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. + */ + return READ_ONCE(file->private_data); +} + void fuse_dev_end_requests(struct list_head *head); #endif -- cgit v1.2.3 From 88be7aa98d91f5761f8764273ebd961915e4632d Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:28:56 +0100 Subject: fuse: Move request bits These are needed by fuse-over-io-uring. Signed-off-by: Bernd Schubert Reviewed-by: Josef Bacik Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 4 ---- fs/fuse/fuse_dev_i.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 3db3282bdac4..4f8825de9e05 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -29,10 +29,6 @@ MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); -/* Ordinary requests have even IDs, while interrupts IDs are odd */ -#define FUSE_INT_REQ_BIT (1ULL << 0) -#define FUSE_REQ_ID_STEP (1ULL << 1) - static struct kmem_cache *fuse_req_cachep; static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index e7ea1b21c182..08a7e88e0027 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -8,6 +8,10 @@ #include +/* Ordinary requests have even IDs, while interrupts IDs are odd */ +#define FUSE_INT_REQ_BIT (1ULL << 0) +#define FUSE_REQ_ID_STEP (1ULL << 1) + static inline struct fuse_dev *fuse_get_dev(struct file *file) { /* -- cgit v1.2.3 From 7ccd86ba3a485a8bc33478776eb7053d9adb7816 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:28:58 +0100 Subject: fuse: make args->in_args[0] to be always the header This change sets up FUSE operations to always have headers in args.in_args[0], even for opcodes without an actual header. This step prepares for a clean separation of payload from headers, initially it is used by fuse-over-io-uring. For opcodes without a header, we use a zero-sized struct as a placeholder. This approach: - Keeps things consistent across all FUSE operations - Will help with payload alignment later - Avoids future issues when header sizes change Op codes that already have an op code specific header do not need modification. Op codes that have neither payload nor op code headers are not modified either (FUSE_READLINK and FUSE_DESTROY). FUSE_BATCH_FORGET already has the header in the right place, but is not using fuse_copy_args - as -over-uring is currently not handling forgets it does not matter for now, but header separation will later need special attention for that op code. Correct the struct fuse_args->in_args array max size. 
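As a minimal illustration (hypothetical caller and 'name_len' variable, not one of the op codes converted below), a request that used to pass only a name string now reserves slot 0 for the placeholder header and shifts its payload up by one:

    args.in_numargs = 2;
    fuse_set_zero_arg0(&args);              /* zero-sized header placeholder in in_args[0] */
    args.in_args[1].size = name_len + 1;
    args.in_args[1].value = name;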
Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 11 ++++++----- fs/fuse/dev.c | 9 +++++---- fs/fuse/dir.c | 32 ++++++++++++++++++-------------- fs/fuse/fuse_i.h | 15 ++++++++++++++- fs/fuse/xattr.c | 7 ++++--- 5 files changed, 47 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 9abbc2f2894f..0b6ee6dd1fd6 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -240,11 +240,12 @@ static int fuse_send_removemapping(struct inode *inode, args.opcode = FUSE_REMOVEMAPPING; args.nodeid = fi->nodeid; - args.in_numargs = 2; - args.in_args[0].size = sizeof(*inargp); - args.in_args[0].value = inargp; - args.in_args[1].size = inargp->count * sizeof(*remove_one); - args.in_args[1].value = remove_one; + args.in_numargs = 3; + fuse_set_zero_arg0(&args); + args.in_args[1].size = sizeof(*inargp); + args.in_args[1].value = inargp; + args.in_args[2].size = inargp->count * sizeof(*remove_one); + args.in_args[2].value = remove_one; return fuse_simple_request(fm, &args); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4f8825de9e05..623c5a067c18 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1746,7 +1746,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, args = &ap->args; args->nodeid = outarg->nodeid; args->opcode = FUSE_NOTIFY_REPLY; - args->in_numargs = 2; + args->in_numargs = 3; args->in_pages = true; args->end = fuse_retrieve_end; @@ -1774,9 +1774,10 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, } ra->inarg.offset = outarg->offset; ra->inarg.size = total_len; - args->in_args[0].size = sizeof(ra->inarg); - args->in_args[0].value = &ra->inarg; - args->in_args[1].size = total_len; + fuse_set_zero_arg0(args); + args->in_args[1].size = sizeof(ra->inarg); + args->in_args[1].value = &ra->inarg; + args->in_args[2].size = total_len; err = fuse_simple_notify_reply(fm, args, outarg->notify_unique); if (err) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index e540d05549ff..2ecdb8f14d46 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -175,9 +175,10 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, memset(outarg, 0, sizeof(struct fuse_entry_out)); args->opcode = FUSE_LOOKUP; args->nodeid = nodeid; - args->in_numargs = 1; - args->in_args[0].size = name->len + 1; - args->in_args[0].value = name->name; + args->in_numargs = 2; + fuse_set_zero_arg0(args); + args->in_args[1].size = name->len + 1; + args->in_args[1].value = name->name; args->out_numargs = 1; args->out_args[0].size = sizeof(struct fuse_entry_out); args->out_args[0].value = outarg; @@ -928,11 +929,12 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, FUSE_ARGS(args); args.opcode = FUSE_SYMLINK; - args.in_numargs = 2; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; - args.in_args[1].size = len; - args.in_args[1].value = link; + args.in_numargs = 3; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; + args.in_args[2].size = len; + args.in_args[2].value = link; return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK); } @@ -992,9 +994,10 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.opcode = FUSE_UNLINK; args.nodeid = get_node_id(dir); - args.in_numargs = 1; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; + args.in_numargs = 2; + 
fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); @@ -1015,9 +1018,10 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.opcode = FUSE_RMDIR; args.nodeid = get_node_id(dir); - args.in_numargs = 1; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 74744c6f2860..5666900bee5e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -310,7 +310,7 @@ struct fuse_args { bool is_ext:1; bool is_pinned:1; bool invalidate_vmap:1; - struct fuse_in_arg in_args[3]; + struct fuse_in_arg in_args[4]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); /* Used for kvec iter backed by vmalloc address */ @@ -947,6 +947,19 @@ struct fuse_mount { struct rcu_head rcu; }; +/* + * Empty header for FUSE opcodes without specific header needs. + * Used as a placeholder in args->in_args[0] for consistency + * across all FUSE operations, simplifying request handling. + */ +struct fuse_zero_header {}; + +static inline void fuse_set_zero_arg0(struct fuse_args *args) +{ + args->in_args[0].size = sizeof(struct fuse_zero_header); + args->in_args[0].value = NULL; +} + static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) { return sb->s_fs_info; diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 9f568d345c51..93dfb06b6cea 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -164,9 +164,10 @@ int fuse_removexattr(struct inode *inode, const char *name) args.opcode = FUSE_REMOVEXATTR; args.nodeid = get_node_id(inode); - args.in_numargs = 1; - args.in_args[0].size = strlen(name) + 1; - args.in_args[0].value = name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_removexattr = 1; -- cgit v1.2.3 From 24fe962c86f55347385933a1b06ca71b60854690 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:28:59 +0100 Subject: fuse: {io-uring} Handle SQEs - register commands This adds basic support for ring SQEs (with opcode=IORING_OP_URING_CMD). For now only FUSE_IO_URING_CMD_REGISTER is handled to register queue entries. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/Kconfig | 12 ++ fs/fuse/Makefile | 1 + fs/fuse/dev_uring.c | 326 ++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 113 ++++++++++++++++ fs/fuse/fuse_i.h | 5 + fs/fuse/inode.c | 10 ++ include/uapi/linux/fuse.h | 76 ++++++++++- 7 files changed, 542 insertions(+), 1 deletion(-) create mode 100644 fs/fuse/dev_uring.c create mode 100644 fs/fuse/dev_uring_i.h (limited to 'fs') diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 8674dbfbe59d..ca215a3cba3e 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -63,3 +63,15 @@ config FUSE_PASSTHROUGH to be performed directly on a backing file. If you want to allow passthrough operations, answer Y. 
+ +config FUSE_IO_URING + bool "FUSE communication over io-uring" + default y + depends on FUSE_FS + depends on IO_URING + help + This allows sending FUSE requests over the io-uring interface and + also adds request core affinity. + + If you want to allow fuse server/client communication through io-uring, + answer Y diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 2c372180d631..3f0f312a31c1 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -15,5 +15,6 @@ fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o fuse-$(CONFIG_SYSCTL) += sysctl.o +fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c new file mode 100644 index 000000000000..42092a234570 --- /dev/null +++ b/fs/fuse/dev_uring.c @@ -0,0 +1,326 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2024 DataDirect Networks. + */ + +#include "fuse_i.h" +#include "dev_uring_i.h" +#include "fuse_dev_i.h" + +#include +#include + +static bool __read_mostly enable_uring; +module_param(enable_uring, bool, 0644); +MODULE_PARM_DESC(enable_uring, + "Enable userspace communication through io-uring"); + +#define FUSE_URING_IOV_SEGS 2 /* header and payload */ + + +bool fuse_uring_enabled(void) +{ + return enable_uring; +} + +void fuse_uring_destruct(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + int qid; + + if (!ring) + return; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = ring->queues[qid]; + + if (!queue) + continue; + + WARN_ON(!list_empty(&queue->ent_avail_queue)); + WARN_ON(!list_empty(&queue->ent_commit_queue)); + + kfree(queue); + ring->queues[qid] = NULL; + } + + kfree(ring->queues); + kfree(ring); + fc->ring = NULL; +} + +/* + * Basic ring setup for this connection based on the provided configuration + */ +static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) +{ + struct fuse_ring *ring; + size_t nr_queues = num_possible_cpus(); + struct fuse_ring *res = NULL; + size_t max_payload_size; + + ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT); + if (!ring) + return NULL; + + ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), + GFP_KERNEL_ACCOUNT); + if (!ring->queues) + goto out_err; + + max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); + max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); + + spin_lock(&fc->lock); + if (fc->ring) { + /* race, another thread created the ring in the meantime */ + spin_unlock(&fc->lock); + res = fc->ring; + goto out_err; + } + + fc->ring = ring; + ring->nr_queues = nr_queues; + ring->fc = fc; + ring->max_payload_sz = max_payload_size; + + spin_unlock(&fc->lock); + return ring; + +out_err: + kfree(ring->queues); + kfree(ring); + return res; +} + +static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, + int qid) +{ + struct fuse_conn *fc = ring->fc; + struct fuse_ring_queue *queue; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); + if (!queue) + return NULL; + queue->qid = qid; + queue->ring = ring; + spin_lock_init(&queue->lock); + + INIT_LIST_HEAD(&queue->ent_avail_queue); + INIT_LIST_HEAD(&queue->ent_commit_queue); + + spin_lock(&fc->lock); + if (ring->queues[qid]) { + spin_unlock(&fc->lock); + kfree(queue); + return ring->queues[qid]; + } + + /* + * write_once and lock as the caller mostly doesn't take the lock at all + */ + WRITE_ONCE(ring->queues[qid], queue); + spin_unlock(&fc->lock); 
+ + return queue; +} + +/* + * Make a ring entry available for fuse_req assignment + */ +static void fuse_uring_ent_avail(struct fuse_ring_ent *ent, + struct fuse_ring_queue *queue) +{ + WARN_ON_ONCE(!ent->cmd); + list_move(&ent->list, &queue->ent_avail_queue); + ent->state = FRRS_AVAILABLE; +} + +/* + * fuse_uring_req_fetch command handling + */ +static void fuse_uring_do_register(struct fuse_ring_ent *ent, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + ent->cmd = cmd; + fuse_uring_ent_avail(ent, queue); + spin_unlock(&queue->lock); +} + +/* + * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] + * the payload + */ +static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, + struct iovec iov[FUSE_URING_IOV_SEGS]) +{ + struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr)); + struct iov_iter iter; + ssize_t ret; + + if (sqe->len != FUSE_URING_IOV_SEGS) + return -EINVAL; + + /* + * Direction for buffer access will actually be READ and WRITE, + * using write for the import should include READ access as well. + */ + ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS, + FUSE_URING_IOV_SEGS, &iov, &iter); + if (ret < 0) + return ret; + + return 0; +} + +static struct fuse_ring_ent * +fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, + struct fuse_ring_queue *queue) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_ring_ent *ent; + size_t payload_size; + struct iovec iov[FUSE_URING_IOV_SEGS]; + int err; + + err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov); + if (err) { + pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n", + err); + return ERR_PTR(err); + } + + err = -EINVAL; + if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) { + pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len); + return ERR_PTR(err); + } + + payload_size = iov[1].iov_len; + if (payload_size < ring->max_payload_sz) { + pr_info_ratelimited("Invalid req payload len %zu\n", + payload_size); + return ERR_PTR(err); + } + + err = -ENOMEM; + ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT); + if (!ent) + return ERR_PTR(err); + + INIT_LIST_HEAD(&ent->list); + + ent->queue = queue; + ent->headers = iov[0].iov_base; + ent->payload = iov[1].iov_base; + + return ent; +} + +/* + * Register header and payload buffer with the kernel and puts the + * entry as "ready to get fuse requests" on the queue + */ +static int fuse_uring_register(struct io_uring_cmd *cmd, + unsigned int issue_flags, struct fuse_conn *fc) +{ + const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent; + int err; + unsigned int qid = READ_ONCE(cmd_req->qid); + + err = -ENOMEM; + if (!ring) { + ring = fuse_uring_create(fc); + if (!ring) + return err; + } + + if (qid >= ring->nr_queues) { + pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); + return -EINVAL; + } + + queue = ring->queues[qid]; + if (!queue) { + queue = fuse_uring_create_queue(ring, qid); + if (!queue) + return err; + } + + /* + * The created queue above does not need to be destructed in + * case of entry errors below, will be done at ring destruction time. 
+ */ + + ent = fuse_uring_create_ring_ent(cmd, queue); + if (IS_ERR(ent)) + return PTR_ERR(ent); + + fuse_uring_do_register(ent, cmd, issue_flags); + + return 0; +} + +/* + * Entry function from io_uring to handle the given passthrough command + * (op code IORING_OP_URING_CMD) + */ +int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_dev *fud; + struct fuse_conn *fc; + u32 cmd_op = cmd->cmd_op; + int err; + + if (!enable_uring) { + pr_info_ratelimited("fuse-io-uring is disabled\n"); + return -EOPNOTSUPP; + } + + /* This extra SQE size holds struct fuse_uring_cmd_req */ + if (!(issue_flags & IO_URING_F_SQE128)) + return -EINVAL; + + fud = fuse_get_dev(cmd->file); + if (!fud) { + pr_info_ratelimited("No fuse device found\n"); + return -ENOTCONN; + } + fc = fud->fc; + + if (fc->aborted) + return -ECONNABORTED; + if (!fc->connected) + return -ENOTCONN; + + /* + * fuse_uring_register() needs the ring to be initialized, + * we need to know the max payload size + */ + if (!fc->initialized) + return -EAGAIN; + + switch (cmd_op) { + case FUSE_IO_URING_CMD_REGISTER: + err = fuse_uring_register(cmd, issue_flags, fc); + if (err) { + pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", + err); + return err; + } + break; + default: + return -EINVAL; + } + + return -EIOCBQUEUED; +} diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h new file mode 100644 index 000000000000..ae1536355b36 --- /dev/null +++ b/fs/fuse/dev_uring_i.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2024 DataDirect Networks. + */ + +#ifndef _FS_FUSE_DEV_URING_I_H +#define _FS_FUSE_DEV_URING_I_H + +#include "fuse_i.h" + +#ifdef CONFIG_FUSE_IO_URING + +enum fuse_ring_req_state { + FRRS_INVALID = 0, + + /* The ring entry received from userspace and it is being processed */ + FRRS_COMMIT, + + /* The ring entry is waiting for new fuse requests */ + FRRS_AVAILABLE, + + /* The ring entry is in or on the way to user space */ + FRRS_USERSPACE, +}; + +/** A fuse ring entry, part of the ring queue */ +struct fuse_ring_ent { + /* userspace buffer */ + struct fuse_uring_req_header __user *headers; + void __user *payload; + + /* the ring queue that owns the request */ + struct fuse_ring_queue *queue; + + /* fields below are protected by queue->lock */ + + struct io_uring_cmd *cmd; + + struct list_head list; + + enum fuse_ring_req_state state; + + struct fuse_req *fuse_req; +}; + +struct fuse_ring_queue { + /* + * back pointer to the main fuse uring structure that holds this + * queue + */ + struct fuse_ring *ring; + + /* queue id, corresponds to the cpu core */ + unsigned int qid; + + /* + * queue lock, taken when any value in the queue changes _and_ also + * a ring entry state changes. 
+ */ + spinlock_t lock; + + /* available ring entries (struct fuse_ring_ent) */ + struct list_head ent_avail_queue; + + /* + * entries in the process of being committed or in the process + * to be sent to userspace + */ + struct list_head ent_commit_queue; +}; + +/** + * Describes if uring is for communication and holds alls the data needed + * for uring communication + */ +struct fuse_ring { + /* back pointer */ + struct fuse_conn *fc; + + /* number of ring queues */ + size_t nr_queues; + + /* maximum payload/arg size */ + size_t max_payload_sz; + + struct fuse_ring_queue **queues; +}; + +bool fuse_uring_enabled(void); +void fuse_uring_destruct(struct fuse_conn *fc); +int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); + +#else /* CONFIG_FUSE_IO_URING */ + +struct fuse_ring; + +static inline void fuse_uring_create(struct fuse_conn *fc) +{ +} + +static inline void fuse_uring_destruct(struct fuse_conn *fc) +{ +} + +static inline bool fuse_uring_enabled(void) +{ + return false; +} + +#endif /* CONFIG_FUSE_IO_URING */ + +#endif /* _FS_FUSE_DEV_URING_I_H */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 5666900bee5e..bce8cc482d64 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -923,6 +923,11 @@ struct fuse_conn { /** IDR for backing files ids */ struct idr backing_files_map; #endif + +#ifdef CONFIG_FUSE_IO_URING + /** uring connection information*/ + struct fuse_ring *ring; +#endif }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3ce4f4e81d09..e4f9bbacfc1b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "dev_uring_i.h" #include #include @@ -992,6 +993,8 @@ static void delayed_release(struct rcu_head *p) { struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu); + fuse_uring_destruct(fc); + put_user_ns(fc->user_ns); fc->release(fc); } @@ -1446,6 +1449,13 @@ void fuse_send_init(struct fuse_mount *fm) if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) flags |= FUSE_PASSTHROUGH; + /* + * This is just an information flag for fuse server. No need to check + * the reply - server is either sending IORING_OP_URING_CMD or not. 
+ */ + if (fuse_uring_enabled()) + flags |= FUSE_OVER_IO_URING; + ia->in.flags = flags; ia->in.flags2 = flags >> 32; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index f1e99458e29e..5e0eb41d967e 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -220,6 +220,15 @@ * * 7.41 * - add FUSE_ALLOW_IDMAP + * 7.42 + * - Add FUSE_OVER_IO_URING and all other io-uring related flags and data + * structures: + * - struct fuse_uring_ent_in_out + * - struct fuse_uring_req_header + * - struct fuse_uring_cmd_req + * - FUSE_URING_IN_OUT_HEADER_SZ + * - FUSE_URING_OP_IN_OUT_SZ + * - enum fuse_uring_cmd */ #ifndef _LINUX_FUSE_H @@ -255,7 +264,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 41 +#define FUSE_KERNEL_MINOR_VERSION 42 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -425,6 +434,7 @@ struct fuse_file_lock { * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit * of the request ID indicates resend requests * FUSE_ALLOW_IDMAP: allow creation of idmapped mounts + * FUSE_OVER_IO_URING: Indicate that client supports io-uring */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -471,6 +481,7 @@ struct fuse_file_lock { /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP #define FUSE_ALLOW_IDMAP (1ULL << 40) +#define FUSE_OVER_IO_URING (1ULL << 41) /** * CUSE INIT request/reply flags @@ -1206,4 +1217,67 @@ struct fuse_supp_groups { uint32_t groups[]; }; +/** + * Size of the ring buffer header + */ +#define FUSE_URING_IN_OUT_HEADER_SZ 128 +#define FUSE_URING_OP_IN_OUT_SZ 128 + +/* Used as part of the fuse_uring_req_header */ +struct fuse_uring_ent_in_out { + uint64_t flags; + + /* + * commit ID to be used in a reply to a ring request (see also + * struct fuse_uring_cmd_req) + */ + uint64_t commit_id; + + /* size of user payload buffer */ + uint32_t payload_sz; + uint32_t padding; + + uint64_t reserved; +}; + +/** + * Header for all fuse-io-uring requests + */ +struct fuse_uring_req_header { + /* struct fuse_in_header / struct fuse_out_header */ + char in_out[FUSE_URING_IN_OUT_HEADER_SZ]; + + /* per op code header */ + char op_in[FUSE_URING_OP_IN_OUT_SZ]; + + struct fuse_uring_ent_in_out ring_ent_in_out; +}; + +/** + * sqe commands to the kernel + */ +enum fuse_uring_cmd { + FUSE_IO_URING_CMD_INVALID = 0, + + /* register the request buffer and fetch a fuse request */ + FUSE_IO_URING_CMD_REGISTER = 1, + + /* commit fuse request result and fetch next request */ + FUSE_IO_URING_CMD_COMMIT_AND_FETCH = 2, +}; + +/** + * In the 80B command area of the SQE. + */ +struct fuse_uring_cmd_req { + uint64_t flags; + + /* entry identifier for commits */ + uint64_t commit_id; + + /* queue the command is for (queue index) */ + uint16_t qid; + uint8_t padding[6]; +}; + #endif /* _LINUX_FUSE_H */ -- cgit v1.2.3 From d0f9c62aaf7a98412b46a91fe7daad76b316b3b7 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:00 +0100 Subject: fuse: Make fuse_copy non static Move 'struct fuse_copy_state' and fuse_copy_* functions to fuse_dev_i.h to make it available for fuse-io-uring. 'copy_out_args()' is renamed to 'fuse_copy_out_args'. 
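As a purely illustrative sketch (not part of this patch), code outside dev.c can now parse a reply's argument payload with the exported helpers; example_copy_reply() and its parameters are hypothetical names:

static int example_copy_reply(struct fuse_req *req, struct iov_iter *iter,
			      unsigned int nbytes)
{
	struct fuse_copy_state cs;

	/* 0 == copy from userspace into the request's out args */
	fuse_copy_init(&cs, 0, iter);
	cs.req = req;

	return fuse_copy_out_args(&cs, req->args, nbytes);
}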
Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 30 ++++++++---------------------- fs/fuse/fuse_dev_i.h | 25 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 623c5a067c18..6ee7e28a84c8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -678,22 +678,8 @@ static int unlock_request(struct fuse_req *req) return err; } -struct fuse_copy_state { - int write; - struct fuse_req *req; - struct iov_iter *iter; - struct pipe_buffer *pipebufs; - struct pipe_buffer *currbuf; - struct pipe_inode_info *pipe; - unsigned long nr_segs; - struct page *pg; - unsigned len; - unsigned offset; - unsigned move_pages:1; -}; - -static void fuse_copy_init(struct fuse_copy_state *cs, int write, - struct iov_iter *iter) +void fuse_copy_init(struct fuse_copy_state *cs, int write, + struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); cs->write = write; @@ -1054,9 +1040,9 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) } /* Copy request arguments to/from userspace buffer */ -static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, - unsigned argpages, struct fuse_arg *args, - int zeroing) +int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, + unsigned argpages, struct fuse_arg *args, + int zeroing) { int err = 0; unsigned i; @@ -1933,8 +1919,8 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) return NULL; } -static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, - unsigned nbytes) +int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, + unsigned nbytes) { unsigned reqsize = sizeof(struct fuse_out_header); @@ -2036,7 +2022,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, if (oh.error) err = nbytes != sizeof(oh) ? -EINVAL : 0; else - err = copy_out_args(cs, req->args, nbytes); + err = fuse_copy_out_args(cs, req->args, nbytes); fuse_copy_finish(cs); spin_lock(&fpq->lock); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 08a7e88e0027..21eb1bdb492d 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -12,6 +12,23 @@ #define FUSE_INT_REQ_BIT (1ULL << 0) #define FUSE_REQ_ID_STEP (1ULL << 1) +struct fuse_arg; +struct fuse_args; + +struct fuse_copy_state { + int write; + struct fuse_req *req; + struct iov_iter *iter; + struct pipe_buffer *pipebufs; + struct pipe_buffer *currbuf; + struct pipe_inode_info *pipe; + unsigned long nr_segs; + struct page *pg; + unsigned int len; + unsigned int offset; + unsigned int move_pages:1; +}; + static inline struct fuse_dev *fuse_get_dev(struct file *file) { /* @@ -23,5 +40,13 @@ static inline struct fuse_dev *fuse_get_dev(struct file *file) void fuse_dev_end_requests(struct list_head *head); +void fuse_copy_init(struct fuse_copy_state *cs, int write, + struct iov_iter *iter); +int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, + unsigned int argpages, struct fuse_arg *args, + int zeroing); +int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, + unsigned int nbytes); + #endif -- cgit v1.2.3 From f773a7c2c3d934d00542c6471170066d150d152f Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:01 +0100 Subject: fuse: Add fuse-io-uring handling into fuse_copy Add special fuse-io-uring into the fuse argument copy handler. 
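A rough sketch of how the new fields are meant to be used (illustrative only; iter, args, req and the reply header variable ent_in_out are assumed from the surrounding ring code): set is_uring before copying, then read ring.copied_sz back to report the payload length:

struct fuse_copy_state cs;
int err;

fuse_copy_init(&cs, 1, &iter);		/* 1 == copy towards userspace */
cs.is_uring = 1;
cs.req = req;
err = fuse_copy_args(&cs, args->in_numargs, args->in_pages,
		     (struct fuse_arg *)args->in_args, 0);
if (!err)
	ent_in_out->payload_sz = cs.ring.copied_sz;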
Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 12 +++++++++++- fs/fuse/fuse_dev_i.h | 4 ++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6ee7e28a84c8..8b03a540e151 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -786,6 +786,9 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) *size -= ncpy; cs->len -= ncpy; cs->offset += ncpy; + if (cs->is_uring) + cs->ring.copied_sz += ncpy; + return ncpy; } @@ -1922,7 +1925,14 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, unsigned nbytes) { - unsigned reqsize = sizeof(struct fuse_out_header); + + unsigned int reqsize = 0; + + /* + * Uring has all headers separated from args - args is payload only + */ + if (!cs->is_uring) + reqsize = sizeof(struct fuse_out_header); reqsize += fuse_len_args(args->out_numargs, args->out_args); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 21eb1bdb492d..4a8a4feb2df5 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -27,6 +27,10 @@ struct fuse_copy_state { unsigned int len; unsigned int offset; unsigned int move_pages:1; + unsigned int is_uring:1; + struct { + unsigned int copied_sz; /* copied size into the user buffer */ + } ring; }; static inline struct fuse_dev *fuse_get_dev(struct file *file) -- cgit v1.2.3 From 3821336530616776414aa7a879640052b89def28 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:02 +0100 Subject: fuse: {io-uring} Make hash-list req unique finding functions non-static fuse-over-io-uring uses existing functions to find requests based on their unique id - make these functions non-static. 
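For illustration (not taken from this patch), a fuse-io-uring commit handler could resolve the request for a reply much like fuse_dev_do_write() does; locking is simplified here and 'fpq' and 'unique' are assumed from the caller:

struct fuse_req *req = NULL;

spin_lock(&fpq->lock);
if (fpq->connected)
	req = fuse_request_find(fpq, unique & ~FUSE_INT_REQ_BIT);
spin_unlock(&fpq->lock);
if (!req)
	return -ENOENT;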
Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 6 +++--- fs/fuse/fuse_dev_i.h | 5 +++++ fs/fuse/fuse_i.h | 5 +++++ fs/fuse/inode.c | 2 +- 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8b03a540e151..aa33eba51c51 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -220,7 +220,7 @@ u64 fuse_get_unique(struct fuse_iqueue *fiq) } EXPORT_SYMBOL_GPL(fuse_get_unique); -static unsigned int fuse_req_hash(u64 unique) +unsigned int fuse_req_hash(u64 unique) { return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } @@ -1910,7 +1910,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, } /* Look up request on processing list by unique ID */ -static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) +struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique) { unsigned int hash = fuse_req_hash(unique); struct fuse_req *req; @@ -1994,7 +1994,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_lock(&fpq->lock); req = NULL; if (fpq->connected) - req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); + req = fuse_request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); err = -ENOENT; if (!req) { diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 4a8a4feb2df5..b64ab84cbc0d 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -14,6 +14,8 @@ struct fuse_arg; struct fuse_args; +struct fuse_pqueue; +struct fuse_req; struct fuse_copy_state { int write; @@ -42,6 +44,9 @@ static inline struct fuse_dev *fuse_get_dev(struct file *file) return READ_ONCE(file->private_data); } +unsigned int fuse_req_hash(u64 unique); +struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); + void fuse_dev_end_requests(struct list_head *head); void fuse_copy_init(struct fuse_copy_state *cs, int write, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bce8cc482d64..fb981002287d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1237,6 +1237,11 @@ void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); */ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); +/** + * Initialize the fuse processing queue + */ +void fuse_pqueue_init(struct fuse_pqueue *fpq); + /** * Initialize fuse_conn */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e4f9bbacfc1b..328797b9aac9 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -938,7 +938,7 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq, fiq->priv = priv; } -static void fuse_pqueue_init(struct fuse_pqueue *fpq) +void fuse_pqueue_init(struct fuse_pqueue *fpq) { unsigned int i; -- cgit v1.2.3 From 03511e936916873bf880e6678c98d5fb59c19742 Mon Sep 17 00:00:00 2001 From: Jianan Huang Date: Fri, 24 Jan 2025 13:57:51 +0800 Subject: f2fs: fix inconsistent dirty state of atomic file When testing the atomic write fix patches, the f2fs_bug_on was triggered as below: ------------[ cut here ]------------ kernel BUG at fs/f2fs/inode.c:935! Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 3 UID: 0 PID: 257 Comm: bash Not tainted 6.13.0-rc1-00033-gc283a70d3497 #5 RIP: 0010:f2fs_evict_inode+0x50f/0x520 Call Trace: ? __die_body+0x65/0xb0 ? die+0x9f/0xc0 ? do_trap+0xa1/0x170 ? f2fs_evict_inode+0x50f/0x520 ? f2fs_evict_inode+0x50f/0x520 ? handle_invalid_op+0x65/0x80 ? f2fs_evict_inode+0x50f/0x520 ? exc_invalid_op+0x39/0x50 ? asm_exc_invalid_op+0x1a/0x20 ? __pfx_f2fs_get_dquots+0x10/0x10 ? 
f2fs_evict_inode+0x50f/0x520 ? f2fs_evict_inode+0x2e5/0x520 evict+0x186/0x2f0 prune_icache_sb+0x75/0xb0 super_cache_scan+0x1a8/0x200 do_shrink_slab+0x163/0x320 shrink_slab+0x2fc/0x470 drop_slab+0x82/0xf0 drop_caches_sysctl_handler+0x4e/0xb0 proc_sys_call_handler+0x183/0x280 vfs_write+0x36d/0x450 ksys_write+0x68/0xd0 do_syscall_64+0xc8/0x1a0 ? arch_exit_to_user_mode_prepare+0x11/0x60 ? irqentry_exit_to_user_mode+0x7e/0xa0 The root cause is: f2fs uses FI_ATOMIC_DIRTIED to indicate dirty atomic files during commit. If the inode is dirtied during commit, such as by f2fs_i_pino_write, the vfs inode keeps clean and the f2fs inode is set to FI_DIRTY_INODE. The FI_DIRTY_INODE flag cann't be cleared by write_inode later due to the clean vfs inode. Finally, f2fs_bug_on is triggered due to this inconsistent state when evict. To reproduce this situation: - fd = open("/mnt/test.db", O_WRONLY) - ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE) - mv /mnt/test.db /mnt/test1.db - ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE) - echo 3 > /proc/sys/vm/drop_caches To fix this problem, clear FI_DIRTY_INODE after commit, then f2fs_mark_inode_dirty_sync will ensure a consistent dirty state. Fixes: fccaa81de87e ("f2fs: prevent atomic file from being dirtied before commit") Signed-off-by: Yunlei He Signed-off-by: Jianan Huang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index dc1b47f9269a..c282e8a0a2ec 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -201,6 +201,12 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) clear_inode_flag(inode, FI_ATOMIC_FILE); if (is_inode_flag_set(inode, FI_ATOMIC_DIRTIED)) { clear_inode_flag(inode, FI_ATOMIC_DIRTIED); + /* + * The vfs inode keeps clean during commit, but the f2fs inode + * doesn't. So clear the dirty state after commit and let + * f2fs_mark_inode_dirty_sync ensure a consistent dirty state. + */ + f2fs_inode_synced(inode); f2fs_mark_inode_dirty_sync(inode, true); } stat_dec_atomic_inode(inode); -- cgit v1.2.3 From 49ff29240ebb13bc0efa4623d4379650e68fcb35 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Sun, 29 Dec 2024 23:37:49 +0000 Subject: squashfs: make squashfs_cache_init() return ERR_PTR(-ENOMEM) Patch series "squashfs: reduce memory usage and update docs". This patchset reduces the amount of memory that Squashfs uses when CONFIG_FILE_DIRECT is configured, and updates various out of date information in the documentation and Kconfig. This patch (of 4): Make squashfs_cache_init() return an ERR_PTR(-ENOMEM) on failure rather than NULL. This tidies up some calling code, but, it also allows NULL to be returned as a valid result when a cache hasn't be allocated. 
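To summarise the resulting calling convention (sketch, mirroring the diff below): a valid pointer means the cache was created, NULL means no cache was requested (entries == 0), and only ERR_PTR(-ENOMEM) signals failure, so callers test with IS_ERR() instead of comparing against NULL:

	msblk->block_cache = squashfs_cache_init("metadata",
			SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
	if (IS_ERR(msblk->block_cache)) {
		err = PTR_ERR(msblk->block_cache);
		goto failed_mount;
	}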
Link: https://lkml.kernel.org/r/20241229233752.54481-1-phillip@squashfs.org.uk Link: https://lkml.kernel.org/r/20241229233752.54481-2-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/cache.c | 10 +++++++--- fs/squashfs/super.c | 17 ++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 5062326d0efb..4db0d2b0aab8 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -224,11 +224,15 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, int block_size) { int i, j; - struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL); + struct squashfs_cache *cache; + if (entries == 0) + return NULL; + + cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) { ERROR("Failed to allocate %s cache\n", name); - return NULL; + return ERR_PTR(-ENOMEM); } cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL); @@ -281,7 +285,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, cleanup: squashfs_cache_delete(cache); - return NULL; + return ERR_PTR(-ENOMEM); } diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 269c6d61bc29..fedae8dbc5de 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -314,26 +314,29 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_RDONLY; sb->s_op = &squashfs_super_ops; - err = -ENOMEM; - msblk->block_cache = squashfs_cache_init("metadata", SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); - if (msblk->block_cache == NULL) + if (IS_ERR(msblk->block_cache)) { + err = PTR_ERR(msblk->block_cache); goto failed_mount; + } /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", msblk->max_thread_num, msblk->block_size); - if (msblk->read_page == NULL) { + if (IS_ERR(msblk->read_page)) { errorf(fc, "Failed to allocate read_page block"); + err = PTR_ERR(msblk->read_page); goto failed_mount; } if (msblk->devblksize == PAGE_SIZE) { struct inode *cache = new_inode(sb); - if (cache == NULL) + if (cache == NULL) { + err = -ENOMEM; goto failed_mount; + } set_nlink(cache, 1); cache->i_size = OFFSET_MAX; @@ -406,8 +409,8 @@ handle_fragments: msblk->fragment_cache = squashfs_cache_init("fragment", min(SQUASHFS_CACHED_FRAGMENTS, fragments), msblk->block_size); - if (msblk->fragment_cache == NULL) { - err = -ENOMEM; + if (IS_ERR(msblk->fragment_cache)) { + err = PTR_ERR(msblk->fragment_cache); goto failed_mount; } -- cgit v1.2.3 From f5cc08737507f227e5e7e2df52d26ed1f0a7e315 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Sun, 29 Dec 2024 23:37:50 +0000 Subject: squashfs: don't allocate read_page cache if SQUASHFS_FILE_DIRECT configured If Squashfs has been configured to directly read datablocks into the page cache (SQUASHFS_FILE_DIRECT), then the read_page cache is unnecessary. This improvement is due to the following two commits, which added the ability to read datablocks into the page cache when pages were missing, enabling the fallback which used an intermediate buffer to be removed. commit f268eedddf359 ("squashfs: extend "page actor" to handle missing pages") commit 1bb1a07afad97 ("squashfs: don't use intermediate buffer if pages missing") This reduces the amount of memory used when mounting a filesystem by block_size * maximum number of threads. 
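As a rough worked example (figures purely illustrative): with the default 128K block size and a max_thread_num of 16, the read_page cache used to pin 128 KiB * 16 = 2 MiB per mounted filesystem; with SQUASHFS_FILE_DIRECT configured that allocation is now skipped entirely.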
Link: https://lkml.kernel.org/r/20241229233752.54481-3-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/squashfs.h | 6 ++++++ fs/squashfs/super.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 5a756e6790b5..2c45b9b938e9 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -14,6 +14,12 @@ #define WARNING(s, args...) pr_warn("SQUASHFS: "s, ## args) +#ifdef CONFIG_SQUASHFS_FILE_CACHE +#define SQUASHFS_READ_PAGES msblk->max_thread_num +#else +#define SQUASHFS_READ_PAGES 0 +#endif + /* block.c */ extern int squashfs_read_data(struct super_block *, u64, int, u64 *, struct squashfs_page_actor *); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index fedae8dbc5de..67c55fe32ce8 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -323,7 +323,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", - msblk->max_thread_num, msblk->block_size); + SQUASHFS_READ_PAGES, msblk->block_size); if (IS_ERR(msblk->read_page)) { errorf(fc, "Failed to allocate read_page block"); err = PTR_ERR(msblk->read_page); -- cgit v1.2.3 From d7f4b23c6702e2afbdfe33fe77eedc1584761242 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Sun, 29 Dec 2024 23:37:52 +0000 Subject: squashfs: update Kconfig information Update the compression algorithms supported, and the Squashfs website location. Link: https://lkml.kernel.org/r/20241229233752.54481-5-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 60fc98bdf421..b1091e70434a 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -5,8 +5,8 @@ config SQUASHFS help Saying Y here includes support for SquashFS 4.0 (a Compressed Read-Only File System). Squashfs is a highly compressed read-only - filesystem for Linux. It uses zlib, lzo or xz compression to - compress both files, inodes and directories. Inodes in the system + filesystem for Linux. It uses zlib, lz4, lzo, xz or zstd compression + to compress both files, inodes and directories. Inodes in the system are very small and all blocks are packed to minimise data overhead. Block sizes greater than 4K are supported up to a maximum of 1 Mbytes (default block size 128K). SquashFS 4.0 supports 64 bit filesystems @@ -16,7 +16,7 @@ config SQUASHFS Squashfs is intended for general read-only filesystem use, for archival use (i.e. in cases where a .tar.gz file may be used), and in embedded systems where low overhead is needed. Further information - and tools are available from http://squashfs.sourceforge.net. + and tools are available from github.com/plougher/squashfs-tools. If you want to compile this as a module ( = code which can be inserted in and removed from the running kernel whenever you want), -- cgit v1.2.3 From 608c2501675cd37260312d35daa2fd6dc6d56e96 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 20 Dec 2024 22:46:24 +0000 Subject: squashfs: use a folio throughout squashfs_read_folio() Use modern folio APIs where they exist and convert back to struct page for the internal functions. 
Link: https://lkml.kernel.org/r/20241220224634.723899-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 21aaa96856c1..bc6598c3a48f 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -445,21 +445,19 @@ static int squashfs_readpage_sparse(struct page *page, int expected) static int squashfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int index = page->index >> (msblk->block_log - PAGE_SHIFT); + int index = folio->index >> (msblk->block_log - PAGE_SHIFT); int file_end = i_size_read(inode) >> msblk->block_log; int expected = index == file_end ? (i_size_read(inode) & (msblk->block_size - 1)) : msblk->block_size; int res = 0; - void *pageaddr; TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", - page->index, squashfs_i(inode)->start); + folio->index, squashfs_i(inode)->start); - if (page->index >= ((i_size_read(inode) + PAGE_SIZE - 1) >> + if (folio->index >= ((i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT)) goto out; @@ -472,23 +470,18 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) goto out; if (res == 0) - res = squashfs_readpage_sparse(page, expected); + res = squashfs_readpage_sparse(&folio->page, expected); else - res = squashfs_readpage_block(page, block, res, expected); + res = squashfs_readpage_block(&folio->page, block, res, expected); } else - res = squashfs_readpage_fragment(page, expected); + res = squashfs_readpage_fragment(&folio->page, expected); if (!res) return 0; out: - pageaddr = kmap_atomic(page); - memset(pageaddr, 0, PAGE_SIZE); - kunmap_atomic(pageaddr); - flush_dcache_page(page); - if (res == 0) - SetPageUptodate(page); - unlock_page(page); + folio_zero_segment(folio, 0, folio_size(folio)); + folio_end_read(folio, res == 0); return res; } -- cgit v1.2.3 From 8c1565fcf6392d2f962b7cbc9fde43bfcd175c2f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 20 Dec 2024 22:46:25 +0000 Subject: squashfs: pass a folio to squashfs_readpage_fragment() Remove an access to page->mapping. 
Link: https://lkml.kernel.org/r/20241220224634.723899-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index bc6598c3a48f..6bd16e12493b 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -417,9 +417,9 @@ skip_page: } /* Read datablock stored packed inside a fragment (tail-end packed block) */ -static int squashfs_readpage_fragment(struct page *page, int expected) +static int squashfs_readpage_fragment(struct folio *folio, int expected) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, squashfs_i(inode)->fragment_block, squashfs_i(inode)->fragment_size); @@ -430,7 +430,7 @@ static int squashfs_readpage_fragment(struct page *page, int expected) squashfs_i(inode)->fragment_block, squashfs_i(inode)->fragment_size); else - squashfs_copy_cache(page, buffer, expected, + squashfs_copy_cache(&folio->page, buffer, expected, squashfs_i(inode)->fragment_offset); squashfs_cache_put(buffer); @@ -474,7 +474,7 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) else res = squashfs_readpage_block(&folio->page, block, res, expected); } else - res = squashfs_readpage_fragment(&folio->page, expected); + res = squashfs_readpage_fragment(folio, expected); if (!res) return 0; -- cgit v1.2.3 From 2a7aea59cf4dd4a070c4550fddaffc5a73312cad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 20 Dec 2024 22:46:26 +0000 Subject: squashfs: convert squashfs_readpage_block() to take a folio Remove a few accesses to page->mapping. 
Link: https://lkml.kernel.org/r/20241220224634.723899-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 2 +- fs/squashfs/file_cache.c | 6 +++--- fs/squashfs/file_direct.c | 11 +++++------ fs/squashfs/squashfs.h | 2 +- 4 files changed, 10 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 6bd16e12493b..5b81e26b1226 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -472,7 +472,7 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) if (res == 0) res = squashfs_readpage_sparse(&folio->page, expected); else - res = squashfs_readpage_block(&folio->page, block, res, expected); + res = squashfs_readpage_block(folio, block, res, expected); } else res = squashfs_readpage_fragment(folio, expected); diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c index 54c17b7c85fd..0360d22a77d4 100644 --- a/fs/squashfs/file_cache.c +++ b/fs/squashfs/file_cache.c @@ -18,9 +18,9 @@ #include "squashfs.h" /* Read separately compressed datablock and memcopy into page cache */ -int squashfs_readpage_block(struct page *page, u64 block, int bsize, int expected) +int squashfs_readpage_block(struct folio *folio, u64 block, int bsize, int expected) { - struct inode *i = page->mapping->host; + struct inode *i = folio->mapping->host; struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, block, bsize); int res = buffer->error; @@ -29,7 +29,7 @@ int squashfs_readpage_block(struct page *page, u64 block, int bsize, int expecte ERROR("Unable to read page, block %llx, size %x\n", block, bsize); else - squashfs_copy_cache(page, buffer, expected, 0); + squashfs_copy_cache(&folio->page, buffer, expected, 0); squashfs_cache_put(buffer); return res; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index d19d4db74af8..2c3e809d6891 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -19,12 +19,11 @@ #include "page_actor.h" /* Read separately compressed datablock directly into page cache */ -int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, - int expected) - +int squashfs_readpage_block(struct folio *folio, u64 block, int bsize, + int expected) { - struct folio *folio = page_folio(target_page); - struct inode *inode = target_page->mapping->host; + struct page *target_page = &folio->page; + struct inode *inode = folio->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; loff_t file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT; int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; @@ -48,7 +47,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, /* Try to grab all the pages covered by the Squashfs block */ for (i = 0, index = start_index; index <= end_index; index++) { page[i] = (index == folio->index) ? 
target_page : - grab_cache_page_nowait(target_page->mapping, index); + grab_cache_page_nowait(folio->mapping, index); if (page[i] == NULL) continue; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 2c45b9b938e9..9922a9460ce6 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -78,7 +78,7 @@ void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int, int); /* file_xxx.c */ -extern int squashfs_readpage_block(struct page *, u64, int, int); +int squashfs_readpage_block(struct folio *, u64 block, int bsize, int expected); /* id.c */ extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); -- cgit v1.2.3 From 5641371fd0b3703d11809a0ae249aed270cb8add Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 20 Dec 2024 22:46:27 +0000 Subject: squashfs; convert squashfs_copy_cache() to take a folio Remove accesses to page->index and page->mapping. Also use folio APIs where available. This code still assumes order 0 folios. [dan.carpenter@linaro.org: fix a NULL vs IS_ERR() bug] Link: https://lkml.kernel.org/r/7b7f44d6-9153-4d7c-b65b-2d78febe6c7a@stanley.mountain Link: https://lkml.kernel.org/r/20241220224634.723899-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Dan Carpenter Cc: Phillip Lougher Cc: Dan Carpenter Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 46 +++++++++++++++++++++++++--------------------- fs/squashfs/file_cache.c | 2 +- fs/squashfs/squashfs.h | 4 ++-- 3 files changed, 28 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 5b81e26b1226..74076c4823c3 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -378,13 +378,15 @@ void squashfs_fill_page(struct page *page, struct squashfs_cache_entry *buffer, } /* Copy data into page cache */ -void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer, - int bytes, int offset) +void squashfs_copy_cache(struct folio *folio, + struct squashfs_cache_entry *buffer, size_t bytes, + size_t offset) { - struct inode *inode = page->mapping->host; + struct address_space *mapping = folio->mapping; + struct inode *inode = mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; int i, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; - int start_index = page->index & ~mask, end_index = start_index | mask; + int start_index = folio->index & ~mask, end_index = start_index | mask; /* * Loop copying datablock into pages. As the datablock likely covers @@ -394,25 +396,27 @@ void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer, */ for (i = start_index; i <= end_index && bytes > 0; i++, bytes -= PAGE_SIZE, offset += PAGE_SIZE) { - struct page *push_page; - int avail = buffer ? min_t(int, bytes, PAGE_SIZE) : 0; + struct folio *push_folio; + size_t avail = buffer ? min(bytes, PAGE_SIZE) : 0; - TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail); + TRACE("bytes %zu, i %d, available_bytes %zu\n", bytes, i, avail); - push_page = (i == page->index) ? page : - grab_cache_page_nowait(page->mapping, i); + push_folio = (i == folio->index) ? 
folio : + __filemap_get_folio(mapping, i, + FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, + mapping_gfp_mask(mapping)); - if (!push_page) + if (IS_ERR(push_folio)) continue; - if (PageUptodate(push_page)) - goto skip_page; + if (folio_test_uptodate(push_folio)) + goto skip_folio; - squashfs_fill_page(push_page, buffer, offset, avail); -skip_page: - unlock_page(push_page); - if (i != page->index) - put_page(push_page); + squashfs_fill_page(&push_folio->page, buffer, offset, avail); +skip_folio: + folio_unlock(push_folio); + if (i != folio->index) + folio_put(push_folio); } } @@ -430,16 +434,16 @@ static int squashfs_readpage_fragment(struct folio *folio, int expected) squashfs_i(inode)->fragment_block, squashfs_i(inode)->fragment_size); else - squashfs_copy_cache(&folio->page, buffer, expected, + squashfs_copy_cache(folio, buffer, expected, squashfs_i(inode)->fragment_offset); squashfs_cache_put(buffer); return res; } -static int squashfs_readpage_sparse(struct page *page, int expected) +static int squashfs_readpage_sparse(struct folio *folio, int expected) { - squashfs_copy_cache(page, NULL, expected, 0); + squashfs_copy_cache(folio, NULL, expected, 0); return 0; } @@ -470,7 +474,7 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) goto out; if (res == 0) - res = squashfs_readpage_sparse(&folio->page, expected); + res = squashfs_readpage_sparse(folio, expected); else res = squashfs_readpage_block(folio, block, res, expected); } else diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c index 0360d22a77d4..40e59a43d098 100644 --- a/fs/squashfs/file_cache.c +++ b/fs/squashfs/file_cache.c @@ -29,7 +29,7 @@ int squashfs_readpage_block(struct folio *folio, u64 block, int bsize, int expec ERROR("Unable to read page, block %llx, size %x\n", block, bsize); else - squashfs_copy_cache(&folio->page, buffer, expected, 0); + squashfs_copy_cache(folio, buffer, expected, 0); squashfs_cache_put(buffer); return res; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 9922a9460ce6..8c6fbef022f4 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -74,8 +74,8 @@ extern __le64 *squashfs_read_fragment_index_table(struct super_block *, /* file.c */ void squashfs_fill_page(struct page *, struct squashfs_cache_entry *, int, int); -void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int, - int); +void squashfs_copy_cache(struct folio *, struct squashfs_cache_entry *, + size_t bytes, size_t offset); /* file_xxx.c */ int squashfs_readpage_block(struct folio *, u64 block, int bsize, int expected); -- cgit v1.2.3 From 5748be3e9ef4659f9adf7e1ad454fafb0868cd88 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 20 Dec 2024 22:46:28 +0000 Subject: squashfs: convert squashfs_fill_page() to take a folio squashfs_fill_page is only used in this file, so make it static. Use kmap_local instead of kmap_atomic, and return a bool so that the caller can use folio_end_read() which saves an atomic operation over calling folio_mark_uptodate() followed by folio_unlock(). 
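Illustrative before/after of the unlock path (not quoted verbatim from the patch):

	/* before: mark uptodate and unlock as two separate flag updates */
	if (copied == avail)
		folio_mark_uptodate(folio);
	folio_unlock(folio);

	/* after: one call, one atomic update of the folio flags */
	folio_end_read(folio, copied == avail);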
[willy@infradead.org: fix polarity of "uptodate" Thanks to Ryan for testing] Link: https://lkml.kernel.org/r/20250110163300.3346321-2-willy@infradead.org Link: https://lkml.kernel.org/r/20241220224634.723899-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Ryan Roberts Cc: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 21 ++++++++++++--------- fs/squashfs/squashfs.h | 1 - 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 74076c4823c3..5ca2baa16dc2 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -362,19 +362,21 @@ static int read_blocklist(struct inode *inode, int index, u64 *block) return squashfs_block_size(size); } -void squashfs_fill_page(struct page *page, struct squashfs_cache_entry *buffer, int offset, int avail) +static bool squashfs_fill_page(struct folio *folio, + struct squashfs_cache_entry *buffer, size_t offset, + size_t avail) { - int copied; + size_t copied; void *pageaddr; - pageaddr = kmap_atomic(page); + pageaddr = kmap_local_folio(folio, 0); copied = squashfs_copy_data(pageaddr, buffer, offset, avail); memset(pageaddr + copied, 0, PAGE_SIZE - copied); - kunmap_atomic(pageaddr); + kunmap_local(pageaddr); - flush_dcache_page(page); - if (copied == avail) - SetPageUptodate(page); + flush_dcache_folio(folio); + + return copied == avail; } /* Copy data into page cache */ @@ -398,6 +400,7 @@ void squashfs_copy_cache(struct folio *folio, bytes -= PAGE_SIZE, offset += PAGE_SIZE) { struct folio *push_folio; size_t avail = buffer ? min(bytes, PAGE_SIZE) : 0; + bool updated = false; TRACE("bytes %zu, i %d, available_bytes %zu\n", bytes, i, avail); @@ -412,9 +415,9 @@ void squashfs_copy_cache(struct folio *folio, if (folio_test_uptodate(push_folio)) goto skip_folio; - squashfs_fill_page(&push_folio->page, buffer, offset, avail); + updated = squashfs_fill_page(push_folio, buffer, offset, avail); skip_folio: - folio_unlock(push_folio); + folio_end_read(push_folio, updated); if (i != folio->index) folio_put(push_folio); } diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 8c6fbef022f4..218868b20f16 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -73,7 +73,6 @@ extern __le64 *squashfs_read_fragment_index_table(struct super_block *, u64, u64, unsigned int); /* file.c */ -void squashfs_fill_page(struct page *, struct squashfs_cache_entry *, int, int); void squashfs_copy_cache(struct folio *, struct squashfs_cache_entry *, size_t bytes, size_t offset); -- cgit v1.2.3 From 34bb50c42335b9316cbdb9506c2c4126a66a52ee Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Mon, 6 Jan 2025 10:34:31 +0800 Subject: ocfs2: check el->l_next_free_rec in ocfs2_get_clusters_nocache Recently syzbot reported a use-after-free issue[1]. The root cause of the problem is that the journal inode recorded in this file system image is corrupted. The value of "di->id2.i_list.l_next_free_rec" is 8193, which is greater than the value of "di->id2.i_list.l_count" (19). To solve this problem, an additional check should be added within ocfs2_get_clusters_nocache(). If the check fails, an error will be returned and the file system will be set to read-only. 
[1]: https://lore.kernel.org/all/67577778.050a0220.a30f1.01bc.GAE@google.com/T/ Link: https://lkml.kernel.org/r/20250106023432.1320904-1-sunjunchao2870@gmail.com Signed-off-by: Julian Sun Reported-by: syzbot+2313dda4dc4885c93578@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2313dda4dc4885c93578 Tested-by: syzbot+2313dda4dc4885c93578@syzkaller.appspotmail.com Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/extent_map.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index f7672472fa82..930150ed5db1 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -435,6 +435,16 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, } } + if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { + ocfs2_error(inode->i_sb, + "Inode %lu has an invalid extent (next_free_rec %u, count %u)\n", + inode->i_ino, + le16_to_cpu(el->l_next_free_rec), + le16_to_cpu(el->l_count)); + ret = -EROFS; + goto out; + } + i = ocfs2_search_extent_list(el, v_cluster); if (i == -1) { /* -- cgit v1.2.3 From fdbb6cd96ed51da002bd9a5a142586a74a436784 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Mon, 6 Jan 2025 10:34:32 +0800 Subject: ocfs2: correct l_next_free_rec in online check Correct the value of l_next_free_rec to l_count during the online check, as done in the check_el() function in ocfs2_tools. Link: https://lkml.kernel.org/r/20250106023432.1320904-2-sunjunchao2870@gmail.com Signed-off-by: Julian Sun Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/inode.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index cd3173062ae3..12e5d1f73325 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -200,6 +200,20 @@ bail: return inode; } +static int ocfs2_dinode_has_extents(struct ocfs2_dinode *di) +{ + /* inodes flagged with other stuff in id2 */ + if (di->i_flags & (OCFS2_SUPER_BLOCK_FL | OCFS2_LOCAL_ALLOC_FL | + OCFS2_CHAIN_FL | OCFS2_DEALLOC_FL)) + return 0; + /* i_flags doesn't indicate when id2 is a fast symlink */ + if (S_ISLNK(di->i_mode) && di->i_size && di->i_clusters == 0) + return 0; + if (di->i_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + return 1; +} /* * here's how inodes get read from disk: @@ -1547,6 +1561,16 @@ static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, le32_to_cpu(di->i_fs_generation)); } + if (ocfs2_dinode_has_extents(di) && + le16_to_cpu(di->id2.i_list.l_next_free_rec) > le16_to_cpu(di->id2.i_list.l_count)) { + di->id2.i_list.l_next_free_rec = di->id2.i_list.l_count; + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: l_next_free_rec to %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(di->id2.i_list.l_next_free_rec)); + } + if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) { ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check); mark_buffer_dirty(bh); -- cgit v1.2.3 From 01676ecd0b5c897ebb420c5a09a658e3514cc6d8 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Wed, 8 Jan 2025 10:41:19 +0800 Subject: ocfs2: check tl->count of truncate log inode in ocfs2_get_truncate_log_info syz reported: (syz-executor404,5313,0):ocfs2_truncate_log_append:5874 ERROR: bug expression: tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || tl_count == 0 
(syz-executor404,5313,0):ocfs2_truncate_log_append:5874 ERROR: Truncate record count on #77 invalid wanted 39, actual 2087 ------------[ cut here ]------------ kernel BUG at fs/ocfs2/alloc.c:5874! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 0 UID: 0 PID: 5313 Comm: syz-executor404 Not tainted 6.12.0-rc5-syzkaller-00299-g11066801dd4b #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:ocfs2_truncate_log_append+0x9a8/0x9c0 fs/ocfs2/alloc.c:5868 RSP: 0018:ffffc9000cf16f40 EFLAGS: 00010292 RAX: b4b54f1d10640800 RBX: 0000000000000027 RCX: b4b54f1d10640800 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffffc9000cf17070 R08: ffffffff8174a14c R09: 1ffff11003f8519a R10: dffffc0000000000 R11: ffffed1003f8519b R12: 1ffff110085f5f58 R13: ffffff3800000000 R14: 000000000000004d R15: ffff8880438f0008 FS: 00005555722df380(0000) GS:ffff88801fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000002000f000 CR3: 000000004010e000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ocfs2_remove_btree_range+0x1303/0x1860 fs/ocfs2/alloc.c:5789 ocfs2_remove_inode_range+0xff3/0x29f0 fs/ocfs2/file.c:1907 ocfs2_reflink_remap_extent fs/ocfs2/refcounttree.c:4537 [inline] ocfs2_reflink_remap_blocks+0xcd4/0x1f30 fs/ocfs2/refcounttree.c:4684 ocfs2_remap_file_range+0x5fa/0x8d0 fs/ocfs2/file.c:2736 vfs_copy_file_range+0xc07/0x1510 fs/read_write.c:1615 __do_sys_copy_file_range fs/read_write.c:1705 [inline] __se_sys_copy_file_range+0x3f2/0x5d0 fs/read_write.c:1668 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fd327167af9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 61 17 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffe6b8e22e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000146 RAX: ffffffffffffffda RBX: 00007fd3271b005e RCX: 00007fd327167af9 RDX: 0000000000000006 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 00007fd3271de610 R08: 000000000000d8c2 R09: 0000000000000000 R10: 0000000020000640 R11: 0000000000000246 R12: 0000000000000001 R13: 00007ffe6b8e24b8 R14: 0000000000000001 R15: 0000000000000001 The fuzz image has a truncate log inode whose tl_count is bigger than ocfs2_truncate_recs_per_inode() so it triggers the BUG in ocfs2_truncate_log_append(). As what the check in ocfs2_truncate_log_append() does, just do same check into ocfs2_get_truncate_log_info when truncate log inode is reading in so we can bail out earlier. 
Link: https://lkml.kernel.org/r/20250108024119.60313-1-glass.su@suse.com Signed-off-by: Su Yue Reported-by: Liebes Wang Link: https://lore.kernel.org/ocfs2-devel/CADCV8souQhdP0RdQF1U7KTWtuHDfpn+3LnTt-EEuMmB-pMRrgQ@mail.gmail.com/T/#u Reported-by: syzbot+a66542ca5ebb4233b563@syzkaller.appspotmail.com Tested-by: syzbot+a66542ca5ebb4233b563@syzkaller.appspotmail.com Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 5cf698785fae..4414743b638e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6154,6 +6154,9 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, int status; struct inode *inode = NULL; struct buffer_head *bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + unsigned int tl_count; inode = ocfs2_get_system_file_inode(osb, TRUNCATE_LOG_SYSTEM_INODE, @@ -6171,6 +6174,18 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, goto bail; } + di = (struct ocfs2_dinode *)bh->b_data; + tl = &di->id2.i_dealloc; + tl_count = le16_to_cpu(tl->tl_count); + if (unlikely(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || + tl_count == 0)) { + status = -EFSCORRUPTED; + iput(inode); + brelse(bh); + mlog_errno(status); + goto bail; + } + *tl_inode = inode; *tl_bh = bh; bail: -- cgit v1.2.3 From 276c61385f6bc3223a5ecd307cf4aba2dfbb9a31 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Mon, 6 Jan 2025 22:06:53 +0800 Subject: ocfs2: mark dquot as inactive if failed to start trans while releasing dquot While running fstests generic/329, the kernel workqueue quota_release_workfn is dead looping in calling ocfs2_release_dquot(). The ocfs2 state is already readonly but ocfs2_release_dquot wants to start a transaction but fails and returns. ===================================================================== [ 2918.123602 ][ T275 ] On-disk corruption discovered. Please run fsck.ocfs2 once the filesystem is unmounted. [ 2918.124034 ][ T275 ] (kworker/u135:1,275,11):ocfs2_release_dquot:765 ERROR: status = -30 [ 2918.124452 ][ T275 ] (kworker/u135:1,275,11):ocfs2_release_dquot:795 ERROR: status = -30 [ 2918.124883 ][ T275 ] (kworker/u135:1,275,11):ocfs2_start_trans:357 ERROR: status = -30 [ 2918.125276 ][ T275 ] OCFS2: abort (device dm-0): ocfs2_start_trans: Detected aborted journal [ 2918.125710 ][ T275 ] On-disk corruption discovered. Please run fsck.ocfs2 once the filesystem is unmounted. ===================================================================== ocfs2_release_dquot() is much like dquot_release(), which is called by ext4 to handle similar situation. So here fix it by marking the dquot as inactive like what dquot_release() does. 
Link: https://lkml.kernel.org/r/20250106140653.92292-1-glass.su@suse.com Fixes: 9e33d69f553a ("ocfs2: Implementation of local and global quota file handling") Signed-off-by: Su Yue Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/quota_global.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 3404e7a30c33..15d9acd456ec 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -761,6 +761,11 @@ static int ocfs2_release_dquot(struct dquot *dquot) handle = ocfs2_start_trans(osb, ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_id.type)); if (IS_ERR(handle)) { + /* + * Mark dquot as inactive to avoid endless cycle in + * quota_release_workfn(). + */ + clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); status = PTR_ERR(handle); mlog_errno(status); goto out_ilock; -- cgit v1.2.3 From 840265f7b2795ec12a04ef948f52d12cd96f42c3 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Mon, 6 Jan 2025 22:06:34 +0800 Subject: ocfs2: remove parameter parent_fe_bh from __ocfs2_mknod_locked The parameter is not used in __ocfs2_mknod_locked(). So remove it. No functional change. Link: https://lkml.kernel.org/r/20250106140634.92241-1-glass.su@suse.com Signed-off-by: Su Yue Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 5550f8afa438..0ec63a1a94b8 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -508,7 +508,6 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct inode *inode, dev_t dev, struct buffer_head **new_fe_bh, - struct buffer_head *parent_fe_bh, handle_t *handle, struct ocfs2_alloc_context *inode_ac, u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit) @@ -641,8 +640,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, } return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, - parent_fe_bh, handle, inode_ac, - fe_blkno, suballoc_loc, suballoc_bit); + handle, inode_ac, fe_blkno, + suballoc_loc, suballoc_bit); } static int ocfs2_mkdir(struct mnt_idmap *idmap, @@ -2576,7 +2575,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, clear_nlink(inode); /* do the real work now. */ status = __ocfs2_mknod_locked(dir, inode, - 0, &new_di_bh, parent_di_bh, handle, + 0, &new_di_bh, handle, inode_ac, di_blkno, suballoc_loc, suballoc_bit); if (status < 0) { -- cgit v1.2.3 From ca76bb226bf47ff04c782cacbd299f12ddee1ec1 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 8 Jan 2025 05:00:46 +0900 Subject: nilfs2: do not force clear folio if buffer is referenced Patch series "nilfs2: protect busy buffer heads from being force-cleared". This series fixes the buffer head state inconsistency issues reported by syzbot that occurs when the filesystem is corrupted and falls back to read-only, and the associated buffer head use-after-free issue. This patch (of 2): Syzbot has reported that after nilfs2 detects filesystem corruption and falls back to read-only, inconsistencies in the buffer state may occur. One of the inconsistencies is that when nilfs2 calls mark_buffer_dirty() to set a data or metadata buffer as dirty, but it detects that the buffer is not in the uptodate state: WARNING: CPU: 0 PID: 6049 at fs/buffer.c:1177 mark_buffer_dirty+0x2e5/0x520 fs/buffer.c:1177 ... 
Call Trace: nilfs_palloc_commit_alloc_entry+0x4b/0x160 fs/nilfs2/alloc.c:598 nilfs_ifile_create_inode+0x1dd/0x3a0 fs/nilfs2/ifile.c:73 nilfs_new_inode+0x254/0x830 fs/nilfs2/inode.c:344 nilfs_mkdir+0x10d/0x340 fs/nilfs2/namei.c:218 vfs_mkdir+0x2f9/0x4f0 fs/namei.c:4257 do_mkdirat+0x264/0x3a0 fs/namei.c:4280 __do_sys_mkdirat fs/namei.c:4295 [inline] __se_sys_mkdirat fs/namei.c:4293 [inline] __x64_sys_mkdirat+0x87/0xa0 fs/namei.c:4293 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f The other is when nilfs_btree_propagate(), which propagates the dirty state to the ancestor nodes of a b-tree that point to a dirty buffer, detects that the origin buffer is not dirty, even though it should be: WARNING: CPU: 0 PID: 5245 at fs/nilfs2/btree.c:2089 nilfs_btree_propagate+0xc79/0xdf0 fs/nilfs2/btree.c:2089 ... Call Trace: nilfs_bmap_propagate+0x75/0x120 fs/nilfs2/bmap.c:345 nilfs_collect_file_data+0x4d/0xd0 fs/nilfs2/segment.c:587 nilfs_segctor_apply_buffers+0x184/0x340 fs/nilfs2/segment.c:1006 nilfs_segctor_scan_file+0x28c/0xa50 fs/nilfs2/segment.c:1045 nilfs_segctor_collect_blocks fs/nilfs2/segment.c:1216 [inline] nilfs_segctor_collect fs/nilfs2/segment.c:1540 [inline] nilfs_segctor_do_construct+0x1c28/0x6b90 fs/nilfs2/segment.c:2115 nilfs_segctor_construct+0x181/0x6b0 fs/nilfs2/segment.c:2479 nilfs_segctor_thread_construct fs/nilfs2/segment.c:2587 [inline] nilfs_segctor_thread+0x69e/0xe80 fs/nilfs2/segment.c:2701 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Both of these issues are caused by the callbacks that handle the page/folio write requests, forcibly clear various states, including the working state of the buffers they hold, at unexpected times when they detect read-only fallback. Fix these issues by checking if the buffer is referenced before clearing the page/folio state, and skipping the clear if it is. Link: https://lkml.kernel.org/r/20250107200202.6432-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20250107200202.6432-2-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+b2b14916b77acf8626d7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b2b14916b77acf8626d7 Reported-by: syzbot+d98fd19acd08b36ff422@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=d98fd19acd08b36ff422 Fixes: 8c26c4e2694a ("nilfs2: fix issue with flush kernel thread after remount in RO mode because of driver's internal error or metadata corruption") Tested-by: syzbot+b2b14916b77acf8626d7@syzkaller.appspotmail.com Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 9de2a494a069..899686d2e5f7 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -392,6 +392,11 @@ void nilfs_clear_dirty_pages(struct address_space *mapping) /** * nilfs_clear_folio_dirty - discard dirty folio * @folio: dirty folio that will be discarded + * + * nilfs_clear_folio_dirty() clears working states including dirty state for + * the folio and its buffers. If the folio has buffers, clear only if it is + * confirmed that none of the buffer heads are busy (none have valid + * references and none are locked). 
*/ void nilfs_clear_folio_dirty(struct folio *folio) { @@ -399,10 +404,6 @@ void nilfs_clear_folio_dirty(struct folio *folio) BUG_ON(!folio_test_locked(folio)); - folio_clear_uptodate(folio); - folio_clear_mappedtodisk(folio); - folio_clear_checked(folio); - head = folio_buffers(folio); if (head) { const unsigned long clear_bits = @@ -410,6 +411,25 @@ void nilfs_clear_folio_dirty(struct folio *folio) BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected) | BIT(BH_Delay)); + bool busy, invalidated = false; + +recheck_buffers: + busy = false; + bh = head; + do { + if (atomic_read(&bh->b_count) | buffer_locked(bh)) { + busy = true; + break; + } + } while (bh = bh->b_this_page, bh != head); + + if (busy) { + if (invalidated) + return; + invalidate_bh_lrus(); + invalidated = true; + goto recheck_buffers; + } bh = head; do { @@ -419,6 +439,9 @@ void nilfs_clear_folio_dirty(struct folio *folio) } while (bh = bh->b_this_page, bh != head); } + folio_clear_uptodate(folio); + folio_clear_mappedtodisk(folio); + folio_clear_checked(folio); __nilfs_clear_folio_dirty(folio); } -- cgit v1.2.3 From 367a9bffabe08c04f6d725032cce3d891b2b9e1a Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 8 Jan 2025 05:00:47 +0900 Subject: nilfs2: protect access to buffers with no active references nilfs_lookup_dirty_data_buffers(), which iterates through the buffers attached to dirty data folios/pages, accesses the attached buffers without locking the folios/pages. For data cache, nilfs_clear_folio_dirty() may be called asynchronously when the file system degenerates to read only, so nilfs_lookup_dirty_data_buffers() still has the potential to cause use after free issues when buffers lose the protection of their dirty state midway due to this asynchronous clearing and are unintentionally freed by try_to_free_buffers(). Eliminate this race issue by adjusting the lock section in this function. Link: https://lkml.kernel.org/r/20250107200202.6432-3-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Fixes: 8c26c4e2694a ("nilfs2: fix issue with flush kernel thread after remount in RO mode because of driver's internal error or metadata corruption") Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 587251830897..58a598b548fa 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -734,7 +734,6 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, if (!head) head = create_empty_buffers(folio, i_blocksize(inode), 0); - folio_unlock(folio); bh = head; do { @@ -744,11 +743,14 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, list_add_tail(&bh->b_assoc_buffers, listp); ndirties++; if (unlikely(ndirties >= nlimit)) { + folio_unlock(folio); folio_batch_release(&fbatch); cond_resched(); return ndirties; } } while (bh = bh->b_this_page, bh != head); + + folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); -- cgit v1.2.3 From 17c46a45cdb94c500f4e93b176cdd61931b03020 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 10 Jan 2025 10:01:44 +0900 Subject: nilfs2: correct return value kernel-doc descriptions for ioctl functions Patch series "nilfs2: fix kernel-doc comments for function return values", v2. 
This series fixes the inadequacies in the return value descriptions in nilfs2's kernel-doc comments (mainly incorrect formatting), as well as the lack of return value descriptions themselves, and fixes most of the remaining warnings that are output when the kernel-doc script is run with the "-Wall" option. This patch (of 7): In the kernel-doc comments for functions, there are many cases where the format of the return value description is inaccurate, such as "Return Value: ...", which causes many warnings to be output when the kernel-doc script is executed with the "-Wall" option. This fixes such incorrectly formatted return value descriptions for ioctl functions. Link: https://lkml.kernel.org/r/20250110010530.21872-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20250110010530.21872-2-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Cc: "Brian G ." Signed-off-by: Andrew Morton --- fs/nilfs2/ioctl.c | 233 ++++++++++++++++++++++-------------------------------- 1 file changed, 96 insertions(+), 137 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index fa77f78df681..e877c97974a4 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -33,17 +33,14 @@ * @dofunc: concrete function of get/set metadata info * * Description: nilfs_ioctl_wrap_copy() gets/sets metadata info by means of - * calling dofunc() function on the basis of @argv argument. - * - * Return Value: On success, 0 is returned and requested metadata info - * is copied into userspace. On error, one of the following - * negative error codes is returned. - * - * %-EINVAL - Invalid arguments from userspace. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EFAULT - Failure during execution of requested operation. + * calling dofunc() function on the basis of @argv argument. If successful, + * the requested metadata information is copied to userspace memory. + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EFAULT - Failure during execution of requested operation. + * * %-EINVAL - Invalid arguments from userspace. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, struct nilfs_argv *argv, int dir, @@ -190,13 +187,10 @@ static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp) * given checkpoint between checkpoint and snapshot state. This ioctl * is used in chcp and mkcp utilities. * - * Return Value: On success, 0 is returned and mode of a checkpoint is - * changed. On error, one of the following negative error codes - * is returned. - * - * %-EPERM - Operation not permitted. - * - * %-EFAULT - Failure during checkpoint mode changing. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * %-EFAULT - Failure during checkpoint mode changing. + * %-EPERM - Operation not permitted. */ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -244,13 +238,10 @@ out: * checkpoint from NILFS2 file system. This ioctl is used in rmcp * utility. * - * Return Value: On success, 0 is returned and a checkpoint is - * removed. On error, one of the following negative error codes - * is returned. - * - * %-EPERM - Operation not permitted. - * - * %-EFAULT - Failure during checkpoint removing. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * %-EFAULT - Failure during checkpoint removing. + * %-EPERM - Operation not permitted. 
*/ static int nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, @@ -296,7 +287,7 @@ out: * requested checkpoints. The NILFS_IOCTL_GET_CPINFO ioctl is used in * lscp utility and by nilfs_cleanerd daemon. * - * Return value: count of nilfs_cpinfo structures in output buffer. + * Return: Count of nilfs_cpinfo structures in output buffer. */ static ssize_t nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, @@ -320,17 +311,14 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, * * Description: nilfs_ioctl_get_cpstat() returns information about checkpoints. * The NILFS_IOCTL_GET_CPSTAT ioctl is used by lscp, rmcp utilities - * and by nilfs_cleanerd daemon. + * and by nilfs_cleanerd daemon. The checkpoint statistics are copied to + * the userspace memory pointed to by @argp. * - * Return Value: On success, 0 is returned, and checkpoints information is - * copied into userspace pointer @argp. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EFAULT - Failure during getting checkpoints statistics. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EFAULT - Failure during getting checkpoints statistics. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -363,7 +351,8 @@ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, * info about requested segments. The NILFS_IOCTL_GET_SUINFO ioctl is used * in lssu, nilfs_resize utilities and by nilfs_cleanerd daemon. * - * Return value: count of nilfs_suinfo structures in output buffer. + * Return: Count of nilfs_suinfo structures in output buffer on success, + * or a negative error code on failure. */ static ssize_t nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, @@ -387,17 +376,14 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, * * Description: nilfs_ioctl_get_sustat() returns segment usage statistics. * The NILFS_IOCTL_GET_SUSTAT ioctl is used in lssu, nilfs_resize utilities - * and by nilfs_cleanerd daemon. - * - * Return Value: On success, 0 is returned, and segment usage information is - * copied into userspace pointer @argp. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * and by nilfs_cleanerd daemon. The requested segment usage information is + * copied to the userspace memory pointed to by @argp. * - * %-EFAULT - Failure during getting segment usage statistics. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EFAULT - Failure during getting segment usage statistics. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -430,7 +416,8 @@ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, * on virtual block addresses. The NILFS_IOCTL_GET_VINFO ioctl is used * by nilfs_cleanerd daemon. * - * Return value: count of nilfs_vinfo structures in output buffer. + * Return: Count of nilfs_vinfo structures in output buffer on success, or + * a negative error code on failure. 
*/ static ssize_t nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, @@ -457,7 +444,8 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl * is used by nilfs_cleanerd daemon. * - * Return value: count of nilfs_bdescs structures in output buffer. + * Return: Count of nilfs_bdescs structures in output buffer on success, or + * a negative error code on failure. */ static ssize_t nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, @@ -494,19 +482,15 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, * * Description: nilfs_ioctl_do_get_bdescs() function returns information * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl - * is used by nilfs_cleanerd daemon. - * - * Return Value: On success, 0 is returned, and disk block descriptors are - * copied into userspace pointer @argp. On error, one of the following - * negative error codes is returned. - * - * %-EINVAL - Invalid arguments from userspace. + * is used by nilfs_cleanerd daemon. If successful, disk block descriptors + * are copied to userspace pointer @argp. * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EFAULT - Failure during getting disk block descriptors. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EFAULT - Failure during getting disk block descriptors. + * * %-EINVAL - Invalid arguments from userspace. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -540,16 +524,12 @@ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, * Description: nilfs_ioctl_move_inode_block() function registers data/node * buffer in the GC pagecache and submit read request. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - Requested block doesn't exist. - * - * %-EEXIST - Blocks conflict is detected. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EEXIST - Block conflict detected. + * * %-EIO - I/O error. + * * %-ENOENT - Requested block doesn't exist. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_move_inode_block(struct inode *inode, struct nilfs_vdesc *vdesc, @@ -604,8 +584,8 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode, * blocks that garbage collector specified with the array of nilfs_vdesc * structures and stores them into page caches of GC inodes. * - * Return Value: Number of processed nilfs_vdesc structures or - * error code, otherwise. + * Return: Number of processed nilfs_vdesc structures on success, or + * a negative error code on failure. */ static int nilfs_ioctl_move_blocks(struct super_block *sb, struct nilfs_argv *argv, void *buf) @@ -682,14 +662,11 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, * in the period from p_start to p_end, excluding p_end itself. The checkpoints * which have been already deleted are ignored. * - * Return Value: Number of processed nilfs_period structures or - * error code, otherwise. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. 
- * - * %-EINVAL - invalid checkpoints. + * Return: Number of processed nilfs_period structures on success, or one of + * the following negative error codes on failure: + * * %-EINVAL - invalid checkpoints. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) @@ -717,14 +694,11 @@ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, * Description: nilfs_ioctl_free_vblocknrs() function frees * the virtual block numbers specified by @buf and @argv->v_nmembs. * - * Return Value: Number of processed virtual block numbers or - * error code, otherwise. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - The virtual block number have not been allocated. + * Return: Number of processed virtual block numbers on success, or one of the + * following negative error codes on failure: + * * %-EIO - I/O error. + * * %-ENOENT - Unallocated virtual block number. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) @@ -746,14 +720,11 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, * Description: nilfs_ioctl_mark_blocks_dirty() function marks * metadata file or data blocks as dirty. * - * Return Value: Number of processed block descriptors or - * error code, otherwise. - * - * %-ENOMEM - Insufficient memory available. - * - * %-EIO - I/O error - * - * %-ENOENT - the specified block does not exist (hole block) + * Return: Number of processed block descriptors on success, or one of the + * following negative error codes on failure: + * * %-EIO - I/O error. + * * %-ENOENT - Non-existent block (hole block). + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) @@ -852,7 +823,7 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, * from userspace. The NILFS_IOCTL_CLEAN_SEGMENTS ioctl is used by * nilfs_cleanerd daemon. * - * Return Value: On success, 0 is returned or error code, otherwise. + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -976,20 +947,14 @@ out: * and metadata are written out to the device when it successfully * returned. * - * Return Value: On success, 0 is retured. On errors, one of the following - * negative error code is returned. - * - * %-EROFS - Read only filesystem. - * - * %-EIO - I/O error - * - * %-ENOSPC - No space left on device (only in a panic state). - * - * %-ERESTARTSYS - Interrupted. - * - * %-ENOMEM - Insufficient memory available. - * - * %-EFAULT - Failure during execution of requested operation. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EFAULT - Failure during execution of requested operation. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No space left on device (only in a panic state). + * * %-ERESTARTSYS - Interrupted. + * * %-EROFS - Read only filesystem. 
*/ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -1023,7 +988,7 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, * @filp: file object * @argp: pointer on argument from userspace * - * Return Value: On success, 0 is returned or error code, otherwise. + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, void __user *argp) @@ -1059,7 +1024,7 @@ out: * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which * performs the actual trim operation. * - * Return Value: On success, 0 is returned or negative error code, otherwise. + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp) { @@ -1101,7 +1066,7 @@ static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp) * of segments in bytes and upper limit of segments in bytes. * The NILFS_IOCTL_SET_ALLOC_RANGE is used by nilfs_resize utility. * - * Return Value: On success, 0 is returned or error code, otherwise. + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) { @@ -1152,17 +1117,15 @@ out: * @dofunc: concrete function of getting metadata info * * Description: nilfs_ioctl_get_info() gets metadata info by means of - * calling dofunc() function. + * calling dofunc() function. The requested metadata information is copied + * to userspace memory @argp. * - * Return Value: On success, 0 is returned and requested metadata info - * is copied into userspace. On error, one of the following - * negative error codes is returned. - * - * %-EINVAL - Invalid arguments from userspace. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EFAULT - Failure during execution of requested operation. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EFAULT - Failure during execution of requested operation. + * * %-EINVAL - Invalid arguments from userspace. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp, @@ -1202,18 +1165,14 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, * encapsulated in nilfs_argv and updates the segment usage info * according to the flags in nilfs_suinfo_update. * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-EPERM - Not enough permissions - * - * %-EFAULT - Error copying input data - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid values in input (segment number, flags or nblocks) + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EEXIST - Block conflict detected. + * * %-EFAULT - Error copying input data. + * * %-EINVAL - Invalid values in input (segment number, flags or nblocks). + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. + * * %-EPERM - Not enough permissions. 
*/ static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) -- cgit v1.2.3 From e4a99d48bb3adcd4cf4377dcdf2263158914b448 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 10 Jan 2025 10:01:45 +0900 Subject: nilfs2: correct return value kernel-doc descriptions for bmap functions Similar to the previous patch to fix the ioctl return value descriptions, this fixes the format of the return value descriptions for bmap (and btree)-related functions, which was causing the kernel-doc script to emit a number of warnings. Link: https://lkml.kernel.org/r/20250110010530.21872-3-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Cc: "Brian G ." Signed-off-by: Andrew Morton --- fs/nilfs2/bmap.c | 124 +++++++++++++++++++++++------------------------------- fs/nilfs2/btree.c | 7 ++- 2 files changed, 56 insertions(+), 75 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index c9e8d9a7d820..ccc1a7aa52d2 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -47,17 +47,14 @@ static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, * @ptrp: place to store the value associated to @key * * Description: nilfs_bmap_lookup_at_level() finds a record whose key - * matches @key in the block at @level of the bmap. - * - * Return Value: On success, 0 is returned and the record associated with @key - * is stored in the place pointed by @ptrp. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - A record associated with @key does not exist. + * matches @key in the block at @level of the bmap. The record associated + * with @key is stored in the place pointed to by @ptrp. + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - A record associated with @key does not exist. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, __u64 *ptrp) @@ -138,14 +135,11 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) * Description: nilfs_bmap_insert() inserts the new key-record pair specified * by @key and @rec into @bmap. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EEXIST - A record associated with @key already exist. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EEXIST - A record associated with @key already exists. + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec) { @@ -193,14 +187,11 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) * Description: nilfs_bmap_seek_key() seeks a valid key on @bmap * starting from @start, and stores it to @keyp if found. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. 
- * - * %-ENOENT - No valid entry was found + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - No valid entry was found. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp) { @@ -236,14 +227,11 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp) * Description: nilfs_bmap_delete() deletes the key-record pair specified by * @key from @bmap. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - A record associated with @key does not exist. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - A record associated with @key does not exist. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key) { @@ -290,12 +278,10 @@ static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key) * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are * greater than or equal to @key from @bmap. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key) { @@ -330,12 +316,10 @@ void nilfs_bmap_clear(struct nilfs_bmap *bmap) * Description: nilfs_bmap_propagate() marks the buffers that directly or * indirectly refer to the block specified by @bh dirty. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) { @@ -362,22 +346,22 @@ void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, /** * nilfs_bmap_assign - assign a new block number to a block - * @bmap: bmap - * @bh: pointer to buffer head + * @bmap: bmap + * @bh: place to store a pointer to the buffer head to which a block + * address is assigned (in/out) * @blocknr: block number - * @binfo: block information + * @binfo: block information * * Description: nilfs_bmap_assign() assigns the block number @blocknr to the - * buffer specified by @bh. - * - * Return Value: On success, 0 is returned and the buffer head of a newly - * create buffer and the block information associated with the buffer are - * stored in the place pointed by @bh and @binfo, respectively. On error, one - * of the following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * buffer specified by @bh. 
The block information is stored in the memory + * pointed to by @binfo, and the buffer head may be replaced as a block + * address is assigned, in which case a pointer to the new buffer head is + * stored in the memory pointed to by @bh. + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_assign(struct nilfs_bmap *bmap, struct buffer_head **bh, @@ -402,12 +386,10 @@ int nilfs_bmap_assign(struct nilfs_bmap *bmap, * Description: nilfs_bmap_mark() marks the block specified by @key and @level * as dirty. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) { @@ -430,7 +412,7 @@ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) * Description: nilfs_test_and_clear() is the atomic operation to test and * clear the dirty state of @bmap. * - * Return Value: 1 is returned if @bmap is dirty, or 0 if clear. + * Return: 1 if @bmap is dirty, or 0 if clear. */ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap) { @@ -490,10 +472,10 @@ static struct lock_class_key nilfs_bmap_mdt_lock_key; * * Description: nilfs_bmap_read() initializes the bmap @bmap. * - * Return Value: On success, 0 is returned. On error, the following negative - * error code is returned. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (corrupted bmap). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) { diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index ef5061bb56da..0d8f7fb15c2e 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -334,7 +334,7 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, * @inode: host inode of btree * @blocknr: block number * - * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. + * Return: 0 if normal, 1 if the node is broken. */ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, size_t size, struct inode *inode, @@ -366,7 +366,7 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, * @node: btree root node to be examined * @inode: host inode of btree * - * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. + * Return: 0 if normal, 1 if the root node is broken. */ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, struct inode *inode) @@ -652,8 +652,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree, * @minlevel: start level * @nextkey: place to store the next valid key * - * Return Value: If a next key was found, 0 is returned. Otherwise, - * -ENOENT is returned. + * Return: 0 if the next key was found, %-ENOENT if not found. 
*/ static int nilfs_btree_get_next_key(const struct nilfs_bmap *btree, const struct nilfs_btree_path *path, -- cgit v1.2.3 From d33544adcb89cd15f99de37fd5a1ea8e4838ec31 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 10 Jan 2025 10:01:46 +0900 Subject: nilfs2: correct return value kernel-doc descriptions for sufile Similar to the previous changes to fix return value descriptions, this fixes the format of the return value descriptions of functions for sufile-related functions, eliminating a dozen warnings emitted by the kernel-doc script. Link: https://lkml.kernel.org/r/20250110010530.21872-4-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Cc: "Brian G ." Signed-off-by: Andrew Morton --- fs/nilfs2/sufile.c | 102 ++++++++++++++++++++--------------------------------- fs/nilfs2/sufile.h | 16 ++++----- 2 files changed, 45 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index d3ecc813d633..623d417a54eb 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -155,17 +155,13 @@ unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile) * of successfully modified segments from the head is stored in the * place @ndone points to. * - * Return Value: On success, zero is returned. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - Given segment usage is in hole block (may be returned if - * @create is zero) - * - * %-EINVAL - Invalid segment usage number + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Invalid segment usage number + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - Given segment usage is in hole block (may be returned if + * @create is zero) + * * %-ENOMEM - Insufficient memory available. */ int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs, int create, size_t *ndone, @@ -272,10 +268,7 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, * @start: minimum segment number of allocatable region (inclusive) * @end: maximum segment number of allocatable region (inclusive) * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-ERANGE - invalid segment region + * Return: 0 on success, or %-ERANGE if segment range is invalid. */ int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end) { @@ -300,17 +293,14 @@ int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end) * @sufile: inode of segment usage file * @segnump: pointer to segment number * - * Description: nilfs_sufile_alloc() allocates a clean segment. - * - * Return Value: On success, 0 is returned and the segment number of the - * allocated segment is stored in the place pointed by @segnump. On error, one - * of the following negative error codes is returned. - * - * %-EIO - I/O error. + * Description: nilfs_sufile_alloc() allocates a clean segment, and stores + * its segment number in the place pointed to by @segnump. * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOSPC - No clean segment left. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No clean segment left. 
*/ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) { @@ -610,16 +600,13 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, * @sufile: inode of segment usage file * @sustat: pointer to a structure of segment usage statistics * - * Description: nilfs_sufile_get_stat() returns information about segment - * usage. - * - * Return Value: On success, 0 is returned, and segment usage information is - * stored in the place pointed by @sustat. On error, one of the following - * negative error codes is returned. + * Description: nilfs_sufile_get_stat() retrieves segment usage statistics + * and stores them in the location pointed to by @sustat. * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) { @@ -683,16 +670,12 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, * @start: start segment number (inclusive) * @end: end segment number (inclusive) * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid number of segments specified - * - * %-EBUSY - Dirty or active segments are present in the range + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EBUSY - Dirty or active segments are present in the range. + * * %-EINVAL - Invalid number of segments specified. + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_sufile_truncate_range(struct inode *sufile, __u64 start, __u64 end) @@ -787,16 +770,12 @@ out: * @sufile: inode of segment usage file * @newnsegs: new number of segments * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOSPC - Enough free space is not left for shrinking - * - * %-EBUSY - Dirty or active segments exist in the region to be truncated + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EBUSY - Dirty or active segments exist in the region to be truncated. + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - Enough free space is not left for shrinking. */ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) { @@ -939,14 +918,11 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, * segment usage accordingly. Only the fields indicated by the sup_flags * are updated. * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid values in input (segment number, flags or nblocks) + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Invalid values in input (segment number, flags or nblocks). + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. 
*/ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, unsigned int supsz, size_t nsup) @@ -1073,7 +1049,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, * and start+len is rounded down. For each clean segment blkdev_issue_discard * function is invoked. * - * Return Value: On success, 0 is returned or negative error code, otherwise. + * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) { diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 8e8a1a5a0402..127ab01a47ea 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -95,8 +95,7 @@ static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv, * @nsegs: size of @segnumv array * @ndone: place to store the number of cancelled segments * - * Return Value: On success, 0 is returned. On error, a negative error codes - * is returned. + * Return: 0 on success, or a negative error code on failure. */ static inline int nilfs_sufile_cancel_freev(struct inode *sufile, __u64 *segnumv, size_t nsegs, @@ -114,14 +113,11 @@ static inline int nilfs_sufile_cancel_freev(struct inode *sufile, * Description: nilfs_sufile_set_error() marks the segment specified by * @segnum as erroneous. The error segment will never be used again. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid segment usage number. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Invalid segment usage number. + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum) { -- cgit v1.2.3 From b8268501f1e3c49e3f90526d44b92eae4baa421b Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 10 Jan 2025 10:01:47 +0900 Subject: nilfs2: correct return value kernel-doc descriptions for metadata files Similar to the previous changes to fix return value descriptions, this fixes the format of the return value descriptions for metadata file functions other than sufile. Link: https://lkml.kernel.org/r/20250110010530.21872-5-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Cc: "Brian G ." Signed-off-by: Andrew Morton --- fs/nilfs2/cpfile.c | 51 ++++++++++++++++++---------------------------- fs/nilfs2/dat.c | 40 +++++++++++++++--------------------- fs/nilfs2/ifile.c | 32 +++++++++++++---------------- fs/nilfs2/mdt.c | 59 +++++++++++++++++++++++++----------------------------- 4 files changed, 77 insertions(+), 105 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index c20207d7a989..a1c0c09f61e2 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -191,14 +191,11 @@ static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile, * @cnop: place to store the next checkpoint number * @bhp: place to store a pointer to buffer_head struct * - * Return Value: On success, it returns 0. On error, the following negative - * error code is returned. - * - * %-ENOMEM - Insufficient memory available. - * - * %-EIO - I/O error - * - * %-ENOENT - no block exists in the range. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - no block exists in the range. 
+ * * %-ENOMEM - Insufficient memory available. */ static int nilfs_cpfile_find_checkpoint_block(struct inode *cpfile, __u64 start_cno, __u64 end_cno, @@ -447,14 +444,11 @@ error: * the period from @start to @end, excluding @end itself. The checkpoints * which have been already deleted are ignored. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - invalid checkpoints. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Invalid checkpoints. + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, __u64 start, @@ -1058,14 +1052,11 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) * Description: nilfs_change_cpmode() changes the mode of the checkpoint * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - No such checkpoint. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - No such checkpoint. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) { @@ -1097,14 +1088,12 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) * @cpstat: pointer to a structure of checkpoint statistics * * Description: nilfs_cpfile_get_stat() returns information about checkpoints. + * The checkpoint statistics are stored in the location pointed to by @cpstat. * - * Return Value: On success, 0 is returned, and checkpoints information is - * stored in the place pointed by @cpstat. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) { diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index e220dcb08aa6..c57445e62298 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -302,14 +302,11 @@ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr) * Description: nilfs_dat_freev() frees the virtual block numbers specified by * @vblocknrs and @nitems. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - The virtual block number have not been allocated. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - The virtual block number have not been allocated. + * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems) { @@ -325,12 +322,10 @@ int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems) * Description: nilfs_dat_move() changes the block number associated with * @vblocknr to @blocknr. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) { @@ -390,17 +385,14 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) * @blocknrp: pointer to a block number * * Description: nilfs_dat_translate() maps the virtual block number @vblocknr - * to the corresponding block number. - * - * Return Value: On success, 0 is returned and the block number associated - * with @vblocknr is stored in the place pointed by @blocknrp. On error, one - * of the following negative error codes is returned. - * - * %-EIO - I/O error. + * to the corresponding block number. The block number associated with + * @vblocknr is stored in the place pointed to by @blocknrp. * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - A block number associated with @vblocknr does not exist. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - A block number associated with @vblocknr does not exist. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) { diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index e7339eb3c08a..bae9d7d9a424 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -38,17 +38,16 @@ static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile) * @out_ino: pointer to a variable to store inode number * @out_bh: buffer_head contains newly allocated disk inode * - * Return Value: On success, 0 is returned and the newly allocated inode - * number is stored in the place pointed by @ino, and buffer_head pointer - * that contains newly allocated disk inode structure is stored in the - * place pointed by @out_bh - * On error, one of the following negative error codes is returned. + * nilfs_ifile_create_inode() allocates a new inode in the ifile metadata + * file and stores the inode number in the variable pointed to by @out_ino, + * as well as storing the ifile's buffer with the disk inode in the location + * pointed to by @out_bh. * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOSPC - No inode left. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No inode left. */ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, struct buffer_head **out_bh) @@ -83,14 +82,11 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, * @ifile: ifile inode * @ino: inode number * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. 
- * - * %-ENOENT - The inode number @ino have not been allocated. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - Inode number unallocated. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) { diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 965b5ad1c0df..3f7510beebbb 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -226,20 +226,21 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, * @out_bh: output of a pointer to the buffer_head * * nilfs_mdt_get_block() looks up the specified buffer and tries to create - * a new buffer if @create is not zero. On success, the returned buffer is - * assured to be either existing or formatted using a buffer lock on success. - * @out_bh is substituted only when zero is returned. + * a new buffer if @create is not zero. If (and only if) this function + * succeeds, it stores a pointer to the retrieved buffer head in the location + * pointed to by @out_bh. * - * Return Value: On success, it returns 0. On error, the following negative - * error code is returned. + * The retrieved buffer may be either an existing one or a newly allocated one. + * For a newly created buffer, if the callback function argument @init_block + * is non-NULL, the callback will be called with the buffer locked to format + * the block. * - * %-ENOMEM - Insufficient memory available. - * - * %-EIO - I/O error - * - * %-ENOENT - the specified block does not exist (hole block) - * - * %-EROFS - Read only filesystem (for create mode) + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - The specified block does not exist (hole block). + * * %-ENOMEM - Insufficient memory available. + * * %-EROFS - Read only filesystem (for create mode). */ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, void (*init_block)(struct inode *, @@ -275,14 +276,11 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, * @out_bh, and block offset to @blkoff, respectively. @out_bh and * @blkoff are substituted only when zero is returned. * - * Return Value: On success, it returns 0. On error, the following negative - * error code is returned. - * - * %-ENOMEM - Insufficient memory available. - * - * %-EIO - I/O error - * - * %-ENOENT - no block was found in the range + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - No block was found in the range. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_mdt_find_block(struct inode *inode, unsigned long start, unsigned long end, unsigned long *blkoff, @@ -321,12 +319,11 @@ out: * @inode: inode of the meta data file * @block: block offset * - * Return Value: On success, zero is returned. - * On error, one of the following negative error code is returned. - * - * %-ENOMEM - Insufficient memory available. - * - * %-EIO - I/O error + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - Non-existent block. + * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) { @@ -349,12 +346,10 @@ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and * tries to release the page including the buffer from a page cache. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error code is returned. - * - * %-EBUSY - page has an active buffer. - * - * %-ENOENT - page cache has no page addressed by the offset. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EBUSY - Page has an active buffer. + * * %-ENOENT - Page cache has no page addressed by the offset. */ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) { -- cgit v1.2.3 From 343d4a33f26d048fde4b02cfc3681f766b3f1b12 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 10 Jan 2025 10:01:48 +0900 Subject: nilfs2: correct return value kernel-doc descriptions for the rest Similar to the previous changes to fix return value descriptions, this fixes the format of the return value descriptions of functions for the rest. Link: https://lkml.kernel.org/r/20250110010530.21872-6-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Cc: "Brian G ." Signed-off-by: Andrew Morton --- fs/nilfs2/gcinode.c | 24 ++++++++++-------------- fs/nilfs2/inode.c | 12 +++--------- fs/nilfs2/recovery.c | 32 ++++++++++++-------------------- fs/nilfs2/segbuf.c | 12 ++---------- fs/nilfs2/segment.c | 48 ++++++++++++++++++------------------------------ fs/nilfs2/the_nilfs.c | 12 +++++------- 6 files changed, 50 insertions(+), 90 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 2dbb15767df1..561c220799c7 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -46,14 +46,11 @@ * specified by @pbn to the GC pagecache with the key @blkoff. * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer. * - * Return Value: On success, 0 is returned. On Error, one of the following - * negative error code is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - The block specified with @pbn does not exist. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - The block specified with @pbn does not exist. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, sector_t pbn, __u64 vbn, @@ -114,12 +111,11 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, * specified by @vbn to the GC pagecache. @pbn can be supplied by the * caller to avoid translation of the disk block address. * - * Return Value: On success, 0 is returned. On Error, one of the following - * negative error code is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - Invalid virtual block address. + * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, __u64 vbn, struct buffer_head **out_bh) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 23f3a75edd50..228bfffa5d22 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -598,10 +598,7 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, * or does nothing if the inode already has it. This function allocates * an additional inode to maintain page cache of B-tree nodes one-on-one. * - * Return Value: On success, 0 is returned. On errors, one of the following - * negative error code is returned. - * - * %-ENOMEM - Insufficient memory available. + * Return: 0 on success, or %-ENOMEM if memory is insufficient. */ int nilfs_attach_btree_node_cache(struct inode *inode) { @@ -660,11 +657,8 @@ void nilfs_detach_btree_node_cache(struct inode *inode) * in one inode and the one for b-tree node pages is set up in the * other inode, which is attached to the former inode. * - * Return Value: On success, a pointer to the inode for data pages is - * returned. On errors, one of the following negative error code is returned - * in a pointer type. - * - * %-ENOMEM - Insufficient memory available. + * Return: a pointer to the inode for data pages on success, or %-ENOMEM + * if memory is insufficient. */ struct inode *nilfs_iget_for_shadow(struct inode *inode) { diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index e43405bf521e..4063f084e75d 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -754,18 +754,13 @@ static void nilfs_abort_roll_forward(struct the_nilfs *nilfs) * @sb: super block instance * @ri: pointer to a nilfs_recovery_info struct to store search results. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error code is returned. - * - * %-EINVAL - Inconsistent filesystem state. - * - * %-EIO - I/O error - * - * %-ENOSPC - No space left on device (only in a panic state). - * - * %-ERESTARTSYS - Interrupted. - * - * %-ENOMEM - Insufficient memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Inconsistent filesystem state. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No space left on device (only in a panic state). + * * %-ERESTARTSYS - Interrupted. */ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb, @@ -830,14 +825,11 @@ failed: * segment pointed by the superblock. It sets up struct the_nilfs through * this search. It fills nilfs_recovery_info (ri) required for recovery. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error code is returned. - * - * %-EINVAL - No valid segment found - * - * %-EIO - I/O error - * - * %-ENOMEM - Insufficient memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - No valid segment found. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index e08cab03366b..a8bdf3d318ea 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -406,12 +406,7 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, * @segbuf: buffer storing a log to be written * @nilfs: nilfs object * - * Return Value: On Success, 0 is returned. 
On Error, one of the following - * negative error code is returned. - * - * %-EIO - I/O error - * - * %-ENOMEM - Insufficient memory available. + * Return: Always 0. */ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, struct the_nilfs *nilfs) @@ -452,10 +447,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, * nilfs_segbuf_wait - wait for completion of requested BIOs * @segbuf: segment buffer * - * Return Value: On Success, 0 is returned. On Error, one of the following - * negative error code is returned. - * - * %-EIO - I/O error + * Return: 0 on success, or %-EIO if I/O error is detected. */ static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) { diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 58a598b548fa..05c24b971464 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -191,12 +191,10 @@ static int nilfs_prepare_segment_lock(struct super_block *sb, * When @vacancy_check flag is set, this function will check the amount of * free space, and will wait for the GC to reclaim disk space if low capacity. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error code is returned. - * - * %-ENOMEM - Insufficient memory available. - * - * %-ENOSPC - No space left on device + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No space left on device (if checking free space). */ int nilfs_transaction_begin(struct super_block *sb, struct nilfs_transaction_info *ti, @@ -2314,18 +2312,13 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err, bool force) * nilfs_construct_segment - construct a logical segment * @sb: super block * - * Return Value: On success, 0 is returned. On errors, one of the following - * negative error code is returned. - * - * %-EROFS - Read only filesystem. - * - * %-EIO - I/O error - * - * %-ENOSPC - No space left on device (only in a panic state). - * - * %-ERESTARTSYS - Interrupted. - * - * %-ENOMEM - Insufficient memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No space left on device (only in a panic state). + * * %-ERESTARTSYS - Interrupted. + * * %-EROFS - Read only filesystem. */ int nilfs_construct_segment(struct super_block *sb) { @@ -2349,18 +2342,13 @@ int nilfs_construct_segment(struct super_block *sb) * @start: start byte offset * @end: end byte offset (inclusive) * - * Return Value: On success, 0 is returned. On errors, one of the following - * negative error code is returned. - * - * %-EROFS - Read only filesystem. - * - * %-EIO - I/O error - * - * %-ENOSPC - No space left on device (only in a panic state). - * - * %-ERESTARTSYS - Interrupted. - * - * %-ENOMEM - Insufficient memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No space left on device (only in a panic state). + * * %-ERESTARTSYS - Interrupted. + * * %-EROFS - Read only filesystem. 
*/ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, loff_t start, loff_t end) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index ac03fd3c330c..1bcaf85506d3 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -49,8 +49,8 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs, * alloc_nilfs - allocate a nilfs object * @sb: super block instance * - * Return Value: On success, pointer to the_nilfs is returned. - * On error, NULL is returned. + * Return: a pointer to the allocated nilfs object on success, or NULL on + * failure. */ struct the_nilfs *alloc_nilfs(struct super_block *sb) { @@ -200,8 +200,7 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, * exponent information written in @sbp and stores it in @blocksize, * or aborts with an error message if it's too large. * - * Return Value: On success, 0 is returned. If the block size is too - * large, -EINVAL is returned. + * Return: 0 on success, or %-EINVAL if the block size is too large. */ static int nilfs_get_blocksize(struct super_block *sb, struct nilfs_super_block *sbp, int *blocksize) @@ -538,7 +537,7 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp) * area, or if the parameters themselves are not normal, it is * determined to be invalid. * - * Return Value: true if invalid, false if valid. + * Return: true if invalid, false if valid. */ static bool nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) { @@ -684,8 +683,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, * reading the super block, getting disk layout information, initializing * shared fields in the_nilfs). * - * Return Value: On success, 0 is returned. On error, a negative error - * code is returned. + * Return: 0 on success, or a negative error code on failure. */ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb) { -- cgit v1.2.3 From fd4e7fadfd91023c2c5868f16af3fe8bed06636b Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 10 Jan 2025 10:01:49 +0900 Subject: nilfs2: add missing return value kernel-doc descriptions There are a number of kernel-doc comments for functions that are missing return values, which also causes a number of warnings when the kernel-doc script is run with the "-Wall" option. Fix this issue by adding proper return value descriptions, and improve code maintainability. Link: https://lkml.kernel.org/r/20250110010530.21872-7-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Cc: "Brian G ." Signed-off-by: Andrew Morton --- fs/nilfs2/alloc.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++-- fs/nilfs2/alloc.h | 2 ++ fs/nilfs2/cpfile.c | 2 ++ fs/nilfs2/dat.c | 2 ++ fs/nilfs2/ifile.c | 2 ++ fs/nilfs2/inode.c | 4 +++ fs/nilfs2/mdt.c | 4 +++ fs/nilfs2/page.c | 8 +++--- fs/nilfs2/recovery.c | 30 +++++++++++++++++++++++ fs/nilfs2/segment.c | 8 ++++++ fs/nilfs2/sufile.c | 8 ++++++ fs/nilfs2/sufile.h | 6 +++++ fs/nilfs2/super.c | 10 +++++++- fs/nilfs2/the_nilfs.c | 14 +++++++++++ 14 files changed, 160 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index ba3e1f591f36..6b506995818d 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -21,6 +21,8 @@ * nilfs_palloc_groups_per_desc_block - get the number of groups that a group * descriptor block can maintain * @inode: inode of metadata file using this allocator + * + * Return: Number of groups that a group descriptor block can maintain. 
*/ static inline unsigned long nilfs_palloc_groups_per_desc_block(const struct inode *inode) @@ -32,6 +34,8 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode) /** * nilfs_palloc_groups_count - get maximum number of groups * @inode: inode of metadata file using this allocator + * + * Return: Maximum number of groups. */ static inline unsigned long nilfs_palloc_groups_count(const struct inode *inode) @@ -43,6 +47,8 @@ nilfs_palloc_groups_count(const struct inode *inode) * nilfs_palloc_init_blockgroup - initialize private variables for allocator * @inode: inode of metadata file using this allocator * @entry_size: size of the persistent object + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size) { @@ -78,6 +84,9 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size) * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) * @offset: pointer to store offset number in the group + * + * Return: Number of the group that contains the entry with the index + * specified by @nr. */ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, unsigned long *offset) @@ -93,8 +102,8 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, * @inode: inode of metadata file using this allocator * @group: group number * - * nilfs_palloc_desc_blkoff() returns block offset of the descriptor - * block which contains a descriptor of the specified group. + * Return: Index number in the metadata file of the descriptor block of + * the group specified by @group. */ static unsigned long nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) @@ -111,6 +120,9 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) * * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap * block used to allocate/deallocate entries in the specified group. + * + * Return: Index number in the metadata file of the bitmap block of + * the group specified by @group. */ static unsigned long nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) @@ -125,6 +137,8 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group * @desc: pointer to descriptor structure for the group * @lock: spin lock protecting @desc + * + * Return: Number of free entries written in the group descriptor @desc. */ static unsigned long nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc, @@ -143,6 +157,9 @@ nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc, * @desc: pointer to descriptor structure for the group * @lock: spin lock protecting @desc * @n: delta to be added + * + * Return: Number of free entries after adjusting the group descriptor + * @desc. */ static u32 nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc, @@ -161,6 +178,9 @@ nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc, * nilfs_palloc_entry_blkoff - get block offset of an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) + * + * Return: Index number in the metadata file of the block containing + * the entry specified by @nr. 
*/ static unsigned long nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) @@ -238,6 +258,12 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, * @blkoff: block offset * @prev: nilfs_bh_assoc struct of the last used buffer * @lock: spin lock protecting @prev + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - Non-existent block. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff, struct nilfs_bh_assoc *prev, @@ -258,6 +284,8 @@ static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff, * @group: group number * @create: create flag * @bhp: pointer to store the resultant buffer head + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_palloc_get_desc_block(struct inode *inode, unsigned long group, @@ -277,6 +305,8 @@ static int nilfs_palloc_get_desc_block(struct inode *inode, * @group: group number * @create: create flag * @bhp: pointer to store the resultant buffer head + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_palloc_get_bitmap_block(struct inode *inode, unsigned long group, @@ -294,6 +324,8 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode, * nilfs_palloc_delete_bitmap_block - delete a bitmap block * @inode: inode of metadata file using this allocator * @group: group number + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_palloc_delete_bitmap_block(struct inode *inode, unsigned long group) @@ -312,6 +344,8 @@ static int nilfs_palloc_delete_bitmap_block(struct inode *inode, * @nr: serial number of the entry (e.g. inode number) * @create: create flag * @bhp: pointer to store the resultant buffer head + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, int create, struct buffer_head **bhp) @@ -328,6 +362,8 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, * nilfs_palloc_delete_entry_block - delete an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr) { @@ -397,6 +433,9 @@ size_t nilfs_palloc_entry_offset(const struct inode *inode, __u64 nr, * @bsize: size in bits * @lock: spin lock protecting @bitmap * @wrap: whether to wrap around + * + * Return: Offset number within the group of the found free entry, or + * %-ENOSPC if not found. */ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, @@ -438,6 +477,9 @@ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, * @inode: inode of metadata file using this allocator * @curr: current group number * @max: maximum number of groups + * + * Return: Number of remaining descriptors (= groups) managed by the descriptor + * block. */ static unsigned long nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, @@ -453,6 +495,8 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, * nilfs_palloc_count_desc_blocks - count descriptor blocks number * @inode: inode of metadata file using this allocator * @desc_blocks: descriptor blocks number [out] + * + * Return: 0 on success, or a negative error code on failure. 
*/ static int nilfs_palloc_count_desc_blocks(struct inode *inode, unsigned long *desc_blocks) @@ -473,6 +517,8 @@ static int nilfs_palloc_count_desc_blocks(struct inode *inode, * MDT file growing * @inode: inode of metadata file using this allocator * @desc_blocks: known current descriptor blocks count + * + * Return: true if a group can be added in the metadata file, false if not. */ static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode, unsigned long desc_blocks) @@ -487,6 +533,12 @@ static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode, * @inode: inode of metadata file using this allocator * @nused: current number of used entries * @nmaxp: max number of entries [out] + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ERANGE - Number of entries in use is out of range. */ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) { @@ -518,6 +570,13 @@ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation * @wrap: whether to wrap around + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - Entries exhausted (No entries available for allocation). + * * %-EROFS - Read only filesystem */ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req, bool wrap) @@ -710,6 +769,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the removal + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_palloc_prepare_free_entry(struct inode *inode, struct nilfs_palloc_req *req) @@ -754,6 +815,8 @@ void nilfs_palloc_abort_free_entry(struct inode *inode, * @inode: inode of metadata file using this allocator * @entry_nrs: array of entry numbers to be deallocated * @nitems: number of entries stored in @entry_nrs + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) { diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 3f115ab7e9a7..046d876ea3e0 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -21,6 +21,8 @@ * * The number of entries per group is defined by the number of bits * that a bitmap block can maintain. + * + * Return: Number of entries per group. */ static inline unsigned long nilfs_palloc_entries_per_group(const struct inode *inode) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index a1c0c09f61e2..aaab0ae90261 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -1124,6 +1124,8 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) * @cpsize: size of a checkpoint entry * @raw_inode: on-disk cpfile inode * @inodep: buffer to store the inode + * + * Return: 0 on success, or a negative error code on failure. 
*/ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, struct nilfs_inode *raw_inode, struct inode **inodep) diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index c57445e62298..c5664035b3e3 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -481,6 +481,8 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, * @entry_size: size of a dat entry * @raw_inode: on-disk dat inode * @inodep: buffer to store the inode + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_dat_read(struct super_block *sb, size_t entry_size, struct nilfs_inode *raw_inode, struct inode **inodep) diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index bae9d7d9a424..e1e050dfbbc2 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -146,6 +146,8 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, * @ifile: ifile inode * @nmaxinodes: current maximum of available inodes count [out] * @nfreeinodes: free inodes count [out] + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_ifile_count_free_inodes(struct inode *ifile, u64 *nmaxinodes, u64 *nfreeinodes) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 228bfffa5d22..e8015d24a82c 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -68,6 +68,8 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n) * * This function does not issue actual read request of the specified data * block. It is done by VFS. + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_get_block(struct inode *inode, sector_t blkoff, struct buffer_head *bh_result, int create) @@ -141,6 +143,8 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, * address_space_operations. * @file: file struct of the file to be read * @folio: the folio to be read + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_read_folio(struct file *file, struct folio *folio) { diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 3f7510beebbb..2f850a18d6e7 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -519,6 +519,8 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size, * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file * @inode: inode of the metadata file * @shadow: shadow mapping + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow) @@ -540,6 +542,8 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, /** * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map * @inode: inode of the metadata file + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_mdt_save_to_shadow_map(struct inode *inode) { diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 899686d2e5f7..806b056d2260 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -135,8 +135,7 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) * nilfs_folio_buffers_clean - Check if a folio has dirty buffers or not. * @folio: Folio to be checked. * - * nilfs_folio_buffers_clean() returns false if the folio has dirty buffers. - * Otherwise, it returns true. + * Return: false if the folio has dirty buffers, true otherwise. 
*/ bool nilfs_folio_buffers_clean(struct folio *folio) { @@ -500,8 +499,9 @@ void __nilfs_clear_folio_dirty(struct folio *folio) * This function searches an extent of buffers marked "delayed" which * starts from a block offset equal to or larger than @start_blk. If * such an extent was found, this will store the start offset in - * @blkoff and return its length in blocks. Otherwise, zero is - * returned. + * @blkoff and return its length in blocks. + * + * Return: Length in blocks of found extent, 0 otherwise. */ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 4063f084e75d..22aecf6e2344 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -88,6 +88,8 @@ static int nilfs_warn_segment_error(struct super_block *sb, int err) * @check_bytes: number of bytes to be checked * @start: DBN of start block * @nblock: number of blocks to be checked + * + * Return: 0 on success, or %-EIO if an I/O error occurs. */ static int nilfs_compute_checksum(struct the_nilfs *nilfs, struct buffer_head *bhs, u32 *sum, @@ -126,6 +128,11 @@ static int nilfs_compute_checksum(struct the_nilfs *nilfs, * @sr_block: disk block number of the super root block * @pbh: address of a buffer_head pointer to return super root buffer * @check: CRC check flag + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Super root block corrupted. + * * %-EIO - I/O error. */ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, struct buffer_head **pbh, int check) @@ -176,6 +183,8 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, * @nilfs: nilfs object * @start_blocknr: start block number of the log * @sum: pointer to return segment summary structure + * + * Return: Buffer head pointer, or NULL if an I/O error occurs. */ static struct buffer_head * nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, @@ -195,6 +204,13 @@ nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, * @seg_seq: sequence number of segment * @bh_sum: buffer head of summary block * @sum: segment summary struct + * + * Return: 0 on success, or one of the following internal codes on failure: + * * %NILFS_SEG_FAIL_MAGIC - Magic number mismatch. + * * %NILFS_SEG_FAIL_SEQ - Sequence number mismatch. + * * %NIFLS_SEG_FAIL_CONSISTENCY - Block count out of range. + * * %NILFS_SEG_FAIL_IO - I/O error. + * * %NILFS_SEG_FAIL_CHECKSUM_FULL - Full log checksum verification failed. */ static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq, struct buffer_head *bh_sum, @@ -238,6 +254,9 @@ out: * @pbh: the current buffer head on summary blocks [in, out] * @offset: the current byte offset on summary blocks [in, out] * @bytes: byte size of the item to be read + * + * Return: Kernel space address of current segment summary entry, or + * NULL if an I/O error occurs. */ static void *nilfs_read_summary_info(struct the_nilfs *nilfs, struct buffer_head **pbh, @@ -300,6 +319,11 @@ static void nilfs_skip_summary_info(struct the_nilfs *nilfs, * @start_blocknr: start block number of the log * @sum: log summary information * @head: list head to add nilfs_recovery_block struct + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. 
*/ static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, struct nilfs_segment_summary *sum, @@ -571,6 +595,12 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, * @sb: super block instance * @root: NILFS root instance * @ri: pointer to a nilfs_recovery_info + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Log format error. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, struct super_block *sb, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 05c24b971464..94f45281253c 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -250,6 +250,8 @@ int nilfs_transaction_begin(struct super_block *sb, * nilfs_transaction_commit() sets a timer to start the segment * constructor. If a sync flag is set, it starts construction * directly. + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_transaction_commit(struct super_block *sb) { @@ -405,6 +407,8 @@ static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, /** * nilfs_segctor_reset_segment_buffer - reset the current segment buffer * @sci: nilfs_sc_info + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) { @@ -1315,6 +1319,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) * nilfs_segctor_begin_construction - setup segment buffer to make a new log * @sci: nilfs_sc_info * @nilfs: nilfs object + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) @@ -2454,6 +2460,8 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err) * nilfs_segctor_construct - form logs and write them to disk * @sci: segment constructor object * @mode: mode of log forming + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) { diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 623d417a54eb..47d01255a8fb 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -133,6 +133,8 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, /** * nilfs_sufile_get_ncleansegs - return the number of clean segments * @sufile: inode of segment usage file + * + * Return: Number of clean segments. */ unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile) { @@ -500,6 +502,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty * @sufile: inode of segment usage file * @segnum: segment number + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) { @@ -559,6 +563,8 @@ out_sem: * @segnum: segment number * @nblocks: number of live blocks in the segment * @modtime: modification time (option) + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, unsigned long nblocks, time64_t modtime) @@ -1195,6 +1201,8 @@ out_sem: * @susize: size of a segment usage entry * @raw_inode: on-disk sufile inode * @inodep: buffer to store the inode + * + * Return: 0 on success, or a negative error code on failure. 
*/ int nilfs_sufile_read(struct super_block *sb, size_t susize, struct nilfs_inode *raw_inode, struct inode **inodep) diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 127ab01a47ea..cd6f28ab3521 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -58,6 +58,8 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range); * nilfs_sufile_scrap - make a segment garbage * @sufile: inode of segment usage file * @segnum: segment number to be freed + * + * Return: 0 on success, or a negative error code on failure. */ static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum) { @@ -68,6 +70,8 @@ static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum) * nilfs_sufile_free - free segment * @sufile: inode of segment usage file * @segnum: segment number to be freed + * + * Return: 0 on success, or a negative error code on failure. */ static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum) { @@ -80,6 +84,8 @@ static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum) * @segnumv: array of segment numbers * @nsegs: size of @segnumv array * @ndone: place to store the number of freed segments + * + * Return: 0 on success, or a negative error code on failure. */ static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv, size_t nsegs, size_t *ndone) diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index eca79cca3803..badc2cbc895e 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -309,6 +309,8 @@ int nilfs_commit_super(struct super_block *sb, int flag) * This function restores state flags in the on-disk super block. * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the * filesystem was not clean previously. + * + * Return: 0 on success, %-EIO if I/O error or superblock is corrupted. */ int nilfs_cleanup_super(struct super_block *sb) { @@ -339,6 +341,8 @@ int nilfs_cleanup_super(struct super_block *sb) * nilfs_move_2nd_super - relocate secondary super block * @sb: super block instance * @sb2off: new offset of the secondary super block (in bytes) + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) { @@ -420,6 +424,8 @@ out: * nilfs_resize_fs - resize the filesystem * @sb: super block instance * @newsize: new size of the filesystem (in bytes) + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_resize_fs(struct super_block *sb, __u64 newsize) { @@ -987,7 +993,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, * nilfs_tree_is_busy() - try to shrink dentries of a checkpoint * @root_dentry: root dentry of the tree to be shrunk * - * This function returns true if the tree was in-use. + * Return: true if the tree was in-use, false otherwise. */ static bool nilfs_tree_is_busy(struct dentry *root_dentry) { @@ -1033,6 +1039,8 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) * * This function is called exclusively by nilfs->ns_mount_mutex. * So, the recovery process is protected from other simultaneous mounts. + * + * Return: 0 on success, or a negative error code on failure. 
*/ static int nilfs_fill_super(struct super_block *sb, struct fs_context *fc) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 1bcaf85506d3..cb01ea81724d 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -165,6 +165,9 @@ static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri) * containing a super root from a given super block, and initializes * relevant information on the nilfs object preparatory for log * scanning and recovery. + * + * Return: 0 on success, or %-EINVAL if current segment number is out + * of range. */ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, struct nilfs_super_block *sbp) @@ -225,6 +228,13 @@ static int nilfs_get_blocksize(struct super_block *sb, * load_nilfs() searches and load the latest super root, * attaches the last segment, and does recovery if needed. * The caller must call this exclusively for simultaneous mounts. + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - No valid segment found. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. + * * %-EROFS - Read only device or RO compat mode (if recovery is required) */ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) { @@ -394,6 +404,8 @@ static unsigned long long nilfs_max_size(unsigned int blkbits) * nilfs_nrsvsegs - calculate the number of reserved segments * @nilfs: nilfs object * @nsegs: total number of segments + * + * Return: Number of reserved segments. */ unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs) { @@ -405,6 +417,8 @@ unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs) /** * nilfs_max_segment_count - calculate the maximum number of segments * @nilfs: nilfs object + * + * Return: Maximum number of segments */ static u64 nilfs_max_segment_count(struct the_nilfs *nilfs) { -- cgit v1.2.3 From 2e62857f1312606bd173b81102047e8c7bce3c3f Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 10 Jan 2025 10:01:50 +0900 Subject: nilfs2: revise the return value description style for consistency. Also for comments that do not cause kernel-doc warnings (those that list multiple error codes), revise the return value description style to match Brian G.'s suggestion of "..., or one of the following negative error codes on failure:". Link: https://lkml.kernel.org/r/CAAq45aNh1qV8P6XgDhKeNstT=PvcPUaCXsAF-f9rvmzznsZL5A@mail.gmail.com Link: https://lkml.kernel.org/r/20250110010530.21872-8-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Cc: "Brian G ." Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 3 ++- fs/nilfs2/cpfile.c | 16 ++++++++++------ fs/nilfs2/dat.c | 3 ++- fs/nilfs2/ifile.c | 3 ++- fs/nilfs2/ioctl.c | 3 ++- fs/nilfs2/segment.c | 6 ++++-- fs/nilfs2/sufile.c | 2 +- 7 files changed, 23 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 54a3fa0cf67e..568367129092 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -201,7 +201,8 @@ void nilfs_btnode_delete(struct buffer_head *bh) * Note that the current implementation does not support folio sizes larger * than the page size. * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EIO - I/O error (metadata corruption). * * %-ENOMEM - Insufficient memory available. 
*/ diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index aaab0ae90261..bcc7d76269ac 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -236,7 +236,8 @@ static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, * stores it to the inode file given by @ifile and the nilfs root object * given by @root. * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EINVAL - Invalid checkpoint. * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). @@ -304,7 +305,8 @@ out_sem: * In either case, the buffer of the block containing the checkpoint entry * and the cpfile inode are made dirty for inclusion in the write log. * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). * * %-EROFS - Read only filesystem @@ -373,7 +375,8 @@ out_sem: * cpfile with the data given by the arguments @root, @blkinc, @ctime, and * @minor. * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). */ @@ -712,7 +715,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, * number to continue searching. * * Return: Count of checkpoint info items stored in the output buffer on - * success, or the following negative error code on failure. + * success, or one of the following negative error codes on failure: * * %-EINVAL - Invalid checkpoint mode. * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). @@ -737,7 +740,8 @@ ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, * @cpfile: checkpoint file inode * @cno: checkpoint number to delete * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EBUSY - Checkpoint in use (snapshot specified). * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - No valid checkpoint found. @@ -1005,7 +1009,7 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) * @cno: checkpoint number * * Return: 1 if the checkpoint specified by @cno is a snapshot, 0 if not, or - * the following negative error code on failure. + * one of the following negative error codes on failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - No such checkpoint. * * %-ENOMEM - Insufficient memory available. diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index c5664035b3e3..c664daba56ae 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -276,7 +276,8 @@ void nilfs_dat_abort_update(struct inode *dat, * @dat: DAT file inode * @vblocknr: virtual block number * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EINVAL - Invalid DAT entry (internal code). * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. 
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index e1e050dfbbc2..c4cd4a4dedd0 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -172,7 +172,8 @@ int nilfs_ifile_count_free_inodes(struct inode *ifile, * @cno: number of checkpoint entry to read * @inode_size: size of an inode * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EINVAL - Invalid checkpoint. * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index e877c97974a4..a66d62a51f77 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1268,7 +1268,8 @@ static int nilfs_ioctl_get_fslabel(struct super_block *sb, void __user *argp) * @filp: file object * @argp: pointer to userspace memory that contains the volume name * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EFAULT - Error copying input data. * * %-EINVAL - Label length exceeds record size in superblock. * * %-EIO - I/O error. diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 94f45281253c..3a202e51b360 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1122,7 +1122,8 @@ static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci, * a super root block containing this sufile change is complete, and it can * be canceled with nilfs_sufile_cancel_freev() until then. * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EINVAL - Invalid segment number. * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. @@ -2834,7 +2835,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) * This allocates a log writer object, initializes it, and starts the * log writer. * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EINTR - Log writer thread creation failed due to interruption. * * %-ENOMEM - Insufficient memory available. */ diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 47d01255a8fb..330f269abedf 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -850,7 +850,7 @@ out: * @nsi: size of suinfo array * * Return: Count of segment usage info items stored in the output buffer on - * success, or the following negative error code on failure. + * success, or one of the following negative error codes on failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ -- cgit v1.2.3 From ee70999a988b8abc3490609142f50ebaa8344432 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 11 Jan 2025 23:26:35 +0900 Subject: nilfs2: handle errors that nilfs_prepare_chunk() may return Patch series "nilfs2: fix issues with rename operations". This series fixes BUG_ON check failures reported by syzbot around rename operations, and a minor behavioral issue where the mtime of a child directory changes when it is renamed instead of moved. 
This patch (of 2): The directory manipulation routines nilfs_set_link() and nilfs_delete_entry() rewrite the directory entry in the folio/page previously read by nilfs_find_entry(), so error handling is omitted on the assumption that nilfs_prepare_chunk(), which prepares the buffer for rewriting, will always succeed for these. And if an error is returned, it triggers the legacy BUG_ON() checks in each routine. This assumption is wrong, as proven by syzbot: the buffer layer called by nilfs_prepare_chunk() may call nilfs_get_block() if necessary, which may fail due to metadata corruption or other reasons. This has been there all along, but improved sanity checks and error handling may have made it more reproducible in fuzzing tests. Fix this issue by adding missing error paths in nilfs_set_link(), nilfs_delete_entry(), and their caller nilfs_rename(). Link: https://lkml.kernel.org/r/20250111143518.7901-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20250111143518.7901-2-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+32c3706ebf5d95046ea1@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=32c3706ebf5d95046ea1 Reported-by: syzbot+1097e95f134f37d9395c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=1097e95f134f37d9395c Fixes: 2ba466d74ed7 ("nilfs2: directory entry operations") Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 13 ++++++++++--- fs/nilfs2/namei.c | 29 +++++++++++++++-------------- fs/nilfs2/nilfs.h | 4 ++-- 3 files changed, 27 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 0a3aea6c416b..9b7f8e9655a2 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -400,7 +400,7 @@ int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino) return 0; } -void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, +int nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, struct folio *folio, struct inode *inode) { size_t from = offset_in_folio(folio, de); @@ -410,11 +410,15 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, folio_lock(folio); err = nilfs_prepare_chunk(folio, from, to); - BUG_ON(err); + if (unlikely(err)) { + folio_unlock(folio); + return err; + } de->inode = cpu_to_le64(inode->i_ino); de->file_type = fs_umode_to_ftype(inode->i_mode); nilfs_commit_chunk(folio, mapping, from, to); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + return 0; } /* @@ -543,7 +547,10 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct folio *folio) from = (char *)pde - kaddr; folio_lock(folio); err = nilfs_prepare_chunk(folio, from, to); - BUG_ON(err); + if (unlikely(err)) { + folio_unlock(folio); + goto out; + } if (pde) pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 1d836a5540f3..e02fae6757f1 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -406,8 +406,10 @@ static int nilfs_rename(struct mnt_idmap *idmap, err = PTR_ERR(new_de); goto out_dir; } - nilfs_set_link(new_dir, new_de, new_folio, old_inode); + err = nilfs_set_link(new_dir, new_de, new_folio, old_inode); folio_release_kmap(new_folio, new_de); + if (unlikely(err)) + goto out_dir; nilfs_mark_inode_dirty(new_dir); inode_set_ctime_current(new_inode); if (dir_de) @@ -430,28 +432,27 @@ static int nilfs_rename(struct mnt_idmap *idmap, */ inode_set_ctime_current(old_inode); - nilfs_delete_entry(old_de, old_folio); - - if (dir_de) { - 
nilfs_set_link(old_inode, dir_de, dir_folio, new_dir); - folio_release_kmap(dir_folio, dir_de); - drop_nlink(old_dir); + err = nilfs_delete_entry(old_de, old_folio); + if (likely(!err)) { + if (dir_de) { + err = nilfs_set_link(old_inode, dir_de, dir_folio, + new_dir); + drop_nlink(old_dir); + } + nilfs_mark_inode_dirty(old_dir); } - folio_release_kmap(old_folio, old_de); - - nilfs_mark_inode_dirty(old_dir); nilfs_mark_inode_dirty(old_inode); - err = nilfs_transaction_commit(old_dir->i_sb); - return err; - out_dir: if (dir_de) folio_release_kmap(dir_folio, dir_de); out_old: folio_release_kmap(old_folio, old_de); out: - nilfs_transaction_abort(old_dir->i_sb); + if (likely(!err)) + err = nilfs_transaction_commit(old_dir->i_sb); + else + nilfs_transaction_abort(old_dir->i_sb); return err; } diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index dff241c53fc5..cb6ed54accd7 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -261,8 +261,8 @@ struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *, int nilfs_delete_entry(struct nilfs_dir_entry *, struct folio *); int nilfs_empty_dir(struct inode *); struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct folio **); -void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, - struct folio *, struct inode *); +int nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, + struct folio *folio, struct inode *inode); /* file.c */ extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); -- cgit v1.2.3 From e30ccbb9c022660c52648f02ab6231d5dcaf4253 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 11 Jan 2025 23:26:36 +0900 Subject: nilfs2: do not update mtime of renamed directory that is not moved A minor issue with nilfs_rename, originating from an old ext2 implementation, is that the mtime is updated even if the rename target is a directory and it is renamed within the same directory, rather than moved to a different directory. In this case, the child directory being renamed does not change in any way, so changing its mtime is unnecessary according to the specification, and can unnecessarily confuse backup tools. In ext2, this issue was fixed by commit 39fe7557b4d6 ("ext2: Do not update mtime of a moved directory") and a few subsequent fixes, but it remained in nilfs2. Fix this issue by not calling nilfs_set_link(), which rewrites the inode number of the directory entry that refers to the parent directory, when the move target is a directory and the source and destination are the same directory. Here, the directory to be moved only needs to be read if the inode number of the parent directory is rewritten with nilfs_set_link, so also adjust the execution conditions of the preparation work to avoid unnecessary directory reads. 
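The condition the patch introduces can be read as a single predicate (an illustrative sketch only; old_is_dir, old_dir and new_dir mirror the variables used in the diff below, while the helper name is made up for this note):

  #include <stdbool.h>

  struct inode;

  /*
   * The ".." entry of the directory being renamed, and with it the child's
   * inode and mtime, only needs to be rewritten when the parent directory
   * actually changes.
   */
  static bool need_dotdot_rewrite(bool old_is_dir,
                                  const struct inode *old_dir,
                                  const struct inode *new_dir)
  {
          return old_is_dir && old_dir != new_dir;
  }

Renames within the same directory therefore skip both the nilfs_dotdot() lookup and the nilfs_set_link() call, leaving the child directory untouched.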
Link: https://lkml.kernel.org/r/20250111143518.7901-3-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/namei.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index e02fae6757f1..953fbd5f0851 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -370,6 +370,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, struct folio *old_folio; struct nilfs_dir_entry *old_de; struct nilfs_transaction_info ti; + bool old_is_dir = S_ISDIR(old_inode->i_mode); int err; if (flags & ~RENAME_NOREPLACE) @@ -385,7 +386,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, goto out; } - if (S_ISDIR(old_inode->i_mode)) { + if (old_is_dir && old_dir != new_dir) { err = -EIO; dir_de = nilfs_dotdot(old_inode, &dir_folio); if (!dir_de) @@ -397,7 +398,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, struct nilfs_dir_entry *new_de; err = -ENOTEMPTY; - if (dir_de && !nilfs_empty_dir(new_inode)) + if (old_is_dir && !nilfs_empty_dir(new_inode)) goto out_dir; new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, @@ -412,7 +413,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, goto out_dir; nilfs_mark_inode_dirty(new_dir); inode_set_ctime_current(new_inode); - if (dir_de) + if (old_is_dir) drop_nlink(new_inode); drop_nlink(new_inode); nilfs_mark_inode_dirty(new_inode); @@ -420,7 +421,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, err = nilfs_add_link(new_dentry, old_inode); if (err) goto out_dir; - if (dir_de) { + if (old_is_dir) { inc_nlink(new_dir); nilfs_mark_inode_dirty(new_dir); } @@ -434,9 +435,10 @@ static int nilfs_rename(struct mnt_idmap *idmap, err = nilfs_delete_entry(old_de, old_folio); if (likely(!err)) { - if (dir_de) { - err = nilfs_set_link(old_inode, dir_de, dir_folio, - new_dir); + if (old_is_dir) { + if (old_dir != new_dir) + err = nilfs_set_link(old_inode, dir_de, + dir_folio, new_dir); drop_nlink(old_dir); } nilfs_mark_inode_dirty(old_dir); -- cgit v1.2.3 From f0ef073e213a3a1a494b6f55a76ce1242600a453 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 14 Jan 2025 21:04:54 +0800 Subject: include/linux/lz4.h: add some missing macros Currently, LZ4_DISTANCE_MAX and LZ4_DECOMPRESS_INPLACE_MARGIN are defined in the erofs subsystem for LZ4 in-place decompression, which is somewhat unsuitable since they should belong to the LZ4 itself and may change with future LZ4 codebase updates. Move them to include/linux/lz4.h to match the upstream LZ4 library [1]. No logic changes. 
[1] https://github.com/lz4/lz4/blob/v1.10.0/lib/lz4.h#L670 Link: https://lkml.kernel.org/r/20250114130454.1191150-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang Cc: Yann Collet Cc: Nick Terrell Cc: Chao Yu Cc: Yue Hu Cc; Jeffle Xu Cc: Sandeep Dhavale Signed-off-by: Andrew Morton --- fs/erofs/decompressor.c | 7 ------- include/linux/lz4.h | 6 ++++++ lib/lz4/lz4_compress.c | 1 - lib/lz4/lz4_decompress.c | 1 - lib/lz4/lz4defs.h | 4 ++-- lib/lz4/lz4hc_compress.c | 1 - 6 files changed, 8 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index eb318c7ddd80..2b123b070a42 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -7,14 +7,7 @@ #include "compress.h" #include -#ifndef LZ4_DISTANCE_MAX /* history window size */ -#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ -#endif - #define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1) -#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN -#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) -#endif struct z_erofs_lz4_decompress_ctx { struct z_erofs_decompress_req *rq; diff --git a/include/linux/lz4.h b/include/linux/lz4.h index b16e15b9587a..ad6042a718b5 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -645,4 +645,10 @@ int LZ4_decompress_safe_usingDict(const char *source, char *dest, int LZ4_decompress_fast_usingDict(const char *source, char *dest, int originalSize, const char *dictStart, int dictSize); +#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) (((compressedSize) >> 8) + 32) + +#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */ +#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + #endif diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c index b0bbeeb74b9e..2a397bb2c661 100644 --- a/lib/lz4/lz4_compress.c +++ b/lib/lz4/lz4_compress.c @@ -33,7 +33,6 @@ /*-************************************ * Dependencies **************************************/ -#include #include "lz4defs.h" #include #include diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 0e31e6da5ce7..3a2cd9acada4 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -33,7 +33,6 @@ /*-************************************ * Dependencies **************************************/ -#include #include "lz4defs.h" #include #include diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h index cb358d6bde5a..17277ec16919 100644 --- a/lib/lz4/lz4defs.h +++ b/lib/lz4/lz4defs.h @@ -39,6 +39,7 @@ #include #include /* memset, memcpy */ +#include #define FORCE_INLINE __always_inline @@ -92,8 +93,7 @@ typedef uintptr_t uptrval; #define MB (1 << 20) #define GB (1U << 30) -#define MAXD_LOG 16 -#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) +#define MAX_DISTANCE LZ4_DISTANCE_MAX #define STEPSIZE sizeof(size_t) #define ML_BITS 4 diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c index bc45594ad2a8..91936dc3d14b 100644 --- a/lib/lz4/lz4hc_compress.c +++ b/lib/lz4/lz4hc_compress.c @@ -34,7 +34,6 @@ /*-************************************ * Dependencies **************************************/ -#include #include "lz4defs.h" #include #include -- cgit v1.2.3 From beeba1eeec9c75d0a62335861ac175b9bdccfbd5 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 17 Jan 2025 10:13:29 +0100 Subject: ocfs2: use str_yes_no() and str_no_yes() helper functions Remove hard-coded strings by using the str_yes_no() and str_no_yes() helper functions. 
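For reference, the two helpers behave as below (a minimal user-space sketch of their semantics; the real definitions live in the kernel's string choice helper header and are what the hunks that follow rely on):

  #include <stdbool.h>

  /* str_yes_no(v): "yes" when v is true, "no" otherwise. */
  static const char *str_yes_no(bool v)
  {
          return v ? "yes" : "no";
  }

  /* str_no_yes(v): the inverted reading, "no" when v is true. */
  static const char *str_no_yes(bool v)
  {
          return v ? "no" : "yes";
  }

With these, an expression such as list_empty(&res->purge) ? "no" : "yes" becomes str_no_yes(list_empty(&res->purge)), which is exactly the transformation the diff below performs.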
Link: https://lkml.kernel.org/r/20250117091335.1189-2-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmdebug.c | 9 +++++---- fs/ocfs2/dlm/dlmmaster.c | 12 ++++++------ fs/ocfs2/dlm/dlmrecovery.c | 7 +++---- fs/ocfs2/dlmglue.c | 3 ++- 4 files changed, 16 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index e9ef4e2b0e75..fe4fdd09bae3 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "../cluster/heartbeat.h" #include "../cluster/nodemanager.h" @@ -90,12 +91,12 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) buf, res->owner, res->state); printk(" last used: %lu, refcnt: %u, on purge list: %s\n", res->last_used, kref_read(&res->refs), - list_empty(&res->purge) ? "no" : "yes"); + str_no_yes(list_empty(&res->purge))); printk(" on dirty list: %s, on reco list: %s, " "migrating pending: %s\n", - list_empty(&res->dirty) ? "no" : "yes", - list_empty(&res->recovering) ? "no" : "yes", - res->migration_pending ? "yes" : "no"); + str_no_yes(list_empty(&res->dirty)), + str_no_yes(list_empty(&res->recovering)), + str_yes_no(res->migration_pending)); printk(" inflight locks: %d, asts reserved: %d\n", res->inflight_locks, atomic_read(&res->asts_reserved)); dlm_print_lockres_refmap(res); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index d610da8e2f24..86bb1a03bcc1 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -21,7 +21,7 @@ #include #include #include - +#include #include "../cluster/heartbeat.h" #include "../cluster/nodemanager.h" @@ -2859,7 +2859,7 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, dlm_lockres_release_ast(dlm, res); mlog(0, "about to wait on migration_wq, dirty=%s\n", - res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); + str_yes_no(res->state & DLM_LOCK_RES_DIRTY)); /* if the extra ref we just put was the final one, this * will pass thru immediately. otherwise, we need to wait * for the last ast to finish. */ @@ -2869,12 +2869,12 @@ again: msecs_to_jiffies(1000)); if (ret < 0) { mlog(0, "woken again: migrating? %s, dead? %s\n", - res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", - test_bit(target, dlm->domain_map) ? "no":"yes"); + str_yes_no(res->state & DLM_LOCK_RES_MIGRATING), + str_no_yes(test_bit(target, dlm->domain_map))); } else { mlog(0, "all is well: migrating? %s, dead? %s\n", - res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", - test_bit(target, dlm->domain_map) ? "no":"yes"); + str_yes_no(res->state & DLM_LOCK_RES_MIGRATING), + str_no_yes(test_bit(target, dlm->domain_map))); } if (!dlm_migration_can_proceed(dlm, res, target)) { mlog(0, "trying again...\n"); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 54c548ef037a..67fc62a49a76 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -22,7 +22,7 @@ #include #include #include - +#include #include "../cluster/heartbeat.h" #include "../cluster/nodemanager.h" @@ -581,8 +581,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) msecs_to_jiffies(1000)); mlog(0, "waited 1 sec for %u, " "dead? %s\n", ndata->node_num, - dlm_is_node_dead(dlm, ndata->node_num) ? 
- "yes" : "no"); + str_yes_no(dlm_is_node_dead(dlm, ndata->node_num))); } else { /* -ENOMEM on the other node */ mlog(0, "%s: node %u returned " @@ -677,7 +676,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) spin_unlock(&dlm_reco_state_lock); mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass, - all_nodes_done?"yes":"no"); + str_yes_no(all_nodes_done)); if (all_nodes_done) { int ret; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index c9b62a6d8673..92a6149da9c1 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -19,6 +19,7 @@ #include #include #include +#include #define MLOG_MASK_PREFIX ML_DLM_GLUE #include @@ -4337,7 +4338,7 @@ unqueue: ocfs2_schedule_blocked_lock(osb, lockres); mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name, - ctl.requeue ? "yes" : "no"); + str_yes_no(ctl.requeue)); spin_unlock_irqrestore(&lockres->l_lock, flags); if (ctl.unblock_action != UNBLOCK_CONTINUE -- cgit v1.2.3 From 2efbc3518fb3dad164643099fa8ce62361894243 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Jan 2025 09:23:02 -0500 Subject: bcachefs: Improve decompression error messages Ratelimit them, and use the new bch2_write_op_error() helper that prints path and file offset. Reported-by: https://github.com/koverstreet/bcachefs/issues/819 Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 31 ++++++++++++++++++++++--------- fs/bcachefs/compress.h | 4 ++-- fs/bcachefs/io_write.c | 4 ++-- fs/bcachefs/io_write.h | 2 ++ 4 files changed, 28 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index f99ff1819597..114bf2f3879f 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -4,6 +4,7 @@ #include "compress.h" #include "error.h" #include "extents.h" +#include "io_write.h" #include "opts.h" #include "super-io.h" @@ -254,11 +255,14 @@ err: goto out; } -int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, - struct bch_extent_crc_unpacked *crc) +int bch2_bio_uncompress_inplace(struct bch_write_op *op, + struct bio *bio) { + struct bch_fs *c = op->c; + struct bch_extent_crc_unpacked *crc = &op->crc; struct bbuf data = { NULL }; size_t dst_len = crc->uncompressed_size << 9; + int ret = 0; /* bio must own its pages: */ BUG_ON(!bio->bi_vcnt); @@ -266,17 +270,26 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || crc->compressed_size << 9 > c->opts.encoded_extent_max) { - bch_err(c, "error rewriting existing data: extent too big"); + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error rewriting existing data: extent too big"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); return -EIO; } data = __bounce_alloc(c, dst_len, WRITE); if (__bio_uncompress(c, bio, data.b, *crc)) { - if (!c->opts.no_data_io) - bch_err(c, "error rewriting existing data: decompression error"); - bio_unmap_or_unbounce(c, data); - return -EIO; + if (!c->opts.no_data_io) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error rewriting existing data: decompression error"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } + ret = -EIO; + goto err; } /* @@ -293,9 +306,9 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, crc->uncompressed_size = crc->live_size; crc->offset = 0; crc->csum = (struct bch_csum) { 0, 0 }; - +err: bio_unmap_or_unbounce(c, data); - return 0; + return ret; } int 
bch2_bio_uncompress(struct bch_fs *c, struct bio *src, diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 607fd5e232c9..bec2f05bfd52 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -47,8 +47,8 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } -int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, - struct bch_extent_crc_unpacked *); +struct bch_write_op; +int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, struct bvec_iter, struct bch_extent_crc_unpacked); unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 3e71860f66b9..dd508d93e9fc 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -406,7 +406,7 @@ static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); } -static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) { __bch2_write_op_error(out, op, op->pos.offset); } @@ -873,7 +873,7 @@ static enum prep_encoded_ret { if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) return PREP_ENCODED_CHECKSUM_ERR; - if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) + if (bch2_bio_uncompress_inplace(op, bio)) return PREP_ENCODED_ERR; } diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index 5400ce94ee57..b4626013abc8 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -20,6 +20,8 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); + #define BCH_WRITE_FLAGS() \ x(ALLOC_NOWAIT) \ x(CACHED) \ -- cgit v1.2.3 From 37fd6b8176520c7ebe4edf78b01f0ee0a0aab62d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Jan 2025 17:19:38 -0500 Subject: bcachefs: rebalance, copygc enabled are runtime opts Fix a regression from when these were switched to normal opts.h options. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index e763d52e0f38..a182b5d454ba 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -476,13 +476,13 @@ enum fsck_err_opts { NULL, "Enable nocow mode: enables runtime locking in\n"\ "data move path needed if nocow will ever be in use\n")\ x(copygc_enabled, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable copygc: disable for debugging, or to\n"\ "quiet the system when doing performance testing\n")\ x(rebalance_enabled, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable rebalance: disable for debugging, or to\n"\ -- cgit v1.2.3 From 0c74c85bbe84f765220492938400b9d50d75e985 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Jan 2025 18:09:32 -0500 Subject: bcachefs: fix bch2_btree_node_flags Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 672ca2c1d37d..ca755e8d1a37 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -24,7 +24,10 @@ do { \ } while (0) const char * const bch2_btree_node_flags[] = { -#define x(f) #f, + "typebit", + "typebit", + "typebit", +#define x(f) [BTREE_NODE_##f] = #f, BTREE_FLAGS() #undef x NULL -- cgit v1.2.3 From 35f5197009cae284cec9beb8b2692d2321a5bd3c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Jan 2025 19:22:50 -0500 Subject: bcachefs: Improve journal pin flushing Running the preempt tiering tests with a lower than normal journal reclaim delay turned up a shutdown hang - a lost wakeup, caused because flushing a journal pin (e.g. key cache/write buffer) can generate a new journal pin. The "simple" fix of adding the correct wakeup didn't work because of ordering issues; if we flush btree node pins too aggressively before other pins have completed, we end up spinning where each flush iteration generates new work. So to fix this correctly: - The list of flushed journal pins is now broken out by type, so that we can wait for key cache/write buffer pin flushing to complete before flushing dirty btree nodes - A new closure_waitlist is added for bch2_journal_flush_pins; this one is only used under or when we're taking the journal lock, so it's pretty cheap to add rigorously correct wakeups to journal_pin_set() and journal_pin_drop(). Additionally, bch2_journal_seq_pins_to_text() is moved to journal_reclaim.c, where it belongs, along with a bit of other small renaming and refactoring. Besides fixing the hang, the better ordering between key cache/write buffer flushing and btree node flushing should help or fix the "unmount taking excessively long" a few users have been noticing. 
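Editor's note (not part of the original patch): a compilable, simplified model of the completion check described above. The point is that "done" must account for pins that are still being flushed (already taken off the unflushed list but not yet finished), not just pins still queued, and that key cache/write buffer pins drain before btree node pins so flushing them cannot keep regenerating work. All names here are invented for illustration; the real code is in journal_reclaim.c below.

#include <stdbool.h>
#include <stdio.h>

enum pin_type { PIN_BTREE, PIN_KEY_CACHE, PIN_OTHER, PIN_NR };

struct pin_counts {
	unsigned nr_unflushed[PIN_NR];	/* queued, flush not yet started */
	unsigned nr_flushing[PIN_NR];	/* handed to a flush fn, not finished */
};

static bool pins_quiesced(const struct pin_counts *p, unsigned type_mask)
{
	for (int i = 0; i < PIN_NR; i++)
		if ((type_mask & (1U << i)) &&
		    (p->nr_unflushed[i] || p->nr_flushing[i]))
			return false;
	return true;
}

static bool journal_flush_done(const struct pin_counts *p)
{
	/* phase 1: key cache and other (write buffer) pins must drain first */
	if (!pins_quiesced(p, (1U << PIN_KEY_CACHE) | (1U << PIN_OTHER)))
		return false;
	/* phase 2: only then do dirty btree node pins count */
	return pins_quiesced(p, 1U << PIN_BTREE);
}

int main(void)
{
	struct pin_counts p = { .nr_flushing = { [PIN_KEY_CACHE] = 1 } };

	printf("done: %s\n", journal_flush_done(&p) ? "yes" : "no"); /* no */
	p.nr_flushing[PIN_KEY_CACHE] = 0;
	printf("done: %s\n", journal_flush_done(&p) ? "yes" : "no"); /* yes */
	return 0;
}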
Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 1 + fs/bcachefs/journal.c | 60 ++---------------- fs/bcachefs/journal.h | 2 - fs/bcachefs/journal_reclaim.c | 142 +++++++++++++++++++++++++++++++++++------- fs/bcachefs/journal_reclaim.h | 3 + fs/bcachefs/journal_types.h | 13 ++-- 6 files changed, 136 insertions(+), 85 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index b5de52a50d10..55333e82d1fe 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -20,6 +20,7 @@ #include "extents.h" #include "fsck.h" #include "inode.h" +#include "journal_reclaim.h" #include "super.h" #include diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 46d53d3ba018..cb2c3722f674 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -113,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq) static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) { - unsigned i; - - for (i = 0; i < ARRAY_SIZE(p->list); i++) - INIT_LIST_HEAD(&p->list[i]); - INIT_LIST_HEAD(&p->flushed); + for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++) + INIT_LIST_HEAD(&p->unflushed[i]); + for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++) + INIT_LIST_HEAD(&p->flushed[i]); atomic_set(&p->count, count); p->devs.nr = 0; } @@ -1626,54 +1625,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) __bch2_journal_debug_to_text(out, j); spin_unlock(&j->lock); } - -bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *pin; - - spin_lock(&j->lock); - if (!test_bit(JOURNAL_running, &j->flags)) { - spin_unlock(&j->lock); - return true; - } - - *seq = max(*seq, j->pin.front); - - if (*seq >= j->pin.back) { - spin_unlock(&j->lock); - return true; - } - - out->atomic++; - - pin_list = journal_seq_pin(j, *seq); - - prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); - printbuf_indent_add(out, 2); - - for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) - list_for_each_entry(pin, &pin_list->list[i], list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - if (!list_empty(&pin_list->flushed)) - prt_printf(out, "flushed:\n"); - - list_for_each_entry(pin, &pin_list->flushed, list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - printbuf_indent_sub(out, 2); - - --out->atomic; - spin_unlock(&j->lock); - - return false; -} - -void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) -{ - u64 seq = 0; - - while (!bch2_journal_seq_pins_to_text(out, j, &seq)) - seq++; -} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index a01dae1a57e3..dccddd5420ad 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -430,8 +430,6 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -void bch2_journal_pins_to_text(struct printbuf *, struct journal *); -bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3c8242606da7..6a9cefb635d6 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -327,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j) popped = true; } - if (popped) + if 
(popped) { bch2_journal_space_available(j); + __closure_wake_up(&j->reclaim_flush_wait); + } } bool __bch2_journal_pin_put(struct journal *j, u64 seq) @@ -362,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j, pin->seq = 0; list_del_init(&pin->list); + if (j->reclaim_flush_wait.list.first) + __closure_wake_up(&j->reclaim_flush_wait); + /* * Unpinning a journal entry may make journal_next_bucket() succeed, if * writing a new last_seq will now make another bucket available: @@ -383,11 +388,11 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) { if (fn == bch2_btree_node_flush0 || fn == bch2_btree_node_flush1) - return JOURNAL_PIN_btree; + return JOURNAL_PIN_TYPE_btree; else if (fn == bch2_btree_key_cache_journal_flush) - return JOURNAL_PIN_key_cache; + return JOURNAL_PIN_TYPE_key_cache; else - return JOURNAL_PIN_other; + return JOURNAL_PIN_TYPE_other; } static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, @@ -406,7 +411,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, atomic_inc(&pin_list->count); pin->seq = seq; pin->flush = flush_fn; - list_add(&pin->list, &pin_list->list[type]); + + if (list_empty(&pin_list->unflushed[type]) && + j->reclaim_flush_wait.list.first) + __closure_wake_up(&j->reclaim_flush_wait); + + list_add(&pin->list, &pin_list->unflushed[type]); } void bch2_journal_pin_copy(struct journal *j, @@ -499,16 +509,15 @@ journal_get_next_pin(struct journal *j, { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; - unsigned i; fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { if (*seq > seq_to_flush && !allowed_above_seq) break; - for (i = 0; i < JOURNAL_PIN_NR; i++) - if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || - ((1U << i) & allowed_above_seq)) { - ret = list_first_entry_or_null(&pin_list->list[i], + for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) + if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) || + (BIT(i) & allowed_above_seq)) { + ret = list_first_entry_or_null(&pin_list->unflushed[i], struct journal_entry_pin, list); if (ret) return ret; @@ -544,8 +553,8 @@ static size_t journal_flush_pins(struct journal *j, } if (min_key_cache) { - allowed_above |= 1U << JOURNAL_PIN_key_cache; - allowed_below |= 1U << JOURNAL_PIN_key_cache; + allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache); + allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache); } cond_resched(); @@ -553,7 +562,9 @@ static size_t journal_flush_pins(struct journal *j, j->last_flushed = jiffies; spin_lock(&j->lock); - pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); + pin = journal_get_next_pin(j, seq_to_flush, + allowed_below, + allowed_above, &seq); if (pin) { BUG_ON(j->flush_in_progress); j->flush_in_progress = pin; @@ -576,7 +587,7 @@ static size_t journal_flush_pins(struct journal *j, spin_lock(&j->lock); /* Pin might have been dropped or rearmed: */ if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]); j->flush_in_progress = NULL; j->flush_in_progress_dropped = false; spin_unlock(&j->lock); @@ -816,10 +827,41 @@ int bch2_journal_reclaim_start(struct journal *j) return 0; } +static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, + unsigned types) +{ + struct journal_entry_pin_list *pin_list; + u64 seq; + + spin_lock(&j->lock); + fifo_for_each_entry_ptr(pin_list, &j->pin, 
seq) { + if (seq > seq_to_flush) + break; + + for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) + if ((BIT(i) & types) && + (!list_empty(&pin_list->unflushed[i]) || + !list_empty(&pin_list->flushed[i]))) { + spin_unlock(&j->lock); + return true; + } + } + spin_unlock(&j->lock); + + return false; +} + +static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush, + unsigned types) +{ + return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) || + journal_pins_still_flushing(j, seq_to_flush, types); +} + static int journal_flush_done(struct journal *j, u64 seq_to_flush, bool *did_work) { - int ret; + int ret = 0; ret = bch2_journal_error(j); if (ret) @@ -827,12 +869,18 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - if (journal_flush_pins(j, seq_to_flush, - (1U << JOURNAL_PIN_key_cache)| - (1U << JOURNAL_PIN_other), 0, 0, 0) || - journal_flush_pins(j, seq_to_flush, - (1U << JOURNAL_PIN_btree), 0, 0, 0)) + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, + BIT(JOURNAL_PIN_TYPE_key_cache)| + BIT(JOURNAL_PIN_TYPE_other))) { + *did_work = true; + goto unlock; + } + + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, + BIT(JOURNAL_PIN_TYPE_btree))) { *did_work = true; + goto unlock; + } if (seq_to_flush > journal_cur_seq(j)) bch2_journal_entry_close(j); @@ -847,6 +895,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, !fifo_used(&j->pin); spin_unlock(&j->lock); +unlock: mutex_unlock(&j->reclaim_lock); return ret; @@ -860,7 +909,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) if (!test_bit(JOURNAL_running, &j->flags)) return false; - closure_wait_event(&j->async_wait, + closure_wait_event(&j->reclaim_flush_wait, journal_flush_done(j, seq_to_flush, &did_work)); return did_work; @@ -926,3 +975,54 @@ err: return ret; } + +bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; + + spin_lock(&j->lock); + if (!test_bit(JOURNAL_running, &j->flags)) { + spin_unlock(&j->lock); + return true; + } + + *seq = max(*seq, j->pin.front); + + if (*seq >= j->pin.back) { + spin_unlock(&j->lock); + return true; + } + + out->atomic++; + + pin_list = journal_seq_pin(j, *seq); + + prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); + printbuf_indent_add(out, 2); + + prt_printf(out, "unflushed:\n"); + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++) + list_for_each_entry(pin, &pin_list->unflushed[i], list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); + + prt_printf(out, "flushed:\n"); + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++) + list_for_each_entry(pin, &pin_list->flushed[i], list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); + + printbuf_indent_sub(out, 2); + + --out->atomic; + spin_unlock(&j->lock); + + return false; +} + +void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +{ + u64 seq = 0; + + while (!bch2_journal_seq_pins_to_text(out, j, &seq)) + seq++; +} diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index ec84c3345281..0a73d7134e1c 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j) int bch2_journal_flush_device_pins(struct journal *, int); +void bch2_journal_pins_to_text(struct printbuf *, struct journal *); +bool 
bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); + #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index e9bd716fbb71..3ba433a48eb8 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -53,15 +53,15 @@ struct journal_buf { */ enum journal_pin_type { - JOURNAL_PIN_btree, - JOURNAL_PIN_key_cache, - JOURNAL_PIN_other, - JOURNAL_PIN_NR, + JOURNAL_PIN_TYPE_btree, + JOURNAL_PIN_TYPE_key_cache, + JOURNAL_PIN_TYPE_other, + JOURNAL_PIN_TYPE_NR, }; struct journal_entry_pin_list { - struct list_head list[JOURNAL_PIN_NR]; - struct list_head flushed; + struct list_head unflushed[JOURNAL_PIN_TYPE_NR]; + struct list_head flushed[JOURNAL_PIN_TYPE_NR]; atomic_t count; struct bch_devs_list devs; }; @@ -226,6 +226,7 @@ struct journal { /* Used when waiting because the journal was full */ wait_queue_head_t wait; struct closure_waitlist async_wait; + struct closure_waitlist reclaim_flush_wait; struct delayed_work write_work; struct workqueue_struct *wq; -- cgit v1.2.3 From 30cee1e4861b59200aa09c94a4d789c461e5f408 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Mon, 30 Dec 2024 15:40:43 +0530 Subject: lib/list_debug.c: add object information in case of invalid object As of now, during linked list corruption it prints the culprit address and its wrong value, but sometimes that is not enough to catch the actual issue point. If it also prints the allocation and free paths of the corrupted node, it becomes a lot easier to find and fix the issue. Add the same information when a data mismatch is found in the linked list debug data: [ 14.243055] slab kmalloc-32 start ffff0000cda19320 data offset 32 pointer offset 8 size 32 allocated at add_to_list+0x28/0xb0 [ 14.245259] __kmalloc_cache_noprof+0x1c4/0x358 [ 14.245572] add_to_list+0x28/0xb0 ... [ 14.248632] do_el0_svc_compat+0x1c/0x34 [ 14.249018] el0_svc_compat+0x2c/0x80 [ 14.249244] Free path: [ 14.249410] kfree+0x24c/0x2f0 [ 14.249724] do_force_corruption+0xbc/0x100 ... [ 14.252266] el0_svc_common.constprop.0+0x40/0xe0 [ 14.252540] do_el0_svc_compat+0x1c/0x34 [ 14.252763] el0_svc_compat+0x2c/0x80 [ 14.253071] ------------[ cut here ]------------ [ 14.253303] list_del corruption. next->prev should be ffff0000cda192a8, but was 6b6b6b6b6b6b6b6b. (next=ffff0000cda19348) [ 14.254255] WARNING: CPU: 3 PID: 84 at lib/list_debug.c:65 __list_del_entry_valid_or_report+0x158/0x164 Moved prototype of mem_dump_obj() to bug.h, as mm.h cannot be included in bug.h.
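Editor's note (not part of the original patch): the shape of the interface change is that the corruption check now carries the suspect pointer, so a dump helper can report where that object was allocated and freed before the warning fires. A minimal userspace stand-in with hypothetical names (the real helper is mem_dump_obj() and the real macro is CHECK_DATA_CORRUPTION()):

#include <stdbool.h>
#include <stdio.h>

/* stand-in for mem_dump_obj(): print provenance of the suspect object */
static void dump_obj(const void *obj)
{
	if (obj)
		fprintf(stderr, "object %p: <allocation/free backtrace here>\n", obj);
}

/* simplified corruption check: note the extra 'addr' argument */
#define CHECK_CORRUPTION(cond, addr, fmt, ...)				\
	((cond) ? (dump_obj(addr),					\
		   fprintf(stderr, fmt, ##__VA_ARGS__), true) : false)

int main(void)
{
	struct { void *prev, *next; } node = { NULL, &node };

	if (CHECK_CORRUPTION(node.prev == NULL, node.next,
			     "list_add corruption. prev is NULL.\n"))
		return 1;
	return 0;
}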
Link: https://lkml.kernel.org/r/20241230101043.53773-1-maninder1.s@samsung.com Signed-off-by: Maninder Singh Acked-by: Jan Kara Cc: Al Viro Cc: Christian Brauner Cc: Marco Elver Cc: Rohit Thapliyal Signed-off-by: Andrew Morton --- fs/open.c | 2 +- fs/super.c | 2 +- include/linux/bug.h | 10 +++++++++- include/linux/mm.h | 6 ------ lib/list_debug.c | 22 +++++++++++----------- 5 files changed, 22 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index e6911101fe71..0f75e220b700 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1504,7 +1504,7 @@ static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; - if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, + if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, filp, "VFS: Close: file count is 0 (f_op=%ps)", filp->f_op)) { return 0; diff --git a/fs/super.c b/fs/super.c index c9c7223bc2a2..5a7db4a556e3 100644 --- a/fs/super.c +++ b/fs/super.c @@ -647,7 +647,7 @@ void generic_shutdown_super(struct super_block *sb) */ fscrypt_destroy_keyring(sb); - if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), + if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), NULL, "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { /* diff --git a/include/linux/bug.h b/include/linux/bug.h index 348acf2558f3..a9948a9f1093 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -73,15 +73,23 @@ static inline void generic_bug_clear_once(void) {} #endif /* CONFIG_GENERIC_BUG */ +#ifdef CONFIG_PRINTK +void mem_dump_obj(void *object); +#else +static inline void mem_dump_obj(void *object) {} +#endif + /* * Since detected data corruption should stop operation on the affected * structures. Return value must be checked and sanely acted on by caller. */ static inline __must_check bool check_data_corruption(bool v) { return v; } -#define CHECK_DATA_CORRUPTION(condition, fmt, ...) \ +#define CHECK_DATA_CORRUPTION(condition, addr, fmt, ...) \ check_data_corruption(({ \ bool corruption = unlikely(condition); \ if (corruption) { \ + if (addr) \ + mem_dump_obj(addr); \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ pr_err(fmt, ##__VA_ARGS__); \ BUG(); \ diff --git a/include/linux/mm.h b/include/linux/mm.h index 15a903d59d09..c550912a5d6d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4084,12 +4084,6 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, extern int sysctl_nr_trim_pages; -#ifdef CONFIG_PRINTK -void mem_dump_obj(void *object); -#else -static inline void mem_dump_obj(void *object) {} -#endif - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, diff --git a/lib/list_debug.c b/lib/list_debug.c index db602417febf..ee7eeeb8f92c 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -22,17 +22,17 @@ __list_valid_slowpath bool __list_add_valid_or_report(struct list_head *new, struct list_head *prev, struct list_head *next) { - if (CHECK_DATA_CORRUPTION(prev == NULL, + if (CHECK_DATA_CORRUPTION(prev == NULL, NULL, "list_add corruption. prev is NULL.\n") || - CHECK_DATA_CORRUPTION(next == NULL, + CHECK_DATA_CORRUPTION(next == NULL, NULL, "list_add corruption. next is NULL.\n") || - CHECK_DATA_CORRUPTION(next->prev != prev, + CHECK_DATA_CORRUPTION(next->prev != prev, next, "list_add corruption. next->prev should be prev (%px), but was %px. (next=%px).\n", prev, next->prev, next) || - CHECK_DATA_CORRUPTION(prev->next != next, + CHECK_DATA_CORRUPTION(prev->next != next, prev, "list_add corruption. 
prev->next should be next (%px), but was %px. (prev=%px).\n", next, prev->next, prev) || - CHECK_DATA_CORRUPTION(new == prev || new == next, + CHECK_DATA_CORRUPTION(new == prev || new == next, NULL, "list_add double add: new=%px, prev=%px, next=%px.\n", new, prev, next)) return false; @@ -49,20 +49,20 @@ bool __list_del_entry_valid_or_report(struct list_head *entry) prev = entry->prev; next = entry->next; - if (CHECK_DATA_CORRUPTION(next == NULL, + if (CHECK_DATA_CORRUPTION(next == NULL, NULL, "list_del corruption, %px->next is NULL\n", entry) || - CHECK_DATA_CORRUPTION(prev == NULL, + CHECK_DATA_CORRUPTION(prev == NULL, NULL, "list_del corruption, %px->prev is NULL\n", entry) || - CHECK_DATA_CORRUPTION(next == LIST_POISON1, + CHECK_DATA_CORRUPTION(next == LIST_POISON1, next, "list_del corruption, %px->next is LIST_POISON1 (%px)\n", entry, LIST_POISON1) || - CHECK_DATA_CORRUPTION(prev == LIST_POISON2, + CHECK_DATA_CORRUPTION(prev == LIST_POISON2, prev, "list_del corruption, %px->prev is LIST_POISON2 (%px)\n", entry, LIST_POISON2) || - CHECK_DATA_CORRUPTION(prev->next != entry, + CHECK_DATA_CORRUPTION(prev->next != entry, prev, "list_del corruption. prev->next should be %px, but was %px. (prev=%px)\n", entry, prev->next, prev) || - CHECK_DATA_CORRUPTION(next->prev != entry, + CHECK_DATA_CORRUPTION(next->prev != entry, next, "list_del corruption. next->prev should be %px, but was %px. (next=%px)\n", entry, next->prev, next)) return false; -- cgit v1.2.3 From 30cef82bc6e8975a360ec05b707f7fb194c875ed Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 7 Jan 2025 15:39:58 -0500 Subject: mm/hugetlb: rename avoid_reserve to cow_from_owner The old name "avoid_reserve" can be too generic and can be used wrongly in the new call sites that want to allocate a hugetlb folio. It's confusing on two things: (1) whether one can opt-in to avoid global reservation, and (2) whether it should take more than one count. In reality, this flag is only used in an extremely hacky path, in an extremely hacky way in hugetlb CoW path only, and always use with 1 saying "skip global reservation". Rename the flag to avoid future abuse of this flag, making it a boolean so as to reflect its true representation that it's not a counter. To make it even harder to abuse, add a comment above the function to explain it. Link: https://lkml.kernel.org/r/20250107204002.2683356-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Oscar Salvador Cc: Ackerley Tng Cc: Breno Leitao Cc: Muchun Song Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Roman Gushchin Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 33 ++++++++++++++++++++------------- 3 files changed, 23 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 62fb0cbc93ab..0fc179a59830 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -814,7 +814,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. 
*/ - folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, false); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); error = PTR_ERR(folio); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 10faf42ca96a..49ec2362ce92 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -683,7 +683,7 @@ struct huge_bootmem_page { int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve); + unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback); @@ -1068,7 +1068,7 @@ static inline int replace_free_hugepage_folios(unsigned long start_pfn, static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, - int avoid_reserve) + bool cow_from_owner) { return NULL; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7df19b5f956c..e59b734b2c95 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3012,8 +3012,15 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) return ret; } +/* + * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW + * faults of hugetlb private mappings on top of a non-page-cache folio (in + * which case even if there's a private vma resv map it won't cover such + * allocation). New call sites should (probably) never set it to true!! + * When it's set, the allocation will bypass all vma level reservations. + */ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) + unsigned long addr, bool cow_from_owner) { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); @@ -3042,7 +3049,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * Allocations for MAP_NORESERVE mappings also need to be * checked against any subpool limit. */ - if (map_chg || avoid_reserve) { + if (map_chg || cow_from_owner) { gbl_chg = hugepage_subpool_get_pages(spool, 1); if (gbl_chg < 0) goto out_end_reservation; @@ -3050,7 +3057,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, /* If this allocation is not consuming a reservation, charge it now. 
*/ - deferred_reserve = map_chg || avoid_reserve; + deferred_reserve = map_chg || cow_from_owner; if (deferred_reserve) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); @@ -3075,7 +3082,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); - if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { + if (!cow_from_owner && vma_has_reserves(vma, gbl_chg)) { folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } @@ -3142,7 +3149,7 @@ out_uncharge_cgroup_reservation: hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg || avoid_reserve) + if (map_chg || cow_from_owner) hugepage_subpool_put_pages(spool, 1); out_end_reservation: vma_end_reservation(h, vma, addr); @@ -5373,7 +5380,7 @@ again: spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new_folio = alloc_hugetlb_folio(dst_vma, addr, 0); + new_folio = alloc_hugetlb_folio(dst_vma, addr, false); if (IS_ERR(new_folio)) { folio_put(pte_folio); ret = PTR_ERR(new_folio); @@ -5839,7 +5846,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; - int outside_reserve = 0; + bool cow_from_owner = 0; vm_fault_t ret = 0; struct mmu_notifier_range range; @@ -5902,7 +5909,7 @@ retry_avoidcopy: */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_folio != pagecache_folio) - outside_reserve = 1; + cow_from_owner = true; folio_get(old_folio); @@ -5911,7 +5918,7 @@ retry_avoidcopy: * be acquired again before returning to the caller, as expected. */ spin_unlock(vmf->ptl); - new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve); + new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner); if (IS_ERR(new_folio)) { /* @@ -5921,7 +5928,7 @@ retry_avoidcopy: * reliability, unmap the page from child processes. The child * may get SIGKILLed if it later faults. */ - if (outside_reserve) { + if (cow_from_owner) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx; u32 hash; @@ -6172,7 +6179,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, goto out; } - folio = alloc_hugetlb_folio(vma, vmf->address, 0); + folio = alloc_hugetlb_folio(vma, vmf->address, false); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being @@ -6638,7 +6645,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { ret = -ENOMEM; goto out; @@ -6680,7 +6687,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { folio_put(*foliop); ret = -ENOMEM; -- cgit v1.2.3 From 6bf9b5b40af373690313f64a3935b2bf2e5d46d9 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 23 Dec 2024 17:00:38 -0500 Subject: mm: alloc_pages_bulk: rename API The previous commit removed the page_list argument from alloc_pages_bulk_noprof() along with the alloc_pages_bulk_list() function. 
Now that only the *_array() flavour of the API remains, we can do the following renaming (along with the _noprof() ones): alloc_pages_bulk_array -> alloc_pages_bulk alloc_pages_bulk_array_mempolicy -> alloc_pages_bulk_mempolicy alloc_pages_bulk_array_node -> alloc_pages_bulk_node Link: https://lkml.kernel.org/r/275a3bbc0be20fbe9002297d60045e67ab3d4ada.1734991165.git.luizcap@redhat.com Signed-off-by: Luiz Capitulino Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- drivers/staging/media/atomisp/pci/hmm/hmm_bo.c | 4 ++-- drivers/vfio/pci/mlx5/cmd.c | 14 +++++++------- drivers/vfio/pci/virtio/migrate.c | 6 +++--- fs/btrfs/extent_io.c | 2 +- fs/erofs/zutil.c | 4 ++-- fs/splice.c | 2 +- fs/xfs/xfs_buf.c | 4 ++-- include/linux/gfp.h | 14 +++++++------- kernel/bpf/arena.c | 2 +- lib/alloc_tag.c | 4 ++-- lib/kunit_iov_iter.c | 2 +- lib/test_vmalloc.c | 2 +- mm/mempolicy.c | 14 +++++++------- mm/vmalloc.c | 4 ++-- net/core/page_pool.c | 7 +++---- net/sunrpc/svc.c | 4 ++-- net/sunrpc/svc_xprt.c | 3 +-- 17 files changed, 45 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c index 07ed33464d71..224ca8d42721 100644 --- a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c +++ b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c @@ -624,10 +624,10 @@ static int alloc_private_pages(struct hmm_buffer_object *bo) const gfp_t gfp = __GFP_NOWARN | __GFP_RECLAIM | __GFP_FS; int ret; - ret = alloc_pages_bulk_array(gfp, bo->pgnr, bo->pages); + ret = alloc_pages_bulk(gfp, bo->pgnr, bo->pages); if (ret != bo->pgnr) { free_pages_bulk_array(ret, bo->pages); - dev_err(atomisp_dev, "alloc_pages_bulk_array() failed\n"); + dev_err(atomisp_dev, "alloc_pages_bulk() failed\n"); return -ENOMEM; } diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index eb7387ee6ebd..11eda6b207f1 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -408,7 +408,7 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) buf->dma_dir, 0); } - /* Undo alloc_pages_bulk_array() */ + /* Undo alloc_pages_bulk() */ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) __free_page(sg_page_iter_page(&sg_iter)); sg_free_append_table(&buf->table); @@ -431,8 +431,8 @@ static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, return -ENOMEM; do { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, - page_list); + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill, + page_list); if (!filled) { ret = -ENOMEM; goto err; @@ -1342,7 +1342,7 @@ static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) { int i; - /* Undo alloc_pages_bulk_array() */ + /* Undo alloc_pages_bulk() */ for (i = 0; i < recv_buf->npages; i++) __free_page(recv_buf->page_list[i]); @@ -1361,9 +1361,9 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, return -ENOMEM; for (;;) { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, - npages - done, - recv_buf->page_list + done); + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, + npages - done, + recv_buf->page_list + done); if (!filled) goto err; diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c index ee54f4c17857..ba92bb4e9af9 100644 --- a/drivers/vfio/pci/virtio/migrate.c +++ b/drivers/vfio/pci/virtio/migrate.c @@ -77,8 +77,8 @@ static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf, return -ENOMEM; do { - filled = 
alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, - page_list); + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill, + page_list); if (!filled) { ret = -ENOMEM; goto err; @@ -112,7 +112,7 @@ static void virtiovf_free_data_buffer(struct virtiovf_data_buffer *buf) { struct sg_page_iter sg_iter; - /* Undo alloc_pages_bulk_array() */ + /* Undo alloc_pages_bulk() */ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) __free_page(sg_page_iter_page(&sg_iter)); sg_free_append_table(&buf->table); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b923d0cec61c..d70e9461fea8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -632,7 +632,7 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array); + allocated = alloc_pages_bulk(gfp, nr_pages, page_array); if (unlikely(allocated == last)) { /* No progress, fail and do cleanup. */ for (int i = 0; i < allocated; i++) { diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 0dd65cefce33..9c5aa9d53682 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -87,8 +87,8 @@ int z_erofs_gbuf_growsize(unsigned int nrpages) tmp_pages[j] = gbuf->pages[j]; do { last = j; - j = alloc_pages_bulk_array(GFP_KERNEL, nrpages, - tmp_pages); + j = alloc_pages_bulk(GFP_KERNEL, nrpages, + tmp_pages); if (last == j) goto out; } while (j != nrpages); diff --git a/fs/splice.c b/fs/splice.c index 2898fa1e9e63..28cfa63aa236 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -342,7 +342,7 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, return -ENOMEM; pages = (struct page **)(bv + npages); - npages = alloc_pages_bulk_array(GFP_USER, npages, pages); + npages = alloc_pages_bulk(GFP_USER, npages, pages); if (!npages) { kfree(bv); return -ENOMEM; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index aa63b8efd782..82db3ab0e8b4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -395,8 +395,8 @@ xfs_buf_alloc_pages( for (;;) { long last = filled; - filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count, - bp->b_pages); + filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, + bp->b_pages); if (filled == bp->b_page_count) { XFS_STATS_INC(bp->b_mount, xb_page_found); break; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f8b33c5e7a14..6bb1a5a7a4ae 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -215,18 +215,18 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) -unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, +unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); -#define alloc_pages_bulk_array_mempolicy(...) \ - alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__)) +#define alloc_pages_bulk_mempolicy(...) 
\ + alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ -#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \ +#define alloc_pages_bulk(_gfp, _nr_pages, _page_array) \ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long -alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, +alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) { if (nid == NUMA_NO_NODE) @@ -235,8 +235,8 @@ alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } -#define alloc_pages_bulk_array_node(...) \ - alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__)) +#define alloc_pages_bulk_node(...) \ + alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__)) static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) { diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 945a5680f6a5..9927cd4c9e0e 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -443,7 +443,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } - /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */ + /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */ pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL); if (!pages) return 0; diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 7dcebf118a3e..4bb778be4476 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -420,8 +420,8 @@ static int vm_module_tags_populate(void) unsigned long nr; more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT; - nr = alloc_pages_bulk_array_node(GFP_KERNEL | __GFP_NOWARN, - NUMA_NO_NODE, more_pages, next_page); + nr = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN, + NUMA_NO_NODE, more_pages, next_page); if (nr < more_pages || vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL, next_page, PAGE_SHIFT) < 0) { diff --git a/lib/kunit_iov_iter.c b/lib/kunit_iov_iter.c index 13e15687675a..830bf3eca4c2 100644 --- a/lib/kunit_iov_iter.c +++ b/lib/kunit_iov_iter.c @@ -57,7 +57,7 @@ static void *__init iov_kunit_create_buffer(struct kunit *test, KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pages); *ppages = pages; - got = alloc_pages_bulk_array(GFP_KERNEL, npages, pages); + got = alloc_pages_bulk(GFP_KERNEL, npages, pages); if (got != npages) { release_pages(pages, got); KUNIT_ASSERT_EQ(test, got, npages); diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 4ddf769861ff..f585949ff696 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -373,7 +373,7 @@ vm_map_ram_test(void) if (!pages) return -1; - nr_allocated = alloc_pages_bulk_array(GFP_KERNEL, map_nr_pages, pages); + nr_allocated = alloc_pages_bulk(GFP_KERNEL, map_nr_pages, pages); if (nr_allocated != map_nr_pages) goto cleanup; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0da6cf950f7b..f83b73236ffe 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2372,7 +2372,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) } EXPORT_SYMBOL(folio_alloc_noprof); -static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, +static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2407,7 +2407,7 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, return total_allocated; } -static unsigned 
long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, +static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2522,7 +2522,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, return total_allocated; } -static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, +static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2548,7 +2548,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, * It can accelerate memory allocation especially interleaving * allocate memory. */ -unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, +unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; @@ -2559,15 +2559,15 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) - return alloc_pages_bulk_array_interleave(gfp, pol, + return alloc_pages_bulk_interleave(gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) - return alloc_pages_bulk_array_weighted_interleave( + return alloc_pages_bulk_weighted_interleave( gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_PREFERRED_MANY) - return alloc_pages_bulk_array_preferred_many(gfp, + return alloc_pages_bulk_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5c88d0e90c20..a6e7acebe9ad 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3562,11 +3562,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolicy wants to alloc memory by interleaving. 
*/ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy_noprof(gfp, + nr = alloc_pages_bulk_mempolicy_noprof(gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node_noprof(gfp, nid, + nr = alloc_pages_bulk_node_noprof(gfp, nid, nr_pages_request, pages + nr_allocated); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f89cf93f6eb4..8a91c1972dc5 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -532,12 +532,11 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, if (unlikely(pool->alloc.count > 0)) return pool->alloc.cache[--pool->alloc.count]; - /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ + /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); - nr_pages = alloc_pages_bulk_array_node(gfp, - pool->p.nid, bulk, - (struct page **)pool->alloc.cache); + nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, + (struct page **)pool->alloc.cache); if (unlikely(!nr_pages)) return 0; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 79879b7d39cb..e7f9c295d13c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -651,8 +651,8 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) if (pages > RPCSVC_MAXPAGES) pages = RPCSVC_MAXPAGES; - ret = alloc_pages_bulk_array_node(GFP_KERNEL, node, pages, - rqstp->rq_pages); + ret = alloc_pages_bulk_node(GFP_KERNEL, node, pages, + rqstp->rq_pages); return ret == pages; } diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 43c57124de52..aebc0d8ddff5 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -671,8 +671,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) } for (filled = 0; filled < pages; filled = ret) { - ret = alloc_pages_bulk_array(GFP_KERNEL, pages, - rqstp->rq_pages); + ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); if (ret > filled) /* Made progress, don't sleep yet */ continue; -- cgit v1.2.3 From 27701521beb5897d6b97e2f8c20de41e74cbcb7b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:24 +0800 Subject: mm, swap: clean up device availability check Remove highest_bit and lowest_bit. After the HDD allocation path has been removed, the only purpose of these two fields is to determine whether the device is full or not, which can instead be determined by checking the inuse_pages. 
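Editor's note (not part of the original patch): the check this cleanup consolidates on, shown in isolation. A swap device is full exactly when every usable page is in use, so the lowest_bit/highest_bit bookkeeping is redundant for that question. The struct below is a minimal stand-in for swap_info_struct, keeping only the two fields involved.

#include <stdbool.h>
#include <stdio.h>

struct swap_dev {
	unsigned int pages;		/* total usable pages of swap */
	unsigned int inuse_pages;	/* pages currently allocated */
};

static bool swap_dev_full(const struct swap_dev *si)
{
	return si->inuse_pages == si->pages;
}

int main(void)
{
	struct swap_dev si = { .pages = 4, .inuse_pages = 3 };

	printf("full: %s\n", swap_dev_full(&si) ? "yes" : "no");	/* no */
	si.inuse_pages++;
	printf("full: %s\n", swap_dev_full(&si) ? "yes" : "no");	/* yes */
	return 0;
}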
Link: https://lkml.kernel.org/r/20250113175732.48099-6-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- fs/btrfs/inode.c | 1 - fs/f2fs/data.c | 1 - fs/iomap/swapfile.c | 1 - include/linux/swap.h | 2 -- mm/page_io.c | 1 - mm/swapfile.c | 38 ++++++++------------------------------ 6 files changed, 8 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 27b2fe7f735d..3b99b1e19371 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10110,7 +10110,6 @@ out_unlock_mmap: *span = bsi.highest_ppage - bsi.lowest_ppage + 1; sis->max = bsi.nr_pages; sis->pages = bsi.nr_pages - 1; - sis->highest_bit = bsi.nr_pages - 1; return bsi.nr_extents; } #else diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a2478c2afb3a..a9eddd782dbc 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4043,7 +4043,6 @@ retry: cur_lblock = 1; /* force Empty message */ sis->max = cur_lblock; sis->pages = cur_lblock - 1; - sis->highest_bit = cur_lblock - 1; out: if (not_aligned) f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)", diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index 5fc0ac36dee3..b90d0eda9e51 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -189,7 +189,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; sis->max = isi.nr_pages; sis->pages = isi.nr_pages - 1; - sis->highest_bit = isi.nr_pages - 1; return isi.nr_extents; } EXPORT_SYMBOL_GPL(iomap_swapfile_activate); diff --git a/include/linux/swap.h b/include/linux/swap.h index 3a71198a6957..c0d49dad7a4b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -305,8 +305,6 @@ struct swap_info_struct { struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; - unsigned int lowest_bit; /* index of first free in swap_map */ - unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ unsigned int inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ diff --git a/mm/page_io.c b/mm/page_io.c index 4b4ea8e49cf6..9b983de351f9 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -163,7 +163,6 @@ reprobe: page_no = 1; /* force Empty message */ sis->max = page_no; sis->pages = page_no - 1; - sis->highest_bit = page_no - 1; out: return ret; bad_bmap: diff --git a/mm/swapfile.c b/mm/swapfile.c index 83ebc24cc94b..2686032d3510 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -55,7 +55,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, static void free_swap_count_continuations(struct swap_info_struct *); static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, unsigned int nr_pages); -static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, +static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, @@ -650,7 +650,7 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster } memset(si->swap_map + 
start, usage, nr_pages); - swap_range_alloc(si, start, nr_pages); + swap_range_alloc(si, nr_pages); ci->count += nr_pages; if (ci->count == SWAPFILE_CLUSTER) { @@ -888,19 +888,11 @@ static void del_from_avail_list(struct swap_info_struct *si) spin_unlock(&swap_avail_lock); } -static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, +static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries) { - unsigned int end = offset + nr_entries - 1; - - if (offset == si->lowest_bit) - si->lowest_bit += nr_entries; - if (end == si->highest_bit) - WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries); if (si->inuse_pages == si->pages) { - si->lowest_bit = si->max; - si->highest_bit = 0; del_from_avail_list(si); if (vm_swap_full()) @@ -933,15 +925,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, for (i = 0; i < nr_entries; i++) clear_bit(offset + i, si->zeromap); - if (offset < si->lowest_bit) - si->lowest_bit = offset; - if (end > si->highest_bit) { - bool was_full = !si->highest_bit; - - WRITE_ONCE(si->highest_bit, end); - if (was_full && (si->flags & SWP_WRITEOK)) - add_to_avail_list(si); - } + if (si->inuse_pages == si->pages) + add_to_avail_list(si); if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; @@ -1051,15 +1036,12 @@ start_over: plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); - if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { + if ((si->inuse_pages == si->pages) || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); if (plist_node_empty(&si->avail_lists[node])) { spin_unlock(&si->lock); goto nextsi; } - WARN(!si->highest_bit, - "swap_info %d in list but !highest_bit\n", - si->type); WARN(!(si->flags & SWP_WRITEOK), "swap_info %d in list but !SWP_WRITEOK\n", si->type); @@ -2441,8 +2423,8 @@ static void _enable_swap_info(struct swap_info_struct *si) */ plist_add(&si->list, &swap_active_head); - /* add to available list iff swap device is not full */ - if (si->highest_bit) + /* add to available list if swap device is not full */ + if (si->inuse_pages < si->pages) add_to_avail_list(si); } @@ -2606,7 +2588,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) drain_mmlist(); /* wait for anyone still in scan_swap_map_slots */ - p->highest_bit = 0; /* cuts scans short */ while (p->flags >= SWP_SCANNING) { spin_unlock(&p->lock); spin_unlock(&swap_lock); @@ -2941,8 +2922,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return 0; } - si->lowest_bit = 1; - maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { @@ -2959,7 +2938,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } - si->highest_bit = maxpages - 1; if (!maxpages) return 0; -- cgit v1.2.3 From 3ab76c767bc783c122a8dfe105fbc10a0b029b42 Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 10 Jan 2025 17:40:34 +0800 Subject: ksm: add ksm involvement information for each process In /proc//ksm_stat, add two extra ksm involvement items including KSM_mergeable and KSM_merge_any. It helps administrators to better know the system's KSM behavior at process level. ksm_merge_any: yes/no whether the process'mm is added by prctl() into the candidate list of KSM or not, and fully enabled at process level. 
ksm_mergeable: yes/no whether any VMAs of the process'mm are currently applicable to KSM. Purpose ======= These two items are just to improve the observability of KSM at process level, so that users can know if a certain process has enabled KSM. For example, if without these two items, when we look at /proc//ksm_stat and there's no merging pages found, We are not sure whether it is because KSM was not enabled or because KSM did not successfully merge any pages. Although "mg" in /proc//smaps indicate VM_MERGEABLE, it's opaque and not very obvious for non professionals. [akpm@linux-foundation.org: wording tweaks, per David and akpm] Link: https://lkml.kernel.org/r/20250110174034304QOb8eDoqtFkp3_t8mqnqc@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Tested-by: Mario Casquero Cc: Wang Yaxin Cc: Yang Yang Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 68 ++++++++++++++++++++++++++++++++++++++ fs/proc/base.c | 11 ++++++ include/linux/ksm.h | 1 + mm/ksm.c | 19 +++++++++++ 4 files changed, 99 insertions(+) (limited to 'fs') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 6a882c57a7e7..4e06f246407c 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -48,6 +48,7 @@ fixes/update part 1.1 Stefani Seibold June 9 2009 3.11 /proc//patch_state - Livepatch patch operation state 3.12 /proc//arch_status - Task architecture specific information 3.13 /proc//fd - List of symlinks to open files + 3.14 /proc//fd for fast access. ------------------------------------------------------- +3.14 /proc//ksm_merging_pages shows. + +ksm_process_profit +^^^^^^^^^^^^^^^^^^ + +The profit that KSM brings (Saved bytes). KSM can save memory by merging +identical pages, but also can consume additional memory, because it needs +to generate a number of rmap_items to save each scanned page's brief rmap +information. Some of these pages may be merged, but some may not be abled +to be merged after being checked several times, which are unprofitable +memory consumed. + +ksm_merge_any +^^^^^^^^^^^^^ + +It specifies whether the process'mm is added by prctl() into the candidate list +of KSM or not, and if KSM scanning is fully enabled at process level. + +ksm_mergeable +^^^^^^^^^^^^^ + +It specifies whether any VMAs of the process'mm are currently applicable +to KSM. + +More information about KSM can be found in +Documentation/admin-guide/mm/ksm.rst. + Chapter 4: Configuring procfs ============================= diff --git a/fs/proc/base.c b/fs/proc/base.c index 0edf14a9840e..a50b222a5917 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3269,6 +3269,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; + int ret = 0; mm = get_task_mm(task); if (mm) { @@ -3276,6 +3277,16 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); + seq_printf(m, "ksm_merge_any: %s\n", + test_bit(MMF_VM_MERGE_ANY, &mm->flags) ? "yes" : "no"); + ret = mmap_read_lock_killable(mm); + if (ret) { + mmput(mm); + return ret; + } + seq_printf(m, "ksm_mergeable: %s\n", + ksm_process_mergeable(mm) ? 
"yes" : "no"); + mmap_read_unlock(mm); mmput(mm); } diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 6a53ac4885bb..d73095b5cd96 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -93,6 +93,7 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); +bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ diff --git a/mm/ksm.c b/mm/ksm.c index 31a9bc365437..8be2b144fefd 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -3262,6 +3262,25 @@ static void wait_while_offlining(void) #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_PROC_FS +/* + * The process is mergeable only if any VMA is currently + * applicable to KSM. + * + * The mmap lock must be held in read mode. + */ +bool ksm_process_mergeable(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + VMA_ITERATOR(vmi, mm, 0); + for_each_vma(vmi, vma) + if (vma->vm_flags & VM_MERGEABLE) + return true; + + return false; +} + long ksm_process_profit(struct mm_struct *mm) { return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - -- cgit v1.2.3 From 75474a54ed354ab7b00b8557ca7a166871473848 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Jan 2025 21:29:24 -0500 Subject: bcachefs: Journal writes are now IOPRIO_CLASS_RT System performance is particularly sensitive to journal write latency, the number of outstanding journal writes is bounded and we can't issue journal flushes until other journal writes have completed. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7f2efe85a805..11c39e0c34f4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,6 +17,7 @@ #include "sb-clean.h" #include "trace.h" +#include #include void bch2_journal_pos_from_member_info_set(struct bch_fs *c) @@ -1763,6 +1764,7 @@ static CLOSURE_CALLBACK(journal_write_submit) bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); ca->prev_journal_sector = bio->bi_iter.bi_sector; -- cgit v1.2.3 From 0e458a616f680837d0ba9c3c8ba9f66993db4225 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Jan 2025 12:51:51 -0500 Subject: bcachefs: Fix trace_copygc Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 11 +++++++---- fs/bcachefs/trace.h | 26 ++++++++++++-------------- 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 85c361e78ba5..21805509ab9e 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -215,7 +215,8 @@ static int bch2_copygc(struct moving_context *ctxt, }; move_buckets buckets = { 0 }; struct move_bucket_in_flight *f; - u64 moved = atomic64_read(&ctxt->stats->sectors_moved); + u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); + u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); @@ -245,7 +246,6 @@ static int bch2_copygc(struct moving_context *ctxt, *did_work = true; } err: - darray_exit(&buckets); /* no entries in LRU btree found, or got to end: */ if (bch2_err_matches(ret, 
ENOENT)) @@ -254,8 +254,11 @@ err: if (ret < 0 && !bch2_err_matches(ret, EROFS)) bch_err_msg(c, ret, "from bch2_move_data()"); - moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; - trace_and_count(c, copygc, c, moved, 0, 0, 0); + sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; + sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; + trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); + + darray_exit(&buckets); return ret; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 9d40b7d4ea29..56a5a7fbc0fd 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -902,32 +902,30 @@ TRACE_EVENT(evacuate_bucket, TRACE_EVENT(copygc, TP_PROTO(struct bch_fs *c, - u64 sectors_moved, u64 sectors_not_moved, - u64 buckets_moved, u64 buckets_not_moved), - TP_ARGS(c, - sectors_moved, sectors_not_moved, - buckets_moved, buckets_not_moved), + u64 buckets, + u64 sectors_seen, + u64 sectors_moved), + TP_ARGS(c, buckets, sectors_seen, sectors_moved), TP_STRUCT__entry( __field(dev_t, dev ) + __field(u64, buckets ) + __field(u64, sectors_seen ) __field(u64, sectors_moved ) - __field(u64, sectors_not_moved ) - __field(u64, buckets_moved ) - __field(u64, buckets_not_moved ) ), TP_fast_assign( __entry->dev = c->dev; + __entry->buckets = buckets; + __entry->sectors_seen = sectors_seen; __entry->sectors_moved = sectors_moved; - __entry->sectors_not_moved = sectors_not_moved; - __entry->buckets_moved = buckets_moved; - __entry->buckets_not_moved = buckets_moved; ), - TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", + TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->sectors_moved, __entry->sectors_not_moved, - __entry->buckets_moved, __entry->buckets_not_moved) + __entry->buckets, + __entry->sectors_seen, + __entry->sectors_moved) ); TRACE_EVENT(copygc_wait, -- cgit v1.2.3 From 5d9ccda9ba7e80893cd67905882315a4a7ab6ec1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Jan 2025 20:08:26 -0500 Subject: bcachefs: Improve trace_move_extent_finish We're currently debugging issues with rebalance, where it's not making progress as quickly as it should be (or sometimes not at all). Add the full data_update to the move_extent_finish tracepoint, so we can check that the replicas we wrote match what we were supposed to do. 
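By way of illustration, the sketch below is one way to watch the reworked tracepoint from userspace while reproducing a rebalance problem. It is not part of this patch; it assumes tracefs is mounted at /sys/kernel/tracing and that the event is exported as events/bcachefs/move_extent_finish, so verify both paths on the running kernel.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* Enable the event; adjust the path if tracefs is mounted elsewhere. */
    int fd = open("/sys/kernel/tracing/events/bcachefs/move_extent_finish/enable",
                  O_WRONLY);
    if (fd >= 0) {
        (void)write(fd, "1", 1);
        close(fd);
    }

    /* trace_pipe blocks until events arrive; stream them to stdout. */
    FILE *tp = fopen("/sys/kernel/tracing/trace_pipe", "r");
    char line[4096];

    while (tp && fgets(line, sizeof(line), tp))
        fputs(line, stdout);
    return 0;
}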
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 50 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 585214931e05..337494facac6 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -91,15 +91,28 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc return true; } -static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) +static noinline void trace_move_extent_finish2(struct data_update *u, + struct bkey_i *new, + struct bkey_i *insert) { - if (trace_move_extent_finish_enabled()) { - struct printbuf buf = PRINTBUF; + struct bch_fs *c = u->op.c; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - trace_move_extent_finish(c, buf.buf); - printbuf_exit(&buf); - } + prt_newline(&buf); + + bch2_data_update_to_text(&buf, u); + prt_newline(&buf); + + prt_str_indented(&buf, "new replicas:\t"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + prt_newline(&buf); + + prt_str_indented(&buf, "insert:\t"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + prt_newline(&buf); + + trace_move_extent_finish(c, buf.buf); + printbuf_exit(&buf); } static void trace_move_extent_fail2(struct data_update *m, @@ -372,7 +385,8 @@ restart_drop_extra_replicas: bch2_btree_iter_set_pos(&iter, next_pos); this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); - trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); + if (trace_move_extent_finish_enabled()) + trace_move_extent_finish2(m, &new->k_i, insert); } err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -525,34 +539,38 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, struct data_update_opts *data_opts) { printbuf_tabstop_push(out, 20); - prt_str(out, "rewrite ptrs:\t"); + + prt_str_indented(out, "rewrite ptrs:\t"); bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); prt_newline(out); - prt_str(out, "kill ptrs:\t"); + prt_str_indented(out, "kill ptrs:\t"); bch2_prt_u64_base2(out, data_opts->kill_ptrs); prt_newline(out); - prt_str(out, "target:\t"); + prt_str_indented(out, "target:\t"); bch2_target_to_text(out, c, data_opts->target); prt_newline(out); - prt_str(out, "compression:\t"); + prt_str_indented(out, "compression:\t"); bch2_compression_opt_to_text(out, io_opts->background_compression); prt_newline(out); - prt_str(out, "opts.replicas:\t"); + prt_str_indented(out, "opts.replicas:\t"); prt_u64(out, io_opts->data_replicas); + prt_newline(out); - prt_str(out, "extra replicas:\t"); + prt_str_indented(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); } void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) { - bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); - prt_newline(out); bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); + prt_newline(out); + + prt_str_indented(out, "old key:\t"); + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); } int bch2_extent_drop_ptrs(struct btree_trans *trans, -- cgit v1.2.3 From f764fab72d98833b47d389ac2ed35bd000132d87 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Mon, 14 Oct 2024 15:18:04 +0200 Subject: cifs: Change translation of STATUS_NOT_A_REPARSE_POINT to -ENODATA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit STATUS_NOT_A_REPARSE_POINT indicates that object does not have reparse point buffer 
attached, for example returned by FSCTL_GET_REPARSE_POINT. Currently STATUS_NOT_A_REPARSE_POINT is translated to -EIO. Change it to -ENODATA which better describe the situation when no reparse point is set. Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/netmisc.c | 7 +++++++ fs/smb/client/nterr.c | 1 + fs/smb/client/nterr.h | 1 + fs/smb/client/smb2maperror.c | 2 +- 4 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/client/netmisc.c b/fs/smb/client/netmisc.c index 17b3e21ea868..9dc16211e7a1 100644 --- a/fs/smb/client/netmisc.c +++ b/fs/smb/client/netmisc.c @@ -871,6 +871,13 @@ map_smb_to_linux_error(char *buf, bool logErr) } /* else ERRHRD class errors or junk - return EIO */ + /* special cases for NT status codes which cannot be translated to DOS codes */ + if (smb->Flags2 & SMBFLG2_ERR_STATUS) { + __u32 err = le32_to_cpu(smb->Status.CifsError); + if (err == (NT_STATUS_NOT_A_REPARSE_POINT)) + rc = -ENODATA; + } + cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n", le32_to_cpu(smb->Status.CifsError), rc); diff --git a/fs/smb/client/nterr.c b/fs/smb/client/nterr.c index d396a8e98a81..8f0bc441295e 100644 --- a/fs/smb/client/nterr.c +++ b/fs/smb/client/nterr.c @@ -674,6 +674,7 @@ const struct nt_err_code_struct nt_errs[] = { {"NT_STATUS_QUOTA_LIST_INCONSISTENT", NT_STATUS_QUOTA_LIST_INCONSISTENT}, {"NT_STATUS_FILE_IS_OFFLINE", NT_STATUS_FILE_IS_OFFLINE}, + {"NT_STATUS_NOT_A_REPARSE_POINT", NT_STATUS_NOT_A_REPARSE_POINT}, {"NT_STATUS_NO_MORE_ENTRIES", NT_STATUS_NO_MORE_ENTRIES}, {"NT_STATUS_MORE_ENTRIES", NT_STATUS_MORE_ENTRIES}, {"NT_STATUS_SOME_UNMAPPED", NT_STATUS_SOME_UNMAPPED}, diff --git a/fs/smb/client/nterr.h b/fs/smb/client/nterr.h index edd4741cab0a..180602c22355 100644 --- a/fs/smb/client/nterr.h +++ b/fs/smb/client/nterr.h @@ -546,6 +546,7 @@ extern const struct nt_err_code_struct nt_errs[]; #define NT_STATUS_TOO_MANY_LINKS 0xC0000000 | 0x0265 #define NT_STATUS_QUOTA_LIST_INCONSISTENT 0xC0000000 | 0x0266 #define NT_STATUS_FILE_IS_OFFLINE 0xC0000000 | 0x0267 +#define NT_STATUS_NOT_A_REPARSE_POINT 0xC0000000 | 0x0275 #define NT_STATUS_NO_SUCH_JOB 0xC0000000 | 0xEDE /* scheduler */ #endif /* _NTERR_H */ diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c index b05313acf9b2..612e7b5181b6 100644 --- a/fs/smb/client/smb2maperror.c +++ b/fs/smb/client/smb2maperror.c @@ -871,7 +871,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_VALIDATE_CONTINUE, -EIO, "STATUS_VALIDATE_CONTINUE"}, {STATUS_NO_MATCH, -EIO, "STATUS_NO_MATCH"}, {STATUS_NO_MORE_MATCHES, -EIO, "STATUS_NO_MORE_MATCHES"}, - {STATUS_NOT_A_REPARSE_POINT, -EIO, "STATUS_NOT_A_REPARSE_POINT"}, + {STATUS_NOT_A_REPARSE_POINT, -ENODATA, "STATUS_NOT_A_REPARSE_POINT"}, {STATUS_IO_REPARSE_TAG_INVALID, -EIO, "STATUS_IO_REPARSE_TAG_INVALID"}, {STATUS_IO_REPARSE_TAG_MISMATCH, -EIO, "STATUS_IO_REPARSE_TAG_MISMATCH"}, -- cgit v1.2.3 From cdbc69716ff3a126cef43154742858e1caffa858 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:32 +0100 Subject: fs/proc/vmcore: convert vmcore_cb_lock into vmcore_mutex We want to protect vmcore modifications from concurrent opening of the vmcore, and also serialize vmcore modification. (a) We can currently modify the vmcore after it was opened. This can happen if a vmcoredd is added after the vmcore module was initialized and already opened by user space. We want to fix that and prepare for new code wanting to serialize against concurrent opening. 
(b) To handle it cleanly we need to protect the modifications against concurrent opening. As the modifications end up allocating memory and can sleep, we cannot rely on the spinlock. Let's convert the spinlock into a mutex to prepare for further changes. Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-2-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. Tsirkin --- fs/proc/vmcore.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 3d8a82cee63e..6f2800021e95 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -62,7 +62,8 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0); /* Device Dump Size */ static size_t vmcoredd_orig_sz; -static DEFINE_SPINLOCK(vmcore_cb_lock); +static DEFINE_MUTEX(vmcore_mutex); + DEFINE_STATIC_SRCU(vmcore_cb_srcu); /* List of registered vmcore callbacks. */ static LIST_HEAD(vmcore_cb_list); @@ -72,7 +73,7 @@ static bool vmcore_opened; void register_vmcore_cb(struct vmcore_cb *cb) { INIT_LIST_HEAD(&cb->next); - spin_lock(&vmcore_cb_lock); + mutex_lock(&vmcore_mutex); list_add_tail(&cb->next, &vmcore_cb_list); /* * Registering a vmcore callback after the vmcore was opened is @@ -80,13 +81,13 @@ void register_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback registration\n"); - spin_unlock(&vmcore_cb_lock); + mutex_unlock(&vmcore_mutex); } EXPORT_SYMBOL_GPL(register_vmcore_cb); void unregister_vmcore_cb(struct vmcore_cb *cb) { - spin_lock(&vmcore_cb_lock); + mutex_lock(&vmcore_mutex); list_del_rcu(&cb->next); /* * Unregistering a vmcore callback after the vmcore was opened is @@ -95,7 +96,7 @@ void unregister_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback unregistration\n"); - spin_unlock(&vmcore_cb_lock); + mutex_unlock(&vmcore_mutex); synchronize_srcu(&vmcore_cb_srcu); } @@ -120,9 +121,9 @@ static bool pfn_is_ram(unsigned long pfn) static int open_vmcore(struct inode *inode, struct file *file) { - spin_lock(&vmcore_cb_lock); + mutex_lock(&vmcore_mutex); vmcore_opened = true; - spin_unlock(&vmcore_cb_lock); + mutex_unlock(&vmcore_mutex); return 0; } -- cgit v1.2.3 From 2083dfe45e83336fafb92f689d6d4b873ad9872d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:33 +0100 Subject: fs/proc/vmcore: replace vmcoredd_mutex by vmcore_mutex Now that we have a mutex that synchronizes against opening of the vmcore, let's use that one to replace vmcoredd_mutex: there is no need to have two separate ones. This is a preparation for properly preventing vmcore modifications after the vmcore was opened. Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-3-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. 
Tsirkin --- fs/proc/vmcore.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 6f2800021e95..3a7497a32691 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -53,7 +53,6 @@ static struct proc_dir_entry *proc_vmcore; #ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP /* Device Dump list and mutex to synchronize access to list */ static LIST_HEAD(vmcoredd_list); -static DEFINE_MUTEX(vmcoredd_mutex); static bool vmcoredd_disabled; core_param(novmcoredd, vmcoredd_disabled, bool, 0); @@ -248,7 +247,7 @@ static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size) size_t tsz; char *buf; - mutex_lock(&vmcoredd_mutex); + mutex_lock(&vmcore_mutex); list_for_each_entry(dump, &vmcoredd_list, list) { if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); @@ -269,7 +268,7 @@ static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size) } out_unlock: - mutex_unlock(&vmcoredd_mutex); + mutex_unlock(&vmcore_mutex); return ret; } @@ -283,7 +282,7 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, size_t tsz; char *buf; - mutex_lock(&vmcoredd_mutex); + mutex_lock(&vmcore_mutex); list_for_each_entry(dump, &vmcoredd_list, list) { if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); @@ -306,7 +305,7 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, } out_unlock: - mutex_unlock(&vmcoredd_mutex); + mutex_unlock(&vmcore_mutex); return ret; } #endif /* CONFIG_MMU */ @@ -1518,9 +1517,9 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) dump->size = data_size; /* Add the dump to driver sysfs list */ - mutex_lock(&vmcoredd_mutex); + mutex_lock(&vmcore_mutex); list_add_tail(&dump->list, &vmcoredd_list); - mutex_unlock(&vmcoredd_mutex); + mutex_unlock(&vmcore_mutex); vmcoredd_update_size(data_size); return 0; @@ -1538,7 +1537,7 @@ EXPORT_SYMBOL(vmcore_add_device_dump); static void vmcore_free_device_dumps(void) { #ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP - mutex_lock(&vmcoredd_mutex); + mutex_lock(&vmcore_mutex); while (!list_empty(&vmcoredd_list)) { struct vmcoredd_node *dump; @@ -1548,7 +1547,7 @@ static void vmcore_free_device_dumps(void) vfree(dump->buf); vfree(dump); } - mutex_unlock(&vmcoredd_mutex); + mutex_unlock(&vmcore_mutex); #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ } -- cgit v1.2.3 From 0f3b1c40c65262a216fb459fd375f39442a5138b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:34 +0100 Subject: fs/proc/vmcore: disallow vmcore modifications while the vmcore is open The vmcoredd_update_size() call and its effects (size/offset changes) are currently completely unsynchronized, and will cause trouble when performed concurrently, or when done while someone is already reading the vmcore. Let's protect all vmcore modifications by the vmcore_mutex, disallow vmcore modifications while the vmcore is open, and warn on vmcore modifications after the vmcore was already opened once: modifications while the vmcore is open are unsafe, and modifications after the vmcore was opened indicates trouble. Properly synchronize against concurrent opening of the vmcore. No need to grab the mutex during mmap()/read(): after we opened the vmcore, modifications are impossible. It's worth noting that modifications after the vmcore was opened are completely unexpected, so failing if open, and warning if already opened (+closed again) is good enough. 
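In outline, the scheme looks like the sketch below; this is a simplified illustration rather than the actual fs/proc/vmcore.c code, and do_modification() is a stand-in name for whatever updates the shared object.

static DEFINE_MUTEX(obj_mutex);
static bool obj_opened;         /* opened at least once */
static unsigned int obj_open;   /* currently open references */

static int obj_modify(void)
{
    int ret;

    mutex_lock(&obj_mutex);
    if (obj_opened)
        pr_warn_once("unexpected late modification\n");
    if (obj_open)
        ret = -EBUSY;               /* unsafe while a reader has the file open */
    else
        ret = do_modification();    /* stand-in for the real update */
    mutex_unlock(&obj_mutex);
    return ret;
}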
This change not only handles concurrent adding of device dumps + concurrent reading of the vmcore properly, it also prepares for other mechanisms that will modify the vmcore. Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-4-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. Tsirkin --- fs/proc/vmcore.c | 57 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 3a7497a32691..4a7bfdf03738 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -68,6 +68,8 @@ DEFINE_STATIC_SRCU(vmcore_cb_srcu); static LIST_HEAD(vmcore_cb_list); /* Whether the vmcore has been opened once. */ static bool vmcore_opened; +/* Whether the vmcore is currently open. */ +static unsigned int vmcore_open; void register_vmcore_cb(struct vmcore_cb *cb) { @@ -122,6 +124,20 @@ static int open_vmcore(struct inode *inode, struct file *file) { mutex_lock(&vmcore_mutex); vmcore_opened = true; + if (vmcore_open + 1 == 0) { + mutex_unlock(&vmcore_mutex); + return -EBUSY; + } + vmcore_open++; + mutex_unlock(&vmcore_mutex); + + return 0; +} + +static int release_vmcore(struct inode *inode, struct file *file) +{ + mutex_lock(&vmcore_mutex); + vmcore_open--; mutex_unlock(&vmcore_mutex); return 0; @@ -243,33 +259,27 @@ static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size) { struct vmcoredd_node *dump; u64 offset = 0; - int ret = 0; size_t tsz; char *buf; - mutex_lock(&vmcore_mutex); list_for_each_entry(dump, &vmcoredd_list, list) { if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); buf = dump->buf + start - offset; - if (copy_to_iter(buf, tsz, iter) < tsz) { - ret = -EFAULT; - goto out_unlock; - } + if (copy_to_iter(buf, tsz, iter) < tsz) + return -EFAULT; size -= tsz; start += tsz; /* Leave now if buffer filled already */ if (!size) - goto out_unlock; + return 0; } offset += dump->size; } -out_unlock: - mutex_unlock(&vmcore_mutex); - return ret; + return 0; } #ifdef CONFIG_MMU @@ -278,20 +288,16 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, { struct vmcoredd_node *dump; u64 offset = 0; - int ret = 0; size_t tsz; char *buf; - mutex_lock(&vmcore_mutex); list_for_each_entry(dump, &vmcoredd_list, list) { if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); buf = dump->buf + start - offset; if (remap_vmalloc_range_partial(vma, dst, buf, 0, - tsz)) { - ret = -EFAULT; - goto out_unlock; - } + tsz)) + return -EFAULT; size -= tsz; start += tsz; @@ -299,14 +305,12 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, /* Leave now if buffer filled already */ if (!size) - goto out_unlock; + return 0; } offset += dump->size; } -out_unlock: - mutex_unlock(&vmcore_mutex); - return ret; + return 0; } #endif /* CONFIG_MMU */ #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ @@ -691,6 +695,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) static const struct proc_ops vmcore_proc_ops = { .proc_open = open_vmcore, + .proc_release = release_vmcore, .proc_read_iter = read_vmcore, .proc_lseek = default_llseek, .proc_mmap = mmap_vmcore, @@ -1516,12 +1521,18 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) dump->buf = buf; dump->size = data_size; - /* Add the dump to driver sysfs list */ + /* Add the dump to driver sysfs list and update the elfcore hdr */ mutex_lock(&vmcore_mutex); - 
list_add_tail(&dump->list, &vmcoredd_list); - mutex_unlock(&vmcore_mutex); + if (vmcore_opened) + pr_warn_once("Unexpected adding of device dump\n"); + if (vmcore_open) { + ret = -EBUSY; + goto out_err; + } + list_add_tail(&dump->list, &vmcoredd_list); vmcoredd_update_size(data_size); + mutex_unlock(&vmcore_mutex); return 0; out_err: -- cgit v1.2.3 From 8e386957cc2d1fb67b65937c771140c1c47445b1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:35 +0100 Subject: fs/proc/vmcore: prefix all pr_* with "vmcore:" Let's use "vmcore: " as a prefix, converting the single "Kdump: vmcore not initialized" one to effectively be "vmcore: not initialized". Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-5-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. Tsirkin --- fs/proc/vmcore.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 4a7bfdf03738..7511837d81a2 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -8,6 +8,8 @@ * */ +#define pr_fmt(fmt) "vmcore: " fmt + #include #include #include @@ -1580,7 +1582,7 @@ static int __init vmcore_init(void) rc = parse_crash_elf_headers(); if (rc) { elfcorehdr_free(elfcorehdr_addr); - pr_warn("Kdump: vmcore not initialized\n"); + pr_warn("not initialized\n"); return rc; } elfcorehdr_free(elfcorehdr_addr); -- cgit v1.2.3 From 819403c893551c5e93bf9087d334e01bcab5c6b9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:36 +0100 Subject: fs/proc/vmcore: move vmcore definitions out of kcore.h These vmcore defines are not related to /proc/kcore, move them out. We'll move "struct vmcoredd_node" to vmcore.c, because it is only used internally. While "struct vmcore" is only used internally for now, we're planning on using it from inline functions in crash_dump.h next, so move it to crash_dump.h. While at it, rename "struct vmcore" to "struct vmcore_range", which is a more suitable name and will make the usage of it outside of vmcore.c clearer. Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-6-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. 
Tsirkin --- fs/proc/vmcore.c | 26 ++++++++++++++++---------- include/linux/crash_dump.h | 7 +++++++ include/linux/kcore.h | 13 ------------- 3 files changed, 23 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7511837d81a2..c081b130aba3 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -53,6 +53,12 @@ static u64 vmcore_size; static struct proc_dir_entry *proc_vmcore; #ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +struct vmcoredd_node { + struct list_head list; /* List of dumps */ + void *buf; /* Buffer containing device's dump */ + unsigned int size; /* Size of the buffer */ +}; + /* Device Dump list and mutex to synchronize access to list */ static LIST_HEAD(vmcoredd_list); @@ -322,10 +328,10 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, */ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) { + struct vmcore_range *m = NULL; ssize_t acc = 0, tmp; size_t tsz; u64 start; - struct vmcore *m = NULL; if (!iov_iter_count(iter) || *fpos >= vmcore_size) return 0; @@ -580,7 +586,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) { size_t size = vma->vm_end - vma->vm_start; u64 start, end, len, tsz; - struct vmcore *m; + struct vmcore_range *m; start = (u64)vma->vm_pgoff << PAGE_SHIFT; end = start + size; @@ -703,16 +709,16 @@ static const struct proc_ops vmcore_proc_ops = { .proc_mmap = mmap_vmcore, }; -static struct vmcore* __init get_new_element(void) +static struct vmcore_range * __init get_new_element(void) { - return kzalloc(sizeof(struct vmcore), GFP_KERNEL); + return kzalloc(sizeof(struct vmcore_range), GFP_KERNEL); } static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz, struct list_head *vc_list) { + struct vmcore_range *m; u64 size; - struct vmcore *m; size = elfsz + elfnotesegsz; list_for_each_entry(m, vc_list, list) { @@ -1110,11 +1116,11 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, size_t elfnotes_sz, struct list_head *vc_list) { + struct vmcore_range *new; int i; Elf64_Ehdr *ehdr_ptr; Elf64_Phdr *phdr_ptr; loff_t vmcore_off; - struct vmcore *new; ehdr_ptr = (Elf64_Ehdr *)elfptr; phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ @@ -1153,11 +1159,11 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, size_t elfnotes_sz, struct list_head *vc_list) { + struct vmcore_range *new; int i; Elf32_Ehdr *ehdr_ptr; Elf32_Phdr *phdr_ptr; loff_t vmcore_off; - struct vmcore *new; ehdr_ptr = (Elf32_Ehdr *)elfptr; phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ @@ -1195,8 +1201,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, struct list_head *vc_list) { + struct vmcore_range *m; loff_t vmcore_off; - struct vmcore *m; /* Skip ELF header, program headers and ELF note segment. */ vmcore_off = elfsz + elfnotes_sz; @@ -1605,9 +1611,9 @@ void vmcore_cleanup(void) /* clear the vmcore list. 
*/ while (!list_empty(&vmcore_list)) { - struct vmcore *m; + struct vmcore_range *m; - m = list_first_entry(&vmcore_list, struct vmcore, list); + m = list_first_entry(&vmcore_list, struct vmcore_range, list); list_del(&m->list); kfree(m); } diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index acc55626afdc..788a45061f35 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -114,6 +114,13 @@ struct vmcore_cb { extern void register_vmcore_cb(struct vmcore_cb *cb); extern void unregister_vmcore_cb(struct vmcore_cb *cb); +struct vmcore_range { + struct list_head list; + unsigned long long paddr; + unsigned long long size; + loff_t offset; +}; + #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return false; } #endif /* CONFIG_CRASH_DUMP */ diff --git a/include/linux/kcore.h b/include/linux/kcore.h index 86c0f1d18998..9a2fa013c91d 100644 --- a/include/linux/kcore.h +++ b/include/linux/kcore.h @@ -20,19 +20,6 @@ struct kcore_list { int type; }; -struct vmcore { - struct list_head list; - unsigned long long paddr; - unsigned long long size; - loff_t offset; -}; - -struct vmcoredd_node { - struct list_head list; /* List of dumps */ - void *buf; /* Buffer containing device's dump */ - unsigned int size; /* Size of the buffer */ -}; - #ifdef CONFIG_PROC_KCORE void __init kclist_add(struct kcore_list *, void *, size_t, int type); -- cgit v1.2.3 From e017b1f4aa4eb887ee85fe13862206c0d31344b4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:37 +0100 Subject: fs/proc/vmcore: factor out allocating a vmcore range and adding it to a list Let's factor it out into include/linux/crash_dump.h, from where we can use it also outside of vmcore.c later. Acked-by: Baoquan He Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-7-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. Tsirkin --- fs/proc/vmcore.c | 21 ++------------------- include/linux/crash_dump.h | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index c081b130aba3..7ad94fa7a2af 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -709,11 +709,6 @@ static const struct proc_ops vmcore_proc_ops = { .proc_mmap = mmap_vmcore, }; -static struct vmcore_range * __init get_new_element(void) -{ - return kzalloc(sizeof(struct vmcore_range), GFP_KERNEL); -} - static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz, struct list_head *vc_list) { @@ -1116,7 +1111,6 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, size_t elfnotes_sz, struct list_head *vc_list) { - struct vmcore_range *new; int i; Elf64_Ehdr *ehdr_ptr; Elf64_Phdr *phdr_ptr; @@ -1139,13 +1133,8 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); size = end - start; - /* Add this contiguous chunk of memory to vmcore list.*/ - new = get_new_element(); - if (!new) + if (vmcore_alloc_add_range(vc_list, start, size)) return -ENOMEM; - new->paddr = start; - new->size = size; - list_add_tail(&new->list, vc_list); /* Update the program header offset. 
*/ phdr_ptr->p_offset = vmcore_off + (paddr - start); @@ -1159,7 +1148,6 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, size_t elfnotes_sz, struct list_head *vc_list) { - struct vmcore_range *new; int i; Elf32_Ehdr *ehdr_ptr; Elf32_Phdr *phdr_ptr; @@ -1182,13 +1170,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); size = end - start; - /* Add this contiguous chunk of memory to vmcore list.*/ - new = get_new_element(); - if (!new) + if (vmcore_alloc_add_range(vc_list, start, size)) return -ENOMEM; - new->paddr = start; - new->size = size; - list_add_tail(&new->list, vc_list); /* Update the program header offset */ phdr_ptr->p_offset = vmcore_off + (paddr - start); diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 788a45061f35..9717912ce4d1 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -121,6 +121,20 @@ struct vmcore_range { loff_t offset; }; +/* Allocate a vmcore range and add it to the list. */ +static inline int vmcore_alloc_add_range(struct list_head *list, + unsigned long long paddr, unsigned long long size) +{ + struct vmcore_range *m = kzalloc(sizeof(*m), GFP_KERNEL); + + if (!m) + return -ENOMEM; + m->paddr = paddr; + m->size = size; + list_add_tail(&m->list, list); + return 0; +} + #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return false; } #endif /* CONFIG_CRASH_DUMP */ -- cgit v1.2.3 From e29e9acae06dc28ca8dbf3db976e09787e610dc8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:38 +0100 Subject: fs/proc/vmcore: factor out freeing a list of vmcore ranges Let's factor it out into include/linux/crash_dump.h, from where we can use it also outside of vmcore.c later. Acked-by: Baoquan He Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-8-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. Tsirkin --- fs/proc/vmcore.c | 9 +-------- include/linux/crash_dump.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7ad94fa7a2af..6632d4bc4b05 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1592,14 +1592,7 @@ void vmcore_cleanup(void) proc_vmcore = NULL; } - /* clear the vmcore list. */ - while (!list_empty(&vmcore_list)) { - struct vmcore_range *m; - - m = list_first_entry(&vmcore_list, struct vmcore_range, list); - list_del(&m->list); - kfree(m); - } + vmcore_free_ranges(&vmcore_list); free_elfcorebuf(); /* clear vmcore device dump list */ diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 9717912ce4d1..5d61c7454fd6 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -135,6 +135,17 @@ static inline int vmcore_alloc_add_range(struct list_head *list, return 0; } +/* Free a list of vmcore ranges. 
*/ +static inline void vmcore_free_ranges(struct list_head *list) +{ + struct vmcore_range *m, *tmp; + + list_for_each_entry_safe(m, tmp, list, list) { + list_del(&m->list); + kfree(m); + } +} + #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return false; } #endif /* CONFIG_CRASH_DUMP */ -- cgit v1.2.3 From 7ad4d1f6e6ef967cd24c6275d8d4056045c019c1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:39 +0100 Subject: fs/proc/vmcore: introduce PROC_VMCORE_DEVICE_RAM to detect device RAM ranges in 2nd kernel s390 allocates+prepares the elfcore hdr in the dump (2nd) kernel, not in the crashed kernel. RAM provided by memory devices such as virtio-mem can only be detected using the device driver; when vmcore_init() is called, these device drivers are usually not loaded yet, or the devices did not get probed yet. Consequently, on s390 these RAM ranges will not be included in the crash dump, which makes the dump partially corrupt and is unfortunate. Instead of deferring the vmcore_init() call, to an (unclear?) later point, let's reuse the vmcore_cb infrastructure to obtain device RAM ranges as the device drivers probe the device and get access to this information. Then, we'll add these ranges to the vmcore, adding more PT_LOAD entries and updating the offsets+vmcore size. Use a separate Kconfig option to be set by an architecture to include this code only if the arch really needs it. Further, we'll make the config depend on the relevant drivers (i.e., virtio_mem) once they implement support (next). The alternative of having a PROVIDE_PROC_VMCORE_DEVICE_RAM config option was dropped for now for simplicity. The current target use case is s390, which only creates an elf64 elfcore, so focusing on elf64 is sufficient. Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-9-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. Tsirkin --- fs/proc/Kconfig | 18 ++++++ fs/proc/vmcore.c | 156 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/crash_dump.h | 9 +++ 3 files changed, 183 insertions(+) (limited to 'fs') diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index d80a1431ef7b..5668620ab34d 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -61,6 +61,24 @@ config PROC_VMCORE_DEVICE_DUMP as ELF notes to /proc/vmcore. You can still disable device dump using the kernel command line option 'novmcoredd'. +config NEED_PROC_VMCORE_DEVICE_RAM + bool + +config PROC_VMCORE_DEVICE_RAM + def_bool y + depends on PROC_VMCORE && NEED_PROC_VMCORE_DEVICE_RAM + help + If the elfcore hdr is allocated and prepared by the dump kernel + ("2nd kernel") instead of the crashed kernel, RAM provided by memory + devices such as virtio-mem will not be included in the dump + image, because only the device driver can properly detect them. + + With this config enabled, these RAM ranges will be queried from the + device drivers once the device gets probed, so they can be included + in the crash dump. + + Relevant architectures should select NEED_PROC_VMCORE_DEVICE_RAM. + config PROC_SYSCTL bool "Sysctl support (/proc/sys)" if EXPERT depends on PROC_FS diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 6632d4bc4b05..8d39e238329e 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -79,6 +79,8 @@ static bool vmcore_opened; /* Whether the vmcore is currently open. 
*/ static unsigned int vmcore_open; +static void vmcore_process_device_ram(struct vmcore_cb *cb); + void register_vmcore_cb(struct vmcore_cb *cb) { INIT_LIST_HEAD(&cb->next); @@ -90,6 +92,8 @@ void register_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback registration\n"); + if (!vmcore_open && cb->get_device_ram) + vmcore_process_device_ram(cb); mutex_unlock(&vmcore_mutex); } EXPORT_SYMBOL_GPL(register_vmcore_cb); @@ -1535,6 +1539,158 @@ out_err: EXPORT_SYMBOL(vmcore_add_device_dump); #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM +static int vmcore_realloc_elfcore_buffer_elf64(size_t new_size) +{ + char *elfcorebuf_new; + + if (WARN_ON_ONCE(new_size < elfcorebuf_sz)) + return -EINVAL; + if (get_order(elfcorebuf_sz_orig) == get_order(new_size)) { + elfcorebuf_sz_orig = new_size; + return 0; + } + + elfcorebuf_new = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(new_size)); + if (!elfcorebuf_new) + return -ENOMEM; + memcpy(elfcorebuf_new, elfcorebuf, elfcorebuf_sz); + free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig)); + elfcorebuf = elfcorebuf_new; + elfcorebuf_sz_orig = new_size; + return 0; +} + +static void vmcore_reset_offsets_elf64(void) +{ + Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr)); + loff_t vmcore_off = elfcorebuf_sz + elfnotes_sz; + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf; + Elf64_Phdr *phdr; + int i; + + for (i = 0, phdr = phdr_start; i < ehdr->e_phnum; i++, phdr++) { + u64 start, end; + + /* + * After merge_note_headers_elf64() we should only have a single + * PT_NOTE entry that starts immediately after elfcorebuf_sz. + */ + if (phdr->p_type == PT_NOTE) { + phdr->p_offset = elfcorebuf_sz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, PAGE_SIZE); + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off = vmcore_off + end - start; + } + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); +} + +static int vmcore_add_device_ram_elf64(struct list_head *list, size_t count) +{ + Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr)); + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf; + struct vmcore_range *cur; + Elf64_Phdr *phdr; + size_t new_size; + int rc; + + if ((Elf32_Half)(ehdr->e_phnum + count) != ehdr->e_phnum + count) { + pr_err("too many device ram ranges\n"); + return -ENOSPC; + } + + /* elfcorebuf_sz must always cover full pages. */ + new_size = sizeof(Elf64_Ehdr) + + (ehdr->e_phnum + count) * sizeof(Elf64_Phdr); + new_size = roundup(new_size, PAGE_SIZE); + + /* + * Make sure we have sufficient space to include the new PT_LOAD + * entries. + */ + rc = vmcore_realloc_elfcore_buffer_elf64(new_size); + if (rc) { + pr_err("resizing elfcore failed\n"); + return rc; + } + + /* Modify our used elfcore buffer size to cover the new entries. */ + elfcorebuf_sz = new_size; + + /* Fill the added PT_LOAD entries. */ + phdr = phdr_start + ehdr->e_phnum; + list_for_each_entry(cur, list, list) { + WARN_ON_ONCE(!IS_ALIGNED(cur->paddr | cur->size, PAGE_SIZE)); + elfcorehdr_fill_device_ram_ptload_elf64(phdr, cur->paddr, cur->size); + + /* p_offset will be adjusted later. */ + phdr++; + ehdr->e_phnum++; + } + list_splice_tail(list, &vmcore_list); + + /* We changed elfcorebuf_sz and added new entries; reset all offsets. */ + vmcore_reset_offsets_elf64(); + + /* Finally, recalculate the total vmcore size. 
*/ + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + proc_vmcore->size = vmcore_size; + return 0; +} + +static void vmcore_process_device_ram(struct vmcore_cb *cb) +{ + unsigned char *e_ident = (unsigned char *)elfcorebuf; + struct vmcore_range *first, *m; + LIST_HEAD(list); + int count; + + /* We only support Elf64 dumps for now. */ + if (WARN_ON_ONCE(e_ident[EI_CLASS] != ELFCLASS64)) { + pr_err("device ram ranges only support Elf64\n"); + return; + } + + if (cb->get_device_ram(cb, &list)) { + pr_err("obtaining device ram ranges failed\n"); + return; + } + count = list_count_nodes(&list); + if (!count) + return; + + /* + * For some reason these ranges are already know? Might happen + * with unusual register->unregister->register sequences; we'll simply + * sanity check using the first range. + */ + first = list_first_entry(&list, struct vmcore_range, list); + list_for_each_entry(m, &vmcore_list, list) { + unsigned long long m_end = m->paddr + m->size; + unsigned long long first_end = first->paddr + first->size; + + if (first->paddr < m_end && m->paddr < first_end) + goto out_free; + } + + /* If adding the mem nodes succeeds, they must not be freed. */ + if (!vmcore_add_device_ram_elf64(&list, count)) + return; +out_free: + vmcore_free_ranges(&list); +} +#else /* !CONFIG_PROC_VMCORE_DEVICE_RAM */ +static void vmcore_process_device_ram(struct vmcore_cb *cb) +{ +} +#endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */ + /* Free all dumps in vmcore device dump list */ static void vmcore_free_device_dumps(void) { diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 5d61c7454fd6..2f2555e6407c 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -20,6 +20,8 @@ extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size); extern void elfcorehdr_free(unsigned long long addr); extern ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos); extern ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); +void elfcorehdr_fill_device_ram_ptload_elf64(Elf64_Phdr *phdr, + unsigned long long paddr, unsigned long long size); extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot); @@ -99,6 +101,12 @@ static inline void vmcore_unusable(void) * indicated in the vmcore instead. For example, a ballooned page * contains no data and reading from such a page will cause high * load in the hypervisor. + * @get_device_ram: query RAM ranges that can only be detected by device + * drivers, such as the virtio-mem driver, so they can be included in + * the crash dump on architectures that allocate the elfcore hdr in the dump + * ("2nd") kernel. Indicated RAM ranges may contain holes to reduce the + * total number of ranges; such holes can be detected using the pfn_is_ram + * callback just like for other RAM. * @next: List head to manage registered callbacks internally; initialized by * register_vmcore_cb(). 
* @@ -109,6 +117,7 @@ static inline void vmcore_unusable(void) */ struct vmcore_cb { bool (*pfn_is_ram)(struct vmcore_cb *cb, unsigned long pfn); + int (*get_device_ram)(struct vmcore_cb *cb, struct list_head *list); struct list_head next; }; extern void register_vmcore_cb(struct vmcore_cb *cb); -- cgit v1.2.3 From c6a8239a9eddf38137797292ed75432957a37a14 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:42 +0100 Subject: virtio-mem: support CONFIG_PROC_VMCORE_DEVICE_RAM Let's implement the get_device_ram() vmcore callback, so architectures that select NEED_PROC_VMCORE_NEED_DEVICE_RAM, like s390 soon, can include that memory in a crash dump. Merge ranges, and process ranges that might contain a mixture of plugged and unplugged, to reduce the total number of ranges. Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-12-david@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 88 +++++++++++++++++++++++++++++++++++++++++++++ fs/proc/Kconfig | 1 + 2 files changed, 89 insertions(+) (limited to 'fs') diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 73477d5b79cf..8a294b9cbcf6 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -2728,6 +2728,91 @@ static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb, mutex_unlock(&vm->hotplug_mutex); return is_ram; } + +#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM +static int virtio_mem_vmcore_add_device_ram(struct virtio_mem *vm, + struct list_head *list, uint64_t start, uint64_t end) +{ + int rc; + + rc = vmcore_alloc_add_range(list, start, end - start); + if (rc) + dev_err(&vm->vdev->dev, + "Error adding device RAM range: %d\n", rc); + return rc; +} + +static int virtio_mem_vmcore_get_device_ram(struct vmcore_cb *cb, + struct list_head *list) +{ + struct virtio_mem *vm = container_of(cb, struct virtio_mem, + vmcore_cb); + const uint64_t device_start = vm->addr; + const uint64_t device_end = vm->addr + vm->usable_region_size; + uint64_t chunk_size, cur_start, cur_end, plugged_range_start = 0; + LIST_HEAD(tmp_list); + int rc; + + if (!vm->plugged_size) + return 0; + + /* Process memory sections, unless the device block size is bigger. */ + chunk_size = max_t(uint64_t, PFN_PHYS(PAGES_PER_SECTION), + vm->device_block_size); + + mutex_lock(&vm->hotplug_mutex); + + /* + * We process larger chunks and indicate the complete chunk if any + * block in there is plugged. This reduces the number of pfn_is_ram() + * callbacks and mimic what is effectively being done when the old + * kernel would add complete memory sections/blocks to the elfcore hdr. + */ + cur_start = device_start; + for (cur_start = device_start; cur_start < device_end; cur_start = cur_end) { + cur_end = ALIGN_DOWN(cur_start + chunk_size, chunk_size); + cur_end = min_t(uint64_t, cur_end, device_end); + + rc = virtio_mem_send_state_request(vm, cur_start, + cur_end - cur_start); + + if (rc < 0) { + dev_err(&vm->vdev->dev, + "Error querying block states: %d\n", rc); + goto out; + } else if (rc != VIRTIO_MEM_STATE_UNPLUGGED) { + /* Merge ranges with plugged memory. */ + if (!plugged_range_start) + plugged_range_start = cur_start; + continue; + } + + /* Flush any plugged range. */ + if (plugged_range_start) { + rc = virtio_mem_vmcore_add_device_ram(vm, &tmp_list, + plugged_range_start, + cur_start); + if (rc) + goto out; + plugged_range_start = 0; + } + } + + /* Flush any plugged range. 
*/ + if (plugged_range_start) + rc = virtio_mem_vmcore_add_device_ram(vm, &tmp_list, + plugged_range_start, + cur_start); +out: + mutex_unlock(&vm->hotplug_mutex); + if (rc < 0) { + vmcore_free_ranges(&tmp_list); + return rc; + } + list_splice_tail(&tmp_list, list); + return 0; +} +#endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */ #endif /* CONFIG_PROC_VMCORE */ static int virtio_mem_init_kdump(struct virtio_mem *vm) @@ -2737,6 +2822,9 @@ static int virtio_mem_init_kdump(struct virtio_mem *vm) #ifdef CONFIG_PROC_VMCORE dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n"); vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram; +#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM + vm->vmcore_cb.get_device_ram = virtio_mem_vmcore_get_device_ram; +#endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */ register_vmcore_cb(&vm->vmcore_cb); return 0; #else /* CONFIG_PROC_VMCORE */ diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 5668620ab34d..6ae966c561e7 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -67,6 +67,7 @@ config NEED_PROC_VMCORE_DEVICE_RAM config PROC_VMCORE_DEVICE_RAM def_bool y depends on PROC_VMCORE && NEED_PROC_VMCORE_DEVICE_RAM + depends on VIRTIO_MEM help If the elfcore hdr is allocated and prepared by the dump kernel ("2nd kernel") instead of the crashed kernel, RAM provided by memory -- cgit v1.2.3 From 2f0805d7c08bea71c95561bfb3e45d93b05196b9 Mon Sep 17 00:00:00 2001 From: Liang Jie Date: Fri, 10 Jan 2025 18:05:24 +0800 Subject: ceph: streamline request head structures in MDS client The existence of the ceph_mds_request_head_old structure in the MDS client code is no longer required due to improvements in handling different MDS request header versions. This patch removes the now redundant ceph_mds_request_head_old structure and replaces its usage with the flexible and extensible ceph_mds_request_head structure. Changes include: - Modification of find_legacy_request_head to directly cast the pointer to ceph_mds_request_head_legacy without going through the old structure. - Update sizeof calculations in create_request_message to use offsetofend for consistency and future-proofing, rather than referencing the old structure. - Use of the structured ceph_mds_request_head directly instead of the old one. Additionally, this consolidation normalizes the handling of request_head_version v1 to align with versions v2 and v3, leading to a more consistent and maintainable codebase. These changes simplify the codebase and reduce potential confusion stemming from the existence of an obsolete structure. 
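The cleanup leans on the offsetofend() idiom: the length of the old v1 header is simply the offset of the end of 'args' within the current, extended header, so the obsolete struct is not needed to compute it. The toy program below illustrates the idea in userspace; offsetofend() is redefined locally for the demo and the struct names are made up, not the real ceph definitions.

#include <stddef.h>
#include <stdio.h>

#define offsetofend(TYPE, MEMBER) \
    (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

struct hdr_v1 {                      /* stand-in for the old, shorter layout */
    unsigned short version;
    unsigned long long args;
} __attribute__((packed));

struct hdr {                         /* stand-in for the current, extended layout */
    unsigned short version;
    unsigned long long args;
    unsigned short ext_num_fwd;      /* newer fields are appended at the end */
} __attribute__((packed));

int main(void)
{
    /* The v1 length can be derived from the extended struct alone. */
    printf("sizeof(struct hdr_v1)         = %zu\n", sizeof(struct hdr_v1));
    printf("offsetofend(struct hdr, args) = %zu\n",
           offsetofend(struct hdr, args));
    return 0;
}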
Signed-off-by: Liang Jie Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 16 ++++++++-------- include/linux/ceph/ceph_fs.h | 14 -------------- 2 files changed, 8 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ae37f0e24c99..921f08a27dd7 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2945,12 +2945,12 @@ static struct ceph_mds_request_head_legacy * find_legacy_request_head(void *p, u64 features) { bool legacy = !(features & CEPH_FEATURE_FS_BTIME); - struct ceph_mds_request_head_old *ohead; + struct ceph_mds_request_head *head; if (legacy) return (struct ceph_mds_request_head_legacy *)p; - ohead = (struct ceph_mds_request_head_old *)p; - return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid; + head = (struct ceph_mds_request_head *)p; + return (struct ceph_mds_request_head_legacy *)&head->oldest_client_tid; } /* @@ -3020,7 +3020,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, if (legacy) len = sizeof(struct ceph_mds_request_head_legacy); else if (request_head_version == 1) - len = sizeof(struct ceph_mds_request_head_old); + len = offsetofend(struct ceph_mds_request_head, args); else if (request_head_version == 2) len = offsetofend(struct ceph_mds_request_head, ext_num_fwd); else @@ -3104,11 +3104,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, msg->hdr.version = cpu_to_le16(3); p = msg->front.iov_base + sizeof(*lhead); } else if (request_head_version == 1) { - struct ceph_mds_request_head_old *ohead = msg->front.iov_base; + struct ceph_mds_request_head *nhead = msg->front.iov_base; msg->hdr.version = cpu_to_le16(4); - ohead->version = cpu_to_le16(1); - p = msg->front.iov_base + sizeof(*ohead); + nhead->version = cpu_to_le16(1); + p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, args); } else if (request_head_version == 2) { struct ceph_mds_request_head *nhead = msg->front.iov_base; @@ -3265,7 +3265,7 @@ static int __prepare_send_request(struct ceph_mds_session *session, * so we limit to retry at most 256 times. */ if (req->r_attempts) { - old_max_retry = sizeof_field(struct ceph_mds_request_head_old, + old_max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); if ((old_version && req->r_attempts >= old_max_retry) || diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 2d7d86f0290d..c7f2c63b3bc3 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -504,20 +504,6 @@ struct ceph_mds_request_head_legacy { #define CEPH_MDS_REQUEST_HEAD_VERSION 3 -struct ceph_mds_request_head_old { - __le16 version; /* struct version */ - __le64 oldest_client_tid; - __le32 mdsmap_epoch; /* on client */ - __le32 flags; /* CEPH_MDS_FLAG_* */ - __u8 num_retry, num_fwd; /* count retry, fwd attempts */ - __le16 num_releases; /* # include cap/lease release records */ - __le32 op; /* mds op code */ - __le32 caller_uid, caller_gid; - __le64 ino; /* use this ino for openc, mkdir, mknod, - etc. 
(if replaying) */ - union ceph_mds_request_args_ext args; -} __attribute__ ((packed)); - struct ceph_mds_request_head { __le16 version; /* struct version */ __le64 oldest_client_tid; -- cgit v1.2.3 From 3981be13ec1baf811bfb93ed6a98bafc85cdeab1 Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Fri, 24 Jan 2025 11:46:23 -0800 Subject: ceph: exchange hardcoded value on NAME_MAX Initially, ceph_fs_debugfs_init() had temporary name buffer with hardcoded length of 80 symbols. Then, it was hardcoded again for 100 symbols. Finally, it makes sense to exchange hardcoded value on properly defined constant and 255 symbols should be enough for any name case. Signed-off-by: Viacheslav Dubeyko Reviewed-by: Patrick Donnelly Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index fdf9dc15eafa..fdd404fc8112 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -412,7 +412,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - char name[100]; + char name[NAME_MAX]; doutc(fsc->client, "begin\n"); fsc->debugfs_congestion_kb = -- cgit v1.2.3 From c090c8abae4b6b77a1bee116aa6c385456ebef96 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:03 +0100 Subject: fuse: Add io-uring sqe commit and fetch support This adds support for fuse request completion through ring SQEs (FUSE_URING_CMD_COMMIT_AND_FETCH handling). After committing the ring entry it becomes available for new fuse requests. Handling of requests through the ring (SQE/CQE handling) is complete now. Fuse request data are copied through the mmaped ring buffer, there is no support for any zero copy yet. 
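The lifecycle this gives each ring entry can be read as a small state machine. The userspace model below is purely illustrative: the enum values mirror the FRRS_* states and queue lists from fs/fuse/dev_uring_i.h, but the code itself is not part of the kernel patch.

#include <stdio.h>

enum ring_ent_state {                /* mirrors FRRS_* for illustration only */
    ENT_AVAILABLE,                   /* ent_avail_queue: waiting for a fuse request */
    ENT_FUSE_REQ,                    /* ent_w_req_queue: request assigned, data copied */
    ENT_USERSPACE,                   /* ent_in_userspace: CQE handed to the server */
    ENT_COMMIT,                      /* ent_commit_queue: reply being copied back */
};

static enum ring_ent_state next_state(enum ring_ent_state s)
{
    switch (s) {
    case ENT_AVAILABLE: return ENT_FUSE_REQ;    /* a fuse request is queued */
    case ENT_FUSE_REQ:  return ENT_USERSPACE;   /* io_uring_cmd_done() posts the CQE */
    case ENT_USERSPACE: return ENT_COMMIT;      /* server sends COMMIT_AND_FETCH */
    case ENT_COMMIT:    return ENT_AVAILABLE;   /* reply copied back, entry recycled */
    }
    return ENT_AVAILABLE;
}

int main(void)
{
    enum ring_ent_state s = ENT_AVAILABLE;

    for (int i = 0; i < 8; i++) {
        printf("step %d: state %d\n", i, s);
        s = next_state(s);
    }
    return 0;
}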
Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev_uring.c | 441 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 12 ++ fs/fuse/fuse_i.h | 4 + 3 files changed, 457 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 42092a234570..1030c1720990 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -24,6 +24,17 @@ bool fuse_uring_enabled(void) return enable_uring; } +static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, + int error) +{ + ent->fuse_req = NULL; + if (error) + req->out.h.error = error; + + clear_bit(FR_SENT, &req->flags); + fuse_request_end(req); +} + void fuse_uring_destruct(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -39,8 +50,11 @@ void fuse_uring_destruct(struct fuse_conn *fc) continue; WARN_ON(!list_empty(&queue->ent_avail_queue)); + WARN_ON(!list_empty(&queue->ent_w_req_queue)); WARN_ON(!list_empty(&queue->ent_commit_queue)); + WARN_ON(!list_empty(&queue->ent_in_userspace)); + kfree(queue->fpq.processing); kfree(queue); ring->queues[qid] = NULL; } @@ -99,20 +113,34 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, { struct fuse_conn *fc = ring->fc; struct fuse_ring_queue *queue; + struct list_head *pq; queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); if (!queue) return NULL; + pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); + if (!pq) { + kfree(queue); + return NULL; + } + queue->qid = qid; queue->ring = ring; spin_lock_init(&queue->lock); INIT_LIST_HEAD(&queue->ent_avail_queue); INIT_LIST_HEAD(&queue->ent_commit_queue); + INIT_LIST_HEAD(&queue->ent_w_req_queue); + INIT_LIST_HEAD(&queue->ent_in_userspace); + INIT_LIST_HEAD(&queue->fuse_req_queue); + + queue->fpq.processing = pq; + fuse_pqueue_init(&queue->fpq); spin_lock(&fc->lock); if (ring->queues[qid]) { spin_unlock(&fc->lock); + kfree(queue->fpq.processing); kfree(queue); return ring->queues[qid]; } @@ -126,6 +154,214 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, return queue; } +/* + * Checks for errors and stores it into the request + */ +static int fuse_uring_out_header_has_err(struct fuse_out_header *oh, + struct fuse_req *req, + struct fuse_conn *fc) +{ + int err; + + err = -EINVAL; + if (oh->unique == 0) { + /* Not supported through io-uring yet */ + pr_warn_once("notify through fuse-io-uring not supported\n"); + goto err; + } + + if (oh->error <= -ERESTARTSYS || oh->error > 0) + goto err; + + if (oh->error) { + err = oh->error; + goto err; + } + + err = -ENOENT; + if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) { + pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n", + req->in.h.unique, + oh->unique & ~FUSE_INT_REQ_BIT); + goto err; + } + + /* + * Is it an interrupt reply ID? + * XXX: Not supported through fuse-io-uring yet, it should not even + * find the request - should not happen. 
+ */ + WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT); + + err = 0; +err: + return err; +} + +static int fuse_uring_copy_from_ring(struct fuse_ring *ring, + struct fuse_req *req, + struct fuse_ring_ent *ent) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct iov_iter iter; + int err; + struct fuse_uring_ent_in_out ring_in_out; + + err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out, + sizeof(ring_in_out)); + if (err) + return -EFAULT; + + err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz, + &iter); + if (err) + return err; + + fuse_copy_init(&cs, 0, &iter); + cs.is_uring = 1; + cs.req = req; + + return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); +} + + /* + * Copy data from the req to the ring buffer + */ +static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, + struct fuse_ring_ent *ent) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct fuse_in_arg *in_args = args->in_args; + int num_args = args->in_numargs; + int err; + struct iov_iter iter; + struct fuse_uring_ent_in_out ent_in_out = { + .flags = 0, + .commit_id = req->in.h.unique, + }; + + err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter); + if (err) { + pr_info_ratelimited("fuse: Import of user buffer failed\n"); + return err; + } + + fuse_copy_init(&cs, 1, &iter); + cs.is_uring = 1; + cs.req = req; + + if (num_args > 0) { + /* + * Expectation is that the first argument is the per op header. + * Some op code have that as zero size. + */ + if (args->in_args[0].size > 0) { + err = copy_to_user(&ent->headers->op_in, in_args->value, + in_args->size); + if (err) { + pr_info_ratelimited( + "Copying the header failed.\n"); + return -EFAULT; + } + } + in_args++; + num_args--; + } + + /* copy the payload */ + err = fuse_copy_args(&cs, num_args, args->in_pages, + (struct fuse_arg *)in_args, 0); + if (err) { + pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); + return err; + } + + ent_in_out.payload_sz = cs.ring.copied_sz; + err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, + sizeof(ent_in_out)); + return err ? 
-EFAULT : 0; +} + +static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + int err; + + err = -EIO; + if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { + pr_err("qid=%d ring-req=%p invalid state %d on send\n", + queue->qid, ent, ent->state); + return err; + } + + err = -EINVAL; + if (WARN_ON(req->in.h.unique == 0)) + return err; + + /* copy the request */ + err = fuse_uring_args_to_ring(ring, req, ent); + if (unlikely(err)) { + pr_info_ratelimited("Copy to ring failed: %d\n", err); + return err; + } + + /* copy fuse_in_header */ + err = copy_to_user(&ent->headers->in_out, &req->in.h, + sizeof(req->in.h)); + if (err) { + err = -EFAULT; + return err; + } + + return 0; +} + +static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + int err; + + err = fuse_uring_copy_to_ring(ent, req); + if (!err) + set_bit(FR_SENT, &req->flags); + else + fuse_uring_req_end(ent, req, err); + + return err; +} + +/* + * Write data to the ring buffer and send the request to userspace, + * userspace will read it + * This is comparable with classical read(/dev/fuse) + */ +static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, + struct fuse_req *req, + unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + int err; + struct io_uring_cmd *cmd; + + err = fuse_uring_prepare_send(ent, req); + if (err) + return err; + + spin_lock(&queue->lock); + cmd = ent->cmd; + ent->cmd = NULL; + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + spin_unlock(&queue->lock); + + io_uring_cmd_done(cmd, 0, 0, issue_flags); + return 0; +} + /* * Make a ring entry available for fuse_req assignment */ @@ -137,6 +373,203 @@ static void fuse_uring_ent_avail(struct fuse_ring_ent *ent, ent->state = FRRS_AVAILABLE; } +/* Used to find the request on SQE commit */ +static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_pqueue *fpq = &queue->fpq; + unsigned int hash; + + req->ring_entry = ent; + hash = fuse_req_hash(req->in.h.unique); + list_move_tail(&req->list, &fpq->processing[hash]); +} + +/* + * Assign a fuse queue entry to the given entry + */ +static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_conn *fc = req->fm->fc; + struct fuse_iqueue *fiq = &fc->iq; + + lockdep_assert_held(&queue->lock); + + if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE && + ent->state != FRRS_COMMIT)) { + pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid, + ent->state); + } + + spin_lock(&fiq->lock); + clear_bit(FR_PENDING, &req->flags); + spin_unlock(&fiq->lock); + ent->fuse_req = req; + ent->state = FRRS_FUSE_REQ; + list_move(&ent->list, &queue->ent_w_req_queue); + fuse_uring_add_to_pq(ent, req); +} + +/* Fetch the next fuse request if available */ +static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent) + __must_hold(&queue->lock) +{ + struct fuse_req *req; + struct fuse_ring_queue *queue = ent->queue; + struct list_head *req_queue = &queue->fuse_req_queue; + + lockdep_assert_held(&queue->lock); + + /* get and assign the next entry while it is still holding the lock */ + req = list_first_entry_or_null(req_queue, struct fuse_req, list); + if (req) + fuse_uring_add_req_to_ring_ent(ent, req); + + return req; +} + +/* + * Read data from the 
ring buffer, which user space has written to + * This is comparible with handling of classical write(/dev/fuse). + * Also make the ring request available again for new fuse requests. + */ +static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req, + unsigned int issue_flags) +{ + struct fuse_ring *ring = ent->queue->ring; + struct fuse_conn *fc = ring->fc; + ssize_t err = 0; + + err = copy_from_user(&req->out.h, &ent->headers->in_out, + sizeof(req->out.h)); + if (err) { + req->out.h.error = -EFAULT; + goto out; + } + + err = fuse_uring_out_header_has_err(&req->out.h, req, fc); + if (err) { + /* req->out.h.error already set */ + goto out; + } + + err = fuse_uring_copy_from_ring(ring, req, ent); +out: + fuse_uring_req_end(ent, req, err); +} + +/* + * Get the next fuse req and send it + */ +static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent, + struct fuse_ring_queue *queue, + unsigned int issue_flags) +{ + int err; + struct fuse_req *req; + +retry: + spin_lock(&queue->lock); + fuse_uring_ent_avail(ent, queue); + req = fuse_uring_ent_assign_req(ent); + spin_unlock(&queue->lock); + + if (req) { + err = fuse_uring_send_next_to_ring(ent, req, issue_flags); + if (err) + goto retry; + } +} + +static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent) +{ + struct fuse_ring_queue *queue = ent->queue; + + lockdep_assert_held(&queue->lock); + + if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE)) + return -EIO; + + ent->state = FRRS_COMMIT; + list_move(&ent->list, &queue->ent_commit_queue); + + return 0; +} + +/* FUSE_URING_CMD_COMMIT_AND_FETCH handler */ +static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, + struct fuse_conn *fc) +{ + const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); + struct fuse_ring_ent *ent; + int err; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + uint64_t commit_id = READ_ONCE(cmd_req->commit_id); + unsigned int qid = READ_ONCE(cmd_req->qid); + struct fuse_pqueue *fpq; + struct fuse_req *req; + + err = -ENOTCONN; + if (!ring) + return err; + + if (qid >= ring->nr_queues) + return -EINVAL; + + queue = ring->queues[qid]; + if (!queue) + return err; + fpq = &queue->fpq; + + spin_lock(&queue->lock); + /* Find a request based on the unique ID of the fuse request + * This should get revised, as it needs a hash calculation and list + * search. And full struct fuse_pqueue is needed (memory overhead). + * As well as the link from req to ring_ent. + */ + req = fuse_request_find(fpq, commit_id); + err = -ENOENT; + if (!req) { + pr_info("qid=%d commit_id %llu not found\n", queue->qid, + commit_id); + spin_unlock(&queue->lock); + return err; + } + list_del_init(&req->list); + ent = req->ring_entry; + req->ring_entry = NULL; + + err = fuse_ring_ent_set_commit(ent); + if (err != 0) { + pr_info_ratelimited("qid=%d commit_id %llu state %d", + queue->qid, commit_id, ent->state); + spin_unlock(&queue->lock); + req->out.h.error = err; + clear_bit(FR_SENT, &req->flags); + fuse_request_end(req); + return err; + } + + ent->cmd = cmd; + spin_unlock(&queue->lock); + + /* without the queue lock, as other locks are taken */ + fuse_uring_commit(ent, req, issue_flags); + + /* + * Fetching the next request is absolutely required as queued + * fuse requests would otherwise not get processed - committing + * and fetching is done in one step vs legacy fuse, which has separated + * read (fetch request) and write (commit result). 
+ */ + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return 0; +} + /* * fuse_uring_req_fetch command handling */ @@ -318,6 +751,14 @@ int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, return err; } break; + case FUSE_IO_URING_CMD_COMMIT_AND_FETCH: + err = fuse_uring_commit_fetch(cmd, issue_flags, fc); + if (err) { + pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n", + err); + return err; + } + break; default: return -EINVAL; } diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index ae1536355b36..44bf237f0d5a 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -20,6 +20,9 @@ enum fuse_ring_req_state { /* The ring entry is waiting for new fuse requests */ FRRS_AVAILABLE, + /* The ring entry got assigned a fuse req */ + FRRS_FUSE_REQ, + /* The ring entry is in or on the way to user space */ FRRS_USERSPACE, }; @@ -67,7 +70,16 @@ struct fuse_ring_queue { * entries in the process of being committed or in the process * to be sent to userspace */ + struct list_head ent_w_req_queue; struct list_head ent_commit_queue; + + /* entries in userspace */ + struct list_head ent_in_userspace; + + /* fuse requests waiting for an entry slot */ + struct list_head fuse_req_queue; + + struct fuse_pqueue fpq; }; /** diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index fb981002287d..ba6901c1bc2d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -438,6 +438,10 @@ struct fuse_req { /** fuse_mount this request belongs to */ struct fuse_mount *fm; + +#ifdef CONFIG_FUSE_IO_URING + void *ring_entry; +#endif }; struct fuse_iqueue; -- cgit v1.2.3 From 4a9bfb9b6850fec0685447aed280533cf980de70 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:04 +0100 Subject: fuse: {io-uring} Handle teardown of ring entries On teardown struct file_operations::uring_cmd requests need to be completed by calling io_uring_cmd_done(). Not completing all ring entries would result in busy io-uring tasks giving warning messages in intervals and unreleased struct file. Additionally the fuse connection and with that the ring can only get released when all io-uring commands are completed. Completion is done with ring entries that are a) in waiting state for new fuse requests - io_uring_cmd_done is needed b) already in userspace - io_uring_cmd_done through teardown is not needed, the request can just get released. If fuse server is still active and commits such a ring entry, fuse_uring_cmd() already checks if the connection is active and then complete the io-uring itself with -ENOTCONN. I.e. special handling is not needed. This scheme is basically represented by the ring entry state FRRS_WAIT and FRRS_USERSPACE. Entries in state: - FRRS_INIT: No action needed, do not contribute to ring->queue_refs yet - All other states: Are currently processed by other tasks, async teardown is needed and it has to wait for the two states above. It could be also solved without an async teardown task, but would require additional if conditions in hot code paths. Also in my personal opinion the code looks cleaner with async teardown. 
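For orientation, the per-entry decision described above can be condensed into the
sketch below. This is an illustration only (the function name is made up); the real
code splits it across fuse_uring_stop_list_entries() and fuse_uring_entry_teardown()
in the diff, with all list moves and locking that are omitted here. Note that the
"waiting" state referred to as FRRS_WAIT above is named FRRS_AVAILABLE in dev_uring_i.h.

  static void sketch_teardown_entry(struct fuse_ring_ent *ent)
  {
          switch (ent->state) {
          case FRRS_AVAILABLE:
                  /* parked in the kernel waiting for a fuse request: the
                   * io_uring command must be completed so that its file
                   * reference is dropped */
                  io_uring_cmd_done(ent->cmd, -ENOTCONN, 0,
                                    IO_URING_F_UNLOCKED);
                  break;
          case FRRS_USERSPACE:
                  /* already handed to the fuse server: no command is
                   * pending in the kernel, the entry can simply be
                   * released; a late commit is rejected with -ENOTCONN */
                  break;
          default:
                  /* currently processed by another task: leave it alone,
                   * the async teardown worker will retry later */
                  break;
          }
  }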
Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 9 +++ fs/fuse/dev_uring.c | 207 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 51 +++++++++++++ 3 files changed, 267 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index aa33eba51c51..1c21e491e891 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -6,6 +6,7 @@ See the file COPYING. */ +#include "dev_uring_i.h" #include "fuse_i.h" #include "fuse_dev_i.h" @@ -2291,6 +2292,12 @@ void fuse_abort_conn(struct fuse_conn *fc) spin_unlock(&fc->lock); fuse_dev_end_requests(&to_end); + + /* + * fc->lock must not be taken to avoid conflicts with io-uring + * locks + */ + fuse_uring_abort(fc); } else { spin_unlock(&fc->lock); } @@ -2302,6 +2309,8 @@ void fuse_wait_aborted(struct fuse_conn *fc) /* matches implicit memory barrier in fuse_drop_waiting() */ smp_mb(); wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0); + + fuse_uring_wait_stopped_queues(fc); } int fuse_dev_release(struct inode *inode, struct file *file) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 1030c1720990..1161b9aa5e11 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -35,6 +35,37 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, fuse_request_end(req); } +/* Abort all list queued request on the given ring queue */ +static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) +{ + struct fuse_req *req; + LIST_HEAD(req_list); + + spin_lock(&queue->lock); + list_for_each_entry(req, &queue->fuse_req_queue, list) + clear_bit(FR_PENDING, &req->flags); + list_splice_init(&queue->fuse_req_queue, &req_list); + spin_unlock(&queue->lock); + + /* must not hold queue lock to avoid order issues with fi->lock */ + fuse_dev_end_requests(&req_list); +} + +void fuse_uring_abort_end_requests(struct fuse_ring *ring) +{ + int qid; + struct fuse_ring_queue *queue; + + for (qid = 0; qid < ring->nr_queues; qid++) { + queue = READ_ONCE(ring->queues[qid]); + if (!queue) + continue; + + queue->stopped = true; + fuse_uring_abort_end_queue_requests(queue); + } +} + void fuse_uring_destruct(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -94,10 +125,13 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) goto out_err; } + init_waitqueue_head(&ring->stop_waitq); + fc->ring = ring; ring->nr_queues = nr_queues; ring->fc = fc; ring->max_payload_sz = max_payload_size; + atomic_set(&ring->queue_refs, 0); spin_unlock(&fc->lock); return ring; @@ -154,6 +188,175 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, return queue; } +static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) +{ + clear_bit(FR_SENT, &req->flags); + req->out.h.error = -ECONNABORTED; + fuse_request_end(req); +} + +/* + * Release a request/entry on connection tear down + */ +static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) +{ + struct fuse_req *req; + struct io_uring_cmd *cmd; + + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + cmd = ent->cmd; + ent->cmd = NULL; + req = ent->fuse_req; + ent->fuse_req = NULL; + if (req) { + /* remove entry from queue->fpq->processing */ + list_del_init(&req->list); + } + spin_unlock(&queue->lock); + + if (cmd) + io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); + + if (req) + fuse_uring_stop_fuse_req_end(req); + + list_del_init(&ent->list); + kfree(ent); +} + 
+static void fuse_uring_stop_list_entries(struct list_head *head, + struct fuse_ring_queue *queue, + enum fuse_ring_req_state exp_state) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_ring_ent *ent, *next; + ssize_t queue_refs = SSIZE_MAX; + LIST_HEAD(to_teardown); + + spin_lock(&queue->lock); + list_for_each_entry_safe(ent, next, head, list) { + if (ent->state != exp_state) { + pr_warn("entry teardown qid=%d state=%d expected=%d", + queue->qid, ent->state, exp_state); + continue; + } + + list_move(&ent->list, &to_teardown); + } + spin_unlock(&queue->lock); + + /* no queue lock to avoid lock order issues */ + list_for_each_entry_safe(ent, next, &to_teardown, list) { + fuse_uring_entry_teardown(ent); + queue_refs = atomic_dec_return(&ring->queue_refs); + WARN_ON_ONCE(queue_refs < 0); + } +} + +static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) +{ + fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue, + FRRS_USERSPACE); + fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue, + FRRS_AVAILABLE); +} + +/* + * Log state debug info + */ +static void fuse_uring_log_ent_state(struct fuse_ring *ring) +{ + int qid; + struct fuse_ring_ent *ent; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = ring->queues[qid]; + + if (!queue) + continue; + + spin_lock(&queue->lock); + /* + * Log entries from the intermediate queue, the other queues + * should be empty + */ + list_for_each_entry(ent, &queue->ent_w_req_queue, list) { + pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n", + ring, qid, ent, ent->state); + } + list_for_each_entry(ent, &queue->ent_commit_queue, list) { + pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", + ring, qid, ent, ent->state); + } + spin_unlock(&queue->lock); + } + ring->stop_debug_log = 1; +} + +static void fuse_uring_async_stop_queues(struct work_struct *work) +{ + int qid; + struct fuse_ring *ring = + container_of(work, struct fuse_ring, async_teardown_work.work); + + /* XXX code dup */ + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); + + if (!queue) + continue; + + fuse_uring_teardown_entries(queue); + } + + /* + * Some ring entries might be in the middle of IO operations, + * i.e. 
in process to get handled by file_operations::uring_cmd + * or on the way to userspace - we could handle that with conditions in + * run time code, but easier/cleaner to have an async tear down handler + * If there are still queue references left + */ + if (atomic_read(&ring->queue_refs) > 0) { + if (time_after(jiffies, + ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT)) + fuse_uring_log_ent_state(ring); + + schedule_delayed_work(&ring->async_teardown_work, + FUSE_URING_TEARDOWN_INTERVAL); + } else { + wake_up_all(&ring->stop_waitq); + } +} + +/* + * Stop the ring queues + */ +void fuse_uring_stop_queues(struct fuse_ring *ring) +{ + int qid; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); + + if (!queue) + continue; + + fuse_uring_teardown_entries(queue); + } + + if (atomic_read(&ring->queue_refs) > 0) { + ring->teardown_time = jiffies; + INIT_DELAYED_WORK(&ring->async_teardown_work, + fuse_uring_async_stop_queues); + schedule_delayed_work(&ring->async_teardown_work, + FUSE_URING_TEARDOWN_INTERVAL); + } else { + wake_up_all(&ring->stop_waitq); + } +} + /* * Checks for errors and stores it into the request */ @@ -525,6 +728,9 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, return err; fpq = &queue->fpq; + if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped)) + return err; + spin_lock(&queue->lock); /* Find a request based on the unique ID of the fuse request * This should get revised, as it needs a hash calculation and list @@ -652,6 +858,7 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, ent->headers = iov[0].iov_base; ent->payload = iov[1].iov_base; + atomic_inc(&ring->queue_refs); return ent; } diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 44bf237f0d5a..a4316e118cbd 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -11,6 +11,9 @@ #ifdef CONFIG_FUSE_IO_URING +#define FUSE_URING_TEARDOWN_TIMEOUT (5 * HZ) +#define FUSE_URING_TEARDOWN_INTERVAL (HZ/20) + enum fuse_ring_req_state { FRRS_INVALID = 0, @@ -80,6 +83,8 @@ struct fuse_ring_queue { struct list_head fuse_req_queue; struct fuse_pqueue fpq; + + bool stopped; }; /** @@ -97,12 +102,51 @@ struct fuse_ring { size_t max_payload_sz; struct fuse_ring_queue **queues; + + /* + * Log ring entry states on stop when entries cannot be released + */ + unsigned int stop_debug_log : 1; + + wait_queue_head_t stop_waitq; + + /* async tear down */ + struct delayed_work async_teardown_work; + + /* log */ + unsigned long teardown_time; + + atomic_t queue_refs; }; bool fuse_uring_enabled(void); void fuse_uring_destruct(struct fuse_conn *fc); +void fuse_uring_stop_queues(struct fuse_ring *ring); +void fuse_uring_abort_end_requests(struct fuse_ring *ring); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +static inline void fuse_uring_abort(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + + if (ring == NULL) + return; + + if (atomic_read(&ring->queue_refs) > 0) { + fuse_uring_abort_end_requests(ring); + fuse_uring_stop_queues(ring); + } +} + +static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + + if (ring) + wait_event(ring->stop_waitq, + atomic_read(&ring->queue_refs) == 0); +} + #else /* CONFIG_FUSE_IO_URING */ struct fuse_ring; @@ -120,6 +164,13 @@ static inline bool fuse_uring_enabled(void) return false; } +static inline void fuse_uring_abort(struct fuse_conn *fc) +{ +} + +static inline void 
fuse_uring_wait_stopped_queues(struct fuse_conn *fc) +{ +} #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ -- cgit v1.2.3 From ba74ba571189668697a8d8da906ad6d44762ebc6 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:05 +0100 Subject: fuse: {io-uring} Make fuse_dev_queue_{interrupt,forget} non-static These functions are also needed by fuse-over-io-uring. Signed-off-by: Bernd Schubert Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 5 +++-- fs/fuse/fuse_dev_i.h | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1c21e491e891..ecf2f805f456 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -237,7 +237,8 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); } -static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget) +void fuse_dev_queue_forget(struct fuse_iqueue *fiq, + struct fuse_forget_link *forget) { spin_lock(&fiq->lock); if (fiq->connected) { @@ -250,7 +251,7 @@ static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_li } } -static void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->lock); if (list_empty(&req->intr_entry)) { diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index b64ab84cbc0d..3b2bfe1248d3 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -16,6 +16,8 @@ struct fuse_arg; struct fuse_args; struct fuse_pqueue; struct fuse_req; +struct fuse_iqueue; +struct fuse_forget_link; struct fuse_copy_state { int write; @@ -56,6 +58,9 @@ int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, int zeroing); int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, unsigned int nbytes); +void fuse_dev_queue_forget(struct fuse_iqueue *fiq, + struct fuse_forget_link *forget); +void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req); #endif -- cgit v1.2.3 From c2c9af9a0b13261c36909036057a116f2edb5e1a Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:06 +0100 Subject: fuse: Allow to queue fg requests through io-uring This prepares queueing and sending foreground requests through io-uring. 
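The core of the flow added below: a request is mapped to the per-CPU ring queue of
the submitting task and either handed to a waiting ring entry immediately or parked
until one registers. A condensed sketch follows (illustrative function name; unique-id
assignment, error paths and the stopped-queue check are omitted - see
fuse_uring_queue_fuse_req() in the diff for the real implementation):

  static void sketch_queue_fg_req(struct fuse_req *req)
  {
          struct fuse_ring *ring = req->fm->fc->ring;
          /* one queue per CPU: pick the queue of the submitting task */
          struct fuse_ring_queue *queue = ring->queues[task_cpu(current)];
          struct fuse_ring_ent *ent;

          spin_lock(&queue->lock);
          ent = list_first_entry_or_null(&queue->ent_avail_queue,
                                         struct fuse_ring_ent, list);
          if (ent)
                  fuse_uring_add_req_to_ring_ent(ent, req);
          else
                  list_add_tail(&req->list, &queue->fuse_req_queue);
          spin_unlock(&queue->lock);

          if (ent)
                  /* copy and completion happen in the server's task context */
                  fuse_uring_dispatch_ent(ent);
  }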
Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev_uring.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 8 +++ 2 files changed, 188 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 1161b9aa5e11..728000434589 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -24,6 +24,29 @@ bool fuse_uring_enabled(void) return enable_uring; } +struct fuse_uring_pdu { + struct fuse_ring_ent *ent; +}; + +static const struct fuse_iqueue_ops fuse_io_uring_ops; + +static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd, + struct fuse_ring_ent *ring_ent) +{ + struct fuse_uring_pdu *pdu = + io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); + + pdu->ent = ring_ent; +} + +static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) +{ + struct fuse_uring_pdu *pdu = + io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); + + return pdu->ent; +} + static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, int error) { @@ -776,6 +799,31 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, return 0; } +static bool is_ring_ready(struct fuse_ring *ring, int current_qid) +{ + int qid; + struct fuse_ring_queue *queue; + bool ready = true; + + for (qid = 0; qid < ring->nr_queues && ready; qid++) { + if (current_qid == qid) + continue; + + queue = ring->queues[qid]; + if (!queue) { + ready = false; + break; + } + + spin_lock(&queue->lock); + if (list_empty(&queue->ent_avail_queue)) + ready = false; + spin_unlock(&queue->lock); + } + + return ready; +} + /* * fuse_uring_req_fetch command handling */ @@ -784,11 +832,23 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, unsigned int issue_flags) { struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + struct fuse_iqueue *fiq = &fc->iq; spin_lock(&queue->lock); ent->cmd = cmd; fuse_uring_ent_avail(ent, queue); spin_unlock(&queue->lock); + + if (!ring->ready) { + bool ready = is_ring_ready(ring, queue->qid); + + if (ready) { + WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); + WRITE_ONCE(ring->ready, true); + } + } } /* @@ -972,3 +1032,123 @@ int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, return -EIOCBQUEUED; } + +static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, + ssize_t ret, unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + ent->cmd = NULL; + spin_unlock(&queue->lock); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); +} + +/* + * This prepares and sends the ring request in fuse-uring task context. + * User buffers are not mapped yet - the application does not have permission + * to write to it - this has to be executed in ring task context. 
+ */ +static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); + struct fuse_ring_queue *queue = ent->queue; + int err; + + if (!(issue_flags & IO_URING_F_TASK_DEAD)) { + err = fuse_uring_prepare_send(ent, ent->fuse_req); + if (err) { + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return; + } + } else { + err = -ECANCELED; + } + + fuse_uring_send(ent, cmd, err, issue_flags); +} + +static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) +{ + unsigned int qid; + struct fuse_ring_queue *queue; + + qid = task_cpu(current); + + if (WARN_ONCE(qid >= ring->nr_queues, + "Core number (%u) exceeds nr queues (%zu)\n", qid, + ring->nr_queues)) + qid = 0; + + queue = ring->queues[qid]; + WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); + + return queue; +} + +static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) +{ + struct io_uring_cmd *cmd = ent->cmd; + + uring_cmd_set_ring_ent(cmd, ent); + io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); +} + +/* queue a fuse request and send it if a ring entry is available */ +void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent = NULL; + int err; + + err = -EINVAL; + queue = fuse_uring_task_to_queue(ring); + if (!queue) + goto err; + + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(fiq); + + spin_lock(&queue->lock); + err = -ENOTCONN; + if (unlikely(queue->stopped)) + goto err_unlock; + + ent = list_first_entry_or_null(&queue->ent_avail_queue, + struct fuse_ring_ent, list); + if (ent) + fuse_uring_add_req_to_ring_ent(ent, req); + else + list_add_tail(&req->list, &queue->fuse_req_queue); + spin_unlock(&queue->lock); + + if (ent) + fuse_uring_dispatch_ent(ent); + + return; + +err_unlock: + spin_unlock(&queue->lock); +err: + req->out.h.error = err; + clear_bit(FR_PENDING, &req->flags); + fuse_request_end(req); +} + +static const struct fuse_iqueue_ops fuse_io_uring_ops = { + /* should be send over io-uring as enhancement */ + .send_forget = fuse_dev_queue_forget, + + /* + * could be send over io-uring, but interrupts should be rare, + * no need to make the code complex + */ + .send_interrupt = fuse_dev_queue_interrupt, + .send_req = fuse_uring_queue_fuse_req, +}; diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index a4316e118cbd..0517a6eafc91 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -117,6 +117,8 @@ struct fuse_ring { unsigned long teardown_time; atomic_t queue_refs; + + bool ready; }; bool fuse_uring_enabled(void); @@ -124,6 +126,7 @@ void fuse_uring_destruct(struct fuse_conn *fc); void fuse_uring_stop_queues(struct fuse_ring *ring); void fuse_uring_abort_end_requests(struct fuse_ring *ring); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); static inline void fuse_uring_abort(struct fuse_conn *fc) { @@ -147,6 +150,11 @@ static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) atomic_read(&ring->queue_refs) == 0); } +static inline bool fuse_uring_ready(struct fuse_conn *fc) +{ + return fc->ring && fc->ring->ready; +} + #else /* CONFIG_FUSE_IO_URING */ struct fuse_ring; -- cgit v1.2.3 From 857b0263f30eebe13ab4b6a65156a0d6c8fc2210 Mon Sep 17 00:00:00 2001 From: Bernd Schubert 
Date: Mon, 20 Jan 2025 02:29:07 +0100 Subject: fuse: Allow to queue bg requests through io-uring This prepares queueing and sending background requests through io-uring. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 26 +++++++++++++- fs/fuse/dev_uring.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 12 +++++++ 3 files changed, 136 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ecf2f805f456..1b593b23f7b8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -568,7 +568,25 @@ ssize_t __fuse_simple_request(struct mnt_idmap *idmap, return ret; } -static bool fuse_request_queue_background(struct fuse_req *req) +#ifdef CONFIG_FUSE_IO_URING +static bool fuse_request_queue_background_uring(struct fuse_conn *fc, + struct fuse_req *req) +{ + struct fuse_iqueue *fiq = &fc->iq; + + req->in.h.unique = fuse_get_unique(fiq); + req->in.h.len = sizeof(struct fuse_in_header) + + fuse_len_args(req->args->in_numargs, + (struct fuse_arg *) req->args->in_args); + + return fuse_uring_queue_bq_req(req); +} +#endif + +/* + * @return true if queued + */ +static int fuse_request_queue_background(struct fuse_req *req) { struct fuse_mount *fm = req->fm; struct fuse_conn *fc = fm->fc; @@ -580,6 +598,12 @@ static bool fuse_request_queue_background(struct fuse_req *req) atomic_inc(&fc->num_waiting); } __set_bit(FR_ISREPLY, &req->flags); + +#ifdef CONFIG_FUSE_IO_URING + if (fuse_uring_ready(fc)) + return fuse_request_queue_background_uring(fc, req); +#endif + spin_lock(&fc->bg_lock); if (likely(fc->connected)) { fc->num_background++; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 728000434589..27bc103c17c8 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -47,10 +47,53 @@ static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) return pdu->ent; } +static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + + lockdep_assert_held(&queue->lock); + lockdep_assert_held(&fc->bg_lock); + + /* + * Allow one bg request per queue, ignoring global fc limits. + * This prevents a single queue from consuming all resources and + * eliminates the need for remote queue wake-ups when global + * limits are met but this queue has no more waiting requests. 
+ */ + while ((fc->active_background < fc->max_background || + !queue->active_background) && + (!list_empty(&queue->fuse_req_bg_queue))) { + struct fuse_req *req; + + req = list_first_entry(&queue->fuse_req_bg_queue, + struct fuse_req, list); + fc->active_background++; + queue->active_background++; + + list_move_tail(&req->list, &queue->fuse_req_queue); + } +} + static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, int error) { + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + + lockdep_assert_not_held(&queue->lock); + spin_lock(&queue->lock); ent->fuse_req = NULL; + if (test_bit(FR_BACKGROUND, &req->flags)) { + queue->active_background--; + spin_lock(&fc->bg_lock); + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + } + + spin_unlock(&queue->lock); + if (error) req->out.h.error = error; @@ -78,6 +121,7 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) { int qid; struct fuse_ring_queue *queue; + struct fuse_conn *fc = ring->fc; for (qid = 0; qid < ring->nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); @@ -85,6 +129,13 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) continue; queue->stopped = true; + + WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); + spin_lock(&queue->lock); + spin_lock(&fc->bg_lock); + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + spin_unlock(&queue->lock); fuse_uring_abort_end_queue_requests(queue); } } @@ -190,6 +241,7 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, INIT_LIST_HEAD(&queue->ent_w_req_queue); INIT_LIST_HEAD(&queue->ent_in_userspace); INIT_LIST_HEAD(&queue->fuse_req_queue); + INIT_LIST_HEAD(&queue->fuse_req_bg_queue); queue->fpq.processing = pq; fuse_pqueue_init(&queue->fpq); @@ -1141,6 +1193,53 @@ err: fuse_request_end(req); } +bool fuse_uring_queue_bq_req(struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent = NULL; + + queue = fuse_uring_task_to_queue(ring); + if (!queue) + return false; + + spin_lock(&queue->lock); + if (unlikely(queue->stopped)) { + spin_unlock(&queue->lock); + return false; + } + + list_add_tail(&req->list, &queue->fuse_req_bg_queue); + + ent = list_first_entry_or_null(&queue->ent_avail_queue, + struct fuse_ring_ent, list); + spin_lock(&fc->bg_lock); + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + + /* + * Due to bg_queue flush limits there might be other bg requests + * in the queue that need to be handled first. Or no further req + * might be available. 
+ */ + req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req, + list); + if (ent && req) { + fuse_uring_add_req_to_ring_ent(ent, req); + spin_unlock(&queue->lock); + + fuse_uring_dispatch_ent(ent); + } else { + spin_unlock(&queue->lock); + } + + return true; +} + static const struct fuse_iqueue_ops fuse_io_uring_ops = { /* should be send over io-uring as enhancement */ .send_forget = fuse_dev_queue_forget, diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 0517a6eafc91..0182be61778b 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -82,8 +82,13 @@ struct fuse_ring_queue { /* fuse requests waiting for an entry slot */ struct list_head fuse_req_queue; + /* background fuse requests */ + struct list_head fuse_req_bg_queue; + struct fuse_pqueue fpq; + unsigned int active_background; + bool stopped; }; @@ -127,6 +132,7 @@ void fuse_uring_stop_queues(struct fuse_ring *ring); void fuse_uring_abort_end_requests(struct fuse_ring *ring); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); +bool fuse_uring_queue_bq_req(struct fuse_req *req); static inline void fuse_uring_abort(struct fuse_conn *fc) { @@ -179,6 +185,12 @@ static inline void fuse_uring_abort(struct fuse_conn *fc) static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) { } + +static inline bool fuse_uring_ready(struct fuse_conn *fc) +{ + return false; +} + #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ -- cgit v1.2.3 From b6236c8407cba5d7a108facb1bcfab24994d3814 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:08 +0100 Subject: fuse: {io-uring} Prevent mount point hang on fuse-server termination When the fuse-server terminates while the fuse-client or kernel still has queued URING_CMDs, these commands retain references to the struct file used by the fuse connection. This prevents fuse_dev_release() from being invoked, resulting in a hung mount point. This patch addresses the issue by making queued URING_CMDs cancelable, allowing fuse_dev_release() to proceed as expected and preventing the mount point from hanging. 
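The mechanism is the usual two-step io_uring cancel pattern, sketched below with
illustrative names (the state checks done under queue->lock in the real
fuse_uring_cancel() are omitted): parked commands are marked cancelable, and when
->uring_cmd() is re-invoked with IO_URING_F_CANCEL the parked command is completed
so that its file reference goes away.

  /* 1) when a command is parked waiting for work, make it cancelable */
  static void sketch_park_cmd(struct io_uring_cmd *cmd,
                              unsigned int issue_flags,
                              struct fuse_ring_ent *ent)
  {
          uring_cmd_set_ring_ent(cmd, ent);
          io_uring_cmd_mark_cancelable(cmd, issue_flags);
  }

  /* 2) on server termination io_uring re-issues the command with
   *    IO_URING_F_CANCEL set */
  static int sketch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
  {
          if (issue_flags & IO_URING_F_CANCEL) {
                  /* completing the command drops its reference on the
                   * /dev/fuse file, letting fuse_dev_release() proceed */
                  io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);
                  return 0;
          }
          /* ... normal FUSE_IO_URING_CMD_* handling ... */
          return -EIOCBQUEUED;
  }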
Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev_uring.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++--- fs/fuse/dev_uring_i.h | 9 +++++++ 2 files changed, 75 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 27bc103c17c8..fa0451176385 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -150,6 +150,7 @@ void fuse_uring_destruct(struct fuse_conn *fc) for (qid = 0; qid < ring->nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; + struct fuse_ring_ent *ent, *next; if (!queue) continue; @@ -159,6 +160,12 @@ void fuse_uring_destruct(struct fuse_conn *fc) WARN_ON(!list_empty(&queue->ent_commit_queue)); WARN_ON(!list_empty(&queue->ent_in_userspace)); + list_for_each_entry_safe(ent, next, &queue->ent_released, + list) { + list_del_init(&ent->list); + kfree(ent); + } + kfree(queue->fpq.processing); kfree(queue); ring->queues[qid] = NULL; @@ -242,6 +249,7 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, INIT_LIST_HEAD(&queue->ent_in_userspace); INIT_LIST_HEAD(&queue->fuse_req_queue); INIT_LIST_HEAD(&queue->fuse_req_bg_queue); + INIT_LIST_HEAD(&queue->ent_released); queue->fpq.processing = pq; fuse_pqueue_init(&queue->fpq); @@ -289,6 +297,15 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) /* remove entry from queue->fpq->processing */ list_del_init(&req->list); } + + /* + * The entry must not be freed immediately, due to access of direct + * pointer access of entries through IO_URING_F_CANCEL - there is a risk + * of race between daemon termination (which triggers IO_URING_F_CANCEL + * and accesses entries without checking the list state first + */ + list_move(&ent->list, &queue->ent_released); + ent->state = FRRS_RELEASED; spin_unlock(&queue->lock); if (cmd) @@ -296,9 +313,6 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) if (req) fuse_uring_stop_fuse_req_end(req); - - list_del_init(&ent->list); - kfree(ent); } static void fuse_uring_stop_list_entries(struct list_head *head, @@ -318,6 +332,7 @@ static void fuse_uring_stop_list_entries(struct list_head *head, continue; } + ent->state = FRRS_TEARDOWN; list_move(&ent->list, &to_teardown); } spin_unlock(&queue->lock); @@ -432,6 +447,46 @@ void fuse_uring_stop_queues(struct fuse_ring *ring) } } +/* + * Handle IO_URING_F_CANCEL, typically should come on daemon termination. 
+ * + * Releasing the last entry should trigger fuse_dev_release() if + * the daemon was terminated + */ +static void fuse_uring_cancel(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); + struct fuse_ring_queue *queue; + bool need_cmd_done = false; + + /* + * direct access on ent - it must not be destructed as long as + * IO_URING_F_CANCEL might come up + */ + queue = ent->queue; + spin_lock(&queue->lock); + if (ent->state == FRRS_AVAILABLE) { + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + need_cmd_done = true; + ent->cmd = NULL; + } + spin_unlock(&queue->lock); + + if (need_cmd_done) { + /* no queue lock to avoid lock order issues */ + io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); + } +} + +static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, + struct fuse_ring_ent *ring_ent) +{ + uring_cmd_set_ring_ent(cmd, ring_ent); + io_uring_cmd_mark_cancelable(cmd, issue_flags); +} + /* * Checks for errors and stores it into the request */ @@ -839,6 +894,7 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, spin_unlock(&queue->lock); /* without the queue lock, as other locks are taken */ + fuse_uring_prepare_cancel(cmd, issue_flags, ent); fuse_uring_commit(ent, req, issue_flags); /* @@ -888,6 +944,8 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, struct fuse_conn *fc = ring->fc; struct fuse_iqueue *fiq = &fc->iq; + fuse_uring_prepare_cancel(cmd, issue_flags, ent); + spin_lock(&queue->lock); ent->cmd = cmd; fuse_uring_ent_avail(ent, queue); @@ -1038,6 +1096,11 @@ int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, return -EOPNOTSUPP; } + if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { + fuse_uring_cancel(cmd, issue_flags); + return 0; + } + /* This extra SQE size holds struct fuse_uring_cmd_req */ if (!(issue_flags & IO_URING_F_SQE128)) return -EINVAL; diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 0182be61778b..2102b3d0c1ae 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -28,6 +28,12 @@ enum fuse_ring_req_state { /* The ring entry is in or on the way to user space */ FRRS_USERSPACE, + + /* The ring entry is in teardown */ + FRRS_TEARDOWN, + + /* The ring entry is released, but not freed yet */ + FRRS_RELEASED, }; /** A fuse ring entry, part of the ring queue */ @@ -79,6 +85,9 @@ struct fuse_ring_queue { /* entries in userspace */ struct list_head ent_in_userspace; + /* entries that are released */ + struct list_head ent_released; + /* fuse requests waiting for an entry slot */ struct list_head fuse_req_queue; -- cgit v1.2.3 From 3393ff964e0fa5def66570c54a4612bf9df06b76 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:09 +0100 Subject: fuse: block request allocation until io-uring init is complete Avoid races and block request allocation until io-uring queues are ready. 
This is a especially important for background requests, as bg request completion might cause lock order inversion of the typical queue->lock and then fc->bg_lock fuse_request_end spin_lock(&fc->bg_lock); flush_bg_queue fuse_send_one fuse_uring_queue_fuse_req spin_lock(&queue->lock); Signed-off-by: Bernd Schubert Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 3 ++- fs/fuse/dev_uring.c | 3 +++ fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 2 ++ 4 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1b593b23f7b8..f002e8a096f9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -76,7 +76,8 @@ void fuse_set_initialized(struct fuse_conn *fc) static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) { - return !fc->initialized || (for_background && fc->blocked); + return !fc->initialized || (for_background && fc->blocked) || + (fc->io_uring && !fuse_uring_ready(fc)); } static void fuse_drop_waiting(struct fuse_conn *fc) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index fa0451176385..ea197ccd4c51 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -957,6 +957,7 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, if (ready) { WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); WRITE_ONCE(ring->ready, true); + wake_up_all(&fc->blocked_waitq); } } } @@ -1130,6 +1131,8 @@ int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, if (err) { pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", err); + fc->io_uring = 0; + wake_up_all(&fc->blocked_waitq); return err; } break; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ba6901c1bc2d..fee96fe7887b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -867,6 +867,9 @@ struct fuse_conn { /* Use pages instead of pointer for kernel I/O */ unsigned int use_pages_for_kvec_io:1; + /* Use io_uring for communication */ + unsigned int io_uring; + /** Maximum stack depth for passthrough backing files */ int max_stack_depth; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 328797b9aac9..e9db2cb8c150 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1390,6 +1390,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, else ok = false; } + if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) + fc->io_uring = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; -- cgit v1.2.3 From 786412a73e7ee5b00ef3437bbf2f3a250759b2ae Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:10 +0100 Subject: fuse: enable fuse-over-io-uring All required parts are handled now, fuse-io-uring can be enabled. 
Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 3 +++ fs/fuse/dev_uring.c | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f002e8a096f9..5b5f789b37eb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2493,6 +2493,9 @@ const struct file_operations fuse_dev_operations = { .fasync = fuse_dev_fasync, .unlocked_ioctl = fuse_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, +#ifdef CONFIG_FUSE_IO_URING + .uring_cmd = fuse_uring_cmd, +#endif }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index ea197ccd4c51..3bdc75518e5b 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1084,8 +1084,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, * Entry function from io_uring to handle the given passthrough command * (op code IORING_OP_URING_CMD) */ -int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, - unsigned int issue_flags) +int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct fuse_dev *fud; struct fuse_conn *fc; -- cgit v1.2.3 From 2d4fde59fd502a65c1698b61ad4d0f10a9ab665a Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Thu, 23 Jan 2025 17:55:32 +0100 Subject: fuse: prevent disabling io-uring on active connections The enable_uring module parameter allows administrators to enable/disable io-uring support for FUSE at runtime. However, disabling io-uring while connections already have it enabled can lead to an inconsistent state. Fix this by keeping io-uring enabled on connections that were already using it, even if the module parameter is later disabled. This ensures active FUSE mounts continue to function correctly. Signed-off-by: Bernd Schubert Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev_uring.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 3bdc75518e5b..ebd2931b4f2a 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1091,11 +1091,6 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) u32 cmd_op = cmd->cmd_op; int err; - if (!enable_uring) { - pr_info_ratelimited("fuse-io-uring is disabled\n"); - return -EOPNOTSUPP; - } - if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { fuse_uring_cancel(cmd, issue_flags); return 0; @@ -1112,6 +1107,12 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) } fc = fud->fc; + /* Once a connection has io-uring enabled on it, it can't be disabled */ + if (!enable_uring && !fc->io_uring) { + pr_info_ratelimited("fuse-io-uring is disabled\n"); + return -EOPNOTSUPP; + } + if (fc->aborted) return -ECONNABORTED; if (!fc->connected) -- cgit v1.2.3 From 95a4ccbbe596b45264af5e3a019cb920c05ccffd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Dec 2024 20:03:33 -0500 Subject: dissolve external_name.u into separate members ... and document the constraints on the layout. Kept separate from the previous commit to keep the noise separate from actual changes. The reason for explicit __aligned() on ->name[] rather than relying upon the alignment of the previous field is that the previous iteration of that commit tried to save 4 bytes on 64bit by eliminating a hole in there, which broke the assumptions in dentry_string_cmp(). Better spell it out and avoid the temptation for the future... 
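To illustrate the assumption being protected, here is a simplified userspace-style
sketch, not the kernel helper itself: with CONFIG_DCACHE_WORD_ACCESS the stored
dentry name is compared a machine word at a time, which is only safe if ->name[]
is word-aligned - exactly what the explicit __aligned() guarantees. The real
dentry_string_cmp() is more careful (load_unaligned_zeropad() for the key, tail
masking), but the alignment requirement is the same.

  #include <string.h>

  /* 'stored' must be unsigned-long-aligned; 'key' may be unaligned */
  static int sketch_name_eq(const unsigned char *stored,
                            const unsigned char *key, size_t len)
  {
          while (len >= sizeof(unsigned long)) {
                  unsigned long a = *(const unsigned long *)stored;
                  unsigned long b;

                  memcpy(&b, key, sizeof(b));
                  if (a != b)
                          return 0;
                  stored += sizeof(unsigned long);
                  key += sizeof(unsigned long);
                  len -= sizeof(unsigned long);
          }
          return memcmp(stored, key, len) == 0;
  }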
Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/dcache.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index f387dc97df86..f8d6a2557736 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -295,12 +295,16 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c return dentry_string_cmp(cs, ct, tcount); } +/* + * long names are allocated separately from dentry and never modified. + * Refcounted, freeing is RCU-delayed. See take_dentry_name_snapshot() + * for the reason why ->count and ->head can't be combined into a union. + * dentry_string_cmp() relies upon ->name[] being word-aligned. + */ struct external_name { - struct { - atomic_t count; // ->count and ->head can't be combined - struct rcu_head head; // see take_dentry_name_snapshot() - } u; - unsigned char name[]; + atomic_t count; + struct rcu_head head; + unsigned char name[] __aligned(sizeof(unsigned long)); }; static inline struct external_name *external_name(struct dentry *dentry) @@ -344,7 +348,7 @@ retry: struct external_name *p; p = container_of(s, struct external_name, name[0]); // get a valid reference - if (unlikely(!atomic_inc_not_zero(&p->u.count))) + if (unlikely(!atomic_inc_not_zero(&p->count))) goto retry; name->name.name = s; } @@ -361,8 +365,8 @@ void release_dentry_name_snapshot(struct name_snapshot *name) if (unlikely(name->name.name != name->inline_name.string)) { struct external_name *p; p = container_of(name->name.name, struct external_name, name[0]); - if (unlikely(atomic_dec_and_test(&p->u.count))) - kfree_rcu(p, u.head); + if (unlikely(atomic_dec_and_test(&p->count))) + kfree_rcu(p, head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); @@ -400,7 +404,7 @@ static void dentry_free(struct dentry *dentry) WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); - if (likely(atomic_dec_and_test(&p->u.count))) { + if (likely(atomic_dec_and_test(&p->count))) { call_rcu(&dentry->d_u.d_rcu, __d_free_external); return; } @@ -1681,7 +1685,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) kmem_cache_free(dentry_cache, dentry); return NULL; } - atomic_set(&p->u.count, 1); + atomic_set(&p->count, 1); dname = p->name; } else { dname = dentry->d_shortname.string; @@ -2774,15 +2778,15 @@ static void copy_name(struct dentry *dentry, struct dentry *target) if (unlikely(dname_external(dentry))) old_name = external_name(dentry); if (unlikely(dname_external(target))) { - atomic_inc(&external_name(target)->u.count); + atomic_inc(&external_name(target)->count); dentry->d_name = target->d_name; } else { dentry->d_shortname = target->d_shortname; dentry->d_name.name = dentry->d_shortname.string; dentry->d_name.hash_len = target->d_name.hash_len; } - if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) - kfree_rcu(old_name, u.head); + if (old_name && likely(atomic_dec_and_test(&old_name->count))) + kfree_rcu(old_name, head); } /* -- cgit v1.2.3 From 7e3270165a8704ffb3948ef78f139aefc033135a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Dec 2024 20:10:42 -0500 Subject: ext4 fast_commit: make use of name_snapshot primitives ... rather than open-coding them. As a bonus, that avoids the pointless work with extra allocations, etc. for long names. 
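The primitives being switched to follow a simple take/use/release pattern. A minimal
usage sketch (illustrative function; the fast-commit code below stores the snapshot
in struct ext4_fc_dentry_update rather than a local):

  static void sketch_use_name(struct dentry *dentry)
  {
          struct name_snapshot snap;

          take_dentry_name_snapshot(&snap, dentry);
          /* snap.name is a stable qstr: safe to use even if the dentry is
           * renamed meanwhile (long names are refcounted, short ones copied) */
          pr_info("name: %.*s\n", (int)snap.name.len, snap.name.name);
          release_dentry_name_snapshot(&snap);
  }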
Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/ext4/fast_commit.c | 29 +++++------------------------ fs/ext4/fast_commit.h | 3 +-- 2 files changed, 6 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 26c4fc37edcf..da4263a14a20 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -322,9 +322,7 @@ restart: WARN_ON(!list_empty(&ei->i_fc_dilist)); spin_unlock(&sbi->s_fc_lock); - if (fc_dentry->fcd_name.name && - fc_dentry->fcd_name.len > DNAME_INLINE_LEN) - kfree(fc_dentry->fcd_name.name); + release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); return; @@ -449,22 +447,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, node->fcd_op = dentry_update->op; node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; - if (dentry->d_name.len > DNAME_INLINE_LEN) { - node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); - if (!node->fcd_name.name) { - kmem_cache_free(ext4_fc_dentry_cachep, node); - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); - mutex_lock(&ei->i_fc_lock); - return -ENOMEM; - } - memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, - dentry->d_name.len); - } else { - memcpy(node->fcd_iname, dentry->d_name.name, - dentry->d_name.len); - node->fcd_name.name = node->fcd_iname; - } - node->fcd_name.len = dentry->d_name.len; + take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); spin_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || @@ -832,7 +815,7 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; - int dlen = fc_dentry->fcd_name.len; + int dlen = fc_dentry->fcd_name.name.len; u8 *dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); @@ -847,7 +830,7 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); - memcpy(dst, fc_dentry->fcd_name.name, dlen); + memcpy(dst, fc_dentry->fcd_name.name.name, dlen); return true; } @@ -1328,9 +1311,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) list_del_init(&fc_dentry->fcd_dilist); spin_unlock(&sbi->s_fc_lock); - if (fc_dentry->fcd_name.name && - fc_dentry->fcd_name.len > DNAME_INLINE_LEN) - kfree(fc_dentry->fcd_name.name); + release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); spin_lock(&sbi->s_fc_lock); } diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 2fadb2c4780c..3bd534e4dbbf 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -109,8 +109,7 @@ struct ext4_fc_dentry_update { int fcd_op; /* Type of update create / unlink / link */ int fcd_parent; /* Parent inode number */ int fcd_ino; /* Inode number */ - struct qstr fcd_name; /* Dirent name */ - unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */ + struct name_snapshot fcd_name; /* Dirent name */ struct list_head fcd_list; struct list_head fcd_dilist; }; -- cgit v1.2.3 From 4c43ab1b100f95c410a8633d5a6eb9d74afb5b8d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 23 Dec 2024 02:23:00 -0500 Subject: generic_ci_d_compare(): use shortname_storage ... and check the "name might be unstable" predicate the right way. 
Reviewed-by: Jeff Layton Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Al Viro --- fs/libfs.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 748ac5923154..3ad1b1b7fed6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1789,7 +1789,7 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, { const struct dentry *parent; const struct inode *dir; - char strbuf[DNAME_INLINE_LEN]; + union shortname_store strbuf; struct qstr qstr; /* @@ -1809,22 +1809,23 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, if (!dir || !IS_CASEFOLDED(dir)) return 1; + qstr.len = len; + qstr.name = str; /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. If this happens, the VFS will eventually retry * the lookup, so it doesn't matter what ->d_compare() returns. * However, it's unsafe to call utf8_strncasecmp() with an unstable * string. Therefore, we have to copy the name into a temporary buffer. + * As above, len is guaranteed to match str, so the shortname case + * is exactly when str points to ->d_shortname. */ - if (len <= DNAME_INLINE_LEN - 1) { - memcpy(strbuf, str, len); - strbuf[len] = 0; - str = strbuf; + if (qstr.name == dentry->d_shortname.string) { + strbuf = dentry->d_shortname; // NUL is guaranteed to be in there + qstr.name = strbuf.string; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } - qstr.len = len; - qstr.name = str; return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } -- cgit v1.2.3 From 5be1fa8abd7b049f51e6e98e75a37eef5ae2c296 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Dec 2024 00:28:51 -0500 Subject: Pass parent directory inode and expected name to ->d_revalidate() ->d_revalidate() often needs to access dentry parent and name; that has to be done carefully, since the locking environment varies from caller to caller. We are not guaranteed that dentry in question will not be moved right under us - not unless the filesystem is such that nothing on it ever gets renamed. It can be dealt with, but that results in boilerplate code that isn't even needed - the callers normally have just found the dentry via dcache lookup and want to verify that it's in the right place; they already have the values of ->d_parent and ->d_name stable. There is a couple of exceptions (overlayfs and, to less extent, ecryptfs), but for the majority of calls that song and dance is not needed at all. It's easier to make ecryptfs and overlayfs find and pass those values if there's a ->d_revalidate() instance to be called, rather than doing that in the instances. This commit only changes the calling conventions; making use of supplied values is left to followups. NOTE: some instances need more than just the parent - things like CIFS may need to build an entire path from filesystem root, so they need more precautions than the usual boilerplate. This series doesn't do anything to that need - these filesystems have to keep their locking mechanisms (rename_lock loops, use of dentry_path_raw(), private rwsem a-la v9fs). One thing to keep in mind when using name is that name->name will normally point into the pathname being resolved; the filename in question occupies name->len bytes starting at name->name, and there is NUL somewhere after it, but it the next byte might very well be '/' rather than '\0'. Do not ignore name->len. 
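For instance authors, a minimal sketch of the new calling convention (hypothetical
filesystem, "examplefs_*" names are illustrative and not part of this series); the
comparison helper is assumed to respect name->len precisely because the string is
not NUL-terminated at that offset:

  static int examplefs_d_revalidate(struct inode *dir, const struct qstr *name,
                                    struct dentry *dentry, unsigned int flags)
  {
          if (flags & LOOKUP_RCU)
                  return -ECHILD;         /* or do a lock-free check */

          /* dir and name are the expected parent/name and stay stable for
           * the duration of the call - no need to peek at ->d_parent or
           * take ->d_lock just to read them */
          if (!examplefs_still_matches(dir, name->name, name->len))
                  return 0;               /* invalidate, force a re-lookup */

          return 1;                       /* still valid */
  }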
Reviewed-by: Jeff Layton Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Al Viro --- Documentation/filesystems/locking.rst | 3 ++- Documentation/filesystems/porting.rst | 16 ++++++++++++++++ Documentation/filesystems/vfs.rst | 3 ++- fs/9p/vfs_dentry.c | 10 ++++++++-- fs/afs/dir.c | 6 ++++-- fs/ceph/dir.c | 5 +++-- fs/coda/dir.c | 3 ++- fs/crypto/fname.c | 3 ++- fs/ecryptfs/dentry.c | 18 ++++++++++++++---- fs/exfat/namei.c | 3 ++- fs/fat/namei_vfat.c | 6 ++++-- fs/fuse/dir.c | 3 ++- fs/gfs2/dentry.c | 7 +++++-- fs/hfs/sysdep.c | 3 ++- fs/jfs/namei.c | 3 ++- fs/kernfs/dir.c | 3 ++- fs/namei.c | 18 ++++++++++-------- fs/nfs/dir.c | 9 ++++++--- fs/ocfs2/dcache.c | 3 ++- fs/orangefs/dcache.c | 3 ++- fs/overlayfs/super.c | 22 ++++++++++++++++++++-- fs/proc/base.c | 6 ++++-- fs/proc/fd.c | 3 ++- fs/proc/generic.c | 6 ++++-- fs/proc/proc_sysctl.c | 3 ++- fs/smb/client/dir.c | 3 ++- fs/tracefs/inode.c | 3 ++- fs/vboxsf/dir.c | 3 ++- include/linux/dcache.h | 3 ++- include/linux/fscrypt.h | 7 ++++--- 30 files changed, 136 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index f5e3676db954..146e7d8aa736 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -17,7 +17,8 @@ dentry_operations prototypes:: - int (*d_revalidate)(struct dentry *, unsigned int); + int (*d_revalidate)(struct inode *, const struct qstr *, + struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 9ab2a3d6f2b4..568e7ea3c4ae 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1141,3 +1141,19 @@ pointer are gone. set_blocksize() takes opened struct file instead of struct block_device now and it *must* be opened exclusive. + +--- + +** mandatory** + +->d_revalidate() gets two extra arguments - inode of parent directory and +name our dentry is expected to have. Both are stable (dir is pinned in +non-RCU case and will stay around during the call in RCU case, and name +is guaranteed to stay unchanging). Your instance doesn't have to use +either, but it often helps to avoid a lot of painful boilerplate. +Note that while name->name is stable and NUL-terminated, it may (and +often will) have name->name[name->len] equal to '/' rather than '\0' - +in normal case it points into the pathname being looked up. +NOTE: if you need something like full path from the root of filesystem, +you are still on your own - this assists with simple cases, but it's not +magic. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 0b18af3f954e..7c352ebaae98 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -1251,7 +1251,8 @@ defined: .. 
code-block:: c struct dentry_operations { - int (*d_revalidate)(struct dentry *, unsigned int); + int (*d_revalidate)(struct inode *, const struct qstr *, + struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 01338d4c2d9e..872c1abe3295 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -61,7 +61,7 @@ static void v9fs_dentry_release(struct dentry *dentry) p9_fid_put(hlist_entry(p, struct p9_fid, dlist)); } -static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) { struct p9_fid *fid; struct inode *inode; @@ -99,9 +99,15 @@ out_valid: return 1; } +static int v9fs_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) +{ + return __v9fs_lookup_revalidate(dentry, flags); +} + const struct dentry_operations v9fs_cached_dentry_operations = { .d_revalidate = v9fs_lookup_revalidate, - .d_weak_revalidate = v9fs_lookup_revalidate, + .d_weak_revalidate = __v9fs_lookup_revalidate, .d_delete = v9fs_cached_dentry_delete, .d_release = v9fs_dentry_release, }; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index ada363af5aab..9780013cd83a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -22,7 +22,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); static int afs_dir_open(struct inode *inode, struct file *file); static int afs_readdir(struct file *file, struct dir_context *ctx); -static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); +static int afs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); static void afs_d_iput(struct dentry *dentry, struct inode *inode); static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, @@ -1093,7 +1094,8 @@ static int afs_d_revalidate_rcu(struct dentry *dentry) * - NOTE! the hit can be a negative hit too, so we can't assume we have an * inode */ -static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct afs_vnode *vnode, *dir; struct afs_fid fid; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0bf388e07a02..c4c71c24221b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1940,7 +1940,8 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry, /* * Check if cached dentry can be trusted. 
*/ -static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) +static int ceph_d_revalidate(struct inode *parent_dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; @@ -1948,7 +1949,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) struct dentry *parent; struct inode *dir, *inode; - valid = fscrypt_d_revalidate(dentry, flags); + valid = fscrypt_d_revalidate(parent_dir, name, dentry, flags); if (valid <= 0) return valid; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 4e552ba7bd43..a3e2dfeedfbf 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -445,7 +445,8 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) } /* called when a cache lookup succeeds */ -static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) +static int coda_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *de, unsigned int flags) { struct inode *inode; struct coda_inode_info *cii; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 0ad52fbe51c9..389f5b2bf63b 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -574,7 +574,8 @@ EXPORT_SYMBOL_GPL(fscrypt_fname_siphash); * Validate dentries in encrypted directories to make sure we aren't potentially * caching stale dentries after a key has been added. */ -int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +int fscrypt_d_revalidate(struct inode *parent_dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct dentry *dir; int err; diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index acaa0825e9bb..1dfd5b81d831 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -17,7 +17,9 @@ /** * ecryptfs_d_revalidate - revalidate an ecryptfs dentry - * @dentry: The ecryptfs dentry + * @dir: inode of expected parent + * @name: expected name + * @dentry: dentry to revalidate * @flags: lookup flags * * Called when the VFS needs to revalidate a dentry. This @@ -28,7 +30,8 @@ * Returns 1 if valid, 0 otherwise. * */ -static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int ecryptfs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); int rc = 1; @@ -36,8 +39,15 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE) - rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); + if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE) { + struct inode *lower_dir = ecryptfs_inode_to_lower(dir); + struct name_snapshot n; + + take_dentry_name_snapshot(&n, lower_dentry); + rc = lower_dentry->d_op->d_revalidate(lower_dir, &n.name, + lower_dentry, flags); + release_dentry_name_snapshot(&n); + } if (d_really_is_positive(dentry)) { struct inode *inode = d_inode(dentry); diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 97d2774760fe..e3b4feccba07 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -31,7 +31,8 @@ static inline void exfat_d_version_set(struct dentry *dentry, * If it happened, the negative dentry isn't actually negative anymore. So, * drop it. 
*/ -static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags) +static int exfat_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { int ret; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 15bf32c21ac0..f9cbd5c6f932 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -53,7 +53,8 @@ static int vfat_revalidate_shortname(struct dentry *dentry) return ret; } -static int vfat_revalidate(struct dentry *dentry, unsigned int flags) +static int vfat_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; @@ -64,7 +65,8 @@ static int vfat_revalidate(struct dentry *dentry, unsigned int flags) return vfat_revalidate_shortname(dentry); } -static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags) +static int vfat_revalidate_ci(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 494ac372ace0..d9e9f26917eb 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -192,7 +192,8 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, * the lookup once more. If the lookup results in the same inode, * then refresh the attributes, timeouts and mark the dentry valid. */ -static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) +static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *entry, unsigned int flags) { struct inode *inode; struct dentry *parent; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 2e215e8c3c88..86c338901fab 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -21,7 +21,9 @@ /** * gfs2_drevalidate - Check directory lookup consistency - * @dentry: the mapping to check + * @dir: expected parent directory inode + * @name: expexted name + * @dentry: dentry to check * @flags: lookup flags * * Check to make sure the lookup necessary to arrive at this inode from its @@ -30,7 +32,8 @@ * Returns: 1 if the dentry is ok, 0 if it isn't */ -static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) +static int gfs2_drevalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct dentry *parent; struct gfs2_sbd *sdp; diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 76fa02e3835b..ef54fc8093cf 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -13,7 +13,8 @@ /* dentry case-handling: just lowercase everything */ -static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags) +static int hfs_revalidate_dentry(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; int diff; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index d68a4e6ac345..fc8ede43afde 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1576,7 +1576,8 @@ out: return result; } -static int jfs_ci_revalidate(struct dentry *dentry, unsigned int flags) +static int jfs_ci_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { /* * This is not negative dentry. Always valid. 
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 458519e416fe..5f0f8b95f44c 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1109,7 +1109,8 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, return ERR_PTR(rc); } -static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) +static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn; struct kernfs_root *root; diff --git a/fs/namei.c b/fs/namei.c index 9d30c7aa9aa6..77e5d136faaf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -921,10 +921,11 @@ out_dput: return false; } -static inline int d_revalidate(struct dentry *dentry, unsigned int flags) +static inline int d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) - return dentry->d_op->d_revalidate(dentry, flags); + return dentry->d_op->d_revalidate(dir, name, dentry, flags); else return 1; } @@ -1652,7 +1653,7 @@ static struct dentry *lookup_dcache(const struct qstr *name, { struct dentry *dentry = d_lookup(dir, name); if (dentry) { - int error = d_revalidate(dentry, flags); + int error = d_revalidate(dir->d_inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) d_invalidate(dentry); @@ -1737,19 +1738,20 @@ static struct dentry *lookup_fast(struct nameidata *nd) if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); if (likely(status > 0)) return dentry; if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(nd->inode, &nd->last, + dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return NULL; - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) @@ -1777,7 +1779,7 @@ again: if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { - int error = d_revalidate(dentry, flags); + int error = d_revalidate(inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) { d_invalidate(dentry); @@ -3575,7 +3577,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, if (d_in_lookup(dentry)) break; - error = d_revalidate(dentry, nd->flags); + error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags); if (likely(error > 0)) break; if (error) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 492cffd9d3d8..9910d9796f4c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1814,7 +1814,8 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, return ret; } -static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int nfs_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate); } @@ -2025,7 +2026,8 @@ void nfs_d_prune_case_insensitive_aliases(struct inode *inode) EXPORT_SYMBOL_GPL(nfs_d_prune_case_insensitive_aliases); #if IS_ENABLED(CONFIG_NFS_V4) -static int nfs4_lookup_revalidate(struct dentry *, unsigned int); +static int nfs4_lookup_revalidate(struct inode *, const struct qstr *, + struct dentry *, unsigned int); const struct 
dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs4_lookup_revalidate, @@ -2260,7 +2262,8 @@ full_reval: return nfs_do_lookup_revalidate(dir, dentry, flags); } -static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int nfs4_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { return __nfs_lookup_revalidate(dentry, flags, nfs4_do_lookup_revalidate); diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index a9b8688aaf30..ecb1ce6301c4 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -32,7 +32,8 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry) } -static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int ocfs2_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; int ret = 0; /* if all else fails, just return false */ diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c index 395a00ed8ac7..c32c9a86e8d0 100644 --- a/fs/orangefs/dcache.c +++ b/fs/orangefs/dcache.c @@ -92,7 +92,8 @@ out_drop: * * Should return 1 if dentry can still be trusted, else 0. */ -static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int orangefs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { int ret; unsigned long time = (unsigned long) dentry->d_fsdata; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index fe511192f83c..86ae6f6da36b 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -91,7 +91,24 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak) if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) ret = d->d_op->d_weak_revalidate(d, flags); } else if (d->d_flags & DCACHE_OP_REVALIDATE) { - ret = d->d_op->d_revalidate(d, flags); + struct dentry *parent; + struct inode *dir; + struct name_snapshot n; + + if (flags & LOOKUP_RCU) { + parent = READ_ONCE(d->d_parent); + dir = d_inode_rcu(parent); + if (!dir) + return -ECHILD; + } else { + parent = dget_parent(d); + dir = d_inode(parent); + } + take_dentry_name_snapshot(&n, d); + ret = d->d_op->d_revalidate(dir, &n.name, d, flags); + release_dentry_name_snapshot(&n); + if (!(flags & LOOKUP_RCU)) + dput(parent); if (!ret) { if (!(flags & LOOKUP_RCU)) d_invalidate(d); @@ -127,7 +144,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, return ret; } -static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int ovl_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { return ovl_dentry_revalidate_common(dentry, flags, false); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 0edf14a9840e..fb5493d0edf0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2058,7 +2058,8 @@ void pid_update_inode(struct task_struct *task, struct inode *inode) * performed a setuid(), etc. 
* */ -static int pid_revalidate(struct dentry *dentry, unsigned int flags) +static int pid_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; @@ -2191,7 +2192,8 @@ static int dname_to_vma_addr(struct dentry *dentry, return 0; } -static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) +static int map_files_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; bool exact_vma_exists = false; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 24baf23e864f..37aa778d1af7 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -140,7 +140,8 @@ static void tid_fd_update_inode(struct task_struct *task, struct inode *inode, security_task_to_inode(task, inode); } -static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) +static int tid_fd_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct task_struct *task; struct inode *inode; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index dbe82cf23ee4..8ec90826a49e 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -216,7 +216,8 @@ void proc_free_inum(unsigned int inum) ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); } -static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags) +static int proc_misc_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; @@ -343,7 +344,8 @@ static const struct file_operations proc_dir_operations = { .iterate_shared = proc_readdir, }; -static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) +static int proc_net_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { return 0; } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 27a283d85a6e..cc9d74a06ff0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -884,7 +884,8 @@ static const struct inode_operations proc_sys_dir_operations = { .getattr = proc_sys_getattr, }; -static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) +static int proc_sys_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index 864b194dbaa0..8c5d44ee91ed 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -737,7 +737,8 @@ again: } static int -cifs_d_revalidate(struct dentry *direntry, unsigned int flags) +cifs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *direntry, unsigned int flags) { struct inode *inode; int rc; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index cfc614c638da..53214499e384 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -457,7 +457,8 @@ static void tracefs_d_release(struct dentry *dentry) eventfs_d_release(dentry); } -static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int tracefs_d_revalidate(struct inode *inode, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct eventfs_inode *ei = dentry->d_fsdata; diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index 5f1a14d5b927..a859ac9b74ba 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -192,7 +192,8 @@ const struct file_operations vboxsf_dir_fops = { * This is called during name 
resolution/lookup to check if the @dentry in * the cache is still valid. the job is handled by vboxsf_inode_revalidate. */ -static int vboxsf_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int vboxsf_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8bc567a35718..4a6bdadf2f29 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -144,7 +144,8 @@ enum d_real_type { }; struct dentry_operations { - int (*d_revalidate)(struct dentry *, unsigned int); + int (*d_revalidate)(struct inode *, const struct qstr *, + struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 772f822dc6b8..18855cb44b1c 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -192,7 +192,8 @@ struct fscrypt_operations { unsigned int *num_devs); }; -int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); +int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags); static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) @@ -711,8 +712,8 @@ static inline u64 fscrypt_fname_siphash(const struct inode *dir, return 0; } -static inline int fscrypt_d_revalidate(struct dentry *dentry, - unsigned int flags) +static inline int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { return 1; } -- cgit v1.2.3 From da192022f82f075709c60f070f72fae8a185ecff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jan 2025 00:27:27 -0500 Subject: afs_d_revalidate(): use stable name and parent inode passed by caller No need to bother with boilerplate for obtaining the latter and for the former we really should not count upon ->d_name.name remaining stable under us. Reviewed-by: Jeff Layton Acked-by: David Howells Signed-off-by: Al Viro --- fs/afs/dir.c | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 9780013cd83a..e04cffe4beb1 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -607,19 +607,19 @@ static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, * Do a lookup of a single name in a directory * - just returns the FID the dentry name maps to if found */ -static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, +static int afs_do_lookup_one(struct inode *dir, const struct qstr *name, struct afs_fid *fid, struct key *key, afs_dataversion_t *_dir_version) { struct afs_super_info *as = dir->i_sb->s_fs_info; struct afs_lookup_one_cookie cookie = { .ctx.actor = afs_lookup_one_filldir, - .name = dentry->d_name, + .name = *name, .fid.vid = as->volume->vid }; int ret; - _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); + _enter("{%lu},{%.*s},", dir->i_ino, name->len, name->name); /* search the directory */ ret = afs_dir_iterate(dir, &cookie.ctx, key, _dir_version); @@ -1052,21 +1052,12 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, /* * Check the validity of a dentry under RCU conditions. 
*/ -static int afs_d_revalidate_rcu(struct dentry *dentry) +static int afs_d_revalidate_rcu(struct afs_vnode *dvnode, struct dentry *dentry) { - struct afs_vnode *dvnode; - struct dentry *parent; - struct inode *dir; long dir_version, de_version; _enter("%p", dentry); - /* Check the parent directory is still valid first. */ - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - dvnode = AFS_FS_I(dir); if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) return -ECHILD; @@ -1097,9 +1088,8 @@ static int afs_d_revalidate_rcu(struct dentry *dentry) static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { - struct afs_vnode *vnode, *dir; + struct afs_vnode *vnode, *dir = AFS_FS_I(parent_dir); struct afs_fid fid; - struct dentry *parent; struct inode *inode; struct key *key; afs_dataversion_t dir_version, invalid_before; @@ -1107,7 +1097,7 @@ static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name, int ret; if (flags & LOOKUP_RCU) - return afs_d_revalidate_rcu(dentry); + return afs_d_revalidate_rcu(dir, dentry); if (d_really_is_positive(dentry)) { vnode = AFS_FS_I(d_inode(dentry)); @@ -1122,14 +1112,9 @@ static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name, if (IS_ERR(key)) key = NULL; - /* Hold the parent dentry so we can peer at it */ - parent = dget_parent(dentry); - dir = AFS_FS_I(d_inode(parent)); - /* validate the parent directory */ ret = afs_validate(dir, key); if (ret == -ERESTARTSYS) { - dput(parent); key_put(key); return ret; } @@ -1157,7 +1142,7 @@ static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name, afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version); + ret = afs_do_lookup_one(&dir->netfs.inode, name, &fid, key, &dir_version); switch (ret) { case 0: /* the filename maps to something */ @@ -1201,22 +1186,19 @@ static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name, goto out_valid; default: - _debug("failed to iterate dir %pd: %d", - parent, ret); + _debug("failed to iterate parent %pd2: %d", dentry, ret); goto not_found; } out_valid: dentry->d_fsdata = (void *)(unsigned long)dir_version; out_valid_noupdate: - dput(parent); key_put(key); _leave(" = 1 [valid]"); return 1; not_found: _debug("dropping dentry %pd2", dentry); - dput(parent); key_put(key); _leave(" = 0 [bad]"); -- cgit v1.2.3 From bf636ed4a9521617ea309b063d8f67522d75a203 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jan 2025 00:54:18 -0500 Subject: ceph_d_revalidate(): use stable parent inode passed by caller No need to mess with the boilerplate for obtaining what we already have. Note that ceph is one of the "will want a path from filesystem root if we want to talk to server" cases, so the name of the last component is of little use - it is passed to fscrypt_d_revalidate() and it's used to deal with (also crypt-related) case in request marshalling, when encrypted name turns out to be too long. The former is not a problem, but the latter is racy; that part will be handled in the next commit. 
Reviewed-by: Jeff Layton Reviewed-by: Viacheslav Dubeyko Signed-off-by: Al Viro --- fs/ceph/dir.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index c4c71c24221b..dc5f55bebad7 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1940,30 +1940,19 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry, /* * Check if cached dentry can be trusted. */ -static int ceph_d_revalidate(struct inode *parent_dir, const struct qstr *name, +static int ceph_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; int valid = 0; - struct dentry *parent; - struct inode *dir, *inode; + struct inode *inode; - valid = fscrypt_d_revalidate(parent_dir, name, dentry, flags); + valid = fscrypt_d_revalidate(dir, name, dentry, flags); if (valid <= 0) return valid; - if (flags & LOOKUP_RCU) { - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - inode = d_inode_rcu(dentry); - } else { - parent = dget_parent(dentry); - dir = d_inode(parent); - inode = d_inode(dentry); - } + inode = d_inode_rcu(dentry); doutc(cl, "%p '%pd' inode %p offset 0x%llx nokey %d\n", dentry, dentry, inode, ceph_dentry(dentry)->offset, @@ -2039,9 +2028,6 @@ static int ceph_d_revalidate(struct inode *parent_dir, const struct qstr *name, doutc(cl, "%p '%pd' %s\n", dentry, dentry, valid ? "valid" : "invalid"); if (!valid) ceph_dir_clear_complete(dir); - - if (!(flags & LOOKUP_RCU)) - dput(parent); return valid; } -- cgit v1.2.3 From 541795cb0be6e29f79deff50831bc6caa72b5f8e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Jan 2025 10:04:11 -0500 Subject: ceph_d_revalidate(): propagate stable name down into request encoding Currently get_fscrypt_altname() requires ->r_dentry->d_name to be stable and it gets that in almost all cases. The only exception is ->d_revalidate(), where we have a stable name, but it's passed separately - dentry->d_name is not stable there. Propagate it down to get_fscrypt_altname() as a new field of struct ceph_mds_request - ->r_dname, to be used instead ->r_dentry->d_name when non-NULL. 
Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/ceph/dir.c | 2 ++ fs/ceph/mds_client.c | 9 ++++++--- fs/ceph/mds_client.h | 2 ++ 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index dc5f55bebad7..62e99e65250d 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1998,6 +1998,8 @@ static int ceph_d_revalidate(struct inode *dir, const struct qstr *name, req->r_parent = dir; ihold(dir); + req->r_dname = name; + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; if (ceph_security_xattr_wanted(dir)) mask |= CEPH_CAP_XATTR_SHARED; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 219a2cc2bf3c..3b766b984713 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2621,6 +2621,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) { struct inode *dir = req->r_parent; struct dentry *dentry = req->r_dentry; + const struct qstr *name = req->r_dname; u8 *cryptbuf = NULL; u32 len = 0; int ret = 0; @@ -2641,8 +2642,10 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) if (!fscrypt_has_encryption_key(dir)) goto success; - if (!fscrypt_fname_encrypted_size(dir, dentry->d_name.len, NAME_MAX, - &len)) { + if (!name) + name = &dentry->d_name; + + if (!fscrypt_fname_encrypted_size(dir, name->len, NAME_MAX, &len)) { WARN_ON_ONCE(1); return ERR_PTR(-ENAMETOOLONG); } @@ -2657,7 +2660,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) if (!cryptbuf) return ERR_PTR(-ENOMEM); - ret = fscrypt_fname_encrypt(dir, &dentry->d_name, cryptbuf, len); + ret = fscrypt_fname_encrypt(dir, name, cryptbuf, len); if (ret) { kfree(cryptbuf); return ERR_PTR(ret); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 38bb7e0d2d79..7c9fee9e80d4 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -299,6 +299,8 @@ struct ceph_mds_request { struct inode *r_target_inode; /* resulting inode */ struct inode *r_new_inode; /* new inode (for creates) */ + const struct qstr *r_dname; /* stable name (for ->d_revalidate) */ + #define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ #define CEPH_MDS_R_ABORTED (2) /* call was aborted */ #define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */ -- cgit v1.2.3 From c4a9fe6319e3343ed26ac9d671d96decebc95a03 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jan 2025 00:58:15 -0500 Subject: fscrypt_d_revalidate(): use stable parent inode passed by caller The only thing it's using is parent directory inode and we are already given a stable reference to that - no need to bother with boilerplate. Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/crypto/fname.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 389f5b2bf63b..010f9c0a4c2f 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -574,12 +574,10 @@ EXPORT_SYMBOL_GPL(fscrypt_fname_siphash); * Validate dentries in encrypted directories to make sure we aren't potentially * caching stale dentries after a key has been added. 
*/ -int fscrypt_d_revalidate(struct inode *parent_dir, const struct qstr *name, +int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { - struct dentry *dir; int err; - int valid; /* * Plaintext names are always valid, since fscrypt doesn't support @@ -592,30 +590,21 @@ int fscrypt_d_revalidate(struct inode *parent_dir, const struct qstr *name, /* * No-key name; valid if the directory's key is still unavailable. * - * Although fscrypt forbids rename() on no-key names, we still must use - * dget_parent() here rather than use ->d_parent directly. That's - * because a corrupted fs image may contain directory hard links, which - * the VFS handles by moving the directory's dentry tree in the dcache - * each time ->lookup() finds the directory and it already has a dentry - * elsewhere. Thus ->d_parent can be changing, and we must safely grab - * a reference to some ->d_parent to prevent it from being freed. + * Note in RCU mode we have to bail if we get here - + * fscrypt_get_encryption_info() may block. */ if (flags & LOOKUP_RCU) return -ECHILD; - dir = dget_parent(dentry); /* * Pass allow_unsupported=true, so that files with an unsupported * encryption policy can be deleted. */ - err = fscrypt_get_encryption_info(d_inode(dir), true); - valid = !fscrypt_has_encryption_key(d_inode(dir)); - dput(dir); - + err = fscrypt_get_encryption_info(dir, true); if (err < 0) return err; - return valid; + return !fscrypt_has_encryption_key(dir); } EXPORT_SYMBOL_GPL(fscrypt_d_revalidate); -- cgit v1.2.3 From 14d02c3dcf774f325c77d948eed05ff8196f94cc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jan 2025 01:13:16 -0500 Subject: exfat_d_revalidate(): use stable parent inode passed by caller ... no need to bother with ->d_lock and ->d_parent->d_inode. Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/exfat/namei.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index e3b4feccba07..61c7164b85b3 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -34,8 +34,6 @@ static inline void exfat_d_version_set(struct dentry *dentry, static int exfat_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { - int ret; - if (flags & LOOKUP_RCU) return -ECHILD; @@ -59,11 +57,7 @@ static int exfat_d_revalidate(struct inode *dir, const struct qstr *name, if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; - spin_lock(&dentry->d_lock); - ret = inode_eq_iversion(d_inode(dentry->d_parent), - exfat_d_version(dentry)); - spin_unlock(&dentry->d_lock); - return ret; + return inode_eq_iversion(dir, exfat_d_version(dentry)); } /* returns the length of a struct qstr, ignoring trailing dots if necessary */ -- cgit v1.2.3 From 464361133e6af521d1307ed89a54c4c181162c91 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jan 2025 01:16:14 -0500 Subject: vfat_revalidate{,_ci}(): use stable parent inode passed by caller Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/fat/namei_vfat.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index f9cbd5c6f932..926c26e90ef8 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -43,14 +43,9 @@ static inline void vfat_d_version_set(struct dentry *dentry, * If it happened, the negative dentry isn't actually negative * anymore. So, drop it. 
*/ -static int vfat_revalidate_shortname(struct dentry *dentry) +static bool vfat_revalidate_shortname(struct dentry *dentry, struct inode *dir) { - int ret = 1; - spin_lock(&dentry->d_lock); - if (!inode_eq_iversion(d_inode(dentry->d_parent), vfat_d_version(dentry))) - ret = 0; - spin_unlock(&dentry->d_lock); - return ret; + return inode_eq_iversion(dir, vfat_d_version(dentry)); } static int vfat_revalidate(struct inode *dir, const struct qstr *name, @@ -62,7 +57,7 @@ static int vfat_revalidate(struct inode *dir, const struct qstr *name, /* This is not negative dentry. Always valid. */ if (d_really_is_positive(dentry)) return 1; - return vfat_revalidate_shortname(dentry); + return vfat_revalidate_shortname(dentry, dir); } static int vfat_revalidate_ci(struct inode *dir, const struct qstr *name, @@ -99,7 +94,7 @@ static int vfat_revalidate_ci(struct inode *dir, const struct qstr *name, if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; - return vfat_revalidate_shortname(dentry); + return vfat_revalidate_shortname(dentry, dir); } /* returns the length of a struct qstr, ignoring trailing dots */ -- cgit v1.2.3 From 19e1dbdc6b9e9b069bef94c6856364781338eb76 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jan 2025 01:20:35 -0500 Subject: fuse_dentry_revalidate(): use stable parent inode and name passed by caller No need to mess with dget_parent() for the former; for the latter we really should not rely upon ->d_name.name remaining stable - it's a real-life UAF. Reviewed-by: Jeff Layton Acked-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/fuse/dir.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d9e9f26917eb..3019bc1d9f9d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -175,9 +175,11 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, memset(outarg, 0, sizeof(struct fuse_entry_out)); args->opcode = FUSE_LOOKUP; args->nodeid = nodeid; - args->in_numargs = 1; - args->in_args[0].size = name->len + 1; + args->in_numargs = 2; + args->in_args[0].size = name->len; args->in_args[0].value = name->name; + args->in_args[1].size = 1; + args->in_args[1].value = ""; args->out_numargs = 1; args->out_args[0].size = sizeof(struct fuse_entry_out); args->out_args[0].value = outarg; @@ -196,7 +198,6 @@ static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, struct dentry *entry, unsigned int flags) { struct inode *inode; - struct dentry *parent; struct fuse_mount *fm; struct fuse_inode *fi; int ret; @@ -228,11 +229,9 @@ static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, attr_version = fuse_get_attr_version(fm->fc); - parent = dget_parent(entry); - fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), - &entry->d_name, &outarg); + fuse_lookup_init(fm->fc, &args, get_node_id(dir), + name, &outarg); ret = fuse_simple_request(fm, &args); - dput(parent); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; @@ -266,9 +265,7 @@ static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state)) return -ECHILD; } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { - parent = dget_parent(entry); - fuse_advise_use_readdirplus(d_inode(parent)); - dput(parent); + fuse_advise_use_readdirplus(dir); } } ret = 1; -- cgit v1.2.3 From eab2a11e5be68e42832957f6ad813db4bee4d18c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jan 2025 01:23:50 -0500 
Subject: gfs2_drevalidate(): use stable parent inode and name passed by caller No need to mess with dget_parent() for the former; for the latter we really should not rely upon ->d_name.name remaining stable. Theoretically a UAF, but it's hard to exfiltrate the information... Reviewed-by: Andreas Gruenbacher Signed-off-by: Al Viro --- fs/gfs2/dentry.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 86c338901fab..95050e719233 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -35,48 +35,40 @@ static int gfs2_drevalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { - struct dentry *parent; - struct gfs2_sbd *sdp; - struct gfs2_inode *dip; + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_inode *dip = GFS2_I(dir); struct inode *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; - int error, valid = 0; + int error, valid; int had_lock = 0; if (flags & LOOKUP_RCU) return -ECHILD; - parent = dget_parent(dentry); - sdp = GFS2_SB(d_inode(parent)); - dip = GFS2_I(d_inode(parent)); inode = d_inode(dentry); if (inode) { if (is_bad_inode(inode)) - goto out; + return 0; ip = GFS2_I(inode); } - if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) { - valid = 1; - goto out; - } + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + return 1; had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); if (!had_lock) { error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) - goto out; + return 0; } - error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip); + error = gfs2_dir_check(dir, name, ip); valid = inode ? !error : (error == -ENOENT); if (!had_lock) gfs2_glock_dq_uninit(&d_gh); -out: - dput(parent); return valid; } -- cgit v1.2.3 From 39f644a26600634aee00f179ba42cbe5437fd04f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jan 2025 01:28:29 -0500 Subject: nfs{,4}_lookup_validate(): use stable parent inode passed by caller we can't kill __nfs_lookup_revalidate() completely, but ->d_parent boilerplate in it is gone Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/nfs/dir.c | 43 +++++++++++++------------------------------ 1 file changed, 13 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 9910d9796f4c..c28983ee75ca 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1732,8 +1732,8 @@ out: * cached dentry and do a new lookup. 
*/ static int -nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, - unsigned int flags) +nfs_do_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; int error = 0; @@ -1785,39 +1785,26 @@ out_bad: } static int -__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, - int (*reval)(struct inode *, struct dentry *, unsigned int)) +__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) { - struct dentry *parent; - struct inode *dir; - int ret; - if (flags & LOOKUP_RCU) { if (dentry->d_fsdata == NFS_FSDATA_BLOCKED) return -ECHILD; - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - ret = reval(dir, dentry, flags); - if (parent != READ_ONCE(dentry->d_parent)) - return -ECHILD; } else { /* Wait for unlink to complete - see unblock_revalidate() */ wait_var_event(&dentry->d_fsdata, smp_load_acquire(&dentry->d_fsdata) != NFS_FSDATA_BLOCKED); - parent = dget_parent(dentry); - ret = reval(d_inode(parent), dentry, flags); - dput(parent); } - return ret; + return 0; } static int nfs_lookup_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { - return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate); + if (__nfs_lookup_revalidate(dentry, flags)) + return -ECHILD; + return nfs_do_lookup_revalidate(dir, name, dentry, flags); } static void block_revalidate(struct dentry *dentry) @@ -2216,11 +2203,14 @@ no_open: EXPORT_SYMBOL_GPL(nfs_atomic_open); static int -nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, - unsigned int flags) +nfs4_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; + if (__nfs_lookup_revalidate(dentry, flags)) + return -ECHILD; + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) @@ -2259,14 +2249,7 @@ reval_dentry: return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags); full_reval: - return nfs_do_lookup_revalidate(dir, dentry, flags); -} - -static int nfs4_lookup_revalidate(struct inode *dir, const struct qstr *name, - struct dentry *dentry, unsigned int flags) -{ - return __nfs_lookup_revalidate(dentry, flags, - nfs4_do_lookup_revalidate); + return nfs_do_lookup_revalidate(dir, name, dentry, flags); } #endif /* CONFIG_NFSV4 */ -- cgit v1.2.3 From ffeeaada2bddb88078f16ba24f24ce8651c22d5b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Dec 2024 01:27:11 -0500 Subject: nfs: fix ->d_revalidate() UAF on ->d_name accesses Pass the stable name all the way down to ->rpc_ops->lookup() instances. Note that passing &dentry->d_name is safe in e.g. nfs_lookup() - it *is* stable there, as it is in ->create() et.al. dget_parent() in nfs_instantiate() should be redundant - it'd better be stable there; if it's not, we have more trouble, since ->d_name would also be unsafe in such case. nfs_submount() and nfs4_submount() may or may not require fixes - if they ever get moved on server with fhandle preserved, we are in trouble there... UAF window is fairly narrow here and exfiltration requires the ability to watch the traffic. 
Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/nfs/dir.c | 14 ++++++++------ fs/nfs/namespace.c | 2 +- fs/nfs/nfs3proc.c | 5 ++--- fs/nfs/nfs4proc.c | 20 ++++++++++---------- fs/nfs/proc.c | 6 +++--- include/linux/nfs_xdr.h | 2 +- 6 files changed, 25 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c28983ee75ca..2b04038b0e40 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1672,7 +1672,7 @@ nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry, return nfs_lookup_revalidate_done(dir, dentry, inode, 1); } -static int nfs_lookup_revalidate_dentry(struct inode *dir, +static int nfs_lookup_revalidate_dentry(struct inode *dir, const struct qstr *name, struct dentry *dentry, struct inode *inode, unsigned int flags) { @@ -1690,7 +1690,7 @@ static int nfs_lookup_revalidate_dentry(struct inode *dir, goto out; dir_verifier = nfs_save_change_attribute(dir); - ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); + ret = NFS_PROTO(dir)->lookup(dir, dentry, name, fhandle, fattr); if (ret < 0) goto out; @@ -1775,7 +1775,7 @@ nfs_do_lookup_revalidate(struct inode *dir, const struct qstr *name, if (NFS_STALE(inode)) goto out_bad; - return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags); + return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); out_valid: return nfs_lookup_revalidate_done(dir, dentry, inode, 1); out_bad: @@ -1970,7 +1970,8 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in dir_verifier = nfs_save_change_attribute(dir); trace_nfs_lookup_enter(dir, dentry, flags); - error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, + fhandle, fattr); if (error == -ENOENT) { if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) dir_verifier = inode_peek_iversion_raw(dir); @@ -2246,7 +2247,7 @@ nfs4_lookup_revalidate(struct inode *dir, const struct qstr *name, reval_dentry: if (flags & LOOKUP_RCU) return -ECHILD; - return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags); + return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); full_reval: return nfs_do_lookup_revalidate(dir, name, dentry, flags); @@ -2305,7 +2306,8 @@ nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, d_drop(dentry); if (fhandle->size == 0) { - error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, + fhandle, fattr); if (error) goto out_error; } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 2d53574da605..973aed9cc5fe 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -308,7 +308,7 @@ int nfs_submount(struct fs_context *fc, struct nfs_server *server) int err; /* Look it up again to get its attributes */ - err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry, + err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry, &dentry->d_name, ctx->mntfh, ctx->clone_data.fattr); dput(parent); if (err != 0) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1566163c6d85..ce70768e0201 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -192,7 +192,7 @@ __nfs3_proc_lookup(struct inode *dir, const char *name, size_t len, } static int -nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, +nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { unsigned short task_flags = 0; @@ -202,8 +202,7 @@ 
nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, task_flags |= RPC_TASK_TIMEOUT; dprintk("NFS call lookup %pd2\n", dentry); - return __nfs3_proc_lookup(dir, dentry->d_name.name, - dentry->d_name.len, fhandle, fattr, + return __nfs3_proc_lookup(dir, name->name, name->len, fhandle, fattr, task_flags); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 405f17e6e0b4..4d85068e820d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4536,15 +4536,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, - struct dentry *dentry, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct dentry *dentry, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs_server *server = NFS_SERVER(dir); int status; struct nfs4_lookup_arg args = { .bitmask = server->attr_bitmask, .dir_fh = NFS_FH(dir), - .name = &dentry->d_name, + .name = name, }; struct nfs4_lookup_res res = { .server = server, @@ -4586,17 +4586,16 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr) } static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, - struct dentry *dentry, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct dentry *dentry, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs4_exception exception = { .interruptible = true, }; struct rpc_clnt *client = *clnt; - const struct qstr *name = &dentry->d_name; int err; do { - err = _nfs4_proc_lookup(client, dir, dentry, fhandle, fattr); + err = _nfs4_proc_lookup(client, dir, dentry, name, fhandle, fattr); trace_nfs4_lookup(dir, name, err); switch (err) { case -NFS4ERR_BADNAME: @@ -4631,13 +4630,13 @@ out: return err; } -static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry, +static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { int status; struct rpc_clnt *client = NFS_CLIENT(dir); - status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr); + status = nfs4_proc_lookup_common(&client, dir, dentry, name, fhandle, fattr); if (client != NFS_CLIENT(dir)) { rpc_shutdown_client(client); nfs_fixup_secinfo_attributes(fattr); @@ -4652,7 +4651,8 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct dentry *dentry, struct rpc_clnt *client = NFS_CLIENT(dir); int status; - status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr); + status = nfs4_proc_lookup_common(&client, dir, dentry, &dentry->d_name, + fhandle, fattr); if (status < 0) return ERR_PTR(status); return (client == NFS_CLIENT(dir)) ? 
rpc_clone_client(client) : client; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 6c09cd090c34..77920a2e3cef 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -153,13 +153,13 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs_proc_lookup(struct inode *dir, struct dentry *dentry, +nfs_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs_diropargs arg = { .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len + .name = name->name, + .len = name->len }; struct nfs_diropok res = { .fh = fhandle, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 559273a0f16d..08b62bbf59f0 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1785,7 +1785,7 @@ struct nfs_rpc_ops { struct nfs_fattr *, struct inode *); int (*setattr) (struct dentry *, struct nfs_fattr *, struct iattr *); - int (*lookup) (struct inode *, struct dentry *, + int (*lookup) (struct inode *, struct dentry *, const struct qstr *, struct nfs_fh *, struct nfs_fattr *); int (*lookupp) (struct inode *, struct nfs_fh *, struct nfs_fattr *); -- cgit v1.2.3 From 9640fe5b5e40f664da401ac8ca5d7046edbc62db Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jan 2025 01:32:38 -0500 Subject: ocfs2_dentry_revalidate(): use stable parent inode and name passed by caller theoretically, ->d_name use in there is a UAF, but only if you are messing with tracepoints... Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/ocfs2/dcache.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index ecb1ce6301c4..1873bbbb7e5b 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -45,8 +45,7 @@ static int ocfs2_dentry_revalidate(struct inode *dir, const struct qstr *name, inode = d_inode(dentry); osb = OCFS2_SB(dentry->d_sb); - trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len, - dentry->d_name.name); + trace_ocfs2_dentry_revalidate(dentry, name->len, name->name); /* For a negative dentry - * check the generation number of the parent and compare with the @@ -54,12 +53,8 @@ static int ocfs2_dentry_revalidate(struct inode *dir, const struct qstr *name, */ if (inode == NULL) { unsigned long gen = (unsigned long) dentry->d_fsdata; - unsigned long pgen; - spin_lock(&dentry->d_lock); - pgen = OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen; - spin_unlock(&dentry->d_lock); - trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len, - dentry->d_name.name, + unsigned long pgen = OCFS2_I(dir)->ip_dir_lock_gen; + trace_ocfs2_dentry_revalidate_negative(name->len, name->name, pgen, gen); if (gen != pgen) goto bail; -- cgit v1.2.3 From 90341f22c364d8ad55184ce2b8777545942dc5cf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jan 2025 01:35:18 -0500 Subject: orangefs_d_revalidate(): use stable parent inode and name passed by caller ->d_name use is a UAF if the userland side of things can be slowed down by attacker. Tested-by: Mike Marshall Signed-off-by: Al Viro --- fs/orangefs/dcache.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c index c32c9a86e8d0..a19d1ad705db 100644 --- a/fs/orangefs/dcache.c +++ b/fs/orangefs/dcache.c @@ -13,10 +13,9 @@ #include "orangefs-kernel.h" /* Returns 1 if dentry can still be trusted, else 0. 
*/ -static int orangefs_revalidate_lookup(struct dentry *dentry) +static int orangefs_revalidate_lookup(struct inode *parent_inode, const struct qstr *name, + struct dentry *dentry) { - struct dentry *parent_dentry = dget_parent(dentry); - struct inode *parent_inode = parent_dentry->d_inode; struct orangefs_inode_s *parent = ORANGEFS_I(parent_inode); struct inode *inode = dentry->d_inode; struct orangefs_kernel_op_s *new_op; @@ -26,14 +25,14 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__); new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP); - if (!new_op) { - ret = -ENOMEM; - goto out_put_parent; - } + if (!new_op) + return -ENOMEM; new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW; new_op->upcall.req.lookup.parent_refn = parent->refn; - strscpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name); + /* op_alloc() leaves ->upcall zeroed */ + memcpy(new_op->upcall.req.lookup.d_name, name->name, + min(name->len, ORANGEFS_NAME_MAX - 1)); gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d interrupt flag [%d]\n", @@ -78,8 +77,6 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) ret = 1; out_release_op: op_release(new_op); -out_put_parent: - dput(parent_dentry); return ret; out_drop: gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d revalidate failed\n", @@ -115,7 +112,7 @@ static int orangefs_d_revalidate(struct inode *dir, const struct qstr *name, * If this passes, the positive dentry still exists or the negative * dentry still does not exist. */ - if (!orangefs_revalidate_lookup(dentry)) + if (!orangefs_revalidate_lookup(dir, name, dentry)) return 0; /* We do not need to continue with negative dentries. */ -- cgit v1.2.3 From 30d61efe118cad1a73ad2ad66a3298e4abdf9f41 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 5 Jan 2025 21:33:17 -0500 Subject: 9p: fix ->rename_sem exclusion 9p wants to be able to build a path from given dentry to fs root and keep it valid over a blocking operation. ->s_vfs_rename_mutex would be a natural candidate, but there are places where we need that and where we have no way to tell if ->s_vfs_rename_mutex is already held deeper in callchain. Moreover, it's only held for cross-directory renames; name changes within the same directory happen without it. Solution: * have d_move() done in ->rename() rather than in its caller * maintain a 9p-private rwsem (per-filesystem) * hold it exclusive over the relevant part of ->rename() * hold it shared over the places where we want the path. That almost works. FS_RENAME_DOES_D_MOVE is enough to put all d_move() and d_exchange() calls under filesystem's control. However, there's also __d_unalias(), which isn't covered by any of that. If ->lookup() hits a directory inode with preexisting dentry elsewhere (due to e.g. rename done on server behind our back), d_splice_alias() called by ->lookup() will move/rename that alias. Add a couple of optional methods, so that __d_unalias() would do if alias->d_op->d_unalias_trylock != NULL if (!alias->d_op->d_unalias_trylock(alias)) fail (resulting in -ESTALE from lookup) __d_move(...) if alias->d_op->d_unalias_unlock != NULL alias->d_unalias_unlock(alias) where it currently does __d_move(). 9p instances do down_write_trylock() and up_write() of ->rename_mutex. 
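A rough sketch of the ->rename() side of that scheme, with generic names rather than the actual 9p code: with FS_RENAME_DOES_D_MOVE set, the filesystem takes its private rwsem exclusive around the server operation and the d_move(), while path-building code takes it shared.

static int foofs_rename(struct mnt_idmap *idmap,
			struct inode *old_dir, struct dentry *old_dentry,
			struct inode *new_dir, struct dentry *new_dentry,
			unsigned int flags)
{
	struct foofs_session *ses = old_dir->i_sb->s_fs_info; /* hypothetical per-sb state */
	int err;

	down_write(&ses->rename_sem);
	err = foofs_rename_on_server(old_dir, old_dentry, new_dir, new_dentry);
	if (!err)
		d_move(old_dentry, new_dentry);	/* done here: FS_RENAME_DOES_D_MOVE */
	up_write(&ses->rename_sem);
	return err;
}

/*
 * Code that builds a path from the fs root takes ses->rename_sem shared and
 * keeps it held across the blocking operation that needs the path to stay
 * valid; the two new d_unalias hooks close the remaining hole, where
 * d_splice_alias() could otherwise move a preexisting directory alias.
 */
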
Signed-off-by: Al Viro --- Documentation/filesystems/locking.rst | 4 ++++ Documentation/filesystems/vfs.rst | 21 +++++++++++++++++++++ fs/9p/v9fs.h | 2 +- fs/9p/vfs_dentry.c | 16 ++++++++++++++++ fs/dcache.c | 5 +++++ include/linux/dcache.h | 2 ++ 6 files changed, 49 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 146e7d8aa736..d20a32b77b60 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -31,6 +31,8 @@ prototypes:: struct vfsmount *(*d_automount)(struct path *path); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, enum d_real_type type); + bool (*d_unalias_trylock)(const struct dentry *); + void (*d_unalias_unlock)(const struct dentry *); locking rules: @@ -50,6 +52,8 @@ d_dname: no no no no d_automount: no no yes no d_manage: no no yes (ref-walk) maybe d_real no no yes no +d_unalias_trylock yes no no no +d_unalias_unlock yes no no no ================== =========== ======== ============== ======== inode_operations diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 7c352ebaae98..31eea688609a 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -1265,6 +1265,8 @@ defined: struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, enum d_real_type type); + bool (*d_unalias_trylock)(const struct dentry *); + void (*d_unalias_unlock)(const struct dentry *); }; ``d_revalidate`` @@ -1428,6 +1430,25 @@ defined: For non-regular files, the 'dentry' argument is returned. +``d_unalias_trylock`` + if present, will be called by d_splice_alias() before moving a + preexisting attached alias. Returning false prevents __d_move(), + making d_splice_alias() fail with -ESTALE. + + Rationale: setting FS_RENAME_DOES_D_MOVE will prevent d_move() + and d_exchange() calls from the outside of filesystem methods; + however, it does not guarantee that attached dentries won't + be renamed or moved by d_splice_alias() finding a preexisting + alias for a directory inode. Normally we would not care; + however, something that wants to stabilize the entire path to + root over a blocking operation might need that. See 9p for one + (and hopefully only) example. + +``d_unalias_unlock`` + should be paired with ``d_unalias_trylock``; that one is called after + __d_move() call in __d_unalias(). + + Each dentry has a pointer to its parent dentry, as well as a hash list of child dentries. Child dentries are basically like files in a directory. 
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 698c43dd5dc8..f28bc763847a 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -202,7 +202,7 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) return inode->i_sb->s_fs_info; } -static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry) +static inline struct v9fs_session_info *v9fs_dentry2v9ses(const struct dentry *dentry) { return dentry->d_sb->s_fs_info; } diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 872c1abe3295..5061f192eafd 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -105,14 +105,30 @@ static int v9fs_lookup_revalidate(struct inode *dir, const struct qstr *name, return __v9fs_lookup_revalidate(dentry, flags); } +static bool v9fs_dentry_unalias_trylock(const struct dentry *dentry) +{ + struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry); + return down_write_trylock(&v9ses->rename_sem); +} + +static void v9fs_dentry_unalias_unlock(const struct dentry *dentry) +{ + struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry); + up_write(&v9ses->rename_sem); +} + const struct dentry_operations v9fs_cached_dentry_operations = { .d_revalidate = v9fs_lookup_revalidate, .d_weak_revalidate = __v9fs_lookup_revalidate, .d_delete = v9fs_cached_dentry_delete, .d_release = v9fs_dentry_release, + .d_unalias_trylock = v9fs_dentry_unalias_trylock, + .d_unalias_unlock = v9fs_dentry_unalias_unlock, }; const struct dentry_operations v9fs_dentry_operations = { .d_delete = always_delete_dentry, .d_release = v9fs_dentry_release, + .d_unalias_trylock = v9fs_dentry_unalias_trylock, + .d_unalias_unlock = v9fs_dentry_unalias_unlock, }; diff --git a/fs/dcache.c b/fs/dcache.c index f8d6a2557736..903142b324e9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2967,7 +2967,12 @@ static int __d_unalias(struct dentry *dentry, struct dentry *alias) goto out_err; m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: + if (alias->d_op->d_unalias_trylock && + !alias->d_op->d_unalias_trylock(alias)) + goto out_err; __d_move(alias, dentry, false); + if (alias->d_op->d_unalias_unlock) + alias->d_op->d_unalias_unlock(alias); ret = 0; out_err: if (m2) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4a6bdadf2f29..9a1a30857763 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -159,6 +159,8 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, enum d_real_type type); + bool (*d_unalias_trylock)(const struct dentry *); + void (*d_unalias_unlock)(const struct dentry *); } ____cacheline_aligned; /* -- cgit v1.2.3 From c1feab95e0b2e9fce7e4f4b2739baf40d84543af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Jan 2025 22:51:04 -0500 Subject: add a string-to-qstr constructor Quite a few places want to build a struct qstr by given string; it would be convenient to have a primitive doing that, rather than open-coding it via QSTR_INIT(). The closest approximation was in bcachefs, but that expands to initializer list - {.len = strlen(string), .name = string}. It would be more useful to have it as compound literal - (struct qstr){.len = strlen(string), .name = string}. Unlike initializer list it's a valid expression. What's more, it's a valid lvalue - it's an equivalent of anonymous local variable with such initializer, so the things like path->dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name)); are valid. 
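(Illustrative only, not part of the patch: "s" and "parent" below are
hypothetical locals, and QSTR() itself is the macro added to
include/linux/dcache.h further down.)

	const char *s = "foo";
	struct qstr *q = &QSTR(s);			/* compound literal is an lvalue */
	struct dentry *d = d_alloc(parent, &QSTR(s));	/* so its address can be passed directly */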
It can also be used as initializer, with identical effect - struct qstr x = (struct qstr){.name = s, .len = strlen(s)}; is equivalent to struct qstr anon_variable = {.name = s, .len = strlen(s)}; struct qstr x = anon_variable; // anon_variable is never used after that point and any even remotely sane compiler will manage to collapse that into struct qstr x = {.name = s, .len = strlen(s)}; What compound literals can't be used for is initialization of global variables, but those are covered by QSTR_INIT(). This commit lifts definition(s) of QSTR() into linux/dcache.h, converts it to compound literal (all bcachefs users are fine with that) and converts assorted open-coded instances to using that. Signed-off-by: Al Viro --- fs/anon_inodes.c | 4 ++-- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/recovery.c | 2 -- fs/bcachefs/util.h | 2 -- fs/erofs/xattr.c | 2 +- fs/file_table.c | 4 +--- fs/kernfs/file.c | 2 +- include/linux/dcache.h | 1 + mm/secretmem.c | 3 +-- net/sunrpc/rpc_pipe.c | 14 +++++--------- 10 files changed, 13 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 42bd1cb7c9cd..583ac81669c2 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -60,14 +60,14 @@ static struct inode *anon_inode_make_secure_inode( const struct inode *context_inode) { struct inode *inode; - const struct qstr qname = QSTR_INIT(name, strlen(name)); int error; inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); if (IS_ERR(inode)) return inode; inode->i_flags &= ~S_PRIVATE; - error = security_inode_init_security_anon(inode, &qname, context_inode); + error = security_inode_init_security_anon(inode, &QSTR(name), + context_inode); if (error) { iput(inode); return ERR_PTR(error); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 75c8a97a6954..7b3b63ed747c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -405,7 +405,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * return ret; struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); - struct qstr name = (struct qstr) QSTR(name_buf); + struct qstr name = QSTR(name_buf); inode->bi_dir = lostfound.bi_inum; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 3c7f941dde39..ebabba296882 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -32,8 +32,6 @@ #include #include -#define QSTR(n) { { { .len = strlen(n) } }, .name = n } - void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) { if (btree >= BTREE_ID_NR_MAX) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index fb02c1c36004..a27f4b84fe77 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -647,8 +647,6 @@ static inline int cmp_le32(__le32 l, __le32 r) #include -#define QSTR(n) { { { .len = strlen(n) } }, .name = n } - static inline bool qstr_eq(const struct qstr l, const struct qstr r) { return l.len == r.len && !memcmp(l.name, r.name, l.len); diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index a90d7d649739..60d2cf26e837 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -407,7 +407,7 @@ int erofs_getxattr(struct inode *inode, int index, const char *name, } it.index = index; - it.name = (struct qstr)QSTR_INIT(name, strlen(name)); + it.name = QSTR(name); if (it.name.len > EROFS_NAME_LEN) return -ERANGE; diff --git a/fs/file_table.c b/fs/file_table.c index 976736be47cb..a329623d0b42 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -351,9 +351,7 @@ static struct file *alloc_file(const struct path *path, int flags, static inline int 
alloc_path_pseudo(const char *name, struct inode *inode, struct vfsmount *mnt, struct path *path) { - struct qstr this = QSTR_INIT(name, strlen(name)); - - path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this); + path->dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name)); if (!path->dentry) return -ENOMEM; path->mnt = mntget(mnt); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 8502ef68459b..0eb320617d7b 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -927,7 +927,7 @@ repeat: if (!inode) continue; - name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name)); + name = QSTR(kn->name); parent = kernfs_get_parent(kn); if (parent) { p_inode = ilookup(info->sb, kernfs_ino(parent)); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bff956f7b2b9..3d53a6014591 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -57,6 +57,7 @@ struct qstr { }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } +#define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n)) extern const struct qstr empty_name; extern const struct qstr slash_name; diff --git a/mm/secretmem.c b/mm/secretmem.c index 399552814fd0..1b0a214ee558 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -195,14 +195,13 @@ static struct file *secretmem_file_create(unsigned long flags) struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; - const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); int err; inode = alloc_anon_inode(secretmem_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); - err = security_inode_init_security_anon(inode, &qname, NULL); + err = security_inode_init_security_anon(inode, &QSTR(anon_name), NULL); if (err) { file = ERR_PTR(err); goto err_free_inode; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 7ce3721c06ca..eadc00410ebc 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -630,7 +630,7 @@ static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry) static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent, const char *name) { - struct qstr q = QSTR_INIT(name, strlen(name)); + struct qstr q = QSTR(name); struct dentry *dentry = d_hash_and_lookup(parent, &q); if (!dentry) { dentry = d_alloc(parent, &q); @@ -1190,8 +1190,7 @@ static const struct rpc_filelist files[] = { struct dentry *rpc_d_lookup_sb(const struct super_block *sb, const unsigned char *dir_name) { - struct qstr dir = QSTR_INIT(dir_name, strlen(dir_name)); - return d_hash_and_lookup(sb->s_root, &dir); + return d_hash_and_lookup(sb->s_root, &QSTR(dir_name)); } EXPORT_SYMBOL_GPL(rpc_d_lookup_sb); @@ -1300,11 +1299,9 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) struct dentry *gssd_dentry; struct dentry *clnt_dentry = NULL; struct dentry *pipe_dentry = NULL; - struct qstr q = QSTR_INIT(files[RPCAUTH_gssd].name, - strlen(files[RPCAUTH_gssd].name)); /* We should never get this far if "gssd" doesn't exist */ - gssd_dentry = d_hash_and_lookup(root, &q); + gssd_dentry = d_hash_and_lookup(root, &QSTR(files[RPCAUTH_gssd].name)); if (!gssd_dentry) return ERR_PTR(-ENOENT); @@ -1314,9 +1311,8 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) goto out; } - q.name = gssd_dummy_clnt_dir[0].name; - q.len = strlen(gssd_dummy_clnt_dir[0].name); - clnt_dentry = d_hash_and_lookup(gssd_dentry, &q); + clnt_dentry = d_hash_and_lookup(gssd_dentry, + &QSTR(gssd_dummy_clnt_dir[0].name)); if (!clnt_dentry) { __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); pipe_dentry = ERR_PTR(-ENOENT); -- cgit 
v1.2.3 From 1751f872cc97f992ed5c4c72c55588db1f0021e1 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 28 Jan 2025 13:48:37 +0100 Subject: treewide: const qualify ctl_tables where applicable Add the const qualifier to all the ctl_tables in the tree except for watchdog_hardlockup_sysctl, memory_allocation_profiling_sysctls, loadpin_sysctl_table and the ones calling register_net_sysctl (./net, drivers/inifiniband dirs). These are special cases as they use a registration function with a non-const qualified ctl_table argument or modify the arrays before passing them on to the registration function. Constifying ctl_table structs will prevent the modification of proc_handler function pointers as the arrays would reside in .rodata. This is made possible after commit 78eb4ea25cd5 ("sysctl: treewide: constify the ctl_table argument of proc_handlers") constified all the proc_handlers. Created this by running an spatch followed by a sed command: Spatch: virtual patch @ depends on !(file in "net") disable optional_qualifier @ identifier table_name != { watchdog_hardlockup_sysctl, iwcm_ctl_table, ucma_ctl_table, memory_allocation_profiling_sysctls, loadpin_sysctl_table }; @@ + const struct ctl_table table_name [] = { ... }; sed: sed --in-place \ -e "s/struct ctl_table .table = &uts_kern/const struct ctl_table *table = \&uts_kern/" \ kernel/utsname_sysctl.c Reviewed-by: Song Liu Acked-by: Steven Rostedt (Google) # for kernel/trace/ Reviewed-by: Martin K. Petersen # SCSI Reviewed-by: Darrick J. Wong # xfs Acked-by: Jani Nikula Acked-by: Corey Minyard Acked-by: Wei Liu Acked-by: Thomas Gleixner Reviewed-by: Bill O'Donnell Acked-by: Baoquan He Acked-by: Ashutosh Dixit Acked-by: Anna Schumaker Signed-off-by: Joel Granados --- arch/arm/kernel/isa.c | 2 +- arch/arm64/kernel/fpsimd.c | 4 ++-- arch/arm64/kernel/process.c | 2 +- arch/powerpc/kernel/idle.c | 2 +- arch/powerpc/platforms/pseries/mobility.c | 2 +- arch/riscv/kernel/process.c | 2 +- arch/riscv/kernel/vector.c | 2 +- arch/s390/appldata/appldata_base.c | 2 +- arch/s390/kernel/debug.c | 2 +- arch/s390/kernel/hiperdispatch.c | 2 +- arch/s390/kernel/topology.c | 2 +- arch/s390/mm/cmm.c | 2 +- arch/s390/mm/pgalloc.c | 2 +- arch/x86/entry/vdso/vdso32-setup.c | 2 +- arch/x86/kernel/cpu/bus_lock.c | 2 +- crypto/fips.c | 2 +- drivers/base/firmware_loader/fallback_table.c | 2 +- drivers/cdrom/cdrom.c | 2 +- drivers/char/hpet.c | 2 +- drivers/char/ipmi/ipmi_poweroff.c | 2 +- drivers/char/random.c | 2 +- drivers/gpu/drm/i915/i915_perf.c | 2 +- drivers/gpu/drm/xe/xe_observation.c | 2 +- drivers/hv/hv_common.c | 2 +- drivers/md/md.c | 2 +- drivers/misc/sgi-xp/xpc_main.c | 4 ++-- drivers/perf/arm_pmuv3.c | 2 +- drivers/perf/riscv_pmu_sbi.c | 2 +- drivers/scsi/scsi_sysctl.c | 2 +- drivers/scsi/sg.c | 2 +- drivers/tty/tty_io.c | 2 +- drivers/xen/balloon.c | 2 +- fs/aio.c | 2 +- fs/cachefiles/error_inject.c | 2 +- fs/coda/sysctl.c | 2 +- fs/coredump.c | 2 +- fs/dcache.c | 2 +- fs/devpts/inode.c | 2 +- fs/eventpoll.c | 2 +- fs/exec.c | 2 +- fs/file_table.c | 2 +- fs/fuse/sysctl.c | 2 +- fs/inode.c | 2 +- fs/lockd/svc.c | 2 +- fs/locks.c | 2 +- fs/namei.c | 2 +- fs/namespace.c | 2 +- fs/nfs/nfs4sysctl.c | 2 +- fs/nfs/sysctl.c | 2 +- fs/notify/dnotify/dnotify.c | 2 +- fs/notify/fanotify/fanotify_user.c | 2 +- fs/notify/inotify/inotify_user.c | 2 +- fs/ocfs2/stackglue.c | 2 +- fs/pipe.c | 2 +- fs/quota/dquot.c | 2 +- fs/sysctls.c | 2 +- fs/userfaultfd.c | 2 +- fs/verity/init.c | 2 +- fs/xfs/xfs_sysctl.c | 2 +- init/do_mounts_initrd.c | 2 +- io_uring/io_uring.c | 2 +- 
ipc/ipc_sysctl.c | 2 +- ipc/mq_sysctl.c | 2 +- kernel/acct.c | 2 +- kernel/bpf/syscall.c | 2 +- kernel/delayacct.c | 2 +- kernel/exit.c | 2 +- kernel/hung_task.c | 2 +- kernel/kexec_core.c | 2 +- kernel/kprobes.c | 2 +- kernel/latencytop.c | 2 +- kernel/locking/lockdep.c | 2 +- kernel/panic.c | 2 +- kernel/pid.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/pid_sysctl.h | 2 +- kernel/printk/sysctl.c | 2 +- kernel/reboot.c | 2 +- kernel/sched/autogroup.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 2 +- kernel/sched/topology.c | 2 +- kernel/seccomp.c | 2 +- kernel/signal.c | 2 +- kernel/stackleak.c | 2 +- kernel/sysctl-test.c | 6 +++--- kernel/sysctl.c | 4 ++-- kernel/time/timer.c | 2 +- kernel/trace/ftrace.c | 2 +- kernel/trace/trace_events_user.c | 2 +- kernel/umh.c | 2 +- kernel/utsname_sysctl.c | 4 ++-- kernel/watchdog.c | 2 +- lib/test_sysctl.c | 6 +++--- mm/compaction.c | 2 +- mm/hugetlb.c | 2 +- mm/hugetlb_vmemmap.c | 2 +- mm/memory-failure.c | 2 +- mm/oom_kill.c | 2 +- mm/page-writeback.c | 2 +- mm/page_alloc.c | 2 +- security/apparmor/lsm.c | 2 +- security/keys/sysctl.c | 2 +- security/yama/yama_lsm.c | 2 +- 106 files changed, 114 insertions(+), 114 deletions(-) (limited to 'fs') diff --git a/arch/arm/kernel/isa.c b/arch/arm/kernel/isa.c index 905b1b191546..db8be609fab2 100644 --- a/arch/arm/kernel/isa.c +++ b/arch/arm/kernel/isa.c @@ -16,7 +16,7 @@ static unsigned int isa_membase, isa_portbase, isa_portshift; -static struct ctl_table ctl_isa_vars[] = { +static const struct ctl_table ctl_isa_vars[] = { { .procname = "membase", .data = &isa_membase, diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 8c4c1a2186cc..2b601d88762d 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -562,7 +562,7 @@ static int vec_proc_do_default_vl(const struct ctl_table *table, int write, return 0; } -static struct ctl_table sve_default_vl_table[] = { +static const struct ctl_table sve_default_vl_table[] = { { .procname = "sve_default_vector_length", .mode = 0644, @@ -585,7 +585,7 @@ static int __init sve_sysctl_init(void) { return 0; } #endif /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */ #if defined(CONFIG_ARM64_SME) && defined(CONFIG_SYSCTL) -static struct ctl_table sme_default_vl_table[] = { +static const struct ctl_table sme_default_vl_table[] = { { .procname = "sme_default_vector_length", .mode = 0644, diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 2968a33bb3bc..42faebb7b712 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -859,7 +859,7 @@ long get_tagged_addr_ctrl(struct task_struct *task) * disable it for tasks that already opted in to the relaxed ABI. */ -static struct ctl_table tagged_addr_sysctl_table[] = { +static const struct ctl_table tagged_addr_sysctl_table[] = { { .procname = "tagged_addr_disabled", .mode = 0644, diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 30b56c67fa61..e527cd3ef128 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -97,7 +97,7 @@ void power4_idle(void) /* * Register the sysctl to set/clear powersave_nap. 
*/ -static struct ctl_table powersave_nap_ctl_table[] = { +static const struct ctl_table powersave_nap_ctl_table[] = { { .procname = "powersave-nap", .data = &powersave_nap, diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 1798f0f14d58..62bd8e2d5d4c 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -53,7 +53,7 @@ struct update_props_workarea { static unsigned int nmi_wd_lpm_factor = 200; #ifdef CONFIG_SYSCTL -static struct ctl_table nmi_wd_lpm_factor_ctl_table[] = { +static const struct ctl_table nmi_wd_lpm_factor_ctl_table[] = { { .procname = "nmi_wd_lpm_factor", .data = &nmi_wd_lpm_factor, diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 58b6482c2bf6..7891294abf49 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -364,7 +364,7 @@ static bool try_to_set_pmm(unsigned long value) * disable it for tasks that already opted in to the relaxed ABI. */ -static struct ctl_table tagged_addr_sysctl_table[] = { +static const struct ctl_table tagged_addr_sysctl_table[] = { { .procname = "tagged_addr_disabled", .mode = 0644, diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index 821818886fab..d022b028ac3f 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -287,7 +287,7 @@ long riscv_v_vstate_ctrl_set_current(unsigned long arg) #ifdef CONFIG_SYSCTL -static struct ctl_table riscv_v_default_vstate_table[] = { +static const struct ctl_table riscv_v_default_vstate_table[] = { { .procname = "riscv_v_default_allow", .data = &riscv_v_implicit_uacc, diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index 91a30e017d65..dd7ba7587dd5 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -52,7 +52,7 @@ static int appldata_interval_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table_header *appldata_sysctl_header; -static struct ctl_table appldata_table[] = { +static const struct ctl_table appldata_table[] = { { .procname = "timer", .mode = S_IRUGO | S_IWUSR, diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index ba6b7329a10e..ce038e9205f7 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -1122,7 +1122,7 @@ static int s390dbf_procactive(const struct ctl_table *table, int write, return 0; } -static struct ctl_table s390dbf_table[] = { +static const struct ctl_table s390dbf_table[] = { { .procname = "debug_stoppable", .data = &debug_stoppable, diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c index 2a99a216ab62..7857a7e8e56c 100644 --- a/arch/s390/kernel/hiperdispatch.c +++ b/arch/s390/kernel/hiperdispatch.c @@ -292,7 +292,7 @@ static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write, return 0; } -static struct ctl_table hiperdispatch_ctl_table[] = { +static const struct ctl_table hiperdispatch_ctl_table[] = { { .procname = "hiperdispatch", .mode = 0644, diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 3808f942a433..211cc8382e4a 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -667,7 +667,7 @@ static int polarization_ctl_handler(const struct ctl_table *ctl, int write, return set_polarization(polarization); } -static struct ctl_table topology_ctl_table[] = { +static const struct ctl_table topology_ctl_table[] = { { .procname = 
"topology", .mode = 0644, diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 7bf0f691827b..39f44b6256e0 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -332,7 +332,7 @@ static int cmm_timeout_handler(const struct ctl_table *ctl, int write, return 0; } -static struct ctl_table cmm_table[] = { +static const struct ctl_table cmm_table[] = { { .procname = "cmm_pages", .mode = 0644, diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index d33f55b7ee98..cd2fef79ad2c 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -21,7 +21,7 @@ int page_table_allocate_pgste = 0; EXPORT_SYMBOL(page_table_allocate_pgste); -static struct ctl_table page_table_sysctl[] = { +static const struct ctl_table page_table_sysctl[] = { { .procname = "allocate_pgste", .data = &page_table_allocate_pgste, diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 76e4e74f35b5..f6d2d8aba643 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -57,7 +57,7 @@ __setup_param("vdso=", vdso_setup, vdso32_setup, 0); /* Register vsyscall32 into the ABI table */ #include -static struct ctl_table abi_table2[] = { +static const struct ctl_table abi_table2[] = { { .procname = "vsyscall32", .data = &vdso32_enabled, diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c index 704e9241b964..6cba85c79d42 100644 --- a/arch/x86/kernel/cpu/bus_lock.c +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -49,7 +49,7 @@ static unsigned int sysctl_sld_mitigate = 1; static DEFINE_SEMAPHORE(buslock_sem, 1); #ifdef CONFIG_PROC_SYSCTL -static struct ctl_table sld_sysctls[] = { +static const struct ctl_table sld_sysctls[] = { { .procname = "split_lock_mitigate", .data = &sysctl_sld_mitigate, diff --git a/crypto/fips.c b/crypto/fips.c index a58e7750f532..2fa3a9ee61a1 100644 --- a/crypto/fips.c +++ b/crypto/fips.c @@ -41,7 +41,7 @@ __setup("fips=", fips_enable); static char fips_name[] = FIPS_MODULE_NAME; static char fips_version[] = FIPS_MODULE_VERSION; -static struct ctl_table crypto_sysctl_table[] = { +static const struct ctl_table crypto_sysctl_table[] = { { .procname = "fips_enabled", .data = &fips_enabled, diff --git a/drivers/base/firmware_loader/fallback_table.c b/drivers/base/firmware_loader/fallback_table.c index ddb70e29eb42..c8afc501a8a4 100644 --- a/drivers/base/firmware_loader/fallback_table.c +++ b/drivers/base/firmware_loader/fallback_table.c @@ -25,7 +25,7 @@ struct firmware_fallback_config fw_fallback_config = { EXPORT_SYMBOL_NS_GPL(fw_fallback_config, "FIRMWARE_LOADER_PRIVATE"); #ifdef CONFIG_SYSCTL -static struct ctl_table firmware_config_table[] = { +static const struct ctl_table firmware_config_table[] = { { .procname = "force_sysfs_fallback", .data = &fw_fallback_config.force_sysfs_fallback, diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 51745ed1bbab..b163e043c687 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3612,7 +3612,7 @@ static int cdrom_sysctl_handler(const struct ctl_table *ctl, int write, } /* Place files in /proc/sys/dev/cdrom */ -static struct ctl_table cdrom_table[] = { +static const struct ctl_table cdrom_table[] = { { .procname = "info", .data = &cdrom_sysctl_settings.info, diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 48fe96ab4649..e110857824fc 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -724,7 +724,7 @@ static int hpet_is_known(struct hpet_data *hdp) return 0; } -static struct ctl_table hpet_table[] = { +static const 
struct ctl_table hpet_table[] = { { .procname = "max-user-freq", .data = &hpet_max_freq, diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c index 05f17e3e6207..e63c316d8aaa 100644 --- a/drivers/char/ipmi/ipmi_poweroff.c +++ b/drivers/char/ipmi/ipmi_poweroff.c @@ -650,7 +650,7 @@ static struct ipmi_smi_watcher smi_watcher = { #ifdef CONFIG_PROC_FS #include -static struct ctl_table ipmi_table[] = { +static const struct ctl_table ipmi_table[] = { { .procname = "poweroff_powercycle", .data = &poweroff_powercycle, .maxlen = sizeof(poweroff_powercycle), diff --git a/drivers/char/random.c b/drivers/char/random.c index 23ee76bbb4aa..2581186fa61b 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1665,7 +1665,7 @@ static int proc_do_rointvec(const struct ctl_table *table, int write, void *buf, return write ? 0 : proc_dointvec(table, 0, buf, lenp, ppos); } -static struct ctl_table random_table[] = { +static const struct ctl_table random_table[] = { { .procname = "poolsize", .data = &sysctl_poolsize, diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 2406cda75b7b..5384d1bb4923 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -4802,7 +4802,7 @@ err_unlock: return ret; } -static struct ctl_table oa_table[] = { +static const struct ctl_table oa_table[] = { { .procname = "perf_stream_paranoid", .data = &i915_perf_stream_paranoid, diff --git a/drivers/gpu/drm/xe/xe_observation.c b/drivers/gpu/drm/xe/xe_observation.c index 8ec1b84cbb9e..57cf01efc07f 100644 --- a/drivers/gpu/drm/xe/xe_observation.c +++ b/drivers/gpu/drm/xe/xe_observation.c @@ -56,7 +56,7 @@ int xe_observation_ioctl(struct drm_device *dev, void *data, struct drm_file *fi } } -static struct ctl_table observation_ctl_table[] = { +static const struct ctl_table observation_ctl_table[] = { { .procname = "observation_paranoid", .data = &xe_observation_paranoid, diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index af5d1dc451f6..f2e6f55d6ca6 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -141,7 +141,7 @@ static int sysctl_record_panic_msg = 1; * sysctl option to allow the user to control whether kmsg data should be * reported to Hyper-V on panic. 
*/ -static struct ctl_table hv_ctl_table[] = { +static const struct ctl_table hv_ctl_table[] = { { .procname = "hyperv_record_panic_msg", .data = &sysctl_record_panic_msg, diff --git a/drivers/md/md.c b/drivers/md/md.c index 866015b681af..22f7bd3b94d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -294,7 +294,7 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) static struct ctl_table_header *raid_table_header; -static struct ctl_table raid_table[] = { +static const struct ctl_table raid_table[] = { { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c index 61b66e318488..7a3c34306de9 100644 --- a/drivers/misc/sgi-xp/xpc_main.c +++ b/drivers/misc/sgi-xp/xpc_main.c @@ -93,7 +93,7 @@ int xpc_disengage_timelimit = XPC_DISENGAGE_DEFAULT_TIMELIMIT; static int xpc_disengage_min_timelimit; /* = 0 */ static int xpc_disengage_max_timelimit = 120; -static struct ctl_table xpc_sys_xpc_hb[] = { +static const struct ctl_table xpc_sys_xpc_hb[] = { { .procname = "hb_interval", .data = &xpc_hb_interval, @@ -111,7 +111,7 @@ static struct ctl_table xpc_sys_xpc_hb[] = { .extra1 = &xpc_hb_check_min_interval, .extra2 = &xpc_hb_check_max_interval}, }; -static struct ctl_table xpc_sys_xpc[] = { +static const struct ctl_table xpc_sys_xpc[] = { { .procname = "disengage_timelimit", .data = &xpc_disengage_timelimit, diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index b5cc11abc962..0e360feb3432 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -1279,7 +1279,7 @@ static int armv8pmu_proc_user_access_handler(const struct ctl_table *table, int return 0; } -static struct ctl_table armv8_pmu_sysctl_table[] = { +static const struct ctl_table armv8_pmu_sysctl_table[] = { { .procname = "perf_user_access", .data = &sysctl_perf_user_access, diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 194c153e5d71..698de8ddf895 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -1317,7 +1317,7 @@ static int riscv_pmu_proc_user_access_handler(const struct ctl_table *table, return 0; } -static struct ctl_table sbi_pmu_sysctl_table[] = { +static const struct ctl_table sbi_pmu_sysctl_table[] = { { .procname = "perf_user_access", .data = &sysctl_perf_user_access, diff --git a/drivers/scsi/scsi_sysctl.c b/drivers/scsi/scsi_sysctl.c index 093774d77534..be4aef0f4f99 100644 --- a/drivers/scsi/scsi_sysctl.c +++ b/drivers/scsi/scsi_sysctl.c @@ -12,7 +12,7 @@ #include "scsi_priv.h" -static struct ctl_table scsi_table[] = { +static const struct ctl_table scsi_table[] = { { .procname = "logging_level", .data = &scsi_logging_level, .maxlen = sizeof(scsi_logging_level), diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 94127868bedf..effb7e768165 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1639,7 +1639,7 @@ MODULE_PARM_DESC(allow_dio, "allow direct I/O (default: 0 (disallow))"); #ifdef CONFIG_SYSCTL #include -static struct ctl_table sg_sysctls[] = { +static const struct ctl_table sg_sysctls[] = { { .procname = "sg-big-buff", .data = &sg_big_buff, diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 50c0c23ae678..449dbd216460 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -3617,7 +3617,7 @@ void console_sysfs_notify(void) sysfs_notify(&consdev->kobj, NULL, "active"); } -static struct ctl_table tty_table[] = { +static const struct ctl_table tty_table[] = { { .procname = "legacy_tiocsti", 
.data = &tty_legacy_tiocsti, diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 528395133b4f..163f7f1d70f1 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -84,7 +84,7 @@ module_param(balloon_boot_timeout, uint, 0444); #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG static int xen_hotplug_unpopulated; -static struct ctl_table balloon_table[] = { +static const struct ctl_table balloon_table[] = { { .procname = "hotplug_unpopulated", .data = &xen_hotplug_unpopulated, diff --git a/fs/aio.c b/fs/aio.c index 50671640b588..7b976b564cfc 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -224,7 +224,7 @@ static unsigned long aio_nr; /* current system wide number of aio requests */ static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ #ifdef CONFIG_SYSCTL -static struct ctl_table aio_sysctls[] = { +static const struct ctl_table aio_sysctls[] = { { .procname = "aio-nr", .data = &aio_nr, diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c index 1715d5ca2b2d..e341ade47dd8 100644 --- a/fs/cachefiles/error_inject.c +++ b/fs/cachefiles/error_inject.c @@ -11,7 +11,7 @@ unsigned int cachefiles_error_injection_state; static struct ctl_table_header *cachefiles_sysctl; -static struct ctl_table cachefiles_sysctls[] = { +static const struct ctl_table cachefiles_sysctls[] = { { .procname = "error_injection", .data = &cachefiles_error_injection_state, diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index 9f2d5743e2c8..0df46f09b6cc 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -14,7 +14,7 @@ static struct ctl_table_header *fs_table_header; -static struct ctl_table coda_table[] = { +static const struct ctl_table coda_table[] = { { .procname = "timeout", .data = &coda_timeout, diff --git a/fs/coredump.c b/fs/coredump.c index d48edb37bc35..591700e1b2ce 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -995,7 +995,7 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write, static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT; static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX; -static struct ctl_table coredump_sysctls[] = { +static const struct ctl_table coredump_sysctls[] = { { .procname = "core_uses_pid", .data = &core_uses_pid, diff --git a/fs/dcache.c b/fs/dcache.c index 1a01d7a6a7a9..1cd929f17eec 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -192,7 +192,7 @@ static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table fs_dcache_sysctls[] = { +static const struct ctl_table fs_dcache_sysctls[] = { { .procname = "dentry-state", .data = &dentry_stat, diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index b20e565b9c5e..1096ff8562fa 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -45,7 +45,7 @@ static int pty_limit_min; static int pty_limit_max = INT_MAX; static atomic_t pty_count = ATOMIC_INIT(0); -static struct ctl_table pty_table[] = { +static const struct ctl_table pty_table[] = { { .procname = "max", .maxlen = sizeof(int), diff --git a/fs/eventpoll.c b/fs/eventpoll.c index f9898e60dd8b..7c0980db77b3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -318,7 +318,7 @@ static void unlist_file(struct epitems_head *head) static long long_zero; static long long_max = LONG_MAX; -static struct ctl_table epoll_table[] = { +static const struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = 
&max_user_watches, diff --git a/fs/exec.c b/fs/exec.c index a49839174472..506cd411f4ac 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2159,7 +2159,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ return error; } -static struct ctl_table fs_exec_sysctls[] = { +static const struct ctl_table fs_exec_sysctls[] = { { .procname = "suid_dumpable", .data = &suid_dumpable, diff --git a/fs/file_table.c b/fs/file_table.c index a32171d2b83f..7f7c378c6e31 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -106,7 +106,7 @@ static int proc_nr_files(const struct ctl_table *table, int write, void *buffer, return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table fs_stat_sysctls[] = { +static const struct ctl_table fs_stat_sysctls[] = { { .procname = "file-nr", .data = &files_stat, diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c index b272bb333005..63fb1e5bee30 100644 --- a/fs/fuse/sysctl.c +++ b/fs/fuse/sysctl.c @@ -13,7 +13,7 @@ static struct ctl_table_header *fuse_table_header; /* Bound by fuse_init_out max_pages, which is a u16 */ static unsigned int sysctl_fuse_max_pages_limit = 65535; -static struct ctl_table fuse_sysctl_table[] = { +static const struct ctl_table fuse_sysctl_table[] = { { .procname = "max_pages_limit", .data = &fuse_max_pages_limit, diff --git a/fs/inode.c b/fs/inode.c index 6b4c77268fc0..5587aabdaa5e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -184,7 +184,7 @@ static int proc_nr_inodes(const struct ctl_table *table, int write, void *buffer return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table inodes_sysctls[] = { +static const struct ctl_table inodes_sysctls[] = { { .procname = "inode-nr", .data = &inodes_stat, diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 7ded57ec3a60..2c8eedc6c2cc 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -412,7 +412,7 @@ EXPORT_SYMBOL_GPL(lockd_down); * Sysctl parameters (same as module parameters, different interface). 
*/ -static struct ctl_table nlm_sysctls[] = { +static const struct ctl_table nlm_sysctls[] = { { .procname = "nlm_grace_period", .data = &nlm_grace_period, diff --git a/fs/locks.c b/fs/locks.c index 25afc8d9c9d1..1619cddfa7a4 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -97,7 +97,7 @@ static int leases_enable = 1; static int lease_break_time = 45; #ifdef CONFIG_SYSCTL -static struct ctl_table locks_sysctls[] = { +static const struct ctl_table locks_sysctls[] = { { .procname = "leases-enable", .data = &leases_enable, diff --git a/fs/namei.c b/fs/namei.c index e56c29a22d26..8c82afddd2ad 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1099,7 +1099,7 @@ static int sysctl_protected_fifos __read_mostly; static int sysctl_protected_regular __read_mostly; #ifdef CONFIG_SYSCTL -static struct ctl_table namei_sysctls[] = { +static const struct ctl_table namei_sysctls[] = { { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, diff --git a/fs/namespace.c b/fs/namespace.c index 4013fbac354a..a3ed3f2980cb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5985,7 +5985,7 @@ const struct proc_ns_operations mntns_operations = { }; #ifdef CONFIG_SYSCTL -static struct ctl_table fs_namespace_sysctls[] = { +static const struct ctl_table fs_namespace_sysctls[] = { { .procname = "mount-max", .data = &sysctl_mount_max, diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 886a7c4c60b3..d1a92d8f8ba4 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -17,7 +17,7 @@ static const int nfs_set_port_min; static const int nfs_set_port_max = 65535; static struct ctl_table_header *nfs4_callback_sysctl_table; -static struct ctl_table nfs4_cb_sysctls[] = { +static const struct ctl_table nfs4_cb_sysctls[] = { { .procname = "nfs_callback_tcpport", .data = &nfs_callback_set_tcpport, diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index e645be1a3381..f579df0e8d67 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -14,7 +14,7 @@ static struct ctl_table_header *nfs_callback_sysctl_table; -static struct ctl_table nfs_cb_sysctls[] = { +static const struct ctl_table nfs_cb_sysctls[] = { { .procname = "nfs_mountpoint_timeout", .data = &nfs_mountpoint_expiry_timeout, diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 6004dfdfdf0f..c4cdaf5fa7ed 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -20,7 +20,7 @@ static int dir_notify_enable __read_mostly = 1; #ifdef CONFIG_SYSCTL -static struct ctl_table dnotify_sysctls[] = { +static const struct ctl_table dnotify_sysctls[] = { { .procname = "dir-notify-enable", .data = &dir_notify_enable, diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 6ff94e312232..ba3e2d09eb44 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -58,7 +58,7 @@ static int fanotify_max_queued_events __read_mostly; static long ft_zero = 0; static long ft_int_max = INT_MAX; -static struct ctl_table fanotify_table[] = { +static const struct ctl_table fanotify_table[] = { { .procname = "max_user_groups", .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index e0c48956608a..b372fb2c56bd 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -58,7 +58,7 @@ struct kmem_cache *inotify_inode_mark_cachep __ro_after_init; static long it_zero = 0; static long it_int_max = INT_MAX; -static struct ctl_table inotify_table[] = { +static const struct 
ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 20aa37b67cfb..ddd761cf44c8 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -650,7 +650,7 @@ error: * and easier to preserve the name. */ -static struct ctl_table ocfs2_nm_table[] = { +static const struct ctl_table ocfs2_nm_table[] = { { .procname = "hb_ctl_path", .data = ocfs2_hb_ctl_path, diff --git a/fs/pipe.c b/fs/pipe.c index 82fede0f2111..94b59045ab44 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1478,7 +1478,7 @@ static int proc_dopipe_max_size(const struct ctl_table *table, int write, do_proc_dopipe_max_size_conv, NULL); } -static struct ctl_table fs_pipe_sysctls[] = { +static const struct ctl_table fs_pipe_sysctls[] = { { .procname = "pipe-max-size", .data = &pipe_max_size, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f9578918cfb2..825c5c2e0962 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2926,7 +2926,7 @@ static int do_proc_dqstats(const struct ctl_table *table, int write, return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table fs_dqstats_table[] = { +static const struct ctl_table fs_dqstats_table[] = { { .procname = "lookups", .data = &dqstats.stat[DQST_LOOKUPS], diff --git a/fs/sysctls.c b/fs/sysctls.c index 8dbde9a802fa..ad429dffeb4b 100644 --- a/fs/sysctls.c +++ b/fs/sysctls.c @@ -7,7 +7,7 @@ #include #include -static struct ctl_table fs_shared_sysctls[] = { +static const struct ctl_table fs_shared_sysctls[] = { { .procname = "overflowuid", .data = &fs_overflowuid, diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7c0bd0b55f88..97c4d71115d8 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -36,7 +36,7 @@ static int sysctl_unprivileged_userfaultfd __read_mostly; #ifdef CONFIG_SYSCTL -static struct ctl_table vm_userfaultfd_table[] = { +static const struct ctl_table vm_userfaultfd_table[] = { { .procname = "unprivileged_userfaultfd", .data = &sysctl_unprivileged_userfaultfd, diff --git a/fs/verity/init.c b/fs/verity/init.c index f440f0e61e3e..6e8d33b50240 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -10,7 +10,7 @@ #include #ifdef CONFIG_SYSCTL -static struct ctl_table fsverity_sysctl_table[] = { +static const struct ctl_table fsverity_sysctl_table[] = { #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES { .procname = "require_signatures", diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index c84df23b494d..751dc74a3067 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -66,7 +66,7 @@ xfs_deprecated_dointvec_minmax( return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); } -static struct ctl_table xfs_table[] = { +static const struct ctl_table xfs_table[] = { { .procname = "irix_sgid_inherit", .data = &xfs_params.sgid_inherit.val, diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index f86ef92a6c46..f6867bad0d78 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -21,7 +21,7 @@ phys_addr_t phys_initrd_start __initdata; unsigned long phys_initrd_size __initdata; #ifdef CONFIG_SYSCTL -static struct ctl_table kern_do_mounts_initrd_table[] = { +static const struct ctl_table kern_do_mounts_initrd_table[] = { { .procname = "real-root-dev", .data = &real_root_dev, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7bfbc7c22367..5a0f8a5041d6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -157,7 +157,7 @@ static int __read_mostly 
sysctl_io_uring_disabled; static int __read_mostly sysctl_io_uring_group = -1; #ifdef CONFIG_SYSCTL -static struct ctl_table kernel_io_uring_disabled_table[] = { +static const struct ctl_table kernel_io_uring_disabled_table[] = { { .procname = "io_uring_disabled", .data = &sysctl_io_uring_disabled, diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 54318e0b4557..15b17e86e198 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -73,7 +73,7 @@ int ipc_mni = IPCMNI; int ipc_mni_shift = IPCMNI_SHIFT; int ipc_min_cycle = RADIX_TREE_MAP_SIZE; -static struct ctl_table ipc_sysctls[] = { +static const struct ctl_table ipc_sysctls[] = { { .procname = "shmmax", .data = &init_ipc_ns.shm_ctlmax, diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index b70dc2ff22d8..0dd12e1c9f53 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -20,7 +20,7 @@ static int msg_max_limit_max = HARD_MSGMAX; static int msg_maxsize_limit_min = MIN_MSGSIZEMAX; static int msg_maxsize_limit_max = HARD_MSGSIZEMAX; -static struct ctl_table mq_sysctls[] = { +static const struct ctl_table mq_sysctls[] = { { .procname = "queues_max", .data = &init_ipc_ns.mq_queues_max, diff --git a/kernel/acct.c b/kernel/acct.c index 179848ad33e9..31222e8cd534 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -76,7 +76,7 @@ static int acct_parm[3] = {4, 2, 30}; #define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */ #ifdef CONFIG_SYSCTL -static struct ctl_table kern_acct_table[] = { +static const struct ctl_table kern_acct_table[] = { { .procname = "acct", .data = &acct_parm, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0daf098e3207..c420edbfb7c8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6128,7 +6128,7 @@ static int bpf_unpriv_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table bpf_syscall_table[] = { +static const struct ctl_table bpf_syscall_table[] = { { .procname = "unprivileged_bpf_disabled", .data = &sysctl_unprivileged_bpf_disabled, diff --git a/kernel/delayacct.c b/kernel/delayacct.c index b238eb8c6573..eb63a021ac04 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -64,7 +64,7 @@ static int sysctl_delayacct(const struct ctl_table *table, int write, void *buff return err; } -static struct ctl_table kern_delayacct_table[] = { +static const struct ctl_table kern_delayacct_table[] = { { .procname = "task_delayacct", .data = NULL, diff --git a/kernel/exit.c b/kernel/exit.c index 1dcddfe537ee..3485e5fc499e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -85,7 +85,7 @@ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL -static struct ctl_table kern_exit_table[] = { +static const struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 953169893a95..04efa7a6e69b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -274,7 +274,7 @@ static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int writ * and hung_task_check_interval_secs */ static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); -static struct ctl_table hung_task_sysctls[] = { +static const struct ctl_table hung_task_sysctls[] = { #ifdef CONFIG_SMP { .procname = "hung_task_all_cpu_backtrace", diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index b424a5c6ae87..c0bdc1686154 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -925,7 +925,7 @@ static int kexec_limit_handler(const struct ctl_table *table, int write, return 
proc_dointvec(&tmp, write, buffer, lenp, ppos); } -static struct ctl_table kexec_core_sysctls[] = { +static const struct ctl_table kexec_core_sysctls[] = { { .procname = "kexec_load_disabled", .data = &kexec_load_disabled, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 030569210670..88aeac84e4c0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -946,7 +946,7 @@ static int proc_kprobes_optimization_handler(const struct ctl_table *table, return ret; } -static struct ctl_table kprobe_sysctls[] = { +static const struct ctl_table kprobe_sysctls[] = { { .procname = "kprobes-optimization", .data = &sysctl_kprobes_optimization, diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 77ee3ea8a573..d4281d1e13a6 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -77,7 +77,7 @@ static int sysctl_latencytop(const struct ctl_table *table, int write, void *buf return err; } -static struct ctl_table latencytop_sysctl[] = { +static const struct ctl_table latencytop_sysctl[] = { { .procname = "latencytop", .data = &latencytop_enabled, diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29acd238dad7..4470680f0226 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -79,7 +79,7 @@ module_param(lock_stat, int, 0644); #endif #ifdef CONFIG_SYSCTL -static struct ctl_table kern_lockdep_table[] = { +static const struct ctl_table kern_lockdep_table[] = { #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", diff --git a/kernel/panic.c b/kernel/panic.c index fbc59b3b64d0..d8635d5cecb2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -84,7 +84,7 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); #ifdef CONFIG_SYSCTL -static struct ctl_table kern_panic_table[] = { +static const struct ctl_table kern_panic_table[] = { #ifdef CONFIG_SMP { .procname = "oops_all_cpu_backtrace", diff --git a/kernel/pid.c b/kernel/pid.c index 3a10a7b6fcf8..924084713be8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -695,7 +695,7 @@ static struct ctl_table_root pid_table_root = { .set_ownership = pid_table_root_set_ownership, }; -static struct ctl_table pid_table[] = { +static const struct ctl_table pid_table[] = { { .procname = "pid_max", .data = &init_pid_ns.pid_max, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index f1ffa032fc32..8f6cfec87555 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -303,7 +303,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table pid_ns_ctl_table[] = { +static const struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h index 18ecaef6be41..5d8f981de7c5 100644 --- a/kernel/pid_sysctl.h +++ b/kernel/pid_sysctl.h @@ -31,7 +31,7 @@ static int pid_mfd_noexec_dointvec_minmax(const struct ctl_table *table, return err; } -static struct ctl_table pid_ns_ctl_table_vm[] = { +static const struct ctl_table pid_ns_ctl_table_vm[] = { { .procname = "memfd_noexec", .data = &init_pid_ns.memfd_noexec_scope, diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c index f5072dc85f7a..da77f3f5c1fe 100644 --- a/kernel/printk/sysctl.c +++ b/kernel/printk/sysctl.c @@ -20,7 +20,7 @@ static int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int writ return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table printk_sysctls[] = { +static const struct ctl_table printk_sysctls[] = { { .procname = 
"printk", .data = &console_loglevel, diff --git a/kernel/reboot.c b/kernel/reboot.c index a701000bab34..b5a8569e5d81 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -1287,7 +1287,7 @@ static struct attribute *reboot_attrs[] = { }; #ifdef CONFIG_SYSCTL -static struct ctl_table kern_reboot_table[] = { +static const struct ctl_table kern_reboot_table[] = { { .procname = "poweroff_cmd", .data = &poweroff_cmd, diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index db68a964e34e..83d46b9b8ec8 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -9,7 +9,7 @@ static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; #ifdef CONFIG_SYSCTL -static struct ctl_table sched_autogroup_sysctls[] = { +static const struct ctl_table sched_autogroup_sysctls[] = { { .procname = "sched_autogroup_enabled", .data = &sysctl_sched_autogroup_enabled, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9142a0394d46..165c90ba64ea 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4654,7 +4654,7 @@ static int sysctl_schedstats(const struct ctl_table *table, int write, void *buf #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SYSCTL -static struct ctl_table sched_core_sysctls[] = { +static const struct ctl_table sched_core_sysctls[] = { #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 62192ac79c30..38e4537790af 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -26,7 +26,7 @@ static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ #ifdef CONFIG_SYSCTL -static struct ctl_table sched_dl_sysctls[] = { +static const struct ctl_table sched_dl_sysctls[] = { { .procname = "sched_deadline_period_max_us", .data = &sysctl_sched_dl_period_max, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1e78caa21436..ce2e94ccad0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -133,7 +133,7 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #endif #ifdef CONFIG_SYSCTL -static struct ctl_table sched_fair_sysctls[] = { +static const struct ctl_table sched_fair_sysctls[] = { #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index bd66a46b06ac..4b8e33c615b1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -26,7 +26,7 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff size_t *lenp, loff_t *ppos); static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -static struct ctl_table sched_rt_sysctls[] = { +static const struct ctl_table sched_rt_sysctls[] = { { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index da33ec9e94ab..c49aea8c1025 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -312,7 +312,7 @@ static int sched_energy_aware_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table sched_energy_aware_sysctls[] = { +static const struct ctl_table sched_energy_aware_sysctls[] = { { .procname = "sched_energy_aware", .data = &sysctl_sched_energy_aware, diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 385d48293a5f..f59381c4a2ff 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -2450,7 +2450,7 @@ static int 
seccomp_actions_logged_handler(const struct ctl_table *ro_table, int return ret; } -static struct ctl_table seccomp_sysctl_table[] = { +static const struct ctl_table seccomp_sysctl_table[] = { { .procname = "actions_avail", .data = (void *) &seccomp_actions_avail, diff --git a/kernel/signal.c b/kernel/signal.c index a2afd54303f0..875e97f6205a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4950,7 +4950,7 @@ static inline void siginfo_buildtime_checks(void) } #if defined(CONFIG_SYSCTL) -static struct ctl_table signal_debug_table[] = { +static const struct ctl_table signal_debug_table[] = { #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", diff --git a/kernel/stackleak.c b/kernel/stackleak.c index 0f4804f28c61..bb65321761b4 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -45,7 +45,7 @@ static int stack_erasing_sysctl(const struct ctl_table *table, int write, str_enabled_disabled(state)); return ret; } -static struct ctl_table stackleak_sysctls[] = { +static const struct ctl_table stackleak_sysctls[] = { { .procname = "stack_erasing", .data = NULL, diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c index 3ac98bb7fb82..eb2842bd0557 100644 --- a/kernel/sysctl-test.c +++ b/kernel/sysctl-test.c @@ -374,7 +374,7 @@ static void sysctl_test_register_sysctl_sz_invalid_extra_value( struct kunit *test) { unsigned char data = 0; - struct ctl_table table_foo[] = { + const struct ctl_table table_foo[] = { { .procname = "foo", .data = &data, @@ -386,7 +386,7 @@ static void sysctl_test_register_sysctl_sz_invalid_extra_value( }, }; - struct ctl_table table_bar[] = { + const struct ctl_table table_bar[] = { { .procname = "bar", .data = &data, @@ -398,7 +398,7 @@ static void sysctl_test_register_sysctl_sz_invalid_extra_value( }, }; - struct ctl_table table_qux[] = { + const struct ctl_table table_qux[] = { { .procname = "qux", .data = &data, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7ae7a4136855..cb57da499ebb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1609,7 +1609,7 @@ int proc_do_static_key(const struct ctl_table *table, int write, return ret; } -static struct ctl_table kern_table[] = { +static const struct ctl_table kern_table[] = { { .procname = "panic", .data = &panic_timeout, @@ -2021,7 +2021,7 @@ static struct ctl_table kern_table[] = { #endif }; -static struct ctl_table vm_table[] = { +static const struct ctl_table vm_table[] = { { .procname = "overcommit_memory", .data = &sysctl_overcommit_memory, diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 40706cb36920..c8f776dc6ee0 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -301,7 +301,7 @@ static int timer_migration_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table timer_sysctl[] = { +static const struct ctl_table timer_sysctl[] = { { .procname = "timer_migration", .data = &sysctl_timer_migration, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f79eb9386c7f..728ecda6e8d4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -8780,7 +8780,7 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write, return 0; } -static struct ctl_table ftrace_sysctls[] = { +static const struct ctl_table ftrace_sysctls[] = { { .procname = "ftrace_enabled", .data = &ftrace_enabled, diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 17bcad8f79de..97325fbd6283 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -2899,7 +2899,7 @@ static int 
set_max_user_events_sysctl(const struct ctl_table *table, int write, return ret; } -static struct ctl_table user_event_sysctls[] = { +static const struct ctl_table user_event_sysctls[] = { { .procname = "user_events_max", .data = &max_user_events, diff --git a/kernel/umh.c b/kernel/umh.c index be9234270777..b4da45a3a7cf 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -544,7 +544,7 @@ static int proc_cap_handler(const struct ctl_table *table, int write, return 0; } -static struct ctl_table usermodehelper_table[] = { +static const struct ctl_table usermodehelper_table[] = { { .procname = "bset", .data = &usermodehelper_bset, diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 7282f61a8650..bfbaaecb1dd4 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -75,7 +75,7 @@ static DEFINE_CTL_TABLE_POLL(hostname_poll); static DEFINE_CTL_TABLE_POLL(domainname_poll); // Note: update 'enum uts_proc' to match any changes to this table -static struct ctl_table uts_kern_table[] = { +static const struct ctl_table uts_kern_table[] = { { .procname = "arch", .data = init_uts_ns.name.machine, @@ -129,7 +129,7 @@ static struct ctl_table uts_kern_table[] = { */ void uts_proc_notify(enum uts_proc proc) { - struct ctl_table *table = &uts_kern_table[proc]; + const struct ctl_table *table = &uts_kern_table[proc]; proc_sys_poll_notify(table->poll); } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 177abb7d0d4e..b2da7de39d06 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -1094,7 +1094,7 @@ static int proc_watchdog_cpumask(const struct ctl_table *table, int write, static const int sixty = 60; -static struct ctl_table watchdog_sysctls[] = { +static const struct ctl_table watchdog_sysctls[] = { { .procname = "watchdog", .data = &watchdog_user_enabled, diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index b6696fa1d426..4249e0cc8aaf 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -71,7 +71,7 @@ static struct test_sysctl_data test_data = { }; /* These are all under /proc/sys/debug/test_sysctl/ */ -static struct ctl_table test_table[] = { +static const struct ctl_table test_table[] = { { .procname = "int_0001", .data = &test_data.int_0001, @@ -177,7 +177,7 @@ static int test_sysctl_setup_node_tests(void) } /* Used to test that unregister actually removes the directory */ -static struct ctl_table test_table_unregister[] = { +static const struct ctl_table test_table_unregister[] = { { .procname = "unregister_error", .data = &test_data.int_0001, @@ -220,7 +220,7 @@ static int test_sysctl_run_register_mount_point(void) return 0; } -static struct ctl_table test_table_empty[] = { }; +static const struct ctl_table test_table_empty[] = { }; static int test_sysctl_run_register_empty(void) { diff --git a/mm/compaction.c b/mm/compaction.c index 73e80b2fb22e..bcc0df0066dc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3272,7 +3272,7 @@ static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table, return ret; } -static struct ctl_table vm_compaction[] = { +static const struct ctl_table vm_compaction[] = { { .procname = "compact_memory", .data = &sysctl_compact_memory, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 87761b042ed0..3b25b69aa94f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4867,7 +4867,7 @@ out: return ret; } -static struct ctl_table hugetlb_table[] = { +static const struct ctl_table hugetlb_table[] = { { .procname = "nr_hugepages", .data = NULL, diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 
57b7f591eee8..7735972add01 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -693,7 +693,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l free_vmemmap_page_list(&vmemmap_pages); } -static struct ctl_table hugetlb_vmemmap_sysctls[] = { +static const struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", .data = &vmemmap_optimize_enabled, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a7b8ccd29b6f..995a15eb67e2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -124,7 +124,7 @@ const struct attribute_group memory_failure_attr_group = { .attrs = memory_failure_attr, }; -static struct ctl_table memory_failure_table[] = { +static const struct ctl_table memory_failure_table[] = { { .procname = "memory_failure_early_kill", .data = &sysctl_memory_failure_early_kill, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 044ebab2c941..1cf121ad7085 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -705,7 +705,7 @@ static void queue_oom_reaper(struct task_struct *tsk) } #ifdef CONFIG_SYSCTL -static struct ctl_table vm_oom_kill_table[] = { +static const struct ctl_table vm_oom_kill_table[] = { { .procname = "panic_on_oom", .data = &sysctl_panic_on_oom, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4f5970723cf2..eb55ece39c56 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2298,7 +2298,7 @@ static int page_writeback_cpu_online(unsigned int cpu) /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; -static struct ctl_table vm_page_writeback_sysctls[] = { +static const struct ctl_table vm_page_writeback_sysctls[] = { { .procname = "dirty_background_ratio", .data = &dirty_background_ratio, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e469c7ef9a4..579789600a3c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6166,7 +6166,7 @@ out: return ret; } -static struct ctl_table page_alloc_sysctl_table[] = { +static const struct ctl_table page_alloc_sysctl_table[] = { { .procname = "min_free_kbytes", .data = &min_free_kbytes, diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 1edc12862a7d..9b6c2f157f83 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -2038,7 +2038,7 @@ static int apparmor_dointvec(const struct ctl_table *table, int write, return proc_dointvec(table, write, buffer, lenp, ppos); } -static struct ctl_table apparmor_sysctl_table[] = { +static const struct ctl_table apparmor_sysctl_table[] = { #ifdef CONFIG_USER_NS { .procname = "unprivileged_userns_apparmor_policy", diff --git a/security/keys/sysctl.c b/security/keys/sysctl.c index 91f000eef3ad..cde08c478f32 100644 --- a/security/keys/sysctl.c +++ b/security/keys/sysctl.c @@ -9,7 +9,7 @@ #include #include "internal.h" -static struct ctl_table key_sysctls[] = { +static const struct ctl_table key_sysctls[] = { { .procname = "maxkeys", .data = &key_quota_maxkeys, diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 1a2d02fee09b..1971710620c1 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -452,7 +452,7 @@ static int yama_dointvec_minmax(const struct ctl_table *table, int write, static int max_scope = YAMA_SCOPE_NO_ATTACH; -static struct ctl_table yama_sysctl_table[] = { +static const struct ctl_table yama_sysctl_table[] = { { .procname = "ptrace_scope", .data = &ptrace_scope, -- cgit v1.2.3 From 438e2116d7bd3095184d1997b367380c4f465164 Mon Sep 17 00:00:00 
2001 From: Pali Rohár Date: Mon, 14 Oct 2024 13:00:48 +0200 Subject: cifs: Change translation of STATUS_PRIVILEGE_NOT_HELD to -EPERM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit STATUS_PRIVILEGE_NOT_HELD indicates that user does not have privilege to issue some operation, for example to create symlink. Currently STATUS_PRIVILEGE_NOT_HELD is translated to -EIO. Change it to -EPERM which better describe this error code. Note that there is no ERR* code usable in ntstatus_to_dos_map[] table which can be used to -EPERM translation, so do explicit translation in map_smb_to_linux_error() function. Signed-off-by: Pali Rohár Acked-by: Tom Talpey Acked-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/netmisc.c | 3 ++- fs/smb/client/smb2maperror.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/netmisc.c b/fs/smb/client/netmisc.c index 9dc16211e7a1..9ec20601cee2 100644 --- a/fs/smb/client/netmisc.c +++ b/fs/smb/client/netmisc.c @@ -313,7 +313,6 @@ static const struct { ERRDOS, 2215, NT_STATUS_NO_LOGON_SERVERS}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_LOGON_SESSION}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PRIVILEGE}, { - ERRDOS, ERRnoaccess, NT_STATUS_PRIVILEGE_NOT_HELD}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACCOUNT_NAME}, { ERRHRD, ERRgeneral, NT_STATUS_USER_EXISTS}, /* { This NT error code was 'sqashed' @@ -876,6 +875,8 @@ map_smb_to_linux_error(char *buf, bool logErr) __u32 err = le32_to_cpu(smb->Status.CifsError); if (err == (NT_STATUS_NOT_A_REPARSE_POINT)) rc = -ENODATA; + else if (err == (NT_STATUS_PRIVILEGE_NOT_HELD)) + rc = -EPERM; } cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n", diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c index 612e7b5181b6..12c2b868789f 100644 --- a/fs/smb/client/smb2maperror.c +++ b/fs/smb/client/smb2maperror.c @@ -380,7 +380,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_NO_LOGON_SERVERS, -EIO, "STATUS_NO_LOGON_SERVERS"}, {STATUS_NO_SUCH_LOGON_SESSION, -EIO, "STATUS_NO_SUCH_LOGON_SESSION"}, {STATUS_NO_SUCH_PRIVILEGE, -EIO, "STATUS_NO_SUCH_PRIVILEGE"}, - {STATUS_PRIVILEGE_NOT_HELD, -EIO, "STATUS_PRIVILEGE_NOT_HELD"}, + {STATUS_PRIVILEGE_NOT_HELD, -EPERM, "STATUS_PRIVILEGE_NOT_HELD"}, {STATUS_INVALID_ACCOUNT_NAME, -EIO, "STATUS_INVALID_ACCOUNT_NAME"}, {STATUS_USER_EXISTS, -EIO, "STATUS_USER_EXISTS"}, {STATUS_NO_SUCH_USER, -EIO, "STATUS_NO_SUCH_USER"}, -- cgit v1.2.3 From ef201e8759d20bf82b5943101147072de12bc524 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Thu, 26 Dec 2024 15:20:39 +0100 Subject: cifs: Validate EAs for WSL reparse points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major and minor numbers for char and block devices are mandatory for stat. So check that the WSL EA $LXDEV is present for WSL CHR and BLK reparse points. WSL reparse point tag determinate type of the file. But file type is present also in the WSL EA $LXMOD. So check that both file types are same. 
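For illustration, a minimal userspace sketch of the user-visible effect (the mount point and file name are placeholders): the device numbers carried in the mandatory $LXDEV EA are what end up in st_rdev when such a reparse point is stat'ed from the client, which is why a WSL CHR or BLK reparse point without $LXDEV cannot be represented correctly.

  #include <stdio.h>
  #include <sys/stat.h>
  #include <sys/sysmacros.h>

  int main(void)
  {
          struct stat st;

          /* placeholder path: a WSL-style char/block device on a mounted share */
          if (stat("/mnt/share/wsl_chr_dev", &st) != 0) {
                  perror("stat");
                  return 1;
          }
          printf("mode=%o rdev=%u:%u\n", st.st_mode,
                 major(st.st_rdev), minor(st.st_rdev));
          return 0;
  }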
Fixes: 78e26bec4d6d ("smb: client: parse uid, gid, mode and dev from WSL reparse points") Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/reparse.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index d88b41133e00..b387dfbaf16b 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -747,11 +747,12 @@ int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, return parse_reparse_point(buf, plen, cifs_sb, full_path, true, data); } -static void wsl_to_fattr(struct cifs_open_info_data *data, +static bool wsl_to_fattr(struct cifs_open_info_data *data, struct cifs_sb_info *cifs_sb, u32 tag, struct cifs_fattr *fattr) { struct smb2_file_full_ea_info *ea; + bool have_xattr_dev = false; u32 next = 0; switch (tag) { @@ -794,13 +795,24 @@ static void wsl_to_fattr(struct cifs_open_info_data *data, fattr->cf_uid = wsl_make_kuid(cifs_sb, v); else if (!strncmp(name, SMB2_WSL_XATTR_GID, nlen)) fattr->cf_gid = wsl_make_kgid(cifs_sb, v); - else if (!strncmp(name, SMB2_WSL_XATTR_MODE, nlen)) + else if (!strncmp(name, SMB2_WSL_XATTR_MODE, nlen)) { + /* File type in reparse point tag and in xattr mode must match. */ + if (S_DT(fattr->cf_mode) != S_DT(le32_to_cpu(*(__le32 *)v))) + return false; fattr->cf_mode = (umode_t)le32_to_cpu(*(__le32 *)v); - else if (!strncmp(name, SMB2_WSL_XATTR_DEV, nlen)) + } else if (!strncmp(name, SMB2_WSL_XATTR_DEV, nlen)) { fattr->cf_rdev = reparse_mkdev(v); + have_xattr_dev = true; + } } while (next); out: + + /* Major and minor numbers for char and block devices are mandatory. */ + if (!have_xattr_dev && (tag == IO_REPARSE_TAG_LX_CHR || tag == IO_REPARSE_TAG_LX_BLK)) + return false; + fattr->cf_dtype = S_DT(fattr->cf_mode); + return true; } static bool posix_reparse_to_fattr(struct cifs_sb_info *cifs_sb, @@ -874,7 +886,9 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, case IO_REPARSE_TAG_AF_UNIX: case IO_REPARSE_TAG_LX_CHR: case IO_REPARSE_TAG_LX_BLK: - wsl_to_fattr(data, cifs_sb, tag, fattr); + ok = wsl_to_fattr(data, cifs_sb, tag, fattr); + if (!ok) + return false; break; case IO_REPARSE_TAG_NFS: ok = posix_reparse_to_fattr(cifs_sb, fattr, data); -- cgit v1.2.3 From 25f6184e24b3991eae977a29ecf27d537cc930b2 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Wed, 25 Dec 2024 14:00:39 +0100 Subject: cifs: Remove intermediate object of failed create SFU call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check if the server honored ATTR_SYSTEM flag by CREATE_OPTION_SPECIAL option. If not then server does not support ATTR_SYSTEM and newly created file is not SFU compatible, which means that the call failed. If CREATE was successful but either setting ATTR_SYSTEM failed or writing type/data information failed then remove the intermediate object created by CREATE. Otherwise intermediate empty object stay on the server. This ensures that if the creating of SFU files with system attribute is unsupported by the server then no empty file stay on the server as a result of unsupported operation. This is for example case with Samba server and Linux tmpfs storage without enabled xattr support (where Samba stores ATTR_SYSTEM bit). 
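For context, this SFU code path is what backs a plain mknod(2) call on a share mounted with the sfu mount option; a minimal userspace sketch (the mount point and device numbers are placeholders) is:

  #include <stdio.h>
  #include <sys/stat.h>
  #include <sys/sysmacros.h>

  int main(void)
  {
          /* placeholder path on a cifs share mounted with -o sfu */
          if (mknod("/mnt/share/nulldev", S_IFCHR | 0666, makedev(1, 3)) != 0) {
                  perror("mknod");
                  return 1;
          }
          return 0;
  }

With this change, if the server does not honor ATTR_SYSTEM or writing the type/data information fails, the call above fails cleanly instead of leaving an empty regular file behind on the share.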
Cc: stable@vger.kernel.org Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index d640dcabc305..77309217dab4 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -5077,6 +5077,7 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, { struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; + struct cifs_open_info_data idata; struct cifs_io_parms io_parms = {}; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_fid fid; @@ -5146,10 +5147,20 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, CREATE_OPTION_SPECIAL, ACL_NO_MODE); oparms.fid = &fid; - rc = server->ops->open(xid, &oparms, &oplock, NULL); + rc = server->ops->open(xid, &oparms, &oplock, &idata); if (rc) goto out; + /* + * Check if the server honored ATTR_SYSTEM flag by CREATE_OPTION_SPECIAL + * option. If not then server does not support ATTR_SYSTEM and newly + * created file is not SFU compatible, which means that the call failed. + */ + if (!(le32_to_cpu(idata.fi.Attributes) & ATTR_SYSTEM)) { + rc = -EOPNOTSUPP; + goto out_close; + } + if (type_len + data_len > 0) { io_parms.pid = current->tgid; io_parms.tcon = tcon; @@ -5164,8 +5175,18 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, iov, ARRAY_SIZE(iov)-1); } +out_close: server->ops->close(xid, tcon, &fid); + /* + * If CREATE was successful but either setting ATTR_SYSTEM failed or + * writing type/data information failed then remove the intermediate + * object created by CREATE. Otherwise intermediate empty object stay + * on the server. + */ + if (rc) + server->ops->unlink(xid, tcon, full_path, cifs_sb, NULL); + out: kfree(symname_utf16); return rc; -- cgit v1.2.3 From 8b19dfb34d17e77a0809d433cc128b779282131b Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Mon, 14 Oct 2024 13:43:23 +0200 Subject: cifs: Fix getting and setting SACLs over SMB1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SMB1 callback get_cifs_acl_by_fid() currently ignores its last argument and therefore ignores request for SACL_SECINFO. Fix this issue by correctly propagating info argument from get_cifs_acl() and get_cifs_acl_by_fid() to CIFSSMBGetCIFSACL() function and pass SACL_SECINFO when requested. For accessing SACLs it is needed to open object with SYSTEM_SECURITY access. Pass this flag when trying to get or set SACLs. Same logic is in the SMB2+ code path. This change fixes getting and setting of "system.cifs_ntsd_full" and "system.smb3_ntsd_full" xattrs over SMB1 as currently it silentely ignored SACL part of passed xattr buffer. 
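For illustration, the user-visible interface is the xattr pair named above; a minimal userspace sketch of reading the full security descriptor, including the SACL, over an SMB1 mount (the path and buffer size are placeholders, and the server must grant SYSTEM_SECURITY access) could look like:

  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/xattr.h>

  int main(void)
  {
          char buf[4096];
          ssize_t len;

          /* placeholder path on a cifs mount with -o vers=1.0 */
          len = getxattr("/mnt/share/file.txt", "system.cifs_ntsd_full",
                         buf, sizeof(buf));
          if (len < 0) {
                  perror("getxattr");
                  return 1;
          }
          printf("security descriptor with SACL: %zd bytes\n", len);
          return 0;
  }

Setting the descriptor back goes through setxattr() on the same xattr name.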
Fixes: 3970acf7ddb9 ("SMB3: Add support for getting and setting SACLs") Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/cifsacl.c | 25 +++++++++++++++---------- fs/smb/client/cifsproto.h | 2 +- fs/smb/client/cifssmb.c | 4 ++-- 3 files changed, 18 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index ba79aa2107cc..699a3f76d083 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -1395,7 +1395,7 @@ chown_chgrp_exit: #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, const struct cifs_fid *cifsfid, u32 *pacllen, - u32 __maybe_unused unused) + u32 info) { struct smb_ntsd *pntsd = NULL; unsigned int xid; @@ -1407,7 +1407,7 @@ struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, xid = get_xid(); rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), cifsfid->netfid, &pntsd, - pacllen); + pacllen, info); free_xid(xid); cifs_put_tlink(tlink); @@ -1419,7 +1419,7 @@ struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, } static struct smb_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, - const char *path, u32 *pacllen) + const char *path, u32 *pacllen, u32 info) { struct smb_ntsd *pntsd = NULL; int oplock = 0; @@ -1446,9 +1446,12 @@ static struct smb_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, .fid = &fid, }; + if (info & SACL_SECINFO) + oparms.desired_access |= SYSTEM_SECURITY; + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (!rc) { - rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen); + rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen, info); CIFSSMBClose(xid, tcon, fid.netfid); } @@ -1472,7 +1475,7 @@ struct smb_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, if (inode) open_file = find_readable_file(CIFS_I(inode), true); if (!open_file) - return get_cifs_acl_by_path(cifs_sb, path, pacllen); + return get_cifs_acl_by_path(cifs_sb, path, pacllen, info); pntsd = get_cifs_acl_by_fid(cifs_sb, &open_file->fid, pacllen, info); cifsFileInfo_put(open_file); @@ -1485,7 +1488,7 @@ int set_cifs_acl(struct smb_ntsd *pnntsd, __u32 acllen, { int oplock = 0; unsigned int xid; - int rc, access_flags; + int rc, access_flags = 0; struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); @@ -1498,10 +1501,12 @@ int set_cifs_acl(struct smb_ntsd *pnntsd, __u32 acllen, tcon = tlink_tcon(tlink); xid = get_xid(); - if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP) - access_flags = WRITE_OWNER; - else - access_flags = WRITE_DAC; + if (aclflag & CIFS_ACL_OWNER || aclflag & CIFS_ACL_GROUP) + access_flags |= WRITE_OWNER; + if (aclflag & CIFS_ACL_SACL) + access_flags |= SYSTEM_SECURITY; + if (aclflag & CIFS_ACL_DACL) + access_flags |= WRITE_DAC; oparms = (struct cifs_open_parms) { .tcon = tcon, diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 223e5e231f42..fcc9da838b70 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -557,7 +557,7 @@ extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nls_codepage, struct cifs_sb_info *cifs_sb); extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, - __u16 fid, struct smb_ntsd **acl_inf, __u32 *buflen); + __u16 fid, struct smb_ntsd **acl_inf, __u32 *buflen, __u32 info); extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16, struct smb_ntsd *pntsd, __u32 
len, int aclflag); extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 7f1cacc89dbb..3feaa0f68169 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -3369,7 +3369,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata, /* Get Security Descriptor (by handle) from remote server for a file or dir */ int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, - struct smb_ntsd **acl_inf, __u32 *pbuflen) + struct smb_ntsd **acl_inf, __u32 *pbuflen, __u32 info) { int rc = 0; int buf_type = 0; @@ -3392,7 +3392,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, pSMB->MaxSetupCount = 0; pSMB->Fid = fid; /* file handle always le */ pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP | - CIFS_ACL_DACL); + CIFS_ACL_DACL | info); pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */ inc_rfc1001_len(pSMB, 11); iov[0].iov_base = (char *)pSMB; -- cgit v1.2.3 From 24cf72976acee4b731c7c3ef2080e8535441fa3b Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Sun, 6 Oct 2024 19:44:50 +0200 Subject: cifs: Remove unicode parameter from parse_reparse_point() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This parameter is always true, so remove it and also remove dead code which is never called (for all false code paths). Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/cifsproto.h | 2 +- fs/smb/client/reparse.c | 25 +++++++++++-------------- fs/smb/client/smb1ops.c | 2 +- fs/smb/client/smb2file.c | 1 - fs/smb/client/smb2proto.h | 2 +- 5 files changed, 14 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index fcc9da838b70..81680001944d 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -656,7 +656,7 @@ char *extract_sharename(const char *unc); int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, const char *full_path, - bool unicode, struct cifs_open_info_data *data); + struct cifs_open_info_data *data); int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev, diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index b387dfbaf16b..0e47b8e097a0 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -536,7 +536,7 @@ static int parse_reparse_posix(struct reparse_posix_data *buf, } int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, - bool unicode, bool relative, + bool relative, const char *full_path, struct cifs_sb_info *cifs_sb) { @@ -547,26 +547,24 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, int rc; int i; - /* Check that length it valid for unicode/non-unicode mode */ - if (!len || (unicode && (len % 2))) { + /* Check that length it valid */ + if (!len || (len % 2)) { cifs_dbg(VFS, "srv returned malformed symlink buffer\n"); rc = -EIO; goto out; } /* - * Check that buffer does not contain UTF-16 null codepoint in unicode - * mode or null byte in non-unicode mode because Linux cannot process - * symlink with null byte. + * Check that buffer does not contain UTF-16 null codepoint + * because Linux cannot process symlink with null byte. 
*/ - if ((unicode && UniStrnlen((wchar_t *)buf, len/2) != len/2) || - (!unicode && strnlen(buf, len) != len)) { + if (UniStrnlen((wchar_t *)buf, len/2) != len/2) { cifs_dbg(VFS, "srv returned null byte in native symlink target location\n"); rc = -EIO; goto out; } - smb_target = cifs_strndup_from_utf16(buf, len, unicode, cifs_sb->local_nls); + smb_target = cifs_strndup_from_utf16(buf, len, true, cifs_sb->local_nls); if (!smb_target) { rc = -ENOMEM; goto out; @@ -621,7 +619,7 @@ out: } static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym, - u32 plen, bool unicode, + u32 plen, struct cifs_sb_info *cifs_sb, const char *full_path, struct cifs_open_info_data *data) @@ -641,7 +639,6 @@ static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym, return smb2_parse_native_symlink(&data->symlink_target, sym->PathBuffer + offs, len, - unicode, le32_to_cpu(sym->Flags) & SYMLINK_FLAG_RELATIVE, full_path, cifs_sb); @@ -696,7 +693,7 @@ static int parse_reparse_wsl_symlink(struct reparse_wsl_symlink_data_buffer *buf int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, const char *full_path, - bool unicode, struct cifs_open_info_data *data) + struct cifs_open_info_data *data) { struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -710,7 +707,7 @@ int parse_reparse_point(struct reparse_data_buffer *buf, case IO_REPARSE_TAG_SYMLINK: return parse_reparse_symlink( (struct reparse_symlink_data_buffer *)buf, - plen, unicode, cifs_sb, full_path, data); + plen, cifs_sb, full_path, data); case IO_REPARSE_TAG_LX_SYMLINK: return parse_reparse_wsl_symlink( (struct reparse_wsl_symlink_data_buffer *)buf, @@ -744,7 +741,7 @@ int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, buf = (struct reparse_data_buffer *)((u8 *)io + le32_to_cpu(io->OutputOffset)); - return parse_reparse_point(buf, plen, cifs_sb, full_path, true, data); + return parse_reparse_point(buf, plen, cifs_sb, full_path, data); } static bool wsl_to_fattr(struct cifs_open_info_data *data, diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 749a83cd0deb..55014c22f082 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -1010,7 +1010,7 @@ static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb, buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol + le32_to_cpu(io->DataOffset)); - return parse_reparse_point(buf, plen, cifs_sb, full_path, true, data); + return parse_reparse_point(buf, plen, cifs_sb, full_path, data); } static bool diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index 9ec44eab8dbc..c5e689b2fc49 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -89,7 +89,6 @@ int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec return smb2_parse_native_symlink(path, (char *)sym->PathBuffer + sub_offs, sub_len, - true, le32_to_cpu(sym->Flags) & SYMLINK_FLAG_RELATIVE, full_path, cifs_sb); diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 09349fa8da03..10f5e37d1530 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -112,7 +112,7 @@ extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, const unsigned char *path, char *pbuf, unsigned int *pbytes_read); int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, - bool unicode, bool relative, + bool relative, const char *full_path, struct cifs_sb_info *cifs_sb); int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, -- cgit 
v1.2.3 From 65ccccee4eb1a8147a3242238f7730bd8359077e Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Sat, 28 Sep 2024 14:13:44 +0200 Subject: cifs: Remove struct reparse_posix_data from struct cifs_open_info_data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux SMB client already supports more reparse point types but only the reparse_posix_data is defined in union of struct cifs_open_info_data. This union is currently used as implicit casting between point types. With this code style, it hides information that union is used for pointer casting, and just in mknod_nfs() and posix_reparse_to_fattr() functions. Other reparse point buffers do not use this kind of casting. So remove reparse_posix_data from reparse part of struct cifs_open_info_data and for all cases of reparse buffer use just struct reparse_data_buffer *buf. Signed-off-by: Pali Rohár Reviewed-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 5 +---- fs/smb/client/reparse.c | 5 ++--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 49ffc040f736..63f23a845e15 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -226,10 +226,7 @@ struct cifs_open_info_data { struct kvec iov; } io; __u32 tag; - union { - struct reparse_data_buffer *buf; - struct reparse_posix_data *posix; - }; + struct reparse_data_buffer *buf; } reparse; struct { __u8 eas[SMB2_WSL_MAX_QUERY_EA_RESP_SIZE]; diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 0e47b8e097a0..77f891f718c0 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -294,7 +294,7 @@ static int mknod_nfs(unsigned int xid, struct inode *inode, data = (struct cifs_open_info_data) { .reparse_point = true, - .reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, }, + .reparse = { .tag = IO_REPARSE_TAG_NFS, .buf = (struct reparse_data_buffer *)p, }, }; new = smb2_get_reparse_inode(&data, inode->i_sb, xid, @@ -816,8 +816,7 @@ static bool posix_reparse_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct cifs_open_info_data *data) { - struct reparse_posix_data *buf = data->reparse.posix; - + struct reparse_posix_data *buf = (struct reparse_posix_data *)data->reparse.buf; if (buf == NULL) return true; -- cgit v1.2.3 From b6d002f0a345218edbe9de049693004482a81327 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 13 Sep 2024 11:46:35 +0200 Subject: cifs: Rename struct reparse_posix_data to reparse_nfs_data_buffer and move to common/smb2pdu.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function parse_reparse_posix() parses NFS-style reparse points, which are used only by Windows NFS server since Windows Server 2012 version. This style is not understood by Microsoft POSIX/Interix/SFU/SUA subsystems. So make it clear that parse_reparse_posix() function and reparse_posix_data structure are not POSIX general, but rather NFS specific. All reparse buffer structures are defined in common/smb2pdu.h and have _buffer suffix. So move struct reparse_posix_data from client/cifspdu.h to common/smb2pdu.h and rename it to reparse_nfs_data_buffer for consistency. Note that also SMB specification in [MS-FSCC] document, section 2.1.2.6 defines it under name "Network File System (NFS) Reparse Data Buffer". So use this name for consistency. 
Having this structure in common/smb2pdu.h can be useful for ksmbd server code as NFS-style reparse points is the preferred way for implementing support for special files. Signed-off-by: Pali Rohár Acked-by: Paulo Alcantara (Red Hat) Reviewed-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/client/cifspdu.h | 14 -------------- fs/smb/client/reparse.c | 12 ++++++------ fs/smb/common/smb2pdu.h | 14 +++++++++++++- 3 files changed, 19 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index 5c047b00516f..c285f0e5ce0f 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -1484,20 +1484,6 @@ struct file_notify_information { __u8 FileName[]; } __attribute__((packed)); -/* For IO_REPARSE_TAG_NFS */ -#define NFS_SPECFILE_LNK 0x00000000014B4E4C -#define NFS_SPECFILE_CHR 0x0000000000524843 -#define NFS_SPECFILE_BLK 0x00000000004B4C42 -#define NFS_SPECFILE_FIFO 0x000000004F464946 -#define NFS_SPECFILE_SOCK 0x000000004B434F53 -struct reparse_posix_data { - __le32 ReparseTag; - __le16 ReparseDataLength; - __u16 Reserved; - __le64 InodeType; /* LNK, FIFO, CHR etc. */ - __u8 DataBuffer[]; -} __attribute__((packed)); - struct cifs_quota_data { __u32 rsrvd1; /* 0 */ __u32 sid_size; diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 77f891f718c0..3be2173a026d 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -242,7 +242,7 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb, return 0; } -static int nfs_set_reparse_buf(struct reparse_posix_data *buf, +static int nfs_set_reparse_buf(struct reparse_nfs_data_buffer *buf, mode_t mode, dev_t dev, struct kvec *iov) { @@ -281,13 +281,13 @@ static int mknod_nfs(unsigned int xid, struct inode *inode, const char *full_path, umode_t mode, dev_t dev) { struct cifs_open_info_data data; - struct reparse_posix_data *p; + struct reparse_nfs_data_buffer *p; struct inode *new; struct kvec iov; __u8 buf[sizeof(*p) + sizeof(__le64)]; int rc; - p = (struct reparse_posix_data *)buf; + p = (struct reparse_nfs_data_buffer *)buf; rc = nfs_set_reparse_buf(p, mode, dev, &iov); if (rc) return rc; @@ -474,7 +474,7 @@ int smb2_mknod_reparse(unsigned int xid, struct inode *inode, } /* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */ -static int parse_reparse_posix(struct reparse_posix_data *buf, +static int parse_reparse_nfs(struct reparse_nfs_data_buffer *buf, struct cifs_sb_info *cifs_sb, struct cifs_open_info_data *data) { @@ -702,7 +702,7 @@ int parse_reparse_point(struct reparse_data_buffer *buf, /* See MS-FSCC 2.1.2 */ switch (le32_to_cpu(buf->ReparseTag)) { case IO_REPARSE_TAG_NFS: - return parse_reparse_posix((struct reparse_posix_data *)buf, + return parse_reparse_nfs((struct reparse_nfs_data_buffer *)buf, cifs_sb, data); case IO_REPARSE_TAG_SYMLINK: return parse_reparse_symlink( @@ -816,7 +816,7 @@ static bool posix_reparse_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct cifs_open_info_data *data) { - struct reparse_posix_data *buf = (struct reparse_posix_data *)data->reparse.buf; + struct reparse_nfs_data_buffer *buf = (struct reparse_nfs_data_buffer *)data->reparse.buf; if (buf == NULL) return true; diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 3c7c706c797d..3336df2ea5d4 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1550,7 +1550,19 @@ struct reparse_symlink_data_buffer { __u8 PathBuffer[]; /* Variable Length */ } __packed; -/* See MS-FSCC 2.1.2.6 and cifspdu.h 
for struct reparse_posix_data */ +/* For IO_REPARSE_TAG_NFS - see MS-FSCC 2.1.2.6 */ +#define NFS_SPECFILE_LNK 0x00000000014B4E4C +#define NFS_SPECFILE_CHR 0x0000000000524843 +#define NFS_SPECFILE_BLK 0x00000000004B4C42 +#define NFS_SPECFILE_FIFO 0x000000004F464946 +#define NFS_SPECFILE_SOCK 0x000000004B434F53 +struct reparse_nfs_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __le64 InodeType; /* NFS_SPECFILE_* */ + __u8 DataBuffer[]; +} __packed; /* For IO_REPARSE_TAG_LX_SYMLINK */ struct reparse_wsl_symlink_data_buffer { -- cgit v1.2.3 From a46221fcdd40a29eb08900221797ad63d0271118 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Tue, 24 Dec 2024 15:31:22 +0100 Subject: cifs: Update description about ACL permissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are some incorrect information about individual SMB permission constants like WRITE_DAC can change ownership, or incomplete information to distinguish between ACL types (discretionary vs system) and there is completely missing information how permissions apply for directory objects and what is meaning of GENERIC_* bits. Also there is missing constant for MAXIMUM_ALLOWED permission. Fix and extend description of all SMB permission constants to match the reality, how the reference Windows SMB / NTFS implementation handles them. Links to official Microsoft documentation related to permissions: https://learn.microsoft.com/en-us/windows/win32/fileio/file-access-rights-constants https://learn.microsoft.com/en-us/windows/win32/secauthz/access-mask https://learn.microsoft.com/en-us/windows/win32/secauthz/standard-access-rights https://learn.microsoft.com/en-us/windows/win32/secauthz/generic-access-rights https://learn.microsoft.com/en-us/windows/win32/api/winternl/nf-winternl-ntcreatefile https://learn.microsoft.com/en-us/windows-hardware/drivers/ddi/ntifs/nf-ntifs-ntcreatefile Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/cifspdu.h | 82 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index c285f0e5ce0f..84743f3d7c51 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -190,42 +190,82 @@ */ #define FILE_READ_DATA 0x00000001 /* Data can be read from the file */ + /* or directory child entries can */ + /* be listed together with the */ + /* associated child attributes */ + /* (so the FILE_READ_ATTRIBUTES on */ + /* the child entry is not needed) */ #define FILE_WRITE_DATA 0x00000002 /* Data can be written to the file */ + /* or new file can be created in */ + /* the directory */ #define FILE_APPEND_DATA 0x00000004 /* Data can be appended to the file */ + /* (for non-local files over SMB it */ + /* is same as FILE_WRITE_DATA) */ + /* or new subdirectory can be */ + /* created in the directory */ #define FILE_READ_EA 0x00000008 /* Extended attributes associated */ /* with the file can be read */ #define FILE_WRITE_EA 0x00000010 /* Extended attributes associated */ /* with the file can be written */ #define FILE_EXECUTE 0x00000020 /*Data can be read into memory from */ /* the file using system paging I/O */ -#define FILE_DELETE_CHILD 0x00000040 + /* for executing the file / script */ + /* or right to traverse directory */ + /* (but by default all users have */ + /* directory bypass traverse */ + /* privilege and do not need this */ + /* permission on directories at all)*/ +#define 
FILE_DELETE_CHILD 0x00000040 /* Child entry can be deleted from */ + /* the directory (so the DELETE on */ + /* the child entry is not needed) */ #define FILE_READ_ATTRIBUTES 0x00000080 /* Attributes associated with the */ - /* file can be read */ + /* file or directory can be read */ #define FILE_WRITE_ATTRIBUTES 0x00000100 /* Attributes associated with the */ - /* file can be written */ -#define DELETE 0x00010000 /* The file can be deleted */ -#define READ_CONTROL 0x00020000 /* The access control list and */ - /* ownership associated with the */ - /* file can be read */ -#define WRITE_DAC 0x00040000 /* The access control list and */ - /* ownership associated with the */ - /* file can be written. */ + /* file or directory can be written */ +#define DELETE 0x00010000 /* The file or dir can be deleted */ +#define READ_CONTROL 0x00020000 /* The discretionary access control */ + /* list and ownership associated */ + /* with the file or dir can be read */ +#define WRITE_DAC 0x00040000 /* The discretionary access control */ + /* list associated with the file or */ + /* directory can be written */ #define WRITE_OWNER 0x00080000 /* Ownership information associated */ - /* with the file can be written */ + /* with the file/dir can be written */ #define SYNCHRONIZE 0x00100000 /* The file handle can waited on to */ /* synchronize with the completion */ /* of an input/output request */ #define SYSTEM_SECURITY 0x01000000 /* The system access control list */ - /* can be read and changed */ -#define GENERIC_ALL 0x10000000 -#define GENERIC_EXECUTE 0x20000000 -#define GENERIC_WRITE 0x40000000 -#define GENERIC_READ 0x80000000 - /* In summary - Relevant file */ - /* access flags from CIFS are */ - /* file_read_data, file_write_data */ - /* file_execute, file_read_attributes*/ - /* write_dac, and delete. */ + /* associated with the file or */ + /* directory can be read or written */ + /* (cannot be in DACL, can in SACL) */ +#define MAXIMUM_ALLOWED 0x02000000 /* Maximal subset of GENERIC_ALL */ + /* permissions which can be granted */ + /* (cannot be in DACL nor SACL) */ +#define GENERIC_ALL 0x10000000 /* Same as: GENERIC_EXECUTE | */ + /* GENERIC_WRITE | */ + /* GENERIC_READ | */ + /* FILE_DELETE_CHILD | */ + /* DELETE | */ + /* WRITE_DAC | */ + /* WRITE_OWNER */ + /* So GENERIC_ALL contains all bits */ + /* mentioned above except these two */ + /* SYSTEM_SECURITY MAXIMUM_ALLOWED */ +#define GENERIC_EXECUTE 0x20000000 /* Same as: FILE_EXECUTE | */ + /* FILE_READ_ATTRIBUTES | */ + /* READ_CONTROL | */ + /* SYNCHRONIZE */ +#define GENERIC_WRITE 0x40000000 /* Same as: FILE_WRITE_DATA | */ + /* FILE_APPEND_DATA | */ + /* FILE_WRITE_EA | */ + /* FILE_WRITE_ATTRIBUTES | */ + /* READ_CONTROL | */ + /* SYNCHRONIZE */ +#define GENERIC_READ 0x80000000 /* Same as: FILE_READ_DATA | */ + /* FILE_READ_EA | */ + /* FILE_READ_ATTRIBUTES | */ + /* READ_CONTROL | */ + /* SYNCHRONIZE */ #define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_READ_ATTRIBUTES) #define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \ -- cgit v1.2.3 From 65c49767dd4fc058673f9259fda1772fd398eaa7 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Thu, 26 Dec 2024 14:50:38 +0100 Subject: cifs: Remove symlink member from cifs_open_info_data union MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Member 'symlink' is part of the union in struct cifs_open_info_data. Its value is assigned on few places, but is always read through another union member 'reparse_point'. 
So to make code more readable, always use only 'reparse_point' member and drop whole union structure. No function change. Signed-off-by: Pali Rohár Acked-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 5 +---- fs/smb/client/inode.c | 2 +- fs/smb/client/smb1ops.c | 4 ++-- 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 63f23a845e15..2266b5b9a19f 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -215,10 +215,7 @@ struct cifs_cred { struct cifs_open_info_data { bool adjust_tz; - union { - bool reparse_point; - bool symlink; - }; + bool reparse_point; struct { /* ioctl response buffer */ struct { diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 93e9188b2632..8896c88320c8 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -990,7 +990,7 @@ cifs_get_file_info(struct file *filp) /* TODO: add support to query reparse tag */ data.adjust_tz = false; if (data.symlink_target) { - data.symlink = true; + data.reparse_point = true; data.reparse.tag = IO_REPARSE_TAG_SYMLINK; } path = build_path_from_dentry(dentry, page); diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 55014c22f082..236289ff14ed 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -551,7 +551,7 @@ static int cifs_query_path_info(const unsigned int xid, int rc; FILE_ALL_INFO fi = {}; - data->symlink = false; + data->reparse_point = false; data->adjust_tz = false; /* could do find first instead but this returns more info */ @@ -592,7 +592,7 @@ static int cifs_query_path_info(const unsigned int xid, /* Need to check if this is a symbolic link or not */ tmprc = CIFS_open(xid, &oparms, &oplock, NULL); if (tmprc == -EOPNOTSUPP) - data->symlink = true; + data->reparse_point = true; else if (tmprc == 0) CIFSSMBClose(xid, tcon, fid.netfid); } -- cgit v1.2.3 From 32ba03042ab2618f2622e4dae57ca802ac982e39 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Thu, 26 Dec 2024 14:56:33 +0100 Subject: cifs: Simplify reparse point check in cifs_query_path_info() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For checking if path is reparse point and setting data->reparse_point member, it is enough to check if ATTR_REPARSE is present. It is not required to call CIFS_open() without OPEN_REPARSE_POINT and checking for -EOPNOTSUPP error code. 
Signed-off-by: Pali Rohár Acked-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb1ops.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 236289ff14ed..9756b876a75e 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -569,32 +569,8 @@ static int cifs_query_path_info(const unsigned int xid, } if (!rc) { - int tmprc; - int oplock = 0; - struct cifs_fid fid; - struct cifs_open_parms oparms; - move_cifs_info_to_smb2(&data->fi, &fi); - - if (!(le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) - return 0; - - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .cifs_sb = cifs_sb, - .desired_access = FILE_READ_ATTRIBUTES, - .create_options = cifs_create_options(cifs_sb, 0), - .disposition = FILE_OPEN, - .path = full_path, - .fid = &fid, - }; - - /* Need to check if this is a symbolic link or not */ - tmprc = CIFS_open(xid, &oparms, &oplock, NULL); - if (tmprc == -EOPNOTSUPP) - data->reparse_point = true; - else if (tmprc == 0) - CIFSSMBClose(xid, tcon, fid.netfid); + data->reparse_point = le32_to_cpu(fi.Attributes) & ATTR_REPARSE; } return rc; -- cgit v1.2.3 From 12b466eb52d926802b6898d2cb7e67386467f54a Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Mon, 23 Sep 2024 23:01:59 +0200 Subject: cifs: Fix creating and resolving absolute NT-style symlinks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the SMB symlink is stored on NT server in absolute form then it points to the NT object hierarchy, which is different from POSIX one and needs some conversion / mapping. To make interoperability with Windows SMB server and WSL subsystem, reuse its logic of mapping between NT paths and POSIX paths into Linux SMB client. WSL subsystem on Windows uses for -t drvfs mount option -o symlinkroot= which specifies the POSIX path where are expected to be mounted lowercase Windows drive letters (without colon). Do same for Linux SMB client and add a new mount option -o symlinkroot= which mimics the drvfs mount option of the same name. It specifies where in the Linux VFS hierarchy is the root of the DOS / Windows drive letters, and translates between absolute NT-style symlinks and absolute Linux VFS symlinks. Default value of symlinkroot is "/mnt", same what is using WSL. Note that DOS / Windows drive letter symlinks are just subset of all possible NT-style symlinks. Drive letters live in NT subtree \??\ and important details about NT paths and object hierarchy are in the comments in this change. When symlink target location from non-POSIX SMB server is in absolute form (indicated by absence of SYMLINK_FLAG_RELATIVE) then it is converted to Linux absolute symlink according to symlinkroot configuration. And when creating a new symlink on non-POSIX SMB server in absolute form then Linux absolute target is converted to NT-style according to symlinkroot configuration. When SMB server is POSIX, then this change does not affect neither reading target location of symlink, nor creating a new symlink. It is expected that POSIX SMB server works with POSIX paths where the absolute root is /. This change improves interoperability of absolute SMB symlinks with Windows SMB servers. 
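A worked example of the translation described above (paths are illustrative, using the default symlinkroot=/mnt): an absolute NT-style symlink target stored on the server as

  \??\C:\Users\test\file.txt

is presented to the Linux client as

  /mnt/c/Users/test/file.txt

and, in the other direction, creating a symlink on the client whose target is

  /mnt/d/data/file.txt

stores, on a server without POSIX paths, the NT-style target

  \??\D:\data\file.txt

Absolute Linux targets that lie outside of symlinkroot, and NT targets that do not resolve to a drive letter (for example Volume{...} targets), cannot be translated and are rejected with an error.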
Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/fs_context.c | 22 ++++ fs/smb/client/fs_context.h | 2 + fs/smb/client/reparse.c | 267 ++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 273 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 5381f05420bc..d7d2f6c607b5 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -185,6 +185,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_string("cache", Opt_cache), fsparam_string("reparse", Opt_reparse), fsparam_string("upcall_target", Opt_upcalltarget), + fsparam_string("symlinkroot", Opt_symlinkroot), /* Arguments that should be ignored */ fsparam_flag("guest", Opt_ignore), @@ -386,6 +387,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx new_ctx->iocharset = NULL; new_ctx->leaf_fullpath = NULL; new_ctx->dns_dom = NULL; + new_ctx->symlinkroot = NULL; /* * Make sure to stay in sync with smb3_cleanup_fs_context_contents() */ @@ -401,6 +403,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(iocharset); DUP_CTX_STR(leaf_fullpath); DUP_CTX_STR(dns_dom); + DUP_CTX_STR(symlinkroot); return 0; } @@ -1727,6 +1730,16 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (parse_reparse_flavor(fc, param->string, ctx)) goto cifs_parse_mount_err; break; + case Opt_symlinkroot: + if (param->string[0] != '/') { + cifs_errorf(fc, "symlinkroot mount options must be absolute path\n"); + goto cifs_parse_mount_err; + } + kfree(ctx->symlinkroot); + ctx->symlinkroot = kstrdup(param->string, GFP_KERNEL); + if (!ctx->symlinkroot) + goto cifs_parse_mount_err; + break; } /* case Opt_ignore: - is ignored as expected ... */ @@ -1735,6 +1748,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; } + /* + * By default resolve all native absolute symlinks relative to "/mnt/". + * Same default has drvfs driver running in WSL for resolving SMB shares. 
+ */ + if (!ctx->symlinkroot) + ctx->symlinkroot = kstrdup("/mnt/", GFP_KERNEL); + return 0; cifs_parse_mount_err: @@ -1867,6 +1887,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) ctx->leaf_fullpath = NULL; kfree(ctx->dns_dom); ctx->dns_dom = NULL; + kfree(ctx->symlinkroot); + ctx->symlinkroot = NULL; } void diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 8813533345ee..43bc3119af21 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -166,6 +166,7 @@ enum cifs_param { Opt_cache, Opt_reparse, Opt_upcalltarget, + Opt_symlinkroot, /* Mount options to be ignored */ Opt_ignore, @@ -296,6 +297,7 @@ struct smb3_fs_context { enum cifs_reparse_type reparse_type; bool dfs_conn:1; /* set for dfs mounts */ char *dns_dom; + char *symlinkroot; /* top level directory for native SMB symlinks in absolute format */ }; extern const struct fs_parameter_spec smb3_fs_parameters[]; diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 3be2173a026d..344371dd895c 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -25,36 +25,131 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, const char *full_path, const char *symname) { struct reparse_symlink_data_buffer *buf = NULL; - struct cifs_open_info_data data; + struct cifs_open_info_data data = {}; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct inode *new; struct kvec iov; - __le16 *path; + __le16 *path = NULL; bool directory; - char *sym, sep = CIFS_DIR_SEP(cifs_sb); - u16 len, plen; + char *symlink_target = NULL; + char *sym = NULL; + char sep = CIFS_DIR_SEP(cifs_sb); + u16 len, plen, poff, slen; int rc = 0; if (strlen(symname) > REPARSE_SYM_PATH_MAX) return -ENAMETOOLONG; - sym = kstrdup(symname, GFP_KERNEL); - if (!sym) - return -ENOMEM; + symlink_target = kstrdup(symname, GFP_KERNEL); + if (!symlink_target) { + rc = -ENOMEM; + goto out; + } data = (struct cifs_open_info_data) { .reparse_point = true, .reparse = { .tag = IO_REPARSE_TAG_SYMLINK, }, - .symlink_target = sym, + .symlink_target = symlink_target, }; - convert_delimiter(sym, sep); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') { + /* + * This is a request to create an absolute symlink on the server + * which does not support POSIX paths, and expects symlink in + * NT-style path. So convert absolute Linux symlink target path + * to the absolute NT-style path. Root of the NT-style path for + * symlinks is specified in "symlinkroot" mount option. This will + * ensure compatibility of this symlink stored in absolute form + * on the SMB server. + */ + if (!strstarts(symname, cifs_sb->ctx->symlinkroot)) { + /* + * If the absolute Linux symlink target path is not + * inside "symlinkroot" location then there is no way + * to convert such Linux symlink to NT-style path. + */ + cifs_dbg(VFS, + "absolute symlink '%s' cannot be converted to NT format " + "because it is outside of symlinkroot='%s'\n", + symname, cifs_sb->ctx->symlinkroot); + rc = -EINVAL; + goto out; + } + len = strlen(cifs_sb->ctx->symlinkroot); + if (cifs_sb->ctx->symlinkroot[len-1] != '/') + len++; + if (symname[len] >= 'a' && symname[len] <= 'z' && + (symname[len+1] == '/' || symname[len+1] == '\0')) { + /* + * Symlink points to Linux target /symlinkroot/x/path/... + * where 'x' is the lowercase local Windows drive. + * NT-style path for 'x' has common form \??\X:\path\... + * with uppercase local Windows drive. 
+ */ + int common_path_len = strlen(symname+len+1)+1; + sym = kzalloc(6+common_path_len, GFP_KERNEL); + if (!sym) { + rc = -ENOMEM; + goto out; + } + memcpy(sym, "\\??\\", 4); + sym[4] = symname[len] - ('a'-'A'); + sym[5] = ':'; + memcpy(sym+6, symname+len+1, common_path_len); + } else { + /* Unhandled absolute symlink. Report an error. */ + cifs_dbg( + VFS, + "absolute symlink '%s' cannot be converted to NT format " + "because it points to unknown target\n", + symname); + rc = -EINVAL; + goto out; + } + } else { + /* + * This is request to either create an absolute symlink on + * server which expects POSIX paths or it is an request to + * create a relative symlink from the current directory. + * These paths have same format as relative SMB symlinks, + * so no conversion is needed. So just take symname as-is. + */ + sym = kstrdup(symname, GFP_KERNEL); + if (!sym) { + rc = -ENOMEM; + goto out; + } + } + + if (sep == '\\') + convert_delimiter(sym, sep); + + /* + * For absolute NT symlinks it is required to pass also leading + * backslash and to not mangle NT object prefix "\\??\\" and not to + * mangle colon in drive letter. But cifs_convert_path_to_utf16() + * removes leading backslash and replaces '?' and ':'. So temporary + * mask these characters in NT object prefix by '_' and then change + * them back. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') + sym[0] = sym[1] = sym[2] = sym[5] = '_'; + path = cifs_convert_path_to_utf16(sym, cifs_sb); if (!path) { rc = -ENOMEM; goto out; } + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') { + sym[0] = '\\'; + sym[1] = sym[2] = '?'; + sym[5] = ':'; + path[0] = cpu_to_le16('\\'); + path[1] = path[2] = cpu_to_le16('?'); + path[5] = cpu_to_le16(':'); + } + /* * SMB distinguish between symlink to directory and symlink to file. * They cannot be exchanged (symlink of file type which points to @@ -67,8 +162,18 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, if (rc < 0) goto out; - plen = 2 * UniStrnlen((wchar_t *)path, REPARSE_SYM_PATH_MAX); - len = sizeof(*buf) + plen * 2; + slen = 2 * UniStrnlen((wchar_t *)path, REPARSE_SYM_PATH_MAX); + poff = 0; + plen = slen; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') { + /* + * For absolute NT symlinks skip leading "\\??\\" in PrintName as + * PrintName is user visible location in DOS/Win32 format (not in NT format). + */ + poff = 4; + plen -= 2 * poff; + } + len = sizeof(*buf) + plen + slen; buf = kzalloc(len, GFP_KERNEL); if (!buf) { rc = -ENOMEM; @@ -77,17 +182,17 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK); buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer)); + buf->SubstituteNameOffset = cpu_to_le16(plen); - buf->SubstituteNameLength = cpu_to_le16(plen); - memcpy(&buf->PathBuffer[plen], path, plen); + buf->SubstituteNameLength = cpu_to_le16(slen); + memcpy(&buf->PathBuffer[plen], path, slen); + buf->PrintNameOffset = 0; buf->PrintNameLength = cpu_to_le16(plen); - memcpy(buf->PathBuffer, path, plen); + memcpy(buf->PathBuffer, path+poff, plen); + buf->Flags = cpu_to_le32(*symname != '/' ? 
SYMLINK_FLAG_RELATIVE : 0); - if (*sym != sep) - buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE); - convert_delimiter(sym, '/'); iov.iov_base = buf; iov.iov_len = len; new = smb2_get_reparse_inode(&data, inode->i_sb, xid, @@ -98,6 +203,7 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, else rc = PTR_ERR(new); out: + kfree(sym); kfree(path); cifs_free_open_info(&data); kfree(buf); @@ -543,6 +649,9 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, char sep = CIFS_DIR_SEP(cifs_sb); char *linux_target = NULL; char *smb_target = NULL; + int symlinkroot_len; + int abs_path_len; + char *abs_path; int levels; int rc; int i; @@ -570,7 +679,123 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, goto out; } - if (smb_target[0] == sep && relative) { + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && !relative) { + /* + * This is an absolute symlink from the server which does not + * support POSIX paths, so the symlink is in NT-style path. + * So convert it to absolute Linux symlink target path. Root of + * the NT-style path for symlinks is specified in "symlinkroot" + * mount option. + * + * Root of the DOS and Win32 paths is at NT path \??\ + * It means that DOS/Win32 path C:\folder\file.txt is + * NT path \??\C:\folder\file.txt + * + * NT systems have some well-known object symlinks in their NT + * hierarchy, which is needed to take into account when resolving + * other symlinks. Most commonly used symlink paths are: + * \?? -> \GLOBAL?? + * \DosDevices -> \?? + * \GLOBAL??\GLOBALROOT -> \ + * \GLOBAL??\Global -> \GLOBAL?? + * \GLOBAL??\NUL -> \Device\Null + * \GLOBAL??\UNC -> \Device\Mup + * \GLOBAL??\PhysicalDrive0 -> \Device\Harddisk0\DR0 (for each harddisk) + * \GLOBAL??\A: -> \Device\Floppy0 (if A: is the first floppy) + * \GLOBAL??\C: -> \Device\HarddiskVolume1 (if C: is the first harddisk) + * \GLOBAL??\D: -> \Device\CdRom0 (if D: is first cdrom) + * \SystemRoot -> \Device\Harddisk0\Partition1\WINDOWS (or where is NT system installed) + * \Volume{...} -> \Device\HarddiskVolume1 (where ... is system generated guid) + * + * In most common cases, absolute NT symlinks points to path on + * DOS/Win32 drive letter, system-specific Volume or on UNC share. + * Here are few examples of commonly used absolute NT symlinks + * created by mklink.exe tool: + * \??\C:\folder\file.txt + * \??\\C:\folder\file.txt + * \??\UNC\server\share\file.txt + * \??\\UNC\server\share\file.txt + * \??\Volume{b75e2c83-0000-0000-0000-602f00000000}\folder\file.txt + * + * It means that the most common path prefix \??\ is also NT path + * symlink (to \GLOBAL??). It is less common that second path + * separator is double backslash, but it is valid. + * + * Volume guid is randomly generated by the target system and so + * only the target system knows the mapping between guid and the + * hardisk number. Over SMB it is not possible to resolve this + * mapping, therefore symlinks pointing to target location of + * volume guids are totally unusable over SMB. + * + * For now parse only symlink paths available for DOS and Win32. + * Those are paths with \??\ prefix or paths which points to \??\ + * via other NT symlink (\DosDevices\, \GLOBAL??\, ...). 
+ */ + abs_path = smb_target; +globalroot: + if (strstarts(abs_path, "\\??\\")) + abs_path += sizeof("\\??\\")-1; + else if (strstarts(abs_path, "\\DosDevices\\")) + abs_path += sizeof("\\DosDevices\\")-1; + else if (strstarts(abs_path, "\\GLOBAL??\\")) + abs_path += sizeof("\\GLOBAL??\\")-1; + else { + /* Unhandled absolute symlink, points outside of DOS/Win32 */ + cifs_dbg(VFS, + "absolute symlink '%s' cannot be converted from NT format " + "because points to unknown target\n", + smb_target); + rc = -EIO; + goto out; + } + + /* Sometimes path separator after \?? is double backslash */ + if (abs_path[0] == '\\') + abs_path++; + + while (strstarts(abs_path, "Global\\")) + abs_path += sizeof("Global\\")-1; + + if (strstarts(abs_path, "GLOBALROOT\\")) { + /* Label globalroot requires path with leading '\\', so do not trim '\\' */ + abs_path += sizeof("GLOBALROOT")-1; + goto globalroot; + } + + /* For now parse only paths to drive letters */ + if (((abs_path[0] >= 'A' && abs_path[0] <= 'Z') || + (abs_path[0] >= 'a' && abs_path[0] <= 'z')) && + abs_path[1] == ':' && + (abs_path[2] == '\\' || abs_path[2] == '\0')) { + /* Convert drive letter to lowercase and drop colon */ + char drive_letter = abs_path[0]; + if (drive_letter >= 'A' && drive_letter <= 'Z') + drive_letter += 'a'-'A'; + abs_path++; + abs_path[0] = drive_letter; + } else { + /* Unhandled absolute symlink. Report an error. */ + cifs_dbg(VFS, + "absolute symlink '%s' cannot be converted from NT format " + "because points to unknown target\n", + smb_target); + rc = -EIO; + goto out; + } + + abs_path_len = strlen(abs_path)+1; + symlinkroot_len = strlen(cifs_sb->ctx->symlinkroot); + if (cifs_sb->ctx->symlinkroot[symlinkroot_len-1] == '/') + symlinkroot_len--; + linux_target = kmalloc(symlinkroot_len + 1 + abs_path_len, GFP_KERNEL); + if (!linux_target) { + rc = -ENOMEM; + goto out; + } + memcpy(linux_target, cifs_sb->ctx->symlinkroot, symlinkroot_len); + linux_target[symlinkroot_len] = '/'; + memcpy(linux_target + symlinkroot_len + 1, abs_path, abs_path_len); + } else if (smb_target[0] == sep && relative) { /* * This is a relative SMB symlink from the top of the share, * which is the top level directory of the Linux mount point. @@ -599,6 +824,12 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, } memcpy(linux_target + levels*3, smb_target+1, smb_target_len); /* +1 to skip leading sep */ } else { + /* + * This is either an absolute symlink in POSIX-style format + * or relative SMB symlink from the current directory. + * These paths have same format as Linux symlinks, so no + * conversion is needed. + */ linux_target = smb_target; smb_target = NULL; } -- cgit v1.2.3 From 660618dde2b4c372132a6be62f11ab68a0a1571a Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 11 Oct 2024 11:20:56 +0200 Subject: cifs: Add mount option -o symlink= for choosing symlink create type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently Linux CIFS client creates a new symlink of the first flavor which is allowed by mount options, parsed in this order: -o (no)mfsymlinks, -o (no)sfu, -o (no)unix (+ its aliases) and -o reparse=[type]. Introduce a new mount option -o symlink= for explicitly choosing a symlink flavor. Possible options are: -o symlink=default - The default behavior, like before this change. 
-o symlink=none - Disallow creating a new symlinks -o symlink=native - Create as native SMB symlink reparse point -o symlink=unix - Create via SMB1 unix extension command -o symlink=mfsymlinks - Create as regular file of mfsymlinks format -o symlink=sfu - Create as regular system file of SFU format -o symlink=nfs - Create as NFS reparse point -o symlink=wsl - Create as WSL reparse point So for example specifying -o sfu,mfsymlinks,symlink=native will allow to parse symlinks also of SFU and mfsymlinks types (which are disabled by default unless mount option is explicitly specified), but new symlinks will be created under native SMB type (which parsing is always enabled). Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 2 ++ fs/smb/client/cifsglob.h | 33 +++++++++++++++++++++ fs/smb/client/connect.c | 2 ++ fs/smb/client/fs_context.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++ fs/smb/client/fs_context.h | 16 +++++++++++ fs/smb/client/link.c | 60 ++++++++++++++++++++++++++++++--------- fs/smb/client/reparse.c | 52 ++++++++++++++++++++++++++------- fs/smb/client/reparse.h | 2 ++ 8 files changed, 214 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index b800c9f585d8..f2c852c9d6a1 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -715,6 +715,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) cifs_sb->ctx->backupgid)); seq_show_option(s, "reparse", cifs_reparse_type_str(cifs_sb->ctx->reparse_type)); + seq_show_option(s, "symlink", + cifs_symlink_type_str(get_cifs_symlink_type(cifs_sb))); seq_printf(s, ",rsize=%u", cifs_sb->ctx->rsize); seq_printf(s, ",wsize=%u", cifs_sb->ctx->wsize); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 2266b5b9a19f..9a96f69e67d0 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -177,6 +177,39 @@ static inline const char *cifs_reparse_type_str(enum cifs_reparse_type type) } } +enum cifs_symlink_type { + CIFS_SYMLINK_TYPE_DEFAULT, + CIFS_SYMLINK_TYPE_NONE, + CIFS_SYMLINK_TYPE_NATIVE, + CIFS_SYMLINK_TYPE_UNIX, + CIFS_SYMLINK_TYPE_MFSYMLINKS, + CIFS_SYMLINK_TYPE_SFU, + CIFS_SYMLINK_TYPE_NFS, + CIFS_SYMLINK_TYPE_WSL, +}; + +static inline const char *cifs_symlink_type_str(enum cifs_symlink_type type) +{ + switch (type) { + case CIFS_SYMLINK_TYPE_NONE: + return "none"; + case CIFS_SYMLINK_TYPE_NATIVE: + return "native"; + case CIFS_SYMLINK_TYPE_UNIX: + return "unix"; + case CIFS_SYMLINK_TYPE_MFSYMLINKS: + return "mfsymlinks"; + case CIFS_SYMLINK_TYPE_SFU: + return "sfu"; + case CIFS_SYMLINK_TYPE_NFS: + return "nfs"; + case CIFS_SYMLINK_TYPE_WSL: + return "wsl"; + default: + return "unknown"; + } +} + struct session_key { unsigned int len; char *response; diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 880d7cf8b730..ebd20f48f6aa 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2849,6 +2849,8 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) return 0; if (old->ctx->reparse_type != new->ctx->reparse_type) return 0; + if (old->ctx->symlink_type != new->ctx->symlink_type) + return 0; return 1; } diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index d7d2f6c607b5..5a9a5e04fb05 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -185,6 +185,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_string("cache", Opt_cache), fsparam_string("reparse", Opt_reparse), 
fsparam_string("upcall_target", Opt_upcalltarget), + fsparam_string("symlink", Opt_symlink), fsparam_string("symlinkroot", Opt_symlinkroot), /* Arguments that should be ignored */ @@ -360,6 +361,55 @@ static int parse_reparse_flavor(struct fs_context *fc, char *value, return 0; } +static const match_table_t symlink_flavor_tokens = { + { Opt_symlink_default, "default" }, + { Opt_symlink_none, "none" }, + { Opt_symlink_native, "native" }, + { Opt_symlink_unix, "unix" }, + { Opt_symlink_mfsymlinks, "mfsymlinks" }, + { Opt_symlink_sfu, "sfu" }, + { Opt_symlink_nfs, "nfs" }, + { Opt_symlink_wsl, "wsl" }, + { Opt_symlink_err, NULL }, +}; + +static int parse_symlink_flavor(struct fs_context *fc, char *value, + struct smb3_fs_context *ctx) +{ + substring_t args[MAX_OPT_ARGS]; + + switch (match_token(value, symlink_flavor_tokens, args)) { + case Opt_symlink_default: + ctx->symlink_type = CIFS_SYMLINK_TYPE_DEFAULT; + break; + case Opt_symlink_none: + ctx->symlink_type = CIFS_SYMLINK_TYPE_NONE; + break; + case Opt_symlink_native: + ctx->symlink_type = CIFS_SYMLINK_TYPE_NATIVE; + break; + case Opt_symlink_unix: + ctx->symlink_type = CIFS_SYMLINK_TYPE_UNIX; + break; + case Opt_symlink_mfsymlinks: + ctx->symlink_type = CIFS_SYMLINK_TYPE_MFSYMLINKS; + break; + case Opt_symlink_sfu: + ctx->symlink_type = CIFS_SYMLINK_TYPE_SFU; + break; + case Opt_symlink_nfs: + ctx->symlink_type = CIFS_SYMLINK_TYPE_NFS; + break; + case Opt_symlink_wsl: + ctx->symlink_type = CIFS_SYMLINK_TYPE_WSL; + break; + default: + cifs_errorf(fc, "bad symlink= option: %s\n", value); + return 1; + } + return 0; +} + #define DUP_CTX_STR(field) \ do { \ if (ctx->field) { \ @@ -1730,6 +1780,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (parse_reparse_flavor(fc, param->string, ctx)) goto cifs_parse_mount_err; break; + case Opt_symlink: + if (parse_symlink_flavor(fc, param->string, ctx)) + goto cifs_parse_mount_err; + break; case Opt_symlinkroot: if (param->string[0] != '/') { cifs_errorf(fc, "symlinkroot mount options must be absolute path\n"); @@ -1765,6 +1819,22 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, return -EINVAL; } +enum cifs_symlink_type get_cifs_symlink_type(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->ctx->symlink_type == CIFS_SYMLINK_TYPE_DEFAULT) { + if (cifs_sb->ctx->mfsymlinks) + return CIFS_SYMLINK_TYPE_MFSYMLINKS; + else if (cifs_sb->ctx->sfu_emul) + return CIFS_SYMLINK_TYPE_SFU; + else if (cifs_sb->ctx->linux_ext && !cifs_sb->ctx->no_linux_ext) + return CIFS_SYMLINK_TYPE_UNIX; + else + return CIFS_SYMLINK_TYPE_NATIVE; + } else { + return cifs_sb->ctx->symlink_type; + } +} + int smb3_init_fs_context(struct fs_context *fc) { struct smb3_fs_context *ctx; @@ -1841,6 +1911,7 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->retrans = 1; ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT; + ctx->symlink_type = CIFS_SYMLINK_TYPE_DEFAULT; /* * short int override_uid = -1; diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 43bc3119af21..204643428068 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -48,6 +48,18 @@ enum cifs_reparse_parm { Opt_reparse_err }; +enum cifs_symlink_parm { + Opt_symlink_default, + Opt_symlink_none, + Opt_symlink_native, + Opt_symlink_unix, + Opt_symlink_mfsymlinks, + Opt_symlink_sfu, + Opt_symlink_nfs, + Opt_symlink_wsl, + Opt_symlink_err +}; + enum cifs_sec_param { Opt_sec_krb5, Opt_sec_krb5i, @@ -166,6 +178,7 @@ enum cifs_param { Opt_cache, Opt_reparse, Opt_upcalltarget, + Opt_symlink, Opt_symlinkroot, 
/* Mount options to be ignored */ @@ -295,6 +308,7 @@ struct smb3_fs_context { struct cifs_ses *dfs_root_ses; bool dfs_automount:1; /* set for dfs automount only */ enum cifs_reparse_type reparse_type; + enum cifs_symlink_type symlink_type; bool dfs_conn:1; /* set for dfs mounts */ char *dns_dom; char *symlinkroot; /* top level directory for native SMB symlinks in absolute format */ @@ -302,6 +316,8 @@ struct smb3_fs_context { extern const struct fs_parameter_spec smb3_fs_parameters[]; +extern enum cifs_symlink_type get_cifs_symlink_type(struct cifs_sb_info *cifs_sb); + extern int smb3_init_fs_context(struct fs_context *fc); extern void smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx); extern void smb3_cleanup_fs_context(struct smb3_fs_context *ctx); diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index 47ddeb7fa111..6e6c09cc5ce7 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -18,6 +18,7 @@ #include "cifs_unicode.h" #include "smb2proto.h" #include "cifs_ioctl.h" +#include "fs_context.h" /* * M-F Symlink Functions - Begin @@ -604,22 +605,53 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, cifs_dbg(FYI, "symname is %s\n", symname); /* BB what if DFS and this volume is on different share? BB */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); - } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { - rc = __cifs_sfu_make_node(xid, inode, direntry, pTcon, - full_path, S_IFLNK, 0, symname); + rc = -EOPNOTSUPP; + switch (get_cifs_symlink_type(cifs_sb)) { + case CIFS_SYMLINK_TYPE_DEFAULT: + /* should not happen, get_cifs_symlink_type() resolves the default */ + break; + + case CIFS_SYMLINK_TYPE_NONE: + break; + + case CIFS_SYMLINK_TYPE_UNIX: #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - } else if (pTcon->unix_ext) { - rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); + if (pTcon->unix_ext) { + rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, + symname, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ - } else if (server->ops->create_reparse_symlink) { - rc = server->ops->create_reparse_symlink(xid, inode, direntry, - pTcon, full_path, - symname); - goto symlink_exit; + break; + + case CIFS_SYMLINK_TYPE_MFSYMLINKS: + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { + rc = create_mf_symlink(xid, pTcon, cifs_sb, + full_path, symname); + } + break; + + case CIFS_SYMLINK_TYPE_SFU: + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + rc = __cifs_sfu_make_node(xid, inode, direntry, pTcon, + full_path, S_IFLNK, + 0, symname); + } + break; + + case CIFS_SYMLINK_TYPE_NATIVE: + case CIFS_SYMLINK_TYPE_NFS: + case CIFS_SYMLINK_TYPE_WSL: + if (server->ops->create_reparse_symlink) { + rc = server->ops->create_reparse_symlink(xid, inode, + direntry, + pTcon, + full_path, + symname); + goto symlink_exit; + } + break; } if (rc == 0) { diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 344371dd895c..24a5f563df26 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -14,6 +14,20 @@ #include "fs_context.h" #include "reparse.h" +static int mknod_nfs(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev, + const char *symname); + +static int mknod_wsl(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, 
umode_t mode, dev_t dev, + const char *symname); + +static int create_native_symlink(const unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, const char *symname); + static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb, const unsigned int xid, const char *full_path, @@ -23,6 +37,22 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb, int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, const char *symname) +{ + switch (get_cifs_symlink_type(CIFS_SB(inode->i_sb))) { + case CIFS_SYMLINK_TYPE_NATIVE: + return create_native_symlink(xid, inode, dentry, tcon, full_path, symname); + case CIFS_SYMLINK_TYPE_NFS: + return mknod_nfs(xid, inode, dentry, tcon, full_path, S_IFLNK, 0, symname); + case CIFS_SYMLINK_TYPE_WSL: + return mknod_wsl(xid, inode, dentry, tcon, full_path, S_IFLNK, 0, symname); + default: + return -EOPNOTSUPP; + } +} + +static int create_native_symlink(const unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, const char *symname) { struct reparse_symlink_data_buffer *buf = NULL; struct cifs_open_info_data data = {}; @@ -366,6 +396,7 @@ static int nfs_set_reparse_buf(struct reparse_nfs_data_buffer *buf, case NFS_SPECFILE_SOCK: dlen = 0; break; + case NFS_SPECFILE_LNK: /* TODO: add support for NFS symlinks */ default: return -EOPNOTSUPP; } @@ -384,7 +415,8 @@ static int nfs_set_reparse_buf(struct reparse_nfs_data_buffer *buf, static int mknod_nfs(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, umode_t mode, dev_t dev) + const char *full_path, umode_t mode, dev_t dev, + const char *symname) { struct cifs_open_info_data data; struct reparse_nfs_data_buffer *p; @@ -424,6 +456,7 @@ static int wsl_set_reparse_buf(struct reparse_data_buffer *buf, case IO_REPARSE_TAG_LX_FIFO: case IO_REPARSE_TAG_AF_UNIX: break; + case IO_REPARSE_TAG_LX_SYMLINK: /* TODO: add support for WSL symlinks */ default: return -EOPNOTSUPP; } @@ -521,7 +554,8 @@ static int wsl_set_xattrs(struct inode *inode, umode_t _mode, static int mknod_wsl(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, umode_t mode, dev_t dev) + const char *full_path, umode_t mode, dev_t dev, + const char *symname) { struct cifs_open_info_data data; struct reparse_data_buffer buf; @@ -566,17 +600,15 @@ int smb2_mknod_reparse(unsigned int xid, struct inode *inode, const char *full_path, umode_t mode, dev_t dev) { struct smb3_fs_context *ctx = CIFS_SB(inode->i_sb)->ctx; - int rc = -EOPNOTSUPP; switch (ctx->reparse_type) { case CIFS_REPARSE_TYPE_NFS: - rc = mknod_nfs(xid, inode, dentry, tcon, full_path, mode, dev); - break; + return mknod_nfs(xid, inode, dentry, tcon, full_path, mode, dev, NULL); case CIFS_REPARSE_TYPE_WSL: - rc = mknod_wsl(xid, inode, dentry, tcon, full_path, mode, dev); - break; + return mknod_wsl(xid, inode, dentry, tcon, full_path, mode, dev, NULL); + default: + return -EOPNOTSUPP; } - return rc; } /* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */ @@ -849,7 +881,7 @@ out: return rc; } -static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym, +static int parse_reparse_native_symlink(struct reparse_symlink_data_buffer *sym, u32 plen, struct cifs_sb_info *cifs_sb, const char *full_path, @@ -936,7 +968,7 @@ int 
parse_reparse_point(struct reparse_data_buffer *buf, return parse_reparse_nfs((struct reparse_nfs_data_buffer *)buf, cifs_sb, data); case IO_REPARSE_TAG_SYMLINK: - return parse_reparse_symlink( + return parse_reparse_native_symlink( (struct reparse_symlink_data_buffer *)buf, plen, cifs_sb, full_path, data); case IO_REPARSE_TAG_LX_SYMLINK: diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index ff05b0e75c92..5a753fec7e2c 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -50,6 +50,7 @@ static inline kgid_t wsl_make_kgid(struct cifs_sb_info *cifs_sb, static inline u64 reparse_mode_nfs_type(mode_t mode) { switch (mode & S_IFMT) { + case S_IFLNK: return NFS_SPECFILE_LNK; case S_IFBLK: return NFS_SPECFILE_BLK; case S_IFCHR: return NFS_SPECFILE_CHR; case S_IFIFO: return NFS_SPECFILE_FIFO; @@ -61,6 +62,7 @@ static inline u64 reparse_mode_nfs_type(mode_t mode) static inline u32 reparse_mode_wsl_tag(mode_t mode) { switch (mode & S_IFMT) { + case S_IFLNK: return IO_REPARSE_TAG_LX_SYMLINK; case S_IFBLK: return IO_REPARSE_TAG_LX_BLK; case S_IFCHR: return IO_REPARSE_TAG_LX_CHR; case S_IFIFO: return IO_REPARSE_TAG_LX_FIFO; -- cgit v1.2.3 From 78f69467cbbfd24da5ce9917c4b738b38a615f8b Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Sat, 21 Sep 2024 16:15:30 +0200 Subject: cifs: Add mount option -o reparse=none MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This new mount option allows to completely disable creating new reparse points. When -o sfu or -o mfsymlinks or -o symlink= is not specified then creating any special file (fifo, socket, symlink, block and char) will fail with -EOPNOTSUPP error. Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 3 +++ fs/smb/client/fs_context.c | 8 +++++++- fs/smb/client/fs_context.h | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 9a96f69e67d0..ee9754fad3e8 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -160,6 +160,7 @@ enum upcall_target_enum { }; enum cifs_reparse_type { + CIFS_REPARSE_TYPE_NONE, CIFS_REPARSE_TYPE_NFS, CIFS_REPARSE_TYPE_WSL, CIFS_REPARSE_TYPE_DEFAULT = CIFS_REPARSE_TYPE_NFS, @@ -168,6 +169,8 @@ enum cifs_reparse_type { static inline const char *cifs_reparse_type_str(enum cifs_reparse_type type) { switch (type) { + case CIFS_REPARSE_TYPE_NONE: + return "none"; case CIFS_REPARSE_TYPE_NFS: return "nfs"; case CIFS_REPARSE_TYPE_WSL: diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 5a9a5e04fb05..821eb149e4b8 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -334,6 +334,7 @@ cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_conte static const match_table_t reparse_flavor_tokens = { { Opt_reparse_default, "default" }, + { Opt_reparse_none, "none" }, { Opt_reparse_nfs, "nfs" }, { Opt_reparse_wsl, "wsl" }, { Opt_reparse_err, NULL }, @@ -348,6 +349,9 @@ static int parse_reparse_flavor(struct fs_context *fc, char *value, case Opt_reparse_default: ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT; break; + case Opt_reparse_none: + ctx->reparse_type = CIFS_REPARSE_TYPE_NONE; + break; case Opt_reparse_nfs: ctx->reparse_type = CIFS_REPARSE_TYPE_NFS; break; @@ -1828,8 +1832,10 @@ enum cifs_symlink_type get_cifs_symlink_type(struct cifs_sb_info *cifs_sb) return CIFS_SYMLINK_TYPE_SFU; else if (cifs_sb->ctx->linux_ext && !cifs_sb->ctx->no_linux_ext) return 
CIFS_SYMLINK_TYPE_UNIX; - else + else if (cifs_sb->ctx->reparse_type != CIFS_REPARSE_TYPE_NONE) return CIFS_SYMLINK_TYPE_NATIVE; + else + return CIFS_SYMLINK_TYPE_NONE; } else { return cifs_sb->ctx->symlink_type; } diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 204643428068..2ccdda350267 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -43,6 +43,7 @@ enum { enum cifs_reparse_parm { Opt_reparse_default, + Opt_reparse_none, Opt_reparse_nfs, Opt_reparse_wsl, Opt_reparse_err -- cgit v1.2.3 From 57b314752ec0ad42685bc78b376326f1f4c04669 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 29 Jan 2025 19:19:37 +0000 Subject: debugfs: Fix the missing initializations in __debugfs_file_get() both method table pointers in debugfs_fsdata need to be initialized, obviously, and calculating the bitmap of present methods would also go better if we start with initialized state. Fixes: 41a0ecc0997c ("debugfs: get rid of dynamically allocation proxy_ops") Signed-off-by: Al Viro Link: https://lore.kernel.org/r/20250129191937.GR1977892@ZenIV Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index e33cc77699cd..69e9ddcb113d 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -94,6 +94,7 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) fsd = d_fsd; } else { struct inode *inode = dentry->d_inode; + unsigned int methods = 0; if (WARN_ON(mode == DBGFS_GET_ALREADY)) return -EINVAL; @@ -106,25 +107,28 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) const struct debugfs_short_fops *ops; ops = fsd->short_fops = DEBUGFS_I(inode)->short_fops; if (ops->llseek) - fsd->methods |= HAS_LSEEK; + methods |= HAS_LSEEK; if (ops->read) - fsd->methods |= HAS_READ; + methods |= HAS_READ; if (ops->write) - fsd->methods |= HAS_WRITE; + methods |= HAS_WRITE; + fsd->real_fops = NULL; } else { const struct file_operations *ops; ops = fsd->real_fops = DEBUGFS_I(inode)->real_fops; if (ops->llseek) - fsd->methods |= HAS_LSEEK; + methods |= HAS_LSEEK; if (ops->read) - fsd->methods |= HAS_READ; + methods |= HAS_READ; if (ops->write) - fsd->methods |= HAS_WRITE; + methods |= HAS_WRITE; if (ops->unlocked_ioctl) - fsd->methods |= HAS_IOCTL; + methods |= HAS_IOCTL; if (ops->poll) - fsd->methods |= HAS_POLL; + methods |= HAS_POLL; + fsd->short_fops = NULL; } + fsd->methods = methods; refcount_set(&fsd->active_users, 1); init_completion(&fsd->active_users_drained); INIT_LIST_HEAD(&fsd->cancellations); -- cgit v1.2.3 From 45a99d5d117300eb84eceaa312bb3c3262f8c85b Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Mon, 16 Sep 2024 20:21:52 +0200 Subject: cifs: Add support for creating native Windows sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Native Windows sockets created by WinSock on Windows 10 April 2018 Update (version 1803) or Windows Server 2019 (version 1809) or later versions is reparse point with IO_REPARSE_TAG_AF_UNIX tag, with empty reparse point data buffer and without any EAs. Create AF_UNIX sockets in this native format if -o nonativesocket was not specified. This change makes AF_UNIX sockets created by Linux CIFS client compatible with AF_UNIX sockets created by Windows applications on NTFS volumes. 
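From userspace nothing changes: the socket is still created by binding an
AF_UNIX socket to a path on the mounted share, and it is that bind() which now
makes the client emit the IO_REPARSE_TAG_AF_UNIX create with an empty reparse
buffer. A minimal sketch for exercising the new behaviour (the mount point
/mnt/cifs and the socket name are arbitrary examples):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

int main(void)
{
    /* Path on a cifs mount; adjust to the actual mount point. */
    const char *path = "/mnt/cifs/test.sock";
    struct sockaddr_un addr = { .sun_family = AF_UNIX };
    int fd;

    strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);

    fd = socket(AF_UNIX, SOCK_STREAM, 0);
    if (fd < 0) {
        perror("socket");
        return 1;
    }

    /*
     * bind() is the point where the client creates the file on the share;
     * with the default (nativesocket) it goes out as a native reparse point.
     */
    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind");
        close(fd);
        return 1;
    }

    close(fd);
    return 0;
}
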
Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 4 ++++ fs/smb/client/connect.c | 2 ++ fs/smb/client/fs_context.c | 5 +++++ fs/smb/client/fs_context.h | 2 ++ fs/smb/client/reparse.c | 32 ++++++++++++++++++++++++++++++++ 5 files changed, 45 insertions(+) (limited to 'fs') diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index f2c852c9d6a1..6a3bd652d251 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -715,6 +715,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root) cifs_sb->ctx->backupgid)); seq_show_option(s, "reparse", cifs_reparse_type_str(cifs_sb->ctx->reparse_type)); + if (cifs_sb->ctx->nonativesocket) + seq_puts(s, ",nonativesocket"); + else + seq_puts(s, ",nativesocket"); seq_show_option(s, "symlink", cifs_symlink_type_str(get_cifs_symlink_type(cifs_sb))); diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index ebd20f48f6aa..f917de020dd5 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2849,6 +2849,8 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) return 0; if (old->ctx->reparse_type != new->ctx->reparse_type) return 0; + if (old->ctx->nonativesocket != new->ctx->nonativesocket) + return 0; if (old->ctx->symlink_type != new->ctx->symlink_type) return 0; diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 821eb149e4b8..e9b286d9a7ba 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -133,6 +133,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_flag("rootfs", Opt_rootfs), fsparam_flag("compress", Opt_compress), fsparam_flag("witness", Opt_witness), + fsparam_flag_no("nativesocket", Opt_nativesocket), /* Mount options which take uid or gid */ fsparam_uid("backupuid", Opt_backupuid), @@ -1784,6 +1785,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (parse_reparse_flavor(fc, param->string, ctx)) goto cifs_parse_mount_err; break; + case Opt_nativesocket: + ctx->nonativesocket = result.negated; + break; case Opt_symlink: if (parse_symlink_flavor(fc, param->string, ctx)) goto cifs_parse_mount_err; @@ -1918,6 +1922,7 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->retrans = 1; ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT; ctx->symlink_type = CIFS_SYMLINK_TYPE_DEFAULT; + ctx->nonativesocket = 0; /* * short int override_uid = -1; diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 2ccdda350267..881bfc08667e 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -179,6 +179,7 @@ enum cifs_param { Opt_cache, Opt_reparse, Opt_upcalltarget, + Opt_nativesocket, Opt_symlink, Opt_symlinkroot, @@ -310,6 +311,7 @@ struct smb3_fs_context { bool dfs_automount:1; /* set for dfs automount only */ enum cifs_reparse_type reparse_type; enum cifs_symlink_type symlink_type; + bool nonativesocket:1; bool dfs_conn:1; /* set for dfs mounts */ char *dns_dom; char *symlinkroot; /* top level directory for native SMB symlinks in absolute format */ diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 24a5f563df26..34bf4e3f28d9 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -378,6 +378,35 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb, return 0; } +static int create_native_socket(const unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path) +{ + struct reparse_data_buffer buf = { + .ReparseTag = 
cpu_to_le32(IO_REPARSE_TAG_AF_UNIX), + .ReparseDataLength = cpu_to_le16(0), + }; + struct cifs_open_info_data data = { + .reparse_point = true, + .reparse = { .tag = IO_REPARSE_TAG_AF_UNIX, .buf = &buf, }, + }; + struct kvec iov = { + .iov_base = &buf, + .iov_len = sizeof(buf), + }; + struct inode *new; + int rc = 0; + + new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + tcon, full_path, false, &iov, NULL); + if (!IS_ERR(new)) + d_instantiate(dentry, new); + else + rc = PTR_ERR(new); + cifs_free_open_info(&data); + return rc; +} + static int nfs_set_reparse_buf(struct reparse_nfs_data_buffer *buf, mode_t mode, dev_t dev, struct kvec *iov) @@ -601,6 +630,9 @@ int smb2_mknod_reparse(unsigned int xid, struct inode *inode, { struct smb3_fs_context *ctx = CIFS_SB(inode->i_sb)->ctx; + if (S_ISSOCK(mode) && !ctx->nonativesocket && ctx->reparse_type != CIFS_REPARSE_TYPE_NONE) + return create_native_socket(xid, inode, dentry, tcon, full_path); + switch (ctx->reparse_type) { case CIFS_REPARSE_TYPE_NFS: return mknod_nfs(xid, inode, dentry, tcon, full_path, mode, dev, NULL); -- cgit v1.2.3 From 071b8a67a8b2e611e837dfa342a883183a19c190 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 13 Sep 2024 11:42:59 +0200 Subject: cifs: Add support for creating NFS-style symlinks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CIFS client is currently able to parse NFS-style symlinks, but is not able to create them. This functionality is useful when the mounted SMB share is used also by Windows NFS server (on Windows Server 2012 or new). It allows interop of symlinks between SMB share mounted by Linux CIFS client and same export from Windows NFS server mounted by some NFS client. New symlinks would be created in NFS-style only in case the mount option -o reparse=nfs is specified, which is not by default. So default CIFS mounts are not affected by this change. 
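The on-the-wire format this creates is small: after the generic reparse point
header comes an 8-byte InodeType of NFS_SPECFILE_LNK followed by the symlink
target encoded as UTF-16LE without a trailing wide NUL. A simplified
standalone sketch of that blob (ASCII-only target and a little-endian host
assumed; the kernel itself converts through cifs_strndup_to_utf16() so the
mount's local charset is honoured):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* "LNK" inode type from MS-FSCC 2.1.2.6, used for NFS-style symlinks. */
#define NFS_SPECFILE_LNK 0x00000000014B4E4CULL

/*
 * Build the data that follows the generic reparse header of an
 * IO_REPARSE_TAG_NFS symlink: 8-byte InodeType, then the target path in
 * UTF-16LE with no trailing NUL. Returns the blob length, 0 on failure.
 */
static size_t build_nfs_symlink_data(const char *target, uint8_t **out)
{
    size_t len = strlen(target);
    uint8_t *buf = calloc(1, 8 + 2 * len);
    uint64_t type = NFS_SPECFILE_LNK;
    size_t i;

    if (!buf)
        return 0;
    memcpy(buf, &type, sizeof(type));        /* little-endian host assumed */
    for (i = 0; i < len; i++)
        buf[8 + 2 * i] = (uint8_t)target[i]; /* naive ASCII -> UTF-16LE */
    *out = buf;
    return 8 + 2 * len;
}

int main(void)
{
    uint8_t *blob;
    size_t n = build_nfs_symlink_data("dir/file.txt", &blob);

    printf("reparse data length: %zu bytes\n", n);
    free(blob);
    return 0;
}
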
Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/reparse.c | 47 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 34bf4e3f28d9..4cdaa83a215c 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -409,6 +409,8 @@ static int create_native_socket(const unsigned int xid, struct inode *inode, static int nfs_set_reparse_buf(struct reparse_nfs_data_buffer *buf, mode_t mode, dev_t dev, + __le16 *symname_utf16, + int symname_utf16_len, struct kvec *iov) { u64 type; @@ -419,13 +421,18 @@ static int nfs_set_reparse_buf(struct reparse_nfs_data_buffer *buf, switch ((type = reparse_mode_nfs_type(mode))) { case NFS_SPECFILE_BLK: case NFS_SPECFILE_CHR: - dlen = sizeof(__le64); + dlen = 2 * sizeof(__le32); + ((__le32 *)buf->DataBuffer)[0] = cpu_to_le32(MAJOR(dev)); + ((__le32 *)buf->DataBuffer)[1] = cpu_to_le32(MINOR(dev)); + break; + case NFS_SPECFILE_LNK: + dlen = symname_utf16_len; + memcpy(buf->DataBuffer, symname_utf16, symname_utf16_len); break; case NFS_SPECFILE_FIFO: case NFS_SPECFILE_SOCK: dlen = 0; break; - case NFS_SPECFILE_LNK: /* TODO: add support for NFS symlinks */ default: return -EOPNOTSUPP; } @@ -435,8 +442,6 @@ static int nfs_set_reparse_buf(struct reparse_nfs_data_buffer *buf, buf->InodeType = cpu_to_le64(type); buf->ReparseDataLength = cpu_to_le16(len + dlen - sizeof(struct reparse_data_buffer)); - *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MINOR(dev) << 32) | - MAJOR(dev)); iov->iov_base = buf; iov->iov_len = len + dlen; return 0; @@ -447,21 +452,42 @@ static int mknod_nfs(unsigned int xid, struct inode *inode, const char *full_path, umode_t mode, dev_t dev, const char *symname) { + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_open_info_data data; - struct reparse_nfs_data_buffer *p; + struct reparse_nfs_data_buffer *p = NULL; + __le16 *symname_utf16 = NULL; + int symname_utf16_len = 0; struct inode *new; struct kvec iov; __u8 buf[sizeof(*p) + sizeof(__le64)]; int rc; - p = (struct reparse_nfs_data_buffer *)buf; - rc = nfs_set_reparse_buf(p, mode, dev, &iov); + if (S_ISLNK(mode)) { + symname_utf16 = cifs_strndup_to_utf16(symname, strlen(symname), + &symname_utf16_len, + cifs_sb->local_nls, + NO_MAP_UNI_RSVD); + if (!symname_utf16) { + rc = -ENOMEM; + goto out; + } + symname_utf16_len -= 2; /* symlink is without trailing wide-nul */ + p = kzalloc(sizeof(*p) + symname_utf16_len, GFP_KERNEL); + if (!p) { + rc = -ENOMEM; + goto out; + } + } else { + p = (struct reparse_nfs_data_buffer *)buf; + } + rc = nfs_set_reparse_buf(p, mode, dev, symname_utf16, symname_utf16_len, &iov); if (rc) - return rc; + goto out; data = (struct cifs_open_info_data) { .reparse_point = true, .reparse = { .tag = IO_REPARSE_TAG_NFS, .buf = (struct reparse_data_buffer *)p, }, + .symlink_target = kstrdup(symname, GFP_KERNEL), }; new = smb2_get_reparse_inode(&data, inode->i_sb, xid, @@ -471,6 +497,11 @@ static int mknod_nfs(unsigned int xid, struct inode *inode, else rc = PTR_ERR(new); cifs_free_open_info(&data); +out: + if (S_ISLNK(mode)) { + kfree(symname_utf16); + kfree(p); + } return rc; } -- cgit v1.2.3 From 021840c1426c012a812f8b8d9413f3cf9d3e0b9b Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Sun, 29 Dec 2024 15:31:05 +0100 Subject: cifs: Fix struct FILE_ALL_INFO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct FILE_ALL_INFO for level 263 (0x107) used by QPathInfo does 
not have any IndexNumber, AccessFlags, IndexNumber1, CurrentByteOffset, Mode or AlignmentRequirement members. So remove all of them. Also adjust code in move_cifs_info_to_smb2() function which converts struct FILE_ALL_INFO to struct smb2_file_all_info. Fixed content of struct FILE_ALL_INFO was verified that is correct against: * [MS-CIFS] section 2.2.8.3.10 SMB_QUERY_FILE_ALL_INFO * Samba server implementation of trans2 query file/path for level 263 * Packet structure tests against Windows SMB servers This change fixes CIFSSMBQFileInfo() and CIFSSMBQPathInfo() functions which directly copy received FILE_ALL_INFO network buffers into kernel structures of FILE_ALL_INFO type. struct FILE_ALL_INFO is the response structure returned by the SMB server. So the incorrect definition of this structure can lead to returning bogus information in stat() call. Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 12 +++++++----- fs/smb/client/cifspdu.h | 6 ------ 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index ee9754fad3e8..5ba6b46fe9d1 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -2203,11 +2203,13 @@ static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses) static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const FILE_ALL_INFO *src) { - memcpy(dst, src, (size_t)((u8 *)&src->AccessFlags - (u8 *)src)); - dst->AccessFlags = src->AccessFlags; - dst->CurrentByteOffset = src->CurrentByteOffset; - dst->Mode = src->Mode; - dst->AlignmentRequirement = src->AlignmentRequirement; + memcpy(dst, src, (size_t)((u8 *)&src->EASize - (u8 *)src)); + dst->IndexNumber = 0; + dst->EASize = src->EASize; + dst->AccessFlags = 0; + dst->CurrentByteOffset = 0; + dst->Mode = 0; + dst->AlignmentRequirement = 0; dst->FileNameLength = src->FileNameLength; } diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index 84743f3d7c51..48d0d6f439cf 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -2290,13 +2290,7 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */ __u8 DeletePending; __u8 Directory; __u16 Pad2; - __le64 IndexNumber; __le32 EASize; - __le32 AccessFlags; - __u64 IndexNumber1; - __le64 CurrentByteOffset; - __le32 Mode; - __le32 AlignmentRequirement; __le32 FileNameLength; union { char __pad; -- cgit v1.2.3 From eea5119fa5979c350af5783a8148eacdd4219715 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 28 Jan 2025 01:04:23 -0600 Subject: smb3: add support for IAKerb There are now more servers which advertise support for IAKerb (passthrough Kerberos authentication via proxy). IAKerb is a public extension industry standard Kerberos protocol that allows a client without line-of-sight to a Domain Controller to authenticate. There can be cases where we would fail to mount if the server only advertises the OID for IAKerb in SPNEGO/GSSAPI. Add code to allow us to still upcall to userspace in these cases to obtain the Kerberos ticket. 
Signed-off-by: Steve French --- fs/smb/client/asn1.c | 2 ++ fs/smb/client/cifs_spnego.c | 4 +++- fs/smb/client/cifsglob.h | 4 ++++ fs/smb/client/sess.c | 3 ++- fs/smb/client/smb2pdu.c | 2 +- 5 files changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/asn1.c b/fs/smb/client/asn1.c index b5724ef9f182..214a44509e7b 100644 --- a/fs/smb/client/asn1.c +++ b/fs/smb/client/asn1.c @@ -52,6 +52,8 @@ int cifs_neg_token_init_mech_type(void *context, size_t hdrlen, server->sec_kerberos = true; else if (oid == OID_ntlmssp) server->sec_ntlmssp = true; + else if (oid == OID_IAKerb) + server->sec_iakerb = true; else { char buf[50]; diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index 28f568b5fc27..bc1c1e9b288a 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -138,11 +138,13 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, dp = description + strlen(description); - /* for now, only sec=krb5 and sec=mskrb5 are valid */ + /* for now, only sec=krb5 and sec=mskrb5 and iakerb are valid */ if (server->sec_kerberos) sprintf(dp, ";sec=krb5"); else if (server->sec_mskerberos) sprintf(dp, ";sec=mskrb5"); + else if (server->sec_iakerb) + sprintf(dp, ";sec=iakerb"); else { cifs_dbg(VFS, "unknown or missing server auth type, use krb5\n"); sprintf(dp, ";sec=krb5"); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 5ba6b46fe9d1..a68434ad744a 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -151,6 +151,7 @@ enum securityEnum { NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ Kerberos, /* Kerberos via SPNEGO */ + IAKerb, /* Kerberos proxy */ }; enum upcall_target_enum { @@ -781,6 +782,7 @@ struct TCP_Server_Info { bool sec_kerberosu2u; /* supports U2U Kerberos */ bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ + bool sec_iakerb; /* supports pass-through auth for Kerberos (krb5 proxy) */ bool large_buf; /* is current buffer large? 
*/ /* use SMBD connection instead of socket */ bool rdma; @@ -2148,6 +2150,8 @@ static inline char *get_security_type_str(enum securityEnum sectype) return "Kerberos"; case NTLMv2: return "NTLMv2"; + case IAKerb: + return "IAKerb"; default: return "Unknown"; } diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 91d4d409cb1d..faa80e7d54a6 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -1235,12 +1235,13 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) switch (requested) { case Kerberos: case RawNTLMSSP: + case IAKerb: return requested; case Unspecified: if (server->sec_ntlmssp && (global_secflags & CIFSSEC_MAY_NTLMSSP)) return RawNTLMSSP; - if ((server->sec_kerberos || server->sec_mskerberos) && + if ((server->sec_kerberos || server->sec_mskerberos || server->sec_iakerb) && (global_secflags & CIFSSEC_MAY_KRB5)) return Kerberos; fallthrough; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 9f54596a6866..40ad9e79437a 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1429,7 +1429,7 @@ smb2_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) if (server->sec_ntlmssp && (global_secflags & CIFSSEC_MAY_NTLMSSP)) return RawNTLMSSP; - if ((server->sec_kerberos || server->sec_mskerberos) && + if ((server->sec_kerberos || server->sec_mskerberos || server->sec_iakerb) && (global_secflags & CIFSSEC_MAY_KRB5)) return Kerberos; fallthrough; -- cgit v1.2.3 From 4e2043be5c149cb07d806c438a8ec8657741bd31 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Sat, 28 Sep 2024 13:24:26 +0200 Subject: cifs: Add support for creating WSL-style symlinks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change implements support for creating new symlink in WSL-style by Linux cifs client when -o reparse=wsl mount option is specified. WSL-style symlink uses reparse point with tag IO_REPARSE_TAG_LX_SYMLINK and symlink target location is stored in reparse buffer in UTF-8 encoding prefixed by 32-bit flags. Flags bits are unknown, but it was observed that WSL always sets flags to value 0x02000000. Do same in Linux cifs client. New symlinks would be created in WSL-style only in case the mount option -o reparse=wsl is specified, which is not by default. So default CIFS mounts are not affected by this change. 
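Put together, the reparse data for such a symlink is just the 32-bit flags
word followed by the UTF-8 target. A simplified standalone sketch of that
layout (little-endian host assumed; the patch itself round-trips the name
through UTF-16 using the mount's local_nls before producing the UTF-8 bytes):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Data portion of an IO_REPARSE_TAG_LX_SYMLINK reparse point: a 32-bit
 * flags word (observed value 0x02000000) followed by the target path in
 * UTF-8 with no trailing NUL byte. Returns the blob length, 0 on failure.
 */
static size_t build_wsl_symlink_data(const char *target_utf8, uint8_t **out)
{
    uint32_t flags = 0x02000000;  /* value WSL itself is observed to use */
    size_t plen = strlen(target_utf8);
    uint8_t *buf = malloc(4 + plen);

    if (!buf)
        return 0;
    memcpy(buf, &flags, sizeof(flags));  /* little-endian host assumed */
    memcpy(buf + 4, target_utf8, plen);
    *out = buf;
    return 4 + plen;
}

int main(void)
{
    uint8_t *blob;
    size_t n = build_wsl_symlink_data("dir/file.txt", &blob);

    printf("reparse data length: %zu bytes\n", n);
    free(blob);
    return 0;
}
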
Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/reparse.c | 65 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 4cdaa83a215c..0a5a52a8a7dd 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -505,9 +505,17 @@ out: return rc; } -static int wsl_set_reparse_buf(struct reparse_data_buffer *buf, - mode_t mode, struct kvec *iov) +static int wsl_set_reparse_buf(struct reparse_data_buffer **buf, + mode_t mode, const char *symname, + struct cifs_sb_info *cifs_sb, + struct kvec *iov) { + struct reparse_wsl_symlink_data_buffer *symlink_buf; + __le16 *symname_utf16; + int symname_utf16_len; + int symname_utf8_maxlen; + int symname_utf8_len; + size_t buf_len; u32 tag; switch ((tag = reparse_mode_wsl_tag(mode))) { @@ -515,17 +523,45 @@ static int wsl_set_reparse_buf(struct reparse_data_buffer *buf, case IO_REPARSE_TAG_LX_CHR: case IO_REPARSE_TAG_LX_FIFO: case IO_REPARSE_TAG_AF_UNIX: + buf_len = sizeof(struct reparse_data_buffer); + *buf = kzalloc(buf_len, GFP_KERNEL); + if (!*buf) + return -ENOMEM; + break; + case IO_REPARSE_TAG_LX_SYMLINK: + symname_utf16 = cifs_strndup_to_utf16(symname, strlen(symname), + &symname_utf16_len, + cifs_sb->local_nls, + NO_MAP_UNI_RSVD); + if (!symname_utf16) + return -ENOMEM; + symname_utf8_maxlen = symname_utf16_len/2*3; + symlink_buf = kzalloc(sizeof(struct reparse_wsl_symlink_data_buffer) + + symname_utf8_maxlen, GFP_KERNEL); + if (!symlink_buf) { + kfree(symname_utf16); + return -ENOMEM; + } + /* Flag 0x02000000 is unknown, but all wsl symlinks have this value */ + symlink_buf->Flags = cpu_to_le32(0x02000000); + /* PathBuffer is in UTF-8 but without trailing null-term byte */ + symname_utf8_len = utf16s_to_utf8s((wchar_t *)symname_utf16, symname_utf16_len/2, + UTF16_LITTLE_ENDIAN, + symlink_buf->PathBuffer, + symname_utf8_maxlen); + *buf = (struct reparse_data_buffer *)symlink_buf; + buf_len = sizeof(struct reparse_wsl_symlink_data_buffer) + symname_utf8_len; + kfree(symname_utf16); break; - case IO_REPARSE_TAG_LX_SYMLINK: /* TODO: add support for WSL symlinks */ default: return -EOPNOTSUPP; } - buf->ReparseTag = cpu_to_le32(tag); - buf->Reserved = 0; - buf->ReparseDataLength = 0; - iov->iov_base = buf; - iov->iov_len = sizeof(*buf); + (*buf)->ReparseTag = cpu_to_le32(tag); + (*buf)->Reserved = 0; + (*buf)->ReparseDataLength = cpu_to_le16(buf_len - sizeof(struct reparse_data_buffer)); + iov->iov_base = *buf; + iov->iov_len = buf_len; return 0; } @@ -617,25 +653,29 @@ static int mknod_wsl(unsigned int xid, struct inode *inode, const char *full_path, umode_t mode, dev_t dev, const char *symname) { + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_open_info_data data; - struct reparse_data_buffer buf; + struct reparse_data_buffer *buf; struct smb2_create_ea_ctx *cc; struct inode *new; unsigned int len; struct kvec reparse_iov, xattr_iov; int rc; - rc = wsl_set_reparse_buf(&buf, mode, &reparse_iov); + rc = wsl_set_reparse_buf(&buf, mode, symname, cifs_sb, &reparse_iov); if (rc) return rc; rc = wsl_set_xattrs(inode, mode, dev, &xattr_iov); - if (rc) + if (rc) { + kfree(buf); return rc; + } data = (struct cifs_open_info_data) { .reparse_point = true, - .reparse = { .tag = le32_to_cpu(buf.ReparseTag), .buf = &buf, }, + .reparse = { .tag = le32_to_cpu(buf->ReparseTag), .buf = buf, }, + .symlink_target = kstrdup(symname, GFP_KERNEL), }; cc = xattr_iov.iov_base; @@ -652,6 +692,7 @@ 
static int mknod_wsl(unsigned int xid, struct inode *inode, rc = PTR_ERR(new); cifs_free_open_info(&data); kfree(xattr_iov.iov_base); + kfree(buf); return rc; } -- cgit v1.2.3 From 2008d8c7121a9eee0ef8ea121581269886535150 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 27 Jan 2025 17:45:57 -0600 Subject: cifs: update internal version number To 2.53 Signed-off-by: Steve French --- fs/smb/client/cifsfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index a762dbbbd959..831fee962c4d 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -146,6 +146,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 52 -#define CIFS_VERSION "2.52" +#define SMB3_PRODUCT_BUILD 53 +#define CIFS_VERSION "2.53" #endif /* _CIFSFS_H */ -- cgit v1.2.3 From a49da4ef4b94345554923cdba1127a2d2a73d1e6 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Mon, 23 Sep 2024 22:29:30 +0200 Subject: cifs: Fix parsing native symlinks directory/file type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As SMB protocol distinguish between symlink to directory and symlink to file, add some mechanism to disallow resolving incompatible types. When SMB symlink is of the directory type, ensure that its target path ends with slash. This forces Linux to not allow resolving such symlink to file. And when SMB symlink is of the file type and its target path ends with slash then returns an error as such symlink is unresolvable. Such symlink always points to invalid location as file cannot end with slash. As POSIX server does not distinguish between symlinks to file and symlink directory, do not apply this change for symlinks from POSIX SMB server. For POSIX SMB servers, this change does nothing. This mimics Windows behavior of native SMB symlinks. Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/inode.c | 5 +++++ fs/smb/client/smb2file.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++ fs/smb/client/smb2inode.c | 5 +++++ fs/smb/client/smb2proto.h | 1 + 4 files changed, 62 insertions(+) (limited to 'fs') diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 8896c88320c8..9cc31cf6ebd0 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1216,6 +1216,11 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, full_path, iov, data); } + + if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) { + bool directory = le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY; + rc = smb2_fix_symlink_target_type(&data->symlink_target, directory, cifs_sb); + } break; } diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index c5e689b2fc49..d609a20fb98a 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -63,6 +63,52 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) return sym; } +int smb2_fix_symlink_target_type(char **target, bool directory, struct cifs_sb_info *cifs_sb) +{ + char *buf; + int len; + + /* + * POSIX server does not distinguish between symlinks to file and + * symlink directory. So nothing is needed to fix on the client side. 
+ */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) + return 0; + + if (!*target) + return -EIO; + + len = strlen(*target); + if (!len) + return -EIO; + + /* + * If this is directory symlink and it does not have trailing slash then + * append it. Trailing slash simulates Windows/SMB behavior which do not + * allow resolving directory symlink to file. + */ + if (directory && (*target)[len-1] != '/') { + buf = krealloc(*target, len+2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + buf[len] = '/'; + buf[len+1] = '\0'; + *target = buf; + len++; + } + + /* + * If this is a file (non-directory) symlink and it points to path name + * with trailing slash then this is an invalid symlink because file name + * cannot contain slash character. File name with slash is invalid on + * both Windows and Linux systems. So return an error for such symlink. + */ + if (!directory && (*target)[len-1] == '/') + return -EIO; + + return 0; +} + int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, const char *full_path, char **path) { @@ -132,6 +178,11 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 NULL, NULL, NULL); oparms->create_options &= ~OPEN_REPARSE_POINT; } + if (!rc) { + bool directory = le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY; + rc = smb2_fix_symlink_target_type(&data->symlink_target, + directory, oparms->cifs_sb); + } } } diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index c97f14757c27..5dfb30b0a852 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -1010,6 +1010,11 @@ int smb2_query_path_info(const unsigned int xid, else rc = -EOPNOTSUPP; } + + if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) { + bool directory = le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY; + rc = smb2_fix_symlink_target_type(&data->symlink_target, directory, cifs_sb); + } break; case -EREMOTE: break; diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 10f5e37d1530..2336dfb23f36 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -111,6 +111,7 @@ extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_read); +int smb2_fix_symlink_target_type(char **target, bool directory, struct cifs_sb_info *cifs_sb); int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, bool relative, const char *full_path, -- cgit v1.2.3 From f921da2c34692dfec5f72b5ae347b1bea22bb369 Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Tue, 21 Jan 2025 19:22:03 +0800 Subject: ocfs2: fix incorrect CPU endianness conversion causing mount failure Commit 23aab037106d ("ocfs2: fix UBSAN warning in ocfs2_verify_volume()") introduced a regression bug. The blksz_bits value is already converted to CPU endian in the previous code; therefore, the code shouldn't use le32_to_cpu() anymore. 
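To make the failure mode concrete: le32_to_cpu() is a byte swap on big-endian
hosts and a no-op on little-endian ones, so the stray second conversion only
corrupts the value (and fails the block-size check at mount time) on
big-endian machines, which is why the regression is easy to miss. A
standalone sketch of the double conversion (swap32() stands in for
le32_to_cpu() on a big-endian CPU):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for le32_to_cpu() on a big-endian CPU: a plain byte swap. */
static uint32_t swap32(uint32_t v)
{
    return (v >> 24) | ((v >> 8) & 0x0000ff00) |
           ((v << 8) & 0x00ff0000) | (v << 24);
}

int main(void)
{
    /* 12 stored little-endian on disk, read raw by a big-endian CPU */
    uint32_t raw = 0x0c000000;
    uint32_t blksz_bits = swap32(raw);   /* correct single conversion: 12 */
    uint32_t twice = swap32(blksz_bits); /* the buggy extra conversion */

    printf("converted once:  %u -> block size %u\n",
           (unsigned)blksz_bits, 1u << blksz_bits);
    printf("converted twice: %u -> bogus, ocfs2_verify_volume() rejects it\n",
           (unsigned)twice);
    return 0;
}
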
Link: https://lkml.kernel.org/r/20250121112204.12834-1-heming.zhao@suse.com Fixes: 23aab037106d ("ocfs2: fix UBSAN warning in ocfs2_verify_volume()") Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index e0b91dbaa0ac..8bb5022f3082 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2285,7 +2285,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, mlog(ML_ERROR, "found superblock with incorrect block " "size bits: found %u, should be 9, 10, 11, or 12\n", blksz_bits); - } else if ((1 << le32_to_cpu(blksz_bits)) != blksz) { + } else if ((1 << blksz_bits) != blksz) { mlog(ML_ERROR, "found superblock with incorrect block " "size: found %u, should be %u\n", 1 << blksz_bits, blksz); } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != -- cgit v1.2.3 From 6438ef381c183444f7f9d1de18f22661cba1e946 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Sat, 25 Jan 2025 07:20:53 +0900 Subject: nilfs2: fix possible int overflows in nilfs_fiemap() Since nilfs_bmap_lookup_contig() in nilfs_fiemap() calculates its result by being prepared to go through potentially maxblocks == INT_MAX blocks, the value in n may experience an overflow caused by left shift of blkbits. While it is extremely unlikely to occur, play it safe and cast right hand expression to wider type to mitigate the issue. Found by Linux Verification Center (linuxtesting.org) with static analysis tool SVACE. Link: https://lkml.kernel.org/r/20250124222133.5323-1-konishi.ryusuke@gmail.com Fixes: 622daaff0a89 ("nilfs2: fiemap support") Signed-off-by: Nikita Zhandarovich Signed-off-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index e8015d24a82c..6613b8fcceb0 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1186,7 +1186,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (size) { if (phys && blkphy << blkbits == phys + size) { /* The current extent goes on */ - size += n << blkbits; + size += (u64)n << blkbits; } else { /* Terminate the current extent */ ret = fiemap_fill_next_extent( @@ -1199,14 +1199,14 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags = FIEMAP_EXTENT_MERGED; logical = blkoff << blkbits; phys = blkphy << blkbits; - size = n << blkbits; + size = (u64)n << blkbits; } } else { /* Start a new extent */ flags = FIEMAP_EXTENT_MERGED; logical = blkoff << blkbits; phys = blkphy << blkbits; - size = n << blkbits; + size = (u64)n << blkbits; } blkoff += n; } -- cgit v1.2.3
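The overflow the nilfs2 change above guards against is easy to reproduce in
isolation: when n is a 32-bit quantity, n << blkbits is evaluated in 32 bits
and only then widened for the 64-bit addition, so the high bits are already
gone. A minimal sketch (the block count and block size are arbitrary
examples):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    unsigned int n = 1u << 22;   /* ~4M contiguous blocks, well below INT_MAX */
    unsigned int blkbits = 12;   /* 4 KiB block size */

    uint64_t wrong = n << blkbits;            /* shifted in 32 bits: wraps to 0 */
    uint64_t right = (uint64_t)n << blkbits;  /* widened first, as in the fix */

    printf("wrong: %llu\n", (unsigned long long)wrong);
    printf("right: %llu\n", (unsigned long long)right);
    return 0;
}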