From 825cf206ed510c4a1758bef8957e2b039253e2e3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 26 Aug 2022 23:58:44 -0700 Subject: statx: add direct I/O alignment information Traditionally, the conditions for when DIO (direct I/O) is supported were fairly simple. For both block devices and regular files, DIO had to be aligned to the logical block size of the block device. However, due to filesystem features that have been added over time (e.g. multi-device support, data journalling, inline data, encryption, verity, compression, checkpoint disabling, log-structured mode), the conditions for when DIO is allowed on a regular file have gotten increasingly complex. Whether a particular regular file supports DIO, and with what alignment, can depend on various file attributes and filesystem mount options, as well as which block device(s) the file's data is located on. Moreover, the general rule of DIO needing to be aligned to the block device's logical block size was recently relaxed to allow user buffers (but not file offsets) aligned to the DMA alignment instead. See commit bf8d08532bc1 ("iomap: add support for dma aligned direct-io"). XFS has an ioctl XFS_IOC_DIOINFO that exposes DIO alignment information. Uplifting this to the VFS is one possibility. However, as discussed (https://lore.kernel.org/linux-fsdevel/20220120071215.123274-1-ebiggers@kernel.org/T/#u), this ioctl is rarely used and not known to be used outside of XFS-specific code. It was also never intended to indicate when a file doesn't support DIO at all, nor was it intended for block devices. Therefore, let's expose this information via statx(). Add the STATX_DIOALIGN flag and two new statx fields associated with it: * stx_dio_mem_align: the alignment (in bytes) required for user memory buffers for DIO, or 0 if DIO is not supported on the file. * stx_dio_offset_align: the alignment (in bytes) required for file offsets and I/O segment lengths for DIO, or 0 if DIO is not supported on the file. This will only be nonzero if stx_dio_mem_align is nonzero, and vice versa. Note that as with other statx() extensions, if STATX_DIOALIGN isn't set in the returned statx struct, then these new fields won't be filled in. This will happen if the file is neither a regular file nor a block device, or if the file is a regular file and the filesystem doesn't support STATX_DIOALIGN. It might also happen if the caller didn't include STATX_DIOALIGN in the request mask, since statx() isn't required to return unrequested information. This commit only adds the VFS-level plumbing for STATX_DIOALIGN. For regular files, individual filesystems will still need to add code to support it. For block devices, a separate commit will wire it up too. Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Martin K. Petersen Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220827065851.135710-2-ebiggers@kernel.org --- include/linux/stat.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stat.h b/include/linux/stat.h index 7df06931f25d..ff277ced50e9 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -50,6 +50,8 @@ struct kstat { struct timespec64 btime; /* File creation time */ u64 blocks; u64 mnt_id; + u32 dio_mem_align; + u32 dio_offset_align; }; #endif -- cgit v1.2.3 From 2d985f8c6b91b5007a16e640bb9c038c5fb2839b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 26 Aug 2022 23:58:45 -0700 Subject: vfs: support STATX_DIOALIGN on block devices Add support for STATX_DIOALIGN to block devices, so that direct I/O alignment restrictions are exposed to userspace in a generic way. Note that this breaks the tradition of stat operating only on the block device node, not the block device itself. However, it was felt that doing this is preferable, in order to make the interface useful and avoid needing separate interfaces for regular files and block devices. Signed-off-by: Eric Biggers Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20220827065851.135710-3-ebiggers@kernel.org --- block/bdev.c | 23 +++++++++++++++++++++++ fs/stat.c | 12 ++++++++++++ include/linux/blkdev.h | 4 ++++ 3 files changed, 39 insertions(+) (limited to 'include/linux') diff --git a/block/bdev.c b/block/bdev.c index ce05175e71ce..d699ecdb3260 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "../fs/internal.h" #include "blk.h" @@ -1069,3 +1070,25 @@ void sync_bdevs(bool wait) spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); } + +/* + * Handle STATX_DIOALIGN for block devices. + * + * Note that the inode passed to this is the inode of a block device node file, + * not the block device's internal inode. Therefore it is *not* valid to use + * I_BDEV() here; the block device has to be looked up by i_rdev instead. + */ +void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) +{ + struct block_device *bdev; + + bdev = blkdev_get_no_open(inode->i_rdev); + if (!bdev) + return; + + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + stat->result_mask |= STATX_DIOALIGN; + + blkdev_put_no_open(bdev); +} diff --git a/fs/stat.c b/fs/stat.c index a7930d744483..ef50573c72a2 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -5,6 +5,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ +#include #include #include #include @@ -230,11 +231,22 @@ retry: goto out; error = vfs_getattr(&path, stat, request_mask, flags); + stat->mnt_id = real_mount(path.mnt)->mnt_id; stat->result_mask |= STATX_MNT_ID; + if (path.mnt->mnt_root == path.dentry) stat->attributes |= STATX_ATTR_MOUNT_ROOT; stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; + + /* Handle STATX_DIOALIGN for block devices. */ + if (request_mask & STATX_DIOALIGN) { + struct inode *inode = d_backing_inode(path.dentry); + + if (S_ISBLK(inode->i_mode)) + bdev_statx_dioalign(inode, stat); + } + path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 84b13fdd34a7..8038c5fbde40 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1498,6 +1498,7 @@ int sync_blockdev(struct block_device *bdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); +void bdev_statx_dioalign(struct inode *inode, struct kstat *stat); void printk_all_partitions(void); #else static inline void invalidate_bdev(struct block_device *bdev) @@ -1514,6 +1515,9 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) static inline void sync_bdevs(bool wait) { } +static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) +{ +} static inline void printk_all_partitions(void) { } -- cgit v1.2.3 From 53dd3f802a6e269868cb599609287a841e65a996 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 26 Aug 2022 23:58:46 -0700 Subject: fscrypt: change fscrypt_dio_supported() to prepare for STATX_DIOALIGN To prepare for STATX_DIOALIGN support, make two changes to fscrypt_dio_supported(). First, remove the filesystem-block-alignment check and make the filesystems handle it instead. It previously made sense to have it in fs/crypto/; however, to support STATX_DIOALIGN the alignment restriction would have to be returned to filesystems. It ends up being simpler if filesystems handle this part themselves, especially for f2fs which only allows fs-block-aligned DIO in the first place. Second, make fscrypt_dio_supported() work on inodes whose encryption key hasn't been set up yet, by making it set up the key if needed. This is required for statx(), since statx() doesn't require a file descriptor. Reviewed-by: Christoph Hellwig Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220827065851.135710-4-ebiggers@kernel.org --- fs/crypto/inline_crypt.c | 49 ++++++++++++++++++++++++------------------------ fs/ext4/file.c | 9 +++++++-- fs/f2fs/f2fs.h | 2 +- include/linux/fscrypt.h | 7 ++----- 4 files changed, 34 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 90f3e68f166e..8d4bee5bccbf 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -401,46 +401,45 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio, EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh); /** - * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is - * supported as far as encryption is concerned - * @iocb: the file and position the I/O is targeting - * @iter: the I/O data segment(s) + * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an + * inode, as far as encryption is concerned + * @inode: the inode in question * * Return: %true if there are no encryption constraints that prevent DIO from * being supported; %false if DIO is unsupported. (Note that in the * %true case, the filesystem might have other, non-encryption-related - * constraints that prevent DIO from actually being supported.) + * constraints that prevent DIO from actually being supported. Also, on + * encrypted files the filesystem is still responsible for only allowing + * DIO when requests are filesystem-block-aligned.) */ -bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter) +bool fscrypt_dio_supported(struct inode *inode) { - const struct inode *inode = file_inode(iocb->ki_filp); - const unsigned int blocksize = i_blocksize(inode); + int err; /* If the file is unencrypted, no veto from us. */ if (!fscrypt_needs_contents_encryption(inode)) return true; - /* We only support DIO with inline crypto, not fs-layer crypto. */ - if (!fscrypt_inode_uses_inline_crypto(inode)) - return false; - /* - * Since the granularity of encryption is filesystem blocks, the file - * position and total I/O length must be aligned to the filesystem block - * size -- not just to the block device's logical block size as is - * traditionally the case for DIO on many filesystems. + * We only support DIO with inline crypto, not fs-layer crypto. * - * We require that the user-provided memory buffers be filesystem block - * aligned too. It is simpler to have a single alignment value required - * for all properties of the I/O, as is normally the case for DIO. - * Also, allowing less aligned buffers would imply that data units could - * cross bvecs, which would greatly complicate the I/O stack, which - * assumes that bios can be split at any bvec boundary. + * To determine whether the inode is using inline crypto, we have to set + * up the key if it wasn't already done. This is because in the current + * design of fscrypt, the decision of whether to use inline crypto or + * not isn't made until the inode's encryption key is being set up. In + * the DIO read/write case, the key will always be set up already, since + * the file will be open. But in the case of statx(), the key might not + * be set up yet, as the file might not have been opened yet. */ - if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize)) + err = fscrypt_require_key(inode); + if (err) { + /* + * Key unavailable or couldn't be set up. This edge case isn't + * worth worrying about; just report that DIO is unsupported. + */ return false; - - return true; + } + return fscrypt_inode_uses_inline_crypto(inode); } EXPORT_SYMBOL_GPL(fscrypt_dio_supported); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 109d07629f81..26d742620897 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -40,8 +40,13 @@ static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); - if (!fscrypt_dio_supported(iocb, iter)) - return false; + if (IS_ENCRYPTED(inode)) { + if (!fscrypt_dio_supported(inode)) + return false; + if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), + i_blocksize(inode))) + return false; + } if (fsverity_active(inode)) return false; if (ext4_should_journal_data(inode)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3c7cdb70fe2e..0759da1919f4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4498,7 +4498,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int rw = iov_iter_rw(iter); - if (!fscrypt_dio_supported(iocb, iter)) + if (!fscrypt_dio_supported(inode)) return true; if (fsverity_active(inode)) return true; diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 7d2f1e0f23b1..13598859d5b3 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -768,7 +768,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh); -bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter); +bool fscrypt_dio_supported(struct inode *inode); u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); @@ -801,11 +801,8 @@ static inline bool fscrypt_mergeable_bio_bh(struct bio *bio, return true; } -static inline bool fscrypt_dio_supported(struct kiocb *iocb, - struct iov_iter *iter) +static inline bool fscrypt_dio_supported(struct inode *inode) { - const struct inode *inode = file_inode(iocb->ki_filp); - return !fscrypt_needs_contents_encryption(inode); } -- cgit v1.2.3