From e87f2c26c8085dac59978dee1beeb05cef31a9dd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 May 2022 09:28:12 -0400 Subject: struct file: use anonymous union member for rcuhead and llist Once upon a time we couldn't afford anon unions; these days minimal gcc version had been raised enough to take care of that. Reviewed-by: Jan Kara Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..6a2a4906041f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -924,9 +924,9 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) struct file { union { - struct llist_node fu_llist; - struct rcu_head fu_rcuhead; - } f_u; + struct llist_node f_llist; + struct rcu_head f_rcuhead; + }; struct path f_path; struct inode *f_inode; /* cached value */ const struct file_operations *f_op; -- cgit v1.3 From 91b94c5d6ae55d1161633047ffeea644b110b35f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 May 2022 09:39:27 -0400 Subject: iocb: delay evaluation of IS_SYNC(...) until we want to check IOCB_DSYNC New helper to be used instead of direct checks for IOCB_DSYNC: iocb_is_dsync(iocb). Checks converted, which allows to avoid the IS_SYNC(iocb->ki_filp->f_mapping->host) part (4 cache lines) from iocb_flags() - it's checked in iocb_is_dsync() instead Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- block/fops.c | 2 +- fs/btrfs/file.c | 2 +- fs/direct-io.c | 2 +- fs/fuse/file.c | 2 +- fs/iomap/direct-io.c | 3 +-- fs/zonefs/super.c | 2 +- include/linux/fs.h | 10 ++++++++-- 7 files changed, 14 insertions(+), 9 deletions(-) (limited to 'include/linux/fs.h') diff --git a/block/fops.c b/block/fops.c index d6b3276a6c68..6e86931ab847 100644 --- a/block/fops.c +++ b/block/fops.c @@ -37,7 +37,7 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb) unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; /* avoid the need for a I/O completion work item */ - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) op |= REQ_FUA; return op; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 98f81e304eb1..54358a5c9d56 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2021,7 +2021,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; - const bool sync = iocb->ki_flags & IOCB_DSYNC; + const bool sync = iocb_is_dsync(iocb); /* * If the fs flips readonly due to some impossible error, although we diff --git a/fs/direct-io.c b/fs/direct-io.c index 840752006f60..39647eb56904 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1210,7 +1210,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, */ if (dio->is_async && iov_iter_rw(iter) == WRITE) { retval = 0; - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) retval = dio_set_defer_completion(dio); else if (!dio->inode->i_sb->s_dio_done_wq) { /* diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 05caa2b9272e..00fa861aeead 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1042,7 +1042,7 @@ static unsigned int fuse_write_flags(struct kiocb *iocb) { unsigned int flags = iocb->ki_filp->f_flags; - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) flags |= O_DSYNC; if (iocb->ki_flags & IOCB_SYNC) flags |= O_SYNC; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index c10c69e2de24..31c7f1035b20 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -548,8 +548,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } /* for data sync or sync, we need sync completion processing */ - if (iocb->ki_flags & IOCB_DSYNC && - !(dio_flags & IOMAP_DIO_NOSYNC)) { + if (iocb_is_dsync(iocb) && !(dio_flags & IOMAP_DIO_NOSYNC)) { dio->flags |= IOMAP_DIO_NEED_SYNC; /* diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index bcb21aea990a..04a98b4cd7ee 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -746,7 +746,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); bio->bi_iter.bi_sector = zi->i_zsector; bio->bi_ioprio = iocb->ki_ioprio; - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) bio->bi_opf |= REQ_FUA; ret = bio_iov_iter_get_pages(bio, from); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6a2a4906041f..380a1292f4f9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2720,6 +2720,12 @@ extern int vfs_fsync(struct file *file, int datasync); extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, unsigned int flags); +static inline bool iocb_is_dsync(const struct kiocb *iocb) +{ + return (iocb->ki_flags & IOCB_DSYNC) || + IS_SYNC(iocb->ki_filp->f_mapping->host); +} + /* * Sync the bytes written if this was a synchronous write. Expect ki_pos * to already be updated for the write, and will return either the amount @@ -2727,7 +2733,7 @@ extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, */ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) { - if (iocb->ki_flags & IOCB_DSYNC) { + if (iocb_is_dsync(iocb)) { int ret = vfs_fsync_range(iocb->ki_filp, iocb->ki_pos - count, iocb->ki_pos - 1, (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); @@ -3262,7 +3268,7 @@ static inline int iocb_flags(struct file *file) res |= IOCB_APPEND; if (file->f_flags & O_DIRECT) res |= IOCB_DIRECT; - if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host)) + if (file->f_flags & O_DSYNC) res |= IOCB_DSYNC; if (file->f_flags & __O_SYNC) res |= IOCB_SYNC; -- cgit v1.3 From 164f4064ca81eefcea29f7f5dcf394f92be1d0c0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 May 2022 11:38:11 -0400 Subject: keep iocb_flags() result cached in struct file * calculate at the time we set FMODE_OPENED (do_dentry_open() for normal opens, alloc_file() for pipe()/socket()/etc.) * update when handling F_SETFL * keep in a new field - file->f_iocb_flags; since that thing is needed only before the refcount reaches zero, we can put it into the same anon union where ->f_rcuhead and ->f_llist live - those are used only after refcount reaches zero. Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- drivers/nvme/target/io-cmd-file.c | 2 +- fs/aio.c | 2 +- fs/fcntl.c | 1 + fs/file_table.c | 1 + fs/io_uring.c | 2 +- fs/open.c | 1 + include/linux/fs.h | 5 ++--- 7 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux/fs.h') diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index f3d58abf11e0..64b47e2a4633 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -112,7 +112,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, iocb->ki_pos = pos; iocb->ki_filp = req->ns->file; - iocb->ki_flags = ki_flags | iocb_flags(req->ns->file); + iocb->ki_flags = ki_flags | iocb->ki_filp->f_iocb_flags; return call_iter(iocb, &iter); } diff --git a/fs/aio.c b/fs/aio.c index 3c249b938632..2bdd444d408b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1475,7 +1475,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; - req->ki_flags = iocb_flags(req->ki_filp); + req->ki_flags = req->ki_filp->f_iocb_flags; if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { diff --git a/fs/fcntl.c b/fs/fcntl.c index 34a3faa4886d..146c9ab0cd4b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -78,6 +78,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) } spin_lock(&filp->f_lock); filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); + filp->f_iocb_flags = iocb_flags(filp); spin_unlock(&filp->f_lock); out: diff --git a/fs/file_table.c b/fs/file_table.c index b989e33aacda..905792b0521c 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -241,6 +241,7 @@ static struct file *alloc_file(const struct path *path, int flags, if ((file->f_mode & FMODE_WRITE) && likely(fop->write || fop->write_iter)) file->f_mode |= FMODE_CAN_WRITE; + file->f_iocb_flags = iocb_flags(file); file->f_mode |= FMODE_OPENED; file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3aab4182fd89..53424b1f019f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4330,7 +4330,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) if (!io_req_ffs_set(req)) req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; - kiocb->ki_flags = iocb_flags(file); + kiocb->ki_flags = file->f_iocb_flags; ret = kiocb_set_rw_flags(kiocb, req->rw.flags); if (unlikely(ret)) return ret; diff --git a/fs/open.c b/fs/open.c index 1d57fbde2feb..d80441a0bf17 100644 --- a/fs/open.c +++ b/fs/open.c @@ -862,6 +862,7 @@ static int do_dentry_open(struct file *f, f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); diff --git a/include/linux/fs.h b/include/linux/fs.h index 380a1292f4f9..c82b9d442f56 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -926,6 +926,7 @@ struct file { union { struct llist_node f_llist; struct rcu_head f_rcuhead; + unsigned int f_iocb_flags; }; struct path f_path; struct inode *f_inode; /* cached value */ @@ -2199,13 +2200,11 @@ static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns, !gid_valid(i_gid_into_mnt(mnt_userns, inode)); } -static inline int iocb_flags(struct file *file); - static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, - .ki_flags = iocb_flags(filp), + .ki_flags = filp->f_iocb_flags, .ki_ioprio = get_current_ioprio(), }; } -- cgit v1.3 From 868941b14441282ba08761b770fc6cad69d5bdb7 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 29 Jun 2022 15:07:00 +0200 Subject: fs: remove no_llseek Now that all callers of ->llseek are going through vfs_llseek(), we don't gain anything by keeping no_llseek around. Nothing actually calls it and setting ->llseek to no_lseek is completely equivalent to leaving it NULL. Longer term (== by the end of merge window) we want to remove all such intializations. To simplify the merge window this commit does *not* touch initializers - it only defines no_llseek as NULL (and simplifies the tests on file opening). At -rc1 we'll need do a mechanical removal of no_llseek - git grep -l -w no_llseek | grep -v porting.rst | while read i; do sed -i '/\/d' $i done would do it. Signed-off-by: Jason A. Donenfeld Signed-off-by: Al Viro --- Documentation/filesystems/porting.rst | 8 ++++++++ drivers/gpu/drm/drm_file.c | 3 +-- fs/file_table.c | 2 +- fs/open.c | 2 -- fs/read_write.c | 6 ------ include/linux/fs.h | 2 +- kernel/bpf/bpf_iter.c | 3 +-- 7 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 2e0e4f0e0c6f..aee9aaf9f3df 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -914,3 +914,11 @@ Calling conventions for file_open_root() changed; now it takes struct path * instead of passing mount and dentry separately. For callers that used to pass mnt_root> pair (i.e. the root of given mount), a new helper is provided - file_open_root_mnt(). In-tree users adjusted. + +--- + +**mandatory** + +no_llseek is gone; don't set .llseek to that - just leave it NULL instead. +Checks for "does that file have llseek(2), or should it fail with ESPIPE" +should be done by looking at FMODE_LSEEK in file->f_mode. diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c index ed25168619fc..dc7d2e5b16c8 100644 --- a/drivers/gpu/drm/drm_file.c +++ b/drivers/gpu/drm/drm_file.c @@ -552,8 +552,7 @@ EXPORT_SYMBOL(drm_release_noglobal); * Since events are used by the KMS API for vblank and page flip completion this * means all modern display drivers must use it. * - * @offset is ignored, DRM events are read like a pipe. Therefore drivers also - * must set the &file_operation.llseek to no_llseek(). Polling support is + * @offset is ignored, DRM events are read like a pipe. Polling support is * provided by drm_poll(). * * This function will only ever read a full event. Therefore userspace must diff --git a/fs/file_table.c b/fs/file_table.c index 0658b822beeb..5727a63a7b67 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -235,7 +235,7 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); file->f_sb_err = file_sample_sb_err(file); - if (fop->llseek && fop->llseek != no_llseek) + if (fop->llseek) file->f_mode |= FMODE_LSEEK; if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) diff --git a/fs/open.c b/fs/open.c index 4488bd77c390..07c332753a36 100644 --- a/fs/open.c +++ b/fs/open.c @@ -860,8 +860,6 @@ static int do_dentry_open(struct file *f, f->f_mode |= FMODE_CAN_WRITE; if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek) f->f_mode &= ~FMODE_LSEEK; - if ((f->f_mode & FMODE_LSEEK) && f->f_op->llseek == no_llseek) - f->f_mode &= ~FMODE_LSEEK; if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) f->f_mode |= FMODE_CAN_ODIRECT; diff --git a/fs/read_write.c b/fs/read_write.c index d94b6dbba6f9..6b2849b34781 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -227,12 +227,6 @@ loff_t noop_llseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(noop_llseek); -loff_t no_llseek(struct file *file, loff_t offset, int whence) -{ - return -ESPIPE; -} -EXPORT_SYMBOL(no_llseek); - loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..294932167335 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3022,7 +3022,7 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t noop_llseek(struct file *file, loff_t offset, int whence); -extern loff_t no_llseek(struct file *file, loff_t offset, int whence); +#define no_llseek NULL extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index d5d96ceca105..8af0cbf9c0cd 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -81,10 +81,9 @@ static bool bpf_iter_support_resched(struct seq_file *seq) #define MAX_ITER_OBJECTS 1000000 /* bpf_seq_read, a customized and simpler version for bpf iterator. - * no_llseek is assumed for this file. * The following are differences from seq_read(): * . fixed buffer size (PAGE_SIZE) - * . assuming no_llseek + * . assuming NULL ->llseek() * . stop() may call bpf program, handling potential overflow there */ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, -- cgit v1.3 From 6f7db3894ae23eb5d40af4efb404aa0c072a68d2 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:36 +0800 Subject: fsdax: dedup file range to use a compare function With dax we cannot deal with readpage() etc. So, we create a dax comparison function which is similar with vfs_dedupe_file_range_compare(). And introduce dax_remap_file_range_prep() for filesystem use. Link: https://lkml.kernel.org/r/20220603053738.1218681-13-ruansy.fnst@fujitsu.com Signed-off-by: Goldwyn Rodrigues Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- fs/dax.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/remap_range.c | 31 ++++++++++++++++---- fs/xfs/xfs_reflink.c | 8 +++-- include/linux/dax.h | 8 +++++ include/linux/fs.h | 12 +++++--- 5 files changed, 130 insertions(+), 11 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/dax.c b/fs/dax.c index 0aab32300531..e0f9c4a0a0c1 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1873,3 +1873,85 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, return dax_insert_pfn_mkwrite(vmf, pfn, order); } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); + +static loff_t dax_range_compare_iter(struct iomap_iter *it_src, + struct iomap_iter *it_dest, u64 len, bool *same) +{ + const struct iomap *smap = &it_src->iomap; + const struct iomap *dmap = &it_dest->iomap; + loff_t pos1 = it_src->pos, pos2 = it_dest->pos; + void *saddr, *daddr; + int id, ret; + + len = min(len, min(smap->length, dmap->length)); + + if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { + *same = true; + return len; + } + + if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { + *same = false; + return 0; + } + + id = dax_read_lock(); + ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE), + &saddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE), + &daddr, NULL); + if (ret < 0) + goto out_unlock; + + *same = !memcmp(saddr, daddr, len); + if (!*same) + len = 0; + dax_read_unlock(id); + return len; + +out_unlock: + dax_read_unlock(id); + return -EIO; +} + +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dst, loff_t dstoff, loff_t len, bool *same, + const struct iomap_ops *ops) +{ + struct iomap_iter src_iter = { + .inode = src, + .pos = srcoff, + .len = len, + .flags = IOMAP_DAX, + }; + struct iomap_iter dst_iter = { + .inode = dst, + .pos = dstoff, + .len = len, + .flags = IOMAP_DAX, + }; + int ret; + + while ((ret = iomap_iter(&src_iter, ops)) > 0) { + while ((ret = iomap_iter(&dst_iter, ops)) > 0) { + dst_iter.processed = dax_range_compare_iter(&src_iter, + &dst_iter, len, same); + } + if (ret <= 0) + src_iter.processed = ret; + } + return ret; +} + +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *ops) +{ + return __generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, ops); +} +EXPORT_SYMBOL_GPL(dax_remap_file_range_prep); diff --git a/fs/remap_range.c b/fs/remap_range.c index e112b5424cdb..231de627c1b9 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" #include @@ -271,9 +272,11 @@ out_error: * If there's an error, then the usual negative error code is returned. * Otherwise returns 0 with *len set to the request length. */ -int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) +int +__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *dax_read_ops) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -333,8 +336,18 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; - ret = vfs_dedupe_file_range_compare(file_in, pos_in, - file_out, pos_out, *len, &is_same); + if (*len == 0) + return 0; + + if (!IS_DAX(inode_in)) + ret = vfs_dedupe_file_range_compare(file_in, pos_in, + file_out, pos_out, *len, &is_same); + else if (dax_read_ops) + ret = dax_dedupe_file_range_compare(inode_in, pos_in, + inode_out, pos_out, *len, &is_same, + dax_read_ops); + else + return -EINVAL; if (ret) return ret; if (!is_same) @@ -352,6 +365,14 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, return ret; } + +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + return __generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, NULL); +} EXPORT_SYMBOL(generic_remap_file_range_prep); loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e7a7c00d93be..cbaf36d21020 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1367,8 +1367,12 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, - len, remap_flags); + if (!IS_DAX(inode_in)) + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags); + else + ret = dax_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, &xfs_read_iomap_ops); if (ret || *len == 0) goto out_unlock; diff --git a/include/linux/dax.h b/include/linux/dax.h index 7116681b48c0..ba985333e26b 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -246,6 +246,14 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same, + const struct iomap_ops *ops); +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *ops); static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..134e9d7ad5d6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -74,6 +74,7 @@ struct fsverity_operations; struct fs_context; struct fs_parameter_spec; struct fileattr; +struct iomap_ops; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -2070,10 +2071,13 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags); -extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *count, - unsigned int remap_flags); +int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *dax_read_ops); +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *count, unsigned int remap_flags); extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); -- cgit v1.3 From 2b3416ceff5e6bd4922f6d1c61fb68113dd82302 Mon Sep 17 00:00:00 2001 From: Yang Xu Date: Thu, 14 Jul 2022 14:11:25 +0800 Subject: fs: add mode_strip_sgid() helper Add a dedicated helper to handle the setgid bit when creating a new file in a setgid directory. This is a preparatory patch for moving setgid stripping into the vfs. The patch contains no functional changes. Currently the setgid stripping logic is open-coded directly in inode_init_owner() and the individual filesystems are responsible for handling setgid inheritance. Since this has proven to be brittle as evidenced by old issues we uncovered over the last months (see [1] to [3] below) we will try to move this logic into the vfs. Link: e014f37db1a2 ("xfs: use setattr_copy to set vfs inode attributes") [1] Link: 01ea173e103e ("xfs: fix up non-directory creation in SGID directories") [2] Link: fd84bfdddd16 ("ceph: fix up non-directory creation in SGID directories") [3] Link: https://lore.kernel.org/r/1657779088-2242-1-git-send-email-xuyang2018.jy@fujitsu.com Reviewed-by: Darrick J. Wong Reviewed-by: Christian Brauner (Microsoft) Reviewed-and-Tested-by: Jeff Layton Signed-off-by: Yang Xu Signed-off-by: Christian Brauner (Microsoft) --- fs/inode.c | 36 ++++++++++++++++++++++++++++++++---- include/linux/fs.h | 2 ++ 2 files changed, 34 insertions(+), 4 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/inode.c b/fs/inode.c index bd4da9c5207e..71b36afc3893 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2246,10 +2246,8 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, /* Directories are special, and always inherit S_ISGID */ if (S_ISDIR(mode)) mode |= S_ISGID; - else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && - !in_group_p(i_gid_into_mnt(mnt_userns, dir)) && - !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID)) - mode &= ~S_ISGID; + else + mode = mode_strip_sgid(mnt_userns, dir, mode); } else inode_fsgid_set(inode, mnt_userns); inode->i_mode = mode; @@ -2405,3 +2403,33 @@ struct timespec64 current_time(struct inode *inode) return timestamp_truncate(now, inode); } EXPORT_SYMBOL(current_time); + +/** + * mode_strip_sgid - handle the sgid bit for non-directories + * @mnt_userns: User namespace of the mount the inode was created from + * @dir: parent directory inode + * @mode: mode of the file to be created in @dir + * + * If the @mode of the new file has both the S_ISGID and S_IXGRP bit + * raised and @dir has the S_ISGID bit raised ensure that the caller is + * either in the group of the parent directory or they have CAP_FSETID + * in their user namespace and are privileged over the parent directory. + * In all other cases, strip the S_ISGID bit from @mode. + * + * Return: the new mode to use for the file + */ +umode_t mode_strip_sgid(struct user_namespace *mnt_userns, + const struct inode *dir, umode_t mode) +{ + if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) + return mode; + if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) + return mode; + if (in_group_p(i_gid_into_mnt(mnt_userns, dir))) + return mode; + if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID)) + return mode; + + return mode & ~S_ISGID; +} +EXPORT_SYMBOL(mode_strip_sgid); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..50642668c60f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1903,6 +1903,8 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); +umode_t mode_strip_sgid(struct user_namespace *mnt_userns, + const struct inode *dir, umode_t mode); /* * This is the "filldir" function type, used by readdir() to let -- cgit v1.3 From 8017553980d0bbfef3e66c583363828565afd6da Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 23 Jun 2022 10:51:50 -0700 Subject: fs: add a FMODE_BUF_WASYNC flags for f_mode This introduces the flag FMODE_BUF_WASYNC. If devices support async buffered writes, this flag can be set. It also modifies the check in generic_write_checks to take async buffered writes into consideration. Signed-off-by: Stefan Roesch Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner (Microsoft) Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220623175157.1715274-8-shr@fb.com Signed-off-by: Jens Axboe --- fs/read_write.c | 4 +++- include/linux/fs.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/fs/read_write.c b/fs/read_write.c index e0777eefd846..319d88825d1c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1660,7 +1660,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + if ((iocb->ki_flags & IOCB_NOWAIT) && + !((iocb->ki_flags & IOCB_DIRECT) || + (file->f_mode & FMODE_BUF_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..bc84847c201e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -180,6 +180,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports async buffered reads */ #define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000) +/* File supports async nowait buffered writes */ +#define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000) + /* * Attribute flags. These should be or-ed together to figure out what * has been changed! -- cgit v1.3 From 66fa3cedf16abc82d19b943e3289c82e685419d5 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 23 Jun 2022 10:51:53 -0700 Subject: fs: Add async write file modification handling. This adds a file_modified_async() function to return -EAGAIN if the request either requires to remove privileges or needs to update the file modification time. This is required for async buffered writes, so the request gets handled in the io worker of io-uring. Signed-off-by: Stefan Roesch Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Christian Brauner (Microsoft) Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220623175157.1715274-11-shr@fb.com Signed-off-by: Jens Axboe --- fs/inode.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- include/linux/fs.h | 1 + 2 files changed, 43 insertions(+), 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/inode.c b/fs/inode.c index ff726d99ecc7..259ebf438893 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2116,17 +2116,21 @@ int file_update_time(struct file *file) EXPORT_SYMBOL(file_update_time); /** - * file_modified - handle mandated vfs changes when modifying a file + * file_modified_flags - handle mandated vfs changes when modifying a file * @file: file that was modified + * @flags: kiocb flags * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * + * If IOCB_NOWAIT is set, special file privileges will not be removed and + * time settings will not be updated. It will return -EAGAIN. + * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. */ -int file_modified(struct file *file) +static int file_modified_flags(struct file *file, int flags) { int ret; struct inode *inode = file_inode(file); @@ -2136,7 +2140,7 @@ int file_modified(struct file *file) * Clear the security bits if the process is not being run by root. * This keeps people from modifying setuid and setgid binaries. */ - ret = __file_remove_privs(file, 0); + ret = __file_remove_privs(file, flags); if (ret) return ret; @@ -2146,11 +2150,46 @@ int file_modified(struct file *file) ret = inode_needs_update_time(inode, &now); if (ret <= 0) return ret; + if (flags & IOCB_NOWAIT) + return -EAGAIN; return __file_update_time(file, &now, ret); } + +/** + * file_modified - handle mandated vfs changes when modifying a file + * @file: file that was modified + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. + */ +int file_modified(struct file *file) +{ + return file_modified_flags(file, 0); +} EXPORT_SYMBOL(file_modified); +/** + * kiocb_modified - handle mandated vfs changes when modifying a file + * @iocb: iocb that was modified + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. + */ +int kiocb_modified(struct kiocb *iocb) +{ + return file_modified_flags(iocb->ki_filp, iocb->ki_flags); +} +EXPORT_SYMBOL_GPL(kiocb_modified); + int inode_needs_sync(struct inode *inode) { if (IS_SYNC(inode)) diff --git a/include/linux/fs.h b/include/linux/fs.h index bc84847c201e..c0d99b5a166b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2390,6 +2390,7 @@ static inline void file_accessed(struct file *file) } extern int file_modified(struct file *file); +int kiocb_modified(struct kiocb *iocb); int sync_inode_metadata(struct inode *inode, int wait); -- cgit v1.3 From 04b9407197789c81fffac52921e703cb47967d6a Mon Sep 17 00:00:00 2001 From: Daniil Lunev Date: Wed, 27 Jul 2022 16:44:24 +1000 Subject: vfs: function to prevent re-use of block-device-based superblocks The function is to be called from filesystem-specific code to mark a superblock to be ignored by superblock test and thus never re-used. The function also unregisters bdi if the bdi is per-superblock to avoid collision if a new superblock is created to represent the filesystem. generic_shutdown_super() skips unregistering bdi for a retired superlock as it assumes retire function has already done it. This patch adds the functionality only for the block-device-based supers, since the primary use case of the feature is to gracefully handle force unmount of external devices, mounted with FUSE. This can be further extended to cover all superblocks, if the need arises. Signed-off-by: Daniil Lunev Reviewed-by: Christoph Hellwig Signed-off-by: Miklos Szeredi --- fs/super.c | 33 +++++++++++++++++++++++++++++++-- include/linux/fs.h | 2 ++ 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/super.c b/fs/super.c index 60f57c7bc0a6..258d9257b0a7 100644 --- a/fs/super.c +++ b/fs/super.c @@ -422,6 +422,35 @@ bool trylock_super(struct super_block *sb) return false; } +/** + * retire_super - prevents superblock from being reused + * @sb: superblock to retire + * + * The function marks superblock to be ignored in superblock test, which + * prevents it from being reused for any new mounts. If the superblock has + * a private bdi, it also unregisters it, but doesn't reduce the refcount + * of the superblock to prevent potential races. The refcount is reduced + * by generic_shutdown_super(). The function can not be called + * concurrently with generic_shutdown_super(). It is safe to call the + * function multiple times, subsequent calls have no effect. + * + * The marker will affect the re-use only for block-device-based + * superblocks. Other superblocks will still get marked if this function + * is used, but that will not affect their reusability. + */ +void retire_super(struct super_block *sb) +{ + WARN_ON(!sb->s_bdev); + down_write(&sb->s_umount); + if (sb->s_iflags & SB_I_PERSB_BDI) { + bdi_unregister(sb->s_bdi); + sb->s_iflags &= ~SB_I_PERSB_BDI; + } + sb->s_iflags |= SB_I_RETIRED; + up_write(&sb->s_umount); +} +EXPORT_SYMBOL(retire_super); + /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill @@ -1216,7 +1245,7 @@ static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc) static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc) { - return s->s_bdev == fc->sget_key; + return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key; } /** @@ -1307,7 +1336,7 @@ EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { - return (void *)s->s_bdev == data; + return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data; } struct dentry *mount_bdev(struct file_system_type *fs_type, diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..246120ee0573 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1412,6 +1412,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ +#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ /* Possible states of 'frozen' field */ enum { @@ -2432,6 +2433,7 @@ extern struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); +void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); -- cgit v1.3 From 68f2736a858324c3ec852f6c2cddd9d1c777357d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 7 Jun 2022 15:38:48 -0400 Subject: mm: Convert all PageMovable users to movable_operations These drivers are rather uncomfortably hammered into the address_space_operations hole. They aren't filesystems and don't behave like filesystems. They just need their own movable_operations structure, which we can point to directly from page->mapping. Signed-off-by: Matthew Wilcox (Oracle) --- Documentation/filesystems/locking.rst | 4 -- Documentation/filesystems/vfs.rst | 12 ---- Documentation/vm/page_migration.rst | 113 +++------------------------------- arch/powerpc/platforms/pseries/cmm.c | 60 +----------------- drivers/misc/vmw_balloon.c | 61 +----------------- drivers/virtio/virtio_balloon.c | 47 +------------- include/linux/balloon_compaction.h | 6 +- include/linux/fs.h | 2 - include/linux/migrate.h | 56 +++++++++++++++-- include/linux/page-flags.h | 2 +- include/uapi/linux/magic.h | 4 -- mm/balloon_compaction.c | 10 ++- mm/compaction.c | 29 ++++----- mm/migrate.c | 24 ++++---- mm/util.c | 4 +- mm/z3fold.c | 84 +++---------------------- mm/zsmalloc.c | 102 +++++++----------------------- 17 files changed, 134 insertions(+), 486 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index c0fe711f14d3..9963d9600b71 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -252,9 +252,7 @@ prototypes:: bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *); int (*direct_IO)(struct kiocb *, struct iov_iter *iter); - bool (*isolate_page) (struct page *, isolate_mode_t); int (*migratepage)(struct address_space *, struct page *, struct page *); - void (*putback_page) (struct page *); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); int (*error_remove_page)(struct address_space *, struct page *); @@ -280,9 +278,7 @@ invalidate_folio: yes exclusive release_folio: yes free_folio: yes direct_IO: -isolate_page: yes migratepage: yes (both) -putback_page: yes launder_folio: yes is_partially_uptodate: yes error_remove_page: yes diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index a08c652467d7..b51665cdabc4 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -737,12 +737,8 @@ cache in your filesystem. The following members are defined: bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); - /* isolate a page for migration */ - bool (*isolate_page) (struct page *, isolate_mode_t); /* migrate the contents of a page to the specified target */ int (*migratepage) (struct page *, struct page *); - /* put migration-failed page back to right list */ - void (*putback_page) (struct page *); int (*launder_folio) (struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, @@ -930,11 +926,6 @@ cache in your filesystem. The following members are defined: data directly between the storage and the application's address space. -``isolate_page`` - Called by the VM when isolating a movable non-lru page. If page - is successfully isolated, VM marks the page as PG_isolated via - __SetPageIsolated. - ``migrate_page`` This is used to compact the physical memory usage. If the VM wants to relocate a page (maybe off a memory card that is @@ -942,9 +933,6 @@ cache in your filesystem. The following members are defined: page to this function. migrate_page should transfer any private data across and update any references that it has to the page. -``putback_page`` - Called by the VM when isolated page's migration fails. - ``launder_folio`` Called before freeing a folio - it writes back the dirty folio. To prevent redirtying the folio, it is kept locked during the diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst index 8c5cb8147e55..11493bad7112 100644 --- a/Documentation/vm/page_migration.rst +++ b/Documentation/vm/page_migration.rst @@ -152,110 +152,15 @@ Steps: Non-LRU page migration ====================== -Although migration originally aimed for reducing the latency of memory accesses -for NUMA, compaction also uses migration to create high-order pages. +Although migration originally aimed for reducing the latency of memory +accesses for NUMA, compaction also uses migration to create high-order +pages. For compaction purposes, it is also useful to be able to move +non-LRU pages, such as zsmalloc and virtio-balloon pages. -Current problem of the implementation is that it is designed to migrate only -*LRU* pages. However, there are potential non-LRU pages which can be migrated -in drivers, for example, zsmalloc, virtio-balloon pages. - -For virtio-balloon pages, some parts of migration code path have been hooked -up and added virtio-balloon specific functions to intercept migration logics. -It's too specific to a driver so other drivers who want to make their pages -movable would have to add their own specific hooks in the migration path. - -To overcome the problem, VM supports non-LRU page migration which provides -generic functions for non-LRU movable pages without driver specific hooks -in the migration path. - -If a driver wants to make its pages movable, it should define three functions -which are function pointers of struct address_space_operations. - -1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);`` - - What VM expects from isolate_page() function of driver is to return *true* - if driver isolates the page successfully. On returning true, VM marks the page - as PG_isolated so concurrent isolation in several CPUs skip the page - for isolation. If a driver cannot isolate the page, it should return *false*. - - Once page is successfully isolated, VM uses page.lru fields so driver - shouldn't expect to preserve values in those fields. - -2. ``int (*migratepage) (struct address_space *mapping,`` -| ``struct page *newpage, struct page *oldpage, enum migrate_mode);`` - - After isolation, VM calls migratepage() of driver with the isolated page. - The function of migratepage() is to move the contents of the old page to the - new page - and set up fields of struct page newpage. Keep in mind that you should - indicate to the VM the oldpage is no longer movable via __ClearPageMovable() - under page_lock if you migrated the oldpage successfully and returned - MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver - can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time - because VM interprets -EAGAIN as "temporary migration failure". On returning - any error except -EAGAIN, VM will give up the page migration without - retrying. - - Driver shouldn't touch the page.lru field while in the migratepage() function. - -3. ``void (*putback_page)(struct page *);`` - - If migration fails on the isolated page, VM should return the isolated page - to the driver so VM calls the driver's putback_page() with the isolated page. - In this function, the driver should put the isolated page back into its own data - structure. - -Non-LRU movable page flags - - There are two page flags for supporting non-LRU movable page. - - * PG_movable - - Driver should use the function below to make page movable under page_lock:: - - void __SetPageMovable(struct page *page, struct address_space *mapping) - - It needs argument of address_space for registering migration - family functions which will be called by VM. Exactly speaking, - PG_movable is not a real flag of struct page. Rather, VM - reuses the page->mapping's lower bits to represent it:: - - #define PAGE_MAPPING_MOVABLE 0x2 - page->mapping = page->mapping | PAGE_MAPPING_MOVABLE; - - so driver shouldn't access page->mapping directly. Instead, driver should - use page_mapping() which masks off the low two bits of page->mapping under - page lock so it can get the right struct address_space. - - For testing of non-LRU movable pages, VM supports __PageMovable() function. - However, it doesn't guarantee to identify non-LRU movable pages because - the page->mapping field is unified with other variables in struct page. - If the driver releases the page after isolation by VM, page->mapping - doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set - (look at __ClearPageMovable). But __PageMovable() is cheap to call whether - page is LRU or non-LRU movable once the page has been isolated because LRU - pages can never have PAGE_MAPPING_MOVABLE set in page->mapping. It is also - good for just peeking to test non-LRU movable pages before more expensive - checking with lock_page() in pfn scanning to select a victim. - - For guaranteeing non-LRU movable page, VM provides PageMovable() function. - Unlike __PageMovable(), PageMovable() validates page->mapping and - mapping->a_ops->isolate_page under lock_page(). The lock_page() prevents - sudden destroying of page->mapping. - - Drivers using __SetPageMovable() should clear the flag via - __ClearMovablePage() under page_lock() before the releasing the page. - - * PG_isolated - - To prevent concurrent isolation among several CPUs, VM marks isolated page - as PG_isolated under lock_page(). So if a CPU encounters PG_isolated - non-LRU movable page, it can skip it. Driver doesn't need to manipulate the - flag because VM will set/clear it automatically. Keep in mind that if the - driver sees a PG_isolated page, it means the page has been isolated by the - VM so it shouldn't touch the page.lru field. - The PG_isolated flag is aliased with the PG_reclaim flag so drivers - shouldn't use PG_isolated for its own purposes. +If a driver wants to make its pages movable, it should define a struct +movable_operations. It then needs to call __SetPageMovable() on each +page that it may be able to move. This uses the ``page->mapping`` field, +so this field is not available for the driver to use for other purposes. Monitoring Migration ===================== @@ -286,3 +191,5 @@ THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase. Christoph Lameter, May 8, 2006. Minchan Kim, Mar 28, 2016. + +.. kernel-doc:: include/linux/migrate.h diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 15ed8206c463..5f4037c1d7fe 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -19,9 +19,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -500,19 +497,6 @@ static struct notifier_block cmm_mem_nb = { }; #ifdef CONFIG_BALLOON_COMPACTION -static struct vfsmount *balloon_mnt; - -static int cmm_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, PPC_CMM_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type balloon_fs = { - .name = "ppc-cmm", - .init_fs_context = cmm_init_fs_context, - .kill_sb = kill_anon_super, -}; - static int cmm_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode) @@ -564,47 +548,13 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, return MIGRATEPAGE_SUCCESS; } -static int cmm_balloon_compaction_init(void) +static void cmm_balloon_compaction_init(void) { - int rc; - balloon_devinfo_init(&b_dev_info); b_dev_info.migratepage = cmm_migratepage; - - balloon_mnt = kern_mount(&balloon_fs); - if (IS_ERR(balloon_mnt)) { - rc = PTR_ERR(balloon_mnt); - balloon_mnt = NULL; - return rc; - } - - b_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb); - if (IS_ERR(b_dev_info.inode)) { - rc = PTR_ERR(b_dev_info.inode); - b_dev_info.inode = NULL; - kern_unmount(balloon_mnt); - balloon_mnt = NULL; - return rc; - } - - b_dev_info.inode->i_mapping->a_ops = &balloon_aops; - return 0; -} -static void cmm_balloon_compaction_deinit(void) -{ - if (b_dev_info.inode) - iput(b_dev_info.inode); - b_dev_info.inode = NULL; - kern_unmount(balloon_mnt); - balloon_mnt = NULL; } #else /* CONFIG_BALLOON_COMPACTION */ -static int cmm_balloon_compaction_init(void) -{ - return 0; -} - -static void cmm_balloon_compaction_deinit(void) +static void cmm_balloon_compaction_init(void) { } #endif /* CONFIG_BALLOON_COMPACTION */ @@ -622,9 +572,7 @@ static int cmm_init(void) if (!firmware_has_feature(FW_FEATURE_CMO) && !simulate) return -EOPNOTSUPP; - rc = cmm_balloon_compaction_init(); - if (rc) - return rc; + cmm_balloon_compaction_init(); rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) @@ -658,7 +606,6 @@ out_reboot_notifier: out_oom_notifier: unregister_oom_notifier(&cmm_oom_nb); out_balloon_compaction: - cmm_balloon_compaction_deinit(); return rc; } @@ -677,7 +624,6 @@ static void cmm_exit(void) unregister_memory_notifier(&cmm_mem_nb); cmm_free_pages(atomic_long_read(&loaned_pages)); cmm_unregister_sysfs(&cmm_dev); - cmm_balloon_compaction_deinit(); } /** diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 086ce77d9074..85dd6aa33df6 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -29,8 +29,6 @@ #include #include #include -#include -#include #include #include #include @@ -1730,20 +1728,6 @@ static inline void vmballoon_debugfs_exit(struct vmballoon *b) #ifdef CONFIG_BALLOON_COMPACTION - -static int vmballoon_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, BALLOON_VMW_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type vmballoon_fs = { - .name = "balloon-vmware", - .init_fs_context = vmballoon_init_fs_context, - .kill_sb = kill_anon_super, -}; - -static struct vfsmount *vmballoon_mnt; - /** * vmballoon_migratepage() - migrates a balloon page. * @b_dev_info: balloon device information descriptor. @@ -1862,21 +1846,6 @@ out_unlock: return ret; } -/** - * vmballoon_compaction_deinit() - removes compaction related data. - * - * @b: pointer to the balloon. - */ -static void vmballoon_compaction_deinit(struct vmballoon *b) -{ - if (!IS_ERR(b->b_dev_info.inode)) - iput(b->b_dev_info.inode); - - b->b_dev_info.inode = NULL; - kern_unmount(vmballoon_mnt); - vmballoon_mnt = NULL; -} - /** * vmballoon_compaction_init() - initialized compaction for the balloon. * @@ -1888,33 +1857,15 @@ static void vmballoon_compaction_deinit(struct vmballoon *b) * * Return: zero on success or error code on failure. */ -static __init int vmballoon_compaction_init(struct vmballoon *b) +static __init void vmballoon_compaction_init(struct vmballoon *b) { - vmballoon_mnt = kern_mount(&vmballoon_fs); - if (IS_ERR(vmballoon_mnt)) - return PTR_ERR(vmballoon_mnt); - b->b_dev_info.migratepage = vmballoon_migratepage; - b->b_dev_info.inode = alloc_anon_inode(vmballoon_mnt->mnt_sb); - - if (IS_ERR(b->b_dev_info.inode)) - return PTR_ERR(b->b_dev_info.inode); - - b->b_dev_info.inode->i_mapping->a_ops = &balloon_aops; - return 0; } #else /* CONFIG_BALLOON_COMPACTION */ - -static void vmballoon_compaction_deinit(struct vmballoon *b) -{ -} - -static int vmballoon_compaction_init(struct vmballoon *b) +static inline void vmballoon_compaction_init(struct vmballoon *b) { - return 0; } - #endif /* CONFIG_BALLOON_COMPACTION */ static int __init vmballoon_init(void) @@ -1939,9 +1890,7 @@ static int __init vmballoon_init(void) * balloon_devinfo_init() . */ balloon_devinfo_init(&balloon.b_dev_info); - error = vmballoon_compaction_init(&balloon); - if (error) - goto fail; + vmballoon_compaction_init(&balloon); INIT_LIST_HEAD(&balloon.huge_pages); spin_lock_init(&balloon.comm_lock); @@ -1958,7 +1907,6 @@ static int __init vmballoon_init(void) return 0; fail: vmballoon_unregister_shrinker(&balloon); - vmballoon_compaction_deinit(&balloon); return error; } @@ -1985,8 +1933,5 @@ static void __exit vmballoon_exit(void) */ vmballoon_send_start(&balloon, 0); vmballoon_pop(&balloon); - - /* Only once we popped the balloon, compaction can be deinit */ - vmballoon_compaction_deinit(&balloon); } module_exit(vmballoon_exit); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index b9737da6c4dd..bd360b91e9d3 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -17,9 +17,6 @@ #include #include #include -#include -#include -#include #include /* @@ -42,10 +39,6 @@ (1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT)) #define VIRTIO_BALLOON_HINT_BLOCK_PAGES (1 << VIRTIO_BALLOON_HINT_BLOCK_ORDER) -#ifdef CONFIG_BALLOON_COMPACTION -static struct vfsmount *balloon_mnt; -#endif - enum virtio_balloon_vq { VIRTIO_BALLOON_VQ_INFLATE, VIRTIO_BALLOON_VQ_DEFLATE, @@ -805,18 +798,6 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, return MIGRATEPAGE_SUCCESS; } - -static int balloon_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, BALLOON_KVM_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type balloon_fs = { - .name = "balloon-kvm", - .init_fs_context = balloon_init_fs_context, - .kill_sb = kill_anon_super, -}; - #endif /* CONFIG_BALLOON_COMPACTION */ static unsigned long shrink_free_pages(struct virtio_balloon *vb, @@ -909,19 +890,7 @@ static int virtballoon_probe(struct virtio_device *vdev) goto out_free_vb; #ifdef CONFIG_BALLOON_COMPACTION - balloon_mnt = kern_mount(&balloon_fs); - if (IS_ERR(balloon_mnt)) { - err = PTR_ERR(balloon_mnt); - goto out_del_vqs; - } - vb->vb_dev_info.migratepage = virtballoon_migratepage; - vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb); - if (IS_ERR(vb->vb_dev_info.inode)) { - err = PTR_ERR(vb->vb_dev_info.inode); - goto out_kern_unmount; - } - vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops; #endif if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { /* @@ -930,13 +899,13 @@ static int virtballoon_probe(struct virtio_device *vdev) */ if (virtqueue_get_vring_size(vb->free_page_vq) < 2) { err = -ENOSPC; - goto out_iput; + goto out_del_vqs; } vb->balloon_wq = alloc_workqueue("balloon-wq", WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0); if (!vb->balloon_wq) { err = -ENOMEM; - goto out_iput; + goto out_del_vqs; } INIT_WORK(&vb->report_free_page_work, report_free_page_func); vb->cmd_id_received_cache = VIRTIO_BALLOON_CMD_ID_STOP; @@ -1030,13 +999,7 @@ out_unregister_shrinker: out_del_balloon_wq: if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) destroy_workqueue(vb->balloon_wq); -out_iput: -#ifdef CONFIG_BALLOON_COMPACTION - iput(vb->vb_dev_info.inode); -out_kern_unmount: - kern_unmount(balloon_mnt); out_del_vqs: -#endif vdev->config->del_vqs(vdev); out_free_vb: kfree(vb); @@ -1083,12 +1046,6 @@ static void virtballoon_remove(struct virtio_device *vdev) } remove_common(vb); -#ifdef CONFIG_BALLOON_COMPACTION - if (vb->vb_dev_info.inode) - iput(vb->vb_dev_info.inode); - - kern_unmount(balloon_mnt); -#endif kfree(vb); } diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index edb7f6d41faa..5ca2d5699620 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -57,7 +57,6 @@ struct balloon_dev_info { struct list_head pages; /* Pages enqueued & handled to Host */ int (*migratepage)(struct balloon_dev_info *, struct page *newpage, struct page *page, enum migrate_mode mode); - struct inode *inode; }; extern struct page *balloon_page_alloc(void); @@ -75,11 +74,10 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) spin_lock_init(&balloon->pages_lock); INIT_LIST_HEAD(&balloon->pages); balloon->migratepage = NULL; - balloon->inode = NULL; } #ifdef CONFIG_BALLOON_COMPACTION -extern const struct address_space_operations balloon_aops; +extern const struct movable_operations balloon_mops; /* * balloon_page_insert - insert a page into the balloon's page list and make @@ -94,7 +92,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { __SetPageOffline(page); - __SetPageMovable(page, balloon->inode->i_mapping); + __SetPageMovable(page, &balloon_mops); set_page_private(page, (unsigned long)balloon); list_add(&page->lru, &balloon->pages); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..5d8ee3155ca2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -367,8 +367,6 @@ struct address_space_operations { */ int (*migratepage) (struct address_space *, struct page *, struct page *, enum migrate_mode); - bool (*isolate_page)(struct page *, isolate_mode_t); - void (*putback_page)(struct page *); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 069a89e847f3..82c735ba6109 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -19,6 +19,43 @@ struct migration_target_control; */ #define MIGRATEPAGE_SUCCESS 0 +/** + * struct movable_operations - Driver page migration + * @isolate_page: + * The VM calls this function to prepare the page to be moved. The page + * is locked and the driver should not unlock it. The driver should + * return ``true`` if the page is movable and ``false`` if it is not + * currently movable. After this function returns, the VM uses the + * page->lru field, so the driver must preserve any information which + * is usually stored here. + * + * @migrate_page: + * After isolation, the VM calls this function with the isolated + * @src page. The driver should copy the contents of the + * @src page to the @dst page and set up the fields of @dst page. + * Both pages are locked. + * If page migration is successful, the driver should call + * __ClearPageMovable(@src) and return MIGRATEPAGE_SUCCESS. + * If the driver cannot migrate the page at the moment, it can return + * -EAGAIN. The VM interprets this as a temporary migration failure and + * will retry it later. Any other error value is a permanent migration + * failure and migration will not be retried. + * The driver shouldn't touch the @src->lru field while in the + * migrate_page() function. It may write to @dst->lru. + * + * @putback_page: + * If migration fails on the isolated page, the VM informs the driver + * that the page is no longer a candidate for migration by calling + * this function. The driver should put the isolated page back into + * its own data structure. + */ +struct movable_operations { + bool (*isolate_page)(struct page *, isolate_mode_t); + int (*migrate_page)(struct page *dst, struct page *src, + enum migrate_mode); + void (*putback_page)(struct page *); +}; + /* Defined in mm/debug.c: */ extern const char *migrate_reason_names[MR_TYPES]; @@ -91,13 +128,13 @@ static inline int next_demotion_node(int node) #endif #ifdef CONFIG_COMPACTION -extern int PageMovable(struct page *page); -extern void __SetPageMovable(struct page *page, struct address_space *mapping); -extern void __ClearPageMovable(struct page *page); +bool PageMovable(struct page *page); +void __SetPageMovable(struct page *page, const struct movable_operations *ops); +void __ClearPageMovable(struct page *page); #else -static inline int PageMovable(struct page *page) { return 0; } +static inline bool PageMovable(struct page *page) { return false; } static inline void __SetPageMovable(struct page *page, - struct address_space *mapping) + const struct movable_operations *ops) { } static inline void __ClearPageMovable(struct page *page) @@ -110,6 +147,15 @@ static inline bool folio_test_movable(struct folio *folio) return PageMovable(&folio->page); } +static inline +const struct movable_operations *page_movable_ops(struct page *page) +{ + VM_BUG_ON(!__PageMovable(page)); + + return (const struct movable_operations *) + ((unsigned long)page->mapping - PAGE_MAPPING_MOVABLE); +} + #ifdef CONFIG_NUMA_BALANCING extern int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e66f7aa3191d..3f5490f6f038 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -639,7 +639,7 @@ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) * structure which KSM associates with that merged page. See ksm.h. * * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable - * page and then page->mapping points a struct address_space. + * page and then page->mapping points to a struct movable_operations. * * Please note that, confusingly, "page_mapping" refers to the inode * address_space which maps the page from disk; whereas "page_mapped" diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index f724129c0425..6325d1d0e90f 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -98,12 +98,8 @@ /* Since UDF 2.01 is ISO 13346 based... */ #define UDF_SUPER_MAGIC 0x15013346 -#define BALLOON_KVM_MAGIC 0x13661366 -#define ZSMALLOC_MAGIC 0x58295829 #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ -#define Z3FOLD_MAGIC 0x33 -#define PPC_CMM_MAGIC 0xc7571590 #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 4b8eab4b3f45..22c96fed70b5 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -228,10 +228,8 @@ static void balloon_page_putback(struct page *page) spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); } - /* move_to_new_page() counterpart for a ballooned page */ -static int balloon_page_migrate(struct address_space *mapping, - struct page *newpage, struct page *page, +static int balloon_page_migrate(struct page *newpage, struct page *page, enum migrate_mode mode) { struct balloon_dev_info *balloon = balloon_page_device(page); @@ -250,11 +248,11 @@ static int balloon_page_migrate(struct address_space *mapping, return balloon->migratepage(balloon, newpage, page, mode); } -const struct address_space_operations balloon_aops = { - .migratepage = balloon_page_migrate, +const struct movable_operations balloon_mops = { + .migrate_page = balloon_page_migrate, .isolate_page = balloon_page_isolate, .putback_page = balloon_page_putback, }; -EXPORT_SYMBOL_GPL(balloon_aops); +EXPORT_SYMBOL_GPL(balloon_mops); #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/compaction.c b/mm/compaction.c index 1f89b969c12b..f23efba1d118 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -110,28 +110,27 @@ static void split_map_pages(struct list_head *list) } #ifdef CONFIG_COMPACTION - -int PageMovable(struct page *page) +bool PageMovable(struct page *page) { - struct address_space *mapping; + const struct movable_operations *mops; VM_BUG_ON_PAGE(!PageLocked(page), page); if (!__PageMovable(page)) - return 0; + return false; - mapping = page_mapping(page); - if (mapping && mapping->a_ops && mapping->a_ops->isolate_page) - return 1; + mops = page_movable_ops(page); + if (mops) + return true; - return 0; + return false; } EXPORT_SYMBOL(PageMovable); -void __SetPageMovable(struct page *page, struct address_space *mapping) +void __SetPageMovable(struct page *page, const struct movable_operations *mops) { VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page); - page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE); + VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page); + page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE); } EXPORT_SYMBOL(__SetPageMovable); @@ -139,12 +138,10 @@ void __ClearPageMovable(struct page *page) { VM_BUG_ON_PAGE(!PageMovable(page), page); /* - * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE - * flag so that VM can catch up released page by driver after isolation. - * With it, VM migration doesn't try to put it back. + * This page still has the type of a movable page, but it's + * actually not movable any more. */ - page->mapping = (void *)((unsigned long)page->mapping & - PAGE_MAPPING_MOVABLE); + page->mapping = (void *)PAGE_MAPPING_MOVABLE; } EXPORT_SYMBOL(__ClearPageMovable); diff --git a/mm/migrate.c b/mm/migrate.c index 6c1ea61f39d8..491f03747832 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -59,7 +59,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) { - struct address_space *mapping; + const struct movable_operations *mops; /* * Avoid burning cycles with pages that are yet under __free_pages(), @@ -97,10 +97,10 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) if (!PageMovable(page) || PageIsolated(page)) goto out_no_isolated; - mapping = page_mapping(page); - VM_BUG_ON_PAGE(!mapping, page); + mops = page_movable_ops(page); + VM_BUG_ON_PAGE(!mops, page); - if (!mapping->a_ops->isolate_page(page, mode)) + if (!mops->isolate_page(page, mode)) goto out_no_isolated; /* Driver shouldn't use PG_isolated bit of page->flags */ @@ -120,10 +120,9 @@ out: static void putback_movable_page(struct page *page) { - struct address_space *mapping; + const struct movable_operations *mops = page_movable_ops(page); - mapping = page_mapping(page); - mapping->a_ops->putback_page(page); + mops->putback_page(page); ClearPageIsolated(page); } @@ -846,16 +845,15 @@ static int fallback_migrate_page(struct address_space *mapping, static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) { - struct address_space *mapping; int rc = -EAGAIN; bool is_lru = !__PageMovable(&src->page); VM_BUG_ON_FOLIO(!folio_test_locked(src), src); VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); - mapping = folio_mapping(src); - if (likely(is_lru)) { + struct address_space *mapping = folio_mapping(src); + if (!mapping) rc = migrate_page(mapping, &dst->page, &src->page, mode); else if (mapping->a_ops->migratepage) @@ -872,6 +870,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, rc = fallback_migrate_page(mapping, &dst->page, &src->page, mode); } else { + const struct movable_operations *mops; + /* * In case of non-lru page, it could be released after * isolation step. In that case, we shouldn't try migration. @@ -883,8 +883,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, goto out; } - rc = mapping->a_ops->migratepage(mapping, &dst->page, - &src->page, mode); + mops = page_movable_ops(&src->page); + rc = mops->migrate_page(&dst->page, &src->page, mode); WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && !folio_test_isolated(src)); } diff --git a/mm/util.c b/mm/util.c index 0837570c9225..53af0e79d3e4 100644 --- a/mm/util.c +++ b/mm/util.c @@ -804,10 +804,10 @@ struct address_space *folio_mapping(struct folio *folio) return swap_address_space(folio_swap_entry(folio)); mapping = folio->mapping; - if ((unsigned long)mapping & PAGE_MAPPING_ANON) + if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; - return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS); + return mapping; } EXPORT_SYMBOL(folio_mapping); diff --git a/mm/z3fold.c b/mm/z3fold.c index f41f8b0d9e9a..cf71da10d04e 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -34,15 +34,11 @@ #include #include #include -#include -#include -#include #include #include #include #include #include -#include #include /* @@ -149,7 +145,6 @@ struct z3fold_header { * @compact_wq: workqueue for page layout background optimization * @release_wq: workqueue for safe page release * @work: work_struct for safe page release - * @inode: inode for z3fold pseudo filesystem * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. @@ -169,7 +164,6 @@ struct z3fold_pool { struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; struct work_struct work; - struct inode *inode; }; /* @@ -334,54 +328,6 @@ static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr) } } -static int z3fold_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type z3fold_fs = { - .name = "z3fold", - .init_fs_context = z3fold_init_fs_context, - .kill_sb = kill_anon_super, -}; - -static struct vfsmount *z3fold_mnt; -static int __init z3fold_mount(void) -{ - int ret = 0; - - z3fold_mnt = kern_mount(&z3fold_fs); - if (IS_ERR(z3fold_mnt)) - ret = PTR_ERR(z3fold_mnt); - - return ret; -} - -static void z3fold_unmount(void) -{ - kern_unmount(z3fold_mnt); -} - -static const struct address_space_operations z3fold_aops; -static int z3fold_register_migration(struct z3fold_pool *pool) -{ - pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb); - if (IS_ERR(pool->inode)) { - pool->inode = NULL; - return 1; - } - - pool->inode->i_mapping->private_data = pool; - pool->inode->i_mapping->a_ops = &z3fold_aops; - return 0; -} - -static void z3fold_unregister_migration(struct z3fold_pool *pool) -{ - if (pool->inode) - iput(pool->inode); -} - /* Initializes the z3fold header of a newly allocated z3fold page */ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, struct z3fold_pool *pool, gfp_t gfp) @@ -1002,14 +948,10 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool->release_wq = create_singlethread_workqueue(pool->name); if (!pool->release_wq) goto out_wq; - if (z3fold_register_migration(pool)) - goto out_rwq; INIT_WORK(&pool->work, free_pages_work); pool->ops = ops; return pool; -out_rwq: - destroy_workqueue(pool->release_wq); out_wq: destroy_workqueue(pool->compact_wq); out_unbuddied: @@ -1043,11 +985,12 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool) destroy_workqueue(pool->compact_wq); destroy_workqueue(pool->release_wq); - z3fold_unregister_migration(pool); free_percpu(pool->unbuddied); kfree(pool); } +static const struct movable_operations z3fold_mops; + /** * z3fold_alloc() - allocates a region of a given size * @pool: z3fold pool from which to allocate @@ -1117,11 +1060,11 @@ retry: } if (can_sleep) { lock_page(page); - __SetPageMovable(page, pool->inode->i_mapping); + __SetPageMovable(page, &z3fold_mops); unlock_page(page); } else { WARN_ON(!trylock_page(page)); - __SetPageMovable(page, pool->inode->i_mapping); + __SetPageMovable(page, &z3fold_mops); unlock_page(page); } z3fold_page_lock(zhdr); @@ -1554,12 +1497,11 @@ out: return false; } -static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) +static int z3fold_page_migrate(struct page *newpage, struct page *page, + enum migrate_mode mode) { struct z3fold_header *zhdr, *new_zhdr; struct z3fold_pool *pool; - struct address_space *new_mapping; VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); @@ -1592,7 +1534,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa * so we only have to reinitialize it. */ INIT_LIST_HEAD(&new_zhdr->buddy); - new_mapping = page_mapping(page); __ClearPageMovable(page); get_page(newpage); @@ -1608,7 +1549,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa spin_lock(&pool->lock); list_add(&newpage->lru, &pool->lru); spin_unlock(&pool->lock); - __SetPageMovable(newpage, new_mapping); + __SetPageMovable(newpage, &z3fold_mops); z3fold_page_unlock(new_zhdr); queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); @@ -1642,9 +1583,9 @@ static void z3fold_page_putback(struct page *page) z3fold_page_unlock(zhdr); } -static const struct address_space_operations z3fold_aops = { +static const struct movable_operations z3fold_mops = { .isolate_page = z3fold_page_isolate, - .migratepage = z3fold_page_migrate, + .migrate_page = z3fold_page_migrate, .putback_page = z3fold_page_putback, }; @@ -1746,17 +1687,11 @@ MODULE_ALIAS("zpool-z3fold"); static int __init init_z3fold(void) { - int ret; - /* * Make sure the z3fold header is not larger than the page size and * there has remaining spaces for its buddy. */ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE); - ret = z3fold_mount(); - if (ret) - return ret; - zpool_register_driver(&z3fold_zpool_driver); return 0; @@ -1764,7 +1699,6 @@ static int __init init_z3fold(void) static void __exit exit_z3fold(void) { - z3fold_unmount(); zpool_unregister_driver(&z3fold_zpool_driver); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5d5fc04385b8..71d6edcbea48 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -59,8 +58,6 @@ #include #include #include -#include -#include #include #include #include @@ -177,10 +174,6 @@ struct zs_size_stat { static struct dentry *zs_stat_root; #endif -#ifdef CONFIG_COMPACTION -static struct vfsmount *zsmalloc_mnt; -#endif - /* * We assign a page to ZS_ALMOST_EMPTY fullness group when: * n <= N / f, where @@ -252,7 +245,6 @@ struct zs_pool { struct dentry *stat_dentry; #endif #ifdef CONFIG_COMPACTION - struct inode *inode; struct work_struct free_work; #endif /* protect page/zspage migration */ @@ -271,6 +263,7 @@ struct zspage { unsigned int freeobj; struct page *first_page; struct list_head list; /* fullness list */ + struct zs_pool *pool; #ifdef CONFIG_COMPACTION rwlock_t lock; #endif @@ -295,8 +288,6 @@ static bool ZsHugePage(struct zspage *zspage) } #ifdef CONFIG_COMPACTION -static int zs_register_migration(struct zs_pool *pool); -static void zs_unregister_migration(struct zs_pool *pool); static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); @@ -307,10 +298,6 @@ static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); #else -static int zsmalloc_mount(void) { return 0; } -static void zsmalloc_unmount(void) {} -static int zs_register_migration(struct zs_pool *pool) { return 0; } -static void zs_unregister_migration(struct zs_pool *pool) {} static void migrate_lock_init(struct zspage *zspage) {} static void migrate_read_lock(struct zspage *zspage) {} static void migrate_read_unlock(struct zspage *zspage) {} @@ -1083,6 +1070,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, create_page_chain(class, zspage, pages); init_zspage(class, zspage); + zspage->pool = pool; return zspage; } @@ -1754,33 +1742,6 @@ static void lock_zspage(struct zspage *zspage) migrate_read_unlock(zspage); } -static int zs_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type zsmalloc_fs = { - .name = "zsmalloc", - .init_fs_context = zs_init_fs_context, - .kill_sb = kill_anon_super, -}; - -static int zsmalloc_mount(void) -{ - int ret = 0; - - zsmalloc_mnt = kern_mount(&zsmalloc_fs); - if (IS_ERR(zsmalloc_mnt)) - ret = PTR_ERR(zsmalloc_mnt); - - return ret; -} - -static void zsmalloc_unmount(void) -{ - kern_unmount(zsmalloc_mnt); -} - static void migrate_lock_init(struct zspage *zspage) { rwlock_init(&zspage->lock); @@ -1823,6 +1784,8 @@ static void dec_zspage_isolation(struct zspage *zspage) zspage->isolated--; } +static const struct movable_operations zsmalloc_mops; + static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @@ -1843,7 +1806,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); if (unlikely(ZsHugePage(zspage))) newpage->index = oldpage->index; - __SetPageMovable(newpage, page_mapping(oldpage)); + __SetPageMovable(newpage, &zsmalloc_mops); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) @@ -1865,8 +1828,8 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) return true; } -static int zs_page_migrate(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) +static int zs_page_migrate(struct page *newpage, struct page *page, + enum migrate_mode mode) { struct zs_pool *pool; struct size_class *class; @@ -1889,14 +1852,15 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); - pool = mapping->private_data; + /* The page is locked, so this pointer must remain valid */ + zspage = get_zspage(page); + pool = zspage->pool; /* * The pool migrate_lock protects the race between zpage migration * and zs_free. */ write_lock(&pool->migrate_lock); - zspage = get_zspage(page); class = zspage_class(pool, zspage); /* @@ -1964,31 +1928,12 @@ static void zs_page_putback(struct page *page) migrate_write_unlock(zspage); } -static const struct address_space_operations zsmalloc_aops = { +static const struct movable_operations zsmalloc_mops = { .isolate_page = zs_page_isolate, - .migratepage = zs_page_migrate, + .migrate_page = zs_page_migrate, .putback_page = zs_page_putback, }; -static int zs_register_migration(struct zs_pool *pool) -{ - pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb); - if (IS_ERR(pool->inode)) { - pool->inode = NULL; - return 1; - } - - pool->inode->i_mapping->private_data = pool; - pool->inode->i_mapping->a_ops = &zsmalloc_aops; - return 0; -} - -static void zs_unregister_migration(struct zs_pool *pool) -{ - flush_work(&pool->free_work); - iput(pool->inode); -} - /* * Caller should hold page_lock of all pages in the zspage * In here, we cannot use zspage meta data. @@ -2032,6 +1977,11 @@ static void kick_deferred_free(struct zs_pool *pool) schedule_work(&pool->free_work); } +static void zs_flush_migration(struct zs_pool *pool) +{ + flush_work(&pool->free_work); +} + static void init_deferred_free(struct zs_pool *pool) { INIT_WORK(&pool->free_work, async_free_zspage); @@ -2043,10 +1993,12 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) do { WARN_ON(!trylock_page(page)); - __SetPageMovable(page, pool->inode->i_mapping); + __SetPageMovable(page, &zsmalloc_mops); unlock_page(page); } while ((page = get_next_page(page)) != NULL); } +#else +static inline void zs_flush_migration(struct zs_pool *pool) { } #endif /* @@ -2324,9 +2276,6 @@ struct zs_pool *zs_create_pool(const char *name) /* debug only, don't abort if it fails */ zs_pool_stat_create(pool, name); - if (zs_register_migration(pool)) - goto err; - /* * Not critical since shrinker is only used to trigger internal * defragmentation of the pool which is pretty optional thing. If @@ -2348,7 +2297,7 @@ void zs_destroy_pool(struct zs_pool *pool) int i; zs_unregister_shrinker(pool); - zs_unregister_migration(pool); + zs_flush_migration(pool); zs_pool_stat_destroy(pool); for (i = 0; i < ZS_SIZE_CLASSES; i++) { @@ -2380,14 +2329,10 @@ static int __init zs_init(void) { int ret; - ret = zsmalloc_mount(); - if (ret) - goto out; - ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare", zs_cpu_prepare, zs_cpu_dead); if (ret) - goto hp_setup_fail; + goto out; #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); @@ -2397,8 +2342,6 @@ static int __init zs_init(void) return 0; -hp_setup_fail: - zsmalloc_unmount(); out: return ret; } @@ -2408,7 +2351,6 @@ static void __exit zs_exit(void) #ifdef CONFIG_ZPOOL zpool_unregister_driver(&zs_zpool_driver); #endif - zsmalloc_unmount(); cpuhp_remove_state(CPUHP_MM_ZS_PREPARE); zs_stat_exit(); -- cgit v1.3 From 5490da4f06d182ba944706875029e98fe7f6b821 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 09:00:16 -0400 Subject: fs: Add aops->migrate_folio Provide a folio-based replacement for aops->migratepage. Update the documentation to document migrate_folio instead of migratepage. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- Documentation/filesystems/locking.rst | 5 +++-- Documentation/filesystems/vfs.rst | 14 +++++++------- include/linux/fs.h | 4 +++- mm/compaction.c | 4 +++- mm/migrate.c | 11 +++++++---- 5 files changed, 23 insertions(+), 15 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 9963d9600b71..4bb2627026ec 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -252,7 +252,8 @@ prototypes:: bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *); int (*direct_IO)(struct kiocb *, struct iov_iter *iter); - int (*migratepage)(struct address_space *, struct page *, struct page *); + int (*migrate_folio)(struct address_space *, struct folio *dst, + struct folio *src, enum migrate_mode); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); int (*error_remove_page)(struct address_space *, struct page *); @@ -278,7 +279,7 @@ invalidate_folio: yes exclusive release_folio: yes free_folio: yes direct_IO: -migratepage: yes (both) +migrate_folio: yes (both) launder_folio: yes is_partially_uptodate: yes error_remove_page: yes diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index b51665cdabc4..6cd6953e175b 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -737,8 +737,8 @@ cache in your filesystem. The following members are defined: bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); - /* migrate the contents of a page to the specified target */ - int (*migratepage) (struct page *, struct page *); + int (*migrate_folio)(struct mapping *, struct folio *dst, + struct folio *src, enum migrate_mode); int (*launder_folio) (struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, @@ -926,12 +926,12 @@ cache in your filesystem. The following members are defined: data directly between the storage and the application's address space. -``migrate_page`` +``migrate_folio`` This is used to compact the physical memory usage. If the VM - wants to relocate a page (maybe off a memory card that is - signalling imminent failure) it will pass a new page and an old - page to this function. migrate_page should transfer any private - data across and update any references that it has to the page. + wants to relocate a folio (maybe from a memory device that is + signalling imminent failure) it will pass a new folio and an old + folio to this function. migrate_folio should transfer any private + data across and update any references that it has to the folio. ``launder_folio`` Called before freeing a folio - it writes back the dirty folio. diff --git a/include/linux/fs.h b/include/linux/fs.h index 5d8ee3155ca2..47431cf8fbb3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -362,9 +362,11 @@ struct address_space_operations { void (*free_folio)(struct folio *folio); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* - * migrate the contents of a page to the specified target. If + * migrate the contents of a folio to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. */ + int (*migrate_folio)(struct address_space *, struct folio *dst, + struct folio *src, enum migrate_mode); int (*migratepage) (struct address_space *, struct page *, struct page *, enum migrate_mode); int (*launder_folio)(struct folio *); diff --git a/mm/compaction.c b/mm/compaction.c index f23efba1d118..458f49f9ab09 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1042,7 +1042,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_fail_put; mapping = page_mapping(page); - migrate_dirty = !mapping || mapping->a_ops->migratepage; + migrate_dirty = !mapping || + mapping->a_ops->migrate_folio || + mapping->a_ops->migratepage; unlock_page(page); if (!migrate_dirty) goto isolate_fail_put; diff --git a/mm/migrate.c b/mm/migrate.c index 491f03747832..3c3c168097dd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -856,14 +856,17 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, if (!mapping) rc = migrate_page(mapping, &dst->page, &src->page, mode); - else if (mapping->a_ops->migratepage) + else if (mapping->a_ops->migrate_folio) /* - * Most pages have a mapping and most filesystems - * provide a migratepage callback. Anonymous pages + * Most folios have a mapping and most filesystems + * provide a migrate_folio callback. Anonymous folios * are part of swap space which also has its own - * migratepage callback. This is the most common path + * migrate_folio callback. This is the most common path * for page migration. */ + rc = mapping->a_ops->migrate_folio(mapping, dst, src, + mode); + else if (mapping->a_ops->migratepage) rc = mapping->a_ops->migratepage(mapping, &dst->page, &src->page, mode); else -- cgit v1.3 From 67235182a41c1bd6b32806a1556a1d299b84212b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 10:20:31 -0400 Subject: mm/migrate: Convert buffer_migrate_page() to buffer_migrate_folio() Use a folio throughout __buffer_migrate_folio(), add kernel-doc for buffer_migrate_folio() and buffer_migrate_folio_norefs(), move their declarations to buffer.h and switch all filesystems that have wired them up. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- block/fops.c | 2 +- fs/ext2/inode.c | 4 +-- fs/ext4/inode.c | 4 +-- fs/ntfs/aops.c | 6 ++-- fs/ocfs2/aops.c | 2 +- include/linux/buffer_head.h | 10 ++++++ include/linux/fs.h | 12 ------- mm/migrate.c | 76 +++++++++++++++++++++++++++------------------ 8 files changed, 65 insertions(+), 51 deletions(-) (limited to 'include/linux/fs.h') diff --git a/block/fops.c b/block/fops.c index d6b3276a6c68..743fc46d0aad 100644 --- a/block/fops.c +++ b/block/fops.c @@ -417,7 +417,7 @@ const struct address_space_operations def_blk_aops = { .write_end = blkdev_write_end, .writepages = blkdev_writepages, .direct_IO = blkdev_direct_IO, - .migratepage = buffer_migrate_page_norefs, + .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e6b932219803..58a9d061f17d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -973,7 +973,7 @@ const struct address_space_operations ext2_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -989,7 +989,7 @@ const struct address_space_operations ext2_nobh_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 06cc68878176..87a8b4382bce 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3633,7 +3633,7 @@ static const struct address_space_operations ext4_aops = { .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, @@ -3668,7 +3668,7 @@ static const struct address_space_operations ext4_da_aops = { .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 9e3964ea2ea0..5f4fb6ca6f2e 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1659,7 +1659,7 @@ const struct address_space_operations ntfs_normal_aops = { .dirty_folio = block_dirty_folio, #endif /* NTFS_RW */ .bmap = ntfs_bmap, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -1673,7 +1673,7 @@ const struct address_space_operations ntfs_compressed_aops = { .writepage = ntfs_writepage, .dirty_folio = block_dirty_folio, #endif /* NTFS_RW */ - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -1688,7 +1688,7 @@ const struct address_space_operations ntfs_mst_aops = { .writepage = ntfs_writepage, /* Write dirty page to disk. */ .dirty_folio = filemap_dirty_folio, #endif /* NTFS_RW */ - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 767df51f8657..1d489003f99d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2462,7 +2462,7 @@ const struct address_space_operations ocfs2_aops = { .direct_IO = ocfs2_direct_IO, .invalidate_folio = block_invalidate_folio, .release_folio = ocfs2_release_folio, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c9d1463bb20f..b0366c89d6a4 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -267,6 +267,16 @@ int nobh_truncate_page(struct address_space *, loff_t, get_block_t *); int nobh_writepage(struct page *page, get_block_t *get_block, struct writeback_control *wbc); +#ifdef CONFIG_MIGRATION +extern int buffer_migrate_folio(struct address_space *, + struct folio *dst, struct folio *src, enum migrate_mode); +extern int buffer_migrate_folio_norefs(struct address_space *, + struct folio *dst, struct folio *src, enum migrate_mode); +#else +#define buffer_migrate_folio NULL +#define buffer_migrate_folio_norefs NULL +#endif + void buffer_init(void); /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 47431cf8fbb3..9e6b17da4e11 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3215,18 +3215,6 @@ extern int generic_check_addressable(unsigned, u64); extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); -#ifdef CONFIG_MIGRATION -extern int buffer_migrate_page(struct address_space *, - struct page *, struct page *, - enum migrate_mode); -extern int buffer_migrate_page_norefs(struct address_space *, - struct page *, struct page *, - enum migrate_mode); -#else -#define buffer_migrate_page NULL -#define buffer_migrate_page_norefs NULL -#endif - int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct user_namespace *, struct dentry *, struct iattr *); diff --git a/mm/migrate.c b/mm/migrate.c index 75b171425c45..ea5398d0f7f1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -656,23 +656,23 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head, return true; } -static int __buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode, +static int __buffer_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode, bool check_refs) { struct buffer_head *bh, *head; int rc; int expected_count; - if (!page_has_buffers(page)) - return migrate_page(mapping, newpage, page, mode); + head = folio_buffers(src); + if (!head) + return migrate_page(mapping, &dst->page, &src->page, mode); /* Check whether page does not have extra refs before we do more work */ - expected_count = expected_page_refs(mapping, page); - if (page_count(page) != expected_count) + expected_count = expected_page_refs(mapping, &src->page); + if (folio_ref_count(src) != expected_count) return -EAGAIN; - head = page_buffers(page); if (!buffer_migrate_lock_buffers(head, mode)) return -EAGAIN; @@ -703,23 +703,22 @@ recheck_buffers: } } - rc = migrate_page_move_mapping(mapping, newpage, page, 0); + rc = folio_migrate_mapping(mapping, dst, src, 0); if (rc != MIGRATEPAGE_SUCCESS) goto unlock_buffers; - attach_page_private(newpage, detach_page_private(page)); + folio_attach_private(dst, folio_detach_private(src)); bh = head; do { - set_bh_page(bh, newpage, bh_offset(bh)); + set_bh_page(bh, &dst->page, bh_offset(bh)); bh = bh->b_this_page; - } while (bh != head); if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); + folio_migrate_copy(dst, src); else - migrate_page_states(newpage, page); + folio_migrate_flags(dst, src); rc = MIGRATEPAGE_SUCCESS; unlock_buffers: @@ -729,34 +728,51 @@ unlock_buffers: do { unlock_buffer(bh); bh = bh->b_this_page; - } while (bh != head); return rc; } -/* - * Migration function for pages with buffers. This function can only be used - * if the underlying filesystem guarantees that no other references to "page" - * exist. For example attached buffer heads are accessed only under page lock. +/** + * buffer_migrate_folio() - Migration function for folios with buffers. + * @mapping: The address space containing @src. + * @dst: The folio to migrate to. + * @src: The folio to migrate from. + * @mode: How to migrate the folio. + * + * This function can only be used if the underlying filesystem guarantees + * that no other references to @src exist. For example attached buffer + * heads are accessed only under the folio lock. If your filesystem cannot + * provide this guarantee, buffer_migrate_folio_norefs() may be more + * appropriate. + * + * Return: 0 on success or a negative errno on failure. */ -int buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) +int buffer_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - return __buffer_migrate_page(mapping, newpage, page, mode, false); + return __buffer_migrate_folio(mapping, dst, src, mode, false); } -EXPORT_SYMBOL(buffer_migrate_page); +EXPORT_SYMBOL(buffer_migrate_folio); -/* - * Same as above except that this variant is more careful and checks that there - * are also no buffer head references. This function is the right one for - * mappings where buffer heads are directly looked up and referenced (such as - * block device mappings). +/** + * buffer_migrate_folio_norefs() - Migration function for folios with buffers. + * @mapping: The address space containing @src. + * @dst: The folio to migrate to. + * @src: The folio to migrate from. + * @mode: How to migrate the folio. + * + * Like buffer_migrate_folio() except that this variant is more careful + * and checks that there are also no buffer head references. This function + * is the right one for mappings where buffer heads are directly looked + * up and referenced (such as block device mappings). + * + * Return: 0 on success or a negative errno on failure. */ -int buffer_migrate_page_norefs(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) +int buffer_migrate_folio_norefs(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - return __buffer_migrate_page(mapping, newpage, page, mode, true); + return __buffer_migrate_folio(mapping, dst, src, mode, true); } #endif -- cgit v1.3 From 9d0ddc0cb575fd41ff16131b06e08e1feac43b81 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 11:53:31 -0400 Subject: fs: Remove aops->migratepage() With all users converted to migrate_folio(), remove this operation. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/fs.h | 2 -- mm/compaction.c | 5 ++--- mm/migrate.c | 3 --- 3 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9e6b17da4e11..7e06919b8f60 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -367,8 +367,6 @@ struct address_space_operations { */ int (*migrate_folio)(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); - int (*migratepage) (struct address_space *, - struct page *, struct page *, enum migrate_mode); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); diff --git a/mm/compaction.c b/mm/compaction.c index 458f49f9ab09..a2c53fcf933e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1031,7 +1031,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* * Only pages without mappings or that have a - * ->migratepage callback are possible to migrate + * ->migrate_folio callback are possible to migrate * without blocking. However, we can be racing with * truncation so it's necessary to lock the page * to stabilise the mapping as truncation holds @@ -1043,8 +1043,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, mapping = page_mapping(page); migrate_dirty = !mapping || - mapping->a_ops->migrate_folio || - mapping->a_ops->migratepage; + mapping->a_ops->migrate_folio; unlock_page(page); if (!migrate_dirty) goto isolate_fail_put; diff --git a/mm/migrate.c b/mm/migrate.c index 0dd3ec9525b3..1b4b977809a1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -909,9 +909,6 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, */ rc = mapping->a_ops->migrate_folio(mapping, dst, src, mode); - else if (mapping->a_ops->migratepage) - rc = mapping->a_ops->migratepage(mapping, &dst->page, - &src->page, mode); else rc = fallback_migrate_folio(mapping, dst, src, mode); } else { -- cgit v1.3 From addebd9ac9ca0ef8b3764907bf8018e48caffc64 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 12 Aug 2022 15:56:33 -0700 Subject: fs: don't randomize struct kiocb fields This is a size sensitive structure and randomizing can introduce extra padding that breaks io_uring's fixed size expectations. There are few fields here as it is, half of which need a fixed order to optimally pack, so the randomization isn't providing much. Suggested-by: Linus Torvalds Signed-off-by: Keith Busch Link: https://lore.kernel.org/io-uring/b6f508ca-b1b2-5f40-7998-e4cff1cf7212@kernel.dk/ Signed-off-by: Jens Axboe --- include/linux/fs.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9f131e559d05..daf69a6504b6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -339,17 +339,12 @@ enum rw_hint { struct kiocb { struct file *ki_filp; - - /* The 'ki_filp' pointer is shared in a union for aio */ - randomized_struct_fields_start - loff_t ki_pos; void (*ki_complete)(struct kiocb *iocb, long ret); void *private; int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ struct wait_page_queue *ki_waitq; /* for async buffered IO */ - randomized_struct_fields_end }; static inline bool is_sync_kiocb(struct kiocb *kiocb) -- cgit v1.3