From 41d36a9f3e5336f5b48c3adba0777b8e217020d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Mar 2022 07:05:28 +0100 Subject: fs: remove kiocb.ki_hint This field is entirely unused now except for a tracepoint in f2fs, so remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220308060529.736277-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/fs.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..d5658ac5d8c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -327,7 +327,6 @@ struct kiocb { void (*ki_complete)(struct kiocb *iocb, long ret); void *private; int ki_flags; - u16 ki_hint; u16 ki_ioprio; /* See linux/ioprio.h */ struct wait_page_queue *ki_waitq; /* for async buffered IO */ randomized_struct_fields_end @@ -2225,21 +2224,11 @@ static inline enum rw_hint file_write_hint(struct file *file) static inline int iocb_flags(struct file *file); -static inline u16 ki_hint_validate(enum rw_hint hint) -{ - typeof(((struct kiocb *)0)->ki_hint) max_hint = -1; - - if (hint <= max_hint) - return hint; - return 0; -} - static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = iocb_flags(filp), - .ki_hint = ki_hint_validate(file_write_hint(filp)), .ki_ioprio = get_current_ioprio(), }; } @@ -2250,7 +2239,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = kiocb_src->ki_flags, - .ki_hint = kiocb_src->ki_hint, .ki_ioprio = kiocb_src->ki_ioprio, .ki_pos = kiocb_src->ki_pos, }; -- cgit v1.2.3 From 7b12e49669c99f63bc12351c57e581f1f14d4adf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Mar 2022 07:05:29 +0100 Subject: fs: remove fs.f_write_hint The value is now completely unused except for reporting it back through the F_GET_FILE_RW_HINT ioctl, so remove the value and the two ioctls for it. Trying to use the F_SET_FILE_RW_HINT and F_GET_FILE_RW_HINT fcntls will now return EINVAL, just like it would on a kernel that never supported this functionality in the first place. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220308060529.736277-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/fcntl.c | 18 ------------------ fs/open.c | 1 - include/linux/fs.h | 9 --------- 3 files changed, 28 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/fcntl.c b/fs/fcntl.c index 9c6c6a3e2de5..f15d885b9796 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -291,22 +291,6 @@ static long fcntl_rw_hint(struct file *file, unsigned int cmd, u64 h; switch (cmd) { - case F_GET_FILE_RW_HINT: - h = file_write_hint(file); - if (copy_to_user(argp, &h, sizeof(*argp))) - return -EFAULT; - return 0; - case F_SET_FILE_RW_HINT: - if (copy_from_user(&h, argp, sizeof(h))) - return -EFAULT; - hint = (enum rw_hint) h; - if (!rw_hint_valid(hint)) - return -EINVAL; - - spin_lock(&file->f_lock); - file->f_write_hint = hint; - spin_unlock(&file->f_lock); - return 0; case F_GET_RW_HINT: h = inode->i_write_hint; if (copy_to_user(argp, &h, sizeof(*argp))) @@ -431,8 +415,6 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_GET_RW_HINT: case F_SET_RW_HINT: - case F_GET_FILE_RW_HINT: - case F_SET_FILE_RW_HINT: err = fcntl_rw_hint(filp, cmd, arg); break; default: diff --git a/fs/open.c b/fs/open.c index 9ff2f621b760..1315253e0247 100644 --- a/fs/open.c +++ b/fs/open.c @@ -835,7 +835,6 @@ static int do_dentry_open(struct file *f, likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; - f->f_write_hint = WRITE_LIFE_NOT_SET; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); diff --git a/include/linux/fs.h b/include/linux/fs.h index d5658ac5d8c6..a1fc3b41cd82 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -966,7 +966,6 @@ struct file { * Must not be taken from IRQ context. */ spinlock_t f_lock; - enum rw_hint f_write_hint; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; @@ -2214,14 +2213,6 @@ static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns, !gid_valid(i_gid_into_mnt(mnt_userns, inode)); } -static inline enum rw_hint file_write_hint(struct file *file) -{ - if (file->f_write_hint != WRITE_LIFE_NOT_SET) - return file->f_write_hint; - - return file_inode(file)->i_write_hint; -} - static inline int iocb_flags(struct file *file); static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) -- cgit v1.2.3 From 871129332d74c9e94bd110932ac4445833995639 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 4 Sep 2019 12:13:25 -0700 Subject: fs: export rw_verify_area() I'm adding btrfs ioctls to read and write compressed data, and rather than duplicating the checks in rw_verify_area(), let's just export it. Reviewed-by: Josef Bacik Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/internal.h | 5 ----- fs/read_write.c | 1 + include/linux/fs.h | 1 + 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/internal.h b/fs/internal.h index 8590c973c2f4..711bdc00ec7c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -157,11 +157,6 @@ extern char *simple_dname(struct dentry *, char *, int); extern void dput_to_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); -/* - * read_write.c - */ -extern int rw_verify_area(int, struct file *, const loff_t *, size_t); - /* * pipe.c */ diff --git a/fs/read_write.c b/fs/read_write.c index 0074afa7ecb3..4d60146243df 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -385,6 +385,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t return security_file_permission(file, read_write == READ ? MAY_READ : MAY_WRITE); } +EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..0ebfc2519212 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3173,6 +3173,7 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size); extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); extern loff_t no_seek_end_llseek(struct file *, loff_t, int); +int rw_verify_area(int, struct file *, const loff_t *, size_t); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); extern int stream_open(struct inode * inode, struct file * filp); -- cgit v1.2.3 From f6f7a25a650818c61defa97d60a79b216618315f Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 12 Aug 2021 15:34:57 -0700 Subject: fs: export variant of generic_write_checks without iov_iter Encoded I/O in Btrfs needs to check a write with a given logical size without an iov_iter that matches that size (because the iov_iter we have is for the compressed data). So, factor out the parts of generic_write_check() that don't need an iov_iter into a new generic_write_checks_count() function and export that. Reviewed-by: Nikolay Borisov Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/read_write.c | 33 ++++++++++++++++++++------------- include/linux/fs.h | 1 + 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/read_write.c b/fs/read_write.c index 4d60146243df..dc5000173b80 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1618,24 +1618,16 @@ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) return 0; } -/* - * Performs necessary checks before doing a write - * - * Can adjust writing position or amount of bytes to write. - * Returns appropriate error code that caller should return or - * zero in case that write should be allowed. - */ -ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) +/* Like generic_write_checks(), but takes size of write instead of iter. */ +int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - loff_t count; - int ret; if (IS_SWAPFILE(inode)) return -ETXTBSY; - if (!iov_iter_count(from)) + if (!*count) return 0; /* FIXME: this is for backwards compatibility with 2.4 */ @@ -1645,8 +1637,23 @@ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) return -EINVAL; - count = iov_iter_count(from); - ret = generic_write_check_limits(file, iocb->ki_pos, &count); + return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); +} +EXPORT_SYMBOL(generic_write_checks_count); + +/* + * Performs necessary checks before doing a write + * + * Can adjust writing position or amount of bytes to write. + * Returns appropriate error code that caller should return or + * zero in case that write should be allowed. + */ +ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + loff_t count = iov_iter_count(from); + int ret; + + ret = generic_write_checks_count(iocb, &count); if (ret) return ret; diff --git a/include/linux/fs.h b/include/linux/fs.h index 0ebfc2519212..27746a3da8fd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3130,6 +3130,7 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); +int generic_write_checks_count(struct kiocb *iocb, loff_t *count); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); -- cgit v1.2.3 From 2e7e80f7e7e9dbbb3c2a85ee923ca32826052816 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:21:27 +0000 Subject: fs: Convert is_partially_uptodate to folios Since the uptodate property is maintained on a per-folio basis, the is_partially_uptodate method should also take a folio. Fix the types at the same time so it's clear that it returns true/false and takes the count in bytes, not blocks. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- Documentation/filesystems/locking.rst | 2 +- Documentation/filesystems/vfs.rst | 10 +++++----- fs/buffer.c | 26 ++++++++++++------------- fs/iomap/buffered-io.c | 36 ++++++++++++++++------------------- include/linux/buffer_head.h | 3 +-- include/linux/fs.h | 4 ++-- include/linux/iomap.h | 3 +-- mm/filemap.c | 4 ++-- 8 files changed, 40 insertions(+), 48 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 3f9b1497ebb8..88b33524687f 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -258,7 +258,7 @@ prototypes:: int (*migratepage)(struct address_space *, struct page *, struct page *); void (*putback_page) (struct page *); int (*launder_page)(struct page *); - int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); + bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); int (*error_remove_page)(struct address_space *, struct page *); int (*swap_activate)(struct file *); int (*swap_deactivate)(struct file *); diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index bf5c48066fac..da3e7b470f0a 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -747,8 +747,8 @@ cache in your filesystem. The following members are defined: void (*putback_page) (struct page *); int (*launder_page) (struct page *); - int (*is_partially_uptodate) (struct page *, unsigned long, - unsigned long); + bool (*is_partially_uptodate) (struct folio *, size_t from, + size_t count); void (*is_dirty_writeback) (struct page *, bool *, bool *); int (*error_remove_page) (struct mapping *mapping, struct page *page); int (*swap_activate)(struct file *); @@ -937,9 +937,9 @@ cache in your filesystem. The following members are defined: ``is_partially_uptodate`` Called by the VM when reading a file through the pagecache when - the underlying blocksize != pagesize. If the required block is - up to date then the read can complete without needing the IO to - bring the whole page up to date. + the underlying blocksize is smaller than the size of the folio. + If the required block is up to date then the read can complete + without needing I/O to bring the whole page up to date. ``is_dirty_writeback`` Called by the VM when attempting to reclaim a page. The VM uses diff --git a/fs/buffer.c b/fs/buffer.c index 8e112b6bd371..929061995cf8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2206,29 +2206,27 @@ int generic_write_end(struct file *file, struct address_space *mapping, EXPORT_SYMBOL(generic_write_end); /* - * block_is_partially_uptodate checks whether buffers within a page are + * block_is_partially_uptodate checks whether buffers within a folio are * uptodate or not. * - * Returns true if all buffers which correspond to a file portion - * we want to read are uptodate. + * Returns true if all buffers which correspond to the specified part + * of the folio are uptodate. */ -int block_is_partially_uptodate(struct page *page, unsigned long from, - unsigned long count) +bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { unsigned block_start, block_end, blocksize; unsigned to; struct buffer_head *bh, *head; - int ret = 1; - - if (!page_has_buffers(page)) - return 0; + bool ret = true; - head = page_buffers(page); + head = folio_buffers(folio); + if (!head) + return false; blocksize = head->b_size; - to = min_t(unsigned, PAGE_SIZE - from, count); + to = min_t(unsigned, folio_size(folio) - from, count); to = from + to; - if (from < blocksize && to > PAGE_SIZE - blocksize) - return 0; + if (from < blocksize && to > folio_size(folio) - blocksize) + return false; bh = head; block_start = 0; @@ -2236,7 +2234,7 @@ int block_is_partially_uptodate(struct page *page, unsigned long from, block_end = block_start + blocksize; if (block_end > from && block_start < to) { if (!buffer_uptodate(bh)) { - ret = 0; + ret = false; break; } if (block_end >= to) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d020a2e81a24..da0a7b15a64e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -424,37 +424,33 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) EXPORT_SYMBOL_GPL(iomap_readahead); /* - * iomap_is_partially_uptodate checks whether blocks within a page are + * iomap_is_partially_uptodate checks whether blocks within a folio are * uptodate or not. * - * Returns true if all blocks which correspond to a file portion - * we want to read within the page are uptodate. + * Returns true if all blocks which correspond to the specified part + * of the folio are uptodate. */ -int -iomap_is_partially_uptodate(struct page *page, unsigned long from, - unsigned long count) +bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { - struct folio *folio = page_folio(page); struct iomap_page *iop = to_iomap_page(folio); - struct inode *inode = page->mapping->host; - unsigned len, first, last; - unsigned i; + struct inode *inode = folio->mapping->host; + size_t len; + unsigned first, last, i; - /* Limit range to one page */ - len = min_t(unsigned, PAGE_SIZE - from, count); + if (!iop) + return false; + + /* Limit range to this folio */ + len = min(folio_size(folio) - from, count); /* First and last blocks in range within page */ first = from >> inode->i_blkbits; last = (from + len - 1) >> inode->i_blkbits; - if (iop) { - for (i = first; i <= last; i++) - if (!test_bit(i, iop->uptodate)) - return 0; - return 1; - } - - return 0; + for (i = first; i <= last; i++) + if (!test_bit(i, iop->uptodate)) + return false; + return true; } EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 3451f1fcda12..79d465057889 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -225,8 +225,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler); int block_read_full_page(struct page*, get_block_t*); -int block_is_partially_uptodate(struct page *page, unsigned long from, - unsigned long count); +bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, get_block_t *get_block); int __block_write_begin(struct page *page, loff_t pos, unsigned len, diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..5939e6694ada 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -400,8 +400,8 @@ struct address_space_operations { bool (*isolate_page)(struct page *, isolate_mode_t); void (*putback_page)(struct page *); int (*launder_page) (struct page *); - int (*is_partially_uptodate) (struct page *, unsigned long, - unsigned long); + bool (*is_partially_uptodate) (struct folio *, size_t from, + size_t count); void (*is_dirty_writeback) (struct page *, bool *, bool *); int (*error_remove_page)(struct address_space *, struct page *); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 97a3a2edb585..3bcbb264f83f 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -227,8 +227,7 @@ ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); int iomap_readpage(struct page *page, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); -int iomap_is_partially_uptodate(struct page *page, unsigned long from, - unsigned long count); +bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); int iomap_releasepage(struct page *page, gfp_t gfp_mask); void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); void iomap_invalidatepage(struct page *page, unsigned int offset, diff --git a/mm/filemap.c b/mm/filemap.c index ad8c39d90bf9..9639b844dd31 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2452,7 +2452,7 @@ static bool filemap_range_uptodate(struct address_space *mapping, pos -= folio_pos(folio); } - return mapping->a_ops->is_partially_uptodate(&folio->page, pos, count); + return mapping->a_ops->is_partially_uptodate(folio, pos, count); } static int filemap_update_page(struct kiocb *iocb, @@ -2844,7 +2844,7 @@ static inline loff_t folio_seek_hole_data(struct xa_state *xas, offset = offset_in_folio(folio, start) & ~(bsz - 1); do { - if (ops->is_partially_uptodate(&folio->page, offset, bsz) == + if (ops->is_partially_uptodate(folio, offset, bsz) == seek_data) break; start = (start + bsz) & ~(bsz - 1); -- cgit v1.2.3 From 128d1f8241d62ab014eef6dd4ef9bb977dbeadb2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:21:32 +0000 Subject: fs: Add invalidate_folio() aops method This is used in preference to invalidatepage, if defined. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- Documentation/filesystems/locking.rst | 13 +++++++------ Documentation/filesystems/vfs.rst | 11 ++++++----- include/linux/fs.h | 1 + mm/truncate.c | 8 +++++++- 4 files changed, 21 insertions(+), 12 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 88b33524687f..29a045fd3860 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -250,6 +250,7 @@ prototypes:: loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); + void (*invalidate_folio) (struct folio *, size_t start, size_t len); void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); @@ -278,6 +279,7 @@ readpages: no shared write_begin: locks the page exclusive write_end: yes, unlocks exclusive bmap: +invalidate_folio: yes exclusive invalidatepage: yes exclusive releasepage: yes freepage: yes @@ -370,13 +372,12 @@ not locked. filesystems and by the swapper. The latter will eventually go away. Please, keep it that way and don't breed new callers. -->invalidatepage() is called when the filesystem must attempt to drop +->invalidate_folio() is called when the filesystem must attempt to drop some or all of the buffers from the page when it is being truncated. It -returns zero on success. If ->invalidatepage is zero, the kernel uses -block_invalidatepage() instead. The filesystem must exclusively acquire -invalidate_lock before invalidating page cache in truncate / hole punch path -(and thus calling into ->invalidatepage) to block races between page cache -invalidation and page cache filling functions (fault, read, ...). +returns zero on success. The filesystem must exclusively acquire +invalidate_lock before invalidating page cache in truncate / hole punch +path (and thus calling into ->invalidate_folio) to block races between page +cache invalidation and page cache filling functions (fault, read, ...). ->releasepage() is called when the kernel is about to try to drop the buffers from the page in preparation for freeing it. It returns zero to diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index da3e7b470f0a..26c090cd8cf5 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -735,6 +735,7 @@ cache in your filesystem. The following members are defined: loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); + void (*invalidate_folio) (struct folio *, size_t start, size_t len); void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); @@ -868,15 +869,15 @@ cache in your filesystem. The following members are defined: to find out where the blocks in the file are and uses those addresses directly. -``invalidatepage`` - If a page has PagePrivate set, then invalidatepage will be - called when part or all of the page is to be removed from the +``invalidate_folio`` + If a folio has private data, then invalidate_folio will be + called when part or all of the folio is to be removed from the address space. This generally corresponds to either a truncation, punch hole or a complete invalidation of the address space (in the latter case 'offset' will always be 0 and 'length' - will be PAGE_SIZE). Any private data associated with the page + will be folio_size()). Any private data associated with the page should be updated to reflect this truncation. If offset is 0 - and length is PAGE_SIZE, then the private data should be + and length is folio_size(), then the private data should be released, because the page must be able to be completely discarded. This may be done by calling the ->releasepage function, but in this case the release MUST succeed. diff --git a/include/linux/fs.h b/include/linux/fs.h index 5939e6694ada..bcdb613cd652 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -387,6 +387,7 @@ struct address_space_operations { /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); + void (*invalidate_folio) (struct folio *, size_t offset, size_t len); void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); diff --git a/mm/truncate.c b/mm/truncate.c index aa0ed373789d..b9ad298e6ce7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -154,9 +154,15 @@ static int invalidate_exceptional_entry2(struct address_space *mapping, */ void folio_invalidate(struct folio *folio, size_t offset, size_t length) { + const struct address_space_operations *aops = folio->mapping->a_ops; void (*invalidatepage)(struct page *, unsigned int, unsigned int); - invalidatepage = folio->mapping->a_ops->invalidatepage; + if (aops->invalidate_folio) { + aops->invalidate_folio(folio, offset, length); + return; + } + + invalidatepage = aops->invalidatepage; #ifdef CONFIG_BLOCK if (!invalidatepage) invalidatepage = block_invalidatepage; -- cgit v1.2.3 From 5660a8630dab61a28e07ec00c42bf605b182d725 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:21:35 +0000 Subject: fs: Remove noop_invalidatepage() We used to have to use noop_invalidatepage() to prevent block_invalidatepage() from being called, but that behaviour is now gone. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- drivers/dax/device.c | 1 - fs/ext2/inode.c | 1 - fs/ext4/inode.c | 1 - fs/fuse/dax.c | 1 - fs/libfs.c | 11 ----------- fs/xfs/xfs_aops.c | 1 - include/linux/fs.h | 2 -- 7 files changed, 18 deletions(-) (limited to 'include/linux/fs.h') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index d33a0613ed0c..7a59ca51217e 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -347,7 +347,6 @@ static unsigned long dax_get_unmapped_area(struct file *filp, static const struct address_space_operations dev_dax_aops = { .set_page_dirty = __set_page_dirty_no_writeback, - .invalidatepage = noop_invalidatepage, }; static int dax_open(struct inode *inode, struct file *filp) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 1e14777c3ca6..9b579ee56eaf 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1001,7 +1001,6 @@ static const struct address_space_operations ext2_dax_aops = { .writepages = ext2_dax_writepages, .direct_IO = noop_direct_IO, .set_page_dirty = __set_page_dirty_no_writeback, - .invalidatepage = noop_invalidatepage, }; /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 07ef3f84db9e..d7086209572a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3632,7 +3632,6 @@ static const struct address_space_operations ext4_dax_aops = { .direct_IO = noop_direct_IO, .set_page_dirty = __set_page_dirty_no_writeback, .bmap = ext4_bmap, - .invalidatepage = noop_invalidatepage, .swap_activate = ext4_iomap_swap_activate, }; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 182b24a14804..b11fa10b88d8 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1327,7 +1327,6 @@ static const struct address_space_operations fuse_dax_file_aops = { .writepages = fuse_dax_writepages, .direct_IO = noop_direct_IO, .set_page_dirty = __set_page_dirty_no_writeback, - .invalidatepage = noop_invalidatepage, }; static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags) diff --git a/fs/libfs.c b/fs/libfs.c index 974125270a42..4e047841e61d 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1198,17 +1198,6 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); -void noop_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - /* - * There is no page cache to invalidate in the dax case, however - * we need this callback defined to prevent falling back to - * block_invalidatepage() in do_invalidatepage(). - */ -} -EXPORT_SYMBOL_GPL(noop_invalidatepage); - ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { /* diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 51a040b658cb..7dd314f2288f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -582,6 +582,5 @@ const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, .direct_IO = noop_direct_IO, .set_page_dirty = __set_page_dirty_no_writeback, - .invalidatepage = noop_invalidatepage, .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index bcdb613cd652..a40ea82248da 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3323,8 +3323,6 @@ extern int simple_rename(struct user_namespace *, struct inode *, extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); -extern void noop_invalidatepage(struct page *page, unsigned int offset, - unsigned int length); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_write_begin(struct file *file, struct address_space *mapping, -- cgit v1.2.3 From f50015a596fa106bf642bd85fbf6e6b52cc913d0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:21:51 +0000 Subject: fs: Remove aops->invalidatepage With all users migrated to ->invalidate_folio, remove the old operation. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- Documentation/filesystems/locking.rst | 2 -- Documentation/filesystems/vfs.rst | 1 - include/linux/fs.h | 1 - mm/truncate.c | 14 +++----------- 4 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 29a045fd3860..8e9cbc0fb70f 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -251,7 +251,6 @@ prototypes:: struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t start, size_t len); - void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); int (*direct_IO)(struct kiocb *, struct iov_iter *iter); @@ -280,7 +279,6 @@ write_begin: locks the page exclusive write_end: yes, unlocks exclusive bmap: invalidate_folio: yes exclusive -invalidatepage: yes exclusive releasepage: yes freepage: yes direct_IO: diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 26c090cd8cf5..28704831652c 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -736,7 +736,6 @@ cache in your filesystem. The following members are defined: struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t start, size_t len); - void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); diff --git a/include/linux/fs.h b/include/linux/fs.h index a40ea82248da..af9ae091bd82 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -388,7 +388,6 @@ struct address_space_operations { /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); - void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); diff --git a/mm/truncate.c b/mm/truncate.c index 28650151091a..8010461a59bd 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -19,8 +19,7 @@ #include #include #include -#include /* grr. try_to_release_page, - do_invalidatepage */ +#include /* grr. try_to_release_page */ #include #include #include "internal.h" @@ -155,16 +154,9 @@ static int invalidate_exceptional_entry2(struct address_space *mapping, void folio_invalidate(struct folio *folio, size_t offset, size_t length) { const struct address_space_operations *aops = folio->mapping->a_ops; - void (*invalidatepage)(struct page *, unsigned int, unsigned int); - if (aops->invalidate_folio) { + if (aops->invalidate_folio) aops->invalidate_folio(folio, offset, length); - return; - } - - invalidatepage = aops->invalidatepage; - if (invalidatepage) - (*invalidatepage)(&folio->page, offset, length); } EXPORT_SYMBOL_GPL(folio_invalidate); @@ -334,7 +326,7 @@ int invalidate_inode_page(struct page *page) * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. * - * Note that since ->invalidatepage() accepts range to invalidate + * Note that since ->invalidate_folio() accepts range to invalidate * truncate_inode_pages_range is able to handle cases where lend + 1 is not * page aligned properly. */ -- cgit v1.2.3 From affa80e8c6a1df473694c2087259901872309cc4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:21:52 +0000 Subject: fs: Add aops->launder_folio Since the only difference between ->launder_page and ->launder_folio is the type of the pointer, these can safely use a union without affecting bisectability. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- Documentation/filesystems/locking.rst | 10 +++++----- Documentation/filesystems/vfs.rst | 8 ++++---- include/linux/fs.h | 5 ++++- mm/truncate.c | 8 ++++---- 4 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 8e9cbc0fb70f..dee512efb458 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -257,7 +257,7 @@ prototypes:: bool (*isolate_page) (struct page *, isolate_mode_t); int (*migratepage)(struct address_space *, struct page *, struct page *); void (*putback_page) (struct page *); - int (*launder_page)(struct page *); + int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); int (*error_remove_page)(struct address_space *, struct page *); int (*swap_activate)(struct file *); @@ -285,7 +285,7 @@ direct_IO: isolate_page: yes migratepage: yes (both) putback_page: yes -launder_page: yes +launder_folio: yes is_partially_uptodate: yes error_remove_page: yes swap_activate: no @@ -385,9 +385,9 @@ the kernel assumes that the fs has no private interest in the buffers. ->freepage() is called when the kernel is done dropping the page from the page cache. -->launder_page() may be called prior to releasing a page if -it is still found to be dirty. It returns zero if the page was successfully -cleaned, or an error value if not. Note that in order to prevent the page +->launder_folio() may be called prior to releasing a folio if +it is still found to be dirty. It returns zero if the folio was successfully +cleaned, or an error value if not. Note that in order to prevent the folio getting mapped back in and redirtied, it needs to be kept locked across the entire operation. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 28704831652c..c54ca4d88ed6 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -745,7 +745,7 @@ cache in your filesystem. The following members are defined: int (*migratepage) (struct page *, struct page *); /* put migration-failed page back to right list */ void (*putback_page) (struct page *); - int (*launder_page) (struct page *); + int (*launder_folio) (struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); @@ -930,9 +930,9 @@ cache in your filesystem. The following members are defined: ``putback_page`` Called by the VM when isolated page's migration fails. -``launder_page`` - Called before freeing a page - it writes back the dirty page. - To prevent redirtying the page, it is kept locked during the +``launder_folio`` + Called before freeing a folio - it writes back the dirty folio. + To prevent redirtying the folio, it is kept locked during the whole operation. ``is_partially_uptodate`` diff --git a/include/linux/fs.h b/include/linux/fs.h index af9ae091bd82..0af3075cdff2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -399,7 +399,10 @@ struct address_space_operations { struct page *, struct page *, enum migrate_mode); bool (*isolate_page)(struct page *, isolate_mode_t); void (*putback_page)(struct page *); - int (*launder_page) (struct page *); + union { + int (*launder_page) (struct page *); + int (*launder_folio) (struct folio *); + }; bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct page *, bool *, bool *); diff --git a/mm/truncate.c b/mm/truncate.c index 8010461a59bd..6ad44b546dff 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -614,13 +614,13 @@ failed: return 0; } -static int do_launder_folio(struct address_space *mapping, struct folio *folio) +static int folio_launder(struct address_space *mapping, struct folio *folio) { if (!folio_test_dirty(folio)) return 0; - if (folio->mapping != mapping || mapping->a_ops->launder_page == NULL) + if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) return 0; - return mapping->a_ops->launder_page(&folio->page); + return mapping->a_ops->launder_folio(folio); } /** @@ -686,7 +686,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, unmap_mapping_folio(folio); BUG_ON(folio_mapped(folio)); - ret2 = do_launder_folio(mapping, folio); + ret2 = folio_launder(mapping, folio); if (ret2 == 0) { if (!invalidate_complete_folio2(mapping, folio)) ret2 = -EBUSY; -- cgit v1.2.3 From 072acba6d08730beba5bad293c7ce6d0c4b0624c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:21:59 +0000 Subject: fs: Remove aops->launder_page With all users converted to ->launder_folio, remove ->launder_page. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- include/linux/fs.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0af3075cdff2..055be40084f1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -399,10 +399,7 @@ struct address_space_operations { struct page *, struct page *, enum migrate_mode); bool (*isolate_page)(struct page *, isolate_mode_t); void (*putback_page)(struct page *); - union { - int (*launder_page) (struct page *); - int (*launder_folio) (struct folio *); - }; + int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct page *, bool *, bool *); -- cgit v1.2.3 From 6f31a5a261dbbe7bf7f585dfe81f8acd4b25ec3b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:22:00 +0000 Subject: fs: Add aops->dirty_folio This replaces ->set_page_dirty(). It returns a bool instead of an int and takes the address_space as a parameter instead of expecting the implementations to retrieve the address_space from the page. This is particularly important for filesystems which use FS_OPS for swap. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- Documentation/filesystems/locking.rst | 15 ++++++++------- Documentation/filesystems/vfs.rst | 16 ++++++++-------- include/linux/fs.h | 1 + mm/page-writeback.c | 17 ++++++++++------- mm/page_io.c | 5 ++++- 5 files changed, 31 insertions(+), 23 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index dee512efb458..72fa12dabd39 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -239,7 +239,7 @@ prototypes:: int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); - int (*set_page_dirty)(struct page *page); + bool (*dirty_folio)(struct address_space *, struct folio *folio); void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); @@ -264,7 +264,7 @@ prototypes:: int (*swap_deactivate)(struct file *); locking rules: - All except set_page_dirty and freepage may block + All except dirty_folio and freepage may block ====================== ======================== ========= =============== ops PageLocked(page) i_rwsem invalidate_lock @@ -272,7 +272,7 @@ ops PageLocked(page) i_rwsem invalidate_lock writepage: yes, unlocks (see below) readpage: yes, unlocks shared writepages: -set_page_dirty no +dirty_folio maybe readahead: yes, unlocks shared readpages: no shared write_begin: locks the page exclusive @@ -361,10 +361,11 @@ If nr_to_write is NULL, all dirty pages must be written. writepages should _only_ write pages which are present on mapping->io_pages. -->set_page_dirty() is called from various places in the kernel -when the target page is marked as needing writeback. It may be called -under spinlock (it cannot block) and is sometimes called with the page -not locked. +->dirty_folio() is called from various places in the kernel when +the target folio is marked as needing writeback. The folio cannot be +truncated because either the caller holds the folio lock, or the caller +has found the folio while holding the page table lock which will block +truncation. ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some filesystems and by the swapper. The latter will eventually go away. Please, diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index c54ca4d88ed6..d16bee420326 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -658,7 +658,7 @@ pages, however the address_space has finer control of write sizes. The read process essentially only requires 'readpage'. The write process is more complicated and uses write_begin/write_end or -set_page_dirty to write data into the address_space, and writepage and +dirty_folio to write data into the address_space, and writepage and writepages to writeback data to storage. Adding and removing pages to/from an address_space is protected by the @@ -724,7 +724,7 @@ cache in your filesystem. The following members are defined: int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); - int (*set_page_dirty)(struct page *page); + bool (*dirty_folio)(struct address_space *, struct folio *); void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); @@ -793,13 +793,13 @@ cache in your filesystem. The following members are defined: This will choose pages from the address space that are tagged as DIRTY and will pass them to ->writepage. -``set_page_dirty`` - called by the VM to set a page dirty. This is particularly - needed if an address space attaches private data to a page, and - that data needs to be updated when a page is dirtied. This is +``dirty_folio`` + called by the VM to mark a folio as dirty. This is particularly + needed if an address space attaches private data to a folio, and + that data needs to be updated when a folio is dirtied. This is called, for example, when a memory mapped page gets modified. - If defined, it should set the PageDirty flag, and the - PAGECACHE_TAG_DIRTY tag in the radix tree. + If defined, it should set the folio dirty flag, and the + PAGECACHE_TAG_DIRTY search mark in i_pages. ``readahead`` Called by the VM to read pages associated with the address_space diff --git a/include/linux/fs.h b/include/linux/fs.h index 055be40084f1..c3d5db8851ae 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -369,6 +369,7 @@ struct address_space_operations { /* Set a page dirty. Return true if this dirtied it */ int (*set_page_dirty)(struct page *page); + bool (*dirty_folio)(struct address_space *, struct folio *); /* * Reads in the requested pages. Unlike ->readpage(), this is diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 91d163f8d36b..27a87ae4502c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2616,7 +2616,7 @@ EXPORT_SYMBOL(folio_redirty_for_writepage); * folio_mark_dirty - Mark a folio as being modified. * @folio: The folio. * - * For folios with a mapping this should be done under the page lock + * For folios with a mapping this should be done with the folio lock held * for the benefit of asynchronous memory errors who prefer a consistent * dirty state. This rule can be broken in some special cases, * but should be better not to. @@ -2630,16 +2630,19 @@ bool folio_mark_dirty(struct folio *folio) if (likely(mapping)) { /* * readahead/lru_deactivate_page could remain - * PG_readahead/PG_reclaim due to race with end_page_writeback - * About readahead, if the page is written, the flags would be + * PG_readahead/PG_reclaim due to race with folio_end_writeback + * About readahead, if the folio is written, the flags would be * reset. So no problem. - * About lru_deactivate_page, if the page is redirty, the flag - * will be reset. So no problem. but if the page is used by readahead - * it will confuse readahead and make it restart the size rampup - * process. But it's a trivial problem. + * About lru_deactivate_page, if the folio is redirtied, + * the flag will be reset. So no problem. but if the + * folio is used by readahead it will confuse readahead + * and make it restart the size rampup process. But it's + * a trivial problem. */ if (folio_test_reclaim(folio)) folio_clear_reclaim(folio); + if (mapping->a_ops->dirty_folio) + return mapping->a_ops->dirty_folio(mapping, folio); return mapping->a_ops->set_page_dirty(&folio->page); } if (!folio_test_dirty(folio)) { diff --git a/mm/page_io.c b/mm/page_io.c index 0bf8e40f4e57..24c975fb4e21 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -444,9 +444,12 @@ int swap_set_page_dirty(struct page *page) if (data_race(sis->flags & SWP_FS_OPS)) { struct address_space *mapping = sis->swap_file->f_mapping; + const struct address_space_operations *aops = mapping->a_ops; VM_BUG_ON_PAGE(!PageSwapCache(page), page); - return mapping->a_ops->set_page_dirty(page); + if (aops->dirty_folio) + return aops->dirty_folio(mapping, page_folio(page)); + return aops->set_page_dirty(page); } else { return __set_page_dirty_no_writeback(page); } -- cgit v1.2.3 From 3a3bae50af5d73fab5da20484029de77ca67bb2e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 9 Feb 2022 20:22:15 +0000 Subject: fs: Remove aops ->set_page_dirty With all implementations converted to ->dirty_folio, we can stop calling this fallback method and remove it entirely. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Damien Le Moal Acked-by: Damien Le Moal Tested-by: Mike Marshall # orangefs Tested-by: David Howells # afs --- fs/ecryptfs/mmap.c | 2 +- include/linux/fs.h | 3 +-- mm/page-writeback.c | 11 +++-------- mm/page_io.c | 4 +--- 4 files changed, 6 insertions(+), 14 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 9aabcb2f52e9..9ad61b582f07 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -540,7 +540,7 @@ const struct address_space_operations ecryptfs_aops = { * XXX: This is pretty broken for multiple reasons: ecryptfs does not * actually use buffer_heads, and ecryptfs will crash without * CONFIG_BLOCK. But it matches the behavior before the default for - * address_space_operations without the ->set_page_dirty method was + * address_space_operations without the ->dirty_folio method was * cleaned up, so this is the best we can do without maintainer * feedback. */ diff --git a/include/linux/fs.h b/include/linux/fs.h index c3d5db8851ae..b472d78f00b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -367,8 +367,7 @@ struct address_space_operations { /* Write back some dirty pages from this mapping. */ int (*writepages)(struct address_space *, struct writeback_control *); - /* Set a page dirty. Return true if this dirtied it */ - int (*set_page_dirty)(struct page *page); + /* Mark a folio dirty. Return true if this dirtied it */ bool (*dirty_folio)(struct address_space *, struct folio *); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4557a8d3dfea..0997738545dd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2641,15 +2641,10 @@ bool folio_mark_dirty(struct folio *folio) */ if (folio_test_reclaim(folio)) folio_clear_reclaim(folio); - if (mapping->a_ops->dirty_folio) - return mapping->a_ops->dirty_folio(mapping, folio); - return mapping->a_ops->set_page_dirty(&folio->page); + return mapping->a_ops->dirty_folio(mapping, folio); } - if (!folio_test_dirty(folio)) { - if (!folio_test_set_dirty(folio)) - return true; - } - return false; + + return noop_dirty_folio(mapping, folio); } EXPORT_SYMBOL(folio_mark_dirty); diff --git a/mm/page_io.c b/mm/page_io.c index e3333973335e..6bde053b9d9d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -449,9 +449,7 @@ bool swap_dirty_folio(struct address_space *mapping, struct folio *folio) aops = mapping->a_ops; VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); - if (aops->dirty_folio) - return aops->dirty_folio(mapping, folio); - return aops->set_page_dirty(&folio->page); + return aops->dirty_folio(mapping, folio); } else { return noop_dirty_folio(mapping, folio); } -- cgit v1.2.3 From c56109dd35c9204cd6c49d2116ef36e5044ef867 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 13 Feb 2022 17:22:10 -0500 Subject: mm/truncate: Combine invalidate_mapping_pagevec() and __invalidate_mapping_pages() We can save a function call by combining these two functions, which are identical except for the return value. Also move the prototype to mm/internal.h. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Miaohe Lin --- include/linux/fs.h | 4 ---- mm/internal.h | 2 ++ mm/truncate.c | 32 +++++++++++++------------------- 3 files changed, 15 insertions(+), 23 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..85c584c5c623 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2749,10 +2749,6 @@ extern bool is_bad_inode(struct inode *); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); -void invalidate_mapping_pagevec(struct address_space *mapping, - pgoff_t start, pgoff_t end, - unsigned long *nr_pagevec); - static inline void invalidate_remote_inode(struct inode *inode) { if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || diff --git a/mm/internal.h b/mm/internal.h index 7c441f43ba31..6047268076e7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -97,6 +97,8 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); long invalidate_inode_page(struct page *page); +unsigned long invalidate_mapping_pagevec(struct address_space *mapping, + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec); /** * folio_evictable - Test whether a folio is evictable. diff --git a/mm/truncate.c b/mm/truncate.c index 9ed62cb3c503..cace6e3e4e8c 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -494,7 +494,18 @@ void truncate_inode_pages_final(struct address_space *mapping) } EXPORT_SYMBOL(truncate_inode_pages_final); -static unsigned long __invalidate_mapping_pages(struct address_space *mapping, +/** + * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * @nr_pagevec: invalidate failed page number for caller + * + * This helper is similar to invalidate_mapping_pages(), except that it accounts + * for pages that are likely on a pagevec and counts them in @nr_pagevec, which + * will be used by the caller. + */ +unsigned long invalidate_mapping_pagevec(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; @@ -559,27 +570,10 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { - return __invalidate_mapping_pages(mapping, start, end, NULL); + return invalidate_mapping_pagevec(mapping, start, end, NULL); } EXPORT_SYMBOL(invalidate_mapping_pages); -/** - * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * @nr_pagevec: invalidate failed page number for caller - * - * This helper is similar to invalidate_mapping_pages(), except that it accounts - * for pages that are likely on a pagevec and counts them in @nr_pagevec, which - * will be used by the caller. - */ -void invalidate_mapping_pagevec(struct address_space *mapping, - pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) -{ - __invalidate_mapping_pages(mapping, start, end, nr_pagevec); -} - /* * This is like invalidate_inode_page(), except it ignores the page's * refcount. We do this because invalidate_inode_pages2() needs stronger -- cgit v1.2.3 From cbcc268bb1ce5b539e7652d398e08e9b83dc4cef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 13 Feb 2022 17:23:58 -0500 Subject: fs: Move many prototypes to pagemap.h These functions are page cache functionality and don't need to be declared in fs.h. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Miaohe Lin --- drivers/block/xen-blkback/xenbus.c | 1 + drivers/usb/gadget/function/f_mass_storage.c | 1 + fs/coda/file.c | 1 + fs/iomap/fiemap.c | 1 + fs/nfsd/filecache.c | 1 + fs/nfsd/vfs.c | 1 + fs/vboxsf/utils.c | 1 + include/linux/fs.h | 116 --------------------------- include/linux/pagemap.h | 114 ++++++++++++++++++++++++++ 9 files changed, 121 insertions(+), 116 deletions(-) (limited to 'include/linux/fs.h') diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 62125fd4af4a..f09040435e2e 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include "common.h" diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index 46dd11dcb3a8..7371c6e65b10 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -179,6 +179,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/coda/file.c b/fs/coda/file.c index 29dd87be2fb8..3f3c81e6b1ab 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 66cf267c68ae..610ca6f1ec9b 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -7,6 +7,7 @@ #include #include #include +#include static int iomap_to_fiemap(struct fiemap_extent_info *fi, const struct iomap *iomap, u32 flags) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 8bc807c5fea4..47f804e0ec93 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 91600e71be19..fe0d7abbc1b1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index aec2ebf7d25a..e1db0f3f7e5e 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "vfsmod.h" diff --git a/include/linux/fs.h b/include/linux/fs.h index 85c584c5c623..0961c979e949 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2746,50 +2746,6 @@ extern void init_special_inode(struct inode *, umode_t, dev_t); extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end); - -static inline void invalidate_remote_inode(struct inode *inode) -{ - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) - invalidate_mapping_pages(inode->i_mapping, 0, -1); -} -extern int invalidate_inode_pages2(struct address_space *mapping); -extern int invalidate_inode_pages2_range(struct address_space *mapping, - pgoff_t start, pgoff_t end); -extern int write_inode_now(struct inode *, int); -extern int filemap_fdatawrite(struct address_space *); -extern int filemap_flush(struct address_space *); -extern int filemap_fdatawait_keep_errors(struct address_space *mapping); -extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, - loff_t lend); -extern int filemap_fdatawait_range_keep_errors(struct address_space *mapping, - loff_t start_byte, loff_t end_byte); - -static inline int filemap_fdatawait(struct address_space *mapping) -{ - return filemap_fdatawait_range(mapping, 0, LLONG_MAX); -} - -extern bool filemap_range_has_page(struct address_space *, loff_t lstart, - loff_t lend); -extern int filemap_write_and_wait_range(struct address_space *mapping, - loff_t lstart, loff_t lend); -extern int __filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end, int sync_mode); -extern int filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end); -extern int filemap_check_errors(struct address_space *mapping); -extern void __filemap_set_wb_err(struct address_space *mapping, int err); -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc); - -static inline int filemap_write_and_wait(struct address_space *mapping) -{ - return filemap_write_and_wait_range(mapping, 0, LLONG_MAX); -} - extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, loff_t lend); extern int __must_check file_check_and_advance_wb_err(struct file *file); @@ -2801,67 +2757,6 @@ static inline int file_write_and_wait(struct file *file) return file_write_and_wait_range(file, 0, LLONG_MAX); } -/** - * filemap_set_wb_err - set a writeback error on an address_space - * @mapping: mapping in which to set writeback error - * @err: error to be set in mapping - * - * When writeback fails in some way, we must record that error so that - * userspace can be informed when fsync and the like are called. We endeavor - * to report errors on any file that was open at the time of the error. Some - * internal callers also need to know when writeback errors have occurred. - * - * When a writeback error occurs, most filesystems will want to call - * filemap_set_wb_err to record the error in the mapping so that it will be - * automatically reported whenever fsync is called on the file. - */ -static inline void filemap_set_wb_err(struct address_space *mapping, int err) -{ - /* Fastpath for common case of no error */ - if (unlikely(err)) - __filemap_set_wb_err(mapping, err); -} - -/** - * filemap_check_wb_err - has an error occurred since the mark was sampled? - * @mapping: mapping to check for writeback errors - * @since: previously-sampled errseq_t - * - * Grab the errseq_t value from the mapping, and see if it has changed "since" - * the given value was sampled. - * - * If it has then report the latest error set, otherwise return 0. - */ -static inline int filemap_check_wb_err(struct address_space *mapping, - errseq_t since) -{ - return errseq_check(&mapping->wb_err, since); -} - -/** - * filemap_sample_wb_err - sample the current errseq_t to test for later errors - * @mapping: mapping to be sampled - * - * Writeback errors are always reported relative to a particular sample point - * in the past. This function provides those sample points. - */ -static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) -{ - return errseq_sample(&mapping->wb_err); -} - -/** - * file_sample_sb_err - sample the current errseq_t to test for later errors - * @file: file pointer to be sampled - * - * Grab the most current superblock-level errseq_t value for the given - * struct file. - */ -static inline errseq_t file_sample_sb_err(struct file *file) -{ - return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); -} - extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); @@ -3604,15 +3499,4 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); -/* - * Flush file data before changing attributes. Caller must hold any locks - * required to prevent further writes to this file until we're done setting - * flags. - */ -static inline int inode_drain_writes(struct inode *inode) -{ - inode_dio_wait(inode); - return filemap_write_and_wait(inode->i_mapping); -} - #endif /* _LINUX_FS_H */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index cdb3f118603a..f968b36ad771 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -18,6 +18,120 @@ struct folio_batch; +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end); + +static inline void invalidate_remote_inode(struct inode *inode) +{ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) + invalidate_mapping_pages(inode->i_mapping, 0, -1); +} +int invalidate_inode_pages2(struct address_space *mapping); +int invalidate_inode_pages2_range(struct address_space *mapping, + pgoff_t start, pgoff_t end); +int write_inode_now(struct inode *, int sync); +int filemap_fdatawrite(struct address_space *); +int filemap_flush(struct address_space *); +int filemap_fdatawait_keep_errors(struct address_space *mapping); +int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); +int filemap_fdatawait_range_keep_errors(struct address_space *mapping, + loff_t start_byte, loff_t end_byte); + +static inline int filemap_fdatawait(struct address_space *mapping) +{ + return filemap_fdatawait_range(mapping, 0, LLONG_MAX); +} + +bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); +int filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend); +int __filemap_fdatawrite_range(struct address_space *mapping, + loff_t start, loff_t end, int sync_mode); +int filemap_fdatawrite_range(struct address_space *mapping, + loff_t start, loff_t end); +int filemap_check_errors(struct address_space *mapping); +void __filemap_set_wb_err(struct address_space *mapping, int err); +int filemap_fdatawrite_wbc(struct address_space *mapping, + struct writeback_control *wbc); + +static inline int filemap_write_and_wait(struct address_space *mapping) +{ + return filemap_write_and_wait_range(mapping, 0, LLONG_MAX); +} + +/** + * filemap_set_wb_err - set a writeback error on an address_space + * @mapping: mapping in which to set writeback error + * @err: error to be set in mapping + * + * When writeback fails in some way, we must record that error so that + * userspace can be informed when fsync and the like are called. We endeavor + * to report errors on any file that was open at the time of the error. Some + * internal callers also need to know when writeback errors have occurred. + * + * When a writeback error occurs, most filesystems will want to call + * filemap_set_wb_err to record the error in the mapping so that it will be + * automatically reported whenever fsync is called on the file. + */ +static inline void filemap_set_wb_err(struct address_space *mapping, int err) +{ + /* Fastpath for common case of no error */ + if (unlikely(err)) + __filemap_set_wb_err(mapping, err); +} + +/** + * filemap_check_wb_err - has an error occurred since the mark was sampled? + * @mapping: mapping to check for writeback errors + * @since: previously-sampled errseq_t + * + * Grab the errseq_t value from the mapping, and see if it has changed "since" + * the given value was sampled. + * + * If it has then report the latest error set, otherwise return 0. + */ +static inline int filemap_check_wb_err(struct address_space *mapping, + errseq_t since) +{ + return errseq_check(&mapping->wb_err, since); +} + +/** + * filemap_sample_wb_err - sample the current errseq_t to test for later errors + * @mapping: mapping to be sampled + * + * Writeback errors are always reported relative to a particular sample point + * in the past. This function provides those sample points. + */ +static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) +{ + return errseq_sample(&mapping->wb_err); +} + +/** + * file_sample_sb_err - sample the current errseq_t to test for later errors + * @file: file pointer to be sampled + * + * Grab the most current superblock-level errseq_t value for the given + * struct file. + */ +static inline errseq_t file_sample_sb_err(struct file *file) +{ + return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); +} + +/* + * Flush file data before changing attributes. Caller must hold any locks + * required to prevent further writes to this file until we're done setting + * flags. + */ +static inline int inode_drain_writes(struct inode *inode) +{ + inode_dio_wait(inode); + return filemap_write_and_wait(inode->i_mapping); +} + static inline bool mapping_empty(struct address_space *mapping) { return xa_empty(&mapping->i_pages); -- cgit v1.2.3 From 84dacdbd5352bfef82423760fa2e8bffaeef9e05 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 22 Mar 2022 14:38:51 -0700 Subject: mm: document and polish read-ahead code Add some "big-picture" documentation for read-ahead and polish the code to make it fit this documentation. The meaning of ->async_size is clarified to match its name. i.e. Any request to ->readahead() has a sync part and an async part. The caller will wait for the sync pages to complete, but will not wait for the async pages. The first async page is still marked PG_readahead Note that the current function names page_cache_sync_ra() and page_cache_async_ra() are misleading. All ra request are partly sync and partly async, so either part can be empty. A page_cache_sync_ra() request will usually set ->async_size non-zero, implying it is not all synchronous. When a non-zero req_count is passed to page_cache_async_ra(), the implication is that some prefix of the request is synchronous, though the calculation made there is incorrect - I haven't tried to fix it. Link: https://lkml.kernel.org/r/164549983734.9187.11586890887006601405.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Darrick J. Wong Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jan Kara Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/mm-api.rst | 19 +++++++- Documentation/filesystems/vfs.rst | 16 ++++--- include/linux/fs.h | 9 +++- mm/readahead.c | 99 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 133 insertions(+), 10 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 395835f9289f..f5b2f92822c8 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -58,15 +58,30 @@ Virtually Contiguous Mappings File Mapping and Page Cache =========================== -.. kernel-doc:: mm/readahead.c - :export: +Filemap +------- .. kernel-doc:: mm/filemap.c :export: +Readahead +--------- + +.. kernel-doc:: mm/readahead.c + :doc: Readahead Overview + +.. kernel-doc:: mm/readahead.c + :export: + +Writeback +--------- + .. kernel-doc:: mm/page-writeback.c :export: +Truncate +-------- + .. kernel-doc:: mm/truncate.c :export: diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index bf5c48066fac..b4a0baa46dcc 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -806,12 +806,16 @@ cache in your filesystem. The following members are defined: object. The pages are consecutive in the page cache and are locked. The implementation should decrement the page refcount after starting I/O on each page. Usually the page will be - unlocked by the I/O completion handler. If the filesystem decides - to stop attempting I/O before reaching the end of the readahead - window, it can simply return. The caller will decrement the page - refcount and unlock the remaining pages for you. Set PageUptodate - if the I/O completes successfully. Setting PageError on any page - will be ignored; simply unlock the page if an I/O error occurs. + unlocked by the I/O completion handler. The set of pages are + divided into some sync pages followed by some async pages, + rac->ra->async_size gives the number of async pages. The + filesystem should attempt to read all sync pages but may decide + to stop once it reaches the async pages. If it does decide to + stop attempting I/O, it can simply return. The caller will + remove the remaining pages from the address space, unlock them + and decrement the page refcount. Set PageUptodate if the I/O + completes successfully. Setting PageError on any page will be + ignored; simply unlock the page if an I/O error occurs. ``readpages`` called by the VM to read pages associated with the address_space diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..8b5c486bd4a2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -930,10 +930,15 @@ struct fown_struct { * struct file_ra_state - Track a file's readahead state. * @start: Where the most recent readahead started. * @size: Number of pages read in the most recent readahead. - * @async_size: Start next readahead when this many pages are left. - * @ra_pages: Maximum size of a readahead request. + * @async_size: Numer of pages that were/are not needed immediately + * and so were/are genuinely "ahead". Start next readahead when + * the first of these pages is accessed. + * @ra_pages: Maximum size of a readahead request, copied from the bdi. * @mmap_miss: How many mmap accesses missed in the page cache. * @prev_pos: The last byte in the most recent read request. + * + * When this structure is passed to ->readahead(), the "most recent" + * readahead means the current readahead. */ struct file_ra_state { pgoff_t start; diff --git a/mm/readahead.c b/mm/readahead.c index cf0dcf89eb69..73b2bc5302e0 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,6 +8,105 @@ * Initial version. */ +/** + * DOC: Readahead Overview + * + * Readahead is used to read content into the page cache before it is + * explicitly requested by the application. Readahead only ever + * attempts to read pages that are not yet in the page cache. If a + * page is present but not up-to-date, readahead will not try to read + * it. In that case a simple ->readpage() will be requested. + * + * Readahead is triggered when an application read request (whether a + * systemcall or a page fault) finds that the requested page is not in + * the page cache, or that it is in the page cache and has the + * %PG_readahead flag set. This flag indicates that the page was loaded + * as part of a previous read-ahead request and now that it has been + * accessed, it is time for the next read-ahead. + * + * Each readahead request is partly synchronous read, and partly async + * read-ahead. This is reflected in the struct file_ra_state which + * contains ->size being to total number of pages, and ->async_size + * which is the number of pages in the async section. The first page in + * this async section will have %PG_readahead set as a trigger for a + * subsequent read ahead. Once a series of sequential reads has been + * established, there should be no need for a synchronous component and + * all read ahead request will be fully asynchronous. + * + * When either of the triggers causes a readahead, three numbers need to + * be determined: the start of the region, the size of the region, and + * the size of the async tail. + * + * The start of the region is simply the first page address at or after + * the accessed address, which is not currently populated in the page + * cache. This is found with a simple search in the page cache. + * + * The size of the async tail is determined by subtracting the size that + * was explicitly requested from the determined request size, unless + * this would be less than zero - then zero is used. NOTE THIS + * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED + * PAGE. + * + * The size of the region is normally determined from the size of the + * previous readahead which loaded the preceding pages. This may be + * discovered from the struct file_ra_state for simple sequential reads, + * or from examining the state of the page cache when multiple + * sequential reads are interleaved. Specifically: where the readahead + * was triggered by the %PG_readahead flag, the size of the previous + * readahead is assumed to be the number of pages from the triggering + * page to the start of the new readahead. In these cases, the size of + * the previous readahead is scaled, often doubled, for the new + * readahead, though see get_next_ra_size() for details. + * + * If the size of the previous read cannot be determined, the number of + * preceding pages in the page cache is used to estimate the size of + * a previous read. This estimate could easily be misled by random + * reads being coincidentally adjacent, so it is ignored unless it is + * larger than the current request, and it is not scaled up, unless it + * is at the start of file. + * + * In general read ahead is accelerated at the start of the file, as + * reads from there are often sequential. There are other minor + * adjustments to the read ahead size in various special cases and these + * are best discovered by reading the code. + * + * The above calculation determines the readahead, to which any requested + * read size may be added. + * + * Readahead requests are sent to the filesystem using the ->readahead() + * address space operation, for which mpage_readahead() is a canonical + * implementation. ->readahead() should normally initiate reads on all + * pages, but may fail to read any or all pages without causing an IO + * error. The page cache reading code will issue a ->readpage() request + * for any page which ->readahead() does not provided, and only an error + * from this will be final. + * + * ->readahead() will generally call readahead_page() repeatedly to get + * each page from those prepared for read ahead. It may fail to read a + * page by: + * + * * not calling readahead_page() sufficiently many times, effectively + * ignoring some pages, as might be appropriate if the path to + * storage is congested. + * + * * failing to actually submit a read request for a given page, + * possibly due to insufficient resources, or + * + * * getting an error during subsequent processing of a request. + * + * In the last two cases, the page should be unlocked to indicate that + * the read attempt has failed. In the first case the page will be + * unlocked by the caller. + * + * Those pages not in the final ``async_size`` of the request should be + * considered to be important and ->readahead() should not fail them due + * to congestion or temporary resource unavailability, but should wait + * for necessary resources (e.g. memory or indexing information) to + * become available. Pages in the final ``async_size`` may be + * considered less urgent and failure to read them is more acceptable. + * They will eventually be read individually using ->readpage(). + */ + #include #include #include -- cgit v1.2.3 From a128b054ce029554a4a52fc3abb8c1df8bafcaef Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Tue, 22 Mar 2022 14:39:22 -0700 Subject: mount: warn only once about timestamp range expiration Commit f8b92ba67c5d ("mount: Add mount warning for impending timestamp expiry") introduced a mount warning regarding filesystem timestamp limits, that is printed upon each writable mount or remount. This can result in a lot of unnecessary messages in the kernel log in setups where filesystems are being frequently remounted (or mounted multiple times). Avoid this by setting a superblock flag which indicates that the warning has been emitted at least once for any particular mount, as suggested in [1]. Link: https://lore.kernel.org/CAHk-=wim6VGnxQmjfK_tDg6fbHYKL4EFkmnTjVr9QnRqjDBAeA@mail.gmail.com/ [1] Link: https://lkml.kernel.org/r/20220119202934.26495-1-ailiop@suse.com Signed-off-by: Anthony Iliopoulos Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Reviewed-by: Darrick J. Wong Cc: Alexander Viro Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 ++ include/linux/fs.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux/fs.h') diff --git a/fs/namespace.c b/fs/namespace.c index de6fae84f1a1..0044feef59d0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2597,6 +2597,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && + (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf = (char *)__get_free_page(GFP_KERNEL); char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); @@ -2611,6 +2612,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * tm.tm_year+1900, (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); + sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; } } diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b5c486bd4a2..ca9445f6cf3d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1440,6 +1440,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ +#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ /* Possible states of 'frozen' field */ enum { -- cgit v1.2.3 From 8b9f3ac5b01db85c6cf74c2c3a71280cc3045c9c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:41:00 -0700 Subject: fs: introduce alloc_inode_sb() to allocate filesystems specific inode The allocated inode cache is supposed to be added to its memcg list_lru which should be allocated as well in advance. That can be done by kmem_cache_alloc_lru() which allocates object and list_lru. The file systems is main user of it. So introduce alloc_inode_sb() to allocate file system specific inodes and set up the inode reclaim context properly. The file system is supposed to use alloc_inode_sb() to allocate inodes. In later patches, we will convert all users to the new API. Link: https://lkml.kernel.org/r/20220228122126.37293-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Roman Gushchin Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Johannes Weiner Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/porting.rst | 6 ++++++ fs/inode.c | 2 +- include/linux/fs.h | 11 +++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index bf19fd6b86e7..7c1583dbeb59 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -45,6 +45,12 @@ typically between calling iget_locked() and unlocking the inode. At some point that will become mandatory. +**mandatory** + +The foo_inode_info should always be allocated through alloc_inode_sb() rather +than kmem_cache_alloc() or kmalloc() related to set up the inode reclaim context +correctly. + --- **mandatory** diff --git a/fs/inode.c b/fs/inode.c index 63324df6fa27..9d9b422504d1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -259,7 +259,7 @@ static struct inode *alloc_inode(struct super_block *sb) if (ops->alloc_inode) inode = ops->alloc_inode(sb); else - inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); + inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL); if (!inode) return NULL; diff --git a/include/linux/fs.h b/include/linux/fs.h index ca9445f6cf3d..58a73e59e4c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -3114,6 +3115,16 @@ extern void free_inode_nonrcu(struct inode *inode); extern int should_remove_suid(struct dentry *); extern int file_remove_privs(struct file *); +/* + * This must be used for allocating filesystems specific inodes to set + * up the inode reclaim context correctly. + */ +static inline void * +alloc_inode_sb(struct super_block *sb, struct kmem_cache *cache, gfp_t gfp) +{ + return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp); +} + extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) { -- cgit v1.2.3 From 704528d895dd3e7b173e672116b4eb2b0a0fceb0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 23 Mar 2022 21:29:04 -0400 Subject: fs: Remove ->readpages address space operation All filesystems have now been converted to use ->readahead, so remove the ->readpages operation and fix all the comments that used to refer to it. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Al Viro Acked-by: Al Viro --- Documentation/filesystems/fsverity.rst | 6 +++--- Documentation/filesystems/locking.rst | 6 ------ Documentation/filesystems/vfs.rst | 11 ----------- fs/btrfs/reflink.c | 4 ++-- fs/cifs/cifssmb.c | 2 +- fs/cifs/inode.c | 2 +- fs/crypto/crypto.c | 2 +- fs/ext4/readpage.c | 2 +- fs/f2fs/data.c | 4 ++-- fs/fuse/fuse_i.h | 2 +- fs/verity/verify.c | 4 ++-- include/linux/fs.h | 6 ------ include/linux/fsverity.h | 2 +- mm/filemap.c | 2 +- mm/readahead.c | 15 ++------------- 15 files changed, 18 insertions(+), 52 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index 1d831e3cbcb3..8cc536d08f51 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -549,7 +549,7 @@ Pagecache ~~~~~~~~~ For filesystems using Linux's pagecache, the ``->readpage()`` and -``->readpages()`` methods must be modified to verify pages before they +``->readahead()`` methods must be modified to verify pages before they are marked Uptodate. Merely hooking ``->read_iter()`` would be insufficient, since ``->read_iter()`` is not used for memory maps. @@ -611,7 +611,7 @@ workqueue, and then the workqueue work does the decryption or verification. Finally, pages where no decryption or verity error occurred are marked Uptodate, and the pages are unlocked. -Files on ext4 and f2fs may contain holes. Normally, ``->readpages()`` +Files on ext4 and f2fs may contain holes. Normally, ``->readahead()`` simply zeroes holes and sets the corresponding pages Uptodate; no bios are issued. To prevent this case from bypassing fs-verity, these filesystems use fsverity_verify_page() to verify hole pages. @@ -778,7 +778,7 @@ weren't already directly answered in other parts of this document. - To prevent bypassing verification, pages must not be marked Uptodate until they've been verified. Currently, each filesystem is responsible for marking pages Uptodate via - ``->readpages()``. Therefore, currently it's not possible for + ``->readahead()``. Therefore, currently it's not possible for the VFS to do the verification on its own. Changing this would require significant changes to the VFS and all filesystems. diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 2998cec9af4b..c26d854275a0 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -241,8 +241,6 @@ prototypes:: int (*writepages)(struct address_space *, struct writeback_control *); bool (*dirty_folio)(struct address_space *, struct folio *folio); void (*readahead)(struct readahead_control *); - int (*readpages)(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); @@ -274,7 +272,6 @@ readpage: yes, unlocks shared writepages: dirty_folio maybe readahead: yes, unlocks shared -readpages: no shared write_begin: locks the page exclusive write_end: yes, unlocks exclusive bmap: @@ -300,9 +297,6 @@ completion. ->readahead() unlocks the pages that I/O is attempted on like ->readpage(). -->readpages() populates the pagecache with the passed pages and starts -I/O against them. They come unlocked upon I/O completion. - ->writepage() is used for two purposes: for "memory cleansing" and for "sync". These are quite different operations and the behaviour may differ depending upon the mode. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 4f14edf93941..794bd1a66bfb 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -726,8 +726,6 @@ cache in your filesystem. The following members are defined: int (*writepages)(struct address_space *, struct writeback_control *); bool (*dirty_folio)(struct address_space *, struct folio *); void (*readahead)(struct readahead_control *); - int (*readpages)(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); @@ -817,15 +815,6 @@ cache in your filesystem. The following members are defined: completes successfully. Setting PageError on any page will be ignored; simply unlock the page if an I/O error occurs. -``readpages`` - called by the VM to read pages associated with the address_space - object. This is essentially just a vector version of readpage. - Instead of just one page, several pages are requested. - readpages is only used for read-ahead, so read errors are - ignored. If anything goes wrong, feel free to give up. - This interface is deprecated and will be removed by the end of - 2020; implement readahead instead. - ``write_begin`` Called by the generic buffered write code to ask the filesystem to prepare to write len bytes at the given offset in the file. diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 04a88bfe4fcf..998e3f180d90 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -645,7 +645,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, int ret; /* - * Lock destination range to serialize with concurrent readpages() and + * Lock destination range to serialize with concurrent readahead() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, loff, dst, dst_loff, len); @@ -739,7 +739,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, } /* - * Lock destination range to serialize with concurrent readpages() and + * Lock destination range to serialize with concurrent readahead() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, off, inode, destoff, len); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 071e2f21a7db..bc3ded4f34f6 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -597,7 +597,7 @@ CIFSSMBNegotiate(const unsigned int xid, set_credits(server, server->maxReq); /* probably no need to store and check maxvcs */ server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); - /* set up max_read for readpages check */ + /* set up max_read for readahead check */ server->max_read = server->maxBuf; server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 60d853c92f6a..2f9e7d2f81b6 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -49,7 +49,7 @@ static void cifs_set_ops(struct inode *inode) inode->i_fop = &cifs_file_ops; } - /* check if server can support readpages */ + /* check if server can support readahead */ if (cifs_sb_master_tcon(cifs_sb)->ses->server->max_read < PAGE_SIZE + MAX_CIFS_HDR_SIZE) inode->i_data.a_ops = &cifs_addr_ops_smallbuf; diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 4fcca79f39ae..526a4c1bed99 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -248,7 +248,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); * which must still be locked and not uptodate. Normally, blocksize == * PAGE_SIZE and the whole page is decrypted at once. * - * This is for use by the filesystem's ->readpages() method. + * This is for use by the filesystem's ->readahead() method. * * Return: 0 on success; -errno on failure */ diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 1aa26d6634fc..af491e170c4a 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -109,7 +109,7 @@ static void verity_work(struct work_struct *work) struct bio *bio = ctx->bio; /* - * fsverity_verify_bio() may call readpages() again, and although verity + * fsverity_verify_bio() may call readahead() again, and although verity * will be disabled for that, decryption may still be needed, causing * another bio_post_read_ctx to be allocated. So to guarantee that * mempool_alloc() never deadlocks we must free the current ctx first. diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f8fcbe91059b..c92920c8661d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -164,7 +164,7 @@ static void f2fs_verify_bio(struct work_struct *work) bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* - * fsverity_verify_bio() may call readpages() again, and while verity + * fsverity_verify_bio() may call readahead() again, and while verity * will be disabled for this, decryption and/or decompression may still * be needed, resulting in another bio_post_read_ctx being allocated. * So to prevent deadlocks we need to release the current ctx to the @@ -2392,7 +2392,7 @@ static void f2fs_readahead(struct readahead_control *rac) if (!f2fs_is_compress_backend_ready(inode)) return; - /* If the file has inline data, skip readpages */ + /* If the file has inline data, skip readahead */ if (f2fs_has_inline_data(inode)) return; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index eac4984cc753..488b460e046f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -627,7 +627,7 @@ struct fuse_conn { /** Connection successful. Only set in INIT */ unsigned conn_init:1; - /** Do readpages asynchronously? Only set in INIT */ + /** Do readahead asynchronously? Only set in INIT */ unsigned async_read:1; /** Return an unique read error after abort. Only set in INIT */ diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 0adb970f4e73..14e2fb49cff5 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Data verification functions, i.e. hooks for ->readpages() + * Data verification functions, i.e. hooks for ->readahead() * * Copyright 2019 Google LLC */ @@ -214,7 +214,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); * that fail verification are set to the Error state. Verification is skipped * for pages already in the Error state, e.g. due to fscrypt decryption failure. * - * This is a helper function for use by the ->readpages() method of filesystems + * This is a helper function for use by the ->readahead() method of filesystems * that issue bios to read data directly into the page cache. Filesystems that * populate the page cache without issuing bios (e.g. non block-based * filesystems) must instead call fsverity_verify_page() directly on each page. diff --git a/include/linux/fs.h b/include/linux/fs.h index 183160872133..7c81887cc7e8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -370,12 +370,6 @@ struct address_space_operations { /* Mark a folio dirty. Return true if this dirtied it */ bool (*dirty_folio)(struct address_space *, struct folio *); - /* - * Reads in the requested pages. Unlike ->readpage(), this is - * PURELY used for read-ahead!. - */ - int (*readpages)(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index b568b3c7d095..a7afc800bd8d 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -221,7 +221,7 @@ static inline void fsverity_enqueue_verify_work(struct work_struct *work) * * This checks whether ->i_verity_info has been set. * - * Filesystems call this from ->readpages() to check whether the pages need to + * Filesystems call this from ->readahead() to check whether the pages need to * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to * a race condition where the file is being read concurrently with * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.) diff --git a/mm/filemap.c b/mm/filemap.c index 647d72bf23b6..d904cd7e4181 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2538,7 +2538,7 @@ static int filemap_create_folio(struct file *file, * the page cache as the locked folio would then be enough to * synchronize with hole punching. But there are code paths * such as filemap_update_page() filling in partially uptodate - * pages or ->readpages() that need to hold invalidate_lock + * pages or ->readahead() that need to hold invalidate_lock * while mapping blocks for IO so let's hold the lock here as * well to keep locking rules simple. */ diff --git a/mm/readahead.c b/mm/readahead.c index 9097af639beb..297bd0719cda 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -170,13 +170,6 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, unlock_page(page); put_page(page); } - } else if (aops->readpages) { - aops->readpages(rac->file, rac->mapping, pages, - readahead_count(rac)); - /* Clean up the remaining pages */ - put_pages_list(pages); - rac->_index += rac->_nr_pages; - rac->_nr_pages = 0; } else { while ((page = readahead_page(rac))) { aops->readpage(rac->file, page); @@ -253,10 +246,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) break; - if (mapping->a_ops->readpages) { - folio->index = index + i; - list_add(&folio->lru, &page_pool); - } else if (filemap_add_folio(mapping, folio, index + i, + if (filemap_add_folio(mapping, folio, index + i, gfp_mask) < 0) { folio_put(folio); read_pages(ractl, &page_pool, true); @@ -318,8 +308,7 @@ void force_page_cache_ra(struct readahead_control *ractl, struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages, index; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && - !mapping->a_ops->readahead)) + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readahead)) return; /* -- cgit v1.2.3 From a9fcd89d67bb8c4ad613b54ab691fc603c94a03a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 14 Feb 2022 09:13:43 -0500 Subject: fs: Remove read_actor_t This typedef is not used any more. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Al Viro Acked-by: Al Viro --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7c81887cc7e8..7588d3a0ced8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -357,9 +357,6 @@ typedef struct { int error; } read_descriptor_t; -typedef int (*read_actor_t)(read_descriptor_t *, struct page *, - unsigned long, unsigned long); - struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); -- cgit v1.2.3 From b2403a61308533c576c9dd783fcb73a9186e0b37 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 14 Feb 2022 09:15:34 -0500 Subject: fs, net: Move read_descriptor_t to net.h fs.h has no more need for this typedef; networking is now the sole user of the read_descriptor_t. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Al Viro Acked-by: Al Viro --- include/linux/fs.h | 19 ------------------- include/linux/net.h | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7588d3a0ced8..8ff28939de60 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -338,25 +338,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) return kiocb->ki_complete == NULL; } -/* - * "descriptor" for what we're up to with a read. - * This allows us to use the same read code yet - * have multiple different users of the data that - * we read from a file. - * - * The simplest case just copies the data to user - * mode. - */ -typedef struct { - size_t written; - size_t count; - union { - char __user *buf; - void *data; - } arg; - int error; -} read_descriptor_t; - struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); diff --git a/include/linux/net.h b/include/linux/net.h index ba736b457a06..12093f4db50c 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -125,6 +125,25 @@ struct socket { struct socket_wq wq; }; +/* + * "descriptor" for what we're up to with a read. + * This allows us to use the same read code yet + * have multiple different users of the data that + * we read from a file. + * + * The simplest case just copies the data to user + * mode. + */ +typedef struct { + size_t written; + size_t count; + union { + char __user *buf; + void *data; + } arg; + int error; +} read_descriptor_t; + struct vm_area_struct; struct page; struct sockaddr; -- cgit v1.2.3 From 800ba29547e16d5fbe67ca764ba660e049e9f1bf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 19 Feb 2022 23:19:49 -0500 Subject: fs: Pass an iocb to generic_perform_write() We can extract both the file pointer and the pos from the iocb. This simplifies each caller as well as allowing generic_perform_write() to see more of the iocb contents in the future. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner Reviewed-by: Al Viro Acked-by: Al Viro --- fs/ceph/file.c | 2 +- fs/ext4/file.c | 2 +- fs/f2fs/file.c | 2 +- fs/nfs/file.c | 2 +- include/linux/fs.h | 2 +- mm/filemap.c | 10 ++++++---- 6 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index feb75eb1cd82..6c9e837aa1d3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1869,7 +1869,7 @@ retry_snap: * are pending vmtruncate. So write and vmtruncate * can not run at the same time */ - written = generic_perform_write(file, from, pos); + written = generic_perform_write(iocb, from); if (likely(written >= 0)) iocb->ki_pos = pos + written; ceph_end_io_write(inode); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8bd66cdc41be..6feb07e3e1eb 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -267,7 +267,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, goto out; current->backing_dev_info = inode_to_bdi(inode); - ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos); + ret = generic_perform_write(iocb, from); current->backing_dev_info = NULL; out: diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d3f39a704b8b..5b89af0f27f0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4448,7 +4448,7 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, return -EOPNOTSUPP; current->backing_dev_info = inode_to_bdi(inode); - ret = generic_perform_write(file, from, iocb->ki_pos); + ret = generic_perform_write(iocb, from); current->backing_dev_info = NULL; if (ret > 0) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index b0ca244c50d0..150b7fa8f0a7 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -646,7 +646,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) result = generic_write_checks(iocb, from); if (result > 0) { current->backing_dev_info = inode_to_bdi(inode); - result = generic_perform_write(file, from, iocb->ki_pos); + result = generic_perform_write(iocb, from); current->backing_dev_info = NULL; } nfs_end_io_write(inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index 8ff28939de60..468dc7ec821f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2999,7 +2999,7 @@ extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); -extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); +ssize_t generic_perform_write(struct kiocb *, struct iov_iter *); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); diff --git a/mm/filemap.c b/mm/filemap.c index d904cd7e4181..3a5ffb5587cd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3752,9 +3752,10 @@ out: } EXPORT_SYMBOL(generic_file_direct_write); -ssize_t generic_perform_write(struct file *file, - struct iov_iter *i, loff_t pos) +ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) { + struct file *file = iocb->ki_filp; + loff_t pos = iocb->ki_pos; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; long status = 0; @@ -3884,7 +3885,8 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) goto out; - status = generic_perform_write(file, from, pos = iocb->ki_pos); + pos = iocb->ki_pos; + status = generic_perform_write(iocb, from); /* * If generic_perform_write() returned a synchronous error * then we want to return the number of bytes which were @@ -3916,7 +3918,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) */ } } else { - written = generic_perform_write(file, from, iocb->ki_pos); + written = generic_perform_write(iocb, from); if (likely(written > 0)) iocb->ki_pos += written; } -- cgit v1.2.3 From d7414ba14a3a67f81321069219dc7dbc095022c3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 20 Feb 2022 22:28:03 -0500 Subject: filemap: Remove AOP_FLAG_CONT_EXPAND This flag is no longer used, so remove it. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Al Viro Acked-by: Al Viro --- fs/buffer.c | 3 +-- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/buffer.c b/fs/buffer.c index d67fbe063a3a..2b5561ae5d0b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2352,8 +2352,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) if (err) goto out; - err = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_CONT_EXPAND, &page, &fsdata); + err = pagecache_write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); if (err) goto out; diff --git a/include/linux/fs.h b/include/linux/fs.h index 468dc7ec821f..bbde95387a23 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -275,7 +275,6 @@ enum positive_aop_returns { AOP_TRUNCATED_PAGE = 0x80001, }; -#define AOP_FLAG_CONT_EXPAND 0x0001 /* called from cont_expand */ #define AOP_FLAG_NOFS 0x0002 /* used by filesystem to direct * helper code (eg buffer layer) * to clear GFP_FS from alloc */ -- cgit v1.2.3 From 56f5746c414d92ae8e8314f46760822b4ecf8be3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Feb 2022 09:40:54 -0500 Subject: namei: Merge page_symlink() and __page_symlink() There are no callers of __page_symlink() left, so we can remove that entry point. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner --- Documentation/filesystems/porting.rst | 2 +- fs/namei.c | 13 ++----------- include/linux/fs.h | 2 -- 3 files changed, 3 insertions(+), 14 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 7c1583dbeb59..2e0e4f0e0c6f 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -624,7 +624,7 @@ any symlink that might use page_follow_link_light/page_put_link() must have inode_nohighmem(inode) called before anything might start playing with its pagecache. No highmem pages should end up in the pagecache of such symlinks. That includes any preseeding that might be done during symlink -creation. __page_symlink() will honour the mapping gfp flags, so once +creation. page_symlink() will honour the mapping gfp flags, so once you've done inode_nohighmem() it's safe to use, but if you allocate and insert the page manually, make sure to use the right gfp flags. diff --git a/fs/namei.c b/fs/namei.c index 509657fdf4f5..6153581073b1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5001,12 +5001,10 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) } EXPORT_SYMBOL(page_readlink); -/* - * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS - */ -int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) +int page_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; + bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); struct page *page; void *fsdata; int err; @@ -5034,13 +5032,6 @@ retry: fail: return err; } -EXPORT_SYMBOL(__page_symlink); - -int page_symlink(struct inode *inode, const char *symname, int len) -{ - return __page_symlink(inode, symname, len, - !mapping_gfp_constraint(inode->i_mapping, __GFP_FS)); -} EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..e108aff23a28 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3109,8 +3109,6 @@ extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); -extern int __page_symlink(struct inode *inode, const char *symname, int len, - int nofs); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); -- cgit v1.2.3 From 236d93c4bf2d6da83241cc8e4625e89d9604cb43 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Feb 2022 10:40:11 -0500 Subject: fs: Remove AOP_FLAG_NOFS With all users of this flag gone, we can stop testing whether it's set. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- fs/netfs/buffered_read.c | 6 +----- include/linux/fs.h | 4 ---- mm/folio-compat.c | 2 -- 3 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 281a88a5b8dc..65c17c5a5567 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -302,7 +302,6 @@ zero_out: * @mapping: The mapping to read from * @pos: File position at which the write will begin * @len: The length of the write (may extend beyond the end of the folio chosen) - * @aop_flags: AOP_* flags * @_folio: Where to put the resultant folio * @_fsdata: Place for the netfs to store a cookie * @@ -335,16 +334,13 @@ int netfs_write_begin(struct file *file, struct address_space *mapping, struct netfs_io_request *rreq; struct netfs_i_context *ctx = netfs_i_context(file_inode(file )); struct folio *folio; - unsigned int fgp_flags; + unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; pgoff_t index = pos >> PAGE_SHIFT; int ret; DEFINE_READAHEAD(ractl, file, NULL, mapping, index); retry: - fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - if (aop_flags & AOP_FLAG_NOFS) - fgp_flags |= FGP_NOFS; folio = __filemap_get_folio(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); if (!folio) diff --git a/include/linux/fs.h b/include/linux/fs.h index e108aff23a28..f81bc5cbcbb6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -275,10 +275,6 @@ enum positive_aop_returns { AOP_TRUNCATED_PAGE = 0x80001, }; -#define AOP_FLAG_NOFS 0x0002 /* used by filesystem to direct - * helper code (eg buffer layer) - * to clear GFP_FS from alloc */ - /* * oh the beauties of C type declarations. */ diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 46fa179e32fb..3e42ddb81918 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -135,8 +135,6 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, { unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - if (flags & AOP_FLAG_NOFS) - fgp_flags |= FGP_NOFS; return pagecache_get_page(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); } -- cgit v1.2.3 From 9d6b0cd7579844761ed68926eb3073bab1dca87b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Feb 2022 14:31:43 -0500 Subject: fs: Remove flags parameter from aops->write_begin There are no more aop flags left, so remove the parameter. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- Documentation/filesystems/locking.rst | 2 +- Documentation/filesystems/vfs.rst | 5 +---- block/fops.c | 3 +-- fs/9p/vfs_addr.c | 2 +- fs/adfs/inode.c | 2 +- fs/affs/file.c | 6 +++--- fs/afs/internal.h | 2 +- fs/afs/write.c | 2 +- fs/bfs/file.c | 2 +- fs/ceph/addr.c | 2 +- fs/cifs/file.c | 2 +- fs/ecryptfs/mmap.c | 2 +- fs/exfat/inode.c | 2 +- fs/ext2/inode.c | 6 ++---- fs/ext4/inode.c | 10 +++++----- fs/f2fs/data.c | 5 ++--- fs/f2fs/super.c | 2 +- fs/fat/inode.c | 2 +- fs/fuse/file.c | 3 +-- fs/hfs/inode.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/hostfs/hostfs_kern.c | 2 +- fs/hpfs/file.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/jffs2/file.c | 4 ++-- fs/jfs/inode.c | 2 +- fs/libfs.c | 2 +- fs/minix/inode.c | 2 +- fs/nfs/file.c | 2 +- fs/nilfs2/inode.c | 2 +- fs/ntfs3/inode.c | 2 +- fs/ocfs2/aops.c | 2 +- fs/omfs/file.c | 2 +- fs/orangefs/inode.c | 5 ++--- fs/reiserfs/inode.c | 2 +- fs/sysv/itree.c | 2 +- fs/ubifs/file.c | 7 +++---- fs/udf/file.c | 2 +- fs/udf/inode.c | 2 +- fs/ufs/inode.c | 2 +- include/linux/fs.h | 4 ++-- include/trace/events/ext4.h | 21 ++++++++------------- include/trace/events/f2fs.h | 12 ++++-------- mm/filemap.c | 6 ++---- mm/shmem.c | 2 +- 45 files changed, 69 insertions(+), 90 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index c26d854275a0..fd9d9caf09ab 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -242,7 +242,7 @@ prototypes:: bool (*dirty_folio)(struct address_space *, struct folio *folio); void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 794bd1a66bfb..30f303180a7d 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -727,7 +727,7 @@ cache in your filesystem. The following members are defined: bool (*dirty_folio)(struct address_space *, struct folio *); void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, @@ -832,9 +832,6 @@ cache in your filesystem. The following members are defined: passed to write_begin is greater than the number of bytes copied into the page). - flags is a field for AOP_FLAG_xxx flags, described in - include/linux/fs.h. - A void * may be returned in fsdata, which then gets passed into write_end. diff --git a/block/fops.c b/block/fops.c index b432756570c6..712affe56e29 100644 --- a/block/fops.c +++ b/block/fops.c @@ -398,8 +398,7 @@ static void blkdev_readahead(struct readahead_control *rac) } static int blkdev_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, struct page **pagep, - void **fsdata) + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { return block_write_begin(mapping, pos, len, pagep, blkdev_get_block); } diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index d311e68e21fd..a2d57112f53e 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -260,7 +260,7 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } static int v9fs_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int flags, + loff_t pos, unsigned int len, struct page **subpagep, void **fsdata) { int retval; diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index b6912496bb19..f7959b1a2d52 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -52,7 +52,7 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to) } static int adfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/affs/file.c b/fs/affs/file.c index 06645d05c717..b952f65c3f06 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -414,7 +414,7 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } static int affs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; @@ -650,7 +650,7 @@ affs_readpage_ofs(struct file *file, struct page *page) } static int affs_write_begin_ofs(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; @@ -887,7 +887,7 @@ affs_truncate(struct inode *inode) loff_t isize = inode->i_size; int res; - res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, 0, &page, &fsdata); + res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &page, &fsdata); if (!res) res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata); else diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 7b7ef945dc78..7a72e9c60423 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1535,7 +1535,7 @@ bool afs_dirty_folio(struct address_space *, struct folio *); #define afs_dirty_folio filemap_dirty_folio #endif extern int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata); extern int afs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, diff --git a/fs/afs/write.c b/fs/afs/write.c index af496c98d394..5224e346fbad 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -42,7 +42,7 @@ static void afs_folio_start_fscache(bool caching, struct folio *folio) * prepare to perform part of a write to a page */ int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **_page, void **fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 9408f45225cb..dc97c9b8f23b 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -169,7 +169,7 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to) } static int bfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 415f0886bc25..e65541a51b68 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1311,7 +1311,7 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned * clean, or already dirty within the same snap context. */ static int ceph_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned aop_flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 91aeae7fced8..da362b5a0c96 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4681,7 +4681,7 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file) } static int cifs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int oncethru = 0; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 84e399a921ad..47904d40ef88 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -264,7 +264,7 @@ out: */ static int ecryptfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 8ed3c4b700cd..b9f63113db2d 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -389,7 +389,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) } static int exfat_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int flags, + loff_t pos, unsigned int len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index bfa69c52ce2c..d8ca8050945a 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -887,8 +887,7 @@ static void ext2_readahead(struct readahead_control *rac) static int ext2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; @@ -912,8 +911,7 @@ static int ext2_write_end(struct file *file, struct address_space *mapping, static int ext2_nobh_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 512d8143c765..d3a7e8581291 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1130,7 +1130,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, #endif static int ext4_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; @@ -1144,7 +1144,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - trace_ext4_write_begin(inode, pos, len, flags); + trace_ext4_write_begin(inode, pos, len); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason @@ -2931,7 +2931,7 @@ static int ext4_nonda_switch(struct super_block *sb) } static int ext4_da_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret, retries = 0; @@ -2948,10 +2948,10 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, - len, flags, pagep, fsdata); + len, pagep, fsdata); } *fsdata = (void *)0; - trace_ext4_da_write_begin(inode, pos, len, flags); + trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9a1a526f2092..b3cf49136b9f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3314,8 +3314,7 @@ unlock_out: } static int f2fs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -3325,7 +3324,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, block_t blkaddr = NULL_ADDR; int err = 0; - trace_f2fs_write_begin(inode, pos, len, flags); + trace_f2fs_write_begin(inode, pos, len); if (!f2fs_is_checkpoint_ready(sbi)) { err = -ENOSPC; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4368f90571bd..ed3e8b7a8260 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2483,7 +2483,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite); retry: - err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, + err = a_ops->write_begin(NULL, mapping, off, tocopy, &page, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 9b34ccef2501..1f15b0fd1bb0 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -226,7 +226,7 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) } static int fat_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int err; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e35e394264ad..bca8c2135ec5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2273,8 +2273,7 @@ out: * but how to implement it without killing performance need more thinking. */ static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; struct fuse_conn *fc = get_fuse_conn(file_inode(file)); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 396735dd3407..93d9aa832139 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -50,7 +50,7 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to) } static int hfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 435b6202532a..73010aa4623f 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -44,7 +44,7 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to) } static int hfsplus_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2bfd316e1bf1..e658d8edde35 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -463,7 +463,7 @@ static int hostfs_readpage(struct file *file, struct page *page) } static int hostfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 8740b4ea0b52..8b590b3826c3 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -194,7 +194,7 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to) } static int hpfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dd3a088db11d..2de9ca5d260d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -383,7 +383,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) static int hugetlbfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { return -EINVAL; diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 142d3ba9f0a8..2b35811772de 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -25,7 +25,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *pg, void *fsdata); static int jffs2_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata); static int jffs2_readpage (struct file *filp, struct page *pg); @@ -130,7 +130,7 @@ static int jffs2_readpage (struct file *filp, struct page *pg) } static int jffs2_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct page *pg; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index e16f77b4e84c..aa9f112107b2 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -314,7 +314,7 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to) } static int jfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/libfs.c b/fs/libfs.c index d4395e1c6696..a1c10d3163e0 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -549,7 +549,7 @@ static int simple_readpage(struct file *file, struct page *page) } int simple_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct page *page; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 5e8d7ba661cf..3add78bccedc 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -423,7 +423,7 @@ static void minix_write_failed(struct address_space *mapping, loff_t to) } static int minix_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d66088dd33e7..314d2d7ba84a 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -313,7 +313,7 @@ static bool nfs_want_read_modify_write(struct file *file, struct page *page, * increment the page use counts until he is done with the page. */ static int nfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index be09a0d10f04..02297ec8dc55 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -248,7 +248,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to) } static int nilfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 16466c8648f3..1364174cc6c9 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -862,7 +862,7 @@ static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn, } static int ntfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, u32 flags, struct page **pagep, + loff_t pos, u32 len, struct page **pagep, void **fsdata) { int err; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 4b9af65cb61b..7cffe9dcad17 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1881,7 +1881,7 @@ out: } static int ocfs2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 349b96d89c44..980b0a72c172 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -316,7 +316,7 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to) } static int omfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 809690db8be2..bc7ccd15d7a3 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -326,9 +326,8 @@ static int orangefs_readpage(struct file *file, struct page *page) } static int orangefs_write_begin(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, struct page **pagep, - void **fsdata) + struct address_space *mapping, loff_t pos, unsigned len, + struct page **pagep, void **fsdata) { struct orangefs_write_range *wr; struct folio *folio; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index aa31cf1dbba6..46ba4892030a 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2753,7 +2753,7 @@ static void reiserfs_truncate_failed_write(struct inode *inode) static int reiserfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 96b7fd4facf3..96ad24fe0ffb 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -477,7 +477,7 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to) } static int sysv_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 0911fc311434..81c085c4decf 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -215,8 +215,7 @@ static void release_existing_page_budget(struct ubifs_info *c) } static int write_begin_slow(struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, - unsigned flags) + loff_t pos, unsigned len, struct page **pagep) { struct inode *inode = mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -419,7 +418,7 @@ static int allocate_budget(struct ubifs_info *c, struct page *page, * without forcing write-back. The slow path does not make this assumption. */ static int ubifs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; @@ -493,7 +492,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); put_page(page); - return write_begin_slow(mapping, pos, len, pagep, flags); + return write_begin_slow(mapping, pos, len, pagep); } /* diff --git a/fs/udf/file.c b/fs/udf/file.c index 724bb3141fda..3f4d5c44c784 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -87,7 +87,7 @@ static int udf_adinicb_writepage(struct page *page, static int udf_adinicb_write_begin(struct file *file, struct address_space *mapping, loff_t pos, - unsigned len, unsigned flags, struct page **pagep, + unsigned len, struct page **pagep, void **fsdata) { struct page *page; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 88a95886ce8a..866f9a53248e 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -204,7 +204,7 @@ static void udf_readahead(struct readahead_control *rac) } static int udf_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index bd0e0c66f93d..6c973b71cab2 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -495,7 +495,7 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to) } static int ufs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; diff --git a/include/linux/fs.h b/include/linux/fs.h index f81bc5cbcbb6..a0e73432526f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -346,7 +346,7 @@ struct address_space_operations { void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, @@ -3179,7 +3179,7 @@ extern int noop_fsync(struct file *, loff_t, loff_t, int); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata); extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d06ffffad434..229e8fae66a3 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -335,17 +335,15 @@ TRACE_EVENT(ext4_begin_ordered_truncate, DECLARE_EVENT_CLASS(ext4__write_begin, - TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, - unsigned int flags), + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), - TP_ARGS(inode, pos, len, flags), + TP_ARGS(inode, pos, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) - __field( unsigned int, flags ) ), TP_fast_assign( @@ -353,29 +351,26 @@ DECLARE_EVENT_CLASS(ext4__write_begin, __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; - __entry->flags = flags; ), - TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u", + TP_printk("dev %d,%d ino %lu pos %lld len %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->pos, __entry->len, __entry->flags) + __entry->pos, __entry->len) ); DEFINE_EVENT(ext4__write_begin, ext4_write_begin, - TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, - unsigned int flags), + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), - TP_ARGS(inode, pos, len, flags) + TP_ARGS(inode, pos, len) ); DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin, - TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, - unsigned int flags), + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), - TP_ARGS(inode, pos, len, flags) + TP_ARGS(inode, pos, len) ); DECLARE_EVENT_CLASS(ext4__write_end, diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 1779e133cea0..bea654a85e6b 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1159,17 +1159,15 @@ DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_write_bio, TRACE_EVENT(f2fs_write_begin, - TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, - unsigned int flags), + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), - TP_ARGS(inode, pos, len, flags), + TP_ARGS(inode, pos, len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, pos) __field(unsigned int, len) - __field(unsigned int, flags) ), TP_fast_assign( @@ -1177,14 +1175,12 @@ TRACE_EVENT(f2fs_write_begin, __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; - __entry->flags = flags; ), - TP_printk("dev = (%d,%d), ino = %lu, pos = %llu, len = %u, flags = %u", + TP_printk("dev = (%d,%d), ino = %lu, pos = %llu, len = %u", show_dev_ino(__entry), (unsigned long long)__entry->pos, - __entry->len, - __entry->flags) + __entry->len) ); TRACE_EVENT(f2fs_write_end, diff --git a/mm/filemap.c b/mm/filemap.c index 9a1eef6c5d35..0751843b052f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3628,8 +3628,7 @@ int pagecache_write_begin(struct file *file, struct address_space *mapping, { const struct address_space_operations *aops = mapping->a_ops; - return aops->write_begin(file, mapping, pos, len, flags, - pagep, fsdata); + return aops->write_begin(file, mapping, pos, len, pagep, fsdata); } EXPORT_SYMBOL(pagecache_write_begin); @@ -3754,7 +3753,6 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) const struct address_space_operations *a_ops = mapping->a_ops; long status = 0; ssize_t written = 0; - unsigned int flags = 0; do { struct page *page; @@ -3784,7 +3782,7 @@ again: break; } - status = a_ops->write_begin(file, mapping, pos, bytes, flags, + status = a_ops->write_begin(file, mapping, pos, bytes, &page, &fsdata); if (unlikely(status < 0)) break; diff --git a/mm/shmem.c b/mm/shmem.c index 4b2fea33158e..0f557a512171 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2426,7 +2426,7 @@ static int shmem_initxattrs(struct inode *, const struct xattr *, void *); static int shmem_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; -- cgit v1.2.3 From 84a1041c60ff8f648a09d28af7b2e50a8f6345ed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 3 Mar 2022 15:00:20 -0500 Subject: fs: Remove pagecache_write_begin() and pagecache_write_end() These wrappers have no more users; remove them. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/fs.h | 12 ------------ mm/filemap.c | 20 -------------------- 2 files changed, 32 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index a0e73432526f..b35ce086a7a1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -380,18 +380,6 @@ struct address_space_operations { extern const struct address_space_operations empty_aops; -/* - * pagecache_write_begin/pagecache_write_end must be used by general code - * to write into the pagecache. - */ -int pagecache_write_begin(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata); - -int pagecache_write_end(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); - /** * struct address_space - Contents of a cacheable, mappable object. * @host: Owner, either the inode or the block_device. diff --git a/mm/filemap.c b/mm/filemap.c index 0751843b052f..c15cfc28f9ce 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3622,26 +3622,6 @@ struct page *read_cache_page_gfp(struct address_space *mapping, } EXPORT_SYMBOL(read_cache_page_gfp); -int pagecache_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - const struct address_space_operations *aops = mapping->a_ops; - - return aops->write_begin(file, mapping, pos, len, pagep, fsdata); -} -EXPORT_SYMBOL(pagecache_write_begin); - -int pagecache_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - const struct address_space_operations *aops = mapping->a_ops; - - return aops->write_end(file, mapping, pos, len, copied, page, fsdata); -} -EXPORT_SYMBOL(pagecache_write_end); - /* * Warn about a page cache invalidation failure during a direct I/O write. */ -- cgit v1.2.3 From 520f301c54faa3484e820b80d4505d48ee587163 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Jan 2022 14:35:22 -0500 Subject: fs: Convert is_dirty_writeback() to take a folio Pass a folio instead of a page to aops->is_dirty_writeback(). Convert both implementations and the caller. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- Documentation/filesystems/vfs.rst | 10 +++++----- fs/buffer.c | 16 ++++++++-------- fs/nfs/file.c | 21 +++++++++------------ include/linux/buffer_head.h | 2 +- include/linux/fs.h | 2 +- mm/vmscan.c | 2 +- 6 files changed, 25 insertions(+), 28 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 30f303180a7d..469882f72fc1 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -747,7 +747,7 @@ cache in your filesystem. The following members are defined: bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); - void (*is_dirty_writeback) (struct page *, bool *, bool *); + void (*is_dirty_writeback)(struct folio *, bool *, bool *); int (*error_remove_page) (struct mapping *mapping, struct page *page); int (*swap_activate)(struct file *); int (*swap_deactivate)(struct file *); @@ -932,14 +932,14 @@ cache in your filesystem. The following members are defined: without needing I/O to bring the whole page up to date. ``is_dirty_writeback`` - Called by the VM when attempting to reclaim a page. The VM uses + Called by the VM when attempting to reclaim a folio. The VM uses dirty and writeback information to determine if it needs to stall to allow flushers a chance to complete some IO. - Ordinarily it can use PageDirty and PageWriteback but some - filesystems have more complex state (unstable pages in NFS + Ordinarily it can use folio_test_dirty and folio_test_writeback but + some filesystems have more complex state (unstable folios in NFS prevent reclaim) or do not set those flags due to locking problems. This callback allows a filesystem to indicate to the - VM if a page should be treated as dirty or writeback for the + VM if a folio should be treated as dirty or writeback for the purposes of stalling. ``error_remove_page`` diff --git a/fs/buffer.c b/fs/buffer.c index d538495a0553..fb4df259c92d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -79,26 +79,26 @@ void unlock_buffer(struct buffer_head *bh) EXPORT_SYMBOL(unlock_buffer); /* - * Returns if the page has dirty or writeback buffers. If all the buffers - * are unlocked and clean then the PageDirty information is stale. If - * any of the pages are locked, it is assumed they are locked for IO. + * Returns if the folio has dirty or writeback buffers. If all the buffers + * are unlocked and clean then the folio_test_dirty information is stale. If + * any of the buffers are locked, it is assumed they are locked for IO. */ -void buffer_check_dirty_writeback(struct page *page, +void buffer_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct buffer_head *head, *bh; *dirty = false; *writeback = false; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); - if (!page_has_buffers(page)) + head = folio_buffers(folio); + if (!head) return; - if (PageWriteback(page)) + if (folio_test_writeback(folio)) *writeback = true; - head = page_buffers(page); bh = head; do { if (buffer_locked(bh)) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 314d2d7ba84a..f05c4b18b681 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -430,19 +430,16 @@ static int nfs_release_page(struct page *page, gfp_t gfp) return nfs_fscache_release_page(page, gfp); } -static void nfs_check_dirty_writeback(struct page *page, +static void nfs_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct nfs_inode *nfsi; - struct address_space *mapping = page_file_mapping(page); - - if (!mapping || PageSwapCache(page)) - return; + struct address_space *mapping = folio->mapping; /* - * Check if an unstable page is currently being committed and - * if so, have the VM treat it as if the page is under writeback - * so it will not block due to pages that will shortly be freeable. + * Check if an unstable folio is currently being committed and + * if so, have the VM treat it as if the folio is under writeback + * so it will not block due to folios that will shortly be freeable. */ nfsi = NFS_I(mapping->host); if (atomic_read(&nfsi->commit_info.rpcs_out)) { @@ -451,11 +448,11 @@ static void nfs_check_dirty_writeback(struct page *page, } /* - * If PagePrivate() is set, then the page is not freeable and as the - * inode is not being committed, it's not going to be cleaned in the - * near future so treat it as dirty + * If the private flag is set, then the folio is not freeable + * and as the inode is not being committed, it's not going to + * be cleaned in the near future so treat it as dirty */ - if (PagePrivate(page)) + if (folio_test_private(folio)) *dirty = true; } diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6e5a64005fef..805c4e12700a 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -146,7 +146,7 @@ BUFFER_FNS(Defer_Completion, defer_completion) #define page_has_buffers(page) PagePrivate(page) #define folio_buffers(folio) folio_get_private(folio) -void buffer_check_dirty_writeback(struct page *page, +void buffer_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback); /* diff --git a/include/linux/fs.h b/include/linux/fs.h index b35ce086a7a1..2be852661a29 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -369,7 +369,7 @@ struct address_space_operations { int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); - void (*is_dirty_writeback) (struct page *, bool *, bool *); + void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); int (*error_remove_page)(struct address_space *, struct page *); /* swapfile support */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 1678802e03e7..27851232e00c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1451,7 +1451,7 @@ static void folio_check_dirty_writeback(struct folio *folio, mapping = folio_mapping(folio); if (mapping && mapping->a_ops->is_dirty_writeback) - mapping->a_ops->is_dirty_writeback(&folio->page, dirty, writeback); + mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } static struct page *alloc_demote_page(struct page *page, unsigned long node) -- cgit v1.2.3 From 5efe7448a1426250b5747c10ad438517f44f1e51 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 08:43:23 -0400 Subject: fs: Introduce aops->read_folio Change all the callers of ->readpage to call ->read_folio in preference, if it exists. This is a transitional duplication, and will be removed by the end of the series. Signed-off-by: Matthew Wilcox (Oracle) --- fs/btrfs/file.c | 2 +- fs/buffer.c | 5 ++++- fs/ceph/addr.c | 2 +- include/linux/fs.h | 1 + kernel/events/uprobes.c | 6 ++++-- mm/filemap.c | 9 +++++++-- mm/readahead.c | 14 +++++++++----- mm/swapfile.c | 2 +- 8 files changed, 28 insertions(+), 13 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 380054c94e4b..59510d7b1c65 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2401,7 +2401,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) { struct address_space *mapping = filp->f_mapping; - if (!mapping->a_ops->readpage) + if (!mapping->a_ops->readpage && !mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(filp); diff --git a/fs/buffer.c b/fs/buffer.c index 9737e0dbe3ec..225d03cd622d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2824,7 +2824,10 @@ int nobh_truncate_page(struct address_space *mapping, /* Ok, it's mapped. Make sure it's up-to-date */ if (!folio_test_uptodate(folio)) { - err = mapping->a_ops->readpage(NULL, &folio->page); + if (mapping->a_ops->read_folio) + err = mapping->a_ops->read_folio(NULL, folio); + else + err = mapping->a_ops->readpage(NULL, &folio->page); if (err) { folio_put(folio); goto out; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e65541a51b68..42bba2b5d98b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1772,7 +1772,7 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; - if (!mapping->a_ops->readpage) + if (!mapping->a_ops->readpage && !mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ceph_vmops; diff --git a/include/linux/fs.h b/include/linux/fs.h index 2be852661a29..5ad942183a2c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -336,6 +336,7 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); + int (*read_folio)(struct file *, struct folio *); /* Write back some dirty pages from this mapping. */ int (*writepages)(struct address_space *, struct writeback_control *); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6418083901d4..2c7815d20038 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -790,7 +790,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, * and in page-cache. If ->readpage == NULL it must be shmem_mapping(), * see uprobe_register(). */ - if (mapping->a_ops->readpage) + if (mapping->a_ops->read_folio || mapping->a_ops->readpage) page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); @@ -1143,7 +1143,9 @@ static int __uprobe_register(struct inode *inode, loff_t offset, return -EINVAL; /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ - if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping)) + if (!inode->i_mapping->a_ops->read_folio && + !inode->i_mapping->a_ops->readpage && + !shmem_mapping(inode->i_mapping)) return -EIO; /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) diff --git a/mm/filemap.c b/mm/filemap.c index c15cfc28f9ce..96e3d7ffd98e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2419,7 +2419,10 @@ static int filemap_read_folio(struct file *file, struct address_space *mapping, */ folio_clear_error(folio); /* Start the actual read. The read will unlock the page. */ - error = mapping->a_ops->readpage(file, &folio->page); + if (mapping->a_ops->read_folio) + error = mapping->a_ops->read_folio(file, folio); + else + error = mapping->a_ops->readpage(file, &folio->page); if (error) return error; @@ -3447,7 +3450,7 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; - if (!mapping->a_ops->readpage) + if (!mapping->a_ops->read_folio && !mapping->a_ops->readpage) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; @@ -3505,6 +3508,8 @@ repeat: filler: if (filler) err = filler(data, &folio->page); + else if (mapping->a_ops->read_folio) + err = mapping->a_ops->read_folio(data, folio); else err = mapping->a_ops->readpage(data, &folio->page); diff --git a/mm/readahead.c b/mm/readahead.c index 60a28af25c4e..76024c20a5a5 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -15,7 +15,7 @@ * explicitly requested by the application. Readahead only ever * attempts to read folios that are not yet in the page cache. If a * folio is present but not up-to-date, readahead will not try to read - * it. In that case a simple ->readpage() will be requested. + * it. In that case a simple ->read_folio() will be requested. * * Readahead is triggered when an application read request (whether a * system call or a page fault) finds that the requested folio is not in @@ -78,7 +78,7 @@ * address space operation, for which mpage_readahead() is a canonical * implementation. ->readahead() should normally initiate reads on all * folios, but may fail to read any or all folios without causing an I/O - * error. The page cache reading code will issue a ->readpage() request + * error. The page cache reading code will issue a ->read_folio() request * for any folio which ->readahead() did not read, and only an error * from this will be final. * @@ -110,7 +110,7 @@ * were not fetched with readahead_folio(). This will allow a * subsequent synchronous readahead request to try them again. If they * are left in the page cache, then they will be read individually using - * ->readpage() which may be less efficient. + * ->read_folio() which may be less efficient. */ #include @@ -170,8 +170,11 @@ static void read_pages(struct readahead_control *rac) } folio_unlock(folio); } + } else if (aops->read_folio) { + while ((folio = readahead_folio(rac)) != NULL) + aops->read_folio(rac->file, folio); } else { - while ((folio = readahead_folio(rac))) + while ((folio = readahead_folio(rac)) != NULL) aops->readpage(rac->file, &folio->page); } @@ -302,7 +305,8 @@ void force_page_cache_ra(struct readahead_control *ractl, struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages, index; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readahead)) + if (unlikely(!mapping->a_ops->read_folio && + !mapping->a_ops->readpage && !mapping->a_ops->readahead)) return; /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 63c61f8b2611..7c19098b8b45 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3041,7 +3041,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) /* * Read the swap header. */ - if (!mapping->a_ops->readpage) { + if (!mapping->a_ops->read_folio && !mapping->a_ops->readpage) { error = -EINVAL; goto bad_swap_unlock_inode; } -- cgit v1.2.3 From 7e0a126519b82648b254afcd95a168c15f65ea40 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 11:53:28 -0400 Subject: mm,fs: Remove aops->readpage With all implementations of aops->readpage converted to aops->read_folio, we can stop checking whether it's set and remove the member from aops. Signed-off-by: Matthew Wilcox (Oracle) --- fs/btrfs/file.c | 2 +- fs/buffer.c | 5 +---- fs/ceph/addr.c | 2 +- include/linux/fs.h | 3 +-- kernel/events/uprobes.c | 5 ++--- mm/filemap.c | 15 +++++---------- mm/memory.c | 4 ++-- mm/readahead.c | 12 ++++-------- mm/shmem.c | 2 +- mm/swapfile.c | 2 +- 10 files changed, 19 insertions(+), 33 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 373df5ebaf8d..57fba5abb059 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2402,7 +2402,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) { struct address_space *mapping = filp->f_mapping; - if (!mapping->a_ops->readpage && !mapping->a_ops->read_folio) + if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(filp); diff --git a/fs/buffer.c b/fs/buffer.c index ec0c52c8848e..786ef5b98c80 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2827,10 +2827,7 @@ int nobh_truncate_page(struct address_space *mapping, /* Ok, it's mapped. Make sure it's up-to-date */ if (!folio_test_uptodate(folio)) { - if (mapping->a_ops->read_folio) - err = mapping->a_ops->read_folio(NULL, folio); - else - err = mapping->a_ops->readpage(NULL, &folio->page); + err = mapping->a_ops->read_folio(NULL, folio); if (err) { folio_put(folio); goto out; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index be3e47784f08..e040b92bb17c 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1772,7 +1772,7 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; - if (!mapping->a_ops->readpage && !mapping->a_ops->read_folio) + if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ceph_vmops; diff --git a/include/linux/fs.h b/include/linux/fs.h index 5ad942183a2c..f812f5aa07dd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -262,7 +262,7 @@ struct iattr { * trying again. The aop will be taking reasonable * precautions not to livelock. If the caller held a page * reference, it should drop it before retrying. Returned - * by readpage(). + * by read_folio(). * * address_space_operation functions return these large constants to indicate * special semantics to the caller. These are much larger than the bytes in a @@ -335,7 +335,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); - int (*readpage)(struct file *, struct page *); int (*read_folio)(struct file *, struct folio *); /* Write back some dirty pages from this mapping. */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2c7815d20038..a9bc3c98f76a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -787,10 +787,10 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, struct page *page; /* * Ensure that the page that has the original instruction is populated - * and in page-cache. If ->readpage == NULL it must be shmem_mapping(), + * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), * see uprobe_register(). */ - if (mapping->a_ops->read_folio || mapping->a_ops->readpage) + if (mapping->a_ops->read_folio) page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); @@ -1144,7 +1144,6 @@ static int __uprobe_register(struct inode *inode, loff_t offset, /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio && - !inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping)) return -EIO; /* Racy, just to catch the obvious mistakes */ diff --git a/mm/filemap.c b/mm/filemap.c index 96e3d7ffd98e..079f8cca7959 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2414,15 +2414,12 @@ static int filemap_read_folio(struct file *file, struct address_space *mapping, /* * A previous I/O error may have been due to temporary failures, - * eg. multipath errors. PG_error will be set again if readpage + * eg. multipath errors. PG_error will be set again if read_folio * fails. */ folio_clear_error(folio); /* Start the actual read. The read will unlock the page. */ - if (mapping->a_ops->read_folio) - error = mapping->a_ops->read_folio(file, folio); - else - error = mapping->a_ops->readpage(file, &folio->page); + error = mapping->a_ops->read_folio(file, folio); if (error) return error; @@ -2639,7 +2636,7 @@ err: * @already_read: Number of bytes already read by the caller. * * Copies data from the page cache. If the data is not currently present, - * uses the readahead and readpage address_space operations to fetch it. + * uses the readahead and read_folio address_space operations to fetch it. * * Return: Total number of bytes copied, including those already read by * the caller. If an error happens before any bytes are copied, returns @@ -3450,7 +3447,7 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; - if (!mapping->a_ops->read_folio && !mapping->a_ops->readpage) + if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; @@ -3508,10 +3505,8 @@ repeat: filler: if (filler) err = filler(data, &folio->page); - else if (mapping->a_ops->read_folio) - err = mapping->a_ops->read_folio(data, folio); else - err = mapping->a_ops->readpage(data, &folio->page); + err = mapping->a_ops->read_folio(data, folio); if (err < 0) { folio_put(folio); diff --git a/mm/memory.c b/mm/memory.c index 76e3af9639d9..2a12028a3749 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -555,11 +555,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_page(page, "bad pte"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); - pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", + pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, - mapping ? mapping->a_ops->readpage : NULL); + mapping ? mapping->a_ops->read_folio : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } diff --git a/mm/readahead.c b/mm/readahead.c index 76024c20a5a5..39983a3a93f0 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -170,12 +170,9 @@ static void read_pages(struct readahead_control *rac) } folio_unlock(folio); } - } else if (aops->read_folio) { - while ((folio = readahead_folio(rac)) != NULL) - aops->read_folio(rac->file, folio); } else { while ((folio = readahead_folio(rac)) != NULL) - aops->readpage(rac->file, &folio->page); + aops->read_folio(rac->file, folio); } blk_finish_plug(&plug); @@ -256,8 +253,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, } /* - * Now start the IO. We ignore I/O errors - if the page is not - * uptodate then the caller will launch readpage again, and + * Now start the IO. We ignore I/O errors - if the folio is not + * uptodate then the caller will launch read_folio again, and * will then handle the error. */ read_pages(ractl); @@ -305,8 +302,7 @@ void force_page_cache_ra(struct readahead_control *ractl, struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages, index; - if (unlikely(!mapping->a_ops->read_folio && - !mapping->a_ops->readpage && !mapping->a_ops->readahead)) + if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead)) return; /* diff --git a/mm/shmem.c b/mm/shmem.c index 0f557a512171..f3e8de8ff75c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4162,7 +4162,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", * with any new page allocations done using the specified allocation flags. - * But read_cache_page_gfp() uses the ->readpage() method: which does not + * But read_cache_page_gfp() uses the ->read_folio() method: which does not * suit tmpfs, since it may have pages in swapcache, and needs to find those * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. * diff --git a/mm/swapfile.c b/mm/swapfile.c index 7c19098b8b45..ecd45bdbad9b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3041,7 +3041,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) /* * Read the swap header. */ - if (!mapping->a_ops->read_folio && !mapping->a_ops->readpage) { + if (!mapping->a_ops->read_folio) { error = -EINVAL; goto bad_swap_unlock_inode; } -- cgit v1.2.3 From e1209d3a7a67c281260ba9989621060ba7328b8c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:48 -0700 Subject: mm: introduce ->swap_rw and use it for reads from SWP_FS_OPS swap-space swap currently uses ->readpage to read swap pages. This can only request one page at a time from the filesystem, which is not most efficient. swap uses ->direct_IO for writes which while this is adequate is an inappropriate over-loading. ->direct_IO may need to had handle allocate space for holes or other details that are not relevant for swap. So this patch introduces a new address_space operation: ->swap_rw. In this patch it is used for reads, and a subsequent patch will switch writes to use it. No filesystem yet supports ->swap_rw, but that is not a problem because no filesystem actually works with filesystem-based swap. Only two filesystems set SWP_FS_OPS: - cifs sets the flag, but ->direct_IO always fails so swap cannot work. - nfs sets the flag, but ->direct_IO calls generic_write_checks() which has failed on swap files for several releases. To ensure that a NULL ->swap_rw isn't called, ->activate_swap() for both NFS and cifs are changed to fail if ->swap_rw is not set. This can be removed if/when the function is added. Future patches will restore swap-over-NFS functionality. To submit an async read with ->swap_rw() we need to allocate a structure to hold the kiocb and other details. swap_readpage() cannot handle transient failure, so we create a mempool to provide the structures. Link: https://lkml.kernel.org/r/164859778125.29473.13430559328221330589.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- fs/cifs/file.c | 4 ++++ fs/nfs/file.c | 4 ++++ include/linux/fs.h | 1 + mm/page_io.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++----- mm/swap.h | 1 + mm/swapfile.c | 5 ++++ 6 files changed, 77 insertions(+), 6 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 14d29404c834..794e2cc23e3f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4905,6 +4905,10 @@ static int cifs_swap_activate(struct swap_info_struct *sis, cifs_dbg(FYI, "swap activate\n"); + if (!swap_file->f_mapping->a_ops->swap_rw) + /* Cannot support swap */ + return -EINVAL; + spin_lock(&inode->i_lock); blocks = inode->i_blocks; isize = inode->i_size; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index a3a073604070..55bc8c8ca71a 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -488,6 +488,10 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct rpc_clnt *clnt = NFS_CLIENT(inode); struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; + if (!file->f_mapping->a_ops->swap_rw) + /* Cannot support swap */ + return -EINVAL; + spin_lock(&inode->i_lock); blocks = inode->i_blocks; isize = inode->i_size; diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..dbc5d10328df 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -380,6 +380,7 @@ struct address_space_operations { int (*swap_activate)(struct swap_info_struct *sis, struct file *file, sector_t *span); void (*swap_deactivate)(struct file *file); + int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; extern const struct address_space_operations empty_aops; diff --git a/mm/page_io.c b/mm/page_io.c index 77fa0fb263f4..2724d6389104 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -235,6 +235,25 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) #define bio_associate_blkg_from_page(bio, page) do { } while (0) #endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */ +struct swap_iocb { + struct kiocb iocb; + struct bio_vec bvec; +}; +static mempool_t *sio_pool; + +int sio_pool_init(void) +{ + if (!sio_pool) { + mempool_t *pool = mempool_create_kmalloc_pool( + SWAP_CLUSTER_MAX, sizeof(struct swap_iocb)); + if (cmpxchg(&sio_pool, NULL, pool)) + mempool_destroy(pool); + } + if (!sio_pool) + return -ENOMEM; + return 0; +} + int __swap_writepage(struct page *page, struct writeback_control *wbc, bio_end_io_t end_write_func) { @@ -306,6 +325,48 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return 0; } +static void sio_read_complete(struct kiocb *iocb, long ret) +{ + struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); + struct page *page = sio->bvec.bv_page; + + if (ret != 0 && ret != PAGE_SIZE) { + SetPageError(page); + ClearPageUptodate(page); + pr_alert_ratelimited("Read-error on swap-device\n"); + } else { + SetPageUptodate(page); + count_vm_event(PSWPIN); + } + unlock_page(page); + mempool_free(sio, sio_pool); +} + +static int swap_readpage_fs(struct page *page) +{ + struct swap_info_struct *sis = page_swap_info(page); + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct iov_iter from; + struct swap_iocb *sio; + loff_t pos = page_file_offset(page); + int ret; + + sio = mempool_alloc(sio_pool, GFP_KERNEL); + init_sync_kiocb(&sio->iocb, swap_file); + sio->iocb.ki_pos = pos; + sio->iocb.ki_complete = sio_read_complete; + sio->bvec.bv_page = page; + sio->bvec.bv_len = PAGE_SIZE; + sio->bvec.bv_offset = 0; + + iov_iter_bvec(&from, READ, &sio->bvec, 1, PAGE_SIZE); + ret = mapping->a_ops->swap_rw(&sio->iocb, &from); + if (ret != -EIOCBQUEUED) + sio_read_complete(&sio->iocb, ret); + return ret; +} + int swap_readpage(struct page *page, bool synchronous) { struct bio *bio; @@ -334,12 +395,7 @@ int swap_readpage(struct page *page, bool synchronous) } if (data_race(sis->flags & SWP_FS_OPS)) { - struct file *swap_file = sis->swap_file; - struct address_space *mapping = swap_file->f_mapping; - - ret = mapping->a_ops->readpage(swap_file, page); - if (!ret) - count_vm_event(PSWPIN); + ret = swap_readpage_fs(page); goto out; } diff --git a/mm/swap.h b/mm/swap.h index e19f185df5e2..eafac80b18d9 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -6,6 +6,7 @@ #include /* for bio_end_io_t */ /* linux/mm/page_io.c */ +int sio_pool_init(void); int swap_readpage(struct page *page, bool do_poll); int swap_writepage(struct page *page, struct writeback_control *wbc); void end_swap_bio_write(struct bio *bio); diff --git a/mm/swapfile.c b/mm/swapfile.c index d811458c0a88..9398e915b36b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2262,6 +2262,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (ret < 0) return ret; sis->flags |= SWP_ACTIVATED; + if ((sis->flags & SWP_FS_OPS) && + sio_pool_init() != 0) { + destroy_swap_extents(sis); + return -ENOMEM; + } return ret; } -- cgit v1.2.3 From a2ad63daa88b9d6846976fd2a0b5e4f5cfc58377 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:49 -0700 Subject: VFS: add FMODE_CAN_ODIRECT file flag Currently various places test if direct IO is possible on a file by checking for the existence of the direct_IO address space operation. This is a poor choice, as the direct_IO operation may not be used - it is only used if the generic_file_*_iter functions are called for direct IO and some filesystems - particularly NFS - don't do this. Instead, introduce a new f_mode flag: FMODE_CAN_ODIRECT and change the various places to check this (avoiding pointer dereferences). do_dentry_open() will set this flag if ->direct_IO is present, so filesystems do not need to be changed. NFS *is* changed, to set the flag explicitly and discard the direct_IO entry in the address_space_operations for files. Other filesystems which currently use noop_direct_IO could usefully be changed to set this flag instead. Link: https://lkml.kernel.org/r/164859778128.29473.15189737957277399416.stgit@noble.brown Reviewed-by: Christoph Hellwig Signed-off-by: NeilBrown Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- drivers/block/loop.c | 4 ++-- fs/fcntl.c | 9 ++++----- fs/nfs/file.c | 3 ++- fs/open.c | 9 ++++----- fs/overlayfs/file.c | 13 ++++--------- include/linux/fs.h | 3 +++ 6 files changed, 19 insertions(+), 22 deletions(-) (limited to 'include/linux/fs.h') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a58595f5ee2c..75ef2e177e1b 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -186,8 +186,8 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) */ if (dio) { if (queue_logical_block_size(lo->lo_queue) >= sb_bsize && - !(lo->lo_offset & dio_align) && - mapping->a_ops->direct_IO) + !(lo->lo_offset & dio_align) && + (file->f_mode & FMODE_CAN_ODIRECT)) use_dio = true; else use_dio = false; diff --git a/fs/fcntl.c b/fs/fcntl.c index f15d885b9796..34a3faa4886d 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -56,11 +56,10 @@ static int setfl(int fd, struct file * filp, unsigned long arg) arg |= O_NONBLOCK; /* Pipe packetized mode is controlled by O_DIRECT flag */ - if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) { - if (!filp->f_mapping || !filp->f_mapping->a_ops || - !filp->f_mapping->a_ops->direct_IO) - return -EINVAL; - } + if (!S_ISFIFO(inode->i_mode) && + (arg & O_DIRECT) && + !(filp->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; if (filp->f_op->check_flags) error = filp->f_op->check_flags(arg); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0e7686a5d0de..bfb4b707b07e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -69,6 +69,8 @@ nfs_file_open(struct inode *inode, struct file *filp) return res; res = nfs_open(inode, filp); + if (res == 0) + filp->f_mode |= FMODE_CAN_ODIRECT; return res; } @@ -536,7 +538,6 @@ const struct address_space_operations nfs_file_aops = { .write_end = nfs_write_end, .invalidate_folio = nfs_invalidate_folio, .releasepage = nfs_release_page, - .direct_IO = nfs_direct_IO, #ifdef CONFIG_MIGRATION .migratepage = nfs_migrate_page, #endif diff --git a/fs/open.c b/fs/open.c index 1315253e0247..7b50d7a2f51d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -834,16 +834,15 @@ static int do_dentry_open(struct file *f, if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; + if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) + f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); - /* NB: we're sure to have correct a_ops only after f_op->open */ - if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) - return -EINVAL; - } + if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; /* * XXX: Huge page cache doesn't support writing yet. Drop all page diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index fa125feed0ff..9d69b4dbb8c4 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -82,11 +82,8 @@ static int ovl_change_flags(struct file *file, unsigned int flags) if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode)) return -EPERM; - if (flags & O_DIRECT) { - if (!file->f_mapping->a_ops || - !file->f_mapping->a_ops->direct_IO) - return -EINVAL; - } + if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; if (file->f_op->check_flags) { err = file->f_op->check_flags(flags); @@ -306,8 +303,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) ret = -EINVAL; if (iocb->ki_flags & IOCB_DIRECT && - (!real.file->f_mapping->a_ops || - !real.file->f_mapping->a_ops->direct_IO)) + !(real.file->f_mode & FMODE_CAN_ODIRECT)) goto out_fdput; old_cred = ovl_override_creds(file_inode(file)->i_sb); @@ -367,8 +363,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) ret = -EINVAL; if (iocb->ki_flags & IOCB_DIRECT && - (!real.file->f_mapping->a_ops || - !real.file->f_mapping->a_ops->direct_IO)) + !(real.file->f_mode & FMODE_CAN_ODIRECT)) goto out_fdput; if (!ovl_should_sync(OVL_FS(inode->i_sb))) diff --git a/include/linux/fs.h b/include/linux/fs.h index dbc5d10328df..b81cacc51d2f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -162,6 +162,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File is stream-like */ #define FMODE_STREAM ((__force fmode_t)0x200000) +/* File supports DIRECT IO */ +#define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) -- cgit v1.2.3 From fa29000b6b2603ec2bfdc4c73249fcb00cd54f85 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 17:00:05 -0400 Subject: fs: Add aops->release_folio This replaces aops->releasepage. Update the documentation, and call it if it exists. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jeff Layton --- Documentation/filesystems/caching/netfs-api.rst | 4 +-- Documentation/filesystems/locking.rst | 14 ++++---- Documentation/filesystems/vfs.rst | 45 ++++++++++++------------- include/linux/fs.h | 1 + mm/filemap.c | 2 ++ 5 files changed, 34 insertions(+), 32 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/caching/netfs-api.rst b/Documentation/filesystems/caching/netfs-api.rst index 7308d76a29dc..1d18e9def183 100644 --- a/Documentation/filesystems/caching/netfs-api.rst +++ b/Documentation/filesystems/caching/netfs-api.rst @@ -433,11 +433,11 @@ has done a write and then the page it wrote from has been released by the VM, after which it *has* to look in the cache. To inform fscache that a page might now be in the cache, the following function -should be called from the ``releasepage`` address space op:: +should be called from the ``release_folio`` address space op:: void fscache_note_page_release(struct fscache_cookie *cookie); -if the page has been released (ie. releasepage returned true). +if the page has been released (ie. release_folio returned true). Page release and page invalidation should also wait for any mark left on the page to say that a DIO write is underway from that page:: diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index aeba2475a53c..ddef4a753e73 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -249,7 +249,7 @@ prototypes:: struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t start, size_t len); - int (*releasepage) (struct page *, int); + bool (*release_folio)(struct folio *, gfp_t); void (*freepage)(struct page *); int (*direct_IO)(struct kiocb *, struct iov_iter *iter); bool (*isolate_page) (struct page *, isolate_mode_t); @@ -270,13 +270,13 @@ ops PageLocked(page) i_rwsem invalidate_lock writepage: yes, unlocks (see below) read_folio: yes, unlocks shared writepages: -dirty_folio maybe +dirty_folio: maybe readahead: yes, unlocks shared write_begin: locks the page exclusive write_end: yes, unlocks exclusive bmap: invalidate_folio: yes exclusive -releasepage: yes +release_folio: yes freepage: yes direct_IO: isolate_page: yes @@ -372,10 +372,10 @@ invalidate_lock before invalidating page cache in truncate / hole punch path (and thus calling into ->invalidate_folio) to block races between page cache invalidation and page cache filling functions (fault, read, ...). -->releasepage() is called when the kernel is about to try to drop the -buffers from the page in preparation for freeing it. It returns zero to -indicate that the buffers are (or may be) freeable. If ->releasepage is zero, -the kernel assumes that the fs has no private interest in the buffers. +->release_folio() is called when the kernel is about to try to drop the +buffers from the folio in preparation for freeing it. It returns false to +indicate that the buffers are (or may be) freeable. If ->release_folio is +NULL, the kernel assumes that the fs has no private interest in the buffers. ->freepage() is called when the kernel is done dropping the page from the page cache. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 0919a4ad973a..679887b5c8fc 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -620,9 +620,9 @@ Writeback. The first can be used independently to the others. The VM can try to either write dirty pages in order to clean them, or release clean pages in order to reuse them. To do this it can call the ->writepage method -on dirty pages, and ->releasepage on clean pages with PagePrivate set. -Clean pages without PagePrivate and with no external references will be -released without notice being given to the address_space. +on dirty pages, and ->release_folio on clean folios with the private +flag set. Clean pages without PagePrivate and with no external references +will be released without notice being given to the address_space. To achieve this functionality, pages need to be placed on an LRU with lru_cache_add and mark_page_active needs to be called whenever the page @@ -734,7 +734,7 @@ cache in your filesystem. The following members are defined: struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t start, size_t len); - int (*releasepage) (struct page *, int); + bool (*release_folio)(struct folio *, gfp_t); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* isolate a page for migration */ @@ -864,33 +864,32 @@ cache in your filesystem. The following members are defined: address space. This generally corresponds to either a truncation, punch hole or a complete invalidation of the address space (in the latter case 'offset' will always be 0 and 'length' - will be folio_size()). Any private data associated with the page + will be folio_size()). Any private data associated with the folio should be updated to reflect this truncation. If offset is 0 and length is folio_size(), then the private data should be - released, because the page must be able to be completely - discarded. This may be done by calling the ->releasepage + released, because the folio must be able to be completely + discarded. This may be done by calling the ->release_folio function, but in this case the release MUST succeed. -``releasepage`` - releasepage is called on PagePrivate pages to indicate that the - page should be freed if possible. ->releasepage should remove - any private data from the page and clear the PagePrivate flag. - If releasepage() fails for some reason, it must indicate failure - with a 0 return value. releasepage() is used in two distinct - though related cases. The first is when the VM finds a clean - page with no active users and wants to make it a free page. If - ->releasepage succeeds, the page will be removed from the - address_space and become free. +``release_folio`` + release_folio is called on folios with private data to tell the + filesystem that the folio is about to be freed. ->release_folio + should remove any private data from the folio and clear the + private flag. If release_folio() fails, it should return false. + release_folio() is used in two distinct though related cases. + The first is when the VM wants to free a clean folio with no + active users. If ->release_folio succeeds, the folio will be + removed from the address_space and be freed. The second case is when a request has been made to invalidate - some or all pages in an address_space. This can happen through - the fadvise(POSIX_FADV_DONTNEED) system call or by the - filesystem explicitly requesting it as nfs and 9fs do (when they + some or all folios in an address_space. This can happen + through the fadvise(POSIX_FADV_DONTNEED) system call or by the + filesystem explicitly requesting it as nfs and 9p do (when they believe the cache may be out of date with storage) by calling invalidate_inode_pages2(). If the filesystem makes such a call, - and needs to be certain that all pages are invalidated, then its - releasepage will need to ensure this. Possibly it can clear the - PageUptodate bit if it cannot free private data yet. + and needs to be certain that all folios are invalidated, then + its release_folio will need to ensure this. Possibly it can + clear the uptodate flag if it cannot free private data yet. ``freepage`` freepage is called once the page is no longer visible in the diff --git a/include/linux/fs.h b/include/linux/fs.h index f812f5aa07dd..ad768f13f485 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -355,6 +355,7 @@ struct address_space_operations { /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); + bool (*release_folio)(struct folio *, gfp_t); int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); diff --git a/mm/filemap.c b/mm/filemap.c index 9b7fa47feb5e..78e4a7dc3a56 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3955,6 +3955,8 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) if (folio_test_writeback(folio)) return false; + if (mapping && mapping->a_ops->release_folio) + return mapping->a_ops->release_folio(folio, gfp); if (mapping && mapping->a_ops->releasepage) return mapping->a_ops->releasepage(&folio->page, gfp); return try_to_free_buffers(&folio->page); -- cgit v1.2.3 From 704ead2bed202579f025a4754e52e9ab21ff3ada Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 1 May 2022 00:27:53 -0400 Subject: fs: Remove last vestiges of releasepage All users are now converted to release_folio Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jeff Layton --- include/linux/fs.h | 1 - include/linux/page-flags.h | 2 +- mm/filemap.c | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ad768f13f485..1cee64d9724b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -356,7 +356,6 @@ struct address_space_operations { sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); bool (*release_folio)(struct folio *, gfp_t); - int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9d8eeaa67d05..af10149a6c31 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -516,7 +516,7 @@ PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. - * - PG_private and PG_private_2 cause releasepage() and co to be invoked + * - PG_private and PG_private_2 cause release_folio() and co to be invoked */ PAGEFLAG(Private, private, PF_ANY) PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) diff --git a/mm/filemap.c b/mm/filemap.c index 78e4a7dc3a56..ee892853a214 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3957,8 +3957,6 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) if (mapping && mapping->a_ops->release_folio) return mapping->a_ops->release_folio(folio, gfp); - if (mapping && mapping->a_ops->releasepage) - return mapping->a_ops->releasepage(&folio->page, gfp); return try_to_free_buffers(&folio->page); } EXPORT_SYMBOL(filemap_release_folio); -- cgit v1.2.3 From d2329aa0c78f4a8dd368bb706f196ab99f692eaa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 1 May 2022 07:35:31 -0400 Subject: fs: Add free_folio address space operation Include documentation and convert the callers to use ->free_folio as well as ->freepage. Signed-off-by: Matthew Wilcox (Oracle) --- Documentation/filesystems/locking.rst | 10 +++++----- Documentation/filesystems/vfs.rst | 6 +++--- include/linux/fs.h | 1 + mm/filemap.c | 9 ++++++++- mm/vmscan.c | 6 +++++- 5 files changed, 22 insertions(+), 10 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index ddef4a753e73..515bc48ab58b 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -250,7 +250,7 @@ prototypes:: sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t start, size_t len); bool (*release_folio)(struct folio *, gfp_t); - void (*freepage)(struct page *); + void (*free_folio)(struct folio *); int (*direct_IO)(struct kiocb *, struct iov_iter *iter); bool (*isolate_page) (struct page *, isolate_mode_t); int (*migratepage)(struct address_space *, struct page *, struct page *); @@ -262,10 +262,10 @@ prototypes:: int (*swap_deactivate)(struct file *); locking rules: - All except dirty_folio and freepage may block + All except dirty_folio and free_folio may block ====================== ======================== ========= =============== -ops PageLocked(page) i_rwsem invalidate_lock +ops folio locked i_rwsem invalidate_lock ====================== ======================== ========= =============== writepage: yes, unlocks (see below) read_folio: yes, unlocks shared @@ -277,7 +277,7 @@ write_end: yes, unlocks exclusive bmap: invalidate_folio: yes exclusive release_folio: yes -freepage: yes +free_folio: yes direct_IO: isolate_page: yes migratepage: yes (both) @@ -377,7 +377,7 @@ buffers from the folio in preparation for freeing it. It returns false to indicate that the buffers are (or may be) freeable. If ->release_folio is NULL, the kernel assumes that the fs has no private interest in the buffers. -->freepage() is called when the kernel is done dropping the page +->free_folio() is called when the kernel has dropped the folio from the page cache. ->launder_folio() may be called prior to releasing a folio if diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 679887b5c8fc..12a011d2cbc6 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -735,7 +735,7 @@ cache in your filesystem. The following members are defined: sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t start, size_t len); bool (*release_folio)(struct folio *, gfp_t); - void (*freepage)(struct page *); + void (*free_folio)(struct folio *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* isolate a page for migration */ bool (*isolate_page) (struct page *, isolate_mode_t); @@ -891,8 +891,8 @@ cache in your filesystem. The following members are defined: its release_folio will need to ensure this. Possibly it can clear the uptodate flag if it cannot free private data yet. -``freepage`` - freepage is called once the page is no longer visible in the +``free_folio`` + free_folio is called once the folio is no longer visible in the page cache in order to allow the cleanup of any private data. Since it may be called by the memory reclaimer, it should not assume that the original address_space mapping still exists, and diff --git a/include/linux/fs.h b/include/linux/fs.h index 1cee64d9724b..915844e6293e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -356,6 +356,7 @@ struct address_space_operations { sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); bool (*release_folio)(struct folio *, gfp_t); + void (*free_folio)(struct folio *folio); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* diff --git a/mm/filemap.c b/mm/filemap.c index d335a154a0d9..adcdef56890f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -226,8 +226,12 @@ void __filemap_remove_folio(struct folio *folio, void *shadow) void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*freepage)(struct page *); + void (*free_folio)(struct folio *); int refs = 1; + free_folio = mapping->a_ops->free_folio; + if (free_folio) + free_folio(folio); freepage = mapping->a_ops->freepage; if (freepage) freepage(&folio->page); @@ -807,6 +811,7 @@ void replace_page_cache_page(struct page *old, struct page *new) struct folio *fold = page_folio(old); struct folio *fnew = page_folio(new); struct address_space *mapping = old->mapping; + void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; void (*freepage)(struct page *) = mapping->a_ops->freepage; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); @@ -835,9 +840,11 @@ void replace_page_cache_page(struct page *old, struct page *new) if (PageSwapBacked(new)) __inc_lruvec_page_state(new, NR_SHMEM); xas_unlock_irq(&xas); + if (free_folio) + free_folio(fold); if (freepage) freepage(old); - put_page(old); + folio_put(fold); } EXPORT_SYMBOL_GPL(replace_page_cache_page); diff --git a/mm/vmscan.c b/mm/vmscan.c index f3f7ce2c4068..d8a031128ad0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1282,8 +1282,10 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, xa_unlock_irq(&mapping->i_pages); put_swap_page(&folio->page, swap); } else { + void (*free_folio)(struct folio *); void (*freepage)(struct page *); + free_folio = mapping->a_ops->free_folio; freepage = mapping->a_ops->freepage; /* * Remember a shadow entry for reclaimed file cache in @@ -1310,7 +1312,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); - if (freepage != NULL) + if (free_folio) + free_folio(folio); + if (freepage) freepage(&folio->page); } -- cgit v1.2.3 From 8560cb1a7d75048af275dd23fb0cf05382b3c2b9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 May 2022 00:43:09 -0400 Subject: fs: Remove aops->freepage All implementations now use free_folio so we can delete the callers and the method. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/fs.h | 1 - mm/filemap.c | 7 ------- mm/vmscan.c | 4 ---- 3 files changed, 12 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 915844e6293e..6f305f1097a5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -357,7 +357,6 @@ struct address_space_operations { void (*invalidate_folio) (struct folio *, size_t offset, size_t len); bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *folio); - void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* * migrate the contents of a page to the specified target. If diff --git a/mm/filemap.c b/mm/filemap.c index adcdef56890f..fa0ca674450f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -225,16 +225,12 @@ void __filemap_remove_folio(struct folio *folio, void *shadow) void filemap_free_folio(struct address_space *mapping, struct folio *folio) { - void (*freepage)(struct page *); void (*free_folio)(struct folio *); int refs = 1; free_folio = mapping->a_ops->free_folio; if (free_folio) free_folio(folio); - freepage = mapping->a_ops->freepage; - if (freepage) - freepage(&folio->page); if (folio_test_large(folio) && !folio_test_hugetlb(folio)) refs = folio_nr_pages(folio); @@ -812,7 +808,6 @@ void replace_page_cache_page(struct page *old, struct page *new) struct folio *fnew = page_folio(new); struct address_space *mapping = old->mapping; void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; - void (*freepage)(struct page *) = mapping->a_ops->freepage; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); @@ -842,8 +837,6 @@ void replace_page_cache_page(struct page *old, struct page *new) xas_unlock_irq(&xas); if (free_folio) free_folio(fold); - if (freepage) - freepage(old); folio_put(fold); } EXPORT_SYMBOL_GPL(replace_page_cache_page); diff --git a/mm/vmscan.c b/mm/vmscan.c index d8a031128ad0..edc89f26b738 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1283,10 +1283,8 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, put_swap_page(&folio->page, swap); } else { void (*free_folio)(struct folio *); - void (*freepage)(struct page *); free_folio = mapping->a_ops->free_folio; - freepage = mapping->a_ops->freepage; /* * Remember a shadow entry for reclaimed file cache in * order to detect refaults, thus thrashing, later on. @@ -1314,8 +1312,6 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (free_folio) free_folio(folio); - if (freepage) - freepage(&folio->page); } return 1; -- cgit v1.2.3 From ee692a21e9bf8354bd3ec816f1cf4bff8619ed77 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 May 2022 11:17:45 +0530 Subject: fs,io_uring: add infrastructure for uring-cmd file_operations->uring_cmd is a file private handler. This is somewhat similar to ioctl but hopefully a lot more sane and useful as it can be used to enable many io_uring capabilities for the underlying operation. IORING_OP_URING_CMD is a file private kind of request. io_uring doesn't know what is in this command type, it's for the provider of ->uring_cmd() to deal with. Co-developed-by: Kanchan Joshi Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220511054750.20432-2-joshi.k@samsung.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 135 ++++++++++++++++++++++++++++++++++++------ include/linux/fs.h | 2 + include/linux/io_uring.h | 33 +++++++++++ include/uapi/linux/io_uring.h | 21 ++++--- 4 files changed, 165 insertions(+), 26 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index ceaf7826ed71..44c57dca358d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -202,13 +202,6 @@ struct io_rings { struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; -enum io_uring_cmd_flags { - IO_URING_F_COMPLETE_DEFER = 1, - IO_URING_F_UNLOCKED = 2, - /* int's last bit, sign checks are usually faster than a bit test */ - IO_URING_F_NONBLOCK = INT_MIN, -}; - struct io_mapped_ubuf { u64 ubuf; u64 ubuf_end; @@ -972,6 +965,7 @@ struct io_kiocb { struct io_xattr xattr; struct io_socket sock; struct io_nop nop; + struct io_uring_cmd uring_cmd; }; u8 opcode; @@ -1050,6 +1044,14 @@ struct io_cancel_data { int seq; }; +/* + * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into + * the following sqe if SQE128 is used. + */ +#define uring_cmd_pdu_size(is_sqe128) \ + ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ + offsetof(struct io_uring_sqe, cmd)) + struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -1289,6 +1291,12 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_SOCKET] = { .audit_skip = 1, }, + [IORING_OP_URING_CMD] = { + .needs_file = 1, + .plug = 1, + .needs_async_setup = 1, + .async_size = uring_cmd_pdu_size(1), + }, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -1428,6 +1436,8 @@ const char *io_uring_get_opcode(u8 opcode) return "GETXATTR"; case IORING_OP_SOCKET: return "SOCKET"; + case IORING_OP_URING_CMD: + return "URING_CMD"; case IORING_OP_LAST: return "INVALID"; } @@ -4507,10 +4517,6 @@ static int __io_getxattr_prep(struct io_kiocb *req, const char __user *name; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio)) - return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4620,10 +4626,6 @@ static int __io_setxattr_prep(struct io_kiocb *req, const char __user *name; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio)) - return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4910,6 +4912,96 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +{ + req->uring_cmd.task_work_cb(&req->uring_cmd); +} + +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ + struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + + req->uring_cmd.task_work_cb = task_work_cb; + req->io_task_work.func = io_uring_cmd_work; + io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL)); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); + +/* + * Called by consumers of io_uring_cmd, if they originally returned + * -EIOCBQUEUED upon receiving the command. + */ +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) +{ + struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + + if (ret < 0) + req_set_fail(req); + if (req->ctx->flags & IORING_SETUP_CQE32) + __io_req_complete32(req, 0, ret, 0, res2, 0); + else + io_req_complete(req, ret); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_done); + +static int io_uring_cmd_prep_async(struct io_kiocb *req) +{ + size_t cmd_size; + + cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); + + memcpy(req->async_data, req->uring_cmd.cmd, cmd_size); + return 0; +} + +static int io_uring_cmd_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_uring_cmd *ioucmd = &req->uring_cmd; + + if (sqe->rw_flags) + return -EINVAL; + ioucmd->cmd = sqe->cmd; + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); + return 0; +} + +static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_uring_cmd *ioucmd = &req->uring_cmd; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (!req->file->f_op->uring_cmd) + return -EOPNOTSUPP; + + if (ctx->flags & IORING_SETUP_SQE128) + issue_flags |= IO_URING_F_SQE128; + if (ctx->flags & IORING_SETUP_CQE32) + issue_flags |= IO_URING_F_CQE32; + if (ctx->flags & IORING_SETUP_IOPOLL) + issue_flags |= IO_URING_F_IOPOLL; + + if (req_has_async_data(req)) + ioucmd->cmd = req->async_data; + + ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ret == -EAGAIN) { + if (!req_has_async_data(req)) { + if (io_alloc_async_data(req)) + return -ENOMEM; + io_uring_cmd_prep_async(req); + } + return -EAGAIN; + } + + if (ret != -EIOCBQUEUED) + io_uring_cmd_done(ioucmd, ret, 0); + return 0; +} + static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -6305,9 +6397,7 @@ static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_socket *sock = &req->sock; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->addr || sqe->rw_flags || sqe->buf_index) + if (sqe->addr || sqe->rw_flags || sqe->buf_index) return -EINVAL; sock->domain = READ_ONCE(sqe->fd); @@ -7755,6 +7845,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_getxattr_prep(req, sqe); case IORING_OP_SOCKET: return io_socket_prep(req, sqe); + case IORING_OP_URING_CMD: + return io_uring_cmd_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -7787,6 +7879,8 @@ static int io_req_prep_async(struct io_kiocb *req) return io_recvmsg_prep_async(req); case IORING_OP_CONNECT: return io_connect_prep_async(req); + case IORING_OP_URING_CMD: + return io_uring_cmd_prep_async(req); } printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", req->opcode); @@ -8081,6 +8175,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_SOCKET: ret = io_socket(req, issue_flags); break; + case IORING_OP_URING_CMD: + ret = io_uring_cmd(req, issue_flags); + break; default: ret = -EINVAL; break; @@ -12699,6 +12796,8 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); + BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); return 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..87b5af1d9fbe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1953,6 +1953,7 @@ struct dir_context { #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) struct iov_iter; +struct io_uring_cmd; struct file_operations { struct module *owner; @@ -1995,6 +1996,7 @@ struct file_operations { struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); + int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); } __randomize_layout; struct inode_operations { diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 24651c229ed2..4a2f6cc5a492 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -5,7 +5,32 @@ #include #include +enum io_uring_cmd_flags { + IO_URING_F_COMPLETE_DEFER = 1, + IO_URING_F_UNLOCKED = 2, + /* int's last bit, sign checks are usually faster than a bit test */ + IO_URING_F_NONBLOCK = INT_MIN, + + /* ctx state flags, for URING_CMD */ + IO_URING_F_SQE128 = 4, + IO_URING_F_CQE32 = 8, + IO_URING_F_IOPOLL = 16, +}; + +struct io_uring_cmd { + struct file *file; + const void *cmd; + /* callback to defer completions to task context */ + void (*task_work_cb)(struct io_uring_cmd *cmd); + u32 cmd_op; + u32 pad; + u8 pdu[32]; /* available inline for free use */ +}; + #if defined(CONFIG_IO_URING) +void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2); +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)); struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); @@ -30,6 +55,14 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } #else +static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, + ssize_t ret2) +{ +} +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ +} static inline struct sock *io_uring_get_socket(struct file *file) { return NULL; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ac2d90d669c3..23618be55dd2 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -22,6 +22,7 @@ struct io_uring_sqe { union { __u64 off; /* offset into file */ __u64 addr2; + __u32 cmd_op; }; union { __u64 addr; /* pointer to buffer or iovecs */ @@ -61,14 +62,17 @@ struct io_uring_sqe { __s32 splice_fd_in; __u32 file_index; }; - __u64 addr3; - __u64 __pad2[1]; - - /* - * If the ring is initialized with IORING_SETUP_SQE128, then this field - * contains 64-bytes of padding, doubling the size of the SQE. - */ - __u64 __big_sqe_pad[0]; + union { + struct { + __u64 addr3; + __u64 __pad2[1]; + }; + /* + * If the ring is initialized with IORING_SETUP_SQE128, then + * this field is used for 80 bytes of arbitrary command data + */ + __u8 cmd[0]; + }; }; enum { @@ -175,6 +179,7 @@ enum io_uring_op { IORING_OP_FGETXATTR, IORING_OP_GETXATTR, IORING_OP_SOCKET, + IORING_OP_URING_CMD, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From 81132a39c152ca09832b9e4cb748129cee5f55ec Mon Sep 17 00:00:00 2001 From: Gou Hao Date: Tue, 2 Nov 2021 10:46:48 +0800 Subject: fs: remove fget_many and fput_many interface These two interface were added in 091141a42 commit, but now there is no place to call them. The only user of fput/fget_many() was removed in commit 62906e89e63b ("io_uring: remove file batch-get optimisation"). A user of get_file_rcu_many() were removed in commit f073531070d2 ("init: add an init_dup helper"). And replace atomic_long_sub/add to atomic_long_dec/inc can improve performance. Here are the test results of unixbench: Cmd: ./Run -c 64 context1 Without patch: System Benchmarks Partial Index BASELINE RESULT INDEX Pipe-based Context Switching 4000.0 2798407.0 6996.0 ======== System Benchmarks Index Score (Partial Only) 6996.0 With patch: System Benchmarks Partial Index BASELINE RESULT INDEX Pipe-based Context Switching 4000.0 3486268.8 8715.7 ======== System Benchmarks Index Score (Partial Only) 8715.7 Signed-off-by: Gou Hao Signed-off-by: Al Viro --- fs/file.c | 33 +++++++++++++-------------------- fs/file_table.c | 9 ++------- include/linux/file.h | 2 -- include/linux/fs.h | 4 +--- 4 files changed, 16 insertions(+), 32 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/file.c b/fs/file.c index ee9317346702..4c59f410a48d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -871,7 +871,7 @@ void do_close_on_exec(struct files_struct *files) } static inline struct file *__fget_files_rcu(struct files_struct *files, - unsigned int fd, fmode_t mask, unsigned int refs) + unsigned int fd, fmode_t mask) { for (;;) { struct file *file; @@ -897,10 +897,9 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * Such a race can take two forms: * * (a) the file ref already went down to zero, - * and get_file_rcu_many() fails. Just try - * again: + * and get_file_rcu() fails. Just try again: */ - if (unlikely(!get_file_rcu_many(file, refs))) + if (unlikely(!get_file_rcu(file))) continue; /* @@ -909,11 +908,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * - * If so, we need to put our refs and try again. + * If so, we need to put our ref and try again. */ if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || unlikely(rcu_dereference_raw(*fdentry) != file)) { - fput_many(file, refs); + fput(file); continue; } @@ -926,37 +925,31 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, } static struct file *__fget_files(struct files_struct *files, unsigned int fd, - fmode_t mask, unsigned int refs) + fmode_t mask) { struct file *file; rcu_read_lock(); - file = __fget_files_rcu(files, fd, mask, refs); + file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } -static inline struct file *__fget(unsigned int fd, fmode_t mask, - unsigned int refs) -{ - return __fget_files(current->files, fd, mask, refs); -} - -struct file *fget_many(unsigned int fd, unsigned int refs) +static inline struct file *__fget(unsigned int fd, fmode_t mask) { - return __fget(fd, FMODE_PATH, refs); + return __fget_files(current->files, fd, mask); } struct file *fget(unsigned int fd) { - return __fget(fd, FMODE_PATH, 1); + return __fget(fd, FMODE_PATH); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { - return __fget(fd, 0, 1); + return __fget(fd, 0); } EXPORT_SYMBOL(fget_raw); @@ -966,7 +959,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) task_lock(task); if (task->files) - file = __fget_files(task->files, fd, 0, 1); + file = __fget_files(task->files, fd, 0); task_unlock(task); return file; @@ -1035,7 +1028,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) return 0; return (unsigned long)file; } else { - file = __fget(fd, mask, 1); + file = __fget(fd, mask); if (!file) return 0; return FDPUT_FPUT | (unsigned long)file; diff --git a/fs/file_table.c b/fs/file_table.c index 7d2e692b66a9..1ffd74bbbed6 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -368,9 +368,9 @@ EXPORT_SYMBOL_GPL(flush_delayed_fput); static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); -void fput_many(struct file *file, unsigned int refs) +void fput(struct file *file) { - if (atomic_long_sub_and_test(refs, &file->f_count)) { + if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { @@ -389,11 +389,6 @@ void fput_many(struct file *file, unsigned int refs) } } -void fput(struct file *file) -{ - fput_many(file, 1); -} - /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without diff --git a/include/linux/file.h b/include/linux/file.h index 51e830b4fe3a..39704eae83e2 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -14,7 +14,6 @@ struct file; extern void fput(struct file *); -extern void fput_many(struct file *, unsigned int); struct file_operations; struct task_struct; @@ -47,7 +46,6 @@ static inline void fdput(struct fd fd) } extern struct file *fget(unsigned int fd); -extern struct file *fget_many(unsigned int fd, unsigned int refs); extern struct file *fget_raw(unsigned int fd); extern struct file *fget_task(struct task_struct *task, unsigned int fd); extern unsigned long __fdget(unsigned int fd); diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..3660c338bb16 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -981,9 +981,7 @@ static inline struct file *get_file(struct file *f) atomic_long_inc(&f->f_count); return f; } -#define get_file_rcu_many(x, cnt) \ - atomic_long_add_unless(&(x)->f_count, (cnt), 0) -#define get_file_rcu(x) get_file_rcu_many((x), 1) +#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count) #define file_count(x) atomic_long_read(&(x)->f_count) #define MAX_NON_LFS ((1UL<<31) - 1) -- cgit v1.2.3 From 7f8d12ea96352275c2850c24a1367166179392d2 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 29 Mar 2022 15:55:59 +0900 Subject: fs: add a lockdep check function for sb_start_write() Add a function sb_write_started() to allow callers to verify if sb_start_write() is properly called. It will be used for assertion in btrfs. Reviewed-by: Filipe Manana Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..01d61984ce7a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1708,6 +1708,11 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) +static inline bool sb_write_started(const struct super_block *sb) +{ + return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1); +} + /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to -- cgit v1.2.3 From 591502c5cb325b1c6ec59ab161927d606b918aa0 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Mon, 2 May 2022 14:19:24 -0700 Subject: fs/lock: add helper locks_owner_has_blockers to check for blockers Add helper locks_owner_has_blockers to check if there is any blockers for a given lockowner. Reviewed-by: J. Bruce Fields Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- fs/locks.c | 28 ++++++++++++++++++++++++++++ include/linux/fs.h | 7 +++++++ 2 files changed, 35 insertions(+) (limited to 'include/linux/fs.h') diff --git a/fs/locks.c b/fs/locks.c index 8c6df10cd9ed..c369841ef7d1 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -300,6 +300,34 @@ void locks_release_private(struct file_lock *fl) } EXPORT_SYMBOL_GPL(locks_release_private); +/** + * locks_owner_has_blockers - Check for blocking lock requests + * @flctx: file lock context + * @owner: lock owner + * + * Return values: + * %true: @owner has at least one blocker + * %false: @owner has no blockers + */ +bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner) +{ + struct file_lock *fl; + + spin_lock(&flctx->flc_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_owner != owner) + continue; + if (!list_empty(&fl->fl_blocked_requests)) { + spin_unlock(&flctx->flc_lock); + return true; + } + } + spin_unlock(&flctx->flc_lock); + return false; +} +EXPORT_SYMBOL_GPL(locks_owner_has_blockers); + /* Free a lock which is not in use. */ void locks_free_lock(struct file_lock *fl) { diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..b8ed7f974fb4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1174,6 +1174,8 @@ extern void lease_unregister_notifier(struct notifier_block *); struct files_struct; extern void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); +extern bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner); #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) @@ -1309,6 +1311,11 @@ static inline int lease_modify(struct file_lock *fl, int arg, struct files_struct; static inline void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) {} +static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner) +{ + return false; +} #endif /* !CONFIG_FILE_LOCKING */ static inline struct inode *file_inode(const struct file *f) -- cgit v1.2.3 From 2443da2259e97688f93d64d17ab69b15f466078a Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Mon, 2 May 2022 14:19:25 -0700 Subject: fs/lock: add 2 callbacks to lock_manager_operations to resolve conflict Add 2 new callbacks, lm_lock_expirable and lm_expire_lock, to lock_manager_operations to allow the lock manager to take appropriate action to resolve the lock conflict if possible. A new field, lm_mod_owner, is also added to lock_manager_operations. The lm_mod_owner is used by the fs/lock code to make sure the lock manager module such as nfsd, is not freed while lock conflict is being resolved. lm_lock_expirable checks and returns true to indicate that the lock conflict can be resolved else return false. This callback must be called with the flc_lock held so it can not block. lm_expire_lock is called to resolve the lock conflict if the returned value from lm_lock_expirable is true. This callback is called without the flc_lock held since it's allowed to block. Upon returning from this callback, the lock conflict should be resolved and the caller is expected to restart the conflict check from the beginnning of the list. Lock manager, such as NFSv4 courteous server, uses this callback to resolve conflict by destroying lock owner, or the NFSv4 courtesy client (client that has expired but allowed to maintains its states) that owns the lock. Reviewed-by: J. Bruce Fields Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- Documentation/filesystems/locking.rst | 4 ++++ fs/locks.c | 33 ++++++++++++++++++++++++++++++--- include/linux/fs.h | 3 +++ 3 files changed, 37 insertions(+), 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index c26d854275a0..0997a258361a 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -428,6 +428,8 @@ prototypes:: void (*lm_break)(struct file_lock *); /* break_lease callback */ int (*lm_change)(struct file_lock **, int); bool (*lm_breaker_owns_lease)(struct file_lock *); + bool (*lm_lock_expirable)(struct file_lock *); + void (*lm_expire_lock)(void); locking rules: @@ -439,6 +441,8 @@ lm_grant: no no no lm_break: yes no no lm_change yes no no lm_breaker_owns_lease: yes no no +lm_lock_expirable yes no no +lm_expire_lock no no yes ====================== ============= ================= ========= buffer_head diff --git a/fs/locks.c b/fs/locks.c index c369841ef7d1..ca28e0e50e56 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -902,6 +902,8 @@ posix_test_lock(struct file *filp, struct file_lock *fl) struct file_lock *cfl; struct file_lock_context *ctx; struct inode *inode = locks_inode(filp); + void *owner; + void (*func)(void); ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty_careful(&ctx->flc_posix)) { @@ -909,12 +911,23 @@ posix_test_lock(struct file *filp, struct file_lock *fl) return; } +retry: spin_lock(&ctx->flc_lock); list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { - if (posix_locks_conflict(fl, cfl)) { - locks_copy_conflock(fl, cfl); - goto out; + if (!posix_locks_conflict(fl, cfl)) + continue; + if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable + && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) { + owner = cfl->fl_lmops->lm_mod_owner; + func = cfl->fl_lmops->lm_expire_lock; + __module_get(owner); + spin_unlock(&ctx->flc_lock); + (*func)(); + module_put(owner); + goto retry; } + locks_copy_conflock(fl, cfl); + goto out; } fl->fl_type = F_UNLCK; out: @@ -1088,6 +1101,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, int error; bool added = false; LIST_HEAD(dispose); + void *owner; + void (*func)(void); ctx = locks_get_lock_context(inode, request->fl_type); if (!ctx) @@ -1106,6 +1121,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, new_fl2 = locks_alloc_lock(); } +retry: percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); /* @@ -1117,6 +1133,17 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, list_for_each_entry(fl, &ctx->flc_posix, fl_list) { if (!posix_locks_conflict(request, fl)) continue; + if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable + && (*fl->fl_lmops->lm_lock_expirable)(fl)) { + owner = fl->fl_lmops->lm_mod_owner; + func = fl->fl_lmops->lm_expire_lock; + __module_get(owner); + spin_unlock(&ctx->flc_lock); + percpu_up_read(&file_rwsem); + (*func)(); + module_put(owner); + goto retry; + } if (conflock) locks_copy_conflock(conflock, fl); error = -EAGAIN; diff --git a/include/linux/fs.h b/include/linux/fs.h index b8ed7f974fb4..aa6c1bbdb8c4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1029,6 +1029,7 @@ struct file_lock_operations { }; struct lock_manager_operations { + void *lm_mod_owner; fl_owner_t (*lm_get_owner)(fl_owner_t); void (*lm_put_owner)(fl_owner_t); void (*lm_notify)(struct file_lock *); /* unblock callback */ @@ -1037,6 +1038,8 @@ struct lock_manager_operations { int (*lm_change)(struct file_lock *, int, struct list_head *); void (*lm_setup)(struct file_lock *, void **); bool (*lm_breaker_owns_lease)(struct file_lock *); + bool (*lm_lock_expirable)(struct file_lock *cfl); + void (*lm_expire_lock)(void); }; struct lock_manager { -- cgit v1.2.3 From 6d4675e601357834dadd2ba1d803f6484596015c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 19 May 2022 14:08:54 -0700 Subject: mm: don't be stuck to rmap lock on reclaim path The rmap locks(i_mmap_rwsem and anon_vma->root->rwsem) could be contended under memory pressure if processes keep working on their vmas(e.g., fork, mmap, munmap). It makes reclaim path stuck. In our real workload traces, we see kswapd is waiting the lock for 300ms+(worst case, a sec) and it makes other processes entering direct reclaim, which were also stuck on the lock. This patch makes lru aging path try_lock mode like shink_page_list so the reclaim context will keep working with next lru pages without being stuck. if it found the rmap lock contended, it rotates the page back to head of lru in both active/inactive lrus to make them consistent behavior, which is basic starting point rather than adding more heristic. Since this patch introduces a new "contended" field as out-param along with try_lock in-param in rmap_walk_control, it's not immutable any longer if the try_lock is set so remove const keywords on rmap related functions. Since rmap walking is already expensive operation, I doubt the const would help sizable benefit( And we didn't have it until 5.17). In a heavy app workload in Android, trace shows following statistics. It almost removes rmap lock contention from reclaim path. Martin Liu reported: Before: max_dur(ms) min_dur(ms) max-min(dur)ms avg_dur(ms) sum_dur(ms) count blocked_function 1632 0 1631 151.542173 31672 209 page_lock_anon_vma_read 601 0 601 145.544681 28817 198 rmap_walk_file After: max_dur(ms) min_dur(ms) max-min(dur)ms avg_dur(ms) sum_dur(ms) count blocked_function NaN NaN NaN NaN NaN 0.0 NaN 0 0 0 0.127645 1 12 rmap_walk_file [minchan@kernel.org: add comment, per Matthew] Link: https://lkml.kernel.org/r/YnNqeB5tUf6LZ57b@google.com Link: https://lkml.kernel.org/r/20220510215423.164547-1-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Johannes Weiner Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: John Dias Cc: Tim Murray Cc: Matthew Wilcox Cc: Vladimir Davydov Cc: Martin Liu Cc: Minchan Kim Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/fs.h | 5 +++++ include/linux/ksm.h | 4 ++-- include/linux/rmap.h | 28 ++++++++++++++++++++-------- mm/ksm.c | 10 ++++++++-- mm/memory-failure.c | 2 +- mm/page_idle.c | 7 ++++--- mm/rmap.c | 52 +++++++++++++++++++++++++++++++++++++++++----------- mm/vmscan.c | 7 ++++++- 8 files changed, 87 insertions(+), 28 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index b81cacc51d2f..044b67f8d861 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -477,6 +477,11 @@ static inline void i_mmap_unlock_write(struct address_space *mapping) up_write(&mapping->i_mmap_rwsem); } +static inline int i_mmap_trylock_read(struct address_space *mapping) +{ + return down_read_trylock(&mapping->i_mmap_rwsem); +} + static inline void i_mmap_lock_read(struct address_space *mapping) { down_read(&mapping->i_mmap_rwsem); diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 0630e545f4cb..0b4f17418f64 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -51,7 +51,7 @@ static inline void ksm_exit(struct mm_struct *mm) struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc); +void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); #else /* !CONFIG_KSM */ @@ -79,7 +79,7 @@ static inline struct page *ksm_might_need_to_copy(struct page *page, } static inline void rmap_walk_ksm(struct folio *folio, - const struct rmap_walk_control *rwc) + struct rmap_walk_control *rwc) { } diff --git a/include/linux/rmap.h b/include/linux/rmap.h index cbe279a6f0de..9ec23138e410 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -128,6 +128,11 @@ static inline void anon_vma_lock_read(struct anon_vma *anon_vma) down_read(&anon_vma->root->rwsem); } +static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) +{ + return down_read_trylock(&anon_vma->root->rwsem); +} + static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) { up_read(&anon_vma->root->rwsem); @@ -366,17 +371,14 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); -/* - * Called by memory-failure.c to kill processes. - */ -struct anon_vma *folio_lock_anon_vma_read(struct folio *folio); -void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* * rmap_walk_control: To control rmap traversing for specific needs * * arg: passed to rmap_one() and invalid_vma() + * try_lock: bail out if the rmap lock is contended + * contended: indicate the rmap traversal bailed out due to lock contention * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition * anon_lock: for getting anon_lock by optimized way rather than default @@ -384,6 +386,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); */ struct rmap_walk_control { void *arg; + bool try_lock; + bool contended; /* * Return false if page table scanning in rmap_walk should be stopped. * Otherwise, return true. @@ -391,12 +395,20 @@ struct rmap_walk_control { bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct folio *folio); - struct anon_vma *(*anon_lock)(struct folio *folio); + struct anon_vma *(*anon_lock)(struct folio *folio, + struct rmap_walk_control *rwc); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; -void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc); -void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc); +void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); +void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); + +/* + * Called by memory-failure.c to kill processes. + */ +struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, + struct rmap_walk_control *rwc); +void page_unlock_anon_vma_read(struct anon_vma *anon_vma); #else /* !CONFIG_MMU */ diff --git a/mm/ksm.c b/mm/ksm.c index 38360285497a..9ee82c9bce94 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2610,7 +2610,7 @@ struct page *ksm_might_need_to_copy(struct page *page, return new_page; } -void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc) +void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; @@ -2634,7 +2634,13 @@ again: struct vm_area_struct *vma; cond_resched(); - anon_vma_lock_read(anon_vma); + if (!anon_vma_trylock_read(anon_vma)) { + if (rwc->try_lock) { + rwc->contended = true; + return; + } + anon_vma_lock_read(anon_vma); + } anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { unsigned long addr; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 01f8b63d3621..a934ee8124dd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -485,7 +485,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct anon_vma *av; pgoff_t pgoff; - av = folio_lock_anon_vma_read(folio); + av = folio_lock_anon_vma_read(folio, NULL); if (av == NULL) /* Not actually mapped anymore */ return; diff --git a/mm/page_idle.c b/mm/page_idle.c index fc0435abf909..bc08332a609c 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -86,11 +86,12 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio, static void page_idle_clear_pte_refs(struct page *page) { struct folio *folio = page_folio(page); + /* - * Since rwc.arg is unused, rwc is effectively immutable, so we - * can make it static const to save some cycles and stack. + * Since rwc.try_lock is unused, rwc is effectively immutable, so we + * can make it static to save some cycles and stack. */ - static const struct rmap_walk_control rwc = { + static struct rmap_walk_control rwc = { .rmap_one = page_idle_clear_pte_refs_one, .anon_lock = folio_lock_anon_vma_read, }; diff --git a/mm/rmap.c b/mm/rmap.c index 219e287a83d2..5bcb334cd6f2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -527,9 +527,11 @@ out: * * Its a little more complex as it tries to keep the fast path to a single * atomic op -- the trylock. If we fail the trylock, we fall back to getting a - * reference like with page_get_anon_vma() and then block on the mutex. + * reference like with page_get_anon_vma() and then block on the mutex + * on !rwc->try_lock case. */ -struct anon_vma *folio_lock_anon_vma_read(struct folio *folio) +struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, + struct rmap_walk_control *rwc) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; @@ -557,6 +559,12 @@ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio) goto out; } + if (rwc && rwc->try_lock) { + anon_vma = NULL; + rwc->contended = true; + goto out; + } + /* trylock failed, we got to sleep */ if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; @@ -883,7 +891,8 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) * * Quick test_and_clear_referenced for all mappings of a folio, * - * Return: The number of mappings which referenced the folio. + * Return: The number of mappings which referenced the folio. Return -1 if + * the function bailed out due to rmap lock contention. */ int folio_referenced(struct folio *folio, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags) @@ -897,6 +906,7 @@ int folio_referenced(struct folio *folio, int is_locked, .rmap_one = folio_referenced_one, .arg = (void *)&pra, .anon_lock = folio_lock_anon_vma_read, + .try_lock = true, }; *vm_flags = 0; @@ -927,7 +937,7 @@ int folio_referenced(struct folio *folio, int is_locked, if (we_locked) folio_unlock(folio); - return pra.referenced; + return rwc.contended ? -1 : pra.referenced; } static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) @@ -2336,12 +2346,12 @@ void __put_anon_vma(struct anon_vma *anon_vma) } static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, - const struct rmap_walk_control *rwc) + struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; if (rwc->anon_lock) - return rwc->anon_lock(folio); + return rwc->anon_lock(folio, rwc); /* * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() @@ -2353,7 +2363,17 @@ static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, if (!anon_vma) return NULL; + if (anon_vma_trylock_read(anon_vma)) + goto out; + + if (rwc->try_lock) { + anon_vma = NULL; + rwc->contended = true; + goto out; + } + anon_vma_lock_read(anon_vma); +out: return anon_vma; } @@ -2367,7 +2387,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, * contained in the anon_vma struct it points to. */ static void rmap_walk_anon(struct folio *folio, - const struct rmap_walk_control *rwc, bool locked) + struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; @@ -2415,7 +2435,7 @@ static void rmap_walk_anon(struct folio *folio, * contained in the address_space struct it points to. */ static void rmap_walk_file(struct folio *folio, - const struct rmap_walk_control *rwc, bool locked) + struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = folio_mapping(folio); pgoff_t pgoff_start, pgoff_end; @@ -2434,8 +2454,18 @@ static void rmap_walk_file(struct folio *folio, pgoff_start = folio_pgoff(folio); pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; - if (!locked) + if (!locked) { + if (i_mmap_trylock_read(mapping)) + goto lookup; + + if (rwc->try_lock) { + rwc->contended = true; + return; + } + i_mmap_lock_read(mapping); + } +lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { unsigned long address = vma_address(&folio->page, vma); @@ -2457,7 +2487,7 @@ done: i_mmap_unlock_read(mapping); } -void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc) +void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { if (unlikely(folio_test_ksm(folio))) rmap_walk_ksm(folio, rwc); @@ -2468,7 +2498,7 @@ void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc) } /* Like rmap_walk, but caller holds relevant rmap lock */ -void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc) +void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); diff --git a/mm/vmscan.c b/mm/vmscan.c index 24dbe04520cb..887edcd93a40 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1391,6 +1391,10 @@ static enum page_references folio_check_references(struct folio *folio, if (vm_flags & VM_LOCKED) return PAGEREF_ACTIVATE; + /* rmap lock contention: rotate */ + if (referenced_ptes == -1) + return PAGEREF_KEEP; + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -2499,8 +2503,9 @@ static void shrink_active_list(unsigned long nr_to_scan, } } + /* Referenced or rmap lock contention: rotate */ if (folio_referenced(folio, 0, sc->target_mem_cgroup, - &vm_flags)) { + &vm_flags) != 0) { /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So -- cgit v1.2.3 From 70f8d9c5750bbb0ca4ef7e23d6abcb05e6061138 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 2 Mar 2022 17:49:09 -0500 Subject: move mount-related externs from fs.h to mount.h Signed-off-by: Al Viro --- arch/alpha/kernel/osf_sys.c | 1 + include/linux/fs.h | 11 ----------- include/linux/mount.h | 12 ++++++++++++ security/smack/smackfs.c | 1 + 4 files changed, 14 insertions(+), 11 deletions(-) (limited to 'include/linux/fs.h') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 8bbeebb73cf0..d257293401e2 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..14df7c049b43 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2461,22 +2461,11 @@ struct super_block *sget(struct file_system_type *type, extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); -extern struct vfsmount *kern_mount(struct file_system_type *); -extern void kern_unmount(struct vfsmount *mnt); -extern int may_umount_tree(struct vfsmount *); -extern int may_umount(struct vfsmount *); -extern long do_mount(const char *, const char __user *, - const char *, unsigned long, void *); -extern struct vfsmount *collect_mounts(const struct path *); -extern void drop_collected_mounts(struct vfsmount *); -extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, - struct vfsmount *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); extern int freeze_super(struct super_block *super); extern int thaw_super(struct super_block *super); -extern bool our_mnt(struct vfsmount *mnt); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); diff --git a/include/linux/mount.h b/include/linux/mount.h index b3b149dcbf96..55a4abaf6715 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -108,6 +108,18 @@ extern void mark_mounts_for_expiry(struct list_head *mounts); extern dev_t name_to_dev_t(const char *name); extern bool path_is_mountpoint(const struct path *path); +extern bool our_mnt(struct vfsmount *mnt); + +extern struct vfsmount *kern_mount(struct file_system_type *); +extern void kern_unmount(struct vfsmount *mnt); +extern int may_umount_tree(struct vfsmount *); +extern int may_umount(struct vfsmount *); +extern long do_mount(const char *, const char __user *, + const char *, unsigned long, void *); +extern struct vfsmount *collect_mounts(const struct path *); +extern void drop_collected_mounts(struct vfsmount *); +extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, + struct vfsmount *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); #endif /* _LINUX_MOUNT_H */ diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 658eab05599e..192f33cc601e 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "smack.h" -- cgit v1.2.3 From fb70bf124b051d4ded4ce57511dfec6d3ebf2b43 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 30 Mar 2022 10:30:54 -0400 Subject: NFSD: Instantiate a struct file when creating a regular NFSv4 file There have been reports of races that cause NFSv4 OPEN(CREATE) to return an error even though the requested file was created. NFSv4 does not provide a status code for this case. To mitigate some of these problems, reorganize the NFSv4 OPEN(CREATE) logic to allocate resources before the file is actually created, and open the new file while the parent directory is still locked. Two new APIs are added: + Add an API that works like nfsd_file_acquire() but does not open the underlying file. The OPEN(CREATE) path can use this API when it already has an open file. + Add an API that is kin to dentry_open(). NFSD needs to create a file and grab an open "struct file *" atomically. The alloc_empty_file() has to be done before the inode create. If it fails (for example, because the NFS server has exceeded its max_files limit), we avoid creating the file and can still return an error to the NFS client. BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=382 Signed-off-by: Chuck Lever Tested-by: JianHong Yin --- fs/nfsd/filecache.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- fs/nfsd/filecache.h | 2 ++ fs/nfsd/nfs4proc.c | 43 +++++++++++++++++++++++++++++++++++++++---- fs/nfsd/nfs4state.c | 16 +++++++++++++--- fs/nfsd/xdr4.h | 1 + fs/open.c | 42 ++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 2 ++ 7 files changed, 143 insertions(+), 14 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 781bef7e42d9..24772f246461 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -897,9 +897,9 @@ nfsd_file_is_cached(struct inode *inode) return ret; } -__be32 -nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, - unsigned int may_flags, struct nfsd_file **pnf) +static __be32 +nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf, bool open) { __be32 status; struct net *net = SVC_NET(rqstp); @@ -994,10 +994,13 @@ open_file: nfsd_file_gc(); nf->nf_mark = nfsd_file_mark_find_or_create(nf); - if (nf->nf_mark) - status = nfsd_open_verified(rqstp, fhp, may_flags, - &nf->nf_file); - else + if (nf->nf_mark) { + if (open) + status = nfsd_open_verified(rqstp, fhp, may_flags, + &nf->nf_file); + else + status = nfs_ok; + } else status = nfserr_jukebox; /* * If construction failed, or we raced with a call to unlink() @@ -1017,6 +1020,40 @@ open_file: goto out; } +/** + * nfsd_file_acquire - Get a struct nfsd_file with an open file + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file to be opened + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in + * network byte order is returned. + */ +__be32 +nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, true); +} + +/** + * nfsd_file_create - Get a struct nfsd_file, do not open + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file just created + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in + * network byte order is returned. + */ +__be32 +nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, false); +} + /* * Note that fields may be added, removed or reordered in the future. Programs * scraping this file for info should test the labels to ensure they're diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 435ceab27897..1da0c79a5580 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -59,5 +59,7 @@ void nfsd_file_close_inode_sync(struct inode *inode); bool nfsd_file_is_cached(struct inode *inode); __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); +__be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); int nfsd_file_cache_stats_open(struct inode *, struct file *); #endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 99c0485a29e9..28bae84d7809 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -243,6 +243,37 @@ static inline bool nfsd4_create_is_exclusive(int createmode) createmode == NFS4_CREATE_EXCLUSIVE4_1; } +static __be32 +nfsd4_vfs_create(struct svc_fh *fhp, struct dentry *child, + struct nfsd4_open *open) +{ + struct file *filp; + struct path path; + int oflags; + + oflags = O_CREAT | O_LARGEFILE; + switch (open->op_share_access & NFS4_SHARE_ACCESS_BOTH) { + case NFS4_SHARE_ACCESS_WRITE: + oflags |= O_WRONLY; + break; + case NFS4_SHARE_ACCESS_BOTH: + oflags |= O_RDWR; + break; + default: + oflags |= O_RDONLY; + } + + path.mnt = fhp->fh_export->ex_path.mnt; + path.dentry = child; + filp = dentry_create(&path, oflags, open->op_iattr.ia_mode, + current_cred()); + if (IS_ERR(filp)) + return nfserrno(PTR_ERR(filp)); + + open->op_filp = filp; + return nfs_ok; +} + /* * Implement NFSv4's unchecked, guarded, and exclusive create * semantics for regular files. Open state for this new file is @@ -355,11 +386,9 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!IS_POSIXACL(inode)) iap->ia_mode &= ~current_umask(); - host_err = vfs_create(&init_user_ns, inode, child, iap->ia_mode, true); - if (host_err < 0) { - status = nfserrno(host_err); + status = nfsd4_vfs_create(fhp, child, open); + if (status != nfs_ok) goto out; - } open->op_created = true; /* A newly created file already has a file size of zero. */ @@ -517,6 +546,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, (int)open->op_fnamelen, open->op_fname, open->op_openowner); + open->op_filp = NULL; + /* This check required by spec. */ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; @@ -613,6 +644,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (reclaim && !status) nn->somebody_reclaimed = true; out: + if (open->op_filp) { + fput(open->op_filp); + open->op_filp = NULL; + } if (resfh && resfh != &cstate->current_fh) { fh_dup2(&cstate->current_fh, resfh); fh_put(resfh); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 231e5c19cdb7..131102cba06b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5093,9 +5093,19 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, if (!fp->fi_fds[oflag]) { spin_unlock(&fp->fi_lock); - status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); - if (status) - goto out_put_access; + + if (!open->op_filp) { + status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); + if (status != nfs_ok) + goto out_put_access; + } else { + status = nfsd_file_create(rqstp, cur_fh, access, &nf); + if (status != nfs_ok) + goto out_put_access; + nf->nf_file = open->op_filp; + open->op_filp = NULL; + } + spin_lock(&fp->fi_lock); if (!fp->fi_fds[oflag]) { fp->fi_fds[oflag] = nf; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 846ab6df9d48..7b744011f2d3 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -273,6 +273,7 @@ struct nfsd4_open { bool op_truncate; /* used during processing */ bool op_created; /* used during processing */ struct nfs4_openowner *op_openowner; /* used during processing */ + struct file *op_filp; /* used during processing */ struct nfs4_file *op_file; /* used during processing */ struct nfs4_ol_stateid *op_stp; /* used during processing */ struct nfs4_clnt_odstate *op_odstate; /* used during processing */ diff --git a/fs/open.c b/fs/open.c index 1315253e0247..117355ae6bd5 100644 --- a/fs/open.c +++ b/fs/open.c @@ -981,6 +981,48 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +/** + * dentry_create - Create and open a file + * @path: path to create + * @flags: O_ flags + * @mode: mode bits for new file + * @cred: credentials to use + * + * Caller must hold the parent directory's lock, and have prepared + * a negative dentry, placed in @path->dentry, for the new file. + * + * Caller sets @path->mnt to the vfsmount of the filesystem where + * the new file is to be created. The parent directory and the + * negative dentry must reside on the same filesystem instance. + * + * On success, returns a "struct file *". Otherwise a ERR_PTR + * is returned. + */ +struct file *dentry_create(const struct path *path, int flags, umode_t mode, + const struct cred *cred) +{ + struct file *f; + int error; + + validate_creds(cred); + f = alloc_empty_file(flags, cred); + if (IS_ERR(f)) + return f; + + error = vfs_create(mnt_user_ns(path->mnt), + d_inode(path->dentry->d_parent), + path->dentry, mode, true); + if (!error) + error = vfs_open(path, f); + + if (unlikely(error)) { + fput(f); + return ERR_PTR(error); + } + return f; +} +EXPORT_SYMBOL(dentry_create); + struct file *open_with_fake_path(const struct path *path, int flags, struct inode *inode, const struct cred *cred) { diff --git a/include/linux/fs.h b/include/linux/fs.h index aa6c1bbdb8c4..b848355b5e6c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2640,6 +2640,8 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt, name, flags, mode); } extern struct file * dentry_open(const struct path *, int, const struct cred *); +extern struct file *dentry_create(const struct path *path, int flags, + umode_t mode, const struct cred *cred); extern struct file * open_with_fake_path(const struct path *, int, struct inode*, const struct cred *); static inline struct file *file_clone_open(struct file *file) -- cgit v1.2.3