diff options
Diffstat (limited to 'fs')
101 files changed, 3887 insertions, 2735 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 44b6cdd36dc1..c654a3642897 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -59,7 +59,7 @@ endif # BLOCK config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU - depends on ZONE_DEVICE || FS_DAX_LIMITED + depends on ZONE_DEVICE select FS_IOMAP select DAX help @@ -95,13 +95,6 @@ config FS_DAX_PMD depends on ZONE_DEVICE depends on TRANSPARENT_HUGEPAGE -# Selected by DAX drivers that do not expect filesystem DAX to support -# get_user_pages() of DAX mappings. I.e. "limited" indicates no support -# for fork() of processes with MAP_SHARED mappings or support for -# direct-I/O to a DAX mapping. -config FS_DAX_LIMITED - bool - # Posix ACL utility routines # # Note: Posix ACLs can be implemented without these helpers. Never use @@ -256,7 +249,7 @@ config ARCH_SUPPORTS_HUGETLBFS menuconfig HUGETLBFS bool "HugeTLB file system support" - depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN + depends on ARCH_SUPPORTS_HUGETLBFS depends on (SYSFS || SYSCTL) select MEMFD_CREATE select PADATA if SMP diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9f05d454b9df..2186e87fb61b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -321,8 +321,7 @@ struct walk_control { /* * Ignore any items from the inode currently being processed. Needs - * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in - * the LOG_WALK_REPLAY_INODES stage. + * to be set every time we find a BTRFS_INODE_ITEM_KEY. 
*/ bool ignore_cur_inode; @@ -2465,23 +2464,30 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, nritems = btrfs_header_nritems(eb); for (i = 0; i < nritems; i++) { - btrfs_item_key_to_cpu(eb, &key, i); + struct btrfs_inode_item *inode_item; - /* inode keys are done during the first stage */ - if (key.type == BTRFS_INODE_ITEM_KEY && - wc->stage == LOG_WALK_REPLAY_INODES) { - struct btrfs_inode_item *inode_item; - u32 mode; + btrfs_item_key_to_cpu(eb, &key, i); - inode_item = btrfs_item_ptr(eb, i, - struct btrfs_inode_item); + if (key.type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); /* - * If we have a tmpfile (O_TMPFILE) that got fsync'ed - * and never got linked before the fsync, skip it, as - * replaying it is pointless since it would be deleted - * later. We skip logging tmpfiles, but it's always - * possible we are replaying a log created with a kernel - * that used to log tmpfiles. + * An inode with no links is either: + * + * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never + * got linked before the fsync, skip it, as replaying + * it is pointless since it would be deleted later. + * We skip logging tmpfiles, but it's always possible + * we are replaying a log created with a kernel that + * used to log tmpfiles; + * + * 2) A non-tmpfile which got its last link deleted + * while holding an open fd on it and later got + * fsynced through that fd. We always log the + * parent inodes when inode->last_unlink_trans is + * set to the current transaction, so ignore all the + * inode items for this inode. We will delete the + * inode when processing the parent directory with + * replay_dir_deletes(). 
*/ if (btrfs_inode_nlink(eb, inode_item) == 0) { wc->ignore_cur_inode = true; @@ -2489,8 +2495,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, } else { wc->ignore_cur_inode = false; } - ret = replay_xattr_deletes(wc->trans, root, log, - path, key.objectid); + } + + /* Inode keys are done during the first stage. */ + if (key.type == BTRFS_INODE_ITEM_KEY && + wc->stage == LOG_WALK_REPLAY_INODES) { + u32 mode; + + ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid); if (ret) break; mode = btrfs_inode_mode(eb, inode_item); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index bdde6ac5a60d..c02f100f8552 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2529,19 +2529,19 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek(file, offset, whence); } -static inline void ceph_zero_partial_page( - struct inode *inode, loff_t offset, unsigned size) +static inline void ceph_zero_partial_page(struct inode *inode, + loff_t offset, size_t size) { - struct page *page; - pgoff_t index = offset >> PAGE_SHIFT; + struct folio *folio; - page = find_lock_page(inode->i_mapping, index); - if (page) { - wait_on_page_writeback(page); - zero_user(page, offset & (PAGE_SIZE - 1), size); - unlock_page(page); - put_page(page); - } + folio = filemap_lock_folio(inode->i_mapping, offset >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + folio_wait_writeback(folio); + folio_zero_range(folio, offset_in_folio(folio, offset), size); + folio_unlock(folio); + folio_put(folio); } static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index b84d1747a020..b002e9b734f9 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -17,7 +17,6 @@ #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> -#include <linux/pfn_t.h> #include <linux/ramfs.h> #include <linux/init.h> #include <linux/string.h> @@ -412,8 +411,8 @@ static 
int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) for (i = 0; i < pages && !ret; i++) { vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; - pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); - vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn); + vmf = vmf_insert_mixed(vma, vma->vm_start + off, + address + off); if (vmf & VM_FAULT_ERROR) ret = vm_fault_to_errno(vmf, 0); } @@ -20,7 +20,6 @@ #include <linux/sched/signal.h> #include <linux/uio.h> #include <linux/vmstat.h> -#include <linux/pfn_t.h> #include <linux/sizes.h> #include <linux/mmu_notifier.h> #include <linux/iomap.h> @@ -76,9 +75,9 @@ static struct folio *dax_to_folio(void *entry) return page_folio(pfn_to_page(dax_to_pfn(entry))); } -static void *dax_make_entry(pfn_t pfn, unsigned long flags) +static void *dax_make_entry(unsigned long pfn, unsigned long flags) { - return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); + return xa_mk_value(flags | (pfn << DAX_SHIFT)); } static bool dax_is_locked(void *entry) @@ -449,9 +448,6 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) return; - if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) - return; - index = linear_page_index(vma, address & ~(size - 1)); if (shared && (folio->mapping || dax_folio_is_shared(folio))) { if (folio->mapping) @@ -474,9 +470,6 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, { struct folio *folio = dax_to_folio(entry); - if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) - return; - if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) return; @@ -719,7 +712,7 @@ retry: if (order > 0) flags |= DAX_PMD; - entry = dax_make_entry(pfn_to_pfn_t(0), flags); + entry = dax_make_entry(0, flags); dax_lock_entry(xas, entry); if (xas_error(xas)) goto out_unlock; @@ -768,12 +761,6 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, pgoff_t end_idx; XA_STATE(xas, &mapping->i_pages, start_idx); - 
/* - * In the 'limited' case get_user_pages() for dax is disabled. - */ - if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) - return NULL; - if (!dax_mapping(mapping)) return NULL; @@ -1053,7 +1040,7 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter, * appropriate. */ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, - const struct iomap_iter *iter, void *entry, pfn_t pfn, + const struct iomap_iter *iter, void *entry, unsigned long pfn, unsigned long flags) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; @@ -1251,7 +1238,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos, - size_t size, void **kaddr, pfn_t *pfnp) + size_t size, void **kaddr, unsigned long *pfnp) { pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); int id, rc = 0; @@ -1269,7 +1256,7 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos, rc = -EINVAL; if (PFN_PHYS(length) < size) goto out; - if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) + if (*pfnp & (PHYS_PFN(size)-1)) goto out; rc = 0; @@ -1373,12 +1360,12 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf, { struct inode *inode = iter->inode; unsigned long vaddr = vmf->address; - pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); + unsigned long pfn = my_zero_pfn(vaddr); vm_fault_t ret; *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE); - ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), false); + ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), false); trace_dax_load_hole(inode, vmf, ret); return ret; } @@ -1395,14 +1382,14 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, struct folio *zero_folio; spinlock_t *ptl; pmd_t pmd_entry; - pfn_t pfn; + unsigned long pfn; zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm); if (unlikely(!zero_folio)) goto fallback; - pfn 
= page_to_pfn_t(&zero_folio->page); + pfn = page_to_pfn(&zero_folio->page); *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_PMD | DAX_ZERO_PAGE); @@ -1791,7 +1778,8 @@ static vm_fault_t dax_fault_return(int error) * insertion for now and return the pfn so that caller can insert it after the * fsync is done. */ -static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn) +static vm_fault_t dax_fault_synchronous_pfnp(unsigned long *pfnp, + unsigned long pfn) { if (WARN_ON_ONCE(!pfnp)) return VM_FAULT_SIGBUS; @@ -1839,7 +1827,7 @@ static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, * @pmd: distinguish whether it is a pmd fault */ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, - const struct iomap_iter *iter, pfn_t *pfnp, + const struct iomap_iter *iter, unsigned long *pfnp, struct xa_state *xas, void **entry, bool pmd) { const struct iomap *iomap = &iter->iomap; @@ -1850,7 +1838,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, unsigned long entry_flags = pmd ? DAX_PMD : 0; struct folio *folio; int ret, err = 0; - pfn_t pfn; + unsigned long pfn; void *kaddr; if (!pmd && vmf->cow_page) @@ -1887,16 +1875,15 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, folio_ref_inc(folio); if (pmd) - ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)), - write); + ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn), write); else - ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write); + ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), write); folio_put(folio); return ret; } -static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp, int *iomap_errp, const struct iomap_ops *ops) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; @@ -1937,7 +1924,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, * the PTE we need to set up. If so just return and the fault will be * retried. 
*/ - if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { + if (pmd_trans_huge(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto unlock_entry; } @@ -2008,7 +1995,7 @@ static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas, return false; } -static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp, const struct iomap_ops *ops) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; @@ -2060,8 +2047,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * the PMD we need to set up. If so just return and the fault will be * retried. */ - if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && - !pmd_devmap(*vmf->pmd)) { + if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd)) { ret = 0; goto unlock_entry; } @@ -2090,7 +2076,7 @@ out: return ret; } #else -static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; @@ -2111,7 +2097,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * successfully. */ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, - pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) + unsigned long *pfnp, int *iomap_errp, + const struct iomap_ops *ops) { if (order == 0) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); @@ -2131,8 +2118,8 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault); * This function inserts a writeable PTE or PMD entry into the page tables * for an mmaped DAX file. It also marks the page cache entry as dirty. 
*/ -static vm_fault_t -dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) +static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, + unsigned long pfn, unsigned int order) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); @@ -2154,7 +2141,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); dax_lock_entry(&xas, entry); xas_unlock_irq(&xas); - folio = pfn_folio(pfn_t_to_pfn(pfn)); + folio = pfn_folio(pfn); folio_ref_inc(folio); if (order == 0) ret = vmf_insert_page_mkwrite(vmf, &folio->page, true); @@ -2181,7 +2168,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) * table entry. */ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, - pfn_t pfn) + unsigned long pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; diff --git a/fs/direct-io.c b/fs/direct-io.c index 1694ee9a9382..2267f5ae7f77 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -996,7 +996,7 @@ do_holes: dio_unpin_page(dio, page); goto out; } - zero_user(page, from, 1 << blkbits); + memzero_page(page, from, 1 << blkbits); sdio->block_in_file++; from += 1 << blkbits; dio->result += 1 << blkbits; diff --git a/fs/exec.c b/fs/exec.c index fe895e47f1dd..2a1e5e4042a1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -604,7 +604,7 @@ int setup_arg_pages(struct linux_binprm *bprm, struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; - unsigned long vm_flags; + vm_flags_t vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 3103b932b674..ee060e26f51d 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -996,6 +996,7 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_hint_femp candi_empty; struct 
exfat_sb_info *sbi = EXFAT_SB(sb); int num_entries = exfat_calc_num_entries(p_uniname); + unsigned int clu_count = 0; if (num_entries < 0) return num_entries; @@ -1133,6 +1134,10 @@ rewind: } else { if (exfat_get_next_cluster(sb, &clu.dir)) return -EIO; + + /* break if the cluster chain includes a loop */ + if (unlikely(++clu_count > EXFAT_DATA_CLUSTER_COUNT(sbi))) + goto not_found; } } @@ -1195,6 +1200,7 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir) int i, count = 0; int dentries_per_clu; unsigned int entry_type; + unsigned int clu_count = 0; struct exfat_chain clu; struct exfat_dentry *ep; struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -1227,6 +1233,12 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir) } else { if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; + + if (unlikely(++clu_count > sbi->used_clusters)) { + exfat_fs_error(sb, "FAT or bitmap is corrupted"); + return -EIO; + } + } } diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 23065f948ae7..232cc7f8ab92 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -490,5 +490,15 @@ int exfat_count_num_clusters(struct super_block *sb, } *ret_count = count; + + /* + * since exfat_count_used_clusters() is not called, sbi->used_clusters + * cannot be used here. 
+ */ + if (unlikely(i == sbi->num_clusters && clu != EXFAT_EOF_CLUSTER)) { + exfat_fs_error(sb, "The cluster chain has a loop"); + return -EIO; + } + return 0; } diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 6b82497572b4..538d2b6ac2ec 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -622,9 +622,8 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (pos > valid_size) pos = valid_size; - if (iocb_is_dsync(iocb) && iocb->ki_pos > pos) { - ssize_t err = vfs_fsync_range(file, pos, iocb->ki_pos - 1, - iocb->ki_flags & IOCB_SYNC); + if (iocb->ki_pos > pos) { + ssize_t err = generic_write_sync(iocb, iocb->ki_pos - pos); if (err < 0) return err; } diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index fede0283d6e2..f5f1c4e8a29f 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -890,6 +890,7 @@ static int exfat_check_dir_empty(struct super_block *sb, { int i, dentries_per_clu; unsigned int type; + unsigned int clu_count = 0; struct exfat_chain clu; struct exfat_dentry *ep; struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -926,6 +927,10 @@ static int exfat_check_dir_empty(struct super_block *sb, } else { if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; + + /* break if the cluster chain includes a loop */ + if (unlikely(++clu_count > EXFAT_DATA_CLUSTER_COUNT(sbi))) + break; } } diff --git a/fs/exfat/super.c b/fs/exfat/super.c index ea5c1334a214..8926e63f5bb7 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -341,13 +341,12 @@ static void exfat_hash_init(struct super_block *sb) INIT_HLIST_HEAD(&sbi->inode_hashtable[i]); } -static int exfat_read_root(struct inode *inode) +static int exfat_read_root(struct inode *inode, struct exfat_chain *root_clu) { struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); - struct exfat_chain cdir; - int num_subdirs, num_clu = 0; + int num_subdirs; exfat_chain_set(&ei->dir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); 
ei->entry = -1; @@ -360,12 +359,9 @@ static int exfat_read_root(struct inode *inode) ei->hint_stat.clu = sbi->root_dir; ei->hint_femp.eidx = EXFAT_HINT_NONE; - exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); - if (exfat_count_num_clusters(sb, &cdir, &num_clu)) - return -EIO; - i_size_write(inode, num_clu << sbi->cluster_size_bits); + i_size_write(inode, EXFAT_CLU_TO_B(root_clu->size, sbi)); - num_subdirs = exfat_count_dir_entries(sb, &cdir); + num_subdirs = exfat_count_dir_entries(sb, root_clu); if (num_subdirs < 0) return -EIO; set_nlink(inode, num_subdirs + EXFAT_MIN_SUBDIR); @@ -578,7 +574,8 @@ static int exfat_verify_boot_region(struct super_block *sb) } /* mount the file system volume */ -static int __exfat_fill_super(struct super_block *sb) +static int __exfat_fill_super(struct super_block *sb, + struct exfat_chain *root_clu) { int ret; struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -595,6 +592,18 @@ static int __exfat_fill_super(struct super_block *sb) goto free_bh; } + /* + * Call exfat_count_num_cluster() before searching for up-case and + * bitmap directory entries to avoid infinite loop if they are missing + * and the cluster chain includes a loop. 
+ */ + exfat_chain_set(root_clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + ret = exfat_count_num_clusters(sb, root_clu, &root_clu->size); + if (ret) { + exfat_err(sb, "failed to count the number of clusters in root"); + goto free_bh; + } + ret = exfat_create_upcase_table(sb); if (ret) { exfat_err(sb, "failed to load upcase table"); @@ -627,6 +636,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) struct exfat_sb_info *sbi = sb->s_fs_info; struct exfat_mount_options *opts = &sbi->options; struct inode *root_inode; + struct exfat_chain root_clu; int err; if (opts->allow_utime == (unsigned short)-1) @@ -645,7 +655,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS; sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS; - err = __exfat_fill_super(sb); + err = __exfat_fill_super(sb, &root_clu); if (err) { exfat_err(sb, "failed to recognize exfat type"); goto check_nls_io; @@ -680,7 +690,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) root_inode->i_ino = EXFAT_ROOT_INO; inode_set_iversion(root_inode, 1); - err = exfat_read_root(root_inode); + err = exfat_read_root(root_inode, &root_clu); if (err) { exfat_err(sb, "failed to initialize root inode"); goto put_inode; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index c48fd36b2d74..c9329ed5c094 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -703,7 +703,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * possible we just missed a transaction commit that did so */ smp_mb(); - if (sbi->s_mb_free_pending == 0) { + if (atomic_read(&sbi->s_mb_free_pending) == 0) { if (test_opt(sb, DISCARD)) { atomic_inc(&sbi->s_retry_alloc_pending); flush_work(&sbi->s_discard_work); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7d962e7f388a..01a6e2de7fc3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -157,7 +157,7 @@ enum criteria { /* * Reads each block group sequentially, performing disk 
IO if - * necessary, to find find_suitable block group. Tries to + * necessary, to find suitable block group. Tries to * allocate goal length but might trim the request if nothing * is found after enough tries. */ @@ -185,14 +185,8 @@ enum criteria { /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 0x0001 -/* blocks already reserved */ -#define EXT4_MB_HINT_RESERVED 0x0002 -/* metadata is being allocated */ -#define EXT4_MB_HINT_METADATA 0x0004 /* first blocks in the file */ #define EXT4_MB_HINT_FIRST 0x0008 -/* search for the best chunk */ -#define EXT4_MB_HINT_BEST 0x0010 /* data is being allocated */ #define EXT4_MB_HINT_DATA 0x0020 /* don't preallocate (for tails) */ @@ -213,15 +207,6 @@ enum criteria { #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 -/* Large fragment size list lookup succeeded at least once for - * CR_POWER2_ALIGNED */ -#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 -/* Avg fragment size rb tree lookup succeeded at least once for - * CR_GOAL_LEN_FAST */ -#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 -/* Avg fragment size rb tree lookup succeeded at least once for - * CR_BEST_AVAIL_LEN */ -#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -1608,16 +1593,14 @@ struct ext4_sb_info { unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; unsigned int s_group_info_size; - unsigned int s_mb_free_pending; + atomic_t s_mb_free_pending; struct list_head s_freed_data_list[2]; /* List of blocks to be freed after commit completed */ struct list_head s_discard_list; struct work_struct s_discard_work; atomic_t s_retry_alloc_pending; - struct list_head *s_mb_avg_fragment_size; - rwlock_t *s_mb_avg_fragment_size_locks; - struct list_head *s_mb_largest_free_orders; - rwlock_t *s_mb_largest_free_orders_locks; + struct xarray *s_mb_avg_fragment_size; + struct 
xarray *s_mb_largest_free_orders; /* tunables */ unsigned long s_stripe; @@ -1629,15 +1612,16 @@ struct ext4_sb_info { unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; unsigned int s_max_dir_size_kb; - /* where last allocation was done - for stream allocation */ - unsigned long s_mb_last_group; - unsigned long s_mb_last_start; unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; unsigned int s_mb_best_avail_max_trim_order; unsigned int s_sb_update_sec; unsigned int s_sb_update_kb; + /* where last allocation was done - for stream allocation */ + ext4_group_t *s_mb_last_groups; + unsigned int s_mb_nr_global_goals; + /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ atomic_t s_bal_success; /* we found long enough chunks */ @@ -1646,12 +1630,10 @@ struct ext4_sb_info { atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_stream_goals; /* stream allocation global goal hits */ atomic_t s_bal_len_goals; /* len goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ - atomic_t s_bal_p2_aligned_bad_suggestions; - atomic_t s_bal_goal_fast_bad_suggestions; - atomic_t s_bal_best_avail_bad_suggestions; atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ @@ -3020,7 +3002,7 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); -bool ext4_should_enable_large_folio(struct inode *inode); +void ext4_set_inode_mapping_order(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 @@ -3064,9 +3046,9 @@ extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t 
length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); -extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, @@ -3489,8 +3471,6 @@ struct ext4_group_info { void *bb_bitmap; #endif struct rw_semaphore alloc_sem; - struct list_head bb_avg_fragment_size_node; - struct list_head bb_largest_free_order_node; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. * bb_counters[3] = 5 means @@ -3541,23 +3521,28 @@ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); } +static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group) +{ + if (!spin_trylock(ext4_group_lock_ptr(sb, group))) + return false; + /* + * We're able to grab the lock right away, so drop the lock + * contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + return true; +} + static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { - spinlock_t *lock = ext4_group_lock_ptr(sb, group); - if (spin_trylock(lock)) - /* - * We're able to grab the lock right away, so drop the - * lock contention counter. - */ - atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); - else { + if (!ext4_try_lock_group(sb, group)) { /* * The lock is busy, so bump the contention counter, * and then wait on the spin lock. 
*/ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, EXT4_MAX_CONTENTION); - spin_lock(lock); + spin_lock(ext4_group_lock_ptr(sb, group)); } } @@ -3612,6 +3597,7 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); +extern void ext4_update_final_de(void *de_buf, int old_size, int new_size); int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, @@ -3671,10 +3657,10 @@ static inline int ext4_has_inline_data(struct inode *inode) extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); -extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, - struct ext4_dir_entry_2 *de, - int blocksize, int csum_size, - unsigned int parent_ino, int dotdot_real_len); +extern int ext4_init_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *dir_block, + unsigned int parent_ino, void *inline_buf, + int inline_size); extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 26435f3a3094..c484125d963f 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -31,13 +31,6 @@ #define CHECK_BINSEARCH__ /* - * If EXT_STATS is defined then stats numbers are collected. - * These number will be displayed at umount time. - */ -#define EXT_STATS_ - - -/* * ext4_inode has i_block array (60 bytes total). * The first 12 bytes store ext4_extent_header; * the remainder stores an array of ext4_extent. 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b43aa82c1b39..ca5499e9412b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5215,7 +5215,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, credits = depth + 2; } - restart_credits = ext4_writepage_trans_blocks(inode); + restart_credits = ext4_chunk_trans_extent(inode, 0); err = ext4_datasem_ensure_credits(handle, inode, credits, restart_credits, 0); if (err) { @@ -5475,7 +5475,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) truncate_pagecache(inode, start); - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 0); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -5571,7 +5571,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) truncate_pagecache(inode, start); - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 0); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 48908ce0c3ea..93240e35ee36 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -747,7 +747,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); struct address_space *mapping = vmf->vma->vm_file->f_mapping; - pfn_t pfn; + unsigned long pfn; if (write) { sb_start_pagefault(sb); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 79aa3df8d019..df4051613b29 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1335,8 +1335,7 @@ got: } } - if (ext4_should_enable_large_folio(inode)) - mapping_set_large_folios(inode->i_mapping); + ext4_set_inode_mapping_order(inode); ext4_update_inode_fsync_trans(handle, inode, 1); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a1bbcdf40824..1b094a4f3866 
100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -303,7 +303,11 @@ static int ext4_create_inline_data(handle_t *handle, if (error) goto out; - BUG_ON(!is.s.not_found); + if (!is.s.not_found) { + EXT4_ERROR_INODE(inode, "unexpected inline data xattr"); + error = -EFSCORRUPTED; + goto out; + } error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) { @@ -354,7 +358,11 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, if (error) goto out; - BUG_ON(is.s.not_found); + if (is.s.not_found) { + EXT4_ERROR_INODE(inode, "missing inline data xattr"); + error = -EFSCORRUPTED; + goto out; + } len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); @@ -562,7 +570,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, return 0; } - needed_blocks = ext4_writepage_trans_blocks(inode); + needed_blocks = ext4_chunk_trans_extent(inode, 1); ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -612,6 +620,7 @@ retry: } else ret = ext4_block_write_begin(handle, folio, from, to, ext4_get_block); + clear_buffer_new(folio_buffers(folio)); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -891,6 +900,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, return ret; } + clear_buffer_new(folio_buffers(folio)); folio_mark_dirty(folio); folio_mark_uptodate(folio); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); @@ -995,7 +1005,7 @@ static void *ext4_get_inline_xattr_pos(struct inode *inode, } /* Set the final de to cover the whole block. 
*/ -static void ext4_update_final_de(void *de_buf, int old_size, int new_size) +void ext4_update_final_de(void *de_buf, int old_size, int new_size) { struct ext4_dir_entry_2 *de, *prev_de; void *limit; @@ -1059,51 +1069,6 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } -static int ext4_finish_convert_inline_dir(handle_t *handle, - struct inode *inode, - struct buffer_head *dir_block, - void *buf, - int inline_size) -{ - int err, csum_size = 0, header_size = 0; - struct ext4_dir_entry_2 *de; - void *target = dir_block->b_data; - - /* - * First create "." and ".." and then copy the dir information - * back to the block. - */ - de = target; - de = ext4_init_dot_dotdot(inode, de, - inode->i_sb->s_blocksize, csum_size, - le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); - header_size = (void *)de - target; - - memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, - inline_size - EXT4_INLINE_DOTDOT_SIZE); - - if (ext4_has_feature_metadata_csum(inode->i_sb)) - csum_size = sizeof(struct ext4_dir_entry_tail); - - inode->i_size = inode->i_sb->s_blocksize; - i_size_write(inode, inode->i_sb->s_blocksize); - EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - ext4_update_final_de(dir_block->b_data, - inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, - inode->i_sb->s_blocksize - csum_size); - - if (csum_size) - ext4_initialize_dirent_tail(dir_block, - inode->i_sb->s_blocksize); - set_buffer_uptodate(dir_block); - unlock_buffer(dir_block); - err = ext4_handle_dirty_dirblock(handle, inode, dir_block); - if (err) - return err; - set_buffer_verified(dir_block); - return ext4_mark_inode_dirty(handle, inode); -} - static int ext4_convert_inline_data_nolock(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) @@ -1175,8 +1140,17 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, error = ext4_handle_dirty_metadata(handle, inode, data_bh); } else { - error = 
ext4_finish_convert_inline_dir(handle, inode, data_bh, - buf, inline_size); + unlock_buffer(data_bh); + inode->i_size = inode->i_sb->s_blocksize; + i_size_write(inode, inode->i_sb->s_blocksize); + EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + + error = ext4_init_dirblock(handle, inode, data_bh, + le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), + buf + EXT4_INLINE_DOTDOT_SIZE, + inline_size - EXT4_INLINE_DOTDOT_SIZE); + if (!error) + error = ext4_mark_inode_dirty(handle, inode); } out_restore: @@ -1315,7 +1289,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, if (pos == 0) { fake.inode = cpu_to_le32(inode->i_ino); fake.name_len = 1; - strcpy(fake.name, "."); + memcpy(fake.name, ".", 2); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); @@ -1325,7 +1299,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { fake.inode = cpu_to_le32(parent_ino); fake.name_len = 2; - strcpy(fake.name, ".."); + memcpy(fake.name, "..", 3); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); @@ -1864,7 +1838,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) }; - needed_blocks = ext4_writepage_trans_blocks(inode); + needed_blocks = ext4_chunk_trans_extent(inode, 1); handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1903,7 +1877,12 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) goto out_error; - BUG_ON(is.s.not_found); + if (is.s.not_found) { + EXT4_ERROR_INODE(inode, + "missing inline data xattr"); + err = -EFSCORRUPTED; + goto out_error; + } value_len = le32_to_cpu(is.s.here->e_value_size); value = kmalloc(value_len, GFP_NOFS); @@ -1979,7 +1958,7 @@ int ext4_convert_inline_data(struct inode *inode) return 0; } - needed_blocks = ext4_writepage_trans_blocks(inode); + needed_blocks = 
ext4_chunk_trans_extent(inode, 1); iloc.bh = NULL; error = ext4_get_inode_loc(inode, &iloc); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5c7024051f1e..ed54c4d0f2f9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -723,8 +723,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ - if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -757,8 +756,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, orig_mlen == map->m_len) goto found; - if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) - map->m_len = orig_mlen; + map->m_len = orig_mlen; } /* * In the query cache no-wait mode, nothing we can do more if we @@ -877,6 +875,26 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } +/* + * Make sure that the current journal transaction has enough credits to map + * one extent. Return -EAGAIN if it cannot extend the current running + * transaction. + */ +static inline int ext4_journal_ensure_extent_credits(handle_t *handle, + struct inode *inode) +{ + int credits; + int ret; + + /* Called from ext4_da_write_begin() which has no handle started? */ + if (!handle) + return 0; + + credits = ext4_chunk_trans_blocks(inode, 1); + ret = __ext4_journal_ensure_credits(handle, credits, credits, 0); + return ret <= 0 ? 
ret : -EAGAIN; +} + static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { @@ -1171,11 +1189,13 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, } continue; } - if (buffer_new(bh)) + if (WARN_ON_ONCE(buffer_new(bh))) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); - err = get_block(inode, block, bh, 1); + err = ext4_journal_ensure_extent_credits(handle, inode); + if (!err) + err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { @@ -1274,7 +1294,8 @@ static int ext4_write_begin(const struct kiocb *iocb, * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ - needed_blocks = ext4_writepage_trans_blocks(inode) + 1; + needed_blocks = ext4_chunk_trans_extent(inode, + ext4_journal_blocks_per_folio(inode)) + 1; index = pos >> PAGE_SHIFT; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -1372,8 +1393,9 @@ retry_journal: ext4_orphan_del(NULL, inode); } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + if (ret == -EAGAIN || + (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_journal; folio_put(folio); return ret; @@ -1393,6 +1415,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode, ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); + clear_buffer_new(bh); return ret; } @@ -1665,11 +1688,12 @@ struct mpage_da_data { unsigned int can_map:1; /* Can writepages call map blocks? 
*/ /* These are internal state of ext4_do_writepages() */ - pgoff_t first_page; /* The first page to write */ - pgoff_t next_page; /* Current page to examine */ - pgoff_t last_page; /* Last page to examine */ + loff_t start_pos; /* The start pos to write */ + loff_t next_pos; /* Current pos to examine */ + loff_t end_pos; /* Last pos to examine */ + /* - * Extent to map - this can be after first_page because that can be + * Extent to map - this can be after start_pos because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ @@ -1689,38 +1713,38 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - /* This is necessary when next_page == 0. */ - if (mpd->first_page >= mpd->next_page) + /* This is necessary when next_pos == 0. */ + if (mpd->start_pos >= mpd->next_pos) return; mpd->scanned_until_end = 0; - index = mpd->first_page; - end = mpd->next_page - 1; if (invalidate) { ext4_lblk_t start, last; - start = index << (PAGE_SHIFT - inode->i_blkbits); - last = end << (PAGE_SHIFT - inode->i_blkbits); + start = EXT4_B_TO_LBLK(inode, mpd->start_pos); + last = mpd->next_pos >> inode->i_blkbits; /* * avoid racing with extent status tree scans made by * ext4_insert_delayed_block() */ down_write(&EXT4_I(inode)->i_data_sem); - ext4_es_remove_extent(inode, start, last - start + 1); + ext4_es_remove_extent(inode, start, last - start); up_write(&EXT4_I(inode)->i_data_sem); } folio_batch_init(&fbatch); - while (index <= end) { - nr = filemap_get_folios(mapping, &index, end, &fbatch); + index = mpd->start_pos >> PAGE_SHIFT; + end = mpd->next_pos >> PAGE_SHIFT; + while (index < end) { + nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; - if (folio->index < mpd->first_page) + if (folio_pos(folio) < mpd->start_pos) continue; - 
if (folio_next_index(folio) - 1 > end) + if (folio_next_index(folio) > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); @@ -2022,7 +2046,8 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { - mpd->first_page += folio_nr_pages(folio); + mpd->start_pos += folio_size(folio); + mpd->wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } @@ -2032,7 +2057,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) loff_t size; int err; - BUG_ON(folio->index != mpd->first_page); + WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos); folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path @@ -2053,8 +2078,6 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) !ext4_verity_in_progress(mpd->inode)) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); - if (!err) - mpd->wbc->nr_to_write -= folio_nr_pages(folio); return err; } @@ -2321,6 +2344,11 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) int get_blocks_flags; int err, dioread_nolock; + /* Make sure transaction has enough credits for this extent */ + err = ext4_journal_ensure_extent_credits(handle, inode); + if (err < 0) + return err; + trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or @@ -2360,6 +2388,47 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) } /* + * This is used to submit mapped buffers in a single folio that is not fully + * mapped for various reasons, such as insufficient space or journal credits. 
+ */ +static int mpage_submit_partial_folio(struct mpage_da_data *mpd) +{ + struct inode *inode = mpd->inode; + struct folio *folio; + loff_t pos; + int ret; + + folio = filemap_get_folio(inode->i_mapping, + mpd->start_pos >> PAGE_SHIFT); + if (IS_ERR(folio)) + return PTR_ERR(folio); + /* + * The mapped position should be within the current processing folio + * but must not be the folio start position. + */ + pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits; + if (WARN_ON_ONCE((folio_pos(folio) == pos) || + !folio_contains(folio, pos >> PAGE_SHIFT))) + return -EINVAL; + + ret = mpage_submit_folio(mpd, folio); + if (ret) + goto out; + /* + * Update start_pos to prevent this folio from being released in + * mpage_release_unused_pages(), it will be reset to the aligned folio + * pos when this folio is written again in the next round. Additionally, + * do not update wbc->nr_to_write here, as it will be updated once the + * entire folio has finished processing. + */ + mpd->start_pos = pos; +out: + folio_unlock(folio); + folio_put(folio); + return ret; +} + +/* * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length * mpd->len and submit pages underlying it for IO * @@ -2407,10 +2476,18 @@ static int mpage_map_and_submit_extent(handle_t *handle, * In the case of ENOSPC, if ext4_count_free_blocks() * is non-zero, a commit should free up blocks. */ - if ((err == -ENOMEM) || + if ((err == -ENOMEM) || (err == -EAGAIN) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { - if (progress) + /* + * We may have already allocated extents for + * some bhs inside the folio, issue the + * corresponding data to prevent stale data. + */ + if (progress) { + if (mpage_submit_partial_folio(mpd)) + goto invalidate_dirty_pages; goto update_disksize; + } return err; } ext4_msg(sb, KERN_CRIT, @@ -2444,7 +2521,7 @@ update_disksize: * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. 
*/ - disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; + disksize = mpd->start_pos; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; @@ -2468,21 +2545,6 @@ update_disksize: return err; } -/* - * Calculate the total number of credits to reserve for one writepages - * iteration. This is called from ext4_writepages(). We map an extent of - * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping - * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + - * bpp - 1 blocks in bpp different extents. - */ -static int ext4_da_writepages_trans_blocks(struct inode *inode) -{ - int bpp = ext4_journal_blocks_per_folio(inode); - - return ext4_meta_trans_blocks(inode, - MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); -} - static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, size_t len) { @@ -2547,8 +2609,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; - pgoff_t index = mpd->first_page; - pgoff_t end = mpd->last_page; + pgoff_t index = mpd->start_pos >> PAGE_SHIFT; + pgoff_t end = mpd->end_pos >> PAGE_SHIFT; xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; @@ -2563,7 +2625,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) tag = PAGECACHE_TAG_DIRTY; mpd->map.m_len = 0; - mpd->next_page = index; + mpd->next_pos = mpd->start_pos; if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); @@ -2594,7 +2656,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) goto out; /* If we can't merge this page, we are done. 
*/ - if (mpd->map.m_len > 0 && mpd->next_page != folio->index) + if (mpd->map.m_len > 0 && + mpd->next_pos != folio_pos(folio)) goto out; if (handle) { @@ -2640,8 +2703,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) } if (mpd->map.m_len == 0) - mpd->first_page = folio->index; - mpd->next_page = folio_next_index(folio); + mpd->start_pos = folio_pos(folio); + mpd->next_pos = folio_pos(folio) + folio_size(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we @@ -2769,12 +2832,12 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { + int bpf = ext4_journal_blocks_per_folio(inode); /* * We may need to convert up to one extent per block in - * the page and we may dirty the inode. + * the folio and we may dirty the inode. */ - rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, - PAGE_SIZE >> inode->i_blkbits); + rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) @@ -2784,18 +2847,18 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; - mpd->first_page = writeback_index; - mpd->last_page = -1; + mpd->start_pos = writeback_index << PAGE_SHIFT; + mpd->end_pos = LLONG_MAX; } else { - mpd->first_page = wbc->range_start >> PAGE_SHIFT; - mpd->last_page = wbc->range_end >> PAGE_SHIFT; + mpd->start_pos = wbc->range_start; + mpd->end_pos = wbc->range_end; } ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, mpd->first_page, - mpd->last_page); + tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT, + mpd->end_pos >> PAGE_SHIFT); blk_start_plug(&plug); /* @@ -2838,8 +2901,14 @@ retry: * not supported by delalloc. 
*/ BUG_ON(ext4_should_journal_data(inode)); - needed_blocks = ext4_da_writepages_trans_blocks(inode); - + /* + * Calculate the number of credits needed to reserve for one + * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will + * attempt to extend the transaction or start a new iteration + * if the reserved credits are insufficient. + */ + needed_blocks = ext4_chunk_trans_blocks(inode, + MAX_WRITEPAGES_EXTENT_LEN); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); @@ -2855,7 +2924,8 @@ retry: } mpd->do_map = 1; - trace_ext4_da_write_pages(inode, mpd->first_page, wbc); + trace_ext4_da_write_folios_start(inode, mpd->start_pos, + mpd->next_pos, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, @@ -2893,6 +2963,8 @@ retry: } else ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; + trace_ext4_da_write_folios_end(inode, mpd->start_pos, + mpd->next_pos, wbc, ret); if (ret == -ENOSPC && sbi->s_journal) { /* @@ -2904,6 +2976,8 @@ retry: ret = 0; continue; } + if (ret == -EAGAIN) + ret = 0; /* Fatal error - ENOMEM, EIO... 
*/ if (ret) break; @@ -2912,8 +2986,8 @@ unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; - mpd->last_page = writeback_index - 1; - mpd->first_page = 0; + mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1; + mpd->start_pos = 0; goto retry; } @@ -2923,7 +2997,7 @@ unplug: * Set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = mpd->first_page; + mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, @@ -4384,7 +4458,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 2); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); @@ -4533,7 +4607,7 @@ int ext4_truncate(struct inode *inode) } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 1); else credits = ext4_blocks_for_truncate(inode); @@ -5101,7 +5175,7 @@ error: return -EFSCORRUPTED; } -bool ext4_should_enable_large_folio(struct inode *inode) +static bool ext4_should_enable_large_folio(struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -5118,6 +5192,22 @@ bool ext4_should_enable_large_folio(struct inode *inode) return true; } +/* + * Limit the maximum folio order to 2048 blocks to prevent overestimation + * of reserve handle credits during the folio writeback in environments + * where the PAGE_SIZE exceeds 4KB. 
+ */ +#define EXT4_MAX_PAGECACHE_ORDER(i) \ + umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT)) +void ext4_set_inode_mapping_order(struct inode *inode) +{ + if (!ext4_should_enable_large_folio(inode)) + return; + + mapping_set_folio_order_range(inode->i_mapping, 0, + EXT4_MAX_PAGECACHE_ORDER(inode)); +} + struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) @@ -5435,8 +5525,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } - if (ext4_should_enable_large_folio(inode)) - mapping_set_large_folios(inode->i_mapping); + + ext4_set_inode_mapping_order(inode); ret = check_igot_inode(inode, flags, function, line); /* @@ -6134,7 +6224,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) int ret; /* - * How many index and lead blocks need to touch to map @lblocks + * How many index and leaf blocks need to touch to map @lblocks * logical blocks to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); @@ -6143,7 +6233,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) * Now let's see how many group bitmaps and group descriptors need * to account */ - groups = idxblocks; + groups = idxblocks + pextents; gdpblocks = groups; if (groups > ngroups) groups = ngroups; @@ -6160,25 +6250,19 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) } /* - * Calculate the total number of credits to reserve to fit - * the modification of a single pages into a single transaction, - * which may include multiple chunks of block allocations. - * - * This could be called via ext4_write_begin() - * - * We need to consider the worse case, when - * one new block per extent. + * Calculate the journal credits for modifying the number of blocks + * in a single extent within one transaction. 
'nrblocks' is used only + * for non-extent inodes. For extent type inodes, 'nrblocks' can be + * zero if the exact number of blocks is unknown. */ -int ext4_writepage_trans_blocks(struct inode *inode) +int ext4_chunk_trans_extent(struct inode *inode, int nrblocks) { - int bpp = ext4_journal_blocks_per_folio(inode); int ret; - ret = ext4_meta_trans_blocks(inode, bpp, bpp); - + ret = ext4_meta_trans_blocks(inode, nrblocks, 1); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) - ret += bpp; + ret += nrblocks; return ret; } @@ -6550,6 +6634,55 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, return !buffer_mapped(bh); } +static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio, + get_block_t get_block) +{ + handle_t *handle; + loff_t size; + unsigned long len; + int credits; + int ret; + + credits = ext4_chunk_trans_extent(inode, + ext4_journal_blocks_per_folio(inode)); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + folio_lock(folio); + size = i_size_read(inode); + /* Page got truncated from under us? 
*/ + if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) { + ret = -EFAULT; + goto out_error; + } + + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); + + ret = ext4_block_write_begin(handle, folio, 0, len, get_block); + if (ret) + goto out_error; + + if (!ext4_should_journal_data(inode)) { + block_commit_write(folio, 0, len); + folio_mark_dirty(folio); + } else { + ret = ext4_journal_folio_buffers(handle, folio, len); + if (ret) + goto out_error; + } + ext4_journal_stop(handle); + folio_wait_stable(folio); + return ret; + +out_error: + folio_unlock(folio); + ext4_journal_stop(handle); + return ret; +} + vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -6561,8 +6694,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; - handle_t *handle; - get_block_t *get_block; + get_block_t *get_block = ext4_get_block; int retries = 0; if (unlikely(IS_IMMUTABLE(inode))) @@ -6630,47 +6762,11 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; - else - get_block = ext4_get_block; retry_alloc: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = VM_FAULT_SIGBUS; - goto out; - } - /* - * Data journalling can't use block_page_mkwrite() because it - * will set_buffer_dirty() before do_journal_get_write_access() - * thus might hit warning messages for dirty metadata buffers. - */ - if (!ext4_should_journal_data(inode)) { - err = block_page_mkwrite(vma, vmf, get_block); - } else { - folio_lock(folio); - size = i_size_read(inode); - /* Page got truncated from under us? 
*/ - if (folio->mapping != mapping || folio_pos(folio) > size) { - ret = VM_FAULT_NOPAGE; - goto out_error; - } - - len = folio_size(folio); - if (folio_pos(folio) + len > size) - len = size - folio_pos(folio); - - err = ext4_block_write_begin(handle, folio, 0, len, - ext4_get_block); - if (!err) { - ret = VM_FAULT_SIGBUS; - if (ext4_journal_folio_buffers(handle, folio, len)) - goto out_error; - } else { - folio_unlock(folio); - } - } - ext4_journal_stop(handle); - if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + /* Start journal and allocate blocks */ + err = ext4_block_page_mkwrite(inode, folio, get_block); + if (err == -EAGAIN || + (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_alloc; out_ret: ret = vmf_fs_error(err); @@ -6678,8 +6774,4 @@ out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; -out_error: - folio_unlock(folio); - ext4_journal_stop(handle); - goto out; } diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index d634c12f1984..a9416b20ff64 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -155,6 +155,7 @@ static struct super_block *mbt_ext4_alloc_super_block(void) bgl_lock_init(sbi->s_blockgroup_lock); sbi->s_es = &fsb->es; + sbi->s_sb = sb; sb->s_fs_info = sbi; up_write(&sb->s_umount); @@ -802,6 +803,8 @@ static void test_mb_mark_used(struct kunit *test) KUNIT_ASSERT_EQ(test, ret, 0); grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb); + grp->bb_largest_free_order = -1; + grp->bb_avg_fragment_size_order = -1; mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); for (i = 0; i < TEST_RANGE_COUNT; i++) test_mb_mark_used_range(test, &e4b, ranges[i].start, @@ -875,6 +878,8 @@ static void test_mb_free_blocks(struct kunit *test) ext4_unlock_group(sb, TEST_GOAL_GROUP); grp->bb_free = 0; + grp->bb_largest_free_order = -1; + grp->bb_avg_fragment_size_order = -1; memset(bitmap, 0xff, sb->s_blocksize); mbt_generate_test_ranges(sb, ranges, 
TEST_RANGE_COUNT); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1e98c5be4e0a..5898d92ba19f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -132,25 +132,30 @@ * If "mb_optimize_scan" mount option is set, we maintain in memory group info * structures in two data structures: * - * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) + * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders) * - * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) + * Locking: Writers use xa_lock, readers use rcu_read_lock. * - * This is an array of lists where the index in the array represents the + * This is an array of xarrays where the index in the array represents the * largest free order in the buddy bitmap of the participating group infos of - * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total - * number of buddy bitmap orders possible) number of lists. Group-infos are - * placed in appropriate lists. + * that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means total + * number of buddy bitmap orders possible) number of xarrays. Group-infos are + * placed in appropriate xarrays. * - * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) + * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size) * - * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) + * Locking: Writers use xa_lock, readers use rcu_read_lock. * - * This is an array of lists where in the i-th list there are groups with + * This is an array of xarrays where in the i-th xarray there are groups with * average fragment size >= 2^i and < 2^(i+1). The average fragment size * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. - * Note that we don't bother with a special list for completely empty groups - * so we only have MB_NUM_ORDERS(sb) lists. + * Note that we don't bother with a special xarray for completely empty + * groups so we only have MB_NUM_ORDERS(sb) xarrays. 
Group-infos are placed + * in appropriate xarrays. + * + * In xarray, the index is the block group number, the value is the block group + * information, and a non-empty value indicates the block group is present in + * the current xarray. * * When "mb_optimize_scan" mount option is set, mballoc consults the above data * structures to decide the order in which groups are to be traversed for @@ -420,8 +425,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); -static bool ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, enum criteria cr); +static int ext4_mb_scan_group(struct ext4_allocation_context *ac, + ext4_group_t group); static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, @@ -841,132 +846,161 @@ static void mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int new_order; + int new, old; - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0) + if (!test_opt2(sb, MB_OPTIMIZE_SCAN)) return; - new_order = mb_avg_fragment_size_order(sb, - grp->bb_free / grp->bb_fragments); - if (new_order == grp->bb_avg_fragment_size_order) + old = grp->bb_avg_fragment_size_order; + new = grp->bb_fragments == 0 ? -1 : + mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments); + if (new == old) return; - if (grp->bb_avg_fragment_size_order != -1) { - write_lock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); - list_del(&grp->bb_avg_fragment_size_node); - write_unlock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); + if (old >= 0) + xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group); + + grp->bb_avg_fragment_size_order = new; + if (new >= 0) { + /* + * Cannot use __GFP_NOFAIL because we hold the group lock. 
+ * Although allocation for insertion may fails, it's not fatal + * as we have linear traversal to fall back on. + */ + int err = xa_insert(&sbi->s_mb_avg_fragment_size[new], + grp->bb_group, grp, GFP_ATOMIC); + if (err) + mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d", + grp->bb_group, new, err); + } +} + +static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac, + struct xarray *xa, + ext4_group_t start, ext4_group_t end) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + enum criteria cr = ac->ac_criteria; + ext4_group_t ngroups = ext4_get_groups_count(sb); + unsigned long group = start; + struct ext4_group_info *grp; + + if (WARN_ON_ONCE(end > ngroups || start >= end)) + return 0; + + xa_for_each_range(xa, group, grp, start, end - 1) { + int err; + + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); + + err = ext4_mb_scan_group(ac, grp->bb_group); + if (err || ac->ac_status != AC_STATUS_CONTINUE) + return err; + + cond_resched(); } - grp->bb_avg_fragment_size_order = new_order; - write_lock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); - list_add_tail(&grp->bb_avg_fragment_size_node, - &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); - write_unlock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); + + return 0; +} + +/* + * Find a suitable group of given order from the largest free orders xarray. + */ +static inline int +ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac, + int order, ext4_group_t start, + ext4_group_t end) +{ + struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order]; + + if (xa_empty(xa)) + return 0; + + return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing largest_free_order lists. Updates *new_cr if * cr level needs an update. 
*/ -static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group) +static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac, + ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *iter; int i; + int ret = 0; + ext4_group_t start, end; - if (ac->ac_status == AC_STATUS_FOUND) - return; - - if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) - atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); - + start = group; + end = ext4_get_groups_count(ac->ac_sb); +wrap_around: for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { - if (list_empty(&sbi->s_mb_largest_free_orders[i])) - continue; - read_lock(&sbi->s_mb_largest_free_orders_locks[i]); - if (list_empty(&sbi->s_mb_largest_free_orders[i])) { - read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); - continue; - } - list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], - bb_largest_free_order_node) { - if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { - *group = iter->bb_group; - ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; - read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); - return; - } - } - read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + ret = ext4_mb_scan_groups_largest_free_order_range(ac, i, + start, end); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; + } + if (start) { + end = start; + start = 0; + goto wrap_around; } + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); + /* Increment cr and search again if no group is found */ - *new_cr = CR_GOAL_LEN_FAST; + ac->ac_criteria = CR_GOAL_LEN_FAST; + return ret; } /* - * Find a suitable group of given order from the average fragments list. + * Find a suitable group of given order from the average fragments xarray. 
*/ -static struct ext4_group_info * -ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order) +static int +ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac, + int order, ext4_group_t start, + ext4_group_t end) { - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order]; - rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order]; - struct ext4_group_info *grp = NULL, *iter; - enum criteria cr = ac->ac_criteria; + struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order]; - if (list_empty(frag_list)) - return NULL; - read_lock(frag_list_lock); - if (list_empty(frag_list)) { - read_unlock(frag_list_lock); - return NULL; - } - list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) { - if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) { - grp = iter; - break; - } - } - read_unlock(frag_list_lock); - return grp; + if (xa_empty(xa)) + return 0; + + return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update. 
*/ -static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group) +static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac, + ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *grp = NULL; - int i; - - if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) { - if (sbi->s_mb_stats) - atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions); + int i, ret = 0; + ext4_group_t start, end; + + start = group; + end = ext4_get_groups_count(ac->ac_sb); +wrap_around: + i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); + for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) { + ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i, + start, end); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; } - - for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); - i < MB_NUM_ORDERS(ac->ac_sb); i++) { - grp = ext4_mb_find_good_group_avg_frag_lists(ac, i); - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; - return; - } + if (start) { + end = start; + start = 0; + goto wrap_around; } + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); /* * CR_BEST_AVAIL_LEN works based on the concept that we have * a larger normalized goal len request which can be trimmed to @@ -976,9 +1010,11 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). */ if (ac->ac_flags & EXT4_MB_HINT_DATA) - *new_cr = CR_BEST_AVAIL_LEN; + ac->ac_criteria = CR_BEST_AVAIL_LEN; else - *new_cr = CR_GOAL_LEN_SLOW; + ac->ac_criteria = CR_GOAL_LEN_SLOW; + + return ret; } /* @@ -990,18 +1026,14 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * * preallocations. However, we make sure that we don't trim the request too * much and fall to CR_GOAL_LEN_SLOW in that case. 
*/ -static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group) +static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac, + ext4_group_t group) { + int ret = 0; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *grp = NULL; int i, order, min_order; unsigned long num_stripe_clusters = 0; - - if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) { - if (sbi->s_mb_stats) - atomic_inc(&sbi->s_bal_best_avail_bad_suggestions); - } + ext4_group_t start, end; /* * mb_avg_fragment_size_order() returns order in a way that makes @@ -1033,6 +1065,9 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context if (1 << min_order < ac->ac_o_ex.fe_len) min_order = fls(ac->ac_o_ex.fe_len); + start = group; + end = ext4_get_groups_count(ac->ac_sb); +wrap_around: for (i = order; i >= min_order; i--) { int frag_order; /* @@ -1055,17 +1090,24 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context frag_order = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); - grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; - return; - } + ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order, + start, end); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; + } + if (start) { + end = start; + start = 0; + goto wrap_around; } /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; - *new_cr = CR_GOAL_LEN_SLOW; + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); + ac->ac_criteria = CR_GOAL_LEN_SLOW; + + return ret; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) @@ -1080,59 +1122,82 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) } /* - * 
Return next linear group for allocation. + * next linear group for allocation. */ -static ext4_group_t -next_linear_group(ext4_group_t group, ext4_group_t ngroups) +static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups) { /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. */ - return group + 1 >= ngroups ? 0 : group + 1; + *group = *group + 1 >= ngroups ? 0 : *group + 1; } -/* - * ext4_mb_choose_next_group: choose next group for allocation. - * - * @ac Allocation Context - * @new_cr This is an output parameter. If the there is no good group - * available at current CR level, this field is updated to indicate - * the new cr level that should be used. - * @group This is an input / output parameter. As an input it indicates the - * next group that the allocator intends to use for allocation. As - * output, this field indicates the next group that should be used as - * determined by the optimization functions. - * @ngroups Total number of groups - */ -static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) +static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac, + ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count) { - *new_cr = ac->ac_criteria; + int ret, i; + enum criteria cr = ac->ac_criteria; + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t group = *start; - if (!should_optimize_scan(ac)) { - *group = next_linear_group(*group, ngroups); - return; + for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) { + ret = ext4_mb_scan_group(ac, group); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; + cond_resched(); } + *start = group; + if (count == ngroups) + ac->ac_criteria++; + + /* Processed all groups and haven't found blocks */ + if (sbi->s_mb_stats && i == ngroups) + atomic64_inc(&sbi->s_bal_cX_failed[cr]); + + 
return 0; +} + +static int ext4_mb_scan_groups(struct ext4_allocation_context *ac) +{ + int ret = 0; + ext4_group_t start; + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); + + /* non-extent files are limited to low blocks/groups */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) + ngroups = sbi->s_blockfile_groups; + + /* searching for the right group start from the goal value specified */ + start = ac->ac_g_ex.fe_group; + ac->ac_prefetch_grp = start; + ac->ac_prefetch_nr = 0; + + if (!should_optimize_scan(ac)) + return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups); + /* * Optimized scanning can return non adjacent groups which can cause * seek overhead for rotational disks. So try few linear groups before * trying optimized scan. */ - if (ac->ac_groups_linear_remaining) { - *group = next_linear_group(*group, ngroups); - ac->ac_groups_linear_remaining--; - return; - } + if (sbi->s_mb_max_linear_groups) + ret = ext4_mb_scan_groups_linear(ac, ngroups, &start, + sbi->s_mb_max_linear_groups); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; - if (*new_cr == CR_POWER2_ALIGNED) { - ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group); - } else if (*new_cr == CR_GOAL_LEN_FAST) { - ext4_mb_choose_next_group_goal_fast(ac, new_cr, group); - } else if (*new_cr == CR_BEST_AVAIL_LEN) { - ext4_mb_choose_next_group_best_avail(ac, new_cr, group); - } else { + switch (ac->ac_criteria) { + case CR_POWER2_ALIGNED: + return ext4_mb_scan_groups_p2_aligned(ac, start); + case CR_GOAL_LEN_FAST: + return ext4_mb_scan_groups_goal_fast(ac, start); + case CR_BEST_AVAIL_LEN: + return ext4_mb_scan_groups_best_avail(ac, start); + default: /* * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an * rb tree sorted by bb_free. 
But until that happens, we should @@ -1140,6 +1205,8 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, */ WARN_ON(1); } + + return 0; } /* @@ -1150,33 +1217,35 @@ static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int i; + int new, old = grp->bb_largest_free_order; - for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) - if (grp->bb_counters[i] > 0) + for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--) + if (grp->bb_counters[new] > 0) break; + /* No need to move between order lists? */ - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || - i == grp->bb_largest_free_order) { - grp->bb_largest_free_order = i; + if (new == old) return; - } - if (grp->bb_largest_free_order >= 0) { - write_lock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); - list_del_init(&grp->bb_largest_free_order_node); - write_unlock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); + if (old >= 0) { + struct xarray *xa = &sbi->s_mb_largest_free_orders[old]; + + if (!xa_empty(xa) && xa_load(xa, grp->bb_group)) + xa_erase(xa, grp->bb_group); } - grp->bb_largest_free_order = i; - if (grp->bb_largest_free_order >= 0 && grp->bb_free) { - write_lock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); - list_add_tail(&grp->bb_largest_free_order_node, - &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); - write_unlock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); + + grp->bb_largest_free_order = new; + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) { + /* + * Cannot use __GFP_NOFAIL because we hold the group lock. + * Although allocation for insertion may fails, it's not fatal + * as we have linear traversal to fall back on. 
+ */ + int err = xa_insert(&sbi->s_mb_largest_free_orders[new], + grp->bb_group, grp, GFP_ATOMIC); + if (err) + mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d", + grp->bb_group, new, err); } } @@ -2167,11 +2236,11 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { - spin_lock(&sbi->s_md_lock); - sbi->s_mb_last_group = ac->ac_f_ex.fe_group; - sbi->s_mb_last_start = ac->ac_f_ex.fe_start; - spin_unlock(&sbi->s_md_lock); + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; + + WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group); } + /* * As we've just preallocated more space than * user requested originally, we store allocated @@ -2571,6 +2640,30 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } } +static void __ext4_mb_scan_group(struct ext4_allocation_context *ac) +{ + bool is_stripe_aligned; + struct ext4_sb_info *sbi; + enum criteria cr = ac->ac_criteria; + + ac->ac_groups_scanned++; + if (cr == CR_POWER2_ALIGNED) + return ext4_mb_simple_scan_group(ac, ac->ac_e4b); + + sbi = EXT4_SB(ac->ac_sb); + is_stripe_aligned = false; + if ((sbi->s_stripe >= sbi->s_cluster_ratio) && + !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe))) + is_stripe_aligned = true; + + if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) && + is_stripe_aligned) + ext4_mb_scan_aligned(ac, ac->ac_e4b); + + if (ac->ac_status == AC_STATUS_CONTINUE) + ext4_mb_complex_scan_group(ac, ac->ac_e4b); +} + /* * This is also called BEFORE we load the buddy bitmap. 
* Returns either 1 or 0 indicating that the group is either suitable @@ -2761,6 +2854,37 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, } /* + * Batch reads of the block allocation bitmaps to get + * multiple READs in flight; limit prefetching at inexpensive + * CR, otherwise mballoc can spend a lot of time loading + * imperfect groups + */ +static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac, + ext4_group_t group) +{ + struct ext4_sb_info *sbi; + + if (ac->ac_prefetch_grp != group) + return; + + sbi = EXT4_SB(ac->ac_sb); + if (ext4_mb_cr_expensive(ac->ac_criteria) || + ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) { + unsigned int nr = sbi->s_mb_prefetch; + + if (ext4_has_feature_flex_bg(ac->ac_sb)) { + nr = 1 << sbi->s_log_groups_per_flex; + nr -= group & (nr - 1); + nr = umin(nr, sbi->s_mb_prefetch); + } + + ac->ac_prefetch_nr = nr; + ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr, + &ac->ac_prefetch_ios); + } +} + +/* * Prefetching reads the block bitmap into the buffer cache; but we * need to make sure that the buddy bitmap in the page cache has been * initialized. Note that ext4_mb_init_group() will block if the I/O @@ -2793,24 +2917,58 @@ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, } } +static int ext4_mb_scan_group(struct ext4_allocation_context *ac, + ext4_group_t group) +{ + int ret; + struct super_block *sb = ac->ac_sb; + enum criteria cr = ac->ac_criteria; + + ext4_mb_might_prefetch(ac, group); + + /* prevent unnecessary buddy loading. 
*/ + if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group))) + return 0; + + /* This now checks without needing the buddy page */ + ret = ext4_mb_good_group_nolock(ac, group, cr); + if (ret <= 0) { + if (!ac->ac_first_err) + ac->ac_first_err = ret; + return 0; + } + + ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b); + if (ret) + return ret; + + /* skip busy group */ + if (cr >= CR_ANY_FREE) + ext4_lock_group(sb, group); + else if (!ext4_try_lock_group(sb, group)) + goto out_unload; + + /* We need to check again after locking the block group. */ + if (unlikely(!ext4_mb_good_group(ac, group, cr))) + goto out_unlock; + + __ext4_mb_scan_group(ac); + +out_unlock: + ext4_unlock_group(sb, group); +out_unload: + ext4_mb_unload_buddy(ac->ac_e4b); + return ret; +} + static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { - ext4_group_t prefetch_grp = 0, ngroups, group, i; - enum criteria new_cr, cr = CR_GOAL_LEN_FAST; - int err = 0, first_err = 0; - unsigned int nr = 0, prefetch_ios = 0; - struct ext4_sb_info *sbi; - struct super_block *sb; + ext4_group_t i; + int err = 0; + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; - int lost; - - sb = ac->ac_sb; - sbi = EXT4_SB(sb); - ngroups = ext4_get_groups_count(sb); - /* non-extent files are limited to low blocks/groups */ - if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) - ngroups = sbi->s_blockfile_groups; BUG_ON(ac->ac_status == AC_STATUS_FOUND); @@ -2844,11 +3002,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { - /* TBD: may be hot point */ - spin_lock(&sbi->s_md_lock); - ac->ac_g_ex.fe_group = sbi->s_mb_last_group; - ac->ac_g_ex.fe_start = sbi->s_mb_last_start; - spin_unlock(&sbi->s_md_lock); + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; + + ac->ac_g_ex.fe_group = 
READ_ONCE(sbi->s_mb_last_groups[hash]); + ac->ac_g_ex.fe_start = -1; + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL; } /* @@ -2856,107 +3014,21 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * start with CR_GOAL_LEN_FAST, unless it is power of 2 * aligned, in which case let's do that faster approach first. */ + ac->ac_criteria = CR_GOAL_LEN_FAST; if (ac->ac_2order) - cr = CR_POWER2_ALIGNED; -repeat: - for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { - ac->ac_criteria = cr; - /* - * searching for the right group start - * from the goal value specified - */ - group = ac->ac_g_ex.fe_group; - ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; - prefetch_grp = group; - nr = 0; - - for (i = 0, new_cr = cr; i < ngroups; i++, - ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { - int ret = 0; - - cond_resched(); - if (new_cr != cr) { - cr = new_cr; - goto repeat; - } - - /* - * Batch reads of the block allocation bitmaps - * to get multiple READs in flight; limit - * prefetching at inexpensive CR, otherwise mballoc - * can spend a lot of time loading imperfect groups - */ - if ((prefetch_grp == group) && - (ext4_mb_cr_expensive(cr) || - prefetch_ios < sbi->s_mb_prefetch_limit)) { - nr = sbi->s_mb_prefetch; - if (ext4_has_feature_flex_bg(sb)) { - nr = 1 << sbi->s_log_groups_per_flex; - nr -= group & (nr - 1); - nr = min(nr, sbi->s_mb_prefetch); - } - prefetch_grp = ext4_mb_prefetch(sb, group, - nr, &prefetch_ios); - } - - /* This now checks without needing the buddy page */ - ret = ext4_mb_good_group_nolock(ac, group, cr); - if (ret <= 0) { - if (!first_err) - first_err = ret; - continue; - } - - err = ext4_mb_load_buddy(sb, group, &e4b); - if (err) - goto out; - - ext4_lock_group(sb, group); - - /* - * We need to check again after locking the - * block group - */ - ret = ext4_mb_good_group(ac, group, cr); - if (ret == 0) { - ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); - continue; - } + 
ac->ac_criteria = CR_POWER2_ALIGNED; - ac->ac_groups_scanned++; - if (cr == CR_POWER2_ALIGNED) - ext4_mb_simple_scan_group(ac, &e4b); - else { - bool is_stripe_aligned = - (sbi->s_stripe >= - sbi->s_cluster_ratio) && - !(ac->ac_g_ex.fe_len % - EXT4_NUM_B2C(sbi, sbi->s_stripe)); - - if ((cr == CR_GOAL_LEN_FAST || - cr == CR_BEST_AVAIL_LEN) && - is_stripe_aligned) - ext4_mb_scan_aligned(ac, &e4b); - - if (ac->ac_status == AC_STATUS_CONTINUE) - ext4_mb_complex_scan_group(ac, &e4b); - } - - ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); - - if (ac->ac_status != AC_STATUS_CONTINUE) - break; - } - /* Processed all groups and haven't found blocks */ - if (sbi->s_mb_stats && i == ngroups) - atomic64_inc(&sbi->s_bal_cX_failed[cr]); + ac->ac_e4b = &e4b; + ac->ac_prefetch_ios = 0; + ac->ac_first_err = 0; +repeat: + while (ac->ac_criteria < EXT4_MB_NUM_CRS) { + err = ext4_mb_scan_groups(ac); + if (err) + goto out; - if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN) - /* Reset goal length to original goal length before - * falling into CR_GOAL_LEN_SLOW */ - ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; + if (ac->ac_status != AC_STATUS_CONTINUE) + break; } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && @@ -2967,6 +3039,8 @@ repeat: */ ext4_mb_try_best_found(ac, &e4b); if (ac->ac_status != AC_STATUS_FOUND) { + int lost; + /* * Someone more lucky has already allocated it. 
* The only thing we can do is just take first @@ -2982,23 +3056,27 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; - cr = CR_ANY_FREE; + ac->ac_criteria = CR_ANY_FREE; goto repeat; } } - if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) { atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC && + ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group) + atomic_inc(&sbi->s_bal_stream_goals); + } out: - if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) - err = first_err; + if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err) + err = ac->ac_first_err; mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, - ac->ac_flags, cr, err); + ac->ac_flags, ac->ac_criteria, err); - if (nr) - ext4_mb_prefetch_fini(sb, prefetch_grp, nr); + if (ac->ac_prefetch_nr) + ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr); return err; } @@ -3121,8 +3199,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); - seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions)); /* CR_GOAL_LEN_FAST stats */ seq_puts(seq, "\tcr_goal_fast_stats:\n"); @@ -3135,8 +3211,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); - seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_goal_fast_bad_suggestions)); /* CR_BEST_AVAIL_LEN stats */ seq_puts(seq, "\tcr_best_avail_stats:\n"); @@ -3150,8 +3224,6 @@ int 
ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); - seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_best_avail_bad_suggestions)); /* CR_GOAL_LEN_SLOW stats */ seq_puts(seq, "\tcr_goal_slow_stats:\n"); @@ -3181,6 +3253,8 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); + seq_printf(seq, "\t\tstream_goal_hits: %u\n", + atomic_read(&sbi->s_bal_stream_goals)); seq_printf(seq, "\t\tlen_goal_hits: %u\n", atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); @@ -3227,6 +3301,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; unsigned int count; + unsigned long idx; position--; if (position >= MB_NUM_ORDERS(sb)) { @@ -3235,11 +3310,8 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) seq_puts(seq, "avg_fragment_size_lists:\n"); count = 0; - read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); - list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], - bb_avg_fragment_size_node) + xa_for_each(&sbi->s_mb_avg_fragment_size[position], idx, grp) count++; - read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); return 0; @@ -3251,11 +3323,8 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) seq_puts(seq, "max_free_order_lists:\n"); } count = 0; - read_lock(&sbi->s_mb_largest_free_orders_locks[position]); - list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], - bb_largest_free_order_node) + 
xa_for_each(&sbi->s_mb_largest_free_orders[position], idx, grp) count++; - read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); @@ -3375,8 +3444,6 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; - INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); - INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ meta_group_info[i]->bb_group = group; @@ -3586,6 +3653,20 @@ static void ext4_discard_work(struct work_struct *work) ext4_mb_unload_buddy(&e4b); } +static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi) +{ + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) + xa_destroy(&sbi->s_mb_avg_fragment_size[i]); + kfree(sbi->s_mb_avg_fragment_size); +} + +static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi) +{ + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) + xa_destroy(&sbi->s_mb_largest_free_orders[i]); + kfree(sbi->s_mb_largest_free_orders); +} + int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -3631,44 +3712,27 @@ int ext4_mb_init(struct super_block *sb) } while (i < MB_NUM_ORDERS(sb)); sbi->s_mb_avg_fragment_size = - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), GFP_KERNEL); if (!sbi->s_mb_avg_fragment_size) { ret = -ENOMEM; goto out; } - sbi->s_mb_avg_fragment_size_locks = - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), - GFP_KERNEL); - if (!sbi->s_mb_avg_fragment_size_locks) { - ret = -ENOMEM; - goto out; - } - for (i = 0; i < MB_NUM_ORDERS(sb); i++) { - INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); 
- rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); - } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + xa_init(&sbi->s_mb_avg_fragment_size[i]); + sbi->s_mb_largest_free_orders = - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), GFP_KERNEL); if (!sbi->s_mb_largest_free_orders) { ret = -ENOMEM; goto out; } - sbi->s_mb_largest_free_orders_locks = - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), - GFP_KERNEL); - if (!sbi->s_mb_largest_free_orders_locks) { - ret = -ENOMEM; - goto out; - } - for (i = 0; i < MB_NUM_ORDERS(sb); i++) { - INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); - rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); - } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + xa_init(&sbi->s_mb_largest_free_orders[i]); spin_lock_init(&sbi->s_md_lock); - sbi->s_mb_free_pending = 0; + atomic_set(&sbi->s_mb_free_pending, 0); INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); INIT_LIST_HEAD(&sbi->s_discard_list); @@ -3709,10 +3773,19 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe)); } + sbi->s_mb_nr_global_goals = umin(num_possible_cpus(), + DIV_ROUND_UP(sbi->s_groups_count, 4)); + sbi->s_mb_last_groups = kcalloc(sbi->s_mb_nr_global_goals, + sizeof(ext4_group_t), GFP_KERNEL); + if (sbi->s_mb_last_groups == NULL) { + ret = -ENOMEM; + goto out; + } + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; - goto out; + goto out_free_last_groups; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; @@ -3737,11 +3810,12 @@ int ext4_mb_init(struct super_block *sb) out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; +out_free_last_groups: + kfree(sbi->s_mb_last_groups); + sbi->s_mb_last_groups = NULL; out: - kfree(sbi->s_mb_avg_fragment_size); - kfree(sbi->s_mb_avg_fragment_size_locks); - 
kfree(sbi->s_mb_largest_free_orders); - kfree(sbi->s_mb_largest_free_orders_locks); + ext4_mb_avg_fragment_size_destroy(sbi); + ext4_mb_largest_free_orders_destroy(sbi); kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); @@ -3808,10 +3882,8 @@ void ext4_mb_release(struct super_block *sb) kvfree(group_info); rcu_read_unlock(); } - kfree(sbi->s_mb_avg_fragment_size); - kfree(sbi->s_mb_avg_fragment_size_locks); - kfree(sbi->s_mb_largest_free_orders); - kfree(sbi->s_mb_largest_free_orders_locks); + ext4_mb_avg_fragment_size_destroy(sbi); + ext4_mb_largest_free_orders_destroy(sbi); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); iput(sbi->s_buddy_cache); @@ -3841,6 +3913,7 @@ void ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); + kfree(sbi->s_mb_last_groups); } static inline int ext4_issue_discard(struct super_block *sb, @@ -3871,10 +3944,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb, /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); - spin_lock(&EXT4_SB(sb)->s_md_lock); - EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count; - spin_unlock(&EXT4_SB(sb)->s_md_lock); - + atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending); db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ count += entry->efd_count; @@ -6278,28 +6348,63 @@ out: * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. 
*/ -static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, - struct ext4_free_data *entry, - struct ext4_free_data *new_entry, - struct rb_root *entry_rb_root) +static inline bool +ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) { - if ((entry->efd_tid != new_entry->efd_tid) || - (entry->efd_group != new_entry->efd_group)) - return; - if (entry->efd_start_cluster + entry->efd_count == - new_entry->efd_start_cluster) { - new_entry->efd_start_cluster = entry->efd_start_cluster; - new_entry->efd_count += entry->efd_count; - } else if (new_entry->efd_start_cluster + new_entry->efd_count == - entry->efd_start_cluster) { - new_entry->efd_count += entry->efd_count; - } else - return; + if (entry1->efd_tid != entry2->efd_tid) + return false; + if (entry1->efd_start_cluster + entry1->efd_count != + entry2->efd_start_cluster) + return false; + if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group)) + return false; + return true; +} + +static inline void +ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root, + struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + entry1->efd_count += entry2->efd_count; spin_lock(&sbi->s_md_lock); - list_del(&entry->efd_list); + list_del(&entry2->efd_list); spin_unlock(&sbi->s_md_lock); - rb_erase(&entry->efd_node, entry_rb_root); - kmem_cache_free(ext4_free_data_cachep, entry); + rb_erase(&entry2->efd_node, root); + kmem_cache_free(ext4_free_data_cachep, entry2); +} + +static inline void +ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root, + struct ext4_free_data *entry) +{ + struct ext4_free_data *prev; + struct rb_node *node; + + node = rb_prev(&entry->efd_node); + if (!node) + return; + + prev = rb_entry(node, struct ext4_free_data, efd_node); + if (ext4_freed_extents_can_be_merged(prev, entry)) + ext4_merge_freed_extents(sbi, root, prev, entry); +} + +static inline void +ext4_try_merge_freed_extent_next(struct 
ext4_sb_info *sbi, struct rb_root *root, + struct ext4_free_data *entry) +{ + struct ext4_free_data *next; + struct rb_node *node; + + node = rb_next(&entry->efd_node); + if (!node) + return; + + next = rb_entry(node, struct ext4_free_data, efd_node); + if (ext4_freed_extents_can_be_merged(entry, next)) + ext4_merge_freed_extents(sbi, root, entry, next); } static noinline_for_stack void @@ -6309,11 +6414,12 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, ext4_group_t group = e4b->bd_group; ext4_grpblk_t cluster; ext4_grpblk_t clusters = new_entry->efd_count; - struct ext4_free_data *entry; + struct ext4_free_data *entry = NULL; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_root *root = &db->bb_free_root; + struct rb_node **n = &root->rb_node; struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); @@ -6349,27 +6455,30 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } } - rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &db->bb_free_root); + atomic_add(clusters, &sbi->s_mb_free_pending); + if (!entry) + goto insert; - /* Now try to see the extent can be merged to left and right */ - node = rb_prev(new_node); - if (node) { - entry = rb_entry(node, struct ext4_free_data, efd_node); - ext4_try_merge_freed_extent(sbi, entry, new_entry, - &(db->bb_free_root)); + /* Now try to see the extent can be merged to prev and next */ + if (ext4_freed_extents_can_be_merged(new_entry, entry)) { + entry->efd_start_cluster = cluster; + entry->efd_count += new_entry->efd_count; + kmem_cache_free(ext4_free_data_cachep, new_entry); + ext4_try_merge_freed_extent_prev(sbi, root, entry); + return; } - - node = rb_next(new_node); - if (node) { - entry = rb_entry(node, struct ext4_free_data, efd_node); - ext4_try_merge_freed_extent(sbi, entry, new_entry, - &(db->bb_free_root)); + 
if (ext4_freed_extents_can_be_merged(entry, new_entry)) { + entry->efd_count += new_entry->efd_count; + kmem_cache_free(ext4_free_data_cachep, new_entry); + ext4_try_merge_freed_extent_next(sbi, root, entry); + return; } +insert: + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, root); spin_lock(&sbi->s_md_lock); list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); - sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index f8280de3e882..15a049f05d04 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -192,8 +192,13 @@ struct ext4_allocation_context { */ ext4_grpblk_t ac_orig_goal_len; + ext4_group_t ac_prefetch_grp; + unsigned int ac_prefetch_ios; + unsigned int ac_prefetch_nr; + + int ac_first_err; + __u32 ac_flags; /* allocation hints */ - __u32 ac_groups_linear_remaining; __u16 ac_groups_scanned; __u16 ac_found; __u16 ac_cX_found[EXT4_MB_NUM_CRS]; @@ -204,6 +209,8 @@ struct ext4_allocation_context { __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ + + struct ext4_buddy *ac_e4b; struct folio *ac_bitmap_folio; struct folio *ac_buddy_folio; struct ext4_prealloc_space *ac_pa; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 1f8493a56e8f..adae3caf175a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -280,7 +280,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, */ again: *err = 0; - jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; + jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page, + block_len_in_page) * 2; handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); if (IS_ERR(handle)) { *err = PTR_ERR(handle); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a178ac229489..d83f91b62317 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2915,33 +2915,50 
@@ err_unlock_inode: return err; } -struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, - struct ext4_dir_entry_2 *de, - int blocksize, int csum_size, - unsigned int parent_ino, int dotdot_real_len) +int ext4_init_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *bh, unsigned int parent_ino, + void *inline_buf, int inline_size) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) bh->b_data; + size_t blocksize = bh->b_size; + int csum_size = 0, header_size; + + if (ext4_has_feature_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), blocksize); - strcpy(de->name, "."); + memcpy(de->name, ".", 2); ext4_set_de_type(inode->i_sb, de, S_IFDIR); de = ext4_next_entry(de, blocksize); de->inode = cpu_to_le32(parent_ino); de->name_len = 2; - if (!dotdot_real_len) - de->rec_len = ext4_rec_len_to_disk(blocksize - - (csum_size + ext4_dir_rec_len(1, NULL)), - blocksize); - else + memcpy(de->name, "..", 3); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + if (inline_buf) { de->rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(de->name_len, NULL), blocksize); - strcpy(de->name, ".."); - ext4_set_de_type(inode->i_sb, de, S_IFDIR); + de = ext4_next_entry(de, blocksize); + header_size = (char *)de - bh->b_data; + memcpy((void *)de, inline_buf, inline_size); + ext4_update_final_de(bh->b_data, inline_size + header_size, + blocksize - csum_size); + } else { + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + ext4_dir_rec_len(1, NULL)), + blocksize); + } - return ext4_next_entry(de, blocksize); + if (csum_size) + ext4_initialize_dirent_tail(bh, blocksize); + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + return ext4_handle_dirty_dirblock(handle, inode, bh); } int ext4_init_new_dir(handle_t *handle, struct inode 
*dir, @@ -2950,13 +2967,8 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; ext4_lblk_t block = 0; - unsigned int blocksize = dir->i_sb->s_blocksize; - int csum_size = 0; int err; - if (ext4_has_feature_metadata_csum(dir->i_sb)) - csum_size = sizeof(struct ext4_dir_entry_tail); - if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { err = ext4_try_create_inline_dir(handle, dir, inode); if (err < 0 && err != -ENOSPC) @@ -2965,21 +2977,15 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, goto out; } + set_nlink(inode, 2); inode->i_size = 0; dir_block = ext4_append(handle, inode, &block); if (IS_ERR(dir_block)) return PTR_ERR(dir_block); de = (struct ext4_dir_entry_2 *)dir_block->b_data; - ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); - set_nlink(inode, 2); - if (csum_size) - ext4_initialize_dirent_tail(dir_block, blocksize); - - BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirblock(handle, inode, dir_block); + err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0); if (err) goto out; - set_buffer_verified(dir_block); out: brelse(dir_block); return err; @@ -3082,7 +3088,8 @@ bool ext4_empty_dir(struct inode *inode) de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || - le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { + le32_to_cpu(de->inode) != inode->i_ino || de->name_len != 1 || + de->name[0] != '.') { ext4_warning_inode(inode, "directory missing '.'"); brelse(bh); return false; @@ -3091,7 +3098,8 @@ bool ext4_empty_dir(struct inode *inode) de = ext4_next_entry(de, sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || - le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { + le32_to_cpu(de->inode) == 0 || de->name_len != 2 || + de->name[0] != '.' 
|| de->name[1] != '.') { ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); return false; @@ -3532,7 +3540,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || le32_to_cpu(de->inode) != inode->i_ino || - strcmp(".", de->name)) { + de->name_len != 1 || de->name[0] != '.') { EXT4_ERROR_INODE(inode, "directory missing '.'"); brelse(bh); *retval = -EFSCORRUPTED; @@ -3543,7 +3551,8 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, de = ext4_next_entry(de, inode->i_sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || - le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { + le32_to_cpu(de->inode) == 0 || de->name_len != 2 || + de->name[0] != '.' || de->name[1] != '.') { EXT4_ERROR_INODE(inode, "directory missing '..'"); brelse(bh); *retval = -EFSCORRUPTED; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 179e54f3a3b6..3d8b0f6d2dea 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -236,10 +236,12 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end) { - if (io_end->flag & EXT4_IO_END_UNWRITTEN) + if (io_end->flag & EXT4_IO_END_UNWRITTEN && + !list_empty(&io_end->list_vec)) return true; if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) && - io_end->flag & EXT4_IO_END_FAILED) + io_end->flag & EXT4_IO_END_FAILED && + !ext4_emergency_state(io_end->inode->i_sb)) return true; return false; } @@ -256,6 +258,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN && !io_end->handle && sbi->s_journal); + WARN_ON(!io_end->bio); spin_lock_irqsave(&ei->i_completed_io_lock, flags); wq = sbi->rsv_conversion_wq; @@ -318,12 +321,9 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 
void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (refcount_dec_and_test(&io_end->count)) { - if (io_end->flag & EXT4_IO_END_FAILED || - (io_end->flag & EXT4_IO_END_UNWRITTEN && - !list_empty(&io_end->list_vec))) { - ext4_add_complete_io(io_end); - return; - } + if (ext4_io_end_defer_completion(io_end)) + return ext4_add_complete_io(io_end); + ext4_release_io_end(io_end); } } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8d15acbacc20..5a6fe1513fd2 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -338,7 +338,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, cmp = name_len - entry->e_name_len; if (!cmp) cmp = memcmp(name, entry->e_name, name_len); - if (cmp <= 0 && (sorted || cmp == 0)) + if (!cmp || (cmp < 0 && sorted)) break; } *pentry = entry; @@ -962,7 +962,7 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, * so we need to reserve credits for this eventuality */ if (inode && ext4_has_inline_data(inode)) - credits += ext4_writepage_trans_blocks(inode) + 1; + credits += ext4_chunk_trans_extent(inode, 1) + 1; /* We are done if ea_inode feature is not enabled. */ if (!ext4_has_feature_ea_inode(sb)) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f149ec28aefd..db3831f7f2f5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -82,7 +82,7 @@ repeat: if (folio_test_uptodate(folio)) goto out; - fio.page = &folio->page; + fio.folio = folio; err = f2fs_submit_page_bio(&fio); if (err) { @@ -309,7 +309,7 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, continue; } - fio.page = &folio->page; + fio.folio = folio; err = f2fs_submit_page_bio(&fio); f2fs_folio_put(folio, err ? 
true : false); @@ -485,7 +485,7 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping, folio_mark_uptodate(folio); if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META); - set_page_private_reference(&folio->page); + folio_set_f2fs_reference(folio); return true; } return false; @@ -1045,7 +1045,7 @@ void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - set_page_private_reference(&folio->page); + folio_set_f2fs_reference(folio); } void f2fs_remove_dirty_inode(struct inode *inode) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index b3c1df93a163..5c1f47e45dab 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -23,20 +23,18 @@ static struct kmem_cache *cic_entry_slab; static struct kmem_cache *dic_entry_slab; -static void *page_array_alloc(struct inode *inode, int nr) +static void *page_array_alloc(struct f2fs_sb_info *sbi, int nr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int size = sizeof(struct page *) * nr; if (likely(size <= sbi->page_array_slab_size)) return f2fs_kmem_cache_alloc(sbi->page_array_slab, - GFP_F2FS_ZERO, false, F2FS_I_SB(inode)); + GFP_F2FS_ZERO, false, sbi); return f2fs_kzalloc(sbi, size, GFP_NOFS); } -static void page_array_free(struct inode *inode, void *pages, int nr) +static void page_array_free(struct f2fs_sb_info *sbi, void *pages, int nr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int size = sizeof(struct page *) * nr; if (!pages) @@ -73,17 +71,15 @@ static pgoff_t start_idx_of_cluster(struct compress_ctx *cc) return cc->cluster_idx << cc->log_cluster_size; } -bool f2fs_is_compressed_page(struct page *page) +bool f2fs_is_compressed_page(struct folio *folio) { - if (!PagePrivate(page)) - return false; - if (!page_private(page)) + if (!folio->private) return false; - if (page_private_nonpointer(page)) + if (folio_test_f2fs_nonpointer(folio)) return false; - 
f2fs_bug_on(F2FS_P_SB(page), - *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); + f2fs_bug_on(F2FS_F_SB(folio), + *((u32 *)folio->private) != F2FS_COMPRESSED_PAGE_MAGIC); return true; } @@ -149,13 +145,13 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc) if (cc->rpages) return 0; - cc->rpages = page_array_alloc(cc->inode, cc->cluster_size); + cc->rpages = page_array_alloc(F2FS_I_SB(cc->inode), cc->cluster_size); return cc->rpages ? 0 : -ENOMEM; } void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) { - page_array_free(cc->inode, cc->rpages, cc->cluster_size); + page_array_free(F2FS_I_SB(cc->inode), cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; @@ -216,13 +212,13 @@ static int lzo_decompress_pages(struct decompress_io_ctx *dic) ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen, dic->rbuf, &dic->rlen); if (ret != LZO_E_OK) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "lzo decompress failed, ret:%d", ret); return -EIO; } if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "lzo invalid rlen:%zu, expected:%lu", dic->rlen, PAGE_SIZE << dic->log_cluster_size); return -EIO; @@ -296,13 +292,13 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic) ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf, dic->clen, dic->rlen); if (ret < 0) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "lz4 decompress failed, ret:%d", ret); return -EIO; } if (ret != PAGE_SIZE << dic->log_cluster_size) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "lz4 invalid ret:%d, expected:%lu", ret, PAGE_SIZE << dic->log_cluster_size); return -EIO; @@ -424,13 +420,13 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) workspace_size = zstd_dstream_workspace_bound(max_window_size); - workspace = 
f2fs_vmalloc(F2FS_I_SB(dic->inode), workspace_size); + workspace = f2fs_vmalloc(dic->sbi, workspace_size); if (!workspace) return -ENOMEM; stream = zstd_init_dstream(max_window_size, workspace, workspace_size); if (!stream) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "%s zstd_init_dstream failed", __func__); vfree(workspace); return -EIO; @@ -466,14 +462,14 @@ static int zstd_decompress_pages(struct decompress_io_ctx *dic) ret = zstd_decompress_stream(stream, &outbuf, &inbuf); if (zstd_is_error(ret)) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "%s zstd_decompress_stream failed, ret: %d", __func__, zstd_get_error_code(ret)); return -EIO; } if (dic->rlen != outbuf.pos) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "%s ZSTD invalid rlen:%zu, expected:%lu", __func__, dic->rlen, PAGE_SIZE << dic->log_cluster_size); @@ -622,6 +618,7 @@ static void *f2fs_vmap(struct page **pages, unsigned int count) static int f2fs_compress_pages(struct compress_ctx *cc) { + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct f2fs_inode_info *fi = F2FS_I(cc->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; @@ -642,7 +639,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); cc->valid_nr_cpages = cc->nr_cpages; - cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); + cc->cpages = page_array_alloc(sbi, cc->nr_cpages); if (!cc->cpages) { ret = -ENOMEM; goto destroy_compress_ctx; @@ -716,7 +713,7 @@ out_free_cpages: if (cc->cpages[i]) f2fs_compress_free_page(cc->cpages[i]); } - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + page_array_free(sbi, cc->cpages, cc->nr_cpages); cc->cpages = NULL; destroy_compress_ctx: if (cops->destroy_compress_ctx) @@ -734,7 +731,7 @@ static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, void f2fs_decompress_cluster(struct 
decompress_io_ctx *dic, bool in_task) { - struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + struct f2fs_sb_info *sbi = dic->sbi; struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; @@ -796,25 +793,27 @@ out_end_io: f2fs_decompress_end_io(dic, ret, in_task); } +static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, + struct folio *folio, nid_t ino, block_t blkaddr); + /* * This is called when a page of a compressed cluster has been read from disk * (or failed to be read from disk). It checks whether this page was the last * page being waited on in the cluster, and if so, it decompresses the cluster * (or in the case of a failure, cleans up without actually decompressing). */ -void f2fs_end_read_compressed_page(struct page *page, bool failed, +void f2fs_end_read_compressed_page(struct folio *folio, bool failed, block_t blkaddr, bool in_task) { - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); - struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + struct decompress_io_ctx *dic = folio->private; + struct f2fs_sb_info *sbi = dic->sbi; dec_page_count(sbi, F2FS_RD_DATA); if (failed) WRITE_ONCE(dic->failed, true); else if (blkaddr && in_task) - f2fs_cache_compressed_page(sbi, page, + f2fs_cache_compressed_page(sbi, folio, dic->inode->i_ino, blkaddr); if (atomic_dec_and_test(&dic->remaining_pages)) @@ -1340,7 +1339,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; atomic_set(&cic->pending_pages, cc->valid_nr_cpages); - cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); + cic->rpages = page_array_alloc(sbi, cc->cluster_size); if (!cic->rpages) goto out_put_cic; @@ -1420,7 +1419,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, (*submitted)++; unlock_continue: inode_dec_dirty_pages(cc->inode); - unlock_page(fio.page); + folio_unlock(fio.folio); } if 
(fio.compr_blocks) @@ -1442,13 +1441,13 @@ unlock_continue: spin_unlock(&fi->i_size_lock); f2fs_put_rpages(cc); - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + page_array_free(sbi, cc->cpages, cc->nr_cpages); cc->cpages = NULL; f2fs_destroy_compress_ctx(cc, false); return 0; out_destroy_crypt: - page_array_free(cc->inode, cic->rpages, cc->cluster_size); + page_array_free(sbi, cic->rpages, cc->cluster_size); for (--i; i >= 0; i--) { if (!cc->cpages[i]) @@ -1469,18 +1468,18 @@ out_free: f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + page_array_free(sbi, cc->cpages, cc->nr_cpages); cc->cpages = NULL; return -EAGAIN; } -void f2fs_compress_write_end_io(struct bio *bio, struct page *page) +void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio) { + struct page *page = &folio->page; struct f2fs_sb_info *sbi = bio->bi_private; - struct compress_io_ctx *cic = - (struct compress_io_ctx *)page_private(page); - enum count_type type = WB_DATA_TYPE(page, - f2fs_is_compressed_page(page)); + struct compress_io_ctx *cic = folio->private; + enum count_type type = WB_DATA_TYPE(folio, + f2fs_is_compressed_page(folio)); int i; if (unlikely(bio->bi_status != BLK_STS_OK)) @@ -1499,7 +1498,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) end_page_writeback(cic->rpages[i]); } - page_array_free(cic->inode, cic->rpages, cic->nr_rpages); + page_array_free(sbi, cic->rpages, cic->nr_rpages); kmem_cache_free(cic_entry_slab, cic); } @@ -1633,14 +1632,13 @@ static inline bool allow_memalloc_for_decomp(struct f2fs_sb_info *sbi, static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, bool pre_alloc) { - const struct f2fs_compress_ops *cops = - f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + const struct f2fs_compress_ops *cops = f2fs_cops[dic->compress_algorithm]; int i; - if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + if 
(!allow_memalloc_for_decomp(dic->sbi, pre_alloc)) return 0; - dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); + dic->tpages = page_array_alloc(dic->sbi, dic->cluster_size); if (!dic->tpages) return -ENOMEM; @@ -1670,10 +1668,9 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, bool bypass_destroy_callback, bool pre_alloc) { - const struct f2fs_compress_ops *cops = - f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + const struct f2fs_compress_ops *cops = f2fs_cops[dic->compress_algorithm]; - if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + if (!allow_memalloc_for_decomp(dic->sbi, pre_alloc)) return; if (!bypass_destroy_callback && cops->destroy_decompress_ctx) @@ -1700,7 +1697,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) if (!dic) return ERR_PTR(-ENOMEM); - dic->rpages = page_array_alloc(cc->inode, cc->cluster_size); + dic->rpages = page_array_alloc(sbi, cc->cluster_size); if (!dic->rpages) { kmem_cache_free(dic_entry_slab, dic); return ERR_PTR(-ENOMEM); @@ -1708,6 +1705,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; + dic->sbi = sbi; + dic->compress_algorithm = F2FS_I(cc->inode)->i_compress_algorithm; atomic_set(&dic->remaining_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; @@ -1721,7 +1720,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->rpages[i] = cc->rpages[i]; dic->nr_rpages = cc->cluster_size; - dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages); + dic->cpages = page_array_alloc(sbi, dic->nr_cpages); if (!dic->cpages) { ret = -ENOMEM; goto out_free; @@ -1751,6 +1750,8 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic, bool bypass_destroy_callback) { int i; + /* use sbi in dic to avoid UFA of dic->inode*/ + struct f2fs_sb_info *sbi = 
dic->sbi; f2fs_release_decomp_mem(dic, bypass_destroy_callback, true); @@ -1762,7 +1763,7 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic, continue; f2fs_compress_free_page(dic->tpages[i]); } - page_array_free(dic->inode, dic->tpages, dic->cluster_size); + page_array_free(sbi, dic->tpages, dic->cluster_size); } if (dic->cpages) { @@ -1771,10 +1772,10 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic, continue; f2fs_compress_free_page(dic->cpages[i]); } - page_array_free(dic->inode, dic->cpages, dic->nr_cpages); + page_array_free(sbi, dic->cpages, dic->nr_cpages); } - page_array_free(dic->inode, dic->rpages, dic->nr_rpages); + page_array_free(sbi, dic->rpages, dic->nr_rpages); kmem_cache_free(dic_entry_slab, dic); } @@ -1793,8 +1794,7 @@ static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) f2fs_free_dic(dic, false); } else { INIT_WORK(&dic->free_work, f2fs_late_free_dic); - queue_work(F2FS_I_SB(dic->inode)->post_read_wq, - &dic->free_work); + queue_work(dic->sbi->post_read_wq, &dic->free_work); } } } @@ -1921,8 +1921,8 @@ void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr + len - 1); } -void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, - nid_t ino, block_t blkaddr) +static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, + struct folio *folio, nid_t ino, block_t blkaddr) { struct folio *cfolio; int ret; @@ -1953,9 +1953,9 @@ void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, return; } - set_page_private_data(&cfolio->page, ino); + folio_set_f2fs_data(cfolio, ino); - memcpy(folio_address(cfolio), page_address(page), PAGE_SIZE); + memcpy(folio_address(cfolio), folio_address(folio), PAGE_SIZE); folio_mark_uptodate(cfolio); f2fs_folio_put(cfolio, true); } @@ -2012,7 +2012,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) continue; } - if (ino != 
get_page_private_data(&folio->page)) { + if (ino != folio_get_f2fs_data(folio)) { folio_unlock(folio); continue; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 711ad80b38d0..7961e0ddfca3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -47,14 +47,14 @@ void f2fs_destroy_bioset(void) bioset_exit(&f2fs_bioset); } -bool f2fs_is_cp_guaranteed(struct page *page) +bool f2fs_is_cp_guaranteed(const struct folio *folio) { - struct address_space *mapping = page_folio(page)->mapping; + struct address_space *mapping = folio->mapping; struct inode *inode; struct f2fs_sb_info *sbi; - if (fscrypt_is_bounce_page(page)) - return page_private_gcing(fscrypt_pagecache_page(page)); + if (fscrypt_is_bounce_folio(folio)) + return folio_test_f2fs_gcing(fscrypt_pagecache_folio(folio)); inode = mapping->host; sbi = F2FS_I_SB(inode); @@ -65,7 +65,7 @@ bool f2fs_is_cp_guaranteed(struct page *page) return true; if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || - page_private_gcing(page)) + folio_test_f2fs_gcing(folio)) return true; return false; } @@ -142,9 +142,9 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; - if (f2fs_is_compressed_page(&folio->page)) { + if (f2fs_is_compressed_page(folio)) { if (ctx && !ctx->decompression_attempted) - f2fs_end_read_compressed_page(&folio->page, true, 0, + f2fs_end_read_compressed_page(folio, true, 0, in_task); f2fs_put_folio_dic(folio, in_task); continue; @@ -181,14 +181,13 @@ static void f2fs_verify_bio(struct work_struct *work) * as those were handled separately by f2fs_end_read_compressed_page(). 
*/ if (may_have_compressed_pages) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; - if (!f2fs_is_compressed_page(page) && - !fsverity_verify_page(page)) { + if (!f2fs_is_compressed_page(folio) && + !fsverity_verify_page(&folio->page)) { bio->bi_status = BLK_STS_IOERR; break; } @@ -233,16 +232,15 @@ static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, bool in_task) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; bool all_compressed = true; block_t blkaddr = ctx->fs_blkaddr; - bio_for_each_segment_all(bv, ctx->bio, iter_all) { - struct page *page = bv->bv_page; + bio_for_each_folio_all(fi, ctx->bio) { + struct folio *folio = fi.folio; - if (f2fs_is_compressed_page(page)) - f2fs_end_read_compressed_page(page, false, blkaddr, + if (f2fs_is_compressed_page(folio)) + f2fs_end_read_compressed_page(folio, false, blkaddr, in_task); else all_compressed = false; @@ -280,9 +278,9 @@ static void f2fs_post_read_work(struct work_struct *work) static void f2fs_read_end_io(struct bio *bio) { - struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + struct f2fs_sb_info *sbi = F2FS_F_SB(bio_first_folio_all(bio)); struct bio_post_read_ctx *ctx; - bool intask = in_task(); + bool intask = in_task() && !irqs_disabled(); iostat_update_and_unbind_ctx(bio); ctx = bio->bi_private; @@ -339,13 +337,13 @@ static void f2fs_write_end_io(struct bio *bio) } #ifdef CONFIG_F2FS_FS_COMPRESSION - if (f2fs_is_compressed_page(&folio->page)) { - f2fs_compress_write_end_io(bio, &folio->page); + if (f2fs_is_compressed_page(folio)) { + f2fs_compress_write_end_io(bio, folio); continue; } #endif - type = WB_DATA_TYPE(&folio->page, false); + type = WB_DATA_TYPE(folio, false); if 
(unlikely(bio->bi_status != BLK_STS_OK)) { mapping_set_error(folio->mapping, -EIO); @@ -355,12 +353,12 @@ static void f2fs_write_end_io(struct bio *bio) } f2fs_bug_on(sbi, is_node_folio(folio) && - folio->index != nid_of_node(&folio->page)); + folio->index != nid_of_node(folio)); dec_page_count(sbi, type); if (f2fs_in_warm_node_list(sbi, folio)) f2fs_del_fsync_node_entry(sbi, folio); - clear_page_private_gcing(&folio->page); + folio_clear_f2fs_gcing(folio); folio_end_writeback(folio); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && @@ -419,7 +417,6 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); - struct folio *fio_folio = page_folio(fio->page); unsigned int fua_flag, meta_flag, io_flag; blk_opf_t op_flags = 0; @@ -447,7 +444,7 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) op_flags |= REQ_FUA; if (fio->type == DATA && - F2FS_I(fio_folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE) + F2FS_I(fio->folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE) op_flags |= REQ_PRIO; return op_flags; @@ -546,14 +543,14 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) } static bool __has_merged_page(struct bio *bio, struct inode *inode, - struct page *page, nid_t ino) + struct folio *folio, nid_t ino) { struct folio_iter fi; if (!bio) return false; - if (!inode && !page && !ino) + if (!inode && !folio && !ino) return true; bio_for_each_folio_all(fi, bio) { @@ -564,7 +561,7 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, if (IS_ERR(target)) continue; } - if (f2fs_is_compressed_page(&target->page)) { + if (f2fs_is_compressed_page(target)) { target = f2fs_compress_control_folio(target); if (IS_ERR(target)) continue; @@ -572,9 +569,9 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, if (inode && inode == target->mapping->host) return true; - if (page && page == 
&target->page) + if (folio && folio == target) return true; - if (ino && ino == ino_of_node(&target->page)) + if (ino && ino == ino_of_node(target)) return true; } @@ -641,7 +638,7 @@ unlock_out: } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, + struct inode *inode, struct folio *folio, nid_t ino, enum page_type type, bool force) { enum temp_type temp; @@ -653,7 +650,7 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, struct f2fs_bio_info *io = sbi->write_io[btype] + temp; f2fs_down_read(&io->io_rwsem); - ret = __has_merged_page(io->bio, inode, page, ino); + ret = __has_merged_page(io->bio, inode, folio, ino); f2fs_up_read(&io->io_rwsem); } if (ret) @@ -671,10 +668,10 @@ void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, + struct inode *inode, struct folio *folio, nid_t ino, enum page_type type) { - __submit_merged_write_cond(sbi, inode, page, ino, type, false); + __submit_merged_write_cond(sbi, inode, folio, ino, type, false); } void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) @@ -691,7 +688,7 @@ void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; - struct folio *fio_folio = page_folio(fio->page); + struct folio *fio_folio = fio->folio; struct folio *data_folio = fio->encrypted_page ? page_folio(fio->encrypted_page) : fio_folio; @@ -713,7 +710,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) wbc_account_cgroup_owner(fio->io_wbc, fio_folio, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? 
- __read_io_type(data_folio) : WB_DATA_TYPE(fio->page, false)); + __read_io_type(data_folio) : WB_DATA_TYPE(fio->folio, false)); if (is_read_io(bio_op(bio))) f2fs_submit_read_bio(fio->sbi, bio, fio->type); @@ -779,7 +776,7 @@ static void del_bio_entry(struct bio_entry *be) static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, struct page *page) { - struct folio *fio_folio = page_folio(fio->page); + struct folio *fio_folio = fio->folio; struct f2fs_sb_info *sbi = fio->sbi; enum temp_type temp; bool found = false; @@ -848,7 +845,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, - &folio->page, 0); + folio, 0); if (found) break; } @@ -865,7 +862,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, - &folio->page, 0); + folio, 0); if (found) { target = be->bio; del_bio_entry(be); @@ -886,15 +883,15 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, int f2fs_merge_page_bio(struct f2fs_io_info *fio) { struct bio *bio = *fio->bio; - struct page *page = fio->encrypted_page ? - fio->encrypted_page : fio->page; - struct folio *folio = page_folio(fio->page); + struct folio *data_folio = fio->encrypted_page ? + page_folio(fio->encrypted_page) : fio->folio; + struct folio *folio = fio->folio; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, __is_meta_io(fio) ? 
META_GENERIC : DATA_GENERIC)) return -EFSCORRUPTED; - trace_f2fs_submit_folio_bio(page_folio(page), fio); + trace_f2fs_submit_folio_bio(data_folio, fio); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) @@ -905,16 +902,16 @@ alloc_new: f2fs_set_bio_crypt_ctx(bio, folio->mapping->host, folio->index, fio, GFP_NOIO); - add_bio_entry(fio->sbi, bio, page, fio->temp); + add_bio_entry(fio->sbi, bio, &data_folio->page, fio->temp); } else { - if (add_ipu_page(fio, &bio, page)) + if (add_ipu_page(fio, &bio, &data_folio->page)) goto alloc_new; } if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); - inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); + inc_page_count(fio->sbi, WB_DATA_TYPE(data_folio, false)); *fio->last_block = fio->new_blkaddr; *fio->bio = bio; @@ -949,7 +946,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; - struct page *bio_page; + struct folio *bio_folio; enum count_type type; f2fs_bug_on(sbi, is_read_io(fio->op)); @@ -980,44 +977,44 @@ next: verify_fio_blkaddr(fio); if (fio->encrypted_page) - bio_page = fio->encrypted_page; + bio_folio = page_folio(fio->encrypted_page); else if (fio->compressed_page) - bio_page = fio->compressed_page; + bio_folio = page_folio(fio->compressed_page); else - bio_page = fio->page; + bio_folio = fio->folio; /* set submitted = true as a return value */ fio->submitted = 1; - type = WB_DATA_TYPE(bio_page, fio->compressed_page); + type = WB_DATA_TYPE(bio_folio, fio->compressed_page); inc_page_count(sbi, type); if (io->bio && (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio, fio->new_blkaddr) || !f2fs_crypt_mergeable_bio(io->bio, fio_inode(fio), - page_folio(bio_page)->index, fio))) + bio_folio->index, fio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { io->bio = __bio_alloc(fio, 
BIO_MAX_VECS); f2fs_set_bio_crypt_ctx(io->bio, fio_inode(fio), - page_folio(bio_page)->index, fio, GFP_NOIO); + bio_folio->index, fio, GFP_NOIO); io->fio = *fio; } - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { + if (!bio_add_folio(io->bio, bio_folio, folio_size(bio_folio), 0)) { __submit_merged_bio(io); goto alloc_new; } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), - PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, fio->folio, + folio_size(fio->folio)); io->last_block_in_bio = fio->new_blkaddr; - trace_f2fs_submit_folio_write(page_folio(fio->page), fio); + trace_f2fs_submit_folio_write(fio->folio, fio); #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && is_end_zone_blkaddr(sbi, fio->new_blkaddr)) { @@ -1553,10 +1550,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) unsigned int start_pgofs; int bidx = 0; bool is_hole; + bool lfs_dio_write; if (!maxblocks) return 0; + lfs_dio_write = (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && + map->m_may_create); + if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag)) goto out; @@ -1572,8 +1573,11 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) end = pgofs + maxblocks; next_dnode: - if (map->m_may_create) + if (map->m_may_create) { + if (f2fs_lfs_mode(sbi)) + f2fs_balance_fs(sbi, true); f2fs_map_lock(sbi, flag); + } /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -1589,7 +1593,7 @@ next_dnode: start_pgofs = pgofs; prealloc = 0; last_ofs_in_node = ofs_in_node = dn.ofs_in_node; - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); next_block: blkaddr = f2fs_data_blkaddr(&dn); @@ -1603,7 +1607,7 @@ next_block: /* use out-place-update for direct IO under LFS mode */ if (map->m_may_create && (is_hole || (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && - 
!f2fs_is_pinned_file(inode)))) { + !f2fs_is_pinned_file(inode) && map->m_last_pblk != blkaddr))) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto sync_out; @@ -1687,10 +1691,15 @@ next_block: if (map->m_multidev_dio) map->m_bdev = FDEV(bidx).bdev; + + if (lfs_dio_write) + map->m_last_pblk = NULL_ADDR; } else if (map_is_mergeable(sbi, map, blkaddr, flag, bidx, ofs)) { ofs++; map->m_len++; } else { + if (lfs_dio_write && !f2fs_is_pinned_file(inode)) + map->m_last_pblk = blkaddr; goto sync_out; } @@ -1715,14 +1724,6 @@ skip: dn.ofs_in_node = end_offset; } - if (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && - map->m_may_create) { - /* the next block to be allocated may not be contiguous. */ - if (GET_SEGOFF_FROM_SEG0(sbi, blkaddr) % BLKS_PER_SEC(sbi) == - CAP_BLKS_PER_SEC(sbi) - 1) - goto sync_out; - } - if (pgofs >= end) goto sync_out; else if (dn.ofs_in_node < end_offset) @@ -2303,7 +2304,7 @@ submit_and_realloc: } if (!bio) { - bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, + bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages - i, f2fs_ra_op_flags(rac), folio->index, for_write); if (IS_ERR(bio)) { @@ -2376,6 +2377,14 @@ static int f2fs_mpage_readpages(struct inode *inode, unsigned max_nr_pages = nr_pages; int ret = 0; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + index = rac ? 
readahead_index(rac) : folio->index; + max_nr_pages = round_up(index + nr_pages, cc.cluster_size) - + round_down(index, cc.cluster_size); + } +#endif + map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; @@ -2642,7 +2651,7 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) int f2fs_do_write_data_page(struct f2fs_io_info *fio) { - struct folio *folio = page_folio(fio->page); + struct folio *folio = fio->folio; struct inode *inode = folio->mapping->host; struct dnode_of_data dn; struct node_info ni; @@ -2652,7 +2661,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) /* Use COW inode to make dnode_of_data for atomic write */ atomic_commit = f2fs_is_atomic_file(inode) && - page_private_atomic(folio_page(folio, 0)); + folio_test_f2fs_atomic(folio); if (atomic_commit) set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); else @@ -2683,7 +2692,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) /* This page is already truncated */ if (fio->old_blkaddr == NULL_ADDR) { folio_clear_uptodate(folio); - clear_page_private_gcing(folio_page(folio, 0)); + folio_clear_f2fs_gcing(folio); goto out_writepage; } got_it: @@ -2753,7 +2762,7 @@ got_it: trace_f2fs_do_write_data_page(folio, OPU); set_inode_flag(inode, FI_APPEND_WRITE); if (atomic_commit) - clear_page_private_atomic(folio_page(folio, 0)); + folio_clear_f2fs_atomic(folio); out_writepage: f2fs_put_dnode(&dn); out: @@ -2771,7 +2780,6 @@ int f2fs_write_single_data_page(struct folio *folio, int *submitted, bool allow_balance) { struct inode *inode = folio->mapping->host; - struct page *page = folio_page(folio, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long)i_size) @@ -2788,7 +2796,7 @@ int f2fs_write_single_data_page(struct folio *folio, int *submitted, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), .old_blkaddr = NULL_ADDR, - .page = page, + .folio = folio, .encrypted_page = NULL, .submitted = 0, 
.compr_blocks = compr_blocks, @@ -2890,7 +2898,7 @@ out: inode_dec_dirty_pages(inode); if (err) { folio_clear_uptodate(folio); - clear_page_private_gcing(page); + folio_clear_f2fs_gcing(folio); } folio_unlock(folio); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && @@ -3376,7 +3384,7 @@ restart: f2fs_do_read_inline_data(folio, ifolio); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) - set_page_private_inline(&ifolio->page); + folio_set_f2fs_inline(ifolio); goto out; } err = f2fs_convert_inline_folio(&dn, folio); @@ -3698,7 +3706,7 @@ static int f2fs_write_end(const struct kiocb *iocb, folio_mark_dirty(folio); if (f2fs_is_atomic_file(inode)) - set_page_private_atomic(folio_page(folio, 0)); + folio_set_f2fs_atomic(folio); if (pos + copied > i_size_read(inode) && !f2fs_verity_in_progress(inode)) { @@ -3733,7 +3741,7 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) f2fs_remove_dirty_inode(inode); } } - clear_page_private_all(&folio->page); + folio_detach_private(folio); } bool f2fs_release_folio(struct folio *folio, gfp_t wait) @@ -3742,7 +3750,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait) if (folio_test_dirty(folio)) return false; - clear_page_private_all(&folio->page); + folio_detach_private(folio); return true; } @@ -4160,7 +4168,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { - struct f2fs_map_blocks map = {}; + struct f2fs_map_blocks map = { NULL, }; pgoff_t next_pgofs = 0; int err; @@ -4169,6 +4177,10 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_next_pgofs = &next_pgofs; map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); + if (flags & IOMAP_WRITE && iomap->private) { + map.m_last_pblk = (unsigned long)iomap->private; + iomap->private = NULL; + } /* * If the blocks being overwritten are already allocated, @@ -4207,6 +4219,9 @@ 
static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->flags |= IOMAP_F_MERGED; iomap->bdev = map.m_bdev; iomap->addr = F2FS_BLK_TO_BYTES(map.m_pblk); + + if (flags & IOMAP_WRITE && map.m_last_pblk) + iomap->private = (void *)map.m_last_pblk; } else { if (flags & IOMAP_WRITE) return -ENOTBLK; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 16c2dfb4f595..43a83bbd3bc5 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -21,7 +21,7 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static DEFINE_RAW_SPINLOCK(f2fs_stat_lock); +static DEFINE_SPINLOCK(f2fs_stat_lock); #ifdef CONFIG_DEBUG_FS static struct dentry *f2fs_debugfs_root; #endif @@ -91,7 +91,7 @@ static void update_multidevice_stats(struct f2fs_sb_info *sbi) seg_blks = get_seg_entry(sbi, j)->valid_blocks; /* update segment stats */ - if (IS_CURSEG(sbi, j)) + if (is_curseg(sbi, j)) dev_stats[i].devstats[0][DEVSTAT_INUSE]++; else if (seg_blks == BLKS_PER_SEG(sbi)) dev_stats[i].devstats[0][DEVSTAT_FULL]++; @@ -109,7 +109,7 @@ static void update_multidevice_stats(struct f2fs_sb_info *sbi) sec_blks = get_sec_entry(sbi, j)->valid_blocks; /* update section stats */ - if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, j))) + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, j))) dev_stats[i].devstats[1][DEVSTAT_INUSE]++; else if (sec_blks == BLKS_PER_SEC(sbi)) dev_stats[i].devstats[1][DEVSTAT_FULL]++; @@ -439,9 +439,8 @@ static int stat_show(struct seq_file *s, void *v) { struct f2fs_stat_info *si; int i = 0, j = 0; - unsigned long flags; - raw_spin_lock_irqsave(&f2fs_stat_lock, flags); + spin_lock(&f2fs_stat_lock); list_for_each_entry(si, &f2fs_stat_list, stat_list) { struct f2fs_sb_info *sbi = si->sbi; @@ -753,7 +752,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } - raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + spin_unlock(&f2fs_stat_lock); return 0; } @@ -765,7 +764,6 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) 
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; struct f2fs_dev_stats *dev_stats; - unsigned long flags; int i; si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); @@ -817,9 +815,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->max_aw_cnt, 0); - raw_spin_lock_irqsave(&f2fs_stat_lock, flags); + spin_lock(&f2fs_stat_lock); list_add_tail(&si->stat_list, &f2fs_stat_list); - raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + spin_unlock(&f2fs_stat_lock); return 0; } @@ -827,11 +825,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); - unsigned long flags; - raw_spin_lock_irqsave(&f2fs_stat_lock, flags); + spin_lock(&f2fs_stat_lock); list_del(&si->stat_list); - raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + spin_unlock(&f2fs_stat_lock); kfree(si->dev_stats); kfree(si); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c36b3b22bfff..fffd7749d6d1 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -454,7 +454,7 @@ static void init_dent_inode(struct inode *dir, struct inode *inode, f2fs_folio_wait_writeback(ifolio, NODE, true, true); /* copy name info. 
to this inode folio */ - ri = F2FS_INODE(&ifolio->page); + ri = F2FS_INODE(ifolio); ri->i_namelen = cpu_to_le32(fname->disk_name.len); memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len); if (IS_ENCRYPTED(dir)) { @@ -897,7 +897,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct folio *folio, f2fs_clear_page_cache_dirty_tag(folio); folio_clear_dirty_for_io(folio); folio_clear_uptodate(folio); - clear_page_private_all(&folio->page); + folio_detach_private(folio); inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index cfe925a3d555..199c1e7a83ef 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -19,10 +19,10 @@ #include "node.h" #include <trace/events/f2fs.h> -bool sanity_check_extent_cache(struct inode *inode, struct page *ipage) +bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; + struct f2fs_extent *i_ext = &F2FS_INODE(ifolio)->i_ext; struct extent_info ei; int devi; @@ -411,10 +411,10 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct folio *ifolio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; - struct f2fs_extent *i_ext = &F2FS_INODE(&ifolio->page)->i_ext; + struct f2fs_extent *i_ext = &F2FS_INODE(ifolio)->i_ext; struct extent_tree *et; struct extent_node *en; - struct extent_info ei; + struct extent_info ei = {0}; if (!__may_extent_tree(inode, EX_READ)) { /* drop largest read extent */ @@ -934,7 +934,7 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ if (!__may_extent_tree(dn->inode, type)) return; - ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(&dn->node_folio->page), dn->inode) + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_folio), dn->inode) + dn->ofs_in_node; ei.len = 1; diff --git a/fs/f2fs/f2fs.h 
b/fs/f2fs/f2fs.h index c78464792ceb..46be7560548c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -386,7 +386,7 @@ struct discard_cmd { struct rb_node rb_node; /* rb node located in rb-tree */ struct discard_info di; /* discard info */ struct list_head list; /* command list */ - struct completion wait; /* compleation */ + struct completion wait; /* completion */ struct block_device *bdev; /* bdev */ unsigned short ref; /* reference count */ unsigned char state; /* state */ @@ -732,6 +732,7 @@ struct f2fs_map_blocks { block_t m_lblk; unsigned int m_len; unsigned int m_flags; + unsigned long m_last_pblk; /* last allocated block, only used for DIO in LFS mode */ pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; @@ -875,6 +876,7 @@ struct f2fs_inode_info { /* linked in global inode list for cache donation */ struct list_head gdonate_list; pgoff_t donate_start, donate_end; /* inclusive */ + atomic_t open_count; /* # of open files */ struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree[NR_EXTENT_CACHES]; @@ -1123,8 +1125,8 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ -#define WB_DATA_TYPE(p, f) \ - (f || f2fs_is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) +#define WB_DATA_TYPE(folio, f) \ + (f || f2fs_is_cp_guaranteed(folio) ? 
F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -1240,7 +1242,10 @@ struct f2fs_io_info { blk_opf_t op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ - struct page *page; /* page to be written */ + union { + struct page *page; /* page to be written */ + struct folio *folio; + }; struct page *encrypted_page; /* encrypted page */ struct page *compressed_page; /* compressed page */ struct list_head list; /* serialize IOs */ @@ -1286,7 +1291,7 @@ struct f2fs_bio_info { struct f2fs_dev_info { struct file *bdev_file; struct block_device *bdev; - char path[MAX_PATH_LEN]; + char path[MAX_PATH_LEN + 1]; unsigned int total_segments; block_t start_blk; block_t end_blk; @@ -1427,7 +1432,7 @@ enum { enum { MEMORY_MODE_NORMAL, /* memory mode for normal devices */ - MEMORY_MODE_LOW, /* memory mode for low memry devices */ + MEMORY_MODE_LOW, /* memory mode for low memory devices */ }; enum errors_option { @@ -1491,7 +1496,7 @@ enum compress_flag { #define COMPRESS_DATA_RESERVED_SIZE 4 struct compress_data { __le32 clen; /* compressed data size */ - __le32 chksum; /* compressed data chksum */ + __le32 chksum; /* compressed data checksum */ __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ u8 cdata[]; /* compressed data */ }; @@ -1536,6 +1541,7 @@ struct compress_io_ctx { struct decompress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belong to */ + struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ pgoff_t cluster_idx; /* cluster index number */ unsigned int cluster_size; /* page count in cluster */ unsigned int log_cluster_size; /* log of cluster size */ @@ -1576,6 +1582,7 @@ struct decompress_io_ctx { bool failed; /* IO error occurred before decompression? */ bool need_verity; /* need fs-verity verification after decompression? 
*/ + unsigned char compress_algorithm; /* backup algorithm type */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ struct work_struct verity_work; /* work to verify the decompressed pages */ @@ -1724,6 +1731,9 @@ struct f2fs_sb_info { /* for skip statistic */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ + /* free sections reserved for pinned file */ + unsigned int reserved_pin_section; + /* threshold for gc trials on pinned files */ unsigned short gc_pin_file_threshold; struct f2fs_rwsem pin_sem; @@ -2013,16 +2023,11 @@ static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping) return F2FS_I_SB(mapping->host); } -static inline struct f2fs_sb_info *F2FS_F_SB(struct folio *folio) +static inline struct f2fs_sb_info *F2FS_F_SB(const struct folio *folio) { return F2FS_M_SB(folio->mapping); } -static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page) -{ - return F2FS_F_SB(page_folio(page)); -} - static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) { return (struct f2fs_super_block *)(sbi->raw_super); @@ -2043,14 +2048,14 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) return (struct f2fs_checkpoint *)(sbi->ckpt); } -static inline struct f2fs_node *F2FS_NODE(const struct page *page) +static inline struct f2fs_node *F2FS_NODE(const struct folio *folio) { - return (struct f2fs_node *)page_address(page); + return (struct f2fs_node *)folio_address(folio); } -static inline struct f2fs_inode *F2FS_INODE(struct page *page) +static inline struct f2fs_inode *F2FS_INODE(const struct folio *folio) { - return &((struct f2fs_node *)page_address(page))->i; + return &((struct f2fs_node *)folio_address(folio))->i; } static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) @@ -2453,6 +2458,13 @@ release_quota: } #define PAGE_PRIVATE_GET_FUNC(name, flagname) \ +static inline bool folio_test_f2fs_##name(const struct folio 
*folio) \ +{ \ + unsigned long priv = (unsigned long)folio->private; \ + unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \ + (1UL << PAGE_PRIVATE_##flagname); \ + return (priv & v) == v; \ +} \ static inline bool page_private_##name(struct page *page) \ { \ return PagePrivate(page) && \ @@ -2461,6 +2473,17 @@ static inline bool page_private_##name(struct page *page) \ } #define PAGE_PRIVATE_SET_FUNC(name, flagname) \ +static inline void folio_set_f2fs_##name(struct folio *folio) \ +{ \ + unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \ + (1UL << PAGE_PRIVATE_##flagname); \ + if (!folio->private) \ + folio_attach_private(folio, (void *)v); \ + else { \ + v |= (unsigned long)folio->private; \ + folio->private = (void *)v; \ + } \ +} \ static inline void set_page_private_##name(struct page *page) \ { \ if (!PagePrivate(page)) \ @@ -2470,6 +2493,16 @@ static inline void set_page_private_##name(struct page *page) \ } #define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ +static inline void folio_clear_f2fs_##name(struct folio *folio) \ +{ \ + unsigned long v = (unsigned long)folio->private; \ + \ + v &= ~(1UL << PAGE_PRIVATE_##flagname); \ + if (v == (1UL << PAGE_PRIVATE_NOT_POINTER)) \ + folio_detach_private(folio); \ + else \ + folio->private = (void *)v; \ +} \ static inline void clear_page_private_##name(struct page *page) \ { \ clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ @@ -2492,39 +2525,23 @@ PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE); -static inline unsigned long get_page_private_data(struct page *page) +static inline unsigned long folio_get_f2fs_data(struct folio *folio) { - unsigned long data = page_private(page); + unsigned long data = (unsigned long)folio->private; if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) return 0; return data >> PAGE_PRIVATE_MAX; } -static inline void set_page_private_data(struct page *page, unsigned long data) 
+static inline void folio_set_f2fs_data(struct folio *folio, unsigned long data) { - if (!PagePrivate(page)) - attach_page_private(page, (void *)0); - set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); - page_private(page) |= data << PAGE_PRIVATE_MAX; -} - -static inline void clear_page_private_data(struct page *page) -{ - page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0); - if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) - detach_page_private(page); -} + data = (1UL << PAGE_PRIVATE_NOT_POINTER) | (data << PAGE_PRIVATE_MAX); -static inline void clear_page_private_all(struct page *page) -{ - clear_page_private_data(page); - clear_page_private_reference(page); - clear_page_private_gcing(page); - clear_page_private_inline(page); - clear_page_private_atomic(page); - - f2fs_bug_on(F2FS_P_SB(page), page_private(page)); + if (!folio_test_private(folio)) + folio_attach_private(folio, (void *)data); + else + folio->private = (void *)((unsigned long)folio->private | data); } static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, @@ -3011,9 +3028,9 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, #define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) -static inline bool IS_INODE(struct page *page) +static inline bool IS_INODE(const struct folio *folio) { - struct f2fs_node *p = F2FS_NODE(page); + struct f2fs_node *p = F2FS_NODE(folio); return RAW_IS_INODE(p); } @@ -3031,20 +3048,20 @@ static inline __le32 *blkaddr_in_node(struct f2fs_node *node) static inline int f2fs_has_extra_attr(struct inode *inode); static inline unsigned int get_dnode_base(struct inode *inode, - struct page *node_page) + struct folio *node_folio) { - if (!IS_INODE(node_page)) + if (!IS_INODE(node_folio)) return 0; return inode ? 
get_extra_isize(inode) : - offset_in_addr(&F2FS_NODE(node_page)->i); + offset_in_addr(&F2FS_NODE(node_folio)->i); } static inline __le32 *get_dnode_addr(struct inode *inode, struct folio *node_folio) { - return blkaddr_in_node(F2FS_NODE(&node_folio->page)) + - get_dnode_base(inode, &node_folio->page); + return blkaddr_in_node(F2FS_NODE(node_folio)) + + get_dnode_base(inode, node_folio); } static inline block_t data_blkaddr(struct inode *inode, @@ -3366,9 +3383,10 @@ static inline unsigned int addrs_per_page(struct inode *inode, return addrs; } -static inline void *inline_xattr_addr(struct inode *inode, struct folio *folio) +static inline +void *inline_xattr_addr(struct inode *inode, const struct folio *folio) { - struct f2fs_inode *ri = F2FS_INODE(&folio->page); + struct f2fs_inode *ri = F2FS_INODE(folio); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)]); @@ -3628,13 +3646,14 @@ int f2fs_pin_file_control(struct inode *inode, bool inc); */ void f2fs_set_inode_flags(struct inode *inode); bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio); -void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_update_inode(struct inode *inode, struct folio *node_folio); void f2fs_update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); +void f2fs_remove_donate_inode(struct inode *inode); void f2fs_evict_inode(struct inode *inode); void f2fs_handle_failed_inode(struct inode *inode); @@ -3784,8 +3803,8 @@ void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); int f2fs_try_to_free_nids(struct 
f2fs_sb_info *sbi, int nr_shrink); int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio); -int f2fs_recover_xattr_data(struct inode *inode, struct page *page); -int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio); +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); @@ -3852,7 +3871,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, bool recover_newaddr); enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, enum log_type seg_type); -int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio); @@ -3886,7 +3905,7 @@ unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, static inline struct inode *fio_inode(struct f2fs_io_info *fio) { - return page_folio(fio->page)->mapping->host; + return fio->folio->mapping->host; } #define DEF_FRAGMENT_SIZE 4 @@ -3953,7 +3972,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); -bool f2fs_is_cp_guaranteed(struct page *page); +bool f2fs_is_cp_guaranteed(const struct folio *folio); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, @@ -3961,7 +3980,7 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct 
f2fs_sb_info *sbi, - struct inode *inode, struct page *page, + struct inode *inode, struct folio *folio, nid_t ino, enum page_type type); void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, struct bio **bio, struct folio *folio); @@ -4303,7 +4322,7 @@ extern struct kmem_cache *f2fs_inode_entry_slab; * inline.c */ bool f2fs_may_inline_data(struct inode *inode); -bool f2fs_sanity_check_inline_data(struct inode *inode, struct page *ipage); +bool f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio); bool f2fs_may_inline_dentry(struct inode *inode); void f2fs_do_read_inline_data(struct folio *folio, struct folio *ifolio); void f2fs_truncate_inline_inode(struct inode *inode, struct folio *ifolio, @@ -4345,7 +4364,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -bool sanity_check_extent_cache(struct inode *inode, struct page *ipage); +bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio); void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); @@ -4435,20 +4454,20 @@ enum cluster_check_type { CLUSTER_COMPR_BLKS, /* return # of compressed blocks in a cluster */ CLUSTER_RAW_BLKS /* return # of raw blocks in a cluster */ }; -bool f2fs_is_compressed_page(struct page *page); +bool f2fs_is_compressed_page(struct folio *folio); struct folio *f2fs_compress_control_folio(struct folio *folio); int f2fs_prepare_compress_overwrite(struct inode *inode, struct page **pagep, pgoff_t index, void **fsdata); bool f2fs_compress_write_end(struct inode *inode, void *fsdata, pgoff_t index, unsigned copied); int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); -void f2fs_compress_write_end_io(struct bio *bio, struct page *page); +void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio); bool f2fs_is_compress_backend_ready(struct inode *inode); bool f2fs_is_compress_level_valid(int alg, 
int lvl); int __init f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); -void f2fs_end_read_compressed_page(struct page *page, bool failed, +void f2fs_end_read_compressed_page(struct folio *folio, bool failed, block_t blkaddr, bool in_task); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); @@ -4486,8 +4505,6 @@ void f2fs_destroy_compress_cache(void); struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi); void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int len); -void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, - nid_t ino, block_t blkaddr); bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, block_t blkaddr); void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); @@ -4504,7 +4521,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); sbi->compr_saved_block += diff; \ } while (0) #else -static inline bool f2fs_is_compressed_page(struct page *page) { return false; } +static inline bool f2fs_is_compressed_page(struct folio *folio) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) { if (!f2fs_compressed_file(inode)) @@ -4522,7 +4539,7 @@ static inline int __init f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { } -static inline void f2fs_end_read_compressed_page(struct page *page, +static inline void f2fs_end_read_compressed_page(struct folio *folio, bool failed, block_t blkaddr, bool in_task) { WARN_ON_ONCE(1); @@ -4542,8 +4559,6 @@ static inline int __init f2fs_init_compress_cache(void) { return 0; } static inline void f2fs_destroy_compress_cache(void) { } 
static inline void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int len) { } -static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, - struct page *page, nid_t ino, block_t blkaddr) { } static inline bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, block_t blkaddr) { return false; } static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c677230699fd..42faaed6a02d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -489,7 +489,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) } } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; @@ -629,7 +629,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; - return finish_preallocate_blocks(inode); + err = finish_preallocate_blocks(inode); + if (!err) + atomic_inc(&F2FS_I(inode)->open_count); + return err; } void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) @@ -708,7 +711,7 @@ next: * once we invalidate valid blkaddr in range [ofs, ofs + count], * we will invalidate all blkaddr in the whole range. 
*/ - fofs = f2fs_start_bidx_of_node(ofs_of_node(&dn->node_folio->page), + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_folio), dn->inode) + ofs; f2fs_update_read_extent_cache_range(dn, fofs, 0, len); f2fs_update_age_extent_cache_range(dn, fofs, len); @@ -815,12 +818,12 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) goto out; } - count = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + count = ADDRS_PER_PAGE(dn.node_folio, inode); count -= dn.ofs_in_node; f2fs_bug_on(sbi, count < 0); - if (dn.ofs_in_node || IS_INODE(&dn.node_folio->page)) { + if (dn.ofs_in_node || IS_INODE(dn.node_folio)) { f2fs_truncate_data_blocks_range(&dn, count); free_from += count; } @@ -1043,11 +1046,24 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = setattr_prepare(idmap, dentry, attr); + if (err) + return err; + + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + + err = fsverity_prepare_setattr(dentry, attr); + if (err) + return err; + if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; @@ -1064,20 +1080,19 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, !IS_ALIGNED(attr->ia_size, F2FS_BLK_TO_BYTES(fi->i_cluster_size))) return -EINVAL; + /* + * To prevent scattered pin block generation, we don't allow + * smaller/equal size unaligned truncation for pinned file. + * We only support overwrite IO to pinned file, so don't + * care about larger size truncation. 
+ */ + if (f2fs_is_pinned_file(inode) && + attr->ia_size <= i_size_read(inode) && + !IS_ALIGNED(attr->ia_size, + F2FS_BLK_TO_BYTES(CAP_BLKS_PER_SEC(sbi)))) + return -EINVAL; } - err = setattr_prepare(idmap, dentry, attr); - if (err) - return err; - - err = fscrypt_prepare_setattr(dentry, attr); - if (err) - return err; - - err = fsverity_prepare_setattr(dentry, attr); - if (err) - return err; - if (is_quota_modification(idmap, inode, attr)) { err = f2fs_dquot_initialize(inode); if (err) @@ -1085,12 +1100,11 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { - f2fs_lock_op(F2FS_I_SB(inode)); + f2fs_lock_op(sbi); err = dquot_transfer(idmap, inode, attr); if (err) { - set_sbi_flag(F2FS_I_SB(inode), - SBI_QUOTA_NEED_REPAIR); - f2fs_unlock_op(F2FS_I_SB(inode)); + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + f2fs_unlock_op(sbi); return err; } /* @@ -1100,7 +1114,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); - f2fs_unlock_op(F2FS_I_SB(inode)); + f2fs_unlock_op(sbi); } if (attr->ia_valid & ATTR_SIZE) { @@ -1163,7 +1177,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, f2fs_mark_inode_dirty_sync(inode, true); /* inode change will produce dirty node pages flushed by checkpoint */ - f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_balance_fs(sbi, true); return err; } @@ -1223,7 +1237,7 @@ int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return err; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); @@ -1322,7 +1336,7 @@ next_dnode: goto next; } - done = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, 
inode) - + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { *blkaddr = f2fs_data_blkaddr(&dn); @@ -1411,7 +1425,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, } ilen = min((pgoff_t) - ADDRS_PER_PAGE(&dn.node_folio->page, dst_inode) - + ADDRS_PER_PAGE(dn.node_folio, dst_inode) - dn.ofs_in_node, len - i); do { dn.data_blkaddr = f2fs_data_blkaddr(&dn); @@ -1453,7 +1467,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, memcpy_folio(fdst, 0, fsrc, 0, PAGE_SIZE); folio_mark_dirty(fdst); - set_page_private_gcing(&fdst->page); + folio_set_f2fs_gcing(fdst); f2fs_folio_put(fdst, true); f2fs_folio_put(fsrc, true); @@ -1707,7 +1721,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, goto out; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); end = min(pg_end, end_offset - dn.ofs_in_node + index); ret = f2fs_do_zero_range(&dn, index, end); @@ -1888,9 +1902,8 @@ next_alloc: } } - if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ? - ZONED_PIN_SEC_REQUIRED_COUNT : - GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { + if (has_not_enough_free_secs(sbi, 0, + sbi->reserved_pin_section)) { f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); @@ -2028,6 +2041,9 @@ out: static int f2fs_release_file(struct inode *inode, struct file *filp) { + if (atomic_dec_and_test(&F2FS_I(inode)->open_count)) + f2fs_remove_donate_inode(inode); + /* * f2fs_release_file is called at every close calls. So we should * not drop any inmemory pages by close called by other process. 
@@ -2978,7 +2994,7 @@ do_map: f2fs_folio_wait_writeback(folio, DATA, true, true); folio_mark_dirty(folio); - set_page_private_gcing(&folio->page); + folio_set_f2fs_gcing(folio); f2fs_folio_put(folio, true); idx++; @@ -3876,7 +3892,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) break; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); count = round_up(count, fi->i_cluster_size); @@ -4054,7 +4070,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) break; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); count = round_up(count, fi->i_cluster_size); @@ -4218,7 +4234,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) goto out; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, pg_end - index); for (i = 0; i < count; i++, index++, dn.ofs_in_node++) { struct block_device *cur_bdev; @@ -4415,7 +4431,7 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) f2fs_folio_wait_writeback(folio, DATA, true, true); folio_mark_dirty(folio); - set_page_private_gcing(&folio->page); + folio_set_f2fs_gcing(folio); redirty_idx = folio_next_index(folio); folio_unlock(folio); folio_put_refs(folio, 2); @@ -4825,6 +4841,7 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); const loff_t pos = iocb->ki_pos; ssize_t ret; + bool dio; if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; @@ -4833,12 +4850,15 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) f2fs_trace_rw_file_path(iocb->ki_filp, iocb->ki_pos, 
iov_iter_count(to), READ); + dio = f2fs_should_use_dio(inode, iocb, to); + /* In LFS mode, if there is inflight dio, wait for its completion */ if (f2fs_lfs_mode(F2FS_I_SB(inode)) && - get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE)) + get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE) && + (!f2fs_is_pinned_file(inode) || !dio)) inode_dio_wait(inode); - if (f2fs_should_use_dio(inode, iocb, to)) { + if (dio) { ret = f2fs_dio_read_iter(iocb, to); } else { ret = filemap_read(iocb, to, 0); @@ -4846,8 +4866,7 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_BUFFERED_READ_IO, ret); } - if (trace_f2fs_dataread_end_enabled()) - trace_f2fs_dataread_end(inode, pos, ret); + trace_f2fs_dataread_end(inode, pos, ret); return ret; } @@ -4870,8 +4889,7 @@ static ssize_t f2fs_file_splice_read(struct file *in, loff_t *ppos, f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_BUFFERED_READ_IO, ret); - if (trace_f2fs_dataread_end_enabled()) - trace_f2fs_dataread_end(inode, pos, ret); + trace_f2fs_dataread_end(inode, pos, ret); return ret; } @@ -5216,8 +5234,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) f2fs_dio_write_iter(iocb, from, &may_need_sync) : f2fs_buffered_write_iter(iocb, from); - if (trace_f2fs_datawrite_end_enabled()) - trace_f2fs_datawrite_end(inode, orig_pos, ret); + trace_f2fs_datawrite_end(inode, orig_pos, ret); } /* Don't leave any preallocated blocks around past i_size. */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3cb5242f4ddf..098e9f71421e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -141,10 +141,10 @@ do_gc: FOREGROUND : BACKGROUND); sync_mode = (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) || - gc_control.one_time; + (gc_control.one_time && gc_th->boost_gc_greedy); /* foreground GC was been triggered via f2fs_balance_fs() */ - if (foreground) + if (foreground && !f2fs_sb_has_blkzoned(sbi)) sync_mode = false; gc_control.init_gc_type = sync_mode ? 
FG_GC : BG_GC; @@ -197,6 +197,8 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->valid_thresh_ratio = DEF_GC_THREAD_VALID_THRESH_RATIO; + gc_th->boost_gc_multiple = BOOST_GC_MULTIPLE; + gc_th->boost_gc_greedy = GC_GREEDY; if (f2fs_sb_has_blkzoned(sbi)) { gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME_ZONED; @@ -278,12 +280,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - if (p->alloc_mode == SSR) { - p->gc_mode = GC_GREEDY; - p->dirty_bitmap = dirty_i->dirty_segmap[type]; - p->max_search = dirty_i->nr_dirty[type]; - p->ofs_unit = 1; - } else if (p->alloc_mode == AT_SSR) { + if (p->alloc_mode == SSR || p->alloc_mode == AT_SSR) { p->gc_mode = GC_GREEDY; p->dirty_bitmap = dirty_i->dirty_segmap[type]; p->max_search = dirty_i->nr_dirty[type]; @@ -389,14 +386,15 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) } static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, - unsigned int segno, struct victim_sel_policy *p) + unsigned int segno, struct victim_sel_policy *p, + unsigned int valid_thresh_ratio) { if (p->alloc_mode == SSR) return get_seg_entry(sbi, segno)->ckpt_valid_blocks; - if (p->one_time_gc && (get_valid_blocks(sbi, segno, true) >= - CAP_BLKS_PER_SEC(sbi) * sbi->gc_thread->valid_thresh_ratio / - 100)) + if (p->one_time_gc && (valid_thresh_ratio < 100) && + (get_valid_blocks(sbi, segno, true) >= + CAP_BLKS_PER_SEC(sbi) * valid_thresh_ratio / 100)) return UINT_MAX; /* alloc_mode == LFS */ @@ -777,6 +775,7 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, unsigned int secno, last_victim; unsigned int last_segment; unsigned int nsearched; + unsigned int valid_thresh_ratio = 100; bool is_atgc; int ret = 0; @@ -786,7 +785,11 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, p.alloc_mode = alloc_mode; p.age = age; p.age_threshold = 
sbi->am.age_threshold; - p.one_time_gc = one_time; + if (one_time) { + p.one_time_gc = one_time; + if (has_enough_free_secs(sbi, 0, NR_PERSISTENT_LOG)) + valid_thresh_ratio = sbi->gc_thread->valid_thresh_ratio; + } retry: select_policy(sbi, gc_type, type, &p); @@ -912,7 +915,7 @@ retry: goto next; } - cost = get_gc_cost(sbi, segno, &p); + cost = get_gc_cost(sbi, segno, &p, valid_thresh_ratio); if (p.min_cost > cost) { p.min_segno = segno; @@ -1162,8 +1165,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } - if (IS_INODE(&node_folio->page)) { - base = offset_in_addr(F2FS_INODE(&node_folio->page)); + if (IS_INODE(node_folio)) { + base = offset_in_addr(F2FS_INODE(node_folio)); max_addrs = DEF_ADDRS_PER_INODE; } else { base = 0; @@ -1177,7 +1180,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } - *nofs = ofs_of_node(&node_folio->page); + *nofs = ofs_of_node(node_folio); source_blkaddr = data_blkaddr(NULL, node_folio, ofs_in_node); f2fs_folio_put(node_folio, true); @@ -1249,7 +1252,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) } got_it: /* read folio */ - fio.page = &folio->page; + fio.folio = folio; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; /* @@ -1353,7 +1356,7 @@ static int move_data_block(struct inode *inode, block_t bidx, goto put_out; /* read page */ - fio.page = &folio->page; + fio.folio = folio; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; if (lfs_mode) @@ -1473,7 +1476,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } folio_mark_dirty(folio); - set_page_private_gcing(&folio->page); + folio_set_f2fs_gcing(folio); } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -1483,7 +1486,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, .old_blkaddr = NULL_ADDR, - .page = &folio->page, + .folio = folio, .encrypted_page = 
NULL, .need_lock = LOCK_REQ, .io_type = FS_GC_DATA_IO, @@ -1499,11 +1502,11 @@ retry: f2fs_remove_dirty_inode(inode); } - set_page_private_gcing(&folio->page); + folio_set_f2fs_gcing(folio); err = f2fs_do_write_data_page(&fio); if (err) { - clear_page_private_gcing(&folio->page); + folio_clear_f2fs_gcing(folio); if (err == -ENOMEM) { memalloc_retry_wait(GFP_NOFS); goto retry; @@ -1749,7 +1752,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, !has_enough_free_blocks(sbi, sbi->gc_thread->boost_zoned_gc_percent)) window_granularity *= - BOOST_GC_MULTIPLE; + sbi->gc_thread->boost_gc_multiple; end_segno = start_segno + window_granularity; } @@ -1891,6 +1894,7 @@ gc_more: /* Let's run FG_GC, if we don't have enough space. */ if (has_not_enough_free_secs(sbi, 0, 0)) { gc_type = FG_GC; + gc_control->one_time = false; /* * For example, if there are many prefree_segments below given @@ -2064,7 +2068,7 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, segno))) + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) continue; do_garbage_collect(sbi, segno, &gc_list, FG_GC, true, false); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 5c1eaf55e127..24e8b1c27acc 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -68,6 +68,8 @@ struct f2fs_gc_kthread { unsigned int no_zoned_gc_percent; unsigned int boost_zoned_gc_percent; unsigned int valid_thresh_ratio; + unsigned int boost_gc_multiple; + unsigned int boost_gc_greedy; }; struct gc_inode_list { @@ -194,6 +196,7 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) static inline bool need_to_boost_gc(struct f2fs_sb_info *sbi) { if (f2fs_sb_has_blkzoned(sbi)) - return !has_enough_free_blocks(sbi, LIMIT_BOOST_ZONED_GC); + return !has_enough_free_blocks(sbi, + sbi->gc_thread->boost_zoned_gc_percent); return has_enough_invalid_blocks(sbi); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 
901c630685ce..58ac831ef704 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -33,9 +33,9 @@ bool f2fs_may_inline_data(struct inode *inode) return !f2fs_post_read_required(inode); } -static bool inode_has_blocks(struct inode *inode, struct page *ipage) +static bool inode_has_blocks(struct inode *inode, struct folio *ifolio) { - struct f2fs_inode *ri = F2FS_INODE(ipage); + struct f2fs_inode *ri = F2FS_INODE(ifolio); int i; if (F2FS_HAS_BLOCKS(inode)) @@ -48,12 +48,12 @@ static bool inode_has_blocks(struct inode *inode, struct page *ipage) return false; } -bool f2fs_sanity_check_inline_data(struct inode *inode, struct page *ipage) +bool f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio) { if (!f2fs_has_inline_data(inode)) return false; - if (inode_has_blocks(inode, ipage)) + if (inode_has_blocks(inode, ifolio)) return false; if (!support_inline_data(inode)) @@ -150,7 +150,7 @@ int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio) .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, - .page = &folio->page, + .folio = folio, .encrypted_page = NULL, .io_type = FS_DATA_IO, }; @@ -206,7 +206,7 @@ int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio) /* clear inline data and flag after data writeback */ f2fs_truncate_inline_inode(dn->inode, dn->inode_folio, 0); - clear_page_private_inline(&dn->inode_folio->page); + folio_clear_f2fs_inline(dn->inode_folio); clear_out: stat_dec_inline_inode(dn->inode); clear_inode_flag(dn->inode, FI_INLINE_DATA); @@ -286,7 +286,7 @@ int f2fs_write_inline_data(struct inode *inode, struct folio *folio) set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); - clear_page_private_inline(&ifolio->page); + folio_clear_f2fs_inline(ifolio); f2fs_folio_put(ifolio, 1); return 0; } @@ -305,8 +305,8 @@ int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio) * x o -> remove data blocks, and then recover inline_data * x x -> recover 
data blocks */ - if (IS_INODE(&nfolio->page)) - ri = F2FS_INODE(&nfolio->page); + if (IS_INODE(nfolio)) + ri = F2FS_INODE(nfolio); if (f2fs_has_inline_data(inode) && ri && (ri->i_inline & F2FS_INLINE_DATA)) { @@ -825,7 +825,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; byteaddr += (char *)inline_data_addr(inode, ifolio) - - (char *)F2FS_INODE(&ifolio->page); + (char *)F2FS_INODE(ifolio); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err); out: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 083d52a42bfb..8c4eafe9ffac 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -108,7 +108,7 @@ static void __recover_inline_status(struct inode *inode, struct folio *ifolio) f2fs_folio_wait_writeback(ifolio, NODE, true, true); set_inode_flag(inode, FI_DATA_EXIST); - set_raw_inline(inode, F2FS_INODE(&ifolio->page)); + set_raw_inline(inode, F2FS_INODE(ifolio)); folio_mark_dirty(ifolio); return; } @@ -116,14 +116,15 @@ static void __recover_inline_status(struct inode *inode, struct folio *ifolio) return; } -static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +static +bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio) { - struct f2fs_inode *ri = &F2FS_NODE(page)->i; + struct f2fs_inode *ri = &F2FS_NODE(folio)->i; if (!f2fs_sb_has_inode_chksum(sbi)) return false; - if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + if (!IS_INODE(folio) || !(ri->i_inline & F2FS_EXTRA_ATTR)) return false; if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize), @@ -133,9 +134,9 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page return true; } -static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio) { - struct f2fs_node *node = F2FS_NODE(page); + 
struct f2fs_node *node = F2FS_NODE(folio); struct f2fs_inode *ri = &node->i; __le32 ino = node->footer.ino; __le32 gen = ri->i_generation; @@ -164,34 +165,34 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio) return true; #ifdef CONFIG_F2FS_CHECK_FS - if (!f2fs_enable_inode_chksum(sbi, &folio->page)) + if (!f2fs_enable_inode_chksum(sbi, folio)) #else - if (!f2fs_enable_inode_chksum(sbi, &folio->page) || + if (!f2fs_enable_inode_chksum(sbi, folio) || folio_test_dirty(folio) || folio_test_writeback(folio)) #endif return true; - ri = &F2FS_NODE(&folio->page)->i; + ri = &F2FS_NODE(folio)->i; provided = le32_to_cpu(ri->i_inode_checksum); - calculated = f2fs_inode_chksum(sbi, &folio->page); + calculated = f2fs_inode_chksum(sbi, folio); if (provided != calculated) f2fs_warn(sbi, "checksum invalid, nid = %lu, ino_of_node = %x, %x vs. %x", - folio->index, ino_of_node(&folio->page), + folio->index, ino_of_node(folio), provided, calculated); return provided == calculated; } -void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio) { - struct f2fs_inode *ri = &F2FS_NODE(page)->i; + struct f2fs_inode *ri = &F2FS_NODE(folio)->i; - if (!f2fs_enable_inode_chksum(sbi, page)) + if (!f2fs_enable_inode_chksum(sbi, folio)) return; - ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, folio)); } static bool sanity_check_compress_inode(struct inode *inode, @@ -266,28 +267,28 @@ err_level: return false; } -static bool sanity_check_inode(struct inode *inode, struct page *node_page) +static bool sanity_check_inode(struct inode *inode, struct folio *node_folio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_inode *ri = F2FS_INODE(node_page); + struct f2fs_inode *ri = F2FS_INODE(node_folio); unsigned long long iblocks; - iblocks = 
le64_to_cpu(F2FS_INODE(node_page)->i_blocks); + iblocks = le64_to_cpu(F2FS_INODE(node_folio)->i_blocks); if (!iblocks) { f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, run fsck to fix.", __func__, inode->i_ino, iblocks); return false; } - if (ino_of_node(node_page) != nid_of_node(node_page)) { + if (ino_of_node(node_folio) != nid_of_node(node_folio)) { f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%lx, ino,nid: [%u, %u] run fsck to fix.", __func__, inode->i_ino, - ino_of_node(node_page), nid_of_node(node_page)); + ino_of_node(node_folio), nid_of_node(node_folio)); return false; } - if (ino_of_node(node_page) == fi->i_xattr_nid) { + if (ino_of_node(node_folio) == fi->i_xattr_nid) { f2fs_warn(sbi, "%s: corrupted inode i_ino=%lx, xnid=%x, run fsck to fix.", __func__, inode->i_ino, fi->i_xattr_nid); return false; @@ -354,7 +355,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } } - if (f2fs_sanity_check_inline_data(inode, node_page)) { + if (f2fs_sanity_check_inline_data(inode, node_folio)) { f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); return false; @@ -419,7 +420,7 @@ static int do_read_inode(struct inode *inode) if (IS_ERR(node_folio)) return PTR_ERR(node_folio); - ri = F2FS_INODE(&node_folio->page); + ri = F2FS_INODE(node_folio); inode->i_mode = le16_to_cpu(ri->i_mode); i_uid_write(inode, le32_to_cpu(ri->i_uid)); @@ -469,7 +470,7 @@ static int do_read_inode(struct inode *inode) fi->i_inline_xattr_size = 0; } - if (!sanity_check_inode(inode, &node_folio->page)) { + if (!sanity_check_inode(inode, node_folio)) { f2fs_folio_put(node_folio, true); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); @@ -481,9 +482,9 @@ static int do_read_inode(struct inode *inode) __recover_inline_status(inode, node_folio); /* try to recover cold bit for non-dir inode */ - if (!S_ISDIR(inode->i_mode) && 
!is_cold_node(&node_folio->page)) { + if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_folio)) { f2fs_folio_wait_writeback(node_folio, NODE, true, true); - set_cold_node(&node_folio->page, false); + set_cold_node(node_folio, false); folio_mark_dirty(node_folio); } @@ -531,7 +532,7 @@ static int do_read_inode(struct inode *inode) init_idisk_time(inode); - if (!sanity_check_extent_cache(inode, &node_folio->page)) { + if (!sanity_check_extent_cache(inode, node_folio)) { f2fs_folio_put(node_folio, true); f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; @@ -669,7 +670,7 @@ void f2fs_update_inode(struct inode *inode, struct folio *node_folio) f2fs_inode_synced(inode); - ri = F2FS_INODE(&node_folio->page); + ri = F2FS_INODE(node_folio); ri->i_mode = cpu_to_le16(inode->i_mode); ri->i_advise = fi->i_advise; @@ -748,11 +749,11 @@ void f2fs_update_inode(struct inode *inode, struct folio *node_folio) /* deleted inode */ if (inode->i_nlink == 0) - clear_page_private_inline(&node_folio->page); + folio_clear_f2fs_inline(node_folio); init_idisk_time(inode); #ifdef CONFIG_F2FS_CHECK_FS - f2fs_inode_chksum_set(F2FS_I_SB(inode), &node_folio->page); + f2fs_inode_chksum_set(F2FS_I_SB(inode), node_folio); #endif } @@ -820,7 +821,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) return 0; } -static void f2fs_remove_donate_inode(struct inode *inode) +void f2fs_remove_donate_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -933,6 +934,19 @@ retry: f2fs_update_inode_page(inode); if (dquot_initialize_needed(inode)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + + /* + * If both f2fs_truncate() and f2fs_update_inode_page() failed + * due to fuzzed corrupted inode, call f2fs_inode_synced() to + * avoid triggering later f2fs_bug_on(). 
+ */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + f2fs_warn(sbi, + "f2fs_evict_inode: inode is dirty, ino:%lu", + inode->i_ino); + f2fs_inode_synced(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } } if (freeze_protected) sb_end_intwrite(inode->i_sb); @@ -949,8 +963,12 @@ no_delete: if (likely(!f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); - else - f2fs_inode_synced(inode); + + /* + * anyway, it needs to remove the inode from sbi->inode_list[DIRTY_META] + * list to avoid UAF in f2fs_sync_inode_meta() during checkpoint. + */ + f2fs_inode_synced(inode); /* for the case f2fs_new_inode() was failed, .i_ino is zero, skip it */ if (inode->i_ino) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 07e333ee21b7..b882771e4699 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1298,19 +1298,19 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *page; + struct folio *folio; const char *target; if (!dentry) return ERR_PTR(-ECHILD); - page = read_mapping_page(inode->i_mapping, 0, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); + folio = read_mapping_folio(inode->i_mapping, 0, NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); - target = fscrypt_get_symlink(inode, page_address(page), + target = fscrypt_get_symlink(inode, folio_address(folio), inode->i_sb->s_blocksize, done); - put_page(page); + folio_put(folio); return target; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bfe104db284e..27743b93e186 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -135,7 +135,7 @@ static struct folio *get_current_nat_folio(struct f2fs_sb_info *sbi, nid_t nid) return f2fs_get_meta_folio_retry(sbi, current_nat_addr(sbi, nid)); } -static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +static struct folio *get_next_nat_folio(struct f2fs_sb_info *sbi, nid_t nid) { struct folio *src_folio; struct 
folio *dst_folio; @@ -149,7 +149,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) /* get current nat block page with lock */ src_folio = get_current_nat_folio(sbi, nid); if (IS_ERR(src_folio)) - return &src_folio->page; + return src_folio; dst_folio = f2fs_grab_meta_folio(sbi, dst_off); f2fs_bug_on(sbi, folio_test_dirty(src_folio)); @@ -161,7 +161,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) set_to_next_nat(nm_i, nid); - return &dst_folio->page; + return dst_folio; } static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi, @@ -185,7 +185,7 @@ static void __free_nat_entry(struct nat_entry *e) /* must be locked by nat_tree_lock */ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, - struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail) + struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail, bool init_dirty) { if (no_fail) f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne); @@ -195,6 +195,12 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, if (raw_ne) node_info_from_raw_nat(&ne->ni, raw_ne); + if (init_dirty) { + INIT_LIST_HEAD(&ne->list); + nm_i->nat_cnt[TOTAL_NAT]++; + return ne; + } + spin_lock(&nm_i->nat_list_lock); list_add_tail(&ne->list, &nm_i->nat_entries); spin_unlock(&nm_i->nat_list_lock); @@ -204,14 +210,17 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, return ne; } -static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) +static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n, bool for_dirty) { struct nat_entry *ne; ne = radix_tree_lookup(&nm_i->nat_root, n); - /* for recent accessed nat entry, move it to tail of lru list */ - if (ne && !get_nat_flag(ne, IS_DIRTY)) { + /* + * for recent accessed nat entry which will not be dirtied soon + * later, move it to tail of lru list. 
+ */ + if (ne && !get_nat_flag(ne, IS_DIRTY) && !for_dirty) { spin_lock(&nm_i->nat_list_lock); if (!list_empty(&ne->list)) list_move_tail(&ne->list, &nm_i->nat_entries); @@ -256,7 +265,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, } static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) + struct nat_entry *ne, bool init_dirty) { struct nat_entry_set *head; bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR; @@ -279,7 +288,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, goto refresh_list; nm_i->nat_cnt[DIRTY_NAT]++; - nm_i->nat_cnt[RECLAIMABLE_NAT]--; + if (!init_dirty) + nm_i->nat_cnt[RECLAIMABLE_NAT]--; set_nat_flag(ne, IS_DIRTY, true); refresh_list: spin_lock(&nm_i->nat_list_lock); @@ -312,8 +322,7 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct folio *folio) { - return is_node_folio(folio) && IS_DNODE(&folio->page) && - is_cold_node(&folio->page); + return is_node_folio(folio) && IS_DNODE(folio) && is_cold_node(folio); } void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) @@ -384,7 +393,7 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) bool need = false; f2fs_down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); + e = __lookup_nat_cache(nm_i, nid, false); if (e) { if (!get_nat_flag(e, IS_CHECKPOINTED) && !get_nat_flag(e, HAS_FSYNCED_INODE)) @@ -401,7 +410,7 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) bool is_cp = true; f2fs_down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); + e = __lookup_nat_cache(nm_i, nid, false); if (e && !get_nat_flag(e, IS_CHECKPOINTED)) is_cp = false; f2fs_up_read(&nm_i->nat_tree_lock); @@ -415,7 +424,7 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) bool need_update = true; f2fs_down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, ino); + e = 
__lookup_nat_cache(nm_i, ino, false); if (e && get_nat_flag(e, HAS_LAST_FSYNC) && (get_nat_flag(e, IS_CHECKPOINTED) || get_nat_flag(e, HAS_FSYNCED_INODE))) @@ -440,9 +449,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, return; f2fs_down_write(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); + e = __lookup_nat_cache(nm_i, nid, false); if (!e) - e = __init_nat_entry(nm_i, new, ne, false); + e = __init_nat_entry(nm_i, new, ne, false, false); else f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != @@ -459,11 +468,13 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true); + bool init_dirty = false; f2fs_down_write(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, ni->nid); + e = __lookup_nat_cache(nm_i, ni->nid, true); if (!e) { - e = __init_nat_entry(nm_i, new, NULL, true); + init_dirty = true; + e = __init_nat_entry(nm_i, new, NULL, true, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -499,11 +510,11 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, nat_set_blkaddr(e, new_blkaddr); if (!__is_valid_data_blkaddr(new_blkaddr)) set_nat_flag(e, IS_CHECKPOINTED, false); - __set_nat_cache_dirty(nm_i, e); + __set_nat_cache_dirty(nm_i, e, init_dirty); /* update fsync_mark if its inode nat entry is still alive */ if (ni->nid != ni->ino) - e = __lookup_nat_cache(nm_i, ni->ino); + e = __lookup_nat_cache(nm_i, ni->ino, false); if (e) { if (fsync_done && ni->nid == ni->ino) set_nat_flag(e, HAS_FSYNCED_INODE, true); @@ -555,20 +566,24 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nat_entry ne; struct nat_entry *e; pgoff_t index; - block_t blkaddr; int i; + bool need_cache = true; ni->flag = 0; ni->nid = nid; retry: /* Check nat cache */ 
f2fs_down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); + e = __lookup_nat_cache(nm_i, nid, false); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); f2fs_up_read(&nm_i->nat_tree_lock); + if (IS_ENABLED(CONFIG_F2FS_CHECK_FS)) { + need_cache = false; + goto sanity_check; + } return 0; } @@ -594,7 +609,7 @@ retry: up_read(&curseg->journal_rwsem); if (i >= 0) { f2fs_up_read(&nm_i->nat_tree_lock); - goto cache; + goto sanity_check; } /* Fill node_info from nat page */ @@ -609,14 +624,23 @@ retry: ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); f2fs_folio_put(folio, true); -cache: - blkaddr = le32_to_cpu(ne.block_addr); - if (__is_valid_data_blkaddr(blkaddr) && - !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) - return -EFAULT; +sanity_check: + if (__is_valid_data_blkaddr(ni->blk_addr) && + !f2fs_is_valid_blkaddr(sbi, ni->blk_addr, + DATA_GENERIC_ENHANCE)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_err_ratelimited(sbi, + "f2fs_get_node_info of %pS: inconsistent nat entry, " + "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u", + __builtin_return_address(0), + ni->ino, ni->nid, ni->blk_addr, ni->version, ni->flag); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); + return -EFSCORRUPTED; + } /* cache nat entry */ - cache_nat_entry(sbi, nid, &ne); + if (need_cache) + cache_nat_entry(sbi, nid, &ne); return 0; } @@ -636,7 +660,7 @@ static void f2fs_ra_node_pages(struct folio *parent, int start, int n) end = start + n; end = min(end, (int)NIDS_PER_BLOCK); for (i = start; i < end; i++) { - nid = get_nid(&parent->page, i, false); + nid = get_nid(parent, i, false); f2fs_ra_node_page(sbi, nid); } @@ -795,7 +819,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) parent = nfolio[0]; if (level != 0) - nids[1] = get_nid(&parent->page, offset[0], true); + nids[1] = get_nid(parent, offset[0], true); dn->inode_folio = nfolio[0]; 
dn->inode_folio_locked = true; @@ -803,6 +827,16 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) for (i = 1; i <= level; i++) { bool done = false; + if (nids[i] && nids[i] == dn->inode->i_ino) { + err = -EFSCORRUPTED; + f2fs_err_ratelimited(sbi, + "inode mapping table is corrupted, run fsck to fix it, " + "ino:%lu, nid:%u, level:%d, offset:%d", + dn->inode->i_ino, nids[i], level, offset[level]); + set_sbi_flag(sbi, SBI_NEED_FSCK); + goto release_pages; + } + if (!nids[i] && mode == ALLOC_NODE) { /* alloc new node */ if (!f2fs_alloc_nid(sbi, &(nids[i]))) { @@ -846,7 +880,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } if (i < level) { parent = nfolio[i]; - nids[i + 1] = get_nid(&parent->page, offset[i], false); + nids[i + 1] = get_nid(parent, offset[i], false); } } dn->nid = nids[level]; @@ -961,9 +995,9 @@ static int truncate_dnode(struct dnode_of_data *dn) else if (IS_ERR(folio)) return PTR_ERR(folio); - if (IS_INODE(&folio->page) || ino_of_node(&folio->page) != dn->inode->i_ino) { + if (IS_INODE(folio) || ino_of_node(folio) != dn->inode->i_ino) { f2fs_err(sbi, "incorrect node reference, ino: %lu, nid: %u, ino_of_node: %u", - dn->inode->i_ino, dn->nid, ino_of_node(&folio->page)); + dn->inode->i_ino, dn->nid, ino_of_node(folio)); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INVALID_NODE_REFERENCE); f2fs_folio_put(folio, true); @@ -1007,7 +1041,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, f2fs_ra_node_pages(folio, ofs, NIDS_PER_BLOCK); - rn = F2FS_NODE(&folio->page); + rn = F2FS_NODE(folio); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { child_nid = le32_to_cpu(rn->in.nid[i]); @@ -1070,7 +1104,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, int i; int idx = depth - 2; - nid[0] = get_nid(&dn->inode_folio->page, offset[0], true); + nid[0] = get_nid(dn->inode_folio, offset[0], true); if (!nid[0]) return 0; @@ 
-1083,14 +1117,14 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, idx = i - 1; goto fail; } - nid[i + 1] = get_nid(&folios[i]->page, offset[i + 1], false); + nid[i + 1] = get_nid(folios[i], offset[i + 1], false); } f2fs_ra_node_pages(folios[idx], offset[idx + 1], NIDS_PER_BLOCK); /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { - child_nid = get_nid(&folios[idx]->page, i, false); + child_nid = get_nid(folios[idx], i, false); if (!child_nid) continue; dn->nid = child_nid; @@ -1159,7 +1193,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) set_new_dnode(&dn, inode, folio, NULL, 0); folio_unlock(folio); - ri = F2FS_INODE(&folio->page); + ri = F2FS_INODE(folio); switch (level) { case 0: case 1: @@ -1188,7 +1222,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) skip_partial: while (cont) { - dn.nid = get_nid(&folio->page, offset[0], true); + dn.nid = get_nid(folio, offset[0], true); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -1220,7 +1254,7 @@ skip_partial: } if (err < 0) goto fail; - if (offset[1] == 0 && get_nid(&folio->page, offset[0], true)) { + if (offset[1] == 0 && get_nid(folio, offset[0], true)) { folio_lock(folio); BUG_ON(!is_node_folio(folio)); set_nid(folio, offset[0], 0, true); @@ -1367,8 +1401,8 @@ struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs) set_node_addr(sbi, &new_ni, NEW_ADDR, false); f2fs_folio_wait_writeback(folio, NODE, true, true); - fill_node_footer(&folio->page, dn->nid, dn->inode->i_ino, ofs, true); - set_cold_node(&folio->page, S_ISDIR(dn->inode->i_mode)); + fill_node_footer(folio, dn->nid, dn->inode->i_ino, ofs, true); + set_cold_node(folio, S_ISDIR(dn->inode->i_mode)); if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); if (folio_mark_dirty(folio)) @@ -1400,7 +1434,7 @@ static int read_node_folio(struct folio *folio, blk_opf_t op_flags) .type = NODE, .op = 
REQ_OP_READ, .op_flags = op_flags, - .page = &folio->page, + .folio = folio, .encrypted_page = NULL, }; int err; @@ -1462,17 +1496,15 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, struct folio *folio, pgoff_t nid, enum node_type ntype) { - struct page *page = &folio->page; - - if (unlikely(nid != nid_of_node(page) || - (ntype == NODE_TYPE_INODE && !IS_INODE(page)) || + if (unlikely(nid != nid_of_node(folio) || + (ntype == NODE_TYPE_INODE && !IS_INODE(folio)) || (ntype == NODE_TYPE_XATTR && - !f2fs_has_xattr_block(ofs_of_node(page))) || + !f2fs_has_xattr_block(ofs_of_node(folio))) || time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) { f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - ntype, nid, nid_of_node(page), ino_of_node(page), - ofs_of_node(page), cpver_of_node(page), + ntype, nid, nid_of_node(folio), ino_of_node(folio), + ofs_of_node(folio), cpver_of_node(folio), next_blkaddr_of_node(folio)); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); @@ -1553,7 +1585,7 @@ struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid) static struct folio *f2fs_get_node_folio_ra(struct folio *parent, int start) { struct f2fs_sb_info *sbi = F2FS_F_SB(parent); - nid_t nid = get_nid(&parent->page, start, false); + nid_t nid = get_nid(parent, start, false); return __get_node_folio(sbi, nid, parent, start, NODE_TYPE_REGULAR); } @@ -1618,9 +1650,9 @@ static struct folio *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) return ERR_PTR(-EIO); } - if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) + if (!IS_DNODE(folio) || !is_cold_node(folio)) continue; - if (ino_of_node(&folio->page) != ino) + if (ino_of_node(folio) != ino) continue; folio_lock(folio); @@ -1630,7 +1662,7 @@ continue_unlock: folio_unlock(folio); continue; } - if (ino_of_node(&folio->page) != ino) + if (ino_of_node(folio) != ino) goto continue_unlock; if 
(!folio_test_dirty(folio)) { @@ -1660,11 +1692,11 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, - .ino = ino_of_node(&folio->page), + .ino = ino_of_node(folio), .type = NODE, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), - .page = &folio->page, + .folio = folio, .encrypted_page = NULL, .submitted = 0, .io_type = io_type, @@ -1689,11 +1721,11 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && wbc->sync_mode == WB_SYNC_NONE && - IS_DNODE(&folio->page) && is_cold_node(&folio->page)) + IS_DNODE(folio) && is_cold_node(folio)) goto redirty_out; /* get old block addr of this node page */ - nid = nid_of_node(&folio->page); + nid = nid_of_node(folio); f2fs_bug_on(sbi, folio->index != nid); if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) @@ -1731,7 +1763,7 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted fio.old_blkaddr = ni.blk_addr; f2fs_do_write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(&folio->page)); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(folio)); dec_page_count(sbi, F2FS_DIRTY_NODES); f2fs_up_read(&sbi->node_write); @@ -1827,9 +1859,9 @@ retry: goto out; } - if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) + if (!IS_DNODE(folio) || !is_cold_node(folio)) continue; - if (ino_of_node(&folio->page) != ino) + if (ino_of_node(folio) != ino) continue; folio_lock(folio); @@ -1839,7 +1871,7 @@ continue_unlock: folio_unlock(folio); continue; } - if (ino_of_node(&folio->page) != ino) + if (ino_of_node(folio) != ino) goto continue_unlock; if (!folio_test_dirty(folio) && folio != last_folio) { @@ -1849,17 +1881,17 @@ continue_unlock: f2fs_folio_wait_writeback(folio, NODE, true, true); - set_fsync_mark(&folio->page, 0); - set_dentry_mark(&folio->page, 0); + set_fsync_mark(folio, 0); + 
set_dentry_mark(folio, 0); if (!atomic || folio == last_folio) { - set_fsync_mark(&folio->page, 1); + set_fsync_mark(folio, 1); percpu_counter_inc(&sbi->rf_node_block_count); - if (IS_INODE(&folio->page)) { + if (IS_INODE(folio)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) f2fs_update_inode(inode, folio); - set_dentry_mark(&folio->page, + set_dentry_mark(folio, f2fs_need_dentry_mark(sbi, ino)); } /* may be written by other thread */ @@ -1935,7 +1967,7 @@ static bool flush_dirty_inode(struct folio *folio) { struct f2fs_sb_info *sbi = F2FS_F_SB(folio); struct inode *inode; - nid_t ino = ino_of_node(&folio->page); + nid_t ino = ino_of_node(folio); inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL); if (!inode) @@ -1964,7 +1996,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - if (!IS_INODE(&folio->page)) + if (!IS_INODE(folio)) continue; folio_lock(folio); @@ -1975,10 +2007,10 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) goto unlock; /* flush inline_data, if it's async context. */ - if (page_private_inline(&folio->page)) { - clear_page_private_inline(&folio->page); + if (folio_test_f2fs_inline(folio)) { + folio_clear_f2fs_inline(folio); folio_unlock(folio); - flush_inline_data(sbi, ino_of_node(&folio->page)); + flush_inline_data(sbi, ino_of_node(folio)); continue; } unlock: @@ -2027,13 +2059,13 @@ next_step: * 1. dentry dnodes * 2. 
file dnodes */ - if (step == 0 && IS_DNODE(&folio->page)) + if (step == 0 && IS_DNODE(folio)) continue; - if (step == 1 && (!IS_DNODE(&folio->page) || - is_cold_node(&folio->page))) + if (step == 1 && (!IS_DNODE(folio) || + is_cold_node(folio))) continue; - if (step == 2 && (!IS_DNODE(&folio->page) || - !is_cold_node(&folio->page))) + if (step == 2 && (!IS_DNODE(folio) || + !is_cold_node(folio))) continue; lock_node: if (wbc->sync_mode == WB_SYNC_ALL) @@ -2057,15 +2089,15 @@ continue_unlock: goto write_node; /* flush inline_data */ - if (page_private_inline(&folio->page)) { - clear_page_private_inline(&folio->page); + if (folio_test_f2fs_inline(folio)) { + folio_clear_f2fs_inline(folio); folio_unlock(folio); - flush_inline_data(sbi, ino_of_node(&folio->page)); + flush_inline_data(sbi, ino_of_node(folio)); goto lock_node; } /* flush dirty inode */ - if (IS_INODE(&folio->page) && flush_dirty_inode(folio)) + if (IS_INODE(folio) && flush_dirty_inode(folio)) goto lock_node; write_node: f2fs_folio_wait_writeback(folio, NODE, true, true); @@ -2073,8 +2105,8 @@ write_node: if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - set_fsync_mark(&folio->page, 0); - set_dentry_mark(&folio->page, 0); + set_fsync_mark(folio, 0); + set_dentry_mark(folio, 0); if (!__write_node_folio(folio, false, &submitted, wbc, do_balance, io_type, NULL)) { @@ -2201,12 +2233,12 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping, if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); #ifdef CONFIG_F2FS_CHECK_FS - if (IS_INODE(&folio->page)) - f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page); + if (IS_INODE(folio)) + f2fs_inode_chksum_set(F2FS_M_SB(mapping), folio); #endif if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); - set_page_private_reference(&folio->page); + folio_set_f2fs_reference(folio); return true; } return false; @@ -2351,7 +2383,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, * - 
__remove_nid_from_list(PREALLOC_NID) * - __insert_nid_to_list(FREE_NID) */ - ne = __lookup_nat_cache(nm_i, nid); + ne = __lookup_nat_cache(nm_i, nid, false); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) goto err_out; @@ -2714,7 +2746,7 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio) if (IS_ERR(ifolio)) return PTR_ERR(ifolio); - ri = F2FS_INODE(&folio->page); + ri = F2FS_INODE(folio); if (ri->i_inline & F2FS_INLINE_XATTR) { if (!f2fs_has_inline_xattr(inode)) { set_inode_flag(inode, FI_INLINE_XATTR); @@ -2740,7 +2772,7 @@ update_inode: return 0; } -int f2fs_recover_xattr_data(struct inode *inode, struct page *page) +int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; @@ -2778,8 +2810,8 @@ recover_xnid: f2fs_update_inode_page(inode); /* 3: update and set xattr node page dirty */ - if (page) { - memcpy(F2FS_NODE(&xfolio->page), F2FS_NODE(page), + if (folio) { + memcpy(F2FS_NODE(xfolio), F2FS_NODE(folio), VALID_XATTR_BLOCK_SIZE); folio_mark_dirty(xfolio); } @@ -2788,10 +2820,10 @@ recover_xnid: return 0; } -int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio) { struct f2fs_inode *src, *dst; - nid_t ino = ino_of_node(page); + nid_t ino = ino_of_node(folio); struct node_info old_ni, new_ni; struct folio *ifolio; int err; @@ -2814,11 +2846,11 @@ retry: if (!folio_test_uptodate(ifolio)) folio_mark_uptodate(ifolio); - fill_node_footer(&ifolio->page, ino, ino, 0, true); - set_cold_node(&ifolio->page, false); + fill_node_footer(ifolio, ino, ino, 0, true); + set_cold_node(ifolio, false); - src = F2FS_INODE(page); - dst = F2FS_INODE(&ifolio->page); + src = F2FS_INODE(folio); + dst = F2FS_INODE(ifolio); memcpy(dst, src, offsetof(struct f2fs_inode, i_ext)); dst->i_size = 0; @@ -2884,7 +2916,7 @@ int 
f2fs_restore_node_summary(struct f2fs_sb_info *sbi, if (IS_ERR(folio)) return PTR_ERR(folio); - rn = F2FS_NODE(&folio->page); + rn = F2FS_NODE(folio); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; sum_entry->ofs_in_node = 0; @@ -2904,6 +2936,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; int i; + bool init_dirty; down_write(&curseg->journal_rwsem); for (i = 0; i < nats_in_cursum(journal); i++) { @@ -2914,12 +2947,15 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) if (f2fs_check_nid_range(sbi, nid)) continue; + init_dirty = false; + raw_ne = nat_in_journal(journal, i); - ne = __lookup_nat_cache(nm_i, nid); + ne = __lookup_nat_cache(nm_i, nid, true); if (!ne) { + init_dirty = true; ne = __alloc_nat_entry(sbi, nid, true); - __init_nat_entry(nm_i, ne, &raw_ne, true); + __init_nat_entry(nm_i, ne, &raw_ne, true, true); } /* @@ -2934,7 +2970,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) spin_unlock(&nm_i->nid_list_lock); } - __set_nat_cache_dirty(nm_i, ne); + __set_nat_cache_dirty(nm_i, ne, init_dirty); } update_nats_in_cursum(journal, -i); up_write(&curseg->journal_rwsem); @@ -2959,11 +2995,10 @@ add_out: } static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, - struct page *page) + const struct f2fs_nat_block *nat_blk) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; - struct f2fs_nat_block *nat_blk = page_address(page); int valid = 0; int i = 0; @@ -3000,7 +3035,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, bool to_journal = true; struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; - struct page *page = NULL; + struct folio *folio = NULL; /* * there are two steps to flush nat entries: @@ -3014,11 +3049,11 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { 
down_write(&curseg->journal_rwsem); } else { - page = get_next_nat_page(sbi, start_nid); - if (IS_ERR(page)) - return PTR_ERR(page); + folio = get_next_nat_folio(sbi, start_nid); + if (IS_ERR(folio)) + return PTR_ERR(folio); - nat_blk = page_address(page); + nat_blk = folio_address(folio); f2fs_bug_on(sbi, !nat_blk); } @@ -3054,8 +3089,8 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { up_write(&curseg->journal_rwsem); } else { - __update_nat_bits(sbi, start_nid, page); - f2fs_put_page(page, 1); + __update_nat_bits(sbi, start_nid, nat_blk); + f2fs_folio_put(folio, true); } /* Allow dirty nats by node block allocation in write_begin */ @@ -3395,10 +3430,10 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) } kvfree(nm_i->free_nid_count); - kvfree(nm_i->nat_bitmap); + kfree(nm_i->nat_bitmap); kvfree(nm_i->nat_bits); #ifdef CONFIG_F2FS_CHECK_FS - kvfree(nm_i->nat_bitmap_mir); + kfree(nm_i->nat_bitmap_mir); #endif sbi->nm_info = NULL; kfree(nm_i); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 1446c433b3ec..030390543b54 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -31,7 +31,7 @@ /* control total # of nats */ #define DEF_NAT_CACHE_THRESHOLD 100000 -/* control total # of node writes used for roll-fowrad recovery */ +/* control total # of node writes used for roll-forward recovery */ #define DEF_RF_NODE_BLOCKS 0 /* vector size for gang look-up from nat cache that consists of radix tree */ @@ -243,41 +243,41 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) #endif } -static inline nid_t ino_of_node(struct page *node_page) +static inline nid_t ino_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(node_page); + struct f2fs_node *rn = F2FS_NODE(node_folio); return le32_to_cpu(rn->footer.ino); } -static inline nid_t nid_of_node(struct page *node_page) +static inline nid_t nid_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(node_page); + 
struct f2fs_node *rn = F2FS_NODE(node_folio); return le32_to_cpu(rn->footer.nid); } -static inline unsigned int ofs_of_node(const struct page *node_page) +static inline unsigned int ofs_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(node_page); + struct f2fs_node *rn = F2FS_NODE(node_folio); unsigned flag = le32_to_cpu(rn->footer.flag); return flag >> OFFSET_BIT_SHIFT; } -static inline __u64 cpver_of_node(struct page *node_page) +static inline __u64 cpver_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(node_page); + struct f2fs_node *rn = F2FS_NODE(node_folio); return le64_to_cpu(rn->footer.cp_ver); } -static inline block_t next_blkaddr_of_node(struct folio *node_folio) +static inline block_t next_blkaddr_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(&node_folio->page); + struct f2fs_node *rn = F2FS_NODE(node_folio); return le32_to_cpu(rn->footer.next_blkaddr); } -static inline void fill_node_footer(struct page *page, nid_t nid, +static inline void fill_node_footer(const struct folio *folio, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { - struct f2fs_node *rn = F2FS_NODE(page); + struct f2fs_node *rn = F2FS_NODE(folio); unsigned int old_flag = 0; if (reset) @@ -293,17 +293,18 @@ static inline void fill_node_footer(struct page *page, nid_t nid, (old_flag & OFFSET_BIT_MASK)); } -static inline void copy_node_footer(struct page *dst, struct page *src) +static inline void copy_node_footer(const struct folio *dst, + const struct folio *src) { struct f2fs_node *src_rn = F2FS_NODE(src); struct f2fs_node *dst_rn = F2FS_NODE(dst); memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); } -static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) +static inline void fill_node_footer_blkaddr(struct folio *folio, block_t blkaddr) { - struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); - struct f2fs_node *rn = F2FS_NODE(page); + struct 
f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio)); + struct f2fs_node *rn = F2FS_NODE(folio); __u64 cp_ver = cur_cp_version(ckpt); if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) @@ -313,19 +314,19 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } -static inline bool is_recoverable_dnode(struct page *page) +static inline bool is_recoverable_dnode(const struct folio *folio) { - struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio)); __u64 cp_ver = cur_cp_version(ckpt); /* Don't care crc part, if fsck.f2fs sets it. */ if (__is_set_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG)) - return (cp_ver << 32) == (cpver_of_node(page) << 32); + return (cp_ver << 32) == (cpver_of_node(folio) << 32); if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) cp_ver |= (cur_cp_crc(ckpt) << 32); - return cp_ver == cpver_of_node(page); + return cp_ver == cpver_of_node(folio); } /* @@ -349,9 +350,9 @@ static inline bool is_recoverable_dnode(struct page *page) * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) * `- direct node */ -static inline bool IS_DNODE(const struct page *node_page) +static inline bool IS_DNODE(const struct folio *node_folio) { - unsigned int ofs = ofs_of_node(node_page); + unsigned int ofs = ofs_of_node(node_folio); if (f2fs_has_xattr_block(ofs)) return true; @@ -369,7 +370,7 @@ static inline bool IS_DNODE(const struct page *node_page) static inline int set_nid(struct folio *folio, int off, nid_t nid, bool i) { - struct f2fs_node *rn = F2FS_NODE(&folio->page); + struct f2fs_node *rn = F2FS_NODE(folio); f2fs_folio_wait_writeback(folio, NODE, true, true); @@ -380,9 +381,9 @@ static inline int set_nid(struct folio *folio, int off, nid_t nid, bool i) return folio_mark_dirty(folio); } -static inline nid_t get_nid(struct page *p, int off, bool i) +static inline nid_t get_nid(const struct folio *folio, int off, bool i) { - struct 
f2fs_node *rn = F2FS_NODE(p); + struct f2fs_node *rn = F2FS_NODE(folio); if (i) return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); @@ -396,19 +397,19 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold data pages in page cache */ -static inline int is_node(const struct page *page, int type) +static inline int is_node(const struct folio *folio, int type) { - struct f2fs_node *rn = F2FS_NODE(page); + struct f2fs_node *rn = F2FS_NODE(folio); return le32_to_cpu(rn->footer.flag) & BIT(type); } -#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) -#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) -#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) +#define is_cold_node(folio) is_node(folio, COLD_BIT_SHIFT) +#define is_fsync_dnode(folio) is_node(folio, FSYNC_BIT_SHIFT) +#define is_dent_dnode(folio) is_node(folio, DENT_BIT_SHIFT) -static inline void set_cold_node(struct page *page, bool is_dir) +static inline void set_cold_node(const struct folio *folio, bool is_dir) { - struct f2fs_node *rn = F2FS_NODE(page); + struct f2fs_node *rn = F2FS_NODE(folio); unsigned int flag = le32_to_cpu(rn->footer.flag); if (is_dir) @@ -418,9 +419,9 @@ static inline void set_cold_node(struct page *page, bool is_dir) rn->footer.flag = cpu_to_le32(flag); } -static inline void set_mark(struct page *page, int mark, int type) +static inline void set_mark(struct folio *folio, int mark, int type) { - struct f2fs_node *rn = F2FS_NODE(page); + struct f2fs_node *rn = F2FS_NODE(folio); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) flag |= BIT(type); @@ -429,8 +430,8 @@ static inline void set_mark(struct page *page, int mark, int type) rn->footer.flag = cpu_to_le32(flag); #ifdef CONFIG_F2FS_CHECK_FS - f2fs_inode_chksum_set(F2FS_P_SB(page), page); + f2fs_inode_chksum_set(F2FS_F_SB(folio), folio); #endif } -#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) -#define set_fsync_mark(page, mark) set_mark(page, mark, 
FSYNC_BIT_SHIFT) +#define set_dentry_mark(folio, mark) set_mark(folio, mark, DENT_BIT_SHIFT) +#define set_fsync_mark(folio, mark) set_mark(folio, mark, FSYNC_BIT_SHIFT) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 51ebed4e1521..4cb3a91801b4 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -157,10 +157,10 @@ static int init_recovered_filename(const struct inode *dir, return 0; } -static int recover_dentry(struct inode *inode, struct page *ipage, +static int recover_dentry(struct inode *inode, struct folio *ifolio, struct list_head *dir_list) { - struct f2fs_inode *raw_inode = F2FS_INODE(ipage); + struct f2fs_inode *raw_inode = F2FS_INODE(ifolio); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; struct f2fs_filename fname; @@ -233,14 +233,14 @@ out: else name = raw_inode->i_name; f2fs_notice(F2FS_I_SB(inode), "%s: ino = %x, name = %s, dir = %lx, err = %d", - __func__, ino_of_node(ipage), name, + __func__, ino_of_node(ifolio), name, IS_ERR(dir) ? 
0 : dir->i_ino, err); return err; } -static int recover_quota_data(struct inode *inode, struct page *page) +static int recover_quota_data(struct inode *inode, struct folio *folio) { - struct f2fs_inode *raw = F2FS_INODE(page); + struct f2fs_inode *raw = F2FS_INODE(folio); struct iattr attr; uid_t i_uid = le32_to_cpu(raw->i_uid); gid_t i_gid = le32_to_cpu(raw->i_gid); @@ -277,16 +277,16 @@ static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) clear_inode_flag(inode, FI_DATA_EXIST); } -static int recover_inode(struct inode *inode, struct page *page) +static int recover_inode(struct inode *inode, struct folio *folio) { - struct f2fs_inode *raw = F2FS_INODE(page); + struct f2fs_inode *raw = F2FS_INODE(folio); struct f2fs_inode_info *fi = F2FS_I(inode); char *name; int err; inode->i_mode = le16_to_cpu(raw->i_mode); - err = recover_quota_data(inode, page); + err = recover_quota_data(inode, folio); if (err) return err; @@ -333,10 +333,10 @@ static int recover_inode(struct inode *inode, struct page *page) if (file_enc_name(inode)) name = "<encrypted>"; else - name = F2FS_INODE(page)->i_name; + name = F2FS_INODE(folio)->i_name; f2fs_notice(F2FS_I_SB(inode), "recover_inode: ino = %x, name = %s, inline = %x", - ino_of_node(page), name, raw->i_inline); + ino_of_node(folio), name, raw->i_inline); return 0; } @@ -375,7 +375,7 @@ static int sanity_check_node_chain(struct f2fs_sb_info *sbi, block_t blkaddr, if (IS_ERR(folio)) return PTR_ERR(folio); - if (!is_recoverable_dnode(&folio->page)) { + if (!is_recoverable_dnode(folio)) { f2fs_folio_put(folio, true); *is_detecting = false; return 0; @@ -424,22 +424,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, break; } - if (!is_recoverable_dnode(&folio->page)) { + if (!is_recoverable_dnode(folio)) { f2fs_folio_put(folio, true); break; } - if (!is_fsync_dnode(&folio->page)) + if (!is_fsync_dnode(folio)) goto next; - entry = get_fsync_inode(head, ino_of_node(&folio->page)); + 
entry = get_fsync_inode(head, ino_of_node(folio)); if (!entry) { bool quota_inode = false; if (!check_only && - IS_INODE(&folio->page) && - is_dent_dnode(&folio->page)) { - err = f2fs_recover_inode_page(sbi, &folio->page); + IS_INODE(folio) && + is_dent_dnode(folio)) { + err = f2fs_recover_inode_page(sbi, folio); if (err) { f2fs_folio_put(folio, true); break; @@ -451,7 +451,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry = add_fsync_inode(sbi, head, ino_of_node(&folio->page), + entry = add_fsync_inode(sbi, head, ino_of_node(folio), quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); @@ -463,7 +463,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, } entry->blkaddr = blkaddr; - if (IS_INODE(&folio->page) && is_dent_dnode(&folio->page)) + if (IS_INODE(folio) && is_dent_dnode(folio)) entry->last_dentry = blkaddr; next: /* check next segment */ @@ -527,7 +527,7 @@ got_it: nid = le32_to_cpu(sum.nid); ofs_in_node = le16_to_cpu(sum.ofs_in_node); - max_addrs = ADDRS_PER_PAGE(&dn->node_folio->page, dn->inode); + max_addrs = ADDRS_PER_PAGE(dn->node_folio, dn->inode); if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", ofs_in_node, dn->inode->i_ino, nid, max_addrs); @@ -552,8 +552,8 @@ got_it: if (IS_ERR(node_folio)) return PTR_ERR(node_folio); - offset = ofs_of_node(&node_folio->page); - ino = ino_of_node(&node_folio->page); + offset = ofs_of_node(node_folio); + ino = ino_of_node(node_folio); f2fs_folio_put(node_folio, true); if (ino != dn->inode->i_ino) { @@ -624,16 +624,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, { struct dnode_of_data dn; struct node_info ni; - unsigned int start, end; + unsigned int start = 0, end = 0, index; int err = 0, recovered = 0; /* step 1: recover xattr */ - if (IS_INODE(&folio->page)) { + if 
(IS_INODE(folio)) { err = f2fs_recover_inline_xattr(inode, folio); if (err) goto out; - } else if (f2fs_has_xattr_block(ofs_of_node(&folio->page))) { - err = f2fs_recover_xattr_data(inode, &folio->page); + } else if (f2fs_has_xattr_block(ofs_of_node(folio))) { + err = f2fs_recover_xattr_data(inode, folio); if (!err) recovered++; goto out; @@ -648,8 +648,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } /* step 3: recover data indices */ - start = f2fs_start_bidx_of_node(ofs_of_node(&folio->page), inode); - end = start + ADDRS_PER_PAGE(&folio->page, inode); + start = f2fs_start_bidx_of_node(ofs_of_node(folio), inode); + end = start + ADDRS_PER_PAGE(folio, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); retry_dn: @@ -668,18 +668,18 @@ retry_dn: if (err) goto err; - f2fs_bug_on(sbi, ni.ino != ino_of_node(&folio->page)); + f2fs_bug_on(sbi, ni.ino != ino_of_node(folio)); - if (ofs_of_node(&dn.node_folio->page) != ofs_of_node(&folio->page)) { + if (ofs_of_node(dn.node_folio) != ofs_of_node(folio)) { f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u", - inode->i_ino, ofs_of_node(&dn.node_folio->page), - ofs_of_node(&folio->page)); + inode->i_ino, ofs_of_node(dn.node_folio), + ofs_of_node(folio)); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); goto err; } - for (; start < end; start++, dn.ofs_in_node++) { + for (index = start; index < end; index++, dn.ofs_in_node++) { block_t src, dest; src = f2fs_data_blkaddr(&dn); @@ -708,9 +708,9 @@ retry_dn: } if (!file_keep_isize(inode) && - (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT))) + (i_size_read(inode) <= ((loff_t)index << PAGE_SHIFT))) f2fs_i_size_write(inode, - (loff_t)(start + 1) << PAGE_SHIFT); + (loff_t)(index + 1) << PAGE_SHIFT); /* * dest is reserved block, invalidate src block @@ -758,16 +758,18 @@ retry_prev: } } - copy_node_footer(&dn.node_folio->page, &folio->page); - fill_node_footer(&dn.node_folio->page, dn.nid, ni.ino, - 
ofs_of_node(&folio->page), false); + copy_node_footer(dn.node_folio, folio); + fill_node_footer(dn.node_folio, dn.nid, ni.ino, + ofs_of_node(folio), false); folio_mark_dirty(dn.node_folio); err: f2fs_put_dnode(&dn); out: - f2fs_notice(sbi, "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", - inode->i_ino, file_keep_isize(inode) ? "keep" : "recover", - recovered, err); + f2fs_notice(sbi, "recover_data: ino = %lx, nid = %x (i_size: %s), " + "range (%u, %u), recovered = %d, err = %d", + inode->i_ino, nid_of_node(folio), + file_keep_isize(inode) ? "keep" : "recover", + start, end, recovered, err); return err; } @@ -778,6 +780,14 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, int err = 0; block_t blkaddr; unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; + unsigned int recoverable_dnode = 0; + unsigned int fsynced_dnode = 0; + unsigned int total_dnode = 0; + unsigned int recovered_inode = 0; + unsigned int recovered_dentry = 0; + unsigned int recovered_dnode = 0; + + f2fs_notice(sbi, "do_recover_data: start to recover dnode"); /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); @@ -796,38 +806,43 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, break; } - if (!is_recoverable_dnode(&folio->page)) { + if (!is_recoverable_dnode(folio)) { f2fs_folio_put(folio, true); break; } + recoverable_dnode++; - entry = get_fsync_inode(inode_list, ino_of_node(&folio->page)); + entry = get_fsync_inode(inode_list, ino_of_node(folio)); if (!entry) goto next; + fsynced_dnode++; /* * inode(x) | CP | inode(x) | dnode(F) * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. 
*/ - if (IS_INODE(&folio->page)) { - err = recover_inode(entry->inode, &folio->page); + if (IS_INODE(folio)) { + err = recover_inode(entry->inode, folio); if (err) { f2fs_folio_put(folio, true); break; } + recovered_inode++; } if (entry->last_dentry == blkaddr) { - err = recover_dentry(entry->inode, &folio->page, dir_list); + err = recover_dentry(entry->inode, folio, dir_list); if (err) { f2fs_folio_put(folio, true); break; } + recovered_dentry++; } err = do_recover_data(sbi, entry->inode, folio); if (err) { f2fs_folio_put(folio, true); break; } + recovered_dnode++; if (entry->blkaddr == blkaddr) list_move_tail(&entry->list, tmp_inode_list); @@ -840,9 +855,15 @@ next: f2fs_folio_put(folio, true); f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); + total_dnode++; } if (!err) err = f2fs_allocate_new_segments(sbi); + + f2fs_notice(sbi, "do_recover_data: dnode: (recoverable: %u, fsynced: %u, " + "total: %u), recovered: (inode: %u, dentry: %u, dnode: %u), err: %d", + recoverable_dnode, fsynced_dnode, total_dnode, recovered_inode, + recovered_dentry, recovered_dnode, err); return err; } @@ -855,6 +876,9 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + f2fs_notice(sbi, "f2fs_recover_fsync_data: recovery fsync data, " + "check_only: %d", check_only); + if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) f2fs_info(sbi, "recover fsync data on readonly fs"); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ae1223ef648f..cc82d42ef14c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -334,7 +334,7 @@ static int __f2fs_commit_atomic_write(struct inode *inode) goto next; } - blen = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, cow_inode), + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, cow_inode), len); index = off; for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { @@ -455,7 +455,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) } else { struct 
f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, - .init_gc_type = BG_GC, + .init_gc_type = f2fs_sb_has_blkzoned(sbi) ? + FG_GC : BG_GC, .no_bg_gc = true, .should_migrate_blocks = false, .err_gc_skipped = false, @@ -772,7 +773,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); /* need not be added */ - if (IS_CURSEG(sbi, segno)) + if (is_curseg(sbi, segno)) return; if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) @@ -799,7 +800,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, !valid_blocks) || valid_blocks == CAP_BLKS_PER_SEC(sbi)); - if (!IS_CURSEC(sbi, secno)) + if (!is_cursec(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } @@ -838,7 +839,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, return; } - if (!IS_CURSEC(sbi, secno)) + if (!is_cursec(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } @@ -855,7 +856,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) unsigned short valid_blocks, ckpt_valid_blocks; unsigned int usable_blocks; - if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) + if (segno == NULL_SEGNO || is_curseg(sbi, segno)) return; usable_blocks = f2fs_usable_blks_in_seg(sbi, segno); @@ -888,7 +889,7 @@ void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi) for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; - if (IS_CURSEG(sbi, segno)) + if (is_curseg(sbi, segno)) continue; __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); @@ -2107,7 +2108,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (!force) { if (!f2fs_realtime_discard_enable(sbi) || (!se->valid_blocks && - !IS_CURSEG(sbi, cpc->trim_start)) || + !is_curseg(sbi, cpc->trim_start)) || SM_I(sbi)->dcc_info->nr_discards >= 
SM_I(sbi)->dcc_info->max_discards) return false; @@ -2235,7 +2236,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, next: secno = GET_SEC_FROM_SEG(sbi, start); start_segno = GET_SEG_FROM_SEC(sbi, secno); - if (!IS_CURSEC(sbi, secno) && + if (!is_cursec(sbi, secno) && !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), BLKS_PER_SEC(sbi)); @@ -3619,7 +3620,7 @@ static int __get_segment_type_4(struct f2fs_io_info *fio) else return CURSEG_COLD_DATA; } else { - if (IS_DNODE(fio->page) && is_cold_node(fio->page)) + if (IS_DNODE(fio->folio) && is_cold_node(fio->folio)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; @@ -3665,8 +3666,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; - type = __get_age_segment_type(inode, - page_folio(fio->page)->index); + type = __get_age_segment_type(inode, fio->folio->index); if (type != NO_CHECK_TYPE) return type; @@ -3677,8 +3677,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); } else { - if (IS_DNODE(fio->page)) - return is_cold_node(fio->page) ? CURSEG_WARM_NODE : + if (IS_DNODE(fio->folio)) + return is_cold_node(fio->folio) ? 
CURSEG_WARM_NODE : CURSEG_HOT_NODE; return CURSEG_COLD_NODE; } @@ -3746,7 +3746,7 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, get_random_u32_inclusive(1, sbi->max_fragment_hole); } -int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio) @@ -3850,10 +3850,10 @@ skip_new_segment: up_write(&sit_i->sentry_lock); - if (page && IS_NODESEG(curseg->seg_type)) { - fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + if (folio && IS_NODESEG(curseg->seg_type)) { + fill_node_footer_blkaddr(folio, NEXT_FREE_BLKADDR(sbi, curseg)); - f2fs_inode_chksum_set(sbi, page); + f2fs_inode_chksum_set(sbi, folio); } if (fio) { @@ -3931,7 +3931,7 @@ static int log_type_to_seg_type(enum log_type type) static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - struct folio *folio = page_folio(fio->page); + struct folio *folio = fio->folio; enum log_type type = __get_segment_type(fio); int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && @@ -3940,15 +3940,21 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); - if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + if (f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio)) { if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); folio_end_writeback(folio); if (f2fs_in_warm_node_list(fio->sbi, folio)) f2fs_del_fsync_node_entry(fio->sbi, folio); + f2fs_bug_on(fio->sbi, !is_set_ckpt_flags(fio->sbi, + CP_ERROR_FLAG)); goto out; } + + f2fs_bug_on(fio->sbi, !f2fs_is_valid_blkaddr_raw(fio->sbi, + fio->new_blkaddr, DATA_GENERIC_ENHANCE)); + if (GET_SEGNO(fio->sbi, 
fio->old_blkaddr) != NULL_SEGNO) f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr, 1); @@ -3972,7 +3978,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio, .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = folio->index, .new_blkaddr = folio->index, - .page = folio_page(folio, 0), + .folio = folio, .encrypted_page = NULL, .in_list = 0, }; @@ -4100,14 +4106,14 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!recover_curseg) { /* for recovery flow */ - if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { + if (se->valid_blocks == 0 && !is_curseg(sbi, segno)) { if (old_blkaddr == NULL_ADDR) type = CURSEG_COLD_DATA; else type = CURSEG_WARM_DATA; } } else { - if (IS_CURSEG(sbi, segno)) { + if (is_curseg(sbi, segno)) { /* se->type is volatile as SSR allocation */ type = __f2fs_get_curseg(sbi, segno); f2fs_bug_on(sbi, type == NO_CHECK_TYPE); @@ -4191,7 +4197,7 @@ void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, struct f2fs_sb_info *sbi = F2FS_F_SB(folio); /* submit cached LFS IO */ - f2fs_submit_merged_write_cond(sbi, NULL, &folio->page, 0, type); + f2fs_submit_merged_write_cond(sbi, NULL, folio, 0, type); /* submit cached IPU IO */ f2fs_submit_merged_ipu_write(sbi, NULL, folio); if (ordered) { @@ -5143,7 +5149,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) continue; - if (IS_CURSEC(sbi, secno)) + if (is_cursec(sbi, secno)) continue; set_bit(secno, dirty_i->dirty_secmap); } @@ -5279,7 +5285,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, * Get # of valid block of the zone. 
*/ valid_block_cnt = get_valid_blocks(sbi, zone_segno, true); - if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) { + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) { f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]", zone_segno, valid_block_cnt, blk_zone_cond_str(zone->cond)); @@ -5806,9 +5812,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) kvfree(sit_i->dirty_sentries_bitmap); SM_I(sbi)->sit_info = NULL; - kvfree(sit_i->sit_bitmap); + kfree(sit_i->sit_bitmap); #ifdef CONFIG_F2FS_CHECK_FS - kvfree(sit_i->sit_bitmap_mir); + kfree(sit_i->sit_bitmap_mir); kvfree(sit_i->invalid_segmap); #endif kfree(sit_i); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index db619fd2f51a..5e2ee5c686b1 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -34,34 +34,6 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); } -#define IS_CURSEG(sbi, seg) \ - (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno)) - -#define IS_CURSEC(sbi, secno) \ - (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, 
CURSEG_COLD_DATA_PINNED)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \ - SEGS_PER_SEC(sbi))) - #define MAIN_BLKADDR(sbi) \ (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr)) @@ -318,6 +290,28 @@ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); } +static inline bool is_curseg(struct f2fs_sb_info *sbi, unsigned int segno) +{ + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (segno == CURSEG_I(sbi, i)->segno) + return true; + } + return false; +} + +static inline bool is_cursec(struct f2fs_sb_info *sbi, unsigned int secno) +{ + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (secno == GET_SEC_FROM_SEG(sbi, CURSEG_I(sbi, i)->segno)) + return true; + } + return false; +} + static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi, unsigned int segno) { @@ -509,7 +503,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, free_i->free_segments++; - if (!inmem && IS_CURSEC(sbi, secno)) + if (!inmem && is_cursec(sbi, secno)) goto unlock_out; /* check large section */ @@ -674,8 +668,7 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi, unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); unsigned int data_blocks = 0; - if (f2fs_lfs_mode(sbi) && - unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (f2fs_lfs_mode(sbi)) { total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA); data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi); data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi); @@ -684,7 +677,7 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi, if (lower_p) *lower_p = node_secs + dent_secs + data_secs; if (upper_p) - *upper_p = node_secs + dent_secs + + *upper_p = node_secs + dent_secs + data_secs + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) + (data_blocks ? 
1 : 0); if (curseg_p) @@ -986,7 +979,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { - if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) + if (is_cursec(sbi, secno) || (sbi->cur_victim_sec == secno)) return true; return false; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bbf1dad6843f..e16c4e2830c2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -27,6 +27,8 @@ #include <linux/part_stat.h> #include <linux/zstd.h> #include <linux/lz4.h> +#include <linux/ctype.h> +#include <linux/fs_parser.h> #include "f2fs.h" #include "node.h" @@ -125,29 +127,20 @@ enum { Opt_disable_roll_forward, Opt_norecovery, Opt_discard, - Opt_nodiscard, Opt_noheap, Opt_heap, Opt_user_xattr, - Opt_nouser_xattr, Opt_acl, - Opt_noacl, Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, - Opt_noinline_xattr, Opt_inline_xattr_size, Opt_inline_data, Opt_inline_dentry, - Opt_noinline_dentry, Opt_flush_merge, - Opt_noflush_merge, Opt_barrier, - Opt_nobarrier, Opt_fastboot, Opt_extent_cache, - Opt_noextent_cache, - Opt_noinline_data, Opt_data_flush, Opt_reserve_root, Opt_resgid, @@ -156,21 +149,13 @@ enum { Opt_fault_injection, Opt_fault_type, Opt_lazytime, - Opt_nolazytime, Opt_quota, - Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_usrjquota, Opt_grpjquota, Opt_prjjquota, - Opt_offusrjquota, - Opt_offgrpjquota, - Opt_offprjjquota, - Opt_jqfmt_vfsold, - Opt_jqfmt_vfsv0, - Opt_jqfmt_vfsv1, Opt_alloc, Opt_fsync, Opt_test_dummy_encryption, @@ -180,107 +165,209 @@ enum { Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, Opt_checkpoint_merge, - Opt_nocheckpoint_merge, Opt_compress_algorithm, Opt_compress_log_size, - Opt_compress_extension, Opt_nocompress_extension, + Opt_compress_extension, Opt_compress_chksum, Opt_compress_mode, Opt_compress_cache, Opt_atgc, Opt_gc_merge, - Opt_nogc_merge, Opt_discard_unit, Opt_memory_mode, 
Opt_age_extent_cache, Opt_errors, Opt_nat_bits, + Opt_jqfmt, + Opt_checkpoint, Opt_err, }; -static match_table_t f2fs_tokens = { - {Opt_gc_background, "background_gc=%s"}, - {Opt_disable_roll_forward, "disable_roll_forward"}, - {Opt_norecovery, "norecovery"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_noheap, "no_heap"}, - {Opt_heap, "heap"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_active_logs, "active_logs=%u"}, - {Opt_disable_ext_identify, "disable_ext_identify"}, - {Opt_inline_xattr, "inline_xattr"}, - {Opt_noinline_xattr, "noinline_xattr"}, - {Opt_inline_xattr_size, "inline_xattr_size=%u"}, - {Opt_inline_data, "inline_data"}, - {Opt_inline_dentry, "inline_dentry"}, - {Opt_noinline_dentry, "noinline_dentry"}, - {Opt_flush_merge, "flush_merge"}, - {Opt_noflush_merge, "noflush_merge"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_fastboot, "fastboot"}, - {Opt_extent_cache, "extent_cache"}, - {Opt_noextent_cache, "noextent_cache"}, - {Opt_noinline_data, "noinline_data"}, - {Opt_data_flush, "data_flush"}, - {Opt_reserve_root, "reserve_root=%u"}, - {Opt_resgid, "resgid=%u"}, - {Opt_resuid, "resuid=%u"}, - {Opt_mode, "mode=%s"}, - {Opt_fault_injection, "fault_injection=%u"}, - {Opt_fault_type, "fault_type=%u"}, - {Opt_lazytime, "lazytime"}, - {Opt_nolazytime, "nolazytime"}, - {Opt_quota, "quota"}, - {Opt_noquota, "noquota"}, - {Opt_usrquota, "usrquota"}, - {Opt_grpquota, "grpquota"}, - {Opt_prjquota, "prjquota"}, - {Opt_usrjquota, "usrjquota=%s"}, - {Opt_grpjquota, "grpjquota=%s"}, - {Opt_prjjquota, "prjjquota=%s"}, - {Opt_offusrjquota, "usrjquota="}, - {Opt_offgrpjquota, "grpjquota="}, - {Opt_offprjjquota, "prjjquota="}, - {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, - {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, - {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_alloc, "alloc_mode=%s"}, - {Opt_fsync, "fsync_mode=%s"}, - {Opt_test_dummy_encryption, 
"test_dummy_encryption=%s"}, - {Opt_test_dummy_encryption, "test_dummy_encryption"}, - {Opt_inlinecrypt, "inlinecrypt"}, - {Opt_checkpoint_disable, "checkpoint=disable"}, - {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, - {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, - {Opt_checkpoint_enable, "checkpoint=enable"}, - {Opt_checkpoint_merge, "checkpoint_merge"}, - {Opt_nocheckpoint_merge, "nocheckpoint_merge"}, - {Opt_compress_algorithm, "compress_algorithm=%s"}, - {Opt_compress_log_size, "compress_log_size=%u"}, - {Opt_compress_extension, "compress_extension=%s"}, - {Opt_nocompress_extension, "nocompress_extension=%s"}, - {Opt_compress_chksum, "compress_chksum"}, - {Opt_compress_mode, "compress_mode=%s"}, - {Opt_compress_cache, "compress_cache"}, - {Opt_atgc, "atgc"}, - {Opt_gc_merge, "gc_merge"}, - {Opt_nogc_merge, "nogc_merge"}, - {Opt_discard_unit, "discard_unit=%s"}, - {Opt_memory_mode, "memory=%s"}, - {Opt_age_extent_cache, "age_extent_cache"}, - {Opt_errors, "errors=%s"}, - {Opt_nat_bits, "nat_bits"}, +static const struct constant_table f2fs_param_background_gc[] = { + {"on", BGGC_MODE_ON}, + {"off", BGGC_MODE_OFF}, + {"sync", BGGC_MODE_SYNC}, + {} +}; + +static const struct constant_table f2fs_param_mode[] = { + {"adaptive", FS_MODE_ADAPTIVE}, + {"lfs", FS_MODE_LFS}, + {"fragment:segment", FS_MODE_FRAGMENT_SEG}, + {"fragment:block", FS_MODE_FRAGMENT_BLK}, + {} +}; + +static const struct constant_table f2fs_param_jqfmt[] = { + {"vfsold", QFMT_VFS_OLD}, + {"vfsv0", QFMT_VFS_V0}, + {"vfsv1", QFMT_VFS_V1}, + {} +}; + +static const struct constant_table f2fs_param_alloc_mode[] = { + {"default", ALLOC_MODE_DEFAULT}, + {"reuse", ALLOC_MODE_REUSE}, + {} +}; +static const struct constant_table f2fs_param_fsync_mode[] = { + {"posix", FSYNC_MODE_POSIX}, + {"strict", FSYNC_MODE_STRICT}, + {"nobarrier", FSYNC_MODE_NOBARRIER}, + {} +}; + +static const struct constant_table f2fs_param_compress_mode[] = { + {"fs", COMPR_MODE_FS}, + {"user", 
COMPR_MODE_USER}, + {} +}; + +static const struct constant_table f2fs_param_discard_unit[] = { + {"block", DISCARD_UNIT_BLOCK}, + {"segment", DISCARD_UNIT_SEGMENT}, + {"section", DISCARD_UNIT_SECTION}, + {} +}; + +static const struct constant_table f2fs_param_memory_mode[] = { + {"normal", MEMORY_MODE_NORMAL}, + {"low", MEMORY_MODE_LOW}, + {} +}; + +static const struct constant_table f2fs_param_errors[] = { + {"remount-ro", MOUNT_ERRORS_READONLY}, + {"continue", MOUNT_ERRORS_CONTINUE}, + {"panic", MOUNT_ERRORS_PANIC}, + {} +}; + +static const struct fs_parameter_spec f2fs_param_specs[] = { + fsparam_enum("background_gc", Opt_gc_background, f2fs_param_background_gc), + fsparam_flag("disable_roll_forward", Opt_disable_roll_forward), + fsparam_flag("norecovery", Opt_norecovery), + fsparam_flag_no("discard", Opt_discard), + fsparam_flag("no_heap", Opt_noheap), + fsparam_flag("heap", Opt_heap), + fsparam_flag_no("user_xattr", Opt_user_xattr), + fsparam_flag_no("acl", Opt_acl), + fsparam_s32("active_logs", Opt_active_logs), + fsparam_flag("disable_ext_identify", Opt_disable_ext_identify), + fsparam_flag_no("inline_xattr", Opt_inline_xattr), + fsparam_s32("inline_xattr_size", Opt_inline_xattr_size), + fsparam_flag_no("inline_data", Opt_inline_data), + fsparam_flag_no("inline_dentry", Opt_inline_dentry), + fsparam_flag_no("flush_merge", Opt_flush_merge), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag("fastboot", Opt_fastboot), + fsparam_flag_no("extent_cache", Opt_extent_cache), + fsparam_flag("data_flush", Opt_data_flush), + fsparam_u32("reserve_root", Opt_reserve_root), + fsparam_gid("resgid", Opt_resgid), + fsparam_uid("resuid", Opt_resuid), + fsparam_enum("mode", Opt_mode, f2fs_param_mode), + fsparam_s32("fault_injection", Opt_fault_injection), + fsparam_u32("fault_type", Opt_fault_type), + fsparam_flag_no("lazytime", Opt_lazytime), + fsparam_flag_no("quota", Opt_quota), + fsparam_flag("usrquota", Opt_usrquota), + fsparam_flag("grpquota", Opt_grpquota), + 
fsparam_flag("prjquota", Opt_prjquota), + fsparam_string_empty("usrjquota", Opt_usrjquota), + fsparam_string_empty("grpjquota", Opt_grpjquota), + fsparam_string_empty("prjjquota", Opt_prjjquota), + fsparam_flag("nat_bits", Opt_nat_bits), + fsparam_enum("jqfmt", Opt_jqfmt, f2fs_param_jqfmt), + fsparam_enum("alloc_mode", Opt_alloc, f2fs_param_alloc_mode), + fsparam_enum("fsync_mode", Opt_fsync, f2fs_param_fsync_mode), + fsparam_string("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_flag("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_flag("inlinecrypt", Opt_inlinecrypt), + fsparam_string("checkpoint", Opt_checkpoint), + fsparam_flag_no("checkpoint_merge", Opt_checkpoint_merge), + fsparam_string("compress_algorithm", Opt_compress_algorithm), + fsparam_u32("compress_log_size", Opt_compress_log_size), + fsparam_string("compress_extension", Opt_compress_extension), + fsparam_string("nocompress_extension", Opt_nocompress_extension), + fsparam_flag("compress_chksum", Opt_compress_chksum), + fsparam_enum("compress_mode", Opt_compress_mode, f2fs_param_compress_mode), + fsparam_flag("compress_cache", Opt_compress_cache), + fsparam_flag("atgc", Opt_atgc), + fsparam_flag_no("gc_merge", Opt_gc_merge), + fsparam_enum("discard_unit", Opt_discard_unit, f2fs_param_discard_unit), + fsparam_enum("memory", Opt_memory_mode, f2fs_param_memory_mode), + fsparam_flag("age_extent_cache", Opt_age_extent_cache), + fsparam_enum("errors", Opt_errors, f2fs_param_errors), + {} +}; + +/* Resort to a match_table for this interestingly formatted option */ +static match_table_t f2fs_checkpoint_tokens = { + {Opt_checkpoint_disable, "disable"}, + {Opt_checkpoint_disable_cap, "disable:%u"}, + {Opt_checkpoint_disable_cap_perc, "disable:%u%%"}, + {Opt_checkpoint_enable, "enable"}, {Opt_err, NULL}, }; +#define F2FS_SPEC_background_gc (1 << 0) +#define F2FS_SPEC_inline_xattr_size (1 << 1) +#define F2FS_SPEC_active_logs (1 << 2) +#define F2FS_SPEC_reserve_root (1 << 3) +#define 
F2FS_SPEC_resgid (1 << 4) +#define F2FS_SPEC_resuid (1 << 5) +#define F2FS_SPEC_mode (1 << 6) +#define F2FS_SPEC_fault_injection (1 << 7) +#define F2FS_SPEC_fault_type (1 << 8) +#define F2FS_SPEC_jqfmt (1 << 9) +#define F2FS_SPEC_alloc_mode (1 << 10) +#define F2FS_SPEC_fsync_mode (1 << 11) +#define F2FS_SPEC_checkpoint_disable_cap (1 << 12) +#define F2FS_SPEC_checkpoint_disable_cap_perc (1 << 13) +#define F2FS_SPEC_compress_level (1 << 14) +#define F2FS_SPEC_compress_algorithm (1 << 15) +#define F2FS_SPEC_compress_log_size (1 << 16) +#define F2FS_SPEC_compress_extension (1 << 17) +#define F2FS_SPEC_nocompress_extension (1 << 18) +#define F2FS_SPEC_compress_chksum (1 << 19) +#define F2FS_SPEC_compress_mode (1 << 20) +#define F2FS_SPEC_discard_unit (1 << 21) +#define F2FS_SPEC_memory_mode (1 << 22) +#define F2FS_SPEC_errors (1 << 23) + +struct f2fs_fs_context { + struct f2fs_mount_info info; + unsigned int opt_mask; /* Bits changed */ + unsigned int spec_mask; + unsigned short qname_mask; +}; + +#define F2FS_CTX_INFO(ctx) ((ctx)->info) + +static inline void ctx_set_opt(struct f2fs_fs_context *ctx, + unsigned int flag) +{ + ctx->info.opt |= flag; + ctx->opt_mask |= flag; +} + +static inline void ctx_clear_opt(struct f2fs_fs_context *ctx, + unsigned int flag) +{ + ctx->info.opt &= ~flag; + ctx->opt_mask |= flag; +} + +static inline bool ctx_test_opt(struct f2fs_fs_context *ctx, + unsigned int flag) +{ + return ctx->info.opt & flag; +} + void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, - const char *fmt, ...) + const char *fmt, ...) 
{ struct va_format vaf; va_list args; @@ -292,11 +379,19 @@ void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, vaf.fmt = printk_skip_level(fmt); vaf.va = &args; if (limit_rate) - printk_ratelimited("%c%cF2FS-fs (%s): %pV\n", - KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + if (sbi) + printk_ratelimited("%c%cF2FS-fs (%s): %pV\n", + KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + else + printk_ratelimited("%c%cF2FS-fs: %pV\n", + KERN_SOH_ASCII, level, &vaf); else - printk("%c%cF2FS-fs (%s): %pV\n", - KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + if (sbi) + printk("%c%cF2FS-fs (%s): %pV\n", + KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + else + printk("%c%cF2FS-fs: %pV\n", + KERN_SOH_ASCII, level, &vaf); va_end(args); } @@ -390,159 +485,90 @@ static void init_once(void *foo) #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) -static int f2fs_set_qf_name(struct f2fs_sb_info *sbi, int qtype, - substring_t *args) +/* + * Note the name of the specified quota file. 
+ */ +static int f2fs_note_qf_name(struct fs_context *fc, int qtype, + struct fs_parameter *param) { - struct super_block *sb = sbi->sb; + struct f2fs_fs_context *ctx = fc->fs_private; char *qname; - int ret = -EINVAL; - if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) { - f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); + if (param->size < 1) { + f2fs_err(NULL, "Missing quota name"); return -EINVAL; } - if (f2fs_sb_has_quota_ino(sbi)) { - f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name"); + if (strchr(param->string, '/')) { + f2fs_err(NULL, "quotafile must be on filesystem root"); + return -EINVAL; + } + if (ctx->info.s_qf_names[qtype]) { + if (strcmp(ctx->info.s_qf_names[qtype], param->string) != 0) { + f2fs_err(NULL, "Quota file already specified"); + return -EINVAL; + } return 0; } - qname = match_strdup(args); + qname = kmemdup_nul(param->string, param->size, GFP_KERNEL); if (!qname) { - f2fs_err(sbi, "Not enough memory for storing quotafile name"); + f2fs_err(NULL, "Not enough memory for storing quotafile name"); return -ENOMEM; } - if (F2FS_OPTION(sbi).s_qf_names[qtype]) { - if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) - ret = 0; - else - f2fs_err(sbi, "%s quota file already specified", - QTYPE2NAME(qtype)); - goto errout; - } - if (strchr(qname, '/')) { - f2fs_err(sbi, "quotafile must be on filesystem root"); - goto errout; - } - F2FS_OPTION(sbi).s_qf_names[qtype] = qname; - set_opt(sbi, QUOTA); + F2FS_CTX_INFO(ctx).s_qf_names[qtype] = qname; + ctx->qname_mask |= 1 << qtype; return 0; -errout: - kfree(qname); - return ret; } -static int f2fs_clear_qf_name(struct f2fs_sb_info *sbi, int qtype) +/* + * Clear the name of the specified quota file. 
+ */ +static int f2fs_unnote_qf_name(struct fs_context *fc, int qtype) { - struct super_block *sb = sbi->sb; + struct f2fs_fs_context *ctx = fc->fs_private; - if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { - f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); - return -EINVAL; - } - kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); - F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; + kfree(ctx->info.s_qf_names[qtype]); + ctx->info.s_qf_names[qtype] = NULL; + ctx->qname_mask |= 1 << qtype; return 0; } -static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +static void f2fs_unnote_qf_name_all(struct fs_context *fc) { - /* - * We do the test below only for project quotas. 'usrquota' and - * 'grpquota' mount options are allowed even without quota feature - * to support legacy quotas in quota files. - */ - if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi)) { - f2fs_err(sbi, "Project quota feature not enabled. Cannot enable project quota enforcement."); - return -1; - } - if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || - F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || - F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) { - if (test_opt(sbi, USRQUOTA) && - F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) - clear_opt(sbi, USRQUOTA); - - if (test_opt(sbi, GRPQUOTA) && - F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) - clear_opt(sbi, GRPQUOTA); - - if (test_opt(sbi, PRJQUOTA) && - F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) - clear_opt(sbi, PRJQUOTA); - - if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || - test_opt(sbi, PRJQUOTA)) { - f2fs_err(sbi, "old and new quota format mixing"); - return -1; - } - - if (!F2FS_OPTION(sbi).s_jquota_fmt) { - f2fs_err(sbi, "journaled quota format not specified"); - return -1; - } - } + int i; - if (f2fs_sb_has_quota_ino(sbi) && F2FS_OPTION(sbi).s_jquota_fmt) { - f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt"); - F2FS_OPTION(sbi).s_jquota_fmt = 0; - } - return 0; + for (i = 0; i < MAXQUOTAS; i++) + 
f2fs_unnote_qf_name(fc, i); } #endif -static int f2fs_set_test_dummy_encryption(struct f2fs_sb_info *sbi, - const char *opt, - const substring_t *arg, - bool is_remount) +static int f2fs_parse_test_dummy_encryption(const struct fs_parameter *param, + struct f2fs_fs_context *ctx) { - struct fs_parameter param = { - .type = fs_value_is_string, - .string = arg->from ? arg->from : "", - }; - struct fscrypt_dummy_policy *policy = - &F2FS_OPTION(sbi).dummy_enc_policy; int err; if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { - f2fs_warn(sbi, "test_dummy_encryption option not supported"); + f2fs_warn(NULL, "test_dummy_encryption option not supported"); return -EINVAL; } - - if (!f2fs_sb_has_encrypt(sbi)) { - f2fs_err(sbi, "Encrypt feature is off"); - return -EINVAL; - } - - /* - * This mount option is just for testing, and it's not worthwhile to - * implement the extra complexity (e.g. RCU protection) that would be - * needed to allow it to be set or changed during remount. We do allow - * it to be specified during remount, but only if there is no change. 
- */ - if (is_remount && !fscrypt_is_dummy_policy_set(policy)) { - f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); - return -EINVAL; - } - - err = fscrypt_parse_test_dummy_encryption(&param, policy); + err = fscrypt_parse_test_dummy_encryption(param, + &ctx->info.dummy_enc_policy); if (err) { - if (err == -EEXIST) - f2fs_warn(sbi, - "Can't change test_dummy_encryption on remount"); - else if (err == -EINVAL) - f2fs_warn(sbi, "Value of option \"%s\" is unrecognized", - opt); + if (err == -EINVAL) + f2fs_warn(NULL, "Value of option \"%s\" is unrecognized", + param->key); + else if (err == -EEXIST) + f2fs_warn(NULL, "Conflicting test_dummy_encryption options"); else - f2fs_warn(sbi, "Error processing option \"%s\" [%d]", - opt, err); + f2fs_warn(NULL, "Error processing option \"%s\" [%d]", + param->key, err); return -EINVAL; } - f2fs_warn(sbi, "Test dummy encryption mode enabled"); return 0; } #ifdef CONFIG_F2FS_FS_COMPRESSION -static bool is_compress_extension_exist(struct f2fs_sb_info *sbi, +static bool is_compress_extension_exist(struct f2fs_mount_info *info, const char *new_ext, bool is_ext) { unsigned char (*ext)[F2FS_EXTENSION_LEN]; @@ -550,11 +576,11 @@ static bool is_compress_extension_exist(struct f2fs_sb_info *sbi, int i; if (is_ext) { - ext = F2FS_OPTION(sbi).extensions; - ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + ext = info->extensions; + ext_cnt = info->compress_ext_cnt; } else { - ext = F2FS_OPTION(sbi).noextensions; - ext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + ext = info->noextensions; + ext_cnt = info->nocompress_ext_cnt; } for (i = 0; i < ext_cnt; i++) { @@ -572,28 +598,28 @@ static bool is_compress_extension_exist(struct f2fs_sb_info *sbi, * extension will be treated as special cases and will not be compressed. * 3. Don't allow the non-compress extension specifies all files. 
*/ -static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi) +static int f2fs_test_compress_extension(unsigned char (*noext)[F2FS_EXTENSION_LEN], + int noext_cnt, + unsigned char (*ext)[F2FS_EXTENSION_LEN], + int ext_cnt) { - unsigned char (*ext)[F2FS_EXTENSION_LEN]; - unsigned char (*noext)[F2FS_EXTENSION_LEN]; - int ext_cnt, noext_cnt, index = 0, no_index = 0; - - ext = F2FS_OPTION(sbi).extensions; - ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; - noext = F2FS_OPTION(sbi).noextensions; - noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + int index = 0, no_index = 0; if (!noext_cnt) return 0; for (no_index = 0; no_index < noext_cnt; no_index++) { + if (strlen(noext[no_index]) == 0) + continue; if (!strcasecmp("*", noext[no_index])) { - f2fs_info(sbi, "Don't allow the nocompress extension specifies all files"); + f2fs_info(NULL, "Don't allow the nocompress extension specifies all files"); return -EINVAL; } for (index = 0; index < ext_cnt; index++) { + if (strlen(ext[index]) == 0) + continue; if (!strcasecmp(ext[index], noext[no_index])) { - f2fs_info(sbi, "Don't allow the same extension %s appear in both compress and nocompress extension", + f2fs_info(NULL, "Don't allow the same extension %s appear in both compress and nocompress extension", ext[index]); return -EINVAL; } @@ -603,58 +629,62 @@ static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi) } #ifdef CONFIG_F2FS_FS_LZ4 -static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) +static int f2fs_set_lz4hc_level(struct f2fs_fs_context *ctx, const char *str) { #ifdef CONFIG_F2FS_FS_LZ4HC unsigned int level; if (strlen(str) == 3) { - F2FS_OPTION(sbi).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_level = 0; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; } str += 3; if (str[0] != ':') { - f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); + f2fs_info(NULL, "wrong format, e.g. 
<alg_name>:<compr_level>"); return -EINVAL; } if (kstrtouint(str + 1, 10, &level)) return -EINVAL; if (!f2fs_is_compress_level_valid(COMPRESS_LZ4, level)) { - f2fs_info(sbi, "invalid lz4hc compress level: %d", level); + f2fs_info(NULL, "invalid lz4hc compress level: %d", level); return -EINVAL; } - F2FS_OPTION(sbi).compress_level = level; + F2FS_CTX_INFO(ctx).compress_level = level; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; #else if (strlen(str) == 3) { - F2FS_OPTION(sbi).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_level = 0; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; } - f2fs_info(sbi, "kernel doesn't support lz4hc compression"); + f2fs_info(NULL, "kernel doesn't support lz4hc compression"); return -EINVAL; #endif } #endif #ifdef CONFIG_F2FS_FS_ZSTD -static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) +static int f2fs_set_zstd_level(struct f2fs_fs_context *ctx, const char *str) { int level; int len = 4; if (strlen(str) == len) { - F2FS_OPTION(sbi).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + F2FS_CTX_INFO(ctx).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; } str += len; if (str[0] != ':') { - f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); + f2fs_info(NULL, "wrong format, e.g. 
<alg_name>:<compr_level>"); return -EINVAL; } if (kstrtoint(str + 1, 10, &level)) @@ -662,685 +692,750 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) /* f2fs does not support negative compress level now */ if (level < 0) { - f2fs_info(sbi, "do not support negative compress level: %d", level); + f2fs_info(NULL, "do not support negative compress level: %d", level); return -ERANGE; } if (!f2fs_is_compress_level_valid(COMPRESS_ZSTD, level)) { - f2fs_info(sbi, "invalid zstd compress level: %d", level); + f2fs_info(NULL, "invalid zstd compress level: %d", level); return -EINVAL; } - F2FS_OPTION(sbi).compress_level = level; + F2FS_CTX_INFO(ctx).compress_level = level; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; } #endif #endif -static int parse_options(struct f2fs_sb_info *sbi, char *options, bool is_remount) +static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - substring_t args[MAX_OPT_ARGS]; + struct f2fs_fs_context *ctx = fc->fs_private; #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned char (*ext)[F2FS_EXTENSION_LEN]; unsigned char (*noext)[F2FS_EXTENSION_LEN]; int ext_cnt, noext_cnt; + char *name; #endif - char *p, *name; - int arg = 0; - kuid_t uid; - kgid_t gid; - int ret; - - if (!options) - return 0; - - while ((p = strsep(&options, ",")) != NULL) { - int token; + substring_t args[MAX_OPT_ARGS]; + struct fs_parse_result result; + int token, ret, arg; - if (!*p) - continue; - /* - * Initialize args struct so we know whether arg was - * found; some options take optional arguments. 
- */ - args[0].to = args[0].from = NULL; - token = match_token(p, f2fs_tokens, args); + token = fs_parse(fc, f2fs_param_specs, param, &result); + if (token < 0) + return token; - switch (token) { - case Opt_gc_background: - name = match_strdup(&args[0]); - - if (!name) - return -ENOMEM; - if (!strcmp(name, "on")) { - F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; - } else if (!strcmp(name, "off")) { - if (f2fs_sb_has_blkzoned(sbi)) { - f2fs_warn(sbi, "zoned devices need bggc"); - kfree(name); - return -EINVAL; - } - F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF; - } else if (!strcmp(name, "sync")) { - F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_disable_roll_forward: - set_opt(sbi, DISABLE_ROLL_FORWARD); - break; - case Opt_norecovery: - /* requires ro mount, checked in f2fs_default_check */ - set_opt(sbi, NORECOVERY); - break; - case Opt_discard: - if (!f2fs_hw_support_discard(sbi)) { - f2fs_warn(sbi, "device does not support discard"); - break; - } - set_opt(sbi, DISCARD); - break; - case Opt_nodiscard: - if (f2fs_hw_should_discard(sbi)) { - f2fs_warn(sbi, "discard is required for zoned block devices"); - return -EINVAL; - } - clear_opt(sbi, DISCARD); - break; - case Opt_noheap: - case Opt_heap: - f2fs_warn(sbi, "heap/no_heap options were deprecated"); - break; + switch (token) { + case Opt_gc_background: + F2FS_CTX_INFO(ctx).bggc_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_background_gc; + break; + case Opt_disable_roll_forward: + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_ROLL_FORWARD); + break; + case Opt_norecovery: + /* requires ro mount, checked in f2fs_validate_options */ + ctx_set_opt(ctx, F2FS_MOUNT_NORECOVERY); + break; + case Opt_discard: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD); + else + ctx_set_opt(ctx, F2FS_MOUNT_DISCARD); + break; + case Opt_noheap: + case Opt_heap: + f2fs_warn(NULL, "heap/no_heap options were deprecated"); + break; #ifdef 
CONFIG_F2FS_FS_XATTR - case Opt_user_xattr: - set_opt(sbi, XATTR_USER); - break; - case Opt_nouser_xattr: - clear_opt(sbi, XATTR_USER); - break; - case Opt_inline_xattr: - set_opt(sbi, INLINE_XATTR); - break; - case Opt_noinline_xattr: - clear_opt(sbi, INLINE_XATTR); - break; - case Opt_inline_xattr_size: - if (args->from && match_int(args, &arg)) - return -EINVAL; - set_opt(sbi, INLINE_XATTR_SIZE); - F2FS_OPTION(sbi).inline_xattr_size = arg; - break; + case Opt_user_xattr: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_XATTR_USER); + else + ctx_set_opt(ctx, F2FS_MOUNT_XATTR_USER); + break; + case Opt_inline_xattr: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_XATTR); + else + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_XATTR); + break; + case Opt_inline_xattr_size: + if (result.int_32 < MIN_INLINE_XATTR_SIZE || + result.int_32 > MAX_INLINE_XATTR_SIZE) { + f2fs_err(NULL, "inline xattr size is out of range: %u ~ %u", + (u32)MIN_INLINE_XATTR_SIZE, (u32)MAX_INLINE_XATTR_SIZE); + return -EINVAL; + } + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_XATTR_SIZE); + F2FS_CTX_INFO(ctx).inline_xattr_size = result.int_32; + ctx->spec_mask |= F2FS_SPEC_inline_xattr_size; + break; #else - case Opt_user_xattr: - case Opt_nouser_xattr: - case Opt_inline_xattr: - case Opt_noinline_xattr: - case Opt_inline_xattr_size: - f2fs_info(sbi, "xattr options not supported"); - break; + case Opt_user_xattr: + case Opt_inline_xattr: + case Opt_inline_xattr_size: + f2fs_info(NULL, "%s options not supported", param->key); + break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL - case Opt_acl: - set_opt(sbi, POSIX_ACL); - break; - case Opt_noacl: - clear_opt(sbi, POSIX_ACL); - break; + case Opt_acl: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_POSIX_ACL); + else + ctx_set_opt(ctx, F2FS_MOUNT_POSIX_ACL); + break; #else - case Opt_acl: - case Opt_noacl: - f2fs_info(sbi, "acl options not supported"); - break; + case Opt_acl: + f2fs_info(NULL, "%s options not supported", param->key); + break; 
#endif - case Opt_active_logs: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (arg != 2 && arg != 4 && - arg != NR_CURSEG_PERSIST_TYPE) - return -EINVAL; - F2FS_OPTION(sbi).active_logs = arg; - break; - case Opt_disable_ext_identify: - set_opt(sbi, DISABLE_EXT_IDENTIFY); - break; - case Opt_inline_data: - set_opt(sbi, INLINE_DATA); - break; - case Opt_inline_dentry: - set_opt(sbi, INLINE_DENTRY); - break; - case Opt_noinline_dentry: - clear_opt(sbi, INLINE_DENTRY); - break; - case Opt_flush_merge: - set_opt(sbi, FLUSH_MERGE); - break; - case Opt_noflush_merge: - clear_opt(sbi, FLUSH_MERGE); - break; - case Opt_nobarrier: - set_opt(sbi, NOBARRIER); - break; - case Opt_barrier: - clear_opt(sbi, NOBARRIER); - break; - case Opt_fastboot: - set_opt(sbi, FASTBOOT); - break; - case Opt_extent_cache: - set_opt(sbi, READ_EXTENT_CACHE); - break; - case Opt_noextent_cache: - if (f2fs_sb_has_device_alias(sbi)) { - f2fs_err(sbi, "device aliasing requires extent cache"); - return -EINVAL; - } - clear_opt(sbi, READ_EXTENT_CACHE); - break; - case Opt_noinline_data: - clear_opt(sbi, INLINE_DATA); - break; - case Opt_data_flush: - set_opt(sbi, DATA_FLUSH); - break; - case Opt_reserve_root: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (test_opt(sbi, RESERVE_ROOT)) { - f2fs_info(sbi, "Preserve previous reserve_root=%u", - F2FS_OPTION(sbi).root_reserved_blocks); - } else { - F2FS_OPTION(sbi).root_reserved_blocks = arg; - set_opt(sbi, RESERVE_ROOT); - } - break; - case Opt_resuid: - if (args->from && match_int(args, &arg)) - return -EINVAL; - uid = make_kuid(current_user_ns(), arg); - if (!uid_valid(uid)) { - f2fs_err(sbi, "Invalid uid value %d", arg); - return -EINVAL; - } - F2FS_OPTION(sbi).s_resuid = uid; - break; - case Opt_resgid: - if (args->from && match_int(args, &arg)) - return -EINVAL; - gid = make_kgid(current_user_ns(), arg); - if (!gid_valid(gid)) { - f2fs_err(sbi, "Invalid gid value %d", arg); - return -EINVAL; - } - 
F2FS_OPTION(sbi).s_resgid = gid; - break; - case Opt_mode: - name = match_strdup(&args[0]); - - if (!name) - return -ENOMEM; - if (!strcmp(name, "adaptive")) { - F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; - } else if (!strcmp(name, "lfs")) { - F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; - } else if (!strcmp(name, "fragment:segment")) { - F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG; - } else if (!strcmp(name, "fragment:block")) { - F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; + case Opt_active_logs: + if (result.int_32 != 2 && result.int_32 != 4 && + result.int_32 != NR_CURSEG_PERSIST_TYPE) + return -EINVAL; + ctx->spec_mask |= F2FS_SPEC_active_logs; + F2FS_CTX_INFO(ctx).active_logs = result.int_32; + break; + case Opt_disable_ext_identify: + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_EXT_IDENTIFY); + break; + case Opt_inline_data: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_DATA); + else + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_DATA); + break; + case Opt_inline_dentry: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_DENTRY); + else + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_DENTRY); + break; + case Opt_flush_merge: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_FLUSH_MERGE); + else + ctx_set_opt(ctx, F2FS_MOUNT_FLUSH_MERGE); + break; + case Opt_barrier: + if (result.negated) + ctx_set_opt(ctx, F2FS_MOUNT_NOBARRIER); + else + ctx_clear_opt(ctx, F2FS_MOUNT_NOBARRIER); + break; + case Opt_fastboot: + ctx_set_opt(ctx, F2FS_MOUNT_FASTBOOT); + break; + case Opt_extent_cache: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE); + else + ctx_set_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE); + break; + case Opt_data_flush: + ctx_set_opt(ctx, F2FS_MOUNT_DATA_FLUSH); + break; + case Opt_reserve_root: + ctx_set_opt(ctx, F2FS_MOUNT_RESERVE_ROOT); + F2FS_CTX_INFO(ctx).root_reserved_blocks = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_reserve_root; + break; + case 
Opt_resuid: + F2FS_CTX_INFO(ctx).s_resuid = result.uid; + ctx->spec_mask |= F2FS_SPEC_resuid; + break; + case Opt_resgid: + F2FS_CTX_INFO(ctx).s_resgid = result.gid; + ctx->spec_mask |= F2FS_SPEC_resgid; + break; + case Opt_mode: + F2FS_CTX_INFO(ctx).fs_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_mode; + break; #ifdef CONFIG_F2FS_FAULT_INJECTION - case Opt_fault_injection: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (f2fs_build_fault_attr(sbi, arg, 0, FAULT_RATE)) - return -EINVAL; - set_opt(sbi, FAULT_INJECTION); - break; + case Opt_fault_injection: + F2FS_CTX_INFO(ctx).fault_info.inject_rate = result.int_32; + ctx->spec_mask |= F2FS_SPEC_fault_injection; + ctx_set_opt(ctx, F2FS_MOUNT_FAULT_INJECTION); + break; - case Opt_fault_type: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (f2fs_build_fault_attr(sbi, 0, arg, FAULT_TYPE)) - return -EINVAL; - set_opt(sbi, FAULT_INJECTION); - break; + case Opt_fault_type: + if (result.uint_32 > BIT(FAULT_MAX)) + return -EINVAL; + F2FS_CTX_INFO(ctx).fault_info.inject_type = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_fault_type; + ctx_set_opt(ctx, F2FS_MOUNT_FAULT_INJECTION); + break; #else - case Opt_fault_injection: - case Opt_fault_type: - f2fs_info(sbi, "fault injection options not supported"); - break; + case Opt_fault_injection: + case Opt_fault_type: + f2fs_info(NULL, "%s options not supported", param->key); + break; #endif - case Opt_lazytime: - set_opt(sbi, LAZYTIME); - break; - case Opt_nolazytime: - clear_opt(sbi, LAZYTIME); - break; + case Opt_lazytime: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_LAZYTIME); + else + ctx_set_opt(ctx, F2FS_MOUNT_LAZYTIME); + break; #ifdef CONFIG_QUOTA - case Opt_quota: - case Opt_usrquota: - set_opt(sbi, USRQUOTA); - break; - case Opt_grpquota: - set_opt(sbi, GRPQUOTA); - break; - case Opt_prjquota: - set_opt(sbi, PRJQUOTA); - break; - case Opt_usrjquota: - ret = f2fs_set_qf_name(sbi, USRQUOTA, &args[0]); - if (ret) - 
return ret; - break; - case Opt_grpjquota: - ret = f2fs_set_qf_name(sbi, GRPQUOTA, &args[0]); - if (ret) - return ret; - break; - case Opt_prjjquota: - ret = f2fs_set_qf_name(sbi, PRJQUOTA, &args[0]); - if (ret) - return ret; - break; - case Opt_offusrjquota: - ret = f2fs_clear_qf_name(sbi, USRQUOTA); - if (ret) - return ret; - break; - case Opt_offgrpjquota: - ret = f2fs_clear_qf_name(sbi, GRPQUOTA); - if (ret) - return ret; - break; - case Opt_offprjjquota: - ret = f2fs_clear_qf_name(sbi, PRJQUOTA); - if (ret) - return ret; - break; - case Opt_jqfmt_vfsold: - F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD; - break; - case Opt_jqfmt_vfsv0: - F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0; - break; - case Opt_jqfmt_vfsv1: - F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1; - break; - case Opt_noquota: - clear_opt(sbi, QUOTA); - clear_opt(sbi, USRQUOTA); - clear_opt(sbi, GRPQUOTA); - clear_opt(sbi, PRJQUOTA); - break; + case Opt_quota: + if (result.negated) { + ctx_clear_opt(ctx, F2FS_MOUNT_QUOTA); + ctx_clear_opt(ctx, F2FS_MOUNT_USRQUOTA); + ctx_clear_opt(ctx, F2FS_MOUNT_GRPQUOTA); + ctx_clear_opt(ctx, F2FS_MOUNT_PRJQUOTA); + } else + ctx_set_opt(ctx, F2FS_MOUNT_USRQUOTA); + break; + case Opt_usrquota: + ctx_set_opt(ctx, F2FS_MOUNT_USRQUOTA); + break; + case Opt_grpquota: + ctx_set_opt(ctx, F2FS_MOUNT_GRPQUOTA); + break; + case Opt_prjquota: + ctx_set_opt(ctx, F2FS_MOUNT_PRJQUOTA); + break; + case Opt_usrjquota: + if (!*param->string) + ret = f2fs_unnote_qf_name(fc, USRQUOTA); + else + ret = f2fs_note_qf_name(fc, USRQUOTA, param); + if (ret) + return ret; + break; + case Opt_grpjquota: + if (!*param->string) + ret = f2fs_unnote_qf_name(fc, GRPQUOTA); + else + ret = f2fs_note_qf_name(fc, GRPQUOTA, param); + if (ret) + return ret; + break; + case Opt_prjjquota: + if (!*param->string) + ret = f2fs_unnote_qf_name(fc, PRJQUOTA); + else + ret = f2fs_note_qf_name(fc, PRJQUOTA, param); + if (ret) + return ret; + break; + case Opt_jqfmt: + F2FS_CTX_INFO(ctx).s_jquota_fmt = 
result.int_32; + ctx->spec_mask |= F2FS_SPEC_jqfmt; + break; #else - case Opt_quota: - case Opt_usrquota: - case Opt_grpquota: - case Opt_prjquota: - case Opt_usrjquota: - case Opt_grpjquota: - case Opt_prjjquota: - case Opt_offusrjquota: - case Opt_offgrpjquota: - case Opt_offprjjquota: - case Opt_jqfmt_vfsold: - case Opt_jqfmt_vfsv0: - case Opt_jqfmt_vfsv1: - case Opt_noquota: - f2fs_info(sbi, "quota operations not supported"); - break; + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + f2fs_info(NULL, "quota operations not supported"); + break; #endif - case Opt_alloc: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - - if (!strcmp(name, "default")) { - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; - } else if (!strcmp(name, "reuse")) { - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_fsync: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "posix")) { - F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; - } else if (!strcmp(name, "strict")) { - F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; - } else if (!strcmp(name, "nobarrier")) { - F2FS_OPTION(sbi).fsync_mode = - FSYNC_MODE_NOBARRIER; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_test_dummy_encryption: - ret = f2fs_set_test_dummy_encryption(sbi, p, &args[0], - is_remount); - if (ret) - return ret; - break; - case Opt_inlinecrypt: + case Opt_alloc: + F2FS_CTX_INFO(ctx).alloc_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_alloc_mode; + break; + case Opt_fsync: + F2FS_CTX_INFO(ctx).fsync_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_fsync_mode; + break; + case Opt_test_dummy_encryption: + ret = f2fs_parse_test_dummy_encryption(param, ctx); + if (ret) + return ret; + break; + case Opt_inlinecrypt: #ifdef 
CONFIG_FS_ENCRYPTION_INLINE_CRYPT - set_opt(sbi, INLINECRYPT); + ctx_set_opt(ctx, F2FS_MOUNT_INLINECRYPT); #else - f2fs_info(sbi, "inline encryption not supported"); + f2fs_info(NULL, "inline encryption not supported"); #endif - break; + break; + case Opt_checkpoint: + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].from = args[0].to = NULL; + arg = 0; + + /* revert to match_table for checkpoint= options */ + token = match_token(param->string, f2fs_checkpoint_tokens, args); + switch (token) { case Opt_checkpoint_disable_cap_perc: if (args->from && match_int(args, &arg)) return -EINVAL; if (arg < 0 || arg > 100) return -EINVAL; - F2FS_OPTION(sbi).unusable_cap_perc = arg; - set_opt(sbi, DISABLE_CHECKPOINT); + F2FS_CTX_INFO(ctx).unusable_cap_perc = arg; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc; + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_disable_cap: if (args->from && match_int(args, &arg)) return -EINVAL; - F2FS_OPTION(sbi).unusable_cap = arg; - set_opt(sbi, DISABLE_CHECKPOINT); + F2FS_CTX_INFO(ctx).unusable_cap = arg; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap; + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_disable: - set_opt(sbi, DISABLE_CHECKPOINT); + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_enable: - clear_opt(sbi, DISABLE_CHECKPOINT); - break; - case Opt_checkpoint_merge: - set_opt(sbi, MERGE_CHECKPOINT); - break; - case Opt_nocheckpoint_merge: - clear_opt(sbi, MERGE_CHECKPOINT); + ctx_clear_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; + default: + return -EINVAL; + } + break; + case Opt_checkpoint_merge: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_MERGE_CHECKPOINT); + else + ctx_set_opt(ctx, F2FS_MOUNT_MERGE_CHECKPOINT); + break; #ifdef CONFIG_F2FS_FS_COMPRESSION - case Opt_compress_algorithm: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, 
"Image doesn't support compression"); - break; - } - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "lzo")) { + case Opt_compress_algorithm: + name = param->string; + if (!strcmp(name, "lzo")) { #ifdef CONFIG_F2FS_FS_LZO - F2FS_OPTION(sbi).compress_level = 0; - F2FS_OPTION(sbi).compress_algorithm = - COMPRESS_LZO; + F2FS_CTX_INFO(ctx).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZO; + ctx->spec_mask |= F2FS_SPEC_compress_level; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - f2fs_info(sbi, "kernel doesn't support lzo compression"); + f2fs_info(NULL, "kernel doesn't support lzo compression"); #endif - } else if (!strncmp(name, "lz4", 3)) { + } else if (!strncmp(name, "lz4", 3)) { #ifdef CONFIG_F2FS_FS_LZ4 - ret = f2fs_set_lz4hc_level(sbi, name); - if (ret) { - kfree(name); - return -EINVAL; - } - F2FS_OPTION(sbi).compress_algorithm = - COMPRESS_LZ4; + ret = f2fs_set_lz4hc_level(ctx, name); + if (ret) + return -EINVAL; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZ4; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - f2fs_info(sbi, "kernel doesn't support lz4 compression"); + f2fs_info(NULL, "kernel doesn't support lz4 compression"); #endif - } else if (!strncmp(name, "zstd", 4)) { + } else if (!strncmp(name, "zstd", 4)) { #ifdef CONFIG_F2FS_FS_ZSTD - ret = f2fs_set_zstd_level(sbi, name); - if (ret) { - kfree(name); - return -EINVAL; - } - F2FS_OPTION(sbi).compress_algorithm = - COMPRESS_ZSTD; + ret = f2fs_set_zstd_level(ctx, name); + if (ret) + return -EINVAL; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_ZSTD; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - f2fs_info(sbi, "kernel doesn't support zstd compression"); + f2fs_info(NULL, "kernel doesn't support zstd compression"); #endif - } else if (!strcmp(name, "lzo-rle")) { + } else if (!strcmp(name, "lzo-rle")) { #ifdef CONFIG_F2FS_FS_LZORLE - F2FS_OPTION(sbi).compress_level = 0; - 
F2FS_OPTION(sbi).compress_algorithm = - COMPRESS_LZORLE; + F2FS_CTX_INFO(ctx).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZORLE; + ctx->spec_mask |= F2FS_SPEC_compress_level; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - f2fs_info(sbi, "kernel doesn't support lzorle compression"); + f2fs_info(NULL, "kernel doesn't support lzorle compression"); #endif - } else { - kfree(name); - return -EINVAL; - } - kfree(name); + } else + return -EINVAL; + break; + case Opt_compress_log_size: + if (result.uint_32 < MIN_COMPRESS_LOG_SIZE || + result.uint_32 > MAX_COMPRESS_LOG_SIZE) { + f2fs_err(NULL, + "Compress cluster log size is out of range"); + return -EINVAL; + } + F2FS_CTX_INFO(ctx).compress_log_size = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_compress_log_size; + break; + case Opt_compress_extension: + name = param->string; + ext = F2FS_CTX_INFO(ctx).extensions; + ext_cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + ext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(NULL, "invalid extension length/number"); + return -EINVAL; + } + + if (is_compress_extension_exist(&ctx->info, name, true)) break; - case Opt_compress_log_size: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (arg < MIN_COMPRESS_LOG_SIZE || - arg > MAX_COMPRESS_LOG_SIZE) { - f2fs_err(sbi, - "Compress cluster log size is out of range"); - return -EINVAL; - } - F2FS_OPTION(sbi).compress_log_size = arg; + + ret = strscpy(ext[ext_cnt], name, F2FS_EXTENSION_LEN); + if (ret < 0) + return ret; + F2FS_CTX_INFO(ctx).compress_ext_cnt++; + ctx->spec_mask |= F2FS_SPEC_compress_extension; + break; + case Opt_nocompress_extension: + name = param->string; + noext = F2FS_CTX_INFO(ctx).noextensions; + noext_cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + noext_cnt >= COMPRESS_EXT_NUM) 
{ + f2fs_err(NULL, "invalid extension length/number"); + return -EINVAL; + } + + if (is_compress_extension_exist(&ctx->info, name, false)) break; - case Opt_compress_extension: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - ext = F2FS_OPTION(sbi).extensions; - ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + ret = strscpy(noext[noext_cnt], name, F2FS_EXTENSION_LEN); + if (ret < 0) + return ret; + F2FS_CTX_INFO(ctx).nocompress_ext_cnt++; + ctx->spec_mask |= F2FS_SPEC_nocompress_extension; + break; + case Opt_compress_chksum: + F2FS_CTX_INFO(ctx).compress_chksum = true; + ctx->spec_mask |= F2FS_SPEC_compress_chksum; + break; + case Opt_compress_mode: + F2FS_CTX_INFO(ctx).compress_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_compress_mode; + break; + case Opt_compress_cache: + ctx_set_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE); + break; +#else + case Opt_compress_algorithm: + case Opt_compress_log_size: + case Opt_compress_extension: + case Opt_nocompress_extension: + case Opt_compress_chksum: + case Opt_compress_mode: + case Opt_compress_cache: + f2fs_info(NULL, "compression options not supported"); + break; +#endif + case Opt_atgc: + ctx_set_opt(ctx, F2FS_MOUNT_ATGC); + break; + case Opt_gc_merge: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_GC_MERGE); + else + ctx_set_opt(ctx, F2FS_MOUNT_GC_MERGE); + break; + case Opt_discard_unit: + F2FS_CTX_INFO(ctx).discard_unit = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_discard_unit; + break; + case Opt_memory_mode: + F2FS_CTX_INFO(ctx).memory_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_memory_mode; + break; + case Opt_age_extent_cache: + ctx_set_opt(ctx, F2FS_MOUNT_AGE_EXTENT_CACHE); + break; + case Opt_errors: + F2FS_CTX_INFO(ctx).errors = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_errors; + break; + case Opt_nat_bits: + ctx_set_opt(ctx, F2FS_MOUNT_NAT_BITS); + break; + } + 
return 0; +} - if (strlen(name) >= F2FS_EXTENSION_LEN || - ext_cnt >= COMPRESS_EXT_NUM) { - f2fs_err(sbi, - "invalid extension length/number"); - kfree(name); - return -EINVAL; - } +/* + * Check quota settings consistency. + */ +static int f2fs_check_quota_consistency(struct fs_context *fc, + struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + #ifdef CONFIG_QUOTA + struct f2fs_fs_context *ctx = fc->fs_private; + bool quota_feature = f2fs_sb_has_quota_ino(sbi); + bool quota_turnon = sb_any_quota_loaded(sb); + char *old_qname, *new_qname; + bool usr_qf_name, grp_qf_name, prj_qf_name, usrquota, grpquota, prjquota; + int i; - if (is_compress_extension_exist(sbi, name, true)) { - kfree(name); - break; - } + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (ctx_test_opt(ctx, F2FS_MOUNT_PRJQUOTA) && + !f2fs_sb_has_project_quota(sbi)) { + f2fs_err(sbi, "Project quota feature not enabled. 
Cannot enable project quota enforcement."); + return -EINVAL; + } - ret = strscpy(ext[ext_cnt], name); - if (ret < 0) { - kfree(name); - return ret; - } - F2FS_OPTION(sbi).compress_ext_cnt++; - kfree(name); - break; - case Opt_nocompress_extension: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; + if (ctx->qname_mask) { + for (i = 0; i < MAXQUOTAS; i++) { + if (!(ctx->qname_mask & (1 << i))) + continue; - noext = F2FS_OPTION(sbi).noextensions; - noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + old_qname = F2FS_OPTION(sbi).s_qf_names[i]; + new_qname = F2FS_CTX_INFO(ctx).s_qf_names[i]; + if (quota_turnon && + !!old_qname != !!new_qname) + goto err_jquota_change; - if (strlen(name) >= F2FS_EXTENSION_LEN || - noext_cnt >= COMPRESS_EXT_NUM) { - f2fs_err(sbi, - "invalid extension length/number"); - kfree(name); - return -EINVAL; + if (old_qname) { + if (strcmp(old_qname, new_qname) == 0) { + ctx->qname_mask &= ~(1 << i); + continue; + } + goto err_jquota_specified; } - if (is_compress_extension_exist(sbi, name, false)) { - kfree(name); - break; + if (quota_feature) { + f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name"); + ctx->qname_mask &= ~(1 << i); + kfree(F2FS_CTX_INFO(ctx).s_qf_names[i]); + F2FS_CTX_INFO(ctx).s_qf_names[i] = NULL; } + } + } + + /* Make sure we don't mix old and new quota format */ + usr_qf_name = F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_CTX_INFO(ctx).s_qf_names[USRQUOTA]; + grp_qf_name = F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_CTX_INFO(ctx).s_qf_names[GRPQUOTA]; + prj_qf_name = F2FS_OPTION(sbi).s_qf_names[PRJQUOTA] || + F2FS_CTX_INFO(ctx).s_qf_names[PRJQUOTA]; + usrquota = test_opt(sbi, USRQUOTA) || + ctx_test_opt(ctx, F2FS_MOUNT_USRQUOTA); + grpquota = test_opt(sbi, GRPQUOTA) || + ctx_test_opt(ctx, F2FS_MOUNT_GRPQUOTA); + prjquota = test_opt(sbi, PRJQUOTA) || + ctx_test_opt(ctx, F2FS_MOUNT_PRJQUOTA); 
+ + if (usr_qf_name) { + ctx_clear_opt(ctx, F2FS_MOUNT_USRQUOTA); + usrquota = false; + } + if (grp_qf_name) { + ctx_clear_opt(ctx, F2FS_MOUNT_GRPQUOTA); + grpquota = false; + } + if (prj_qf_name) { + ctx_clear_opt(ctx, F2FS_MOUNT_PRJQUOTA); + prjquota = false; + } + if (usr_qf_name || grp_qf_name || prj_qf_name) { + if (grpquota || usrquota || prjquota) { + f2fs_err(sbi, "old and new quota format mixing"); + return -EINVAL; + } + if (!(ctx->spec_mask & F2FS_SPEC_jqfmt || + F2FS_OPTION(sbi).s_jquota_fmt)) { + f2fs_err(sbi, "journaled quota format not specified"); + return -EINVAL; + } + } + return 0; + +err_jquota_change: + f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); + return -EINVAL; +err_jquota_specified: + f2fs_err(sbi, "%s quota file already specified", + QTYPE2NAME(i)); + return -EINVAL; - ret = strscpy(noext[noext_cnt], name); - if (ret < 0) { - kfree(name); - return ret; - } - F2FS_OPTION(sbi).nocompress_ext_cnt++; - kfree(name); - break; - case Opt_compress_chksum: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - F2FS_OPTION(sbi).compress_chksum = true; - break; - case Opt_compress_mode: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "fs")) { - F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; - } else if (!strcmp(name, "user")) { - F2FS_OPTION(sbi).compress_mode = COMPR_MODE_USER; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_compress_cache: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - set_opt(sbi, COMPRESS_CACHE); - break; #else - case Opt_compress_algorithm: - case Opt_compress_log_size: - case Opt_compress_extension: - case Opt_nocompress_extension: - case Opt_compress_chksum: - case Opt_compress_mode: - case 
Opt_compress_cache: - f2fs_info(sbi, "compression options not supported"); - break; + if (f2fs_readonly(sbi->sb)) + return 0; + if (f2fs_sb_has_quota_ino(sbi)) { + f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } + if (f2fs_sb_has_project_quota(sbi)) { + f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } + + return 0; #endif - case Opt_atgc: - set_opt(sbi, ATGC); - break; - case Opt_gc_merge: - set_opt(sbi, GC_MERGE); - break; - case Opt_nogc_merge: - clear_opt(sbi, GC_MERGE); - break; - case Opt_discard_unit: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "block")) { - F2FS_OPTION(sbi).discard_unit = - DISCARD_UNIT_BLOCK; - } else if (!strcmp(name, "segment")) { - F2FS_OPTION(sbi).discard_unit = - DISCARD_UNIT_SEGMENT; - } else if (!strcmp(name, "section")) { - F2FS_OPTION(sbi).discard_unit = - DISCARD_UNIT_SECTION; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_memory_mode: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "normal")) { - F2FS_OPTION(sbi).memory_mode = - MEMORY_MODE_NORMAL; - } else if (!strcmp(name, "low")) { - F2FS_OPTION(sbi).memory_mode = - MEMORY_MODE_LOW; - } else { - kfree(name); - return -EINVAL; +} + +static int f2fs_check_test_dummy_encryption(struct fs_context *fc, + struct super_block *sb) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (!fscrypt_is_dummy_policy_set(&F2FS_CTX_INFO(ctx).dummy_enc_policy)) + return 0; + + if (!f2fs_sb_has_encrypt(sbi)) { + f2fs_err(sbi, "Encrypt feature is off"); + return -EINVAL; + } + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. 
We do allow + * it to be specified during remount, but only if there is no change. + */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + if (fscrypt_dummy_policies_equal(&F2FS_OPTION(sbi).dummy_enc_policy, + &F2FS_CTX_INFO(ctx).dummy_enc_policy)) + return 0; + f2fs_warn(sbi, "Can't set or change test_dummy_encryption on remount"); + return -EINVAL; + } + return 0; +} + +static inline bool test_compression_spec(unsigned int mask) +{ + return mask & (F2FS_SPEC_compress_algorithm + | F2FS_SPEC_compress_log_size + | F2FS_SPEC_compress_extension + | F2FS_SPEC_nocompress_extension + | F2FS_SPEC_compress_chksum + | F2FS_SPEC_compress_mode); +} + +static inline void clear_compression_spec(struct f2fs_fs_context *ctx) +{ + ctx->spec_mask &= ~(F2FS_SPEC_compress_algorithm + | F2FS_SPEC_compress_log_size + | F2FS_SPEC_compress_extension + | F2FS_SPEC_nocompress_extension + | F2FS_SPEC_compress_chksum + | F2FS_SPEC_compress_mode); +} + +static int f2fs_check_compression(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i, cnt; + + if (!f2fs_sb_has_compression(sbi)) { + if (test_compression_spec(ctx->spec_mask) || + ctx_test_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE)) + f2fs_info(sbi, "Image doesn't support compression"); + clear_compression_spec(ctx); + ctx->opt_mask &= ~F2FS_MOUNT_COMPRESS_CACHE; + return 0; + } + if (ctx->spec_mask & F2FS_SPEC_compress_extension) { + cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt; + for (i = 0; i < F2FS_CTX_INFO(ctx).compress_ext_cnt; i++) { + if (is_compress_extension_exist(&F2FS_OPTION(sbi), + F2FS_CTX_INFO(ctx).extensions[i], true)) { + F2FS_CTX_INFO(ctx).extensions[i][0] = '\0'; + cnt--; } - kfree(name); - break; - case Opt_age_extent_cache: - set_opt(sbi, AGE_EXTENT_CACHE); - break; - case Opt_errors: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "remount-ro")) { - 
F2FS_OPTION(sbi).errors = - MOUNT_ERRORS_READONLY; - } else if (!strcmp(name, "continue")) { - F2FS_OPTION(sbi).errors = - MOUNT_ERRORS_CONTINUE; - } else if (!strcmp(name, "panic")) { - F2FS_OPTION(sbi).errors = - MOUNT_ERRORS_PANIC; - } else { - kfree(name); - return -EINVAL; + } + if (F2FS_OPTION(sbi).compress_ext_cnt + cnt > COMPRESS_EXT_NUM) { + f2fs_err(sbi, "invalid extension length/number"); + return -EINVAL; + } + } + if (ctx->spec_mask & F2FS_SPEC_nocompress_extension) { + cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt; + for (i = 0; i < F2FS_CTX_INFO(ctx).nocompress_ext_cnt; i++) { + if (is_compress_extension_exist(&F2FS_OPTION(sbi), + F2FS_CTX_INFO(ctx).noextensions[i], false)) { + F2FS_CTX_INFO(ctx).noextensions[i][0] = '\0'; + cnt--; } - kfree(name); - break; - case Opt_nat_bits: - set_opt(sbi, NAT_BITS); - break; - default: - f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", - p); + } + if (F2FS_OPTION(sbi).nocompress_ext_cnt + cnt > COMPRESS_EXT_NUM) { + f2fs_err(sbi, "invalid noextension length/number"); return -EINVAL; } } + + if (f2fs_test_compress_extension(F2FS_CTX_INFO(ctx).noextensions, + F2FS_CTX_INFO(ctx).nocompress_ext_cnt, + F2FS_CTX_INFO(ctx).extensions, + F2FS_CTX_INFO(ctx).compress_ext_cnt)) { + f2fs_err(sbi, "new noextensions conflicts with new extensions"); + return -EINVAL; + } + if (f2fs_test_compress_extension(F2FS_CTX_INFO(ctx).noextensions, + F2FS_CTX_INFO(ctx).nocompress_ext_cnt, + F2FS_OPTION(sbi).extensions, + F2FS_OPTION(sbi).compress_ext_cnt)) { + f2fs_err(sbi, "new noextensions conflicts with old extensions"); + return -EINVAL; + } + if (f2fs_test_compress_extension(F2FS_OPTION(sbi).noextensions, + F2FS_OPTION(sbi).nocompress_ext_cnt, + F2FS_CTX_INFO(ctx).extensions, + F2FS_CTX_INFO(ctx).compress_ext_cnt)) { + f2fs_err(sbi, "new extensions conflicts with old noextensions"); + return -EINVAL; + } +#endif return 0; } -static int f2fs_default_check(struct f2fs_sb_info *sbi) +static int 
f2fs_check_opt_consistency(struct fs_context *fc, + struct super_block *sb) { -#ifdef CONFIG_QUOTA - if (f2fs_check_quota_options(sbi)) + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err; + + if (ctx_test_opt(ctx, F2FS_MOUNT_NORECOVERY) && !f2fs_readonly(sb)) return -EINVAL; -#else - if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sbi->sb)) { - f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + + if (f2fs_hw_should_discard(sbi) && + (ctx->opt_mask & F2FS_MOUNT_DISCARD) && + !ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) { + f2fs_warn(sbi, "discard is required for zoned block devices"); return -EINVAL; } - if (f2fs_sb_has_project_quota(sbi) && !f2fs_readonly(sbi->sb)) { - f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + + if (!f2fs_hw_support_discard(sbi) && + (ctx->opt_mask & F2FS_MOUNT_DISCARD) && + ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) { + f2fs_warn(sbi, "device does not support discard"); + ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD); + ctx->opt_mask &= ~F2FS_MOUNT_DISCARD; + } + + if (f2fs_sb_has_device_alias(sbi) && + (ctx->opt_mask & F2FS_MOUNT_READ_EXTENT_CACHE) && + !ctx_test_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE)) { + f2fs_err(sbi, "device aliasing requires extent cache"); return -EINVAL; } -#endif + + if (test_opt(sbi, RESERVE_ROOT) && + (ctx->opt_mask & F2FS_MOUNT_RESERVE_ROOT) && + ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_ROOT)) { + f2fs_info(sbi, "Preserve previous reserve_root=%u", + F2FS_OPTION(sbi).root_reserved_blocks); + ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT); + ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_ROOT; + } + + err = f2fs_check_test_dummy_encryption(fc, sb); + if (err) + return err; + + err = f2fs_check_compression(fc, sb); + if (err) + return err; + + err = f2fs_check_quota_consistency(fc, sb); + if (err) + return err; if (!IS_ENABLED(CONFIG_UNICODE) && f2fs_sb_has_casefold(sbi)) { f2fs_err(sbi, @@ -1354,15 
+1449,19 @@ static int f2fs_default_check(struct f2fs_sb_info *sbi) * devices, but mandatory for host-managed zoned block devices. */ if (f2fs_sb_has_blkzoned(sbi)) { + if (F2FS_CTX_INFO(ctx).bggc_mode == BGGC_MODE_OFF) { + f2fs_warn(sbi, "zoned devices need bggc"); + return -EINVAL; + } #ifdef CONFIG_BLK_DEV_ZONED - if (F2FS_OPTION(sbi).discard_unit != - DISCARD_UNIT_SECTION) { + if ((ctx->spec_mask & F2FS_SPEC_discard_unit) && + F2FS_CTX_INFO(ctx).discard_unit != DISCARD_UNIT_SECTION) { f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default"); - F2FS_OPTION(sbi).discard_unit = - DISCARD_UNIT_SECTION; + F2FS_CTX_INFO(ctx).discard_unit = DISCARD_UNIT_SECTION; } - if (F2FS_OPTION(sbi).fs_mode != FS_MODE_LFS) { + if ((ctx->spec_mask & F2FS_SPEC_mode) && + F2FS_CTX_INFO(ctx).fs_mode != FS_MODE_LFS) { f2fs_info(sbi, "Only lfs mode is allowed with zoned block device feature"); return -EINVAL; } @@ -1372,43 +1471,25 @@ static int f2fs_default_check(struct f2fs_sb_info *sbi) #endif } -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (f2fs_test_compress_extension(sbi)) { - f2fs_err(sbi, "invalid compress or nocompress extension"); - return -EINVAL; - } -#endif - - if (test_opt(sbi, INLINE_XATTR_SIZE)) { - int min_size, max_size; - + if (ctx_test_opt(ctx, F2FS_MOUNT_INLINE_XATTR_SIZE)) { if (!f2fs_sb_has_extra_attr(sbi) || !f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_err(sbi, "extra_attr or flexible_inline_xattr feature is off"); return -EINVAL; } - if (!test_opt(sbi, INLINE_XATTR)) { + if (!ctx_test_opt(ctx, F2FS_MOUNT_INLINE_XATTR) && !test_opt(sbi, INLINE_XATTR)) { f2fs_err(sbi, "inline_xattr_size option should be set with inline_xattr option"); return -EINVAL; } - - min_size = MIN_INLINE_XATTR_SIZE; - max_size = MAX_INLINE_XATTR_SIZE; - - if (F2FS_OPTION(sbi).inline_xattr_size < min_size || - F2FS_OPTION(sbi).inline_xattr_size > max_size) { - f2fs_err(sbi, "inline xattr size is out of range: %d ~ %d", - min_size, max_size); - return 
-EINVAL; - } } - if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { + if (ctx_test_opt(ctx, F2FS_MOUNT_ATGC) && + F2FS_CTX_INFO(ctx).fs_mode == FS_MODE_LFS) { f2fs_err(sbi, "LFS is not compatible with ATGC"); return -EINVAL; } - if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) { + if (f2fs_is_readonly(sbi) && ctx_test_opt(ctx, F2FS_MOUNT_FLUSH_MERGE)) { f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); return -EINVAL; } @@ -1417,12 +1498,190 @@ static int f2fs_default_check(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; } + return 0; +} + +static void f2fs_apply_quota_options(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + bool quota_feature = f2fs_sb_has_quota_ino(sbi); + char *qname; + int i; + + if (quota_feature) + return; + + for (i = 0; i < MAXQUOTAS; i++) { + if (!(ctx->qname_mask & (1 << i))) + continue; + + qname = F2FS_CTX_INFO(ctx).s_qf_names[i]; + if (qname) { + qname = kstrdup(F2FS_CTX_INFO(ctx).s_qf_names[i], + GFP_KERNEL | __GFP_NOFAIL); + set_opt(sbi, QUOTA); + } + F2FS_OPTION(sbi).s_qf_names[i] = qname; + } + + if (ctx->spec_mask & F2FS_SPEC_jqfmt) + F2FS_OPTION(sbi).s_jquota_fmt = F2FS_CTX_INFO(ctx).s_jquota_fmt; + + if (quota_feature && F2FS_OPTION(sbi).s_jquota_fmt) { + f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt"); + F2FS_OPTION(sbi).s_jquota_fmt = 0; + } +#endif +} + +static void f2fs_apply_test_dummy_encryption(struct fs_context *fc, + struct super_block *sb) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (!fscrypt_is_dummy_policy_set(&F2FS_CTX_INFO(ctx).dummy_enc_policy) || + /* if already set, it was already verified to be the same */ + fscrypt_is_dummy_policy_set(&F2FS_OPTION(sbi).dummy_enc_policy)) + return; + swap(F2FS_OPTION(sbi).dummy_enc_policy, F2FS_CTX_INFO(ctx).dummy_enc_policy); + 
f2fs_warn(sbi, "Test dummy encryption mode enabled"); +} - if (test_opt(sbi, NORECOVERY) && !f2fs_readonly(sbi->sb)) { - f2fs_err(sbi, "norecovery requires readonly mount"); +static void f2fs_apply_compression(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned char (*ctx_ext)[F2FS_EXTENSION_LEN]; + unsigned char (*sbi_ext)[F2FS_EXTENSION_LEN]; + int ctx_cnt, sbi_cnt, i; + + if (ctx->spec_mask & F2FS_SPEC_compress_level) + F2FS_OPTION(sbi).compress_level = + F2FS_CTX_INFO(ctx).compress_level; + if (ctx->spec_mask & F2FS_SPEC_compress_algorithm) + F2FS_OPTION(sbi).compress_algorithm = + F2FS_CTX_INFO(ctx).compress_algorithm; + if (ctx->spec_mask & F2FS_SPEC_compress_log_size) + F2FS_OPTION(sbi).compress_log_size = + F2FS_CTX_INFO(ctx).compress_log_size; + if (ctx->spec_mask & F2FS_SPEC_compress_chksum) + F2FS_OPTION(sbi).compress_chksum = + F2FS_CTX_INFO(ctx).compress_chksum; + if (ctx->spec_mask & F2FS_SPEC_compress_mode) + F2FS_OPTION(sbi).compress_mode = + F2FS_CTX_INFO(ctx).compress_mode; + if (ctx->spec_mask & F2FS_SPEC_compress_extension) { + ctx_ext = F2FS_CTX_INFO(ctx).extensions; + ctx_cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt; + sbi_ext = F2FS_OPTION(sbi).extensions; + sbi_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + for (i = 0; i < ctx_cnt; i++) { + if (strlen(ctx_ext[i]) == 0) + continue; + strscpy(sbi_ext[sbi_cnt], ctx_ext[i]); + sbi_cnt++; + } + F2FS_OPTION(sbi).compress_ext_cnt = sbi_cnt; + } + if (ctx->spec_mask & F2FS_SPEC_nocompress_extension) { + ctx_ext = F2FS_CTX_INFO(ctx).noextensions; + ctx_cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt; + sbi_ext = F2FS_OPTION(sbi).noextensions; + sbi_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + for (i = 0; i < ctx_cnt; i++) { + if (strlen(ctx_ext[i]) == 0) + continue; + strscpy(sbi_ext[sbi_cnt], ctx_ext[i]); + sbi_cnt++; + } + F2FS_OPTION(sbi).nocompress_ext_cnt = sbi_cnt; + } 
+#endif +} + +static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + F2FS_OPTION(sbi).opt &= ~ctx->opt_mask; + F2FS_OPTION(sbi).opt |= F2FS_CTX_INFO(ctx).opt; + + if (ctx->spec_mask & F2FS_SPEC_background_gc) + F2FS_OPTION(sbi).bggc_mode = F2FS_CTX_INFO(ctx).bggc_mode; + if (ctx->spec_mask & F2FS_SPEC_inline_xattr_size) + F2FS_OPTION(sbi).inline_xattr_size = + F2FS_CTX_INFO(ctx).inline_xattr_size; + if (ctx->spec_mask & F2FS_SPEC_active_logs) + F2FS_OPTION(sbi).active_logs = F2FS_CTX_INFO(ctx).active_logs; + if (ctx->spec_mask & F2FS_SPEC_reserve_root) + F2FS_OPTION(sbi).root_reserved_blocks = + F2FS_CTX_INFO(ctx).root_reserved_blocks; + if (ctx->spec_mask & F2FS_SPEC_resgid) + F2FS_OPTION(sbi).s_resgid = F2FS_CTX_INFO(ctx).s_resgid; + if (ctx->spec_mask & F2FS_SPEC_resuid) + F2FS_OPTION(sbi).s_resuid = F2FS_CTX_INFO(ctx).s_resuid; + if (ctx->spec_mask & F2FS_SPEC_mode) + F2FS_OPTION(sbi).fs_mode = F2FS_CTX_INFO(ctx).fs_mode; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (ctx->spec_mask & F2FS_SPEC_fault_injection) + (void)f2fs_build_fault_attr(sbi, + F2FS_CTX_INFO(ctx).fault_info.inject_rate, 0, FAULT_RATE); + if (ctx->spec_mask & F2FS_SPEC_fault_type) + (void)f2fs_build_fault_attr(sbi, 0, + F2FS_CTX_INFO(ctx).fault_info.inject_type, FAULT_TYPE); +#endif + if (ctx->spec_mask & F2FS_SPEC_alloc_mode) + F2FS_OPTION(sbi).alloc_mode = F2FS_CTX_INFO(ctx).alloc_mode; + if (ctx->spec_mask & F2FS_SPEC_fsync_mode) + F2FS_OPTION(sbi).fsync_mode = F2FS_CTX_INFO(ctx).fsync_mode; + if (ctx->spec_mask & F2FS_SPEC_checkpoint_disable_cap) + F2FS_OPTION(sbi).unusable_cap = F2FS_CTX_INFO(ctx).unusable_cap; + if (ctx->spec_mask & F2FS_SPEC_checkpoint_disable_cap_perc) + F2FS_OPTION(sbi).unusable_cap_perc = + F2FS_CTX_INFO(ctx).unusable_cap_perc; + if (ctx->spec_mask & F2FS_SPEC_discard_unit) + F2FS_OPTION(sbi).discard_unit = F2FS_CTX_INFO(ctx).discard_unit; + if 
(ctx->spec_mask & F2FS_SPEC_memory_mode) + F2FS_OPTION(sbi).memory_mode = F2FS_CTX_INFO(ctx).memory_mode; + if (ctx->spec_mask & F2FS_SPEC_errors) + F2FS_OPTION(sbi).errors = F2FS_CTX_INFO(ctx).errors; + + f2fs_apply_compression(fc, sb); + f2fs_apply_test_dummy_encryption(fc, sb); + f2fs_apply_quota_options(fc, sb); +} + +static int f2fs_sanity_check_options(struct f2fs_sb_info *sbi, bool remount) +{ + if (f2fs_sb_has_device_alias(sbi) && + !test_opt(sbi, READ_EXTENT_CACHE)) { + f2fs_err(sbi, "device aliasing requires extent cache"); return -EINVAL; } + if (!remount) + return 0; + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && + sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) { + f2fs_err(sbi, + "zoned: max open zones %u is too small, need at least %u open zones", + sbi->max_open_zones, F2FS_OPTION(sbi).active_logs); + return -EINVAL; + } +#endif + if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) { + f2fs_warn(sbi, "LFS is not compatible with IPU"); + return -EINVAL; + } return 0; } @@ -1442,6 +1701,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); atomic_set(&fi->i_compr_blocks, 0); + atomic_set(&fi->open_count, 0); init_f2fs_rwsem(&fi->i_sem); spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); @@ -1718,7 +1978,7 @@ static void f2fs_put_super(struct super_block *sb) destroy_percpu_info(sbi); f2fs_destroy_iostat(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) - kvfree(sbi->write_io[i]); + kfree(sbi->write_io[i]); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif @@ -2329,11 +2589,12 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) f2fs_flush_ckpt_thread(sbi); } -static int f2fs_remount(struct super_block *sb, int *flags, char *data) +static int __f2fs_remount(struct fs_context *fc, struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; unsigned long 
old_sb_flags; + unsigned int flags = fc->sb_flags; int err; bool need_restart_gc = false, need_stop_gc = false; bool need_restart_flush = false, need_stop_flush = false; @@ -2379,7 +2640,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #endif /* recover superblocks we couldn't write due to previous RO mount */ - if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + if (!(flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); f2fs_info(sbi, "Try to recover all the superblocks, ret: %d", err); @@ -2389,23 +2650,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) default_options(sbi, true); - /* parse mount options */ - err = parse_options(sbi, data, true); + err = f2fs_check_opt_consistency(fc, sb); if (err) goto restore_opts; -#ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_has_blkzoned(sbi) && - sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) { - f2fs_err(sbi, - "zoned: max open zones %u is too small, need at least %u open zones", - sbi->max_open_zones, F2FS_OPTION(sbi).active_logs); - err = -EINVAL; - goto restore_opts; - } -#endif + f2fs_apply_options(fc, sb); - err = f2fs_default_check(sbi); + err = f2fs_sanity_check_options(sbi, true); if (err) goto restore_opts; @@ -2416,20 +2667,20 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. 
*/ - if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) + if (f2fs_readonly(sb) && (flags & SB_RDONLY)) goto skip; - if (f2fs_dev_is_readonly(sbi) && !(*flags & SB_RDONLY)) { + if (f2fs_dev_is_readonly(sbi) && !(flags & SB_RDONLY)) { err = -EROFS; goto restore_opts; } #ifdef CONFIG_QUOTA - if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) { + if (!f2fs_readonly(sb) && (flags & SB_RDONLY)) { err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; - } else if (f2fs_readonly(sb) && !(*flags & SB_RDONLY)) { + } else if (f2fs_readonly(sb) && !(flags & SB_RDONLY)) { /* dquot_resume needs RW */ sb->s_flags &= ~SB_RDONLY; if (sb_any_quota_suspended(sb)) { @@ -2441,12 +2692,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } #endif - if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) { - err = -EINVAL; - f2fs_warn(sbi, "LFS is not compatible with IPU"); - goto restore_opts; - } - /* disallow enable atgc dynamically */ if (no_atgc == !!test_opt(sbi, ATGC)) { err = -EINVAL; @@ -2485,7 +2730,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { + if ((flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); goto restore_opts; @@ -2496,7 +2741,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if background_gc = off is passed in mount * option. Also sync the filesystem. 
*/ - if ((*flags & SB_RDONLY) || + if ((flags & SB_RDONLY) || (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF && !test_opt(sbi, GC_MERGE))) { if (sbi->gc_thread) { @@ -2510,7 +2755,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & SB_RDONLY) { + if (flags & SB_RDONLY) { sync_inodes_sb(sb); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -2523,7 +2768,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. */ - if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + if ((flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); f2fs_destroy_flush_cmd_control(sbi, false); need_restart_flush = true; @@ -2565,11 +2810,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * triggered while remount and we need to take care of it before * returning from remount. */ - if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || + if ((flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || !test_opt(sbi, MERGE_CHECKPOINT)) { f2fs_stop_ckpt_thread(sbi); } else { - /* Flush if the prevous checkpoint, if exists. */ + /* Flush if the previous checkpoint, if exists. */ f2fs_flush_ckpt_thread(sbi); err = f2fs_start_ckpt_thread(sbi); @@ -2592,7 +2837,7 @@ skip: (test_opt(sbi, POSIX_ACL) ? 
SB_POSIXACL : 0); limit_reserve_root(sbi); - *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); + fc->sb_flags = (flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); sbi->umount_lock_holder = NULL; return 0; @@ -3263,7 +3508,6 @@ static const struct super_operations f2fs_sops = { .freeze_fs = f2fs_freeze, .unfreeze_fs = f2fs_unfreeze, .statfs = f2fs_statfs, - .remount_fs = f2fs_remount, .shutdown = f2fs_shutdown, }; @@ -3451,6 +3695,7 @@ static int __f2fs_commit_super(struct f2fs_sb_info *sbi, struct folio *folio, f2fs_bug_on(sbi, 1); ret = submit_bio_wait(bio); + bio_put(bio); folio_end_writeback(folio); return ret; @@ -4522,14 +4767,14 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) sbi->readdir_ra = true; } -static int f2fs_fill_super(struct super_block *sb, void *data, int silent) +static int f2fs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct f2fs_fs_context *ctx = fc->fs_private; struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; struct inode *root; int err; bool skip_recovery = false, need_fsck = false; - char *options = NULL; int recovery, i, valid_super_block; struct curseg_info *seg_i; int retry_cnt = 1; @@ -4592,18 +4837,14 @@ try_onemore: sizeof(raw_super->uuid)); default_options(sbi, false); - /* parse mount options */ - options = kstrdup((const char *)data, GFP_KERNEL); - if (data && !options) { - err = -ENOMEM; - goto free_sb_buf; - } - err = parse_options(sbi, options, false); + err = f2fs_check_opt_consistency(fc, sb); if (err) - goto free_options; + goto free_sb_buf; + + f2fs_apply_options(fc, sb); - err = f2fs_default_check(sbi); + err = f2fs_sanity_check_options(sbi, false); if (err) goto free_options; @@ -4770,6 +5011,10 @@ try_onemore: /* get segno of first zoned block device */ sbi->first_seq_zone_segno = get_first_seq_zone_segno(sbi); + sbi->reserved_pin_section = f2fs_sb_has_blkzoned(sbi) ? 
+ ZONED_PIN_SEC_REQUIRED_COUNT : + GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)); + /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); if (__exist_node_summaries(sbi)) @@ -4930,7 +5175,6 @@ reset_checkpoint: if (err) goto sync_free_meta; } - kvfree(options); /* recover broken superblock */ if (recovery) { @@ -5013,7 +5257,7 @@ free_iostat: f2fs_destroy_iostat(sbi); free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) - kvfree(sbi->write_io[i]); + kfree(sbi->write_io[i]); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); @@ -5024,8 +5268,8 @@ free_options: for (i = 0; i < MAXQUOTAS; i++) kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif - fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); - kvfree(options); + /* no need to free dummy_enc_policy, we just keep it in ctx when failed */ + swap(F2FS_CTX_INFO(ctx).dummy_enc_policy, F2FS_OPTION(sbi).dummy_enc_policy); free_sb_buf: kfree(raw_super); free_sbi: @@ -5041,12 +5285,39 @@ free_sbi: return err; } -static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static int f2fs_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); + return get_tree_bdev(fc, f2fs_fill_super); } +static int f2fs_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + + return __f2fs_remount(fc, sb); +} + +static void f2fs_fc_free(struct fs_context *fc) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + + if (!ctx) + return; + +#ifdef CONFIG_QUOTA + f2fs_unnote_qf_name_all(fc); +#endif + fscrypt_free_dummy_policy(&F2FS_CTX_INFO(ctx).dummy_enc_policy); + kfree(ctx); +} + +static const struct fs_context_operations f2fs_context_ops = { + .parse_param = f2fs_parse_param, + .get_tree = f2fs_get_tree, + .reconfigure = f2fs_reconfigure, + .free = f2fs_fc_free, +}; + static void kill_f2fs_super(struct super_block *sb) { struct f2fs_sb_info *sbi = 
F2FS_SB(sb); @@ -5088,10 +5359,24 @@ static void kill_f2fs_super(struct super_block *sb) } } +static int f2fs_init_fs_context(struct fs_context *fc) +{ + struct f2fs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct f2fs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + fc->fs_private = ctx; + fc->ops = &f2fs_context_ops; + + return 0; +} + static struct file_system_type f2fs_fs_type = { .owner = THIS_MODULE, .name = "f2fs", - .mount = f2fs_mount, + .init_fs_context = f2fs_init_fs_context, .kill_sb = kill_f2fs_super, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 75134d69a0bd..f736052dea50 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -628,6 +628,27 @@ out: return count; } + if (!strcmp(a->attr.name, "gc_no_zoned_gc_percent")) { + if (t > 100) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_boost_zoned_gc_percent")) { + if (t > 100) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_valid_thresh_ratio")) { + if (t > 100) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + #ifdef CONFIG_F2FS_IOSTAT if (!strcmp(a->attr.name, "iostat_enable")) { sbi->iostat_enable = !!t; @@ -824,6 +845,27 @@ out: return count; } + if (!strcmp(a->attr.name, "reserved_pin_section")) { + if (t > GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi))) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_boost_gc_multiple")) { + if (t < 1 || t > SEGS_PER_SEC(sbi)) + return -EINVAL; + sbi->gc_thread->boost_gc_multiple = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_boost_gc_greedy")) { + if (t > GC_GREEDY) + return -EINVAL; + sbi->gc_thread->boost_gc_greedy = (unsigned int)t; + return count; + } + *ui = (unsigned int)t; return count; @@ -1050,6 +1092,8 @@ GC_THREAD_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time); 
GC_THREAD_RW_ATTR(gc_no_zoned_gc_percent, no_zoned_gc_percent); GC_THREAD_RW_ATTR(gc_boost_zoned_gc_percent, boost_zoned_gc_percent); GC_THREAD_RW_ATTR(gc_valid_thresh_ratio, valid_thresh_ratio); +GC_THREAD_RW_ATTR(gc_boost_gc_multiple, boost_gc_multiple); +GC_THREAD_RW_ATTR(gc_boost_gc_greedy, boost_gc_greedy); /* SM_INFO ATTR */ SM_INFO_RW_ATTR(reclaim_segments, rec_prefree_segments); @@ -1130,6 +1174,7 @@ F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); #endif F2FS_SBI_GENERAL_RW_ATTR(carve_out); +F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1220,6 +1265,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_no_zoned_gc_percent), ATTR_LIST(gc_boost_zoned_gc_percent), ATTR_LIST(gc_valid_thresh_ratio), + ATTR_LIST(gc_boost_gc_multiple), + ATTR_LIST(gc_boost_gc_greedy), ATTR_LIST(gc_idle), ATTR_LIST(gc_urgent), ATTR_LIST(reclaim_segments), @@ -1323,6 +1370,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(last_age_weight), ATTR_LIST(max_read_extent_count), ATTR_LIST(carve_out), + ATTR_LIST(reserved_pin_section), NULL, }; ATTRIBUTE_GROUPS(f2fs); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 1db348f8f887..a7061c2ad8e4 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -356,7 +356,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) if (!fat_valid_entry(sbi, entry)) { fatent_brelse(fatent); - fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry); + fat_fs_error_ratelimit(sb, "invalid access to FAT (entry 0x%08x)", entry); return -EIO; } diff --git a/fs/fat/misc.c b/fs/fat/misc.c index c7a2d27120ba..950da09f0961 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -158,9 +158,9 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) mark_inode_dirty(inode); } if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) { - fat_fs_error(sb, "clusters badly computed (%d != %llu)", - new_fclus, 
- (llu)(inode->i_blocks >> (sbi->cluster_bits - 9))); + fat_fs_error_ratelimit( + sb, "clusters badly computed (%d != %llu)", new_fclus, + (llu)(inode->i_blocks >> (sbi->cluster_bits - 9))); fat_cache_inval_inode(inode); } inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9); diff --git a/fs/file_table.c b/fs/file_table.c index f09d79a98111..81c72576e548 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -199,7 +199,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred) file_ref_init(&f->f_ref, 1); /* * Disable permission and pre-content events for all files by default. - * They may be enabled later by file_set_fsnotify_mode_from_watchers(). + * They may be enabled later by fsnotify_open_perm_and_set_mode(). */ file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM); return 0; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 0502bf3cdf6a..ac6d4c1064cc 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -10,7 +10,6 @@ #include <linux/dax.h> #include <linux/uio.h> #include <linux/pagemap.h> -#include <linux/pfn_t.h> #include <linux/iomap.h> #include <linux/interval_tree.h> @@ -757,7 +756,7 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order, vm_fault_t ret; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; - pfn_t pfn; + unsigned long pfn; int error = 0; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_conn_dax *fcd = fc->dax; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 95275a1e2f54..5525a4520b0f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1966,17 +1966,6 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) struct fuse_file *ff; int err; - /* - * Inode is always written before the last reference is dropped and - * hence this should not be reached from reclaim. - * - * Writing back the inode from reclaim can deadlock if the request - * processing itself needs an allocation. 
Allocations triggering - * reclaim while serving a request can't be prevented, because it can - * involve any number of unrelated userspace processes. - */ - WARN_ON(wbc->for_reclaim); - ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 3fbfb1a2942b..c826e7ca49f5 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -9,7 +9,6 @@ #include <linux/pci.h> #include <linux/interrupt.h> #include <linux/group_cpus.h> -#include <linux/pfn_t.h> #include <linux/memremap.h> #include <linux/module.h> #include <linux/virtio.h> @@ -1008,7 +1007,7 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev) */ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, - void **kaddr, pfn_t *pfn) + void **kaddr, unsigned long *pfn) { struct virtio_fs *fs = dax_get_private(dax_dev); phys_addr_t offset = PFN_PHYS(pgoff); @@ -1017,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (kaddr) *kaddr = fs->window_kaddr + offset; if (pfn) - *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0); + *pfn = fs->window_phys_addr + offset; return nr_pages > max_nr_pages ? 
max_nr_pages : nr_pages; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9ddd67da0eeb..09d4baef29cf 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -150,10 +150,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (inode->i_flags & S_PRIVATE) vm_flags |= VM_NORESERVE; - if (!hugetlb_reserve_pages(inode, + if (hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, - vm_flags)) + vm_flags) < 0) goto out; ret = 0; @@ -179,12 +179,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (len & ~huge_page_mask(h)) return -EINVAL; - if (flags & MAP_FIXED) { - if (addr & ~huge_page_mask(h)) - return -EINVAL; - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - } + if ((flags & MAP_FIXED) && (addr & ~huge_page_mask(h))) + return -EINVAL; if (addr) addr0 = ALIGN(addr, huge_page_size(h)); @@ -1563,9 +1559,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size, inode->i_size = size; clear_nlink(inode); - if (!hugetlb_reserve_pages(inode, 0, + if (hugetlb_reserve_pages(inode, 0, size >> huge_page_shift(hstate_inode(inode)), NULL, - acctflag)) + acctflag) < 0) file = ERR_PTR(-ENOMEM); else file = alloc_file_pseudo(inode, mnt, name, O_RDWR, diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 5e47951db630..2a4a288b821c 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -44,6 +44,9 @@ static int jfs_open(struct inode *inode, struct file *file) { int rc; + if (S_ISREG(inode->i_mode) && inode->i_size < 0) + return -EIO; + if ((rc = dquot_file_open(inode, file))) return rc; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 083e7fa54709..fcedeb514e14 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -145,9 +145,9 @@ void jfs_evict_inode(struct inode *inode) if (!inode->i_nlink && !is_bad_inode(inode)) { dquot_initialize(inode); + truncate_inode_pages_final(&inode->i_data); if (JFS_IP(inode)->fileset == FILESYSTEM_I) { struct inode 
*ipimap = JFS_SBI(inode->i_sb)->ipimap; - truncate_inode_pages_final(&inode->i_data); if (test_cflag(COMMIT_Freewmap, inode)) jfs_free_zero_link(inode); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 35e063c9f3a4..cdfa699cd7c8 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -1389,6 +1389,12 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth; ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); + if (ti < 0 || ti >= le32_to_cpu(dcp->nleafs)) { + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n"); + release_metapage(mp); + return -EIO; + } + /* dmap control page trees fan-out by 4 and a single allocation * group may be described by 1 or 2 subtrees within the ag level * dmap control page, depending upon the ag size. examine the ag's @@ -1809,8 +1815,10 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) return -EIO; dp = (struct dmap *) mp->data; - if (dp->tree.budmin < 0) + if (dp->tree.budmin < 0) { + release_metapage(mp); return -EIO; + } /* try to allocate the blocks. 
*/ diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 9029cd216912..b98cf3bb6c1f 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -421,7 +421,7 @@ static void metapage_write_end_io(struct bio *bio) } static int metapage_write_folio(struct folio *folio, - struct writeback_control *wbc, void *unused) + struct writeback_control *wbc) { struct bio *bio = NULL; int block_offset; /* block offset of mp within page */ @@ -550,10 +550,12 @@ static int metapage_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct blk_plug plug; + struct folio *folio = NULL; int err; blk_start_plug(&plug); - err = write_cache_pages(mapping, wbc, metapage_write_folio, NULL); + while ((folio = writeback_iter(mapping, wbc, folio, &err))) + err = metapage_write_folio(folio, wbc); blk_finish_plug(&plug); return err; @@ -813,7 +815,7 @@ static int metapage_write_one(struct folio *folio) if (folio_clear_dirty_for_io(folio)) { folio_get(folio); - ret = metapage_write_folio(folio, &wbc, NULL); + ret = metapage_write_folio(folio, &wbc); if (ret == 0) folio_wait_writeback(folio); folio_put(folio); diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 5ee618d17e77..28c3cf960c6f 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -49,26 +49,6 @@ #define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) -/* get page buffer for specified block address */ -/* ToDo: Replace this ugly macro with a function */ -#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ -do { \ - BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \ - if (!(RC)) { \ - if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \ - (le16_to_cpu((P)->header.nextindex) > \ - le16_to_cpu((P)->header.maxentry)) || \ - (le16_to_cpu((P)->header.maxentry) > \ - (((BN) == 0) ? 
XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \ - jfs_error((IP)->i_sb, \ - "XT_GETPAGE: xtree page corrupt\n"); \ - BT_PUTPAGE(MP); \ - MP = NULL; \ - RC = -EIO; \ - } \ - } \ -} while (0) - /* for consistency */ #define XT_PUTPAGE(MP) BT_PUTPAGE(MP) @@ -115,6 +95,42 @@ static int xtSplitRoot(tid_t tid, struct inode *ip, struct xtsplit * split, struct metapage ** rmpp); /* + * xt_getpage() + * + * function: get the page buffer for a specified block address. + * + * parameters: + * ip - pointer to the inode + * bn - block number (s64) of the xtree page to be retrieved; + * mp - pointer to a metapage pointer where the page buffer is returned; + * + * returns: + * A pointer to the xtree page (xtpage_t) on success, -EIO on error. + */ + +static inline xtpage_t *xt_getpage(struct inode *ip, s64 bn, struct metapage **mp) +{ + xtpage_t *p; + int rc; + + BT_GETPAGE(ip, bn, *mp, xtpage_t, PSIZE, p, rc, i_xtroot); + + if (rc) + return ERR_PTR(rc); + if ((le16_to_cpu(p->header.nextindex) < XTENTRYSTART) || + (le16_to_cpu(p->header.nextindex) > + le16_to_cpu(p->header.maxentry)) || + (le16_to_cpu(p->header.maxentry) > + ((bn == 0) ? 
XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { + jfs_error(ip->i_sb, "xt_getpage: xtree page corrupt\n"); + BT_PUTPAGE(*mp); + *mp = NULL; + return ERR_PTR(-EIO); + } + return p; +} + +/* * xtLookup() * * function: map a single page into a physical extent; @@ -216,7 +232,6 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, int *cmpp, struct btstack * btstack, int flag) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); - int rc = 0; int cmp = 1; /* init for empty page */ s64 bn; /* block number */ struct metapage *mp; /* page buffer */ @@ -252,9 +267,9 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, */ for (bn = 0;;) { /* get/pin the page to search */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); /* try sequential access heuristics with the previous * access entry in target leaf page: @@ -807,10 +822,10 @@ xtSplitUp(tid_t tid, * insert router entry in parent for new right child page <rp> */ /* get/pin the parent page <sp> */ - XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); - if (rc) { + sp = xt_getpage(ip, parent->bn, &smp); + if (IS_ERR(sp)) { XT_PUTPAGE(rcmp); - return rc; + return PTR_ERR(sp); } /* @@ -1062,10 +1077,10 @@ xtSplitPage(tid_t tid, struct inode *ip, * update previous pointer of old next/right page of <sp> */ if (nextbn != 0) { - XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); - if (rc) { + p = xt_getpage(ip, nextbn, &mp); + if (IS_ERR(p)) { XT_PUTPAGE(rmp); - goto clean_up; + return PTR_ERR(p); } BT_MARK_DIRTY(mp, ip); @@ -1417,9 +1432,9 @@ int xtExtend(tid_t tid, /* transaction id */ return rc; /* get back old page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); /* * if leaf root has been split, original root has been * copied to new child page, i.e., original entry now @@ -1433,9 +1448,9 @@ int xtExtend(tid_t tid, /* transaction id */ XT_PUTPAGE(mp); /* get new 
child page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { @@ -1711,9 +1726,9 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) return rc; /* get back old page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); /* * if leaf root has been split, original root has been * copied to new child page, i.e., original entry now @@ -1727,9 +1742,9 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) XT_PUTPAGE(mp); /* get new child page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { @@ -1788,9 +1803,9 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) XT_PUTPAGE(mp); /* get new right page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { @@ -1864,9 +1879,9 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p); return rc; /* get back old page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); /* * if leaf root has been split, original root has been @@ -1881,9 +1896,9 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p); XT_PUTPAGE(mp); /* get new child page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { @@ -2187,7 +2202,6 @@ void xtInitRoot(tid_t tid, struct inode *ip) */ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) { - int rc = 0; s64 teof; struct metapage *mp; xtpage_t *p; @@ -2268,9 
+2282,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) * first access of each page: */ getPage: - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); /* process entries backward from last index */ index = le16_to_cpu(p->header.nextindex) - 1; @@ -2506,9 +2520,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) /* get back the parent page */ bn = parent->bn; - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); index = parent->index; @@ -2791,9 +2805,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) * first access of each page: */ getPage: - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); /* process entries backward from last index */ index = le16_to_cpu(p->header.nextindex) - 1; @@ -2836,9 +2850,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) /* get back the parent page */ bn = parent->bn; - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); index = parent->index; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 374fc6b34c79..cf1d720b8251 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -720,7 +720,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate || - wbc->for_background || wbc->for_sync || wbc->for_reclaim) { + wbc->for_background || wbc->for_sync) { ioc = nfs_io_completion_alloc(GFP_KERNEL); if (ioc) nfs_io_completion_init(ioc, nfs_io_completion_commit, diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 3083643b864b..bfe884d624e7 100644 --- a/fs/notify/fanotify/fanotify.c +++ 
b/fs/notify/fanotify/fanotify.c @@ -454,7 +454,13 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, dwords = fh_len >> 2; type = exportfs_encode_fid(inode, buf, &dwords); err = -EINVAL; - if (type <= 0 || type == FILEID_INVALID || fh_len != dwords << 2) + /* + * Unlike file_handle, type and len of struct fanotify_fh are u8. + * Traditionally, filesystems return handle_type < 0xff, but there + * is no enforcement for that in vfs. + */ + BUILD_BUG_ON(MAX_HANDLE_SZ > 0xff || FILEID_INVALID > 0xff); + if (type <= 0 || type >= FILEID_INVALID || fh_len != dwords << 2) goto out_err; fh->type = type; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index e2b4f17a48bb..079b868552c2 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -199,8 +199,8 @@ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, } /* Are there any inode/mount/sb objects that watch for these events? */ -static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, - __u32 mask) +static inline __u32 fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, + __u32 mask) { __u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask | READ_ONCE(inode->i_sb->s_fsnotify_mask); @@ -656,20 +656,20 @@ EXPORT_SYMBOL_GPL(fsnotify); #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* - * At open time we check fsnotify_sb_has_priority_watchers() and set the - * FMODE_NONOTIFY_ mode bits accordignly. + * At open time we check fsnotify_sb_has_priority_watchers(), call the open perm + * hook and set the FMODE_NONOTIFY_ mode bits accordingly. * Later, fsnotify permission hooks do not check if there are permission event * watches, but that there were permission event watches at open time.
*/ -void file_set_fsnotify_mode_from_watchers(struct file *file) +int fsnotify_open_perm_and_set_mode(struct file *file) { struct dentry *dentry = file->f_path.dentry, *parent; struct super_block *sb = dentry->d_sb; - __u32 mnt_mask, p_mask; + __u32 mnt_mask, p_mask = 0; /* Is it a file opened by fanotify? */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) - return; + return 0; /* * Permission events is a super set of pre-content events, so if there @@ -679,45 +679,64 @@ void file_set_fsnotify_mode_from_watchers(struct file *file) if (likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT))) { file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); - return; + return 0; } /* - * If there are permission event watchers but no pre-content event - * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. + * OK, there are some permission event watchers. Check if anybody is + * watching for permission events on *this* file. */ - if ((!d_is_dir(dentry) && !d_is_reg(dentry)) || - likely(!fsnotify_sb_has_priority_watchers(sb, - FSNOTIFY_PRIO_PRE_CONTENT))) { - file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); - return; + mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); + p_mask = fsnotify_object_watched(d_inode(dentry), mnt_mask, + ALL_FSNOTIFY_PERM_EVENTS); + if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { + parent = dget_parent(dentry); + p_mask |= fsnotify_inode_watches_children(d_inode(parent)); + dput(parent); } /* - * OK, there are some pre-content watchers. Check if anybody is - * watching for pre-content events on *this* file. + * Legacy FAN_ACCESS_PERM events have very high performance overhead, + * so unlikely to be used in the wild. If they are used there will be + * no optimizations at all. 
*/ - mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); - if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask, - FSNOTIFY_PRE_CONTENT_EVENTS))) { - /* Enable pre-content events */ + if (unlikely(p_mask & FS_ACCESS_PERM)) { + /* Enable all permission and pre-content events */ file_set_fsnotify_mode(file, 0); - return; + goto open_perm; } - /* Is parent watching for pre-content events on this file? */ - if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { - parent = dget_parent(dentry); - p_mask = fsnotify_inode_watches_children(d_inode(parent)); - dput(parent); - if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) { - /* Enable pre-content events */ - file_set_fsnotify_mode(file, 0); - return; - } + /* + * Pre-content events are only supported on regular files. + * If there are pre-content event watchers and no permission access + * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. + * That is the common case with HSM service. + */ + if (d_is_reg(dentry) && (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)) { + file_set_fsnotify_mode(file, FMODE_NONOTIFY | + FMODE_NONOTIFY_PERM); + goto open_perm; } - /* Nobody watching for pre-content events from this file */ - file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); + + /* Nobody watching permission and pre-content events on this file */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); + +open_perm: + /* + * Send open perm events depending on object masks and regardless of + * FMODE_NONOTIFY_PERM. 
+ */ + if (file->f_flags & __FMODE_EXEC && p_mask & FS_OPEN_EXEC_PERM) { + int ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM); + + if (ret) + return ret; + } + + if (p_mask & FS_OPEN_PERM) + return fsnotify_path(&file->f_path, FS_OPEN_PERM); + + return 0; } #endif diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 2203438738f6..76c86f1c2b1c 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1071,6 +1071,7 @@ static int ocfs2_grab_folios_for_write(struct address_space *mapping, if (IS_ERR(wc->w_folios[i])) { ret = PTR_ERR(wc->w_folios[i]); mlog_errno(ret); + wc->w_folios[i] = NULL; goto out; } } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 7799f4d16ce9..8c9c4825f984 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -798,6 +798,14 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, } } + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ret = ocfs2_error(inode->i_sb, + "Inode %lu has empty extent list at depth %u\n", + inode->i_ino, + le16_to_cpu(el->l_tree_depth)); + goto out; + } + found = 0; for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { rec = &el->l_recs[i]; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 67fc62a49a76..00f52812dbb0 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2632,7 +2632,7 @@ again: dlm_reco_master_ready(dlm), msecs_to_jiffies(1000)); if (!dlm_reco_master_ready(dlm)) { - mlog(0, "%s: reco master taking awhile\n", + mlog(0, "%s: reco master taking a while\n", dlm->name); goto again; } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 12e5d1f73325..14bf440ea4df 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -50,8 +50,6 @@ struct ocfs2_find_inode_args unsigned int fi_sysfile_type; }; -static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; - static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_find_inode_args *args); static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); @@ -250,14 +248,77 
@@ bail: static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) { struct ocfs2_find_inode_args *args = opaque; +#ifdef CONFIG_LOCKDEP + static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, ocfs2_file_ip_alloc_sem_key; +#endif inode->i_ino = args->fi_ino; OCFS2_I(inode)->ip_blkno = args->fi_blkno; - if (args->fi_sysfile_type != 0) +#ifdef CONFIG_LOCKDEP + switch (args->fi_sysfile_type) { + case BAD_BLOCK_SYSTEM_INODE: + break; + case GLOBAL_INODE_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GLOBAL_INODE_ALLOC_SYSTEM_INODE]); + break; + case SLOT_MAP_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[SLOT_MAP_SYSTEM_INODE]); + break; + case HEARTBEAT_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[HEARTBEAT_SYSTEM_INODE]); + break; + case GLOBAL_BITMAP_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GLOBAL_BITMAP_SYSTEM_INODE]); + break; + case USER_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[USER_QUOTA_SYSTEM_INODE]); + break; + case GROUP_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GROUP_QUOTA_SYSTEM_INODE]); + break; + case ORPHAN_DIR_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[ORPHAN_DIR_SYSTEM_INODE]); + break; + case EXTENT_ALLOC_SYSTEM_INODE: lockdep_set_class(&inode->i_rwsem, - &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); + &ocfs2_sysfile_lock_key[EXTENT_ALLOC_SYSTEM_INODE]); + break; + case INODE_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[INODE_ALLOC_SYSTEM_INODE]); + break; + case JOURNAL_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[JOURNAL_SYSTEM_INODE]); + break; + case LOCAL_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + 
&ocfs2_sysfile_lock_key[LOCAL_ALLOC_SYSTEM_INODE]); + break; + case TRUNCATE_LOG_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[TRUNCATE_LOG_SYSTEM_INODE]); + break; + case LOCAL_USER_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_USER_QUOTA_SYSTEM_INODE]); + break; + case LOCAL_GROUP_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_GROUP_QUOTA_SYSTEM_INODE]); + break; + default: + WARN_ONCE(1, "Unknown sysfile type %d\n", args->fi_sysfile_type); + } if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE || @@ -267,6 +328,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) else lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, &ocfs2_file_ip_alloc_sem_key); +#endif return 0; } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 369c7d27befd..cbe2f8ed8897 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -617,6 +617,8 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, */ credits += OCFS2_INODE_UPDATE_CREDITS + 1; + inode_lock(tl_inode); + /* * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() * logic, while we still need to lock the global_bitmap. 
@@ -626,7 +628,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, if (!gb_inode) { mlog(ML_ERROR, "unable to get global_bitmap inode\n"); ret = -EIO; - goto out; + goto out_unlock_tl_inode; } inode_lock(gb_inode); @@ -634,16 +636,14 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); if (ret) { mlog_errno(ret); - goto out_unlock_gb_mutex; + goto out_unlock_gb_inode; } - inode_lock(tl_inode); - handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_unlock_tl_inode; + goto out_unlock; } new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); @@ -703,15 +703,14 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, out_commit: ocfs2_commit_trans(osb, handle); brelse(gd_bh); - -out_unlock_tl_inode: - inode_unlock(tl_inode); - +out_unlock: ocfs2_inode_unlock(gb_inode, 1); -out_unlock_gb_mutex: +out_unlock_gb_inode: inode_unlock(gb_inode); brelse(gb_bh); iput(gb_inode); +out_unlock_tl_inode: + inode_unlock(tl_inode); out: if (context->meta_ac) { diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 99278c8f0e24..c90b254da75e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -142,6 +142,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, bail_add: ret = d_splice_alias(inode, dentry); + if (IS_ERR(ret)) + goto bail_unlock; if (inode) { /* @@ -154,15 +156,16 @@ bail_add: * NOTE: This dentry already has ->d_op set from * ocfs2_get_parent() and ocfs2_get_dentry() */ - if (!IS_ERR_OR_NULL(ret)) + if (ret) dentry = ret; status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); if (status) { mlog_errno(status); + if (ret) + dput(ret); ret = ERR_PTR(status); - goto bail_unlock; } } else ocfs2_dentry_attach_gen(dentry); @@ -1452,8 +1455,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap, newfe = (struct ocfs2_dinode *) newfe_bh->b_data; 
trace_ocfs2_rename_over_existing( - (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? - (unsigned long long)newfe_bh->b_blocknr : 0ULL); + (unsigned long long)newfe_blkno, newfe_bh, + (unsigned long long)newfe_bh->b_blocknr); if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 77edcd70f72c..0f045e45fa0c 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -360,7 +360,6 @@ static int ocfs2_control_do_setnode_msg(struct file *file, struct ocfs2_control_message_setn *msg) { long nodenum; - char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; if (ocfs2_control_get_handshake_state(file) != @@ -375,8 +374,7 @@ static int ocfs2_control_do_setnode_msg(struct file *file, return -EINVAL; msg->space = msg->newline = '\0'; - nodenum = simple_strtol(msg->nodestr, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->nodestr, 16, &nodenum)) return -EINVAL; if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || @@ -391,7 +389,6 @@ static int ocfs2_control_do_setversion_msg(struct file *file, struct ocfs2_control_message_setv *msg) { long major, minor; - char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; struct ocfs2_protocol_version *max = &ocfs2_user_plugin.sp_max_proto; @@ -409,11 +406,9 @@ static int ocfs2_control_do_setversion_msg(struct file *file, return -EINVAL; msg->space1 = msg->space2 = msg->newline = '\0'; - major = simple_strtol(msg->major, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->major, 16, &major)) return -EINVAL; - minor = simple_strtol(msg->minor, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->minor, 16, &minor)) return -EINVAL; /* @@ -441,7 +436,6 @@ static int ocfs2_control_do_down_msg(struct file *file, struct ocfs2_control_message_down *msg) { long nodenum; - char *p = NULL; if (ocfs2_control_get_handshake_state(file) != OCFS2_CONTROL_HANDSHAKE_VALID) @@ -456,8 +450,7 @@ 
static int ocfs2_control_do_down_msg(struct file *file, return -EINVAL; msg->space1 = msg->space2 = msg->newline = '\0'; - nodenum = simple_strtol(msg->nodestr, &p, 16); - if (!p || *p) + if (kstrtol(msg->nodestr, 16, &nodenum)) return -EINVAL; if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || diff --git a/fs/open.c b/fs/open.c index feb21e8cf5ff..9655158c3885 100644 --- a/fs/open.c +++ b/fs/open.c @@ -944,12 +944,12 @@ static int do_dentry_open(struct file *f, goto cleanup_all; /* - * Set FMODE_NONOTIFY_* bits according to existing permission watches. + * Call fsnotify open permission hook and set FMODE_NONOTIFY_* bits + * according to existing permission watches. * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a * pseudo file, this call will not change the mode. */ - file_set_fsnotify_mode_from_watchers(f); - error = fsnotify_open_perm(f); + error = fsnotify_open_perm_and_set_mode(f); if (error) goto cleanup_all; diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index f7095c91660c..1c375fb65018 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -396,7 +396,7 @@ static ssize_t orangefs_debug_read(struct file *file, goto out; mutex_lock(&orangefs_debug_lock); - sprintf_ret = sprintf(buf, "%s", (char *)file->private_data); + sprintf_ret = scnprintf(buf, ORANGEFS_MAX_DEBUG_STRING_LEN, "%s", (char *)file->private_data); mutex_unlock(&orangefs_debug_lock); read_ret = simple_read_from_buffer(ubuf, count, ppos, buf, sprintf_ret); @@ -769,8 +769,8 @@ static void do_k_string(void *k_mask, int index) if (*mask & s_kmod_keyword_mask_map[index].mask_val) { if ((strlen(kernel_debug_string) + - strlen(s_kmod_keyword_mask_map[index].keyword)) - < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) { + strlen(s_kmod_keyword_mask_map[index].keyword) + 1) + < ORANGEFS_MAX_DEBUG_STRING_LEN) { strcat(kernel_debug_string, s_kmod_keyword_mask_map[index].keyword); strcat(kernel_debug_string, ","); @@ -797,7 +797,7 @@ 
static void do_c_string(void *c_mask, int index) (mask->mask2 & cdm_array[index].mask2)) { if ((strlen(client_debug_string) + strlen(cdm_array[index].keyword) + 1) - < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) { + < ORANGEFS_MAX_DEBUG_STRING_LEN) { strcat(client_debug_string, cdm_array[index].keyword); strcat(client_debug_string, ","); diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index 04e15dfa504a..369455b354ef 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -217,36 +217,31 @@ static ssize_t sysfs_int_show(struct kobject *kobj, if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { if (!strcmp(attr->attr.name, "op_timeout_secs")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%d\n", op_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%d\n", slot_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "cache_timeout_msecs")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%d\n", orangefs_cache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%d\n", orangefs_dcache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%d\n", orangefs_getattr_timeout_msecs); goto out; @@ -256,14 +251,12 @@ static ssize_t sysfs_int_show(struct kobject *kobj, } else if (!strcmp(kobj->name, STATS_KOBJ_ID)) { if (!strcmp(attr->attr.name, "reads")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%lu\n", orangefs_stats.reads); goto out; } else if (!strcmp(attr->attr.name, "writes")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%lu\n", orangefs_stats.writes); goto out; @@ -497,19 +490,18 @@ out: if (strcmp(kobj->name, PC_KOBJ_ID)) { if (new_op->upcall.req.param.op == 
ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) { - rc = scnprintf(buf, PAGE_SIZE, "%d %d\n", + rc = sysfs_emit(buf, "%d %d\n", (int)new_op->downcall.resp.param.u. value32[0], (int)new_op->downcall.resp.param.u. value32[1]); } else { - rc = scnprintf(buf, PAGE_SIZE, "%d\n", + rc = sysfs_emit(buf, "%d\n", (int)new_op->downcall.resp.param.u.value64); } } else { - rc = scnprintf( + rc = sysfs_emit( buf, - PAGE_SIZE, "%s", new_op->downcall.resp.perf_count.buffer); } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5635453cd476..76e800e38c8f 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -571,6 +571,8 @@ static void pde_set_flags(struct proc_dir_entry *pde) if (pde->proc_ops->proc_compat_ioctl) pde->flags |= PROC_ENTRY_proc_compat_ioctl; #endif + if (pde->proc_ops->proc_lseek) + pde->flags |= PROC_ENTRY_proc_lseek; } struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 3604b616311c..129490151be1 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -473,7 +473,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) typeof_member(struct proc_ops, proc_open) open; struct pde_opener *pdeo; - if (!pde->proc_ops->proc_lseek) + if (!pde_has_proc_lseek(pde)) file->f_mode &= ~FMODE_LSEEK; if (pde_is_permanent(pde)) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 520c4742101d..e737401d7383 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -98,6 +98,11 @@ static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde) #endif } +static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_proc_lseek; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); @@ -378,6 +383,11 @@ struct proc_maps_private { struct task_struct *task; struct mm_struct *mm; struct vma_iterator iter; + loff_t last_pos; +#ifdef CONFIG_PER_VMA_LOCK + bool mmap_locked; + struct 
vm_area_struct *locked_vma; +#endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index bc2bc60c36cc..a458f1e112fd 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -121,8 +121,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", 0); - show_val_kb(m, "WritebackTmp: ", - global_node_page_state(NR_WRITEBACK_TEMP)); + show_val_kb(m, "WritebackTmp: ", 0); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", diff --git a/fs/proc/page.c b/fs/proc/page.c index 999af26c7298..ba3568e97fd1 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -43,6 +43,22 @@ static inline unsigned long get_max_dump_pfn(void) #endif } +static u64 get_kpage_count(const struct page *page) +{ + struct page_snapshot ps; + u64 ret; + + snapshot_page(&ps, page); + + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + ret = folio_precise_page_mapcount(&ps.folio_snapshot, + &ps.page_snapshot); + else + ret = folio_average_page_mapcount(&ps.folio_snapshot); + + return ret; +} + static ssize_t kpage_read(struct file *file, char __user *buf, size_t count, loff_t *ppos, enum kpage_operation op) @@ -75,10 +91,7 @@ static ssize_t kpage_read(struct file *file, char __user *buf, info = stable_page_flags(page); break; case KPAGE_COUNT: - if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) - info = folio_precise_page_mapcount(page_folio(page), page); - else - info = folio_average_page_mapcount(page_folio(page)); + info = get_kpage_count(page); break; case KPAGE_CGROUP: info = page_cgroup_ino(page); @@ -134,6 +147,7 @@ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) u64 stable_page_flags(const struct page *page) { const struct folio *folio; + struct page_snapshot ps; unsigned long k; unsigned long mapping; bool is_anon; @@ -145,20 +159,22 @@ u64 stable_page_flags(const struct page *page) */ if 
(!page) return 1 << KPF_NOPAGE; - folio = page_folio(page); + + snapshot_page(&ps, page); + folio = &ps.folio_snapshot; k = folio->flags; mapping = (unsigned long)folio->mapping; - is_anon = mapping & PAGE_MAPPING_ANON; + is_anon = mapping & FOLIO_MAPPING_ANON; /* * pseudo flags for the well known (anonymous) memory mapped pages */ - if (page_mapped(page)) + if (folio_mapped(folio)) u |= 1 << KPF_MMAP; if (is_anon) { u |= 1 << KPF_ANON; - if (mapping & PAGE_MAPPING_KSM) + if (mapping & FOLIO_MAPPING_KSM) u |= 1 << KPF_KSM; } @@ -166,7 +182,7 @@ u64 stable_page_flags(const struct page *page) * compound pages: export both head/tail info * they together define a compound page's start/end pos and order */ - if (page == &folio->page) + if (ps.idx == 0) u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head); else u |= 1 << KPF_COMPOUND_TAIL; @@ -176,25 +192,19 @@ u64 stable_page_flags(const struct page *page) folio_test_large_rmappable(folio)) { /* Note: we indicate any THPs here, not just PMD-sized ones */ u |= 1 << KPF_THP; - } else if (is_huge_zero_folio(folio)) { + } else if (is_huge_zero_pfn(ps.pfn)) { u |= 1 << KPF_ZERO_PAGE; u |= 1 << KPF_THP; - } else if (is_zero_folio(folio)) { + } else if (is_zero_pfn(ps.pfn)) { u |= 1 << KPF_ZERO_PAGE; } - /* - * Caveats on high order pages: PG_buddy and PG_slab will only be set - * on the head page. 
- */ - if (PageBuddy(page)) - u |= 1 << KPF_BUDDY; - else if (page_count(page) == 0 && is_free_buddy_page(page)) + if (ps.flags & PAGE_SNAPSHOT_PG_BUDDY) u |= 1 << KPF_BUDDY; - if (PageOffline(page)) + if (folio_test_offline(folio)) u |= 1 << KPF_OFFLINE; - if (PageTable(page)) + if (folio_test_pgtable(folio)) u |= 1 << KPF_PGTABLE; if (folio_test_slab(folio)) u |= 1 << KPF_SLAB; @@ -202,7 +212,7 @@ u64 stable_page_flags(const struct page *page) #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) u |= kpf_copy_bit(k, KPF_IDLE, PG_idle); #else - if (folio_test_idle(folio)) + if (ps.flags & PAGE_SNAPSHOT_PG_IDLE) u |= 1 << KPF_IDLE; #endif @@ -228,7 +238,7 @@ u64 stable_page_flags(const struct page *page) if (u & (1 << KPF_HUGE)) u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); else - u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison); + u |= kpf_copy_bit(ps.page_snapshot.flags, KPF_HWPOISON, PG_hwpoison); #endif u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 751479eb128f..3d6d8a9f13fc 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -29,6 +29,9 @@ #include <asm/tlbflush.h> #include "internal.h" +#define SENTINEL_VMA_END -1 +#define SENTINEL_VMA_GATE -2 + #define SEQ_PUT_DEC(str, val) \ seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) void task_mem(struct seq_file *m, struct mm_struct *mm) @@ -127,15 +130,134 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, - loff_t *ppos) +#ifdef CONFIG_PER_VMA_LOCK + +static void unlock_vma(struct proc_maps_private *priv) +{ + if (priv->locked_vma) { + vma_end_read(priv->locked_vma); + priv->locked_vma = NULL; + } +} + +static const struct seq_operations proc_pid_maps_op; + +static inline bool lock_vma_range(struct seq_file *m, + struct proc_maps_private *priv) +{ + /* + * smaps and numa_maps perform page table walk, therefore 
require + * mmap_lock but maps can be read with locking just the vma and + * walking the vma tree under rcu read protection. + */ + if (m->op != &proc_pid_maps_op) { + if (mmap_read_lock_killable(priv->mm)) + return false; + + priv->mmap_locked = true; + } else { + rcu_read_lock(); + priv->locked_vma = NULL; + priv->mmap_locked = false; + } + + return true; +} + +static inline void unlock_vma_range(struct proc_maps_private *priv) +{ + if (priv->mmap_locked) { + mmap_read_unlock(priv->mm); + } else { + unlock_vma(priv); + rcu_read_unlock(); + } +} + +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, + loff_t last_pos) +{ + struct vm_area_struct *vma; + + if (priv->mmap_locked) + return vma_next(&priv->iter); + + unlock_vma(priv); + vma = lock_next_vma(priv->mm, &priv->iter, last_pos); + if (!IS_ERR_OR_NULL(vma)) + priv->locked_vma = vma; + + return vma; +} + +static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, + loff_t pos) +{ + if (priv->mmap_locked) + return false; + + rcu_read_unlock(); + mmap_read_lock(priv->mm); + /* Reinitialize the iterator after taking mmap_lock */ + vma_iter_set(&priv->iter, pos); + priv->mmap_locked = true; + + return true; +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline bool lock_vma_range(struct seq_file *m, + struct proc_maps_private *priv) +{ + return mmap_read_lock_killable(priv->mm) == 0; +} + +static inline void unlock_vma_range(struct proc_maps_private *priv) +{ + mmap_read_unlock(priv->mm); +} + +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, + loff_t last_pos) +{ + return vma_next(&priv->iter); +} + +static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, + loff_t pos) { - struct vm_area_struct *vma = vma_next(&priv->iter); + return false; +} + +#endif /* CONFIG_PER_VMA_LOCK */ + +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct 
*vma; + +retry: + vma = get_next_vma(priv, *ppos); + /* EINTR of EAGAIN is possible */ + if (IS_ERR(vma)) { + if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos)) + goto retry; + return vma; + } + + /* Store previous position to be able to restart if needed */ + priv->last_pos = *ppos; if (vma) { - *ppos = vma->vm_start; + /* + * Track the end of the reported vma to ensure position changes + * even if previous vma was merged with the next vma and we + * found the extended vma with the same vm_start. + */ + *ppos = vma->vm_end; } else { - *ppos = -2UL; + *ppos = SENTINEL_VMA_GATE; vma = get_gate_vma(priv->mm); } @@ -145,11 +267,11 @@ static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; - unsigned long last_addr = *ppos; + loff_t last_addr = *ppos; struct mm_struct *mm; /* See m_next(). Zero at the start or after lseek. */ - if (last_addr == -1UL) + if (last_addr == SENTINEL_VMA_END) return NULL; priv->task = get_proc_task(priv->inode); @@ -163,28 +285,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos) return NULL; } - if (mmap_read_lock_killable(mm)) { + if (!lock_vma_range(m, priv)) { mmput(mm); put_task_struct(priv->task); priv->task = NULL; return ERR_PTR(-EINTR); } - vma_iter_init(&priv->iter, mm, last_addr); + /* + * Reset current position if last_addr was set before + * and it's not a sentinel. 
+ */ + if (last_addr > 0) + *ppos = last_addr = priv->last_pos; + vma_iter_init(&priv->iter, mm, (unsigned long)last_addr); hold_task_mempolicy(priv); - if (last_addr == -2UL) + if (last_addr == SENTINEL_VMA_GATE) return get_gate_vma(mm); - return proc_get_vma(priv, ppos); + return proc_get_vma(m, ppos); } static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { - if (*ppos == -2UL) { - *ppos = -1UL; + if (*ppos == SENTINEL_VMA_GATE) { + *ppos = SENTINEL_VMA_END; return NULL; } - return proc_get_vma(m->private, ppos); + return proc_get_vma(m, ppos); } static void m_stop(struct seq_file *m, void *v) @@ -196,7 +324,7 @@ static void m_stop(struct seq_file *m, void *v) return; release_task_mempolicy(priv); - mmap_read_unlock(mm); + unlock_vma_range(priv); mmput(mm); put_task_struct(priv->task); priv->task = NULL; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 10d01eb09c43..f188bd900eb2 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1490,10 +1490,8 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) return -EINVAL; dump = vzalloc(sizeof(*dump)); - if (!dump) { - ret = -ENOMEM; - goto out_err; - } + if (!dump) + return -ENOMEM; /* Keep size of the buffer page aligned so that it can be mmaped */ data_size = roundup(sizeof(struct vmcoredd_header) + data->size, @@ -1519,22 +1517,19 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) dump->size = data_size; /* Add the dump to driver sysfs list and update the elfcore hdr */ - mutex_lock(&vmcore_mutex); - if (vmcore_opened) - pr_warn_once("Unexpected adding of device dump\n"); - if (vmcore_open) { - ret = -EBUSY; - goto unlock; - } + scoped_guard(mutex, &vmcore_mutex) { + if (vmcore_opened) + pr_warn_once("Unexpected adding of device dump\n"); + if (vmcore_open) { + ret = -EBUSY; + goto out_err; + } - list_add_tail(&dump->list, &vmcoredd_list); - vmcoredd_update_size(data_size); - mutex_unlock(&vmcore_mutex); + list_add_tail(&dump->list, &vmcoredd_list); + vmcoredd_update_size(data_size); 
+ } return 0; -unlock: - mutex_unlock(&vmcore_mutex); - out_err: vfree(buf); vfree(dump); diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 368e870624da..b69daeb1301b 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -195,6 +195,7 @@ replay_again: * from @cfids->entries. Caller will put last reference if the latter. */ if (cfid->has_lease && cfid->time) { + cfid->last_access_time = jiffies; spin_unlock(&cfids->cfid_list_lock); *ret_cfid = cfid; kfree(utf16_path); @@ -363,6 +364,7 @@ replay_again: cfid->file_all_info_is_valid = true; cfid->time = jiffies; + cfid->last_access_time = jiffies; spin_unlock(&cfids->cfid_list_lock); /* At this point the directory handle is fully cached */ rc = 0; @@ -617,7 +619,7 @@ static void cached_dir_put_work(struct work_struct *work) queue_work(serverclose_wq, &cfid->close_work); } -int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]) +bool cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]) { struct cached_fids *cfids = tcon->cfids; struct cached_fid *cfid; @@ -730,8 +732,8 @@ static void cfids_laundromat_worker(struct work_struct *work) spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { - if (cfid->time && - time_after(jiffies, cfid->time + HZ * dir_cache_timeout)) { + if (cfid->last_access_time && + time_after(jiffies, cfid->last_access_time + HZ * dir_cache_timeout)) { cfid->on_list = false; list_move(&cfid->entry, &entry); cfids->num_entries--; diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h index a28f7cae3caa..46b5a2fdf15b 100644 --- a/fs/smb/client/cached_dir.h +++ b/fs/smb/client/cached_dir.h @@ -14,7 +14,6 @@ struct cached_dirent { char *name; int namelen; loff_t pos; - struct cifs_fattr fattr; }; @@ -39,6 +38,7 @@ struct cached_fid { bool on_list:1; bool file_all_info_is_valid:1; unsigned long time; /* jiffies of when lease was taken */ + unsigned long last_access_time; /* 
jiffies of when last accessed */ struct kref refcount; struct cifs_fid fid; spinlock_t fid_lock; @@ -80,6 +80,6 @@ extern void drop_cached_dir_by_name(const unsigned int xid, struct cifs_sb_info *cifs_sb); extern void close_all_cached_dirs(struct cifs_sb_info *cifs_sb); extern void invalidate_all_cached_dirs(struct cifs_tcon *tcon); -extern int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]); +extern bool cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]); #endif /* _CACHED_DIR_H */ diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 3fdf75737d43..f1cea365b6f1 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -26,6 +26,7 @@ #include "smbdirect.h" #endif #include "cifs_swn.h" +#include "cached_dir.h" void cifs_dump_mem(char *label, void *data, int length) @@ -280,6 +281,54 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) return 0; } +static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v) +{ + struct list_head *stmp, *tmp, *tmp1; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct cached_fids *cfids; + struct cached_fid *cfid; + LIST_HEAD(entry); + + seq_puts(m, "# Version:1\n"); + seq_puts(m, "# Format:\n"); + seq_puts(m, "# <tree id> <sess id> <persistent fid> <path>\n"); + + spin_lock(&cifs_tcp_ses_lock); + list_for_each(stmp, &cifs_tcp_ses_list) { + server = list_entry(stmp, struct TCP_Server_Info, + tcp_ses_list); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp1, &ses->tcon_list) { + tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + cfids = tcon->cfids; + spin_lock(&cfids->cfid_list_lock); /* check lock ordering */ + seq_printf(m, "Num entries: %d\n", cfids->num_entries); + list_for_each_entry(cfid, &cfids->entries, entry) { + seq_printf(m, "0x%x 0x%llx 0x%llx %s", + tcon->tid, + ses->Suid, + cfid->fid.persistent_fid, + 
cfid->path); + if (cfid->file_all_info_is_valid) + seq_printf(m, "\tvalid file info"); + if (cfid->dirents.is_valid) + seq_printf(m, ", valid dirents"); + seq_printf(m, "\n"); + } + spin_unlock(&cfids->cfid_list_lock); + + + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + seq_putc(m, '\n'); + return 0; +} + static __always_inline const char *compression_alg_str(__le16 alg) { switch (alg) { @@ -863,6 +912,9 @@ cifs_proc_init(void) proc_create_single("open_files", 0400, proc_fs_cifs, cifs_debug_files_proc_show); + proc_create_single("open_dirs", 0400, proc_fs_cifs, + cifs_debug_dirs_proc_show); + proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_ops); proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_ops); proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_ops); @@ -907,6 +959,7 @@ cifs_proc_clean(void) remove_proc_entry("DebugData", proc_fs_cifs); remove_proc_entry("open_files", proc_fs_cifs); + remove_proc_entry("open_dirs", proc_fs_cifs); remove_proc_entry("cifsFYI", proc_fs_cifs); remove_proc_entry("traceSMB", proc_fs_cifs); remove_proc_entry("Stats", proc_fs_cifs); diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index 35892df7335c..3cc686246908 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -343,7 +343,7 @@ static struct ntlmssp2_name *find_next_av(struct cifs_ses *ses, len = AV_LEN(av); if (AV_TYPE(av) == NTLMSSP_AV_EOL) return NULL; - if (!len || (u8 *)av + sizeof(*av) + len > end) + if ((u8 *)av + sizeof(*av) + len > end) return NULL; return av; } @@ -363,7 +363,7 @@ static int find_av_name(struct cifs_ses *ses, u16 type, char **name, u16 maxlen) av_for_each_entry(ses, av) { len = AV_LEN(av); - if (AV_TYPE(av) != type) + if (AV_TYPE(av) != type || !len) continue; if (!IS_ALIGNED(len, sizeof(__le16))) { cifs_dbg(VFS | ONCE, "%s: bad length(%u) for type %u\n", @@ -532,17 +532,67 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_ return rc; } +/* + * Set 
up NTLMv2 response blob with SPN (cifs/<hostname>) appended to the + * existing list of AV pairs. + */ +static int set_auth_key_response(struct cifs_ses *ses) +{ + size_t baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp); + size_t len, spnlen, tilen = 0, num_avs = 2 /* SPN + EOL */; + struct TCP_Server_Info *server = ses->server; + char *spn __free(kfree) = NULL; + struct ntlmssp2_name *av; + char *rsp = NULL; + int rc; + + spnlen = strlen(server->hostname); + len = sizeof("cifs/") + spnlen; + spn = kmalloc(len, GFP_KERNEL); + if (!spn) { + rc = -ENOMEM; + goto out; + } + + spnlen = scnprintf(spn, len, "cifs/%.*s", + (int)spnlen, server->hostname); + + av_for_each_entry(ses, av) + tilen += sizeof(*av) + AV_LEN(av); + + len = baselen + tilen + spnlen * sizeof(__le16) + num_avs * sizeof(*av); + rsp = kmalloc(len, GFP_KERNEL); + if (!rsp) { + rc = -ENOMEM; + goto out; + } + + memcpy(rsp + baselen, ses->auth_key.response, tilen); + av = (void *)(rsp + baselen + tilen); + av->type = cpu_to_le16(NTLMSSP_AV_TARGET_NAME); + av->length = cpu_to_le16(spnlen * sizeof(__le16)); + cifs_strtoUTF16((__le16 *)av->data, spn, spnlen, ses->local_nls); + av = (void *)((__u8 *)av + sizeof(*av) + AV_LEN(av)); + av->type = cpu_to_le16(NTLMSSP_AV_EOL); + av->length = 0; + + rc = 0; + ses->auth_key.len = len; +out: + ses->auth_key.response = rsp; + return rc; +} + int setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) { struct shash_desc *hmacmd5 = NULL; - int rc; - int baselen; - unsigned int tilen; + unsigned char *tiblob = NULL; /* target info blob */ struct ntlmv2_resp *ntlmv2; char ntlmv2_hash[16]; - unsigned char *tiblob = NULL; /* target info blob */ __le64 rsp_timestamp; + __u64 cc; + int rc; if (nls_cp == NULL) { cifs_dbg(VFS, "%s called with nls_cp==NULL\n", __func__); @@ -588,32 +638,25 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) * (as Windows 7 does) */ rsp_timestamp = find_timestamp(ses); + get_random_bytes(&cc, 
sizeof(cc)); - baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp); - tilen = ses->auth_key.len; - tiblob = ses->auth_key.response; + cifs_server_lock(ses->server); - ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL); - if (!ses->auth_key.response) { - rc = -ENOMEM; + tiblob = ses->auth_key.response; + rc = set_auth_key_response(ses); + if (rc) { ses->auth_key.len = 0; - goto setup_ntlmv2_rsp_ret; + goto unlock; } - ses->auth_key.len += baselen; ntlmv2 = (struct ntlmv2_resp *) (ses->auth_key.response + CIFS_SESS_KEY_SIZE); ntlmv2->blob_signature = cpu_to_le32(0x00000101); ntlmv2->reserved = 0; ntlmv2->time = rsp_timestamp; - - get_random_bytes(&ntlmv2->client_chal, sizeof(ntlmv2->client_chal)); + ntlmv2->client_chal = cc; ntlmv2->reserved2 = 0; - memcpy(ses->auth_key.response + baselen, tiblob, tilen); - - cifs_server_lock(ses->server); - rc = cifs_alloc_hash("hmac(md5)", &hmacmd5); if (rc) { cifs_dbg(VFS, "Could not allocate HMAC-MD5, rc=%d\n", rc); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 89160bc34d35..19dd901fe8ab 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -627,12 +627,14 @@ struct smb_version_operations { bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); struct reparse_data_buffer * (*get_reparse_point_buffer)(const struct kvec *rsp_iov, u32 *plen); - int (*create_reparse_symlink)(const unsigned int xid, - struct inode *inode, - struct dentry *dentry, - struct cifs_tcon *tcon, - const char *full_path, - const char *symname); + struct inode * (*create_reparse_inode)(struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid, + struct cifs_tcon *tcon, + const char *full_path, + bool directory, + struct kvec *reparse_iov, + struct kvec *xattr_iov); }; struct smb_version_values { diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 045227ed4efc..40ec0634377f 100644 --- a/fs/smb/client/cifsproto.h +++ 
b/fs/smb/client/cifsproto.h @@ -483,6 +483,14 @@ extern int cifs_query_reparse_point(const unsigned int xid, const char *full_path, u32 *tag, struct kvec *rsp, int *rsp_buftype); +extern struct inode *cifs_create_reparse_inode(struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid, + struct cifs_tcon *tcon, + const char *full_path, + bool directory, + struct kvec *reparse_iov, + struct kvec *xattr_iov); extern int CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid); extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 75142f49d65d..6c890db06593 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -2851,6 +2851,134 @@ error: return rc; } +struct inode *cifs_create_reparse_inode(struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid, + struct cifs_tcon *tcon, + const char *full_path, + bool directory, + struct kvec *reparse_iov, + struct kvec *xattr_iov) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_open_parms oparms; + TRANSACT_IOCTL_REQ *io_req; + struct inode *new = NULL; + struct kvec in_iov[2]; + struct kvec out_iov; + struct cifs_fid fid; + int io_req_len; + int oplock = 0; + int buf_type = 0; + int rc; + + cifs_tcon_dbg(FYI, "%s: path=%s\n", __func__, full_path); + + /* + * If server filesystem does not support reparse points then do not + * attempt to create reparse point. This will prevent creating unusable + * empty object on the server. + */ + if (!(le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) + return ERR_PTR(-EOPNOTSUPP); + +#ifndef CONFIG_CIFS_XATTR + if (xattr_iov) + return ERR_PTR(-EOPNOTSUPP); +#endif + + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, + FILE_READ_ATTRIBUTES | FILE_WRITE_DATA | FILE_WRITE_EA, + FILE_CREATE, + (directory ? 
CREATE_NOT_FILE : CREATE_NOT_DIR) | OPEN_REPARSE_POINT, + ACL_NO_MODE); + oparms.fid = &fid; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (rc) + return ERR_PTR(rc); + +#ifdef CONFIG_CIFS_XATTR + if (xattr_iov) { + struct smb2_file_full_ea_info *ea; + + ea = &((struct smb2_create_ea_ctx *)xattr_iov->iov_base)->ea; + while (1) { + rc = CIFSSMBSetEA(xid, + tcon, + full_path, + &ea->ea_data[0], + &ea->ea_data[ea->ea_name_length+1], + le16_to_cpu(ea->ea_value_length), + cifs_sb->local_nls, + cifs_sb); + if (rc) + goto out_close; + if (le32_to_cpu(ea->next_entry_offset) == 0) + break; + ea = (struct smb2_file_full_ea_info *)((u8 *)ea + + le32_to_cpu(ea->next_entry_offset)); + } + } +#endif + + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **)&io_req, NULL); + if (rc) + goto out_close; + + inc_rfc1001_len(io_req, sizeof(io_req->Pad)); + + io_req_len = be32_to_cpu(io_req->hdr.smb_buf_length) + sizeof(io_req->hdr.smb_buf_length); + + /* NT IOCTL response contains one-word long output setup buffer with size of output data. */ + io_req->MaxSetupCount = 1; + /* NT IOCTL response does not contain output parameters. */ + io_req->MaxParameterCount = cpu_to_le32(0); + /* FSCTL_SET_REPARSE_POINT response contains empty output data. 
*/ + io_req->MaxDataCount = cpu_to_le32(0); + + io_req->TotalParameterCount = cpu_to_le32(0); + io_req->TotalDataCount = cpu_to_le32(reparse_iov->iov_len); + io_req->ParameterCount = io_req->TotalParameterCount; + io_req->ParameterOffset = cpu_to_le32(0); + io_req->DataCount = io_req->TotalDataCount; + io_req->DataOffset = cpu_to_le32(offsetof(typeof(*io_req), Data) - + sizeof(io_req->hdr.smb_buf_length)); + io_req->SetupCount = 4; + io_req->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); + io_req->FunctionCode = cpu_to_le32(FSCTL_SET_REPARSE_POINT); + io_req->Fid = fid.netfid; + io_req->IsFsctl = 1; + io_req->IsRootFlag = 0; + io_req->ByteCount = cpu_to_le16(le32_to_cpu(io_req->DataCount) + sizeof(io_req->Pad)); + + inc_rfc1001_len(io_req, reparse_iov->iov_len); + + in_iov[0].iov_base = (char *)io_req; + in_iov[0].iov_len = io_req_len; + in_iov[1] = *reparse_iov; + rc = SendReceive2(xid, tcon->ses, in_iov, ARRAY_SIZE(in_iov), &buf_type, + CIFS_NO_RSP_BUF, &out_iov); + + cifs_buf_release(io_req); + + if (!rc) + rc = cifs_get_inode_info(&new, full_path, data, sb, xid, NULL); + +out_close: + CIFSSMBClose(xid, tcon, fid.netfid); + + /* + * If CREATE was successful but FSCTL_SET_REPARSE_POINT failed then + * remove the intermediate object created by CREATE. Otherwise + * empty object stay on the server when reparse call failed. + */ + if (rc) + CIFSSMBDelFile(xid, tcon, full_path, cifs_sb, NULL); + + return rc ? 
ERR_PTR(rc) : new; +} + int CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid) @@ -4020,6 +4148,12 @@ findFirstRetry: pSMB->FileName[name_len] = 0; pSMB->FileName[name_len+1] = 0; name_len += 2; + } else if (!searchName[0]) { + pSMB->FileName[0] = CIFS_DIR_SEP(cifs_sb); + pSMB->FileName[1] = 0; + pSMB->FileName[2] = 0; + pSMB->FileName[3] = 0; + name_len = 4; } } else { name_len = copy_path_name(pSMB->FileName, searchName); @@ -4031,6 +4165,10 @@ findFirstRetry: pSMB->FileName[name_len] = '*'; pSMB->FileName[name_len+1] = 0; name_len += 2; + } else if (!searchName[0]) { + pSMB->FileName[0] = CIFS_DIR_SEP(cifs_sb); + pSMB->FileName[1] = 0; + name_len = 2; } } @@ -4057,7 +4195,7 @@ findFirstRetry: pSMB->SearchAttributes = cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | ATTR_DIRECTORY); - pSMB->SearchCount = cpu_to_le16(CIFSMaxBufSize/sizeof(FILE_UNIX_INFO)); + pSMB->SearchCount = cpu_to_le16(msearch ? CIFSMaxBufSize/sizeof(FILE_UNIX_INFO) : 1); pSMB->SearchFlags = cpu_to_le16(search_flags); pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level); diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 205f547ca49e..5eec8957f2a9 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3362,18 +3362,15 @@ generic_ip_connect(struct TCP_Server_Info *server) struct net *net = cifs_net_ns(server); struct sock *sk; - rc = __sock_create(net, sfamily, SOCK_STREAM, - IPPROTO_TCP, &server->ssocket, 1); + rc = sock_create_kern(net, sfamily, SOCK_STREAM, + IPPROTO_TCP, &server->ssocket); if (rc < 0) { cifs_server_dbg(VFS, "Error %d creating socket\n", rc); return rc; } sk = server->ssocket->sk; - __netns_tracker_free(net, &sk->ns_tracker, false); - sk->sk_net_refcnt = 1; - get_net_track(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sk); /* BB other socket options to set KEEPALIVE, NODELAY? 
*/ cifs_dbg(FYI, "Socket created\n"); diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 59ccc2229ab3..3f34bb07997b 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1475,35 +1475,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, pr_warn("username too long\n"); goto cifs_parse_mount_err; } - ctx->username = kstrdup(param->string, GFP_KERNEL); - if (ctx->username == NULL) { - cifs_errorf(fc, "OOM when copying username string\n"); - goto cifs_parse_mount_err; - } + ctx->username = no_free_ptr(param->string); break; case Opt_pass: kfree_sensitive(ctx->password); ctx->password = NULL; if (strlen(param->string) == 0) break; - - ctx->password = kstrdup(param->string, GFP_KERNEL); - if (ctx->password == NULL) { - cifs_errorf(fc, "OOM when copying password string\n"); - goto cifs_parse_mount_err; - } + ctx->password = no_free_ptr(param->string); break; case Opt_pass2: kfree_sensitive(ctx->password2); ctx->password2 = NULL; if (strlen(param->string) == 0) break; - - ctx->password2 = kstrdup(param->string, GFP_KERNEL); - if (ctx->password2 == NULL) { - cifs_errorf(fc, "OOM when copying password2 string\n"); - goto cifs_parse_mount_err; - } + ctx->password2 = no_free_ptr(param->string); break; case Opt_ip: if (strlen(param->string) == 0) { @@ -1526,11 +1512,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } kfree(ctx->domainname); - ctx->domainname = kstrdup(param->string, GFP_KERNEL); - if (ctx->domainname == NULL) { - cifs_errorf(fc, "OOM when copying domainname string\n"); - goto cifs_parse_mount_err; - } + ctx->domainname = no_free_ptr(param->string); cifs_dbg(FYI, "Domain name set\n"); break; case Opt_srcaddr: @@ -1550,11 +1532,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (strncasecmp(param->string, "default", 7) != 0) { kfree(ctx->iocharset); - ctx->iocharset = kstrdup(param->string, GFP_KERNEL); - if (ctx->iocharset == NULL) { - cifs_errorf(fc, "OOM when 
copying iocharset string\n"); - goto cifs_parse_mount_err; - } + ctx->iocharset = no_free_ptr(param->string); } /* if iocharset not set then load_nls_default * is used by caller diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index 769752ad2c5c..2ecd705e9e8c 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -19,6 +19,7 @@ #include "smb2proto.h" #include "cifs_ioctl.h" #include "fs_context.h" +#include "reparse.h" /* * M-F Symlink Functions - Begin @@ -570,7 +571,6 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, int rc = -EOPNOTSUPP; unsigned int xid; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct TCP_Server_Info *server; struct tcon_link *tlink; struct cifs_tcon *pTcon; const char *full_path; @@ -593,7 +593,6 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, goto symlink_exit; } pTcon = tlink_tcon(tlink); - server = cifs_pick_channel(pTcon->ses); full_path = build_path_from_dentry(direntry, page); if (IS_ERR(full_path)) { @@ -643,13 +642,9 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, case CIFS_SYMLINK_TYPE_NATIVE: case CIFS_SYMLINK_TYPE_NFS: case CIFS_SYMLINK_TYPE_WSL: - if (server->ops->create_reparse_symlink && - (le32_to_cpu(pTcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) { - rc = server->ops->create_reparse_symlink(xid, inode, - direntry, - pTcon, - full_path, - symname); + if (le32_to_cpu(pTcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS) { + rc = create_reparse_symlink(xid, inode, direntry, pTcon, + full_path, symname); goto symlink_exit; } break; diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 5fa29a97ac15..33c1d970747c 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -34,7 +34,7 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb, const char *symname, bool *directory); -int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, +int create_reparse_symlink(const unsigned 
int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, const char *symname) { @@ -227,7 +227,8 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, iov.iov_base = buf; iov.iov_len = len; - new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + new = tcon->ses->server->ops->create_reparse_inode( + &data, inode->i_sb, xid, tcon, full_path, directory, &iov, NULL); if (!IS_ERR(new)) @@ -399,7 +400,8 @@ static int create_native_socket(const unsigned int xid, struct inode *inode, struct inode *new; int rc = 0; - new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + new = tcon->ses->server->ops->create_reparse_inode( + &data, inode->i_sb, xid, tcon, full_path, false, &iov, NULL); if (!IS_ERR(new)) d_instantiate(dentry, new); @@ -492,7 +494,8 @@ static int mknod_nfs(unsigned int xid, struct inode *inode, .symlink_target = kstrdup(symname, GFP_KERNEL), }; - new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + new = tcon->ses->server->ops->create_reparse_inode( + &data, inode->i_sb, xid, tcon, full_path, false, &iov, NULL); if (!IS_ERR(new)) d_instantiate(dentry, new); @@ -685,7 +688,8 @@ static int mknod_wsl(unsigned int xid, struct inode *inode, memcpy(data.wsl.eas, &cc->ea, len); data.wsl.eas_len = len; - new = smb2_get_reparse_inode(&data, inode->i_sb, + new = tcon->ses->server->ops->create_reparse_inode( + &data, inode->i_sb, xid, tcon, full_path, false, &reparse_iov, &xattr_iov); if (!IS_ERR(new)) @@ -698,7 +702,7 @@ static int mknod_wsl(unsigned int xid, struct inode *inode, return rc; } -int smb2_mknod_reparse(unsigned int xid, struct inode *inode, +int mknod_reparse(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev) { diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index 08de853b36a8..66269c10beba 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -129,10 +129,10 @@ static 
inline bool cifs_open_data_reparse(struct cifs_open_info_data *data) bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct cifs_open_info_data *data); -int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, +int create_reparse_symlink(const unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, const char *symname); -int smb2_mknod_reparse(unsigned int xid, struct inode *inode, +int mknod_reparse(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov, u32 *len); diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 330bc3d25bad..0a8c2fcc9ded 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -332,6 +332,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) struct cifs_server_iface *old_iface = NULL; struct cifs_server_iface *last_iface = NULL; struct sockaddr_storage ss; + int retry = 0; spin_lock(&ses->chan_lock); chan_index = cifs_ses_get_chan_index(ses, server); @@ -360,6 +361,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) return; } +try_again: last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface, iface_head); iface_min_speed = last_iface->speed; @@ -397,6 +399,13 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) } if (list_entry_is_head(iface, &ses->iface_list, iface_head)) { + list_for_each_entry(iface, &ses->iface_list, iface_head) + iface->weight_fulfilled = 0; + + /* see if it can be satisfied in second attempt */ + if (!retry++) + goto try_again; + iface = NULL; cifs_dbg(FYI, "unable to find a suitable iface\n"); } diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index b27a182629ec..e364b6515af3 100644 --- a/fs/smb/client/smb1ops.c 
+++ b/fs/smb/client/smb1ops.c @@ -16,6 +16,7 @@ #include "fs_context.h" #include "nterr.h" #include "smberr.h" +#include "reparse.h" /* * An NT cancel request header looks just like the original request except: @@ -1263,17 +1264,26 @@ cifs_make_node(unsigned int xid, struct inode *inode, if (rc == 0) d_instantiate(dentry, newinode); return rc; + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + /* + * Check if mounted with mount parm 'sfu' mount parm. + * SFU emulation should work with all servers + * and was used by default in earlier versions of Windows. + */ + return cifs_sfu_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); + } else if (le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS) { + /* + * mknod via reparse points requires server support for + * storing reparse points, which is available since + * Windows 2000, but was not widely used until release + * of Windows Server 2012 by the Windows NFS server. + */ + return mknod_reparse(xid, inode, dentry, tcon, + full_path, mode, dev); + } else { + return -EOPNOTSUPP; } - /* - * Check if mounted with mount parm 'sfu' mount parm. 
- * SFU emulation should work with all servers, but only - * supports block and char device, socket & fifo, - * and was used by default in earlier versions of Windows - */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - return -EPERM; - return cifs_sfu_make_node(xid, inode, dentry, tcon, - full_path, mode, dev); } static bool @@ -1370,6 +1380,7 @@ struct smb_version_operations smb1_operations = { .create_hardlink = CIFSCreateHardLink, .query_symlink = cifs_query_symlink, .get_reparse_point_buffer = cifs_get_reparse_point_buffer, + .create_reparse_inode = cifs_create_reparse_inode, .open = cifs_open_file, .set_fid = cifs_set_fid, .close = cifs_close_file, diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index a11a2a693c51..69d251726c02 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -1058,10 +1058,11 @@ int smb2_query_path_info(const unsigned int xid, * Skip SMB2_OP_GET_REPARSE if symlink already parsed in create * response. */ - if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK) + if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK) { cmds[num_cmds++] = SMB2_OP_GET_REPARSE; - if (!tcon->posix_extensions) - cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA; + if (!tcon->posix_extensions) + cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA; + } oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES | @@ -1320,7 +1321,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path, return rc; } -struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, +struct inode *smb2_create_reparse_inode(struct cifs_open_info_data *data, struct super_block *sb, const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 938a8a7c5d21..1b4a31894f43 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -5262,7 +5262,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, full_path, mode, dev); } else if 
((le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS) || (tcon->posix_extensions)) { - rc = smb2_mknod_reparse(xid, inode, dentry, tcon, + rc = mknod_reparse(xid, inode, dentry, tcon, full_path, mode, dev); } return rc; @@ -5321,7 +5321,7 @@ struct smb_version_operations smb20_operations = { .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, - .create_reparse_symlink = smb2_create_reparse_symlink, + .create_reparse_inode = smb2_create_reparse_inode, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -5424,7 +5424,7 @@ struct smb_version_operations smb21_operations = { .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, - .create_reparse_symlink = smb2_create_reparse_symlink, + .create_reparse_inode = smb2_create_reparse_inode, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -5531,7 +5531,7 @@ struct smb_version_operations smb30_operations = { .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, - .create_reparse_symlink = smb2_create_reparse_symlink, + .create_reparse_inode = smb2_create_reparse_inode, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -5647,7 +5647,7 @@ struct smb_version_operations smb311_operations = { .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, - .create_reparse_symlink = smb2_create_reparse_symlink, + .create_reparse_inode = smb2_create_reparse_inode, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 035aa1624053..6e805ece6a7b 100644 --- 
a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -54,7 +54,7 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); -struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, +struct inode *smb2_create_reparse_inode(struct cifs_open_info_data *data, struct super_block *sb, const unsigned int xid, struct cifs_tcon *tcon, @@ -314,9 +314,6 @@ int smb311_posix_query_path_info(const unsigned int xid, int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); -int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, - struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, const char *symname); int smb2_make_nfs_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 3061043e915c..b69c294e3ef0 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -80,23 +80,22 @@ static int squashfs_bio_read_cached(struct bio *fullbio, struct address_space *cache_mapping, u64 index, int length, u64 read_start, u64 read_end, int page_count) { - struct page *head_to_cache = NULL, *tail_to_cache = NULL; + struct folio *head_to_cache = NULL, *tail_to_cache = NULL; struct block_device *bdev = fullbio->bi_bdev; int start_idx = 0, end_idx = 0; - struct bvec_iter_all iter_all; + struct folio_iter fi; struct bio *bio = NULL; - struct bio_vec *bv; int idx = 0; int err = 0; #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL - struct page **cache_pages = kmalloc_array(page_count, - sizeof(void *), GFP_KERNEL | __GFP_ZERO); + struct folio **cache_folios = kmalloc_array(page_count, + sizeof(*cache_folios), GFP_KERNEL | __GFP_ZERO); #endif - 
bio_for_each_segment_all(bv, fullbio, iter_all) { - struct page *page = bv->bv_page; + bio_for_each_folio_all(fi, fullbio) { + struct folio *folio = fi.folio; - if (page->mapping == cache_mapping) { + if (folio->mapping == cache_mapping) { idx++; continue; } @@ -111,13 +110,13 @@ static int squashfs_bio_read_cached(struct bio *fullbio, * adjacent blocks. */ if (idx == 0 && index != read_start) - head_to_cache = page; + head_to_cache = folio; else if (idx == page_count - 1 && index + length != read_end) - tail_to_cache = page; + tail_to_cache = folio; #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL /* Cache all pages in the BIO for repeated reads */ - else if (cache_pages) - cache_pages[idx] = page; + else if (cache_folios) + cache_folios[idx] = folio; #endif if (!bio || idx != end_idx) { @@ -150,45 +149,45 @@ static int squashfs_bio_read_cached(struct bio *fullbio, return err; if (head_to_cache) { - int ret = add_to_page_cache_lru(head_to_cache, cache_mapping, + int ret = filemap_add_folio(cache_mapping, head_to_cache, read_start >> PAGE_SHIFT, GFP_NOIO); if (!ret) { - SetPageUptodate(head_to_cache); - unlock_page(head_to_cache); + folio_mark_uptodate(head_to_cache); + folio_unlock(head_to_cache); } } if (tail_to_cache) { - int ret = add_to_page_cache_lru(tail_to_cache, cache_mapping, + int ret = filemap_add_folio(cache_mapping, tail_to_cache, (read_end >> PAGE_SHIFT) - 1, GFP_NOIO); if (!ret) { - SetPageUptodate(tail_to_cache); - unlock_page(tail_to_cache); + folio_mark_uptodate(tail_to_cache); + folio_unlock(tail_to_cache); } } #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL - if (!cache_pages) + if (!cache_folios) goto out; for (idx = 0; idx < page_count; idx++) { - if (!cache_pages[idx]) + if (!cache_folios[idx]) continue; - int ret = add_to_page_cache_lru(cache_pages[idx], cache_mapping, + int ret = filemap_add_folio(cache_mapping, cache_folios[idx], (read_start >> PAGE_SHIFT) + idx, GFP_NOIO); if (!ret) { - SetPageUptodate(cache_pages[idx]); - unlock_page(cache_pages[idx]); + 
folio_mark_uptodate(cache_folios[idx]); + folio_unlock(cache_folios[idx]); } } - kfree(cache_pages); + kfree(cache_folios); out: #endif return 0; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 5ca2baa16dc2..ce7d661d5ad8 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -493,10 +493,9 @@ out: return res; } -static int squashfs_readahead_fragment(struct page **page, +static int squashfs_readahead_fragment(struct inode *inode, struct page **page, unsigned int pages, unsigned int expected, loff_t start) { - struct inode *inode = page[0]->mapping->host; struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, squashfs_i(inode)->fragment_block, squashfs_i(inode)->fragment_size); @@ -605,8 +604,8 @@ static void squashfs_readahead(struct readahead_control *ractl) if (start >> msblk->block_log == file_end && squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { - res = squashfs_readahead_fragment(pages, nr_pages, - expected, start); + res = squashfs_readahead_fragment(inode, pages, + nr_pages, expected, start); if (res) goto skip_pages; continue; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 04dae67c0266..e75a6cec67be 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -979,8 +979,7 @@ static int do_writepage(struct folio *folio, size_t len) * on the page lock and it would not write the truncated inode node to the * journal before we have finished. 
*/ -static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, - void *data) +static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc) { struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -1052,7 +1051,12 @@ out_unlock: static int ubifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return write_cache_pages(mapping, wbc, ubifs_writepage, NULL); + struct folio *folio = NULL; + int error; + + while ((folio = writeback_iter(mapping, wbc, folio, &error))) + error = ubifs_writepage(folio, wbc); + return error; } /** diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 22f4bf956ba1..54c6cc7fe9c6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -165,14 +165,14 @@ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) { if (refcount_dec_and_test(&ctx->refcount)) { - VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock)); - VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); - VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); - VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); - VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock)); - VM_BUG_ON(waitqueue_active(&ctx->event_wqh)); - VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); - VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh)); mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } @@ -304,7 +304,7 @@ again: goto out; ret = false; - if (!pmd_present(_pmd) || pmd_devmap(_pmd)) + if 
(!pmd_present(_pmd)) goto out; if (pmd_trans_huge(_pmd)) { @@ -383,12 +383,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (!ctx) goto out; - BUG_ON(ctx->mm != mm); + VM_WARN_ON_ONCE(ctx->mm != mm); /* Any unrecognized flag is a bug. */ - VM_BUG_ON(reason & ~__VM_UFFD_FLAGS); + VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS); /* 0 or > 1 flags set is a bug; we expect exactly 1. */ - VM_BUG_ON(!reason || (reason & (reason - 1))); + VM_WARN_ON_ONCE(!reason || (reason & (reason - 1))); if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; @@ -411,12 +411,11 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * to be sure not to return SIGBUS erroneously on * nowait invocations. */ - BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); + VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { - printk(KERN_WARNING - "FAULT_FLAG_ALLOW_RETRY missing %x\n", - vmf->flags); + pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n", + vmf->flags); dump_stack(); } #endif @@ -602,7 +601,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, */ out: atomic_dec(&ctx->mmap_changing); - VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0); + VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0); userfaultfd_ctx_put(ctx); } @@ -710,7 +709,7 @@ void dup_userfaultfd_fail(struct list_head *fcs) struct userfaultfd_ctx *ctx = fctx->new; atomic_dec(&octx->mmap_changing); - VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0); + VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < 0); userfaultfd_ctx_put(octx); userfaultfd_ctx_put(ctx); @@ -751,11 +750,6 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, if (!ctx) return; - if (to & ~PAGE_MASK) { - userfaultfd_ctx_put(ctx); - return; - } - msg_init(&ewq.msg); ewq.msg.event = UFFD_EVENT_REMAP; @@ -766,6 +760,16 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, userfaultfd_event_wait_completion(ctx, &ewq); } 
+void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx) +{ + struct userfaultfd_ctx *ctx = vm_ctx->ctx; + + if (!ctx) + return; + + userfaultfd_ctx_put(ctx); +} + bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { @@ -1243,7 +1247,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, int ret; struct uffdio_register uffdio_register; struct uffdio_register __user *user_uffdio_register; - unsigned long vm_flags; + vm_flags_t vm_flags; bool found; bool basic_ioctls; unsigned long start, end; @@ -1317,8 +1321,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & __VM_UFFD_FLAGS)); + VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* check not compatible vmas */ ret = -EINVAL; @@ -1372,7 +1376,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, found = true; } for_each_vma_range(vmi, cur, end); - BUG_ON(!found); + VM_WARN_ON_ONCE(!found); ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, wp_async); @@ -1464,8 +1468,16 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & __VM_UFFD_FLAGS)); + VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & __VM_UFFD_FLAGS)); + + /* + * Prevent unregistering through a different userfaultfd than + * the one used for registration. 
+ */ + if (cur->vm_userfaultfd_ctx.ctx && + cur->vm_userfaultfd_ctx.ctx != ctx) + goto out_unlock; /* * Check not compatible vmas, not strictly required @@ -1479,7 +1491,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, found = true; } for_each_vma_range(vmi, cur, end); - BUG_ON(!found); + VM_WARN_ON_ONCE(!found); vma_iter_set(&vmi, start); prev = vma_prev(&vmi); @@ -1490,16 +1502,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, for_each_vma_range(vmi, vma, end) { cond_resched(); - BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async)); - - /* - * Nothing to do: this vma is already registered into this - * userfaultfd and with the right tracking mode too. - */ + /* VMA not registered with userfaultfd. */ if (!vma->vm_userfaultfd_ctx.ctx) goto skip; - WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); + VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx); + VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async)); + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); if (vma->vm_start > start) start = vma->vm_start; @@ -1564,7 +1573,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx, * len == 0 means wake all and we don't want to wake all here, * so check it again to be sure. 
*/ - VM_BUG_ON(!range.len); + VM_WARN_ON_ONCE(!range.len); wake_userfault(ctx, &range); ret = 0; @@ -1621,7 +1630,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, return -EFAULT; if (ret < 0) goto out; - BUG_ON(!ret); + VM_WARN_ON_ONCE(!ret); /* len == 0 would wake all */ range.len = ret; if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { @@ -1676,7 +1685,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (ret < 0) goto out; /* len == 0 would wake all */ - BUG_ON(!ret); + VM_WARN_ON_ONCE(!ret); range.len = ret; if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { range.start = uffdio_zeropage.range.start; @@ -1788,7 +1797,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) goto out; /* len == 0 would wake all */ - BUG_ON(!ret); + VM_WARN_ON_ONCE(!ret); range.len = ret; if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) { range.start = uffdio_continue.range.start; @@ -1845,7 +1854,7 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long goto out; /* len == 0 would wake all */ - BUG_ON(!ret); + VM_WARN_ON_ONCE(!ret); range.len = ret; if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { range.start = uffdio_poison.range.start; @@ -2106,12 +2115,10 @@ static int new_userfaultfd(int flags) struct file *file; int fd; - BUG_ON(!current->mm); + VM_WARN_ON_ONCE(!current->mm); /* Check the UFFD_* constants for consistency. */ BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS); - BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); - BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY)) return -EINVAL; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b04c59d87378..55a304cb3aef 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1732,7 +1732,7 @@ xfs_dax_fault_locked( bool write_fault) { vm_fault_t ret; - pfn_t pfn; + unsigned long pfn; if (!IS_ENABLED(CONFIG_FS_DAX)) { ASSERT(0); |