diff options
Diffstat (limited to 'fs/userfaultfd.c')
| -rw-r--r-- | fs/userfaultfd.c | 147 |
1 files changed, 78 insertions, 69 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 54c6cc7fe9c6..c5ba1f4487bd 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -29,7 +29,7 @@ #include <linux/ioctl.h> #include <linux/security.h> #include <linux/hugetlb.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/miscdevice.h> #include <linux/uio.h> @@ -233,40 +233,48 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, { struct vm_area_struct *vma = vmf->vma; pte_t *ptep, pte; - bool ret = true; assert_fault_locked(vmf); ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); if (!ptep) - goto out; + return true; - ret = false; pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep); /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. PTE markers should be handled the same as none - * ptes here. + * changes under us. + */ + + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (huge_pte_none(pte)) + return true; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (pte_is_uffd_marker(pte)) + return true; + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. */ - if (huge_pte_none_mostly(pte)) - ret = true; if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) - ret = true; -out: - return ret; + return true; + + return false; } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, struct vm_fault *vmf, unsigned long reason) { - return false; /* should never get here */ + /* Should never get here. */ + VM_WARN_ON_ONCE(1); + return false; } #endif /* CONFIG_HUGETLB_PAGE */ /* - * Verify the pagetables are still not ok after having reigstered into + * Verify the pagetables are still not ok after having registered into * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any * userfault that has already been resolved, if userfaultfd_read_iter and * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different @@ -284,53 +292,63 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pmd_t *pmd, _pmd; pte_t *pte; pte_t ptent; - bool ret = true; + bool ret; assert_fault_locked(vmf); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) - goto out; + return true; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) - goto out; + return true; pud = pud_offset(p4d, address); if (!pud_present(*pud)) - goto out; + return true; pmd = pmd_offset(pud, address); again: _pmd = pmdp_get_lockless(pmd); if (pmd_none(_pmd)) - goto out; + return true; - ret = false; + /* + * A race could arise which would result in a softleaf entry such as + * migration entry unexpectedly being present in the PMD, so explicitly + * check for this and bail out if so. + */ if (!pmd_present(_pmd)) - goto out; + return false; - if (pmd_trans_huge(_pmd)) { - if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) - ret = true; - goto out; - } + if (pmd_trans_huge(_pmd)) + return !pmd_write(_pmd) && (reason & VM_UFFD_WP); pte = pte_offset_map(pmd, address); - if (!pte) { - ret = true; + if (!pte) goto again; - } + /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. PTE markers should be handled the same as none - * ptes here. + * changes under us. */ ptent = ptep_get(pte); - if (pte_none_mostly(ptent)) - ret = true; + + ret = true; + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (pte_none(ptent)) + goto out; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (pte_is_uffd_marker(ptent)) + goto out; + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. + */ if (!pte_write(ptent) && (reason & VM_UFFD_WP)) - ret = true; - pte_unmap(pte); + goto out; + ret = false; out: + pte_unmap(pte); return ret; } @@ -490,12 +508,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) set_current_state(blocking_state); spin_unlock_irq(&ctx->fault_pending_wqh.lock); - if (!is_vm_hugetlb_page(vma)) - must_wait = userfaultfd_must_wait(ctx, vmf, reason); - else + if (is_vm_hugetlb_page(vma)) { must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); - if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); + } else { + must_wait = userfaultfd_must_wait(ctx, vmf, reason); + } + release_fault_lock(vmf); if (likely(must_wait && !READ_ONCE(ctx->released))) { @@ -1270,9 +1289,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { -#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP - goto out; -#endif + if (!pgtable_supports_uffd_wp()) + goto out; + vm_flags |= VM_UFFD_WP; } if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { @@ -1980,14 +1999,14 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, uffdio_api.features &= ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); #endif -#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP - uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; -#endif -#ifndef CONFIG_PTE_MARKER_UFFD_WP - uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; - uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; - uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; -#endif + if (!pgtable_supports_uffd_wp()) + uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; + + if (!uffd_supports_wp_marker()) { + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; + uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; + } ret = -EINVAL; if (features & ~uffdio_api.features) @@ -2111,9 +2130,7 @@ static void init_once_userfaultfd_ctx(void *mem) static int new_userfaultfd(int flags) { - struct userfaultfd_ctx *ctx; - struct file *file; - int fd; + struct userfaultfd_ctx *ctx __free(kfree) = NULL; VM_WARN_ON_ONCE(!current->mm); @@ -2135,26 +2152,18 @@ static int new_userfaultfd(int flags) atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; - fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); - if (fd < 0) - goto err_out; + FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS, + anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), + NULL)); + if (fdf.err) + return fdf.err; - /* Create a new inode so that the LSM can block the creation. */ - file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); - if (IS_ERR(file)) { - put_unused_fd(fd); - fd = PTR_ERR(file); - goto err_out; - } /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - file->f_mode |= FMODE_NOWAIT; - fd_install(fd, file); - return fd; -err_out: - kmem_cache_free(userfaultfd_ctx_cachep, ctx); - return fd; + fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT; + retain_and_null_ptr(ctx); + return fd_publish(fdf); } static inline bool userfaultfd_syscall_allowed(int flags) |
