summaryrefslogtreecommitdiff
path: root/fs/userfaultfd.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/userfaultfd.c')
-rw-r--r--fs/userfaultfd.c147
1 files changed, 78 insertions, 69 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 54c6cc7fe9c6..c5ba1f4487bd 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -29,7 +29,7 @@
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/miscdevice.h>
#include <linux/uio.h>
@@ -233,40 +233,48 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
{
struct vm_area_struct *vma = vmf->vma;
pte_t *ptep, pte;
- bool ret = true;
assert_fault_locked(vmf);
ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
if (!ptep)
- goto out;
+ return true;
- ret = false;
pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep);
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us. PTE markers should be handled the same as none
- * ptes here.
+ * changes under us.
+ */
+
+ /* Entry is still missing, wait for userspace to resolve the fault. */
+ if (huge_pte_none(pte))
+ return true;
+ /* UFFD PTE markers require userspace to resolve the fault. */
+ if (pte_is_uffd_marker(pte))
+ return true;
+ /*
+ * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
+ * resolve the fault.
*/
- if (huge_pte_none_mostly(pte))
- ret = true;
if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
- ret = true;
-out:
- return ret;
+ return true;
+
+ return false;
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
struct vm_fault *vmf,
unsigned long reason)
{
- return false; /* should never get here */
+ /* Should never get here. */
+ VM_WARN_ON_ONCE(1);
+ return false;
}
#endif /* CONFIG_HUGETLB_PAGE */
/*
- * Verify the pagetables are still not ok after having reigstered into
+ * Verify the pagetables are still not ok after having registered into
* the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
* userfault that has already been resolved, if userfaultfd_read_iter and
* UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
@@ -284,53 +292,63 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
pmd_t *pmd, _pmd;
pte_t *pte;
pte_t ptent;
- bool ret = true;
+ bool ret;
assert_fault_locked(vmf);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
- goto out;
+ return true;
p4d = p4d_offset(pgd, address);
if (!p4d_present(*p4d))
- goto out;
+ return true;
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
- goto out;
+ return true;
pmd = pmd_offset(pud, address);
again:
_pmd = pmdp_get_lockless(pmd);
if (pmd_none(_pmd))
- goto out;
+ return true;
- ret = false;
+ /*
+ * A race could arise which would result in a softleaf entry such as
+ * migration entry unexpectedly being present in the PMD, so explicitly
+ * check for this and bail out if so.
+ */
if (!pmd_present(_pmd))
- goto out;
+ return false;
- if (pmd_trans_huge(_pmd)) {
- if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
- ret = true;
- goto out;
- }
+ if (pmd_trans_huge(_pmd))
+ return !pmd_write(_pmd) && (reason & VM_UFFD_WP);
pte = pte_offset_map(pmd, address);
- if (!pte) {
- ret = true;
+ if (!pte)
goto again;
- }
+
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us. PTE markers should be handled the same as none
- * ptes here.
+ * changes under us.
*/
ptent = ptep_get(pte);
- if (pte_none_mostly(ptent))
- ret = true;
+
+ ret = true;
+ /* Entry is still missing, wait for userspace to resolve the fault. */
+ if (pte_none(ptent))
+ goto out;
+ /* UFFD PTE markers require userspace to resolve the fault. */
+ if (pte_is_uffd_marker(ptent))
+ goto out;
+ /*
+ * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
+ * resolve the fault.
+ */
if (!pte_write(ptent) && (reason & VM_UFFD_WP))
- ret = true;
- pte_unmap(pte);
+ goto out;
+ ret = false;
out:
+ pte_unmap(pte);
return ret;
}
@@ -490,12 +508,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
set_current_state(blocking_state);
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
- if (!is_vm_hugetlb_page(vma))
- must_wait = userfaultfd_must_wait(ctx, vmf, reason);
- else
+ if (is_vm_hugetlb_page(vma)) {
must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
- if (is_vm_hugetlb_page(vma))
hugetlb_vma_unlock_read(vma);
+ } else {
+ must_wait = userfaultfd_must_wait(ctx, vmf, reason);
+ }
+
release_fault_lock(vmf);
if (likely(must_wait && !READ_ONCE(ctx->released))) {
@@ -1270,9 +1289,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
vm_flags |= VM_UFFD_MISSING;
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
-#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
- goto out;
-#endif
+ if (!pgtable_supports_uffd_wp())
+ goto out;
+
vm_flags |= VM_UFFD_WP;
}
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
@@ -1980,14 +1999,14 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
uffdio_api.features &=
~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
-#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
- uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
-#endif
-#ifndef CONFIG_PTE_MARKER_UFFD_WP
- uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
- uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
- uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
-#endif
+ if (!pgtable_supports_uffd_wp())
+ uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
+
+ if (!uffd_supports_wp_marker()) {
+ uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
+ }
ret = -EINVAL;
if (features & ~uffdio_api.features)
@@ -2111,9 +2130,7 @@ static void init_once_userfaultfd_ctx(void *mem)
static int new_userfaultfd(int flags)
{
- struct userfaultfd_ctx *ctx;
- struct file *file;
- int fd;
+ struct userfaultfd_ctx *ctx __free(kfree) = NULL;
VM_WARN_ON_ONCE(!current->mm);
@@ -2135,26 +2152,18 @@ static int new_userfaultfd(int flags)
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
- fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
- if (fd < 0)
- goto err_out;
+ FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS,
+ anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
+ O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS),
+ NULL));
+ if (fdf.err)
+ return fdf.err;
- /* Create a new inode so that the LSM can block the creation. */
- file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
- O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- fd = PTR_ERR(file);
- goto err_out;
- }
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
- file->f_mode |= FMODE_NOWAIT;
- fd_install(fd, file);
- return fd;
-err_out:
- kmem_cache_free(userfaultfd_ctx_cachep, ctx);
- return fd;
+ fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT;
+ retain_and_null_ptr(ctx);
+ return fd_publish(fdf);
}
static inline bool userfaultfd_syscall_allowed(int flags)