From 67c6bb56b649590a3f59c2a92331aa4e83d4534c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 6 Jan 2021 10:34:49 +0000 Subject: firmware: smccc: Add SMCCC TRNG function call IDs The ARM architected TRNG firmware interface, described in ARM spec DEN0098, define an ARM SMCCC based interface to a true random number generator, provided by firmware. Add the definitions of the SMCCC functions as defined by the spec. Signed-off-by: Ard Biesheuvel Signed-off-by: Andre Przywara Reviewed-by: Linus Walleij Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20210106103453.152275-2-andre.przywara@arm.com Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index f860645f6512..62c54234576c 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -102,6 +102,37 @@ ARM_SMCCC_OWNER_STANDARD_HYP, \ 0x21) +/* TRNG entropy source calls (defined by ARM DEN0098) */ +#define ARM_SMCCC_TRNG_VERSION \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_STANDARD, \ + 0x50) + +#define ARM_SMCCC_TRNG_FEATURES \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_STANDARD, \ + 0x51) + +#define ARM_SMCCC_TRNG_GET_UUID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_STANDARD, \ + 0x52) + +#define ARM_SMCCC_TRNG_RND32 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_STANDARD, \ + 0x53) + +#define ARM_SMCCC_TRNG_RND64 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD, \ + 0x53) + /* * Return codes defined in ARM DEN 0070A * ARM DEN 0070A is now merged/consolidated into ARM DEN 0028 C -- cgit v1.2.3 From f9ce0be71d1fbb038ada15ced83474b0e63f264d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Sat, 19 Dec 2020 15:19:23 +0300 Subject: mm: Cleanup faultaround and finish_fault() codepaths alloc_set_pte() has two users with different requirements: in the faultaround code, it called from an atomic context and PTE page table has to be preallocated. finish_fault() can sleep and allocate page table as needed. PTL locking rules are also strange, hard to follow and overkill for finish_fault(). Let's untangle the mess. alloc_set_pte() has gone now. All locking is explicit. The price is some code duplication to handle huge pages in faultaround path, but it should be fine, having overall improvement in readability. Link: https://lore.kernel.org/r/20201229132819.najtavneutnf7ajp@box Signed-off-by: Kirill A. 
Shutemov [will: s/from from/from/ in comment; spotted by willy] Signed-off-by: Will Deacon --- fs/xfs/xfs_file.c | 6 +- include/linux/mm.h | 12 +-- include/linux/pgtable.h | 11 +++ mm/filemap.c | 177 +++++++++++++++++++++++++++++++----------- mm/memory.c | 199 ++++++++++++++---------------------------------- 5 files changed, 213 insertions(+), 192 deletions(-) (limited to 'include/linux') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5b0f93f73837..111fe73bb8a7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1319,17 +1319,19 @@ xfs_filemap_pfn_mkwrite( return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } -static void +static vm_fault_t xfs_filemap_map_pages( struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct inode *inode = file_inode(vmf->vma->vm_file); + vm_fault_t ret; xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - filemap_map_pages(vmf, start_pgoff, end_pgoff); + ret = filemap_map_pages(vmf, start_pgoff, end_pgoff); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + return ret; } static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/include/linux/mm.h b/include/linux/mm.h index ecdf8a8cd6ae..4572a9bc5862 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -542,8 +542,8 @@ struct vm_fault { * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. - * vm_ops->map_pages() calls - * alloc_set_pte() from atomic context. + * vm_ops->map_pages() sets up a page + * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. @@ -578,7 +578,7 @@ struct vm_operations_struct { vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); - void (*map_pages)(struct vm_fault *vmf, + vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); @@ -988,7 +988,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page); +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); +void do_set_pte(struct vm_fault *vmf, struct page *page); + vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif @@ -2622,7 +2624,7 @@ extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); -extern void filemap_map_pages(struct vm_fault *vmf, +extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 8fcdfa52eb4b..36eb748f3c97 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1314,6 +1314,17 @@ static inline int pmd_trans_unstable(pmd_t *pmd) #endif } +/* + * the ordering of these checks is important for pmds with _page_devmap set. + * if we check pmd_trans_unstable() first we will trip the bad_pmd() check + * inside of pmd_none_or_trans_huge_or_clear_bad(). this will end up correctly + * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. 
+ */ +static inline int pmd_devmap_trans_unstable(pmd_t *pmd) +{ + return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); +} + #ifndef CONFIG_NUMA_BALANCING /* * Technically a PTE can be PROTNONE even when not doing NUMA balancing but diff --git a/mm/filemap.c b/mm/filemap.c index 5c9d564317a5..c1f2dc89b8a7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "internal.h" #define CREATE_TRACE_POINTS @@ -2911,74 +2912,164 @@ out_retry: } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct vm_fault *vmf, - pgoff_t start_pgoff, pgoff_t end_pgoff) +static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) { - struct file *file = vmf->vma->vm_file; + struct mm_struct *mm = vmf->vma->vm_mm; + + /* Huge page is mapped? No need to proceed. */ + if (pmd_trans_huge(*vmf->pmd)) { + unlock_page(page); + put_page(page); + return true; + } + + if (pmd_none(*vmf->pmd) && PageTransHuge(page)) { + vm_fault_t ret = do_set_pmd(vmf, page); + if (!ret) { + /* The page is mapped successfully, reference consumed. */ + unlock_page(page); + return true; + } + } + + if (pmd_none(*vmf->pmd)) { + vmf->ptl = pmd_lock(mm, vmf->pmd); + if (likely(pmd_none(*vmf->pmd))) { + mm_inc_nr_ptes(mm); + pmd_populate(mm, vmf->pmd, vmf->prealloc_pte); + vmf->prealloc_pte = NULL; + } + spin_unlock(vmf->ptl); + } + + /* See comment in handle_pte_fault() */ + if (pmd_devmap_trans_unstable(vmf->pmd)) { + unlock_page(page); + put_page(page); + return true; + } + + return false; +} + +static struct page *next_uptodate_page(struct page *page, + struct address_space *mapping, + struct xa_state *xas, pgoff_t end_pgoff) +{ + unsigned long max_idx; + + do { + if (!page) + return NULL; + if (xas_retry(xas, page)) + continue; + if (xa_is_value(page)) + continue; + if (PageLocked(page)) + continue; + if (!page_cache_get_speculative(page)) + continue; + /* Has the page moved or been split? 
*/ + if (unlikely(page != xas_reload(xas))) + goto skip; + if (!PageUptodate(page) || PageReadahead(page)) + goto skip; + if (PageHWPoison(page)) + goto skip; + if (!trylock_page(page)) + goto skip; + if (page->mapping != mapping) + goto unlock; + if (!PageUptodate(page)) + goto unlock; + max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (xas->xa_index >= max_idx) + goto unlock; + return page; +unlock: + unlock_page(page); +skip: + put_page(page); + } while ((page = xas_next_entry(xas, end_pgoff)) != NULL); + + return NULL; +} + +static inline struct page *first_map_page(struct address_space *mapping, + struct xa_state *xas, + pgoff_t end_pgoff) +{ + return next_uptodate_page(xas_find(xas, end_pgoff), + mapping, xas, end_pgoff); +} + +static inline struct page *next_map_page(struct address_space *mapping, + struct xa_state *xas, + pgoff_t end_pgoff) +{ + return next_uptodate_page(xas_next_entry(xas, end_pgoff), + mapping, xas, end_pgoff); +} + +vm_fault_t filemap_map_pages(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff) +{ + struct vm_area_struct *vma = vmf->vma; + struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; - unsigned long max_idx; + unsigned long address = vmf->address; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); + vm_fault_t ret = 0; rcu_read_lock(); - xas_for_each(&xas, head, end_pgoff) { - if (xas_retry(&xas, head)) - continue; - if (xa_is_value(head)) - goto next; + head = first_map_page(mapping, &xas, end_pgoff); + if (!head) + goto out; - /* - * Check for a locked page first, as a speculative - * reference may adversely influence page migration. - */ - if (PageLocked(head)) - goto next; - if (!page_cache_get_speculative(head)) - goto next; + if (filemap_map_pmd(vmf, head)) { + ret = VM_FAULT_NOPAGE; + goto out; + } - /* Has the page moved or been split? */ - if (unlikely(head != xas_reload(&xas))) - goto skip; + vmf->address = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + do { page = find_subpage(head, xas.xa_index); - - if (!PageUptodate(head) || - PageReadahead(page) || - PageHWPoison(page)) - goto skip; - if (!trylock_page(head)) - goto skip; - - if (head->mapping != mapping || !PageUptodate(head)) - goto unlock; - - max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); - if (xas.xa_index >= max_idx) + if (PageHWPoison(page)) goto unlock; if (mmap_miss > 0) mmap_miss--; vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; - if (vmf->pte) - vmf->pte += xas.xa_index - last_pgoff; + vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; - if (alloc_set_pte(vmf, page)) + + if (!pte_none(*vmf->pte)) goto unlock; + + do_set_pte(vmf, page); + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, vmf->address, vmf->pte); unlock_page(head); - goto next; + + /* The fault is handled */ + if (vmf->address == address) + ret = VM_FAULT_NOPAGE; + continue; unlock: unlock_page(head); -skip: put_page(head); -next: - /* Huge page is mapped? No need to proceed. 
*/ - if (pmd_trans_huge(*vmf->pmd)) - break; - } + } while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL); + pte_unmap_unlock(vmf->pte, vmf->ptl); +out: rcu_read_unlock(); + vmf->address = address; WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); + return ret; } EXPORT_SYMBOL(filemap_map_pages); diff --git a/mm/memory.c b/mm/memory.c index feff48e1465a..3e2fc2950ad7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3503,7 +3503,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; - /* See the comment in pte_alloc_one_map() */ + /* See comment in handle_pte_fault() */ if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; @@ -3643,66 +3643,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) return ret; } -/* - * The ordering of these checks is important for pmds with _PAGE_DEVMAP set. - * If we check pmd_trans_unstable() first we will trip the bad_pmd() check - * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly - * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. - */ -static int pmd_devmap_trans_unstable(pmd_t *pmd) -{ - return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); -} - -static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - - if (!pmd_none(*vmf->pmd)) - goto map_pte; - if (vmf->prealloc_pte) { - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_none(*vmf->pmd))) { - spin_unlock(vmf->ptl); - goto map_pte; - } - - mm_inc_nr_ptes(vma->vm_mm); - pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); - spin_unlock(vmf->ptl); - vmf->prealloc_pte = NULL; - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { - return VM_FAULT_OOM; - } -map_pte: - /* - * If a huge pmd materialized under us just retry later. Use - * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of - * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge - * under us and then back to pmd_none, as a result of MADV_DONTNEED - * running immediately after a huge pmd fault in a different thread of - * this mm, in turn leading to a misleading pmd_trans_huge() retval. - * All we have to ensure is that it is a regular pmd that we can walk - * with pte_offset_map() and we can do that through an atomic read in - * C, which is what pmd_trans_unstable() provides. - */ - if (pmd_devmap_trans_unstable(vmf->pmd)) - return VM_FAULT_NOPAGE; - - /* - * At this point we know that our vmf->pmd points to a page of ptes - * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge() - * for the duration of the fault. If a racing MADV_DONTNEED runs and - * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still - * be valid and we will re-check to make sure the vmf->pte isn't - * pte_none() under vmf->ptl protection when we return to - * alloc_set_pte(). 
- */ - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, - &vmf->ptl); - return 0; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void deposit_prealloc_pte(struct vm_fault *vmf) { @@ -3717,7 +3657,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -3775,52 +3715,17 @@ out: return ret; } #else -static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { - BUILD_BUG(); - return 0; + return VM_FAULT_FALLBACK; } #endif -/** - * alloc_set_pte - setup new PTE entry for given page and add reverse page - * mapping. If needed, the function allocates page table or use pre-allocated. - * - * @vmf: fault environment - * @page: page to map - * - * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on - * return. - * - * Target users are page handler itself and implementations of - * vm_ops->map_pages. - * - * Return: %0 on success, %VM_FAULT_ code in case of error. - */ -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) +void do_set_pte(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; - vm_fault_t ret; - - if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { - ret = do_set_pmd(vmf, page); - if (ret != VM_FAULT_FALLBACK) - return ret; - } - - if (!vmf->pte) { - ret = pte_alloc_one_map(vmf); - if (ret) - return ret; - } - - /* Re-check under ptl */ - if (unlikely(!pte_none(*vmf->pte))) { - update_mmu_tlb(vma, vmf->address, vmf->pte); - return VM_FAULT_NOPAGE; - } flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); @@ -3837,14 +3742,8 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) page_add_file_rmap(page, false); } set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, vmf->address, vmf->pte); - - return 0; } - /** * finish_fault - finish page fault once we have prepared the page to fault * @@ -3862,12 +3761,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) */ vm_fault_t finish_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page; - vm_fault_t ret = 0; + vm_fault_t ret; /* Did we COW the page? 
*/ - if ((vmf->flags & FAULT_FLAG_WRITE) && - !(vmf->vma->vm_flags & VM_SHARED)) + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) page = vmf->cow_page; else page = vmf->page; @@ -3876,12 +3775,38 @@ vm_fault_t finish_fault(struct vm_fault *vmf) * check even for read faults because we might have lost our CoWed * page */ - if (!(vmf->vma->vm_flags & VM_SHARED)) - ret = check_stable_address_space(vmf->vma->vm_mm); - if (!ret) - ret = alloc_set_pte(vmf, page); - if (vmf->pte) - pte_unmap_unlock(vmf->pte, vmf->ptl); + if (!(vma->vm_flags & VM_SHARED)) { + ret = check_stable_address_space(vma->vm_mm); + if (ret) + return ret; + } + + if (pmd_none(*vmf->pmd)) { + if (PageTransCompound(page)) { + ret = do_set_pmd(vmf, page); + if (ret != VM_FAULT_FALLBACK) + return ret; + } + + if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) + return VM_FAULT_OOM; + } + + /* See comment in handle_pte_fault() */ + if (pmd_devmap_trans_unstable(vmf->pmd)) + return 0; + + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + ret = 0; + /* Re-check under ptl */ + if (likely(pte_none(*vmf->pte))) + do_set_pte(vmf, page); + else + ret = VM_FAULT_NOPAGE; + + update_mmu_tlb(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } @@ -3951,13 +3876,12 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; int off; - vm_fault_t ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; - vmf->address = max(address & mask, vmf->vma->vm_start); - off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + address = max(address & mask, vmf->vma->vm_start); + off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); start_pgoff -= off; /* @@ -3965,7 +3889,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) * the vma or nr_pages from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - - ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); @@ -3973,31 +3897,11 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) if (pmd_none(*vmf->pmd)) { vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) - goto out; + return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } - vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); - - /* Huge page is mapped? Page fault is solved */ - if (pmd_trans_huge(*vmf->pmd)) { - ret = VM_FAULT_NOPAGE; - goto out; - } - - /* ->map_pages() haven't done anything useful. Cold page cache? */ - if (!vmf->pte) - goto out; - - /* check if the page fault is solved */ - vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); - if (!pte_none(*vmf->pte)) - ret = VM_FAULT_NOPAGE; - pte_unmap_unlock(vmf->pte, vmf->ptl); -out: - vmf->address = address; - vmf->pte = NULL; - return ret; + return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); } static vm_fault_t do_read_fault(struct vm_fault *vmf) @@ -4353,7 +4257,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) */ vmf->pte = NULL; } else { - /* See comment in pte_alloc_one_map() */ + /* + * If a huge pmd materialized under us just retry later. 
Use + * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead + * of pmd_trans_huge() to ensure the pmd didn't become + * pmd_trans_huge under us and then back to pmd_none, as a + * result of MADV_DONTNEED running immediately after a huge pmd + * fault in a different thread of this mm, in turn leading to a + * misleading pmd_trans_huge() retval. All we have to ensure is + * that it is a regular pmd that we can walk with + * pte_offset_map() and we can do that through an atomic read + * in C, which is what pmd_trans_unstable() provides. + */ if (pmd_devmap_trans_unstable(vmf->pmd)) return 0; /* -- cgit v1.2.3 From 46bdb4277f98e70d0c91f4289897ade533fe9e80 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Nov 2020 18:48:26 +0000 Subject: mm: Allow architectures to request 'old' entries when prefaulting Commit 5c0a85fad949 ("mm: make faultaround produce old ptes") changed the "faultaround" behaviour to initialise prefaulted PTEs as 'old', since this avoids vmscan wrongly assuming that they are hot, despite having never been explicitly accessed by userspace. The change has been shown to benefit numerous arm64 micro-architectures (with hardware access flag) running Android, where both application launch latency and direct reclaim time are significantly reduced (by 10%+ and ~80% respectively). Unfortunately, commit 315d09bf30c2 ("Revert "mm: make faultaround produce old ptes"") reverted the change due to it being identified as the cause of a ~6% regression in unixbench on x86. Experiments on a variety of recent arm64 micro-architectures indicate that unixbench is not affected by the original commit, which appears to yield a 0-1% performance improvement. Since one size does not fit all for the initial state of prefaulted PTEs, introduce arch_wants_old_prefaulted_pte(), which allows an architecture to opt-in to 'old' prefaulted PTEs at runtime based on whatever criteria it may have. Cc: Jan Kara Cc: Minchan Kim Cc: Andrew Morton Cc: Kirill A. Shutemov Cc: Linus Torvalds Reported-by: Vinayak Menon Signed-off-by: Will Deacon --- include/linux/mm.h | 5 ++++- mm/filemap.c | 14 ++++++++++---- mm/memory.c | 20 +++++++++++++++++++- 3 files changed, 33 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4572a9bc5862..251a2339befb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -434,6 +434,7 @@ extern pgprot_t protection_map[16]; * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * @FAULT_FLAG_PREFAULT: Fault was a prefault. 
* * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two @@ -464,6 +465,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 #define FAULT_FLAG_INSTRUCTION 0x100 #define FAULT_FLAG_INTERRUPTIBLE 0x200 +#define FAULT_FLAG_PREFAULT 0x400 /* * The default fault flags that should be used by most of the @@ -501,7 +503,8 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags) { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ - { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } + { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ + { FAULT_FLAG_PREFAULT, "PREFAULT" } /* * vm_fault is filled by the pagefault handler and passed to the vma's diff --git a/mm/filemap.c b/mm/filemap.c index c1f2dc89b8a7..a6dc97906c8e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3019,6 +3019,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; unsigned long address = vmf->address; + unsigned long flags = vmf->flags; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); @@ -3051,14 +3052,18 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, if (!pte_none(*vmf->pte)) goto unlock; + /* We're about to handle the fault */ + if (vmf->address == address) { + vmf->flags &= ~FAULT_FLAG_PREFAULT; + ret = VM_FAULT_NOPAGE; + } else { + vmf->flags |= FAULT_FLAG_PREFAULT; + } + do_set_pte(vmf, page); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, vmf->address, vmf->pte); unlock_page(head); - - /* The fault is handled */ - if (vmf->address == address) - ret = VM_FAULT_NOPAGE; continue; unlock: unlock_page(head); @@ -3067,6 +3072,7 @@ unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); + vmf->flags = flags; vmf->address = address; WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); return ret; diff --git a/mm/memory.c b/mm/memory.c index 3e2fc2950ad7..f0e7c589ca9d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -134,6 +134,18 @@ static inline bool arch_faults_on_old_pte(void) } #endif +#ifndef arch_wants_old_prefaulted_pte +static inline bool arch_wants_old_prefaulted_pte(void) +{ + /* + * Transitioning a PTE from 'old' to 'young' can be expensive on + * some architectures, even if it's performed in hardware. By + * default, "false" means prefaulted entries will be 'young'. 
+ */ + return false; +} +#endif + static int __init disable_randmaps(char *s) { randomize_va_space = 0; @@ -3725,11 +3737,17 @@ void do_set_pte(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool prefault = vmf->flags & FAULT_FLAG_PREFAULT; pte_t entry; flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); + + if (prefault && arch_wants_old_prefaulted_pte()) + entry = pte_mkold(entry); + else + entry = pte_sw_mkyoung(entry); + if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* copy-on-write page */ -- cgit v1.2.3 From 742d33729a0df11c9d8d4625dbf21dd20cdefd44 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 20 Jan 2021 14:34:23 +0000 Subject: mm: Move immutable fields of 'struct vm_fault' into anonymous struct 'struct vm_fault' contains both information about the fault being serviced alongside mutable fields contributing to the state of the fault-handling logic. Unfortunately, the distinction between the two is not clear-cut, and a number of callers end up manipulating the structure temporarily before restoring it when returning. Try to clean this up by moving the immutable fault information into an anonymous struct, which will later be marked as 'const'. Ideally, the 'flags' field would be part of the new structure too, but it seems as though the ->page_mkwrite() path is not ready for this yet. Cc: Kirill A. Shutemov Suggested-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=whYs9XsO88iqJzN6NC=D-dp2m0oYXuOoZ=eWnvv=5OA+w@mail.gmail.com Signed-off-by: Will Deacon --- include/linux/mm.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 251a2339befb..b4a5cb9bff7d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -517,11 +517,14 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags) * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { - struct vm_area_struct *vma; /* Target VMA */ - unsigned int flags; /* FAULT_FLAG_xxx flags */ - gfp_t gfp_mask; /* gfp mask to be used for allocations */ - pgoff_t pgoff; /* Logical page offset based on vma */ - unsigned long address; /* Faulting virtual address */ + struct { + struct vm_area_struct *vma; /* Target VMA */ + gfp_t gfp_mask; /* gfp mask to be used for allocations */ + pgoff_t pgoff; /* Logical page offset based on vma */ + unsigned long address; /* Faulting virtual address */ + }; + unsigned int flags; /* FAULT_FLAG_xxx flags + * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching -- cgit v1.2.3 From 9d3af4b448a119ac81378d3bc775f1c4a2a7ff36 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 14 Jan 2021 15:24:19 +0000 Subject: mm: Pass 'address' to map to do_set_pte() and drop FAULT_FLAG_PREFAULT Rather than modifying the 'address' field of the 'struct vm_fault' passed to do_set_pte(), leave that to identify the real faulting address and pass in the virtual address to be mapped by the new pte as a separate argument. This makes FAULT_FLAG_PREFAULT redundant, as a prefault entry can be identified simply by comparing the new address parameter with the faulting address, so remove the redundant flag at the same time. Cc: Kirill A. 
Shutemov Cc: Linus Torvalds Signed-off-by: Will Deacon --- include/linux/mm.h | 7 ++----- mm/filemap.c | 21 +++++++-------------- mm/memory.c | 10 +++++----- 3 files changed, 14 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b4a5cb9bff7d..e0f056753bef 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -434,7 +434,6 @@ extern pgprot_t protection_map[16]; * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * @FAULT_FLAG_PREFAULT: Fault was a prefault. * * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two @@ -465,7 +464,6 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 #define FAULT_FLAG_INSTRUCTION 0x100 #define FAULT_FLAG_INTERRUPTIBLE 0x200 -#define FAULT_FLAG_PREFAULT 0x400 /* * The default fault flags that should be used by most of the @@ -503,8 +501,7 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags) { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ - { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ - { FAULT_FLAG_PREFAULT, "PREFAULT" } + { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } /* * vm_fault is filled by the pagefault handler and passed to the vma's @@ -995,7 +992,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); -void do_set_pte(struct vm_fault *vmf, struct page *page); +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); diff --git a/mm/filemap.c b/mm/filemap.c index a6dc97906c8e..fb7a8d9b5603 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3018,8 +3018,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; - unsigned long address = vmf->address; - unsigned long flags = vmf->flags; + unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); @@ -3035,8 +3034,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, goto out; } - vmf->address = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); do { page = find_subpage(head, xas.xa_index); if (PageHWPoison(page)) @@ -3045,7 +3044,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, if (mmap_miss > 0) mmap_miss--; - vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; + addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; @@ -3053,16 +3052,12 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, goto unlock; /* We're about to handle the fault */ - if (vmf->address == address) { - vmf->flags &= ~FAULT_FLAG_PREFAULT; + if (vmf->address == addr) ret = VM_FAULT_NOPAGE; - } else { - vmf->flags |= FAULT_FLAG_PREFAULT; - } - do_set_pte(vmf, page); + do_set_pte(vmf, page, addr); /* no 
need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_cache(vma, addr, vmf->pte); unlock_page(head); continue; unlock: @@ -3072,8 +3067,6 @@ unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); - vmf->flags = flags; - vmf->address = address; WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); return ret; } diff --git a/mm/memory.c b/mm/memory.c index f0e7c589ca9d..7b1307873325 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3733,11 +3733,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) } #endif -void do_set_pte(struct vm_fault *vmf, struct page *page) +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; - bool prefault = vmf->flags & FAULT_FLAG_PREFAULT; + bool prefault = vmf->address != addr; pte_t entry; flush_icache_page(vma, page); @@ -3753,13 +3753,13 @@ void do_set_pte(struct vm_fault *vmf, struct page *page) /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address, false); + page_add_new_anon_rmap(page, vma, addr, false); lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); + set_pte_at(vma->vm_mm, addr, vmf->pte, entry); } /** @@ -3819,7 +3819,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) ret = 0; /* Re-check under ptl */ if (likely(pte_none(*vmf->pte))) - do_set_pte(vmf, page); + do_set_pte(vmf, page, vmf->address); else ret = VM_FAULT_NOPAGE; -- cgit v1.2.3 From 5857c9209ce58f8e262889539ccdf63e73ad7a93 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 14 Jan 2021 15:44:09 +0000 Subject: mm: Mark anonymous struct field of 'struct vm_fault' as 'const' The fields of this struct are only ever read after being initialised, so mark it 'const' before somebody tries to modify it again. GCC will then complain (with an error) about modification of these fields after they have been initialised, although LLVM currently allows them without even a warning: https://bugs.llvm.org/show_bug.cgi?id=48755 Hopefully, future versions of LLVM will emit a warning. Cc: Kirill A. Shutemov Cc: Linus Torvalds Signed-off-by: Will Deacon --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e0f056753bef..7ff3d9817d38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -514,7 +514,7 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags) * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { - struct { + const struct { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ -- cgit v1.2.3
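
The SMCCC TRNG function IDs added in the first patch only name the interface; a consumer still has to probe and invoke it through the SMCCC conduit. Below is a minimal sketch of such a caller, assuming the arm_smccc_1_1_invoke() helper and struct arm_smccc_res from the same header; the function name, the fixed 192-bit request and the error handling are illustrative only and are not part of this series.

    #include <linux/arm-smccc.h>
    #include <linux/errno.h>
    #include <linux/types.h>

    /* Illustrative sketch, not from the series: read 192 bits via TRNG_RND64 */
    static int example_trng_read_192(u64 ent[3])
    {
    	struct arm_smccc_res res;

    	/* Probe: TRNG_VERSION returns a negative value when unimplemented */
    	arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_VERSION, &res);
    	if ((long)res.a0 < 0)
    		return -EOPNOTSUPP;

    	/* Request 192 bits of entropy; the status comes back in a0 */
    	arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 192, &res);
    	if ((long)res.a0 < 0)
    		return -EIO;	/* a real driver would retry on "no entropy" */

    	/* Entropy is returned in the remaining result registers */
    	ent[0] = res.a1;
    	ent[1] = res.a2;
    	ent[2] = res.a3;
    	return 0;
    }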
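
The generic arch_wants_old_prefaulted_pte() introduced above defaults to false; an architecture with a hardware-managed access flag is expected to override it so that prefaulted entries start out 'old'. The one-liner below is an assumed sketch of such an override; the cpu_has_hw_af() predicate is an arm64-style helper name used purely for illustration and is not defined by this series.

    /* e.g. in the architecture's asm/pgtable.h -- illustrative only */
    /* Hardware sets the access flag, so an old->young transition is cheap */
    #define arch_wants_old_prefaulted_pte	cpu_has_hw_af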
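
As a stand-alone illustration of the 'const' anonymous struct trick used for struct vm_fault (a toy example, not kernel code): members wrapped in the anonymous struct can still be set by designated initialisation, but GCC rejects any later assignment to them, while named fields outside the wrapper stay mutable.

    /* Toy structure, assumed for illustration; not struct vm_fault itself */
    struct fault_info {
    	const struct {
    		unsigned long address;	/* fixed for the lifetime of the fault */
    	};
    	unsigned int flags;		/* legitimately mutable */
    };

    static struct fault_info make_fault(unsigned long addr)
    {
    	/* Designated initialisation of the const members is still allowed */
    	struct fault_info fi = { .address = addr, .flags = 0 };
    	return fi;
    }

    static void example(struct fault_info *fi)
    {
    	fi->flags |= 0x1;	/* fine: outside the const anonymous struct */
    	/* fi->address = 0; */	/* GCC: error: assignment of read-only member */
    }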