From 6cdfa1d5d5d8285108495c33588c48cdda81b647 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:42 +0100 Subject: mm/pgtable: make pte_next_pfn() independent of set_ptes() Let's provide pte_next_pfn(), independently of set_ptes(). This allows for using the generic pte_next_pfn() version in some arch-specific set_ptes() implementations, and prepares for reusing pte_next_pfn() in other context. Link: https://lkml.kernel.org/r/20240129124649.189745-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Christophe Leroy Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/pgtable.h') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f6d0e3513948..351cd9dc7194 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,7 +212,6 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif -#ifndef set_ptes #ifndef pte_next_pfn static inline pte_t pte_next_pfn(pte_t pte) @@ -221,6 +220,7 @@ static inline pte_t pte_next_pfn(pte_t pte) } #endif +#ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. -- cgit v1.2.3 From f8d937761d65c87e9987b88ea7beb7bddc333a0e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:47 +0100 Subject: mm/memory: optimize fork() with PTE-mapped THP Let's implement PTE batching when consecutive (present) PTEs map consecutive pages of the same large folio, and all other PTE bits besides the PFNs are equal. We will optimize folio_pte_batch() separately, to ignore selected PTE bits. This patch is based on work by Ryan Roberts. Use __always_inline for __copy_present_ptes() and keep the handling for single PTEs completely separate from the multi-PTE case: we really want the compiler to optimize for the single-PTE case with small folios, to not degrade performance. Note that PTE batching will never exceed a single page table and will always stay within VMA boundaries. Further, processing PTE-mapped THP that maybe pinned and have PageAnonExclusive set on at least one subpage should work as expected, but there is room for improvement: We will repeatedly (1) detect a PTE batch (2) detect that we have to copy a page (3) fall back and allocate a single page to copy a single page. For now we won't care as pinned pages are a corner case, and we should rather look into maintaining only a single PageAnonExclusive bit for large folios. Link: https://lkml.kernel.org/r/20240129124649.189745-14-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 31 ++++++++++++++ mm/memory.c | 112 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 124 insertions(+), 19 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 351cd9dc7194..aab227e12493 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -650,6 +650,37 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef wrprotect_ptes +/** + * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to write-protect. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_set_wrprotect(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + for (;;) { + ptep_set_wrprotect(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif + /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings diff --git a/mm/memory.c b/mm/memory.c index ca888431680c..a7eb2301a1d1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -930,15 +930,15 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma return 0; } -static inline void __copy_present_pte(struct vm_area_struct *dst_vma, +static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, - pte_t pte, unsigned long addr) + pte_t pte, unsigned long addr, int nr) { struct mm_struct *src_mm = src_vma->vm_mm; /* If it's a COW mapping, write protect it both processes. */ if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { - ptep_set_wrprotect(src_mm, addr, src_pte); + wrprotect_ptes(src_mm, addr, src_pte, nr); pte = pte_wrprotect(pte); } @@ -950,26 +950,93 @@ static inline void __copy_present_pte(struct vm_area_struct *dst_vma, if (!userfaultfd_wp(dst_vma)) pte = pte_clear_uffd_wp(pte); - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); + set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); +} + +/* + * Detect a PTE batch: consecutive (present) PTEs that map consecutive + * pages of the same folio. + * + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN. + */ +static inline int folio_pte_batch(struct folio *folio, unsigned long addr, + pte_t *start_ptep, pte_t pte, int max_nr) +{ + unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); + const pte_t *end_ptep = start_ptep + max_nr; + pte_t expected_pte = pte_next_pfn(pte); + pte_t *ptep = start_ptep + 1; + + VM_WARN_ON_FOLIO(!pte_present(pte), folio); + + while (ptep != end_ptep) { + pte = ptep_get(ptep); + + if (!pte_same(pte, expected_pte)) + break; + + /* + * Stop immediately once we reached the end of the folio. In + * corner cases the next PFN might fall into a different + * folio. + */ + if (pte_pfn(pte) == folio_end_pfn) + break; + + expected_pte = pte_next_pfn(expected_pte); + ptep++; + } + + return ptep - start_ptep; } /* - * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page - * is required to copy this pte. + * Copy one present PTE, trying to batch-process subsequent PTEs that map + * consecutive pages of the same folio by copying them as well. + * + * Returns -EAGAIN if one preallocated page is required to copy the next PTE. + * Otherwise, returns the number of copied PTEs (at least 1). */ static inline int -copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, +copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, - int *rss, struct folio **prealloc) + int max_nr, int *rss, struct folio **prealloc) { struct page *page; struct folio *folio; + int err, nr; page = vm_normal_page(src_vma, addr, pte); if (unlikely(!page)) goto copy_pte; folio = page_folio(page); + + /* + * If we likely have to copy, just don't bother with batching. Make + * sure that the common "small folio" case is as fast as possible + * by keeping the batching logic separate. + */ + if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { + nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr); + folio_ref_add(folio, nr); + if (folio_test_anon(folio)) { + if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, + nr, src_vma))) { + folio_ref_sub(folio, nr); + return -EAGAIN; + } + rss[MM_ANONPAGES] += nr; + VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); + } else { + folio_dup_file_rmap_ptes(folio, page, nr); + rss[mm_counter_file(folio)] += nr; + } + __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, + addr, nr); + return nr; + } + folio_get(folio); if (folio_test_anon(folio)) { /* @@ -981,8 +1048,9 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); - return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, - addr, rss, prealloc, page); + err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, prealloc, page); + return err ? err : 1; } rss[MM_ANONPAGES]++; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); @@ -992,8 +1060,8 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, } copy_pte: - __copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, pte, addr); - return 0; + __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1); + return 1; } static inline struct folio *folio_prealloc(struct mm_struct *src_mm, @@ -1030,10 +1098,11 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *src_pte, *dst_pte; pte_t ptent; spinlock_t *src_ptl, *dst_ptl; - int progress, ret = 0; + int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; struct folio *prealloc = NULL; + int nr; again: progress = 0; @@ -1064,6 +1133,8 @@ again: arch_enter_lazy_mmu_mode(); do { + nr = 1; + /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. @@ -1102,9 +1173,10 @@ again: */ WARN_ON_ONCE(ret != -ENOENT); } - /* copy_present_pte() will clear `*prealloc' if consumed */ - ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, - ptent, addr, rss, &prealloc); + /* copy_present_ptes() will clear `*prealloc' if consumed */ + max_nr = (end - addr) / PAGE_SIZE; + ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, + ptent, addr, max_nr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. @@ -1121,8 +1193,10 @@ again: folio_put(prealloc); prealloc = NULL; } - progress += 8; - } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + nr = ret; + progress += 8 * nr; + } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, + addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_src_pte, src_ptl); @@ -1143,7 +1217,7 @@ again: prealloc = folio_prealloc(src_mm, src_vma, addr, false); if (!prealloc) return -ENOMEM; - } else if (ret) { + } else if (ret < 0) { VM_WARN_ON_ONCE(1); } -- cgit v1.2.3 From 10ebac4f95e7a9951c453d6c66d9beb5a35db338 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:35 +0100 Subject: mm/memory: optimize unmap/zap with PTE-mapped THP Similar to how we optimized fork(), let's implement PTE batching when consecutive (present) PTEs map consecutive pages of the same large folio. Most infrastructure we need for batching (mmu gather, rmap) is already there. We only have to add get_and_clear_full_ptes() and clear_full_ptes(). Similarly, extend zap_install_uffd_wp_if_needed() to process a PTE range. We won't bother sanity-checking the mapcount of all subpages, but only check the mapcount of the first subpage we process. If there is a real problem hiding somewhere, we can trigger it simply by using small folios, or when we zap single pages of a large folio. Ideally, we had that check in rmap code (including for delayed rmap), but then we cannot print the PTE. Let's keep it simple for now. If we ever have a cheap folio_mapcount(), we might just want to check for underflows there. To keep small folios as fast as possible force inlining of a specialized variant using __always_inline with nr=1. Link: https://lkml.kernel.org/r/20240214204435.167852-11-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 70 +++++++++++++++++++++++++++++++++++++ mm/memory.c | 92 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 136 insertions(+), 26 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index aab227e12493..49ab1f73b5c2 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -580,6 +580,76 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, } #endif +#ifndef get_and_clear_full_ptes +/** + * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of + * the same folio, collecting dirty/accessed bits. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the + * returned PTE. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte = ptep_get_and_clear_full(mm, addr, ptep, full); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} +#endif + +#ifndef clear_full_ptes +/** + * clear_full_ptes - Clear present PTEs that map consecutive pages of the same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_get_and_clear_full(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + ptep_get_and_clear_full(mm, addr, ptep, full); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif /* * If two threads concurrently fault at the same page, the thread that diff --git a/mm/memory.c b/mm/memory.c index 168096f9360e..465ada39c2b7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1517,7 +1517,7 @@ static inline bool zap_drop_file_uffd_wp(struct zap_details *details) */ static inline void zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte, + unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { /* Zap on anonymous always means dropping everything */ @@ -1527,20 +1527,27 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, if (zap_drop_file_uffd_wp(details)) return; - pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + for (;;) { + /* the PFN in the PTE is irrelevant. */ + pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + if (--nr == 0) + break; + pte++; + addr += PAGE_SIZE; + } } -static inline void zap_present_folio_pte(struct mmu_gather *tlb, +static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, - struct page *page, pte_t *pte, pte_t ptent, unsigned long addr, - struct zap_details *details, int *rss, bool *force_flush, - bool *force_break) + struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, + unsigned long addr, struct zap_details *details, int *rss, + bool *force_flush, bool *force_break) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; if (!folio_test_anon(folio)) { - ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); if (pte_dirty(ptent)) { folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { @@ -1550,36 +1557,49 @@ static inline void zap_present_folio_pte(struct mmu_gather *tlb, } if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); - rss[mm_counter(folio)]--; + rss[mm_counter(folio)] -= nr; } else { /* We don't need up-to-date accessed/dirty bits. */ - ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); - rss[MM_ANONPAGES]--; + clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); + rss[MM_ANONPAGES] -= nr; } + /* Checking a single PTE in a batch is sufficient. */ arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); + tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) - zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); + zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, + ptent); if (!delay_rmap) { - folio_remove_rmap_pte(folio, page, vma); + folio_remove_rmap_ptes(folio, page, nr, vma); + + /* Only sanity-check the first page in a batch. */ if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); } - if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { + if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush = true; *force_break = true; } } -static inline void zap_present_pte(struct mmu_gather *tlb, +/* + * Zap or skip at least one present PTE, trying to batch-process subsequent + * PTEs that map consecutive pages of the same folio. + * + * Returns the number of processed (skipped or zapped) PTEs (at least 1). + */ +static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, - unsigned long addr, struct zap_details *details, - int *rss, bool *force_flush, bool *force_break) + unsigned int max_nr, unsigned long addr, + struct zap_details *details, int *rss, bool *force_flush, + bool *force_break) { + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; + int nr; page = vm_normal_page(vma, addr, ptent); if (!page) { @@ -1589,14 +1609,29 @@ static inline void zap_present_pte(struct mmu_gather *tlb, tlb_remove_tlb_entry(tlb, pte, addr); VM_WARN_ON_ONCE(userfaultfd_wp(vma)); ksm_might_unmap_zero_page(mm, ptent); - return; + return 1; } folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) - return; - zap_present_folio_pte(tlb, vma, folio, page, pte, ptent, addr, details, - rss, force_flush, force_break); + return 1; + + /* + * Make sure that the common "small folio" case is as fast as possible + * by keeping the batching logic separate. + */ + if (unlikely(folio_test_large(folio) && max_nr != 1)) { + nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, + NULL); + + zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, + addr, details, rss, force_flush, + force_break); + return nr; + } + zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, + details, rss, force_flush, force_break); + return 1; } static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -1611,6 +1646,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *start_pte; pte_t *pte; swp_entry_t entry; + int nr; tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); @@ -1624,7 +1660,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t ptent = ptep_get(pte); struct folio *folio; struct page *page; + int max_nr; + nr = 1; if (pte_none(ptent)) continue; @@ -1632,10 +1670,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, break; if (pte_present(ptent)) { - zap_present_pte(tlb, vma, pte, ptent, addr, details, - rss, &force_flush, &force_break); + max_nr = (end - addr) / PAGE_SIZE; + nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr, + addr, details, rss, &force_flush, + &force_break); if (unlikely(force_break)) { - addr += PAGE_SIZE; + addr += nr * PAGE_SIZE; break; } continue; @@ -1689,8 +1729,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, WARN_ON_ONCE(1); } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); - } while (pte++, addr += PAGE_SIZE, addr != end); + zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent); + } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); -- cgit v1.2.3 From 6280d7317ccae19c776a3b6cf9848c964f958091 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:48 +0000 Subject: mm: clarify the spec for set_ptes() Patch series "Transparent Contiguous PTEs for User Mappings", v6. This is a series to opportunistically and transparently use contpte mappings (set the contiguous bit in ptes) for user memory when those mappings meet the requirements. The change benefits arm64, but there is some (very) minor refactoring for x86 to enable its integration with core-mm. It is part of a wider effort to improve performance by allocating and mapping variable-sized blocks of memory (folios). One aim is for the 4K kernel to approach the performance of the 16K kernel, but without breaking compatibility and without the associated increase in memory. Another aim is to benefit the 16K and 64K kernels by enabling 2M THP, since this is the contpte size for those kernels. We have good performance data that demonstrates both aims are being met (see below). Of course this is only one half of the change. We require the mapped physical memory to be the correct size and alignment for this to actually be useful (i.e. 64K for 4K pages, or 2M for 16K/64K pages). Fortunately folios are solving this problem for us. Filesystems that support it (XFS, AFS, EROFS, tmpfs, ...) will allocate large folios up to the PMD size today, and more filesystems are coming. And for anonymous memory, "multi-size THP" is now upstream. Patch Layout ============ In this version, I've split the patches to better show each optimization: - 1-2: mm prep: misc code and docs cleanups - 3-6: mm,arm64,x86 prep: Add pte_advance_pfn() and make pte_next_pfn() a generic wrapper around it - 7-11: arm64 prep: Refactor ptep helpers into new layer - 12: functional contpte implementation - 23-18: various optimizations on top of the contpte implementation Testing ======= I've tested this series on both Ampere Altra (bare metal) and Apple M2 (VM): - mm selftests (inc new tests written for multi-size THP); no regressions - Speedometer Java script benchmark in Chromium web browser; no issues - Kernel compilation; no issues - Various tests under high memory pressure with swap enabled; no issues Performance =========== High Level Use Cases ~~~~~~~~~~~~~~~~~~~~ First some high level use cases (kernel compilation and speedometer JavaScript benchmarks). These are running on Ampere Altra (I've seen similar improvements on Android/Pixel 6). baseline: mm-unstable (mTHP switched off) mTHP: + enable 16K, 32K, 64K mTHP sizes "always" mTHP + contpte: + this series mTHP + contpte + exefolio: + patch at [6], which series supports Kernel Compilation with -j8 (negative is faster): | kernel | real-time | kern-time | user-time | |---------------------------|-----------|-----------|-----------| | baseline | 0.0% | 0.0% | 0.0% | | mTHP | -5.0% | -39.1% | -0.7% | | mTHP + contpte | -6.0% | -41.4% | -1.5% | | mTHP + contpte + exefolio | -7.8% | -43.1% | -3.4% | Kernel Compilation with -j80 (negative is faster): | kernel | real-time | kern-time | user-time | |---------------------------|-----------|-----------|-----------| | baseline | 0.0% | 0.0% | 0.0% | | mTHP | -5.0% | -36.6% | -0.6% | | mTHP + contpte | -6.1% | -38.2% | -1.6% | | mTHP + contpte + exefolio | -7.4% | -39.2% | -3.2% | Speedometer (positive is faster): | kernel | runs_per_min | |:--------------------------|--------------| | baseline | 0.0% | | mTHP | 1.5% | | mTHP + contpte | 3.2% | | mTHP + contpte + exefolio | 4.5% | Micro Benchmarks ~~~~~~~~~~~~~~~~ The following microbenchmarks are intended to demonstrate the performance of fork() and munmap() do not regress. I'm showing results for order-0 (4K) mappings, and for order-9 (2M) PTE-mapped THP. Thanks to David for sharing his benchmarks. baseline: mm-unstable + batch zap [7] series contpte-basic: + patches 0-19; functional contpte implementation contpte-batch: + patches 20-23; implement new batched APIs contpte-inline: + patch 24; __always_inline to help compiler contpte-fold: + patch 25; fold contpte mapping when sensible Primary platform is Ampere Altra bare metal. I'm also showing results for M2 VM (on top of MacOS) for reference, although experience suggests this might not be the most reliable for performance numbers of this sort: | FORK | order-0 | order-9 | | Ampere Altra |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 2.7% | 0.0% | 0.2% | | contpte-basic | 6.3% | 1.4% | 1948.7% | 0.2% | | contpte-batch | 7.6% | 2.0% | -1.9% | 0.4% | | contpte-inline | 3.6% | 1.5% | -1.0% | 0.2% | | contpte-fold | 4.6% | 2.1% | -1.8% | 0.2% | | MUNMAP | order-0 | order-9 | | Ampere Altra |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 0.5% | 0.0% | 0.3% | | contpte-basic | 1.8% | 0.3% | 1104.8% | 0.1% | | contpte-batch | -0.3% | 0.4% | 2.7% | 0.1% | | contpte-inline | -0.1% | 0.6% | 0.9% | 0.1% | | contpte-fold | 0.1% | 0.6% | 0.8% | 0.1% | | FORK | order-0 | order-9 | | Apple M2 VM |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 1.4% | 0.0% | 0.8% | | contpte-basic | 6.8% | 1.2% | 469.4% | 1.4% | | contpte-batch | -7.7% | 2.0% | -8.9% | 0.7% | | contpte-inline | -6.0% | 2.1% | -6.0% | 2.0% | | contpte-fold | 5.9% | 1.4% | -6.4% | 1.4% | | MUNMAP | order-0 | order-9 | | Apple M2 VM |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 0.6% | 0.0% | 0.4% | | contpte-basic | 1.6% | 0.6% | 233.6% | 0.7% | | contpte-batch | 1.9% | 0.3% | -3.9% | 0.4% | | contpte-inline | 2.2% | 0.8% | -1.6% | 0.9% | | contpte-fold | 1.5% | 0.7% | -1.7% | 0.7% | Misc ~~~~ John Hubbard at Nvidia has indicated dramatic 10x performance improvements for some workloads at [8], when using 64K base page kernel. [1] https://lore.kernel.org/linux-arm-kernel/20230622144210.2623299-1-ryan.roberts@arm.com/ [2] https://lore.kernel.org/linux-arm-kernel/20231115163018.1303287-1-ryan.roberts@arm.com/ [3] https://lore.kernel.org/linux-arm-kernel/20231204105440.61448-1-ryan.roberts@arm.com/ [4] https://lore.kernel.org/lkml/20231218105100.172635-1-ryan.roberts@arm.com/ [5] https://lore.kernel.org/linux-mm/633af0a7-0823-424f-b6ef-374d99483f05@arm.com/ [6] https://lore.kernel.org/lkml/08c16f7d-f3b3-4f22-9acc-da943f647dc3@arm.com/ [7] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@redhat.com/ [8] https://lore.kernel.org/linux-mm/c507308d-bdd4-5f9e-d4ff-e96e4520be85@nvidia.com/ [9] https://gitlab.arm.com/linux-arm/linux-rr/-/tree/features/granule_perf/contpte-lkml_v6 This patch (of 18): set_ptes() spec implies that it can only be used to set a present pte because it interprets the PFN field to increment it. However, set_pte_at() has been implemented on top of set_ptes() since set_ptes() was introduced, and set_pte_at() allows setting a pte to a not-present state. So clarify the spec to state that when nr==1, new state of pte may be present or not present. When nr>1, new state of all ptes must be present. While we are at it, tighten the spec to set requirements around the initial state of ptes; when nr==1 it may be either present or not-present. But when nr>1 all ptes must initially be not-present. All set_ptes() callsites already conform to this requirement. Stating it explicitly is useful because it allows for a simplification to the upcoming arm64 contpte implementation. Link: https://lkml.kernel.org/r/20240215103205.2607016-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240215103205.2607016-2-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/pgtable.h') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 49ab1f73b5c2..231370e1b80f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -229,6 +229,10 @@ static inline pte_t pte_next_pfn(pte_t pte) * @pte: Page table entry for the first page. * @nr: Number of pages to map. * + * When nr==1, initial state of pte may be present or not present, and new state + * may be present or not present. When nr>1, initial state of all ptes must be + * not present, and new state must be present. + * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * -- cgit v1.2.3 From 583ceaaa339960e673ac0029f323bb1c6ffc96d7 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:50 +0000 Subject: mm: introduce pte_advance_pfn() and use for pte_next_pfn() The goal is to be able to advance a PTE by an arbitrary number of PFNs. So introduce a new API that takes a nr param. Define the default implementation here and allow for architectures to override. pte_next_pfn() becomes a wrapper around pte_advance_pfn(). Follow up commits will convert each overriding architecture's pte_next_pfn() to pte_advance_pfn(). Link: https://lkml.kernel.org/r/20240215103205.2607016-4-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 231370e1b80f..b7ac8358f2aa 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,14 +212,17 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif - #ifndef pte_next_pfn -static inline pte_t pte_next_pfn(pte_t pte) +#ifndef pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { - return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #endif +#define pte_next_pfn(pte) pte_advance_pfn(pte, 1) +#endif + #ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. -- cgit v1.2.3 From fb23bf6bd288db3187c27b971e558a3e9f70ae96 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:53 +0000 Subject: mm: tidy up pte_next_pfn() definition Now that the all architecture overrides of pte_next_pfn() have been replaced with pte_advance_pfn(), we can simplify the definition of the generic pte_next_pfn() macro so that it is unconditionally defined. Link: https://lkml.kernel.org/r/20240215103205.2607016-7-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index b7ac8358f2aa..bc005d84f764 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,7 +212,6 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif -#ifndef pte_next_pfn #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { @@ -221,7 +220,6 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) #endif #define pte_next_pfn(pte) pte_advance_pfn(pte, 1) -#endif #ifndef set_ptes /** -- cgit v1.2.3 From c6ec76a2ebc5829e5826b218d2e1475ec11b333e Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:02 +0000 Subject: mm: add pte_batch_hint() to reduce scanning in folio_pte_batch() Some architectures (e.g. arm64) can tell from looking at a pte, if some follow-on ptes also map contiguous physical memory with the same pgprot. (for arm64, these are contpte mappings). Take advantage of this knowledge to optimize folio_pte_batch() so that it can skip these ptes when scanning to create a batch. By default, if an arch does not opt-in, folio_pte_batch() returns a compile-time 1, so the changes are optimized out and the behaviour is as before. arm64 will opt-in to providing this hint in the next patch, which will greatly reduce the cost of ptep_get() when scanning a range of contptes. Link: https://lkml.kernel.org/r/20240215103205.2607016-16-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Tested-by: John Hubbard Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 21 +++++++++++++++++++++ mm/memory.c | 19 ++++++++++++------- 2 files changed, 33 insertions(+), 7 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index bc005d84f764..a36cf4e124b0 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,6 +212,27 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif +#ifndef pte_batch_hint +/** + * pte_batch_hint - Number of pages that can be added to batch without scanning. + * @ptep: Page table pointer for the entry. + * @pte: Page table entry. + * + * Some architectures know that a set of contiguous ptes all map the same + * contiguous memory with the same permissions. In this case, it can provide a + * hint to aid pte batching without the core code needing to scan every pte. + * + * An architecture implementation may ignore the PTE accessed state. Further, + * the dirty state must apply atomically to all the PTEs described by the hint. + * + * May be overridden by the architecture, else pte_batch_hint is always 1. + */ +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + return 1; +} +#endif + #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { diff --git a/mm/memory.c b/mm/memory.c index 465ada39c2b7..642b4f2be523 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -988,16 +988,20 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; - pte_t expected_pte = __pte_batch_clear_ignored(pte_next_pfn(pte), flags); - pte_t *ptep = start_ptep + 1; + pte_t expected_pte, *ptep; bool writable; + int nr; if (any_writable) *any_writable = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); - while (ptep != end_ptep) { + nr = pte_batch_hint(start_ptep, pte); + expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); + ptep = start_ptep + nr; + + while (ptep < end_ptep) { pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); @@ -1011,17 +1015,18 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, * corner cases the next PFN might fall into a different * folio. */ - if (pte_pfn(pte) == folio_end_pfn) + if (pte_pfn(pte) >= folio_end_pfn) break; if (any_writable) *any_writable |= writable; - expected_pte = pte_next_pfn(expected_pte); - ptep++; + nr = pte_batch_hint(ptep, pte); + expected_pte = pte_advance_pfn(expected_pte, nr); + ptep += nr; } - return ptep - start_ptep; + return min(ptep - start_ptep, max_nr); } /* -- cgit v1.2.3 From c05995b7ec2a73bf813a8944978e175f8e4ec3ac Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 Mar 2024 12:37:50 +0800 Subject: mm/treewide: align up pXd_leaf() retval across archs Even if pXd_leaf() API is defined globally, it's not clear on the retval, and there are three types used (bool, int, unsigned log). Always return a boolean for pXd_leaf() APIs. Link: https://lkml.kernel.org/r/20240305043750.93762-11-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: "Aneesh Kumar K.V" Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Muchun Song Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Thomas Gleixner Cc: Vincenzo Frascino Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/riscv/include/asm/pgtable-64.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/s390/include/asm/pgtable.h | 4 ++-- arch/sparc/include/asm/pgtable_64.h | 4 ++-- arch/x86/include/asm/pgtable.h | 8 ++++---- include/linux/pgtable.h | 8 ++++---- 6 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index b42017d76924..2c7e1661db01 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -190,7 +190,7 @@ static inline int pud_bad(pud_t pud) } #define pud_leaf pud_leaf -static inline int pud_leaf(pud_t pud) +static inline bool pud_leaf(pud_t pud) { return pud_present(pud) && (pud_val(pud) & _PAGE_LEAF); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index add5cd30ab34..6839520dbcb1 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -241,7 +241,7 @@ static inline int pmd_bad(pmd_t pmd) } #define pmd_leaf pmd_leaf -static inline int pmd_leaf(pmd_t pmd) +static inline bool pmd_leaf(pmd_t pmd) { return pmd_present(pmd) && (pmd_val(pmd) & _PAGE_LEAF); } diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 9e08af5b9247..60950e7a25f5 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -706,7 +706,7 @@ static inline int pud_none(pud_t pud) } #define pud_leaf pud_leaf -static inline int pud_leaf(pud_t pud) +static inline bool pud_leaf(pud_t pud) { if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) != _REGION_ENTRY_TYPE_R3) return 0; @@ -714,7 +714,7 @@ static inline int pud_leaf(pud_t pud) } #define pmd_leaf pmd_leaf -static inline int pmd_leaf(pmd_t pmd) +static inline bool pmd_leaf(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0; } diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 6ff0a28d5fd1..4d1bafaba942 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -681,7 +681,7 @@ static inline unsigned long pte_special(pte_t pte) } #define pmd_leaf pmd_leaf -static inline unsigned long pmd_leaf(pmd_t pmd) +static inline bool pmd_leaf(pmd_t pmd) { pte_t pte = __pte(pmd_val(pmd)); @@ -868,7 +868,7 @@ static inline pmd_t *pud_pgtable(pud_t pud) #define p4d_page(p4d) NULL #define pud_leaf pud_leaf -static inline unsigned long pud_leaf(pud_t pud) +static inline bool pud_leaf(pud_t pud) { pte_t pte = __pte(pud_val(pud)); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index cfc84c55d0e6..7621a5acb13e 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -252,7 +252,7 @@ static inline unsigned long pgd_pfn(pgd_t pgd) } #define p4d_leaf p4d_leaf -static inline int p4d_leaf(p4d_t p4d) +static inline bool p4d_leaf(p4d_t p4d) { /* No 512 GiB pages yet */ return 0; @@ -261,7 +261,7 @@ static inline int p4d_leaf(p4d_t p4d) #define pte_page(pte) pfn_to_page(pte_pfn(pte)) #define pmd_leaf pmd_leaf -static inline int pmd_leaf(pmd_t pte) +static inline bool pmd_leaf(pmd_t pte) { return pmd_flags(pte) & _PAGE_PSE; } @@ -1086,7 +1086,7 @@ static inline pmd_t *pud_pgtable(pud_t pud) #define pud_page(pud) pfn_to_page(pud_pfn(pud)) #define pud_leaf pud_leaf -static inline int pud_leaf(pud_t pud) +static inline bool pud_leaf(pud_t pud) { return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == (_PAGE_PSE | _PAGE_PRESENT); @@ -1413,7 +1413,7 @@ static inline bool pgdp_maps_userspace(void *__ptr) } #define pgd_leaf pgd_leaf -static inline int pgd_leaf(pgd_t pgd) { return 0; } +static inline bool pgd_leaf(pgd_t pgd) { return false; } #ifdef CONFIG_PAGE_TABLE_ISOLATION /* diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a36cf4e124b0..85fc7554cd52 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1777,16 +1777,16 @@ typedef unsigned int pgtbl_mod_mask; * Only meaningful when called on a valid entry. */ #ifndef pgd_leaf -#define pgd_leaf(x) 0 +#define pgd_leaf(x) false #endif #ifndef p4d_leaf -#define p4d_leaf(x) 0 +#define p4d_leaf(x) false #endif #ifndef pud_leaf -#define pud_leaf(x) 0 +#define pud_leaf(x) false #endif #ifndef pmd_leaf -#define pmd_leaf(x) 0 +#define pmd_leaf(x) false #endif #ifndef pgd_leaf_size -- cgit v1.2.3