Diffstat (limited to 'mm')
| -rw-r--r-- | mm/backing-dev.c | 2 |
| -rw-r--r-- | mm/fadvise.c | 3 |
| -rw-r--r-- | mm/filemap.c | 171 |
| -rw-r--r-- | mm/huge_memory.c | 33 |
| -rw-r--r-- | mm/internal.h | 2 |
| -rw-r--r-- | mm/kasan/common.c | 12 |
| -rw-r--r-- | mm/kfence/core.c | 14 |
| -rw-r--r-- | mm/memcontrol.c | 40 |
| -rw-r--r-- | mm/memfd.c | 43 |
| -rw-r--r-- | mm/memory.c | 62 |
| -rw-r--r-- | mm/memory_hotplug.c | 17 |
| -rw-r--r-- | mm/mempool.c | 409 |
| -rw-r--r-- | mm/page-writeback.c | 6 |
| -rw-r--r-- | mm/page_alloc.c | 15 |
| -rw-r--r-- | mm/secretmem.c | 20 |
| -rw-r--r-- | mm/shmem.c | 8 |
| -rw-r--r-- | mm/slab.h | 112 |
| -rw-r--r-- | mm/slab_common.c | 29 |
| -rw-r--r-- | mm/slub.c | 694 |
| -rw-r--r-- | mm/sparse.c | 3 |
| -rw-r--r-- | mm/truncate.c | 10 |
| -rw-r--r-- | mm/usercopy.c | 24 |
| -rw-r--r-- | mm/vmscan.c | 2 |
| -rw-r--r-- | mm/workingset.c | 2 |
24 files changed, 934 insertions, 799 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 41b6c9386b69..c5740c6d37a2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -72,7 +72,7 @@ static void collect_wb_stats(struct wb_stats *stats, list_for_each_entry(inode, &wb->b_more_io, i_io_list) stats->nr_more_io++; list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) - if (inode->i_state & I_DIRTY_TIME) + if (inode_state_read_once(inode) & I_DIRTY_TIME) stats->nr_dirty_time++; spin_unlock(&wb->list_lock); diff --git a/mm/fadvise.c b/mm/fadvise.c index 588fe76c5a14..67028e30aa91 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -111,8 +111,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) spin_unlock(&file->f_lock); break; case POSIX_FADV_DONTNEED: - __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + filemap_flush_range(mapping, offset, endbyte); /* * First and last FULL page! Partial pages are deliberately diff --git a/mm/filemap.c b/mm/filemap.c index 024b71da5224..dfc8a31f1222 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -256,7 +256,7 @@ void filemap_remove_folio(struct folio *folio) __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); @@ -335,7 +335,7 @@ void delete_from_page_cache_batch(struct address_space *mapping, page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); for (i = 0; i < folio_batch_count(fbatch); i++) @@ -366,83 +366,60 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) return 0; } -/** - * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range - * @mapping: address space structure to write - * @wbc: the writeback_control controlling the writeout - * - * Call writepages on the mapping using the provided wbc to control the - * writeout. - * - * Return: %0 on success, negative error code otherwise. - */ -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc) +static int filemap_writeback(struct address_space *mapping, loff_t start, + loff_t end, enum writeback_sync_modes sync_mode, + long *nr_to_write) { + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = nr_to_write ? *nr_to_write : LONG_MAX, + .range_start = start, + .range_end = end, + }; int ret; if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; - wbc_attach_fdatawrite_inode(wbc, mapping->host); - ret = do_writepages(mapping, wbc); - wbc_detach_inode(wbc); + wbc_attach_fdatawrite_inode(&wbc, mapping->host); + ret = do_writepages(mapping, &wbc); + wbc_detach_inode(&wbc); + + if (!ret && nr_to_write) + *nr_to_write = wbc.nr_to_write; return ret; } -EXPORT_SYMBOL(filemap_fdatawrite_wbc); /** - * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) - * @sync_mode: enable synchronous operation * * Start writeback against all of a mapping's dirty pages that lie * within the byte offsets <start, end> inclusive. 
* - * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory cleansing writeback. The difference between - * these two operations is that if a dirty page/buffer is encountered, it must - * be waited upon, and not just skipped over. + * This is a data integrity operation that waits upon dirty or in writeback + * pages. * * Return: %0 on success, negative error code otherwise. */ -int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode) -{ - struct writeback_control wbc = { - .sync_mode = sync_mode, - .nr_to_write = LONG_MAX, - .range_start = start, - .range_end = end, - }; - - return filemap_fdatawrite_wbc(mapping, &wbc); -} - -static inline int __filemap_fdatawrite(struct address_space *mapping, - int sync_mode) +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end) { - return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); + return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL); } +EXPORT_SYMBOL(filemap_fdatawrite_range); int filemap_fdatawrite(struct address_space *mapping) { - return __filemap_fdatawrite(mapping, WB_SYNC_ALL); + return filemap_fdatawrite_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_fdatawrite); -int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end) -{ - return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); -} -EXPORT_SYMBOL(filemap_fdatawrite_range); - /** - * filemap_fdatawrite_range_kick - start writeback on a range + * filemap_flush_range - start writeback on a range * @mapping: target address_space * @start: index to start writeback on * @end: last (inclusive) index for writeback @@ -452,12 +429,12 @@ EXPORT_SYMBOL(filemap_fdatawrite_range); * * Return: %0 on success, negative error code otherwise. */ -int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, +int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end) { - return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); + return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL); } -EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); +EXPORT_SYMBOL_GPL(filemap_flush_range); /** * filemap_flush - mostly a non-blocking flush @@ -470,10 +447,22 @@ EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); */ int filemap_flush(struct address_space *mapping) { - return __filemap_fdatawrite(mapping, WB_SYNC_NONE); + return filemap_flush_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_flush); +/* + * Start writeback on @nr_to_write pages from @mapping. No one but the existing + * btrfs caller should be using this. Talk to linux-mm if you think adding a + * new caller is a good idea. + */ +int filemap_flush_nr(struct address_space *mapping, long *nr_to_write) +{ + return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE, + nr_to_write); +} +EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs"); + /** * filemap_range_has_page - check if a page exists in range. * @mapping: address space within which to check @@ -691,8 +680,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, return 0; if (mapping_needs_writeback(mapping)) { - err = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); + err = filemap_fdatawrite_range(mapping, lstart, lend); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. 
@@ -794,8 +782,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) return 0; if (mapping_needs_writeback(mapping)) { - err = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); + err = filemap_fdatawrite_range(mapping, lstart, lend); /* See comment of filemap_write_and_wait() */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); @@ -2366,6 +2353,64 @@ out: } EXPORT_SYMBOL(filemap_get_folios_tag); +/** + * filemap_get_folios_dirty - Get a batch of dirty folios + * @mapping: The address_space to search + * @start: The starting folio index + * @end: The final folio index (inclusive) + * @fbatch: The batch to fill + * + * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except + * the returned folios are presumed to be dirty or undergoing writeback. Dirty + * state is presumed because we don't block on folio lock nor want to miss + * folios. Callers that need to can recheck state upon locking the folio. + * + * This may not return all dirty folios if the batch gets filled up. + * + * Return: The number of folios found. + * Also update @start to be positioned for traversal of the next folio. + */ +unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start, + pgoff_t end, struct folio_batch *fbatch) +{ + XA_STATE(xas, &mapping->i_pages, *start); + struct folio *folio; + + rcu_read_lock(); + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { + if (xa_is_value(folio)) + continue; + if (folio_trylock(folio)) { + bool clean = !folio_test_dirty(folio) && + !folio_test_writeback(folio); + folio_unlock(folio); + if (clean) { + folio_put(folio); + continue; + } + } + if (!folio_batch_add(fbatch, folio)) { + unsigned long nr = folio_nr_pages(folio); + *start = folio->index + nr; + goto out; + } + } + /* + * We come here when there is no folio beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. This + * breaks the iteration when there is a folio at index -1 but that is + * already broke anyway. + */ + if (end == (pgoff_t)-1) + *start = (pgoff_t)-1; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return folio_batch_count(fbatch); +} + /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: @@ -4470,16 +4515,8 @@ int filemap_invalidate_inode(struct inode *inode, bool flush, unmap_mapping_pages(mapping, first, nr, false); /* Write back the data if we're asked to. */ - if (flush) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = start, - .range_end = end, - }; - - filemap_fdatawrite_wbc(mapping, &wbc); - } + if (flush) + filemap_fdatawrite_range(mapping, start, end); /* Wait for writeback to complete on all folios and discard. 
*/ invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6cba1cb14b23..1192e62531cd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1642,17 +1642,30 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, EXPORT_SYMBOL_GPL(vmf_insert_folio_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -void touch_pmd(struct vm_area_struct *vma, unsigned long addr, +/** + * touch_pmd - Mark page table pmd entry as accessed and dirty (for write) + * @vma: The VMA covering @addr + * @addr: The virtual address + * @pmd: pmd pointer into the page table mapping @addr + * @write: Whether it's a write access + * + * Return: whether the pmd entry is changed + */ +bool touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write) { - pmd_t _pmd; + pmd_t entry; - _pmd = pmd_mkyoung(*pmd); + entry = pmd_mkyoung(*pmd); if (write) - _pmd = pmd_mkdirty(_pmd); + entry = pmd_mkdirty(entry); if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, - pmd, _pmd, write)) + pmd, entry, write)) { update_mmu_cache_pmd(vma, addr, pmd); + return true; + } + + return false; } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -1842,18 +1855,14 @@ unlock: } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -void huge_pmd_set_accessed(struct vm_fault *vmf) +bool huge_pmd_set_accessed(struct vm_fault *vmf) { bool write = vmf->flags & FAULT_FLAG_WRITE; - vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) - goto unlock; - - touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); + return false; -unlock: - spin_unlock(vmf->ptl); + return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); } static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) diff --git a/mm/internal.h b/mm/internal.h index 1561fc2ff5b8..27ad37a41868 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1402,7 +1402,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs, */ void touch_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, bool write); -void touch_pmd(struct vm_area_struct *vma, unsigned long addr, +bool touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); /* diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d4c14359feaf..38e8bb0bf326 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -520,24 +520,20 @@ void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order, bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) { - struct folio *folio = virt_to_folio(ptr); + struct page *page = virt_to_page(ptr); struct slab *slab; - /* - * This function can be called for large kmalloc allocation that get - * their memory from page_alloc. Thus, the folio might not be a slab. - */ - if (unlikely(!folio_test_slab(folio))) { + if (unlikely(PageLargeKmalloc(page))) { if (check_page_allocation(ptr, ip)) return false; - kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false); + kasan_poison(ptr, page_size(page), KASAN_PAGE_FREE, false); return true; } if (is_kfence_address(ptr)) return true; - slab = folio_slab(folio); + slab = page_slab(page); if (check_slab_allocation(slab->slab_cache, ptr, ip)) return false; diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 727c20c94ac5..e62b5516bf48 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -612,14 +612,15 @@ static unsigned long kfence_init_pool(void) * enters __slab_free() slow-path. 
*/ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab; + struct page *page; if (!i || (i % 2)) continue; - slab = page_slab(pfn_to_page(start_pfn + i)); - __folio_set_slab(slab_folio(slab)); + page = pfn_to_page(start_pfn + i); + __SetPageSlab(page); #ifdef CONFIG_MEMCG + struct slab *slab = page_slab(page); slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | MEMCG_DATA_OBJEXTS; #endif @@ -665,16 +666,17 @@ static unsigned long kfence_init_pool(void) reset_slab: for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab; + struct page *page; if (!i || (i % 2)) continue; - slab = page_slab(pfn_to_page(start_pfn + i)); + page = pfn_to_page(start_pfn + i); #ifdef CONFIG_MEMCG + struct slab *slab = page_slab(page); slab->obj_exts = 0; #endif - __folio_clear_slab(slab_folio(slab)); + __ClearPageSlab(page); } return addr; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4deda33625f4..b46356da6c0e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2557,38 +2557,25 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, } static __always_inline -struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) +struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p) { /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in * slab->obj_exts. */ - if (folio_test_slab(folio)) { - struct slabobj_ext *obj_exts; - struct slab *slab; - unsigned int off; - - slab = folio_slab(folio); - obj_exts = slab_obj_exts(slab); - if (!obj_exts) - return NULL; - - off = obj_to_index(slab->slab_cache, slab, p); - if (obj_exts[off].objcg) - return obj_cgroup_memcg(obj_exts[off].objcg); + struct slabobj_ext *obj_exts; + unsigned int off; + obj_exts = slab_obj_exts(slab); + if (!obj_exts) return NULL; - } - /* - * folio_memcg_check() is used here, because in theory we can encounter - * a folio where the slab flag has been cleared already, but - * slab->obj_exts has not been freed yet - * folio_memcg_check() will guarantee that a proper memory - * cgroup pointer or NULL will be returned. 
- */ - return folio_memcg_check(folio); + off = obj_to_index(slab->slab_cache, slab, p); + if (obj_exts[off].objcg) + return obj_cgroup_memcg(obj_exts[off].objcg); + + return NULL; } /* @@ -2602,10 +2589,15 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) */ struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) { + struct slab *slab; + if (mem_cgroup_disabled()) return NULL; - return mem_cgroup_from_obj_folio(virt_to_folio(p), p); + slab = virt_to_slab(p); + if (slab) + return mem_cgroup_from_obj_slab(slab, p); + return folio_memcg_check(virt_to_folio(p)); } static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) diff --git a/mm/memfd.c b/mm/memfd.c index a405eaa451ee..ab5312aff14b 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -460,6 +460,8 @@ static struct file *alloc_file(const char *name, unsigned int flags) { unsigned int *file_seals; struct file *file; + struct inode *inode; + int err = 0; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, VM_NORESERVE, @@ -471,12 +473,20 @@ static struct file *alloc_file(const char *name, unsigned int flags) } if (IS_ERR(file)) return file; + + inode = file_inode(file); + err = security_inode_init_security_anon(inode, + &QSTR(MEMFD_ANON_NAME), NULL); + if (err) { + fput(file); + file = ERR_PTR(err); + return file; + } + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; if (flags & MFD_NOEXEC_SEAL) { - struct inode *inode = file_inode(file); - inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); if (file_seals) { @@ -497,9 +507,9 @@ SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { - struct file *file; - int fd, error; - char *name; + char *name __free(kfree) = NULL; + unsigned int fd_flags; + int error; error = sanitize_flags(&flags); if (error < 0) @@ -509,25 +519,6 @@ SYSCALL_DEFINE2(memfd_create, if (IS_ERR(name)) return PTR_ERR(name); - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_free_name; - } - - file = alloc_file(name, flags); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_free_fd; - } - - fd_install(fd, file); - kfree(name); - return fd; - -err_free_fd: - put_unused_fd(fd); -err_free_name: - kfree(name); - return error; + fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0; + return FD_ADD(fd_flags, alloc_file(name, flags)); } diff --git a/mm/memory.c b/mm/memory.c index b59ae7ce42eb..aad432e71251 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6134,6 +6134,45 @@ split: } /* + * The page faults may be spurious because of the racy access to the + * page table. For example, a non-populated virtual page is accessed + * on 2 CPUs simultaneously, thus the page faults are triggered on + * both CPUs. However, it's possible that one CPU (say CPU A) cannot + * find the reason for the page fault if the other CPU (say CPU B) has + * changed the page table before the PTE is checked on CPU A. Most of + * the time, the spurious page faults can be ignored safely. However, + * if the page fault is for the write access, it's possible that a + * stale read-only TLB entry exists in the local CPU and needs to be + * flushed on some architectures. This is called the spurious page + * fault fixing. + * + * Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page() + * by default and used as such on most architectures, while + * flush_tlb_fix_spurious_fault_pmd() is defined as NOP by default and + * used as such on most architectures. 
+ */ +static void fix_spurious_fault(struct vm_fault *vmf, + enum pgtable_level ptlevel) +{ + /* Skip spurious TLB flush for retried page fault */ + if (vmf->flags & FAULT_FLAG_TRIED) + return; + /* + * This is needed only for protection faults but the arch code + * is not yet telling us if this is a protection fault or not. + * This still avoids useless tlb flushes for .text page faults + * with threads. + */ + if (vmf->flags & FAULT_FLAG_WRITE) { + if (ptlevel == PGTABLE_LEVEL_PTE) + flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, + vmf->pte); + else + flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address, + vmf->pmd); + } +} +/* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. @@ -6214,23 +6253,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, - vmf->flags & FAULT_FLAG_WRITE)) { + vmf->flags & FAULT_FLAG_WRITE)) update_mmu_cache_range(vmf, vmf->vma, vmf->address, vmf->pte, 1); - } else { - /* Skip spurious TLB flush for retried page fault */ - if (vmf->flags & FAULT_FLAG_TRIED) - goto unlock; - /* - * This is needed only for protection faults but the arch code - * is not yet telling us if this is a protection fault or not. - * This still avoids useless tlb flushes for .text page faults - * with threads. - */ - if (vmf->flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, - vmf->pte); - } + else + fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -6327,7 +6354,10 @@ retry_pud: if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(&vmf); + vmf.ptl = pmd_lock(mm, vmf.pmd); + if (!huge_pmd_set_accessed(&vmf)) + fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD); + spin_unlock(vmf.ptl); return 0; } } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0be83039c3b5..238a6712738e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1088,7 +1088,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group, } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone, bool mhp_off_inaccessible) + struct zone *zone) { unsigned long end_pfn = pfn + nr_pages; int ret, i; @@ -1097,15 +1097,6 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, if (ret) return ret; - /* - * Memory block is accessible at this stage and hence poison the struct - * pages now. If the memory block is accessible during memory hotplug - * addition phase, then page poisining is already performed in - * sparse_add_section(). 
- */ - if (mhp_off_inaccessible) - page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE, false); @@ -1444,7 +1435,7 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size) } static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, - u64 start, u64 size, mhp_t mhp_flags) + u64 start, u64 size) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; @@ -1460,8 +1451,6 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, }; mhp_altmap.free = memory_block_memmap_on_memory_pages(); - if (mhp_flags & MHP_OFFLINE_INACCESSIBLE) - mhp_altmap.inaccessible = true; params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), GFP_KERNEL); if (!params.altmap) { @@ -1555,7 +1544,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && mhp_supports_memmap_on_memory()) { - ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); + ret = create_altmaps_and_memory_blocks(nid, group, start, size); if (ret) goto error; } else { diff --git a/mm/mempool.c b/mm/mempool.c index d7bbf1189db9..c290e5261b47 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/mm/mempool.c - * * memory buffer pool support. Such pools are mostly used * for guaranteed, deadlock-free memory allocations during * extreme VM load. @@ -9,7 +7,7 @@ * started by Ingo Molnar, Copyright (C) 2001 * debugging by David Rientjes, Copyright (C) 2015 */ - +#include <linux/fault-inject.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/highmem.h> @@ -20,8 +18,27 @@ #include <linux/writeback.h> #include "slab.h" +static DECLARE_FAULT_ATTR(fail_mempool_alloc); +static DECLARE_FAULT_ATTR(fail_mempool_alloc_bulk); + +static int __init mempool_faul_inject_init(void) +{ + int error; + + error = PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc", + NULL, &fail_mempool_alloc)); + if (error) + return error; + + /* booting will fail on error return here, don't bother to cleanup */ + return PTR_ERR_OR_ZERO( + fault_create_debugfs_attr("fail_mempool_alloc_bulk", NULL, + &fail_mempool_alloc_bulk)); +} +late_initcall(mempool_faul_inject_init); + #ifdef CONFIG_SLUB_DEBUG_ON -static void poison_error(mempool_t *pool, void *element, size_t size, +static void poison_error(struct mempool *pool, void *element, size_t size, size_t byte) { const int nr = pool->curr_nr; @@ -38,7 +55,7 @@ static void poison_error(mempool_t *pool, void *element, size_t size, dump_stack(); } -static void __check_element(mempool_t *pool, void *element, size_t size) +static void __check_element(struct mempool *pool, void *element, size_t size) { u8 *obj = element; size_t i; @@ -54,7 +71,7 @@ static void __check_element(mempool_t *pool, void *element, size_t size) memset(obj, POISON_INUSE, size); } -static void check_element(mempool_t *pool, void *element) +static void check_element(struct mempool *pool, void *element) { /* Skip checking: KASAN might save its metadata in the element. */ if (kasan_enabled()) @@ -93,7 +110,7 @@ static void __poison_element(void *element, size_t size) obj[size - 1] = POISON_END; } -static void poison_element(mempool_t *pool, void *element) +static void poison_element(struct mempool *pool, void *element) { /* Skip poisoning: KASAN might save its metadata in the element. 
*/ if (kasan_enabled()) @@ -124,15 +141,16 @@ static void poison_element(mempool_t *pool, void *element) } } #else /* CONFIG_SLUB_DEBUG_ON */ -static inline void check_element(mempool_t *pool, void *element) +static inline void check_element(struct mempool *pool, void *element) { } -static inline void poison_element(mempool_t *pool, void *element) +static inline void poison_element(struct mempool *pool, void *element) { } #endif /* CONFIG_SLUB_DEBUG_ON */ -static __always_inline bool kasan_poison_element(mempool_t *pool, void *element) +static __always_inline bool kasan_poison_element(struct mempool *pool, + void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) return kasan_mempool_poison_object(element); @@ -142,7 +160,7 @@ static __always_inline bool kasan_poison_element(mempool_t *pool, void *element) return true; } -static void kasan_unpoison_element(mempool_t *pool, void *element) +static void kasan_unpoison_element(struct mempool *pool, void *element) { if (pool->alloc == mempool_kmalloc) kasan_mempool_unpoison_object(element, (size_t)pool->pool_data); @@ -154,7 +172,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) (unsigned long)pool->pool_data); } -static __always_inline void add_element(mempool_t *pool, void *element) +static __always_inline void add_element(struct mempool *pool, void *element) { BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr); poison_element(pool, element); @@ -162,7 +180,7 @@ static __always_inline void add_element(mempool_t *pool, void *element) pool->elements[pool->curr_nr++] = element; } -static void *remove_element(mempool_t *pool) +static void *remove_element(struct mempool *pool) { void *element = pool->elements[--pool->curr_nr]; @@ -183,7 +201,7 @@ static void *remove_element(mempool_t *pool) * May be called on a zeroed but uninitialized mempool (i.e. allocated with * kzalloc()). */ -void mempool_exit(mempool_t *pool) +void mempool_exit(struct mempool *pool) { while (pool->curr_nr) { void *element = remove_element(pool); @@ -202,7 +220,7 @@ EXPORT_SYMBOL(mempool_exit); * Free all reserved elements in @pool and @pool itself. This function * only sleeps if the free_fn() function sleeps. */ -void mempool_destroy(mempool_t *pool) +void mempool_destroy(struct mempool *pool) { if (unlikely(!pool)) return; @@ -212,9 +230,9 @@ void mempool_destroy(mempool_t *pool) } EXPORT_SYMBOL(mempool_destroy); -int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id) +int mempool_init_node(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int node_id) { spin_lock_init(&pool->lock); pool->min_nr = min_nr; @@ -264,8 +282,9 @@ EXPORT_SYMBOL(mempool_init_node); * * Return: %0 on success, negative error code otherwise. */ -int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data) +int mempool_init_noprof(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data) { return mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); @@ -291,11 +310,11 @@ EXPORT_SYMBOL(mempool_init_noprof); * * Return: pointer to the created memory pool object or %NULL on error. 
*/ -mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id) +struct mempool *mempool_create_node_noprof(int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int node_id) { - mempool_t *pool; + struct mempool *pool; pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); if (!pool) @@ -329,7 +348,7 @@ EXPORT_SYMBOL(mempool_create_node_noprof); * * Return: %0 on success, negative error code otherwise. */ -int mempool_resize(mempool_t *pool, int new_min_nr) +int mempool_resize(struct mempool *pool, int new_min_nr) { void *element; void **new_elements; @@ -391,140 +410,227 @@ out: } EXPORT_SYMBOL(mempool_resize); -/** - * mempool_alloc - allocate an element from a specific memory pool - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). - * @gfp_mask: the usual allocation bitmask. - * - * this function only sleeps if the alloc_fn() function sleeps or - * returns NULL. Note that due to preallocation, this function - * *never* fails when called from process contexts. (it might - * fail if called from an IRQ context.) - * Note: using __GFP_ZERO is not supported. - * - * Return: pointer to the allocated element or %NULL on error. - */ -void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) +static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems, + unsigned int count, unsigned int allocated, + gfp_t gfp_mask) { - void *element; unsigned long flags; - wait_queue_entry_t wait; - gfp_t gfp_temp; + unsigned int i; - VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); - might_alloc(gfp_mask); - - gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ - gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ - gfp_mask |= __GFP_NOWARN; /* failures are OK */ + spin_lock_irqsave(&pool->lock, flags); + if (unlikely(pool->curr_nr < count - allocated)) + goto fail; + for (i = 0; i < count; i++) { + if (!elems[i]) { + elems[i] = remove_element(pool); + allocated++; + } + } + spin_unlock_irqrestore(&pool->lock, flags); - gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); + /* Paired with rmb in mempool_free(), read comment there. */ + smp_wmb(); -repeat_alloc: + /* + * Update the allocation stack trace as this is more useful for + * debugging. + */ + for (i = 0; i < count; i++) + kmemleak_update_trace(elems[i]); + return allocated; - element = pool->alloc(gfp_temp, pool->pool_data); - if (likely(element != NULL)) - return element; +fail: + if (gfp_mask & __GFP_DIRECT_RECLAIM) { + DEFINE_WAIT(wait); - spin_lock_irqsave(&pool->lock, flags); - if (likely(pool->curr_nr)) { - element = remove_element(pool); + prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_irqrestore(&pool->lock, flags); - /* paired with rmb in mempool_free(), read comment there */ - smp_wmb(); + /* - * Update the allocation stack trace as this is more useful - * for debugging. + * Wait for someone else to return an element to @pool, but wake + * up occasionally as memory pressure might have reduced even + * and the normal allocation in alloc_fn could succeed even if + * no element was returned. */ - kmemleak_update_trace(element); - return element; - } - - /* - * We use gfp mask w/o direct reclaim or IO for the first round. If - * alloc failed with that and @pool was empty, retry immediately. 
- */ - if (gfp_temp != gfp_mask) { + io_schedule_timeout(5 * HZ); + finish_wait(&pool->wait, &wait); + } else { + /* We must not sleep if __GFP_DIRECT_RECLAIM is not set. */ spin_unlock_irqrestore(&pool->lock, flags); - gfp_temp = gfp_mask; - goto repeat_alloc; } - /* We must not sleep if !__GFP_DIRECT_RECLAIM */ - if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { - spin_unlock_irqrestore(&pool->lock, flags); - return NULL; - } + return allocated; +} - /* Let's wait for someone else to return an element to @pool */ - init_wait(&wait); - prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); +/* + * Adjust the gfp flags for mempool allocations, as we never want to dip into + * the global emergency reserves or retry in the page allocator. + * + * The first pass also doesn't want to go reclaim, but the next passes do, so + * return a separate subset for that first iteration. + */ +static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask) +{ + *gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + return *gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); +} - spin_unlock_irqrestore(&pool->lock, flags); +/** + * mempool_alloc_bulk - allocate multiple elements from a memory pool + * @pool: pointer to the memory pool + * @elems: partially or fully populated elements array + * @count: number of entries in @elem that need to be allocated + * @allocated: number of entries in @elem already allocated + * + * Allocate elements for each slot in @elem that is non-%NULL. This is done by + * first calling into the alloc_fn supplied at pool initialization time, and + * dipping into the reserved pool when alloc_fn fails to allocate an element. + * + * On return all @count elements in @elems will be populated. + * + * Return: Always 0. If it wasn't for %$#^$ alloc tags, it would return void. + */ +int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems, + unsigned int count, unsigned int allocated) +{ + gfp_t gfp_mask = GFP_KERNEL; + gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); + unsigned int i = 0; + + VM_WARN_ON_ONCE(count > pool->min_nr); + might_alloc(gfp_mask); /* - * FIXME: this should be io_schedule(). The timeout is there as a - * workaround for some DM problems in 2.6.18. + * If an error is injected, fail all elements in a bulk allocation so + * that we stress the multiple elements missing path. */ - io_schedule_timeout(5*HZ); + if (should_fail_ex(&fail_mempool_alloc_bulk, 1, FAULT_NOWARN)) { + pr_info("forcing mempool usage for %pS\n", + (void *)_RET_IP_); + goto use_pool; + } + +repeat_alloc: + /* + * Try to allocate the elements using the allocation callback first as + * that might succeed even when the caller's bulk allocation did not. + */ + for (i = 0; i < count; i++) { + if (elems[i]) + continue; + elems[i] = pool->alloc(gfp_temp, pool->pool_data); + if (unlikely(!elems[i])) + goto use_pool; + allocated++; + } + + return 0; - finish_wait(&pool->wait, &wait); +use_pool: + allocated = mempool_alloc_from_pool(pool, elems, count, allocated, + gfp_temp); + gfp_temp = gfp_mask; goto repeat_alloc; } -EXPORT_SYMBOL(mempool_alloc_noprof); +EXPORT_SYMBOL_GPL(mempool_alloc_bulk_noprof); /** - * mempool_alloc_preallocated - allocate an element from preallocated elements - * belonging to a specific memory pool - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). + * mempool_alloc - allocate an element from a memory pool + * @pool: pointer to the memory pool + * @gfp_mask: GFP_* flags. %__GFP_ZERO is not supported. 
* - * This function is similar to mempool_alloc, but it only attempts allocating - * an element from the preallocated elements. It does not sleep and immediately - * returns if no preallocated elements are available. + * Allocate an element from @pool. This is done by first calling into the + * alloc_fn supplied at pool initialization time, and dipping into the reserved + * pool when alloc_fn fails to allocate an element. * - * Return: pointer to the allocated element or %NULL if no elements are - * available. + * This function only sleeps if the alloc_fn callback sleeps, or when waiting + * for elements to become available in the pool. + * + * Return: pointer to the allocated element or %NULL when failing to allocate + * an element. Allocation failure can only happen when @gfp_mask does not + * include %__GFP_DIRECT_RECLAIM. */ -void *mempool_alloc_preallocated(mempool_t *pool) +void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) { + gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); void *element; - unsigned long flags; - spin_lock_irqsave(&pool->lock, flags); - if (likely(pool->curr_nr)) { - element = remove_element(pool); - spin_unlock_irqrestore(&pool->lock, flags); - /* paired with rmb in mempool_free(), read comment there */ - smp_wmb(); + VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); + might_alloc(gfp_mask); + +repeat_alloc: + if (should_fail_ex(&fail_mempool_alloc, 1, FAULT_NOWARN)) { + pr_info("forcing mempool usage for %pS\n", + (void *)_RET_IP_); + element = NULL; + } else { + element = pool->alloc(gfp_temp, pool->pool_data); + } + + if (unlikely(!element)) { /* - * Update the allocation stack trace as this is more useful - * for debugging. + * Try to allocate an element from the pool. + * + * The first pass won't have __GFP_DIRECT_RECLAIM and won't + * sleep in mempool_alloc_from_pool. Retry the allocation + * with all flags set in that case. */ - kmemleak_update_trace(element); - return element; + if (!mempool_alloc_from_pool(pool, &element, 1, 0, gfp_temp)) { + if (gfp_temp != gfp_mask) { + gfp_temp = gfp_mask; + goto repeat_alloc; + } + if (gfp_mask & __GFP_DIRECT_RECLAIM) { + goto repeat_alloc; + } + } } - spin_unlock_irqrestore(&pool->lock, flags); - return NULL; + return element; +} +EXPORT_SYMBOL(mempool_alloc_noprof); + +/** + * mempool_alloc_preallocated - allocate an element from preallocated elements + * belonging to a memory pool + * @pool: pointer to the memory pool + * + * This function is similar to mempool_alloc(), but it only attempts allocating + * an element from the preallocated elements. It only takes a single spinlock_t + * and immediately returns if no preallocated elements are available. + * + * Return: pointer to the allocated element or %NULL if no elements are + * available. + */ +void *mempool_alloc_preallocated(struct mempool *pool) +{ + void *element = NULL; + + mempool_alloc_from_pool(pool, &element, 1, 0, GFP_NOWAIT); + return element; } EXPORT_SYMBOL(mempool_alloc_preallocated); /** - * mempool_free - return an element to the pool. - * @element: pool element pointer. - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). + * mempool_free_bulk - return elements to a mempool + * @pool: pointer to the memory pool + * @elems: elements to return + * @count: number of elements to return * - * this function only sleeps if the free_fn() function sleeps. + * Returns a number of elements from the start of @elem to @pool if @pool needs + * replenishing and sets their slots in @elem to NULL. 
Other elements are left + * in @elem. + * + * Return: number of elements transferred to @pool. Elements are always + * transferred from the beginning of @elem, so the return value can be used as + * an offset into @elem for the freeing the remaining elements in the caller. */ -void mempool_free(void *element, mempool_t *pool) +unsigned int mempool_free_bulk(struct mempool *pool, void **elems, + unsigned int count) { unsigned long flags; - - if (unlikely(element == NULL)) - return; + unsigned int freed = 0; + bool added = false; /* * Paired with the wmb in mempool_alloc(). The preceding read is @@ -558,21 +664,6 @@ void mempool_free(void *element, mempool_t *pool) * Waiters happen iff curr_nr is 0 and the above guarantee also * ensures that there will be frees which return elements to the * pool waking up the waiters. - */ - if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { - spin_lock_irqsave(&pool->lock, flags); - if (likely(pool->curr_nr < pool->min_nr)) { - add_element(pool, element); - spin_unlock_irqrestore(&pool->lock, flags); - if (wq_has_sleeper(&pool->wait)) - wake_up(&pool->wait); - return; - } - spin_unlock_irqrestore(&pool->lock, flags); - } - - /* - * Handle the min_nr = 0 edge case: * * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds, * so waiters sleeping on pool->wait would never be woken by the @@ -580,20 +671,45 @@ void mempool_free(void *element, mempool_t *pool) * allocation of element when both min_nr and curr_nr are 0, and * any active waiters are properly awakened. */ - if (unlikely(pool->min_nr == 0 && + if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { + spin_lock_irqsave(&pool->lock, flags); + while (pool->curr_nr < pool->min_nr && freed < count) { + add_element(pool, elems[freed++]); + added = true; + } + spin_unlock_irqrestore(&pool->lock, flags); + } else if (unlikely(pool->min_nr == 0 && READ_ONCE(pool->curr_nr) == 0)) { + /* Handle the min_nr = 0 edge case: */ spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr == 0)) { - add_element(pool, element); - spin_unlock_irqrestore(&pool->lock, flags); - if (wq_has_sleeper(&pool->wait)) - wake_up(&pool->wait); - return; + add_element(pool, elems[freed++]); + added = true; } spin_unlock_irqrestore(&pool->lock, flags); } - pool->free(element, pool->pool_data); + if (unlikely(added) && wq_has_sleeper(&pool->wait)) + wake_up(&pool->wait); + + return freed; +} +EXPORT_SYMBOL_GPL(mempool_free_bulk); + +/** + * mempool_free - return an element to the pool. + * @element: element to return + * @pool: pointer to the memory pool + * + * Returns @element to @pool if it needs replenishing, else frees it using + * the free_fn callback in @pool. + * + * This function only sleeps if the free_fn callback sleeps. + */ +void mempool_free(void *element, struct mempool *pool) +{ + if (likely(element) && !mempool_free_bulk(pool, &element, 1)) + pool->free(element, pool->pool_data); } EXPORT_SYMBOL(mempool_free); @@ -632,19 +748,6 @@ void mempool_kfree(void *element, void *pool_data) } EXPORT_SYMBOL(mempool_kfree); -void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t)pool_data; - return kvmalloc(size, gfp_mask); -} -EXPORT_SYMBOL(mempool_kvmalloc); - -void mempool_kvfree(void *element, void *pool_data) -{ - kvfree(element); -} -EXPORT_SYMBOL(mempool_kvfree); - /* * A simple mempool-backed page allocator that allocates pages * of the order specified by pool_data. 
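The mm/mempool.c rework above replaces the single-element slow path with a bulk API. Below is a minimal caller sketch, assuming mempool_alloc_bulk()/mempool_free_bulk() behave as documented in the hunks above and that the usual alloc_hooks() wrapper exposes mempool_alloc_bulk() for mempool_alloc_bulk_noprof(); the pool, element type, and call site (my_req_pool, struct my_req, my_submit_batch) are hypothetical and not part of the patch.

/*
 * Illustrative caller only -- not part of the patch. Assumes a pool
 * initialized elsewhere with min_nr >= MY_NR_REQS, e.g.:
 *	mempool_init(&my_req_pool, MY_NR_REQS, mempool_kmalloc,
 *		     mempool_kfree, (void *)sizeof(struct my_req));
 */
#define MY_NR_REQS	4

static struct mempool my_req_pool;

static void my_submit_batch(void)
{
	void *reqs[MY_NR_REQS] = { };	/* only NULL slots get populated */
	unsigned int kept, i;

	/*
	 * Per the kernel-doc above this cannot fail: it falls back to
	 * (and, if needed, sleeps for) the preallocated elements when
	 * the underlying allocator fails.
	 */
	mempool_alloc_bulk(&my_req_pool, reqs, MY_NR_REQS, 0);

	/* ... issue the requests ... */

	/*
	 * Return elements: the pool keeps the first 'kept' entries to
	 * replenish itself (their slots are set to NULL) and the
	 * caller frees the rest.
	 */
	kept = mempool_free_bulk(&my_req_pool, reqs, MY_NR_REQS);
	for (i = kept; i < MY_NR_REQS; i++)
		mempool_free(reqs[i], &my_req_pool);
}

The partially-populated array convention (only NULL slots are filled) mirrors alloc_pages_bulk() as documented in the mm/page_alloc.c hunk below, so a caller can retry into the same array.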
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 757bc4d3b5b5..a124ab6a205d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2434,12 +2434,6 @@ static bool folio_prepare_writeback(struct address_space *mapping, return true; } -static xa_mark_t wbc_to_tag(struct writeback_control *wbc) -{ - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - return PAGECACHE_TAG_TOWRITE; - return PAGECACHE_TAG_DIRTY; -} static pgoff_t wbc_end(struct writeback_control *wbc) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ed82ee55e66a..4074c07d02ca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4977,13 +4977,18 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * @nr_pages: The number of pages desired in the array * @page_array: Array to store the pages * - * This is a batched version of the page allocator that attempts to - * allocate nr_pages quickly. Pages are added to the page_array. + * This is a batched version of the page allocator that attempts to allocate + * @nr_pages quickly. Pages are added to @page_array. * - * Note that only NULL elements are populated with pages and nr_pages - * is the maximum number of pages that will be stored in the array. + * Note that only the elements in @page_array that were cleared to %NULL on + * entry are populated with newly allocated pages. @nr_pages is the maximum + * number of pages that will be stored in the array. * - * Returns the number of pages in the array. + * Returns the number of pages in @page_array, including ones already + * allocated on entry. This can be less than the number requested in @nr_pages, + * but all empty slots are filled from the beginning. I.e., if all slots in + * @page_array were set to %NULL on entry, the slots from 0 to the return value + * - 1 will be filled. */ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, diff --git a/mm/secretmem.c b/mm/secretmem.c index b59350daffe3..f0ef4e198884 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -224,9 +224,6 @@ err_free_inode: SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) { - struct file *file; - int fd, err; - /* make sure local flags do not confict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); @@ -238,22 +235,7 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) if (atomic_read(&secretmem_users) < 0) return -ENFILE; - fd = get_unused_fd_flags(flags & O_CLOEXEC); - if (fd < 0) - return fd; - - file = secretmem_file_create(flags); - if (IS_ERR(file)) { - err = PTR_ERR(file); - goto err_put_fd; - } - - fd_install(fd, file); - return fd; - -err_put_fd: - put_unused_fd(fd); - return err; + return FD_ADD(flags & O_CLOEXEC, secretmem_file_create(flags)); } static int secretmem_init_fs_context(struct fs_context *fc) diff --git a/mm/shmem.c b/mm/shmem.c index 5a3f0f754dc0..899303d8c9aa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1075,7 +1075,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. 
*/ -static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, +static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend, bool unfalloc) { struct address_space *mapping = inode->i_mapping; @@ -1132,7 +1132,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { - same_folio = lend < folio_pos(folio) + folio_size(folio); + same_folio = lend < folio_next_pos(folio); folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); @@ -1226,7 +1226,7 @@ whole_folios: shmem_recalc_inode(inode, 0, -nr_swaps_freed); } -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); @@ -5778,7 +5778,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, } #endif -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); } diff --git a/mm/slab.h b/mm/slab.h index 078daecc7cf5..f730e012553c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -40,13 +40,29 @@ typedef u64 freelist_full_t; * Freelist pointer and counter to cmpxchg together, avoids the typical ABA * problems with cmpxchg of just a pointer. */ -typedef union { - struct { - void *freelist; - unsigned long counter; +struct freelist_counters { + union { + struct { + void *freelist; + union { + unsigned long counters; + struct { + unsigned inuse:16; + unsigned objects:15; + /* + * If slab debugging is enabled then the + * frozen bit can be reused to indicate + * that the slab was corrupted + */ + unsigned frozen:1; + }; + }; + }; +#ifdef system_has_freelist_aba + freelist_full_t freelist_counters; +#endif }; - freelist_full_t full; -} freelist_aba_t; +}; /* Reuses the bits in struct page */ struct slab { @@ -69,27 +85,7 @@ struct slab { #endif }; /* Double-word boundary */ - union { - struct { - void *freelist; /* first free object */ - union { - unsigned long counters; - struct { - unsigned inuse:16; - unsigned objects:15; - /* - * If slab debugging is enabled then the - * frozen bit can be reused to indicate - * that the slab was corrupted - */ - unsigned frozen:1; - }; - }; - }; -#ifdef system_has_freelist_aba - freelist_aba_t freelist_counter; -#endif - }; + struct freelist_counters; }; struct rcu_head rcu_head; }; @@ -114,23 +110,10 @@ SLAB_MATCH(_unused_slab_obj_exts, obj_exts); #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); #if defined(system_has_freelist_aba) -static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t))); +static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(struct freelist_counters))); #endif /** - * folio_slab - Converts from folio to slab. - * @folio: The folio. - * - * Currently struct slab is a different representation of a folio where - * folio_test_slab() is true. - * - * Return: The slab which contains this folio. - */ -#define folio_slab(folio) (_Generic((folio), \ - const struct folio *: (const struct slab *)(folio), \ - struct folio *: (struct slab *)(folio))) - -/** * slab_folio - The folio allocated for a slab * @s: The slab. 
* @@ -146,20 +129,24 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t) struct slab *: (struct folio *)s)) /** - * page_slab - Converts from first struct page to slab. - * @p: The first (either head of compound or single) page of slab. - * - * A temporary wrapper to convert struct page to struct slab in situations where - * we know the page is the compound head, or single order-0 page. - * - * Long-term ideally everything would work with struct slab directly or go - * through folio to struct slab. + * page_slab - Converts from struct page to its slab. + * @page: A page which may or may not belong to a slab. * - * Return: The slab which contains this page + * Return: The slab which contains this page or NULL if the page does + * not belong to a slab. This includes pages returned from large kmalloc. */ -#define page_slab(p) (_Generic((p), \ - const struct page *: (const struct slab *)(p), \ - struct page *: (struct slab *)(p))) +static inline struct slab *page_slab(const struct page *page) +{ + unsigned long head; + + head = READ_ONCE(page->compound_head); + if (head & 1) + page = (struct page *)(head - 1); + if (data_race(page->page_type >> 24) != PGTY_slab) + page = NULL; + + return (struct slab *)page; +} /** * slab_page - The first struct page allocated for a slab @@ -188,12 +175,7 @@ static inline pg_data_t *slab_pgdat(const struct slab *slab) static inline struct slab *virt_to_slab(const void *addr) { - struct folio *folio = virt_to_folio(addr); - - if (!folio_test_slab(folio)) - return NULL; - - return folio_slab(folio); + return page_slab(virt_to_page(addr)); } static inline int slab_order(const struct slab *slab) @@ -236,10 +218,8 @@ struct kmem_cache_order_objects { * Slab cache management. */ struct kmem_cache { -#ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; struct lock_class_key lock_key; -#endif struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. 
*/ slab_flags_t flags; @@ -601,6 +581,16 @@ static inline size_t slab_ksize(const struct kmem_cache *s) return s->size; } +static inline unsigned int large_kmalloc_order(const struct page *page) +{ + return page[1].flags.f & 0xff; +} + +static inline size_t large_kmalloc_size(const struct page *page) +{ + return PAGE_SIZE << large_kmalloc_order(page); +} + #ifdef CONFIG_SLUB_DEBUG void dump_unreclaimable_slab(void); #else diff --git a/mm/slab_common.c b/mm/slab_common.c index 932d13ada36c..84dfff4f7b1f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -997,26 +997,27 @@ void __init create_kmalloc_caches(void) */ size_t __ksize(const void *object) { - struct folio *folio; + const struct page *page; + const struct slab *slab; if (unlikely(object == ZERO_SIZE_PTR)) return 0; - folio = virt_to_folio(object); + page = virt_to_page(object); - if (unlikely(!folio_test_slab(folio))) { - if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE)) - return 0; - if (WARN_ON(object != folio_address(folio))) - return 0; - return folio_size(folio); - } + if (unlikely(PageLargeKmalloc(page))) + return large_kmalloc_size(page); + + slab = page_slab(page); + /* Delete this after we're sure there are no users */ + if (WARN_ON(!slab)) + return page_size(page); #ifdef CONFIG_SLUB_DEBUG - skip_orig_size_check(folio_slab(folio)->slab_cache, object); + skip_orig_size_check(slab->slab_cache, object); #endif - return slab_ksize(folio_slab(folio)->slab_cache); + return slab_ksize(slab->slab_cache); } gfp_t kmalloc_fix_flags(gfp_t flags) @@ -1614,17 +1615,15 @@ static void kfree_rcu_work(struct work_struct *work) static bool kfree_rcu_sheaf(void *obj) { struct kmem_cache *s; - struct folio *folio; struct slab *slab; if (is_vmalloc_addr(obj)) return false; - folio = virt_to_folio(obj); - if (unlikely(!folio_test_slab(folio))) + slab = virt_to_slab(obj); + if (unlikely(!slab)) return false; - slab = folio_slab(folio); s = slab->slab_cache; if (s->cpu_sheaves) { if (likely(!IS_ENABLED(CONFIG_NUMA) || diff --git a/mm/slub.c b/mm/slub.c index a0b905c2a557..2acce22590f8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -410,19 +410,22 @@ enum stat_item { NR_SLUB_STAT_ITEMS }; -#ifndef CONFIG_SLUB_TINY -/* - * When changing the layout, make sure freelist and tid are still compatible - * with this_cpu_cmpxchg_double() alignment requirements. - */ -struct kmem_cache_cpu { +struct freelist_tid { union { struct { - void **freelist; /* Pointer to next available object */ + void *freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ }; - freelist_aba_t freelist_tid; + freelist_full_t freelist_tid; }; +}; + +/* + * When changing the layout, make sure freelist and tid are still compatible + * with this_cpu_cmpxchg_double() alignment requirements. 
+ */ +struct kmem_cache_cpu { + struct freelist_tid; struct slab *slab; /* The slab from which we are allocating */ #ifdef CONFIG_SLUB_CPU_PARTIAL struct slab *partial; /* Partially allocated slabs */ @@ -432,7 +435,6 @@ struct kmem_cache_cpu { unsigned int stat[NR_SLUB_STAT_ITEMS]; #endif }; -#endif /* CONFIG_SLUB_TINY */ static inline void stat(const struct kmem_cache *s, enum stat_item si) { @@ -469,7 +471,10 @@ struct slab_sheaf { struct rcu_head rcu_head; struct list_head barn_list; /* only used for prefilled sheafs */ - unsigned int capacity; + struct { + unsigned int capacity; + bool pfmemalloc; + }; }; struct kmem_cache *cache; unsigned int size; @@ -594,12 +599,10 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return freelist_ptr_decode(s, p, ptr_addr); } -#ifndef CONFIG_SLUB_TINY static void prefetch_freepointer(const struct kmem_cache *s, void *object) { prefetchw(object + s->offset); } -#endif /* * When running under KMSAN, get_freepointer_safe() may return an uninitialized @@ -711,10 +714,12 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) return s->cpu_partial_slabs; } #else +#ifdef SLAB_SUPPORTS_SYSFS static inline void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) { } +#endif static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) { @@ -755,32 +760,29 @@ static __always_inline void slab_unlock(struct slab *slab) } static inline bool -__update_freelist_fast(struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new) +__update_freelist_fast(struct slab *slab, struct freelist_counters *old, + struct freelist_counters *new) { #ifdef system_has_freelist_aba - freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old }; - freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new }; - - return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full); + return try_cmpxchg_freelist(&slab->freelist_counters, + &old->freelist_counters, + new->freelist_counters); #else return false; #endif } static inline bool -__update_freelist_slow(struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new) +__update_freelist_slow(struct slab *slab, struct freelist_counters *old, + struct freelist_counters *new) { bool ret = false; slab_lock(slab); - if (slab->freelist == freelist_old && - slab->counters == counters_old) { - slab->freelist = freelist_new; - slab->counters = counters_new; + if (slab->freelist == old->freelist && + slab->counters == old->counters) { + slab->freelist = new->freelist; + slab->counters = new->counters; ret = true; } slab_unlock(slab); @@ -796,22 +798,18 @@ __update_freelist_slow(struct slab *slab, * interrupt the operation. 
*/ static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new, - const char *n) + struct freelist_counters *old, struct freelist_counters *new, const char *n) { bool ret; if (USE_LOCKLESS_FAST_PATH()) lockdep_assert_irqs_disabled(); - if (s->flags & __CMPXCHG_DOUBLE) { - ret = __update_freelist_fast(slab, freelist_old, counters_old, - freelist_new, counters_new); - } else { - ret = __update_freelist_slow(slab, freelist_old, counters_old, - freelist_new, counters_new); - } + if (s->flags & __CMPXCHG_DOUBLE) + ret = __update_freelist_fast(slab, old, new); + else + ret = __update_freelist_slow(slab, old, new); + if (likely(ret)) return true; @@ -826,21 +824,17 @@ static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *sla } static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new, - const char *n) + struct freelist_counters *old, struct freelist_counters *new, const char *n) { bool ret; if (s->flags & __CMPXCHG_DOUBLE) { - ret = __update_freelist_fast(slab, freelist_old, counters_old, - freelist_new, counters_new); + ret = __update_freelist_fast(slab, old, new); } else { unsigned long flags; local_irq_save(flags); - ret = __update_freelist_slow(slab, freelist_old, counters_old, - freelist_new, counters_new); + ret = __update_freelist_slow(slab, old, new); local_irq_restore(flags); } if (likely(ret)) @@ -978,7 +972,7 @@ static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; static slab_flags_t slub_debug; #endif -static char *slub_debug_string; +static const char *slub_debug_string __ro_after_init; static int disable_higher_order_debug; /* @@ -1785,8 +1779,8 @@ static inline int free_consistency_checks(struct kmem_cache *s, * * returns the start of next block if there's any, or NULL */ -static char * -parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) +static const char * +parse_slub_debug_flags(const char *str, slab_flags_t *flags, const char **slabs, bool init) { bool higher_order_disable = false; @@ -1863,17 +1857,17 @@ check_slabs: return NULL; } -static int __init setup_slub_debug(char *str) +static int __init setup_slub_debug(const char *str, const struct kernel_param *kp) { slab_flags_t flags; slab_flags_t global_flags; - char *saved_str; - char *slab_list; + const char *saved_str; + const char *slab_list; bool global_slub_debug_changed = false; bool slab_list_specified = false; global_flags = DEBUG_DEFAULT_FLAGS; - if (*str++ != '=' || !*str) + if (!str || !*str) /* * No options specified. Switch on full debugging. 
 */ @@ -1917,11 +1911,15 @@ out: static_branch_unlikely(&init_on_free)) && (slub_debug & SLAB_POISON)) pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n"); - return 1; + return 0; } -__setup("slab_debug", setup_slub_debug); -__setup_param("slub_debug", slub_debug, setup_slub_debug, 0); +static const struct kernel_param_ops param_ops_slab_debug __initconst = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = setup_slub_debug, +}; +__core_param_cb(slab_debug, &param_ops_slab_debug, NULL, 0); +__core_param_cb(slub_debug, &param_ops_slab_debug, NULL, 0); /* * kmem_cache_flags - apply debugging options to the cache @@ -1935,9 +1933,9 @@ __setup_param("slub_debug", slub_debug, setup_slub_debug, 0); */ slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) { - char *iter; + const char *iter; size_t len; - char *next_block; + const char *next_block; slab_flags_t block_flags; slab_flags_t slub_debug_local = slub_debug; @@ -1961,7 +1959,7 @@ slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) continue; /* Found a block that has a slab list, search it */ while (*iter) { - char *end, *glob; + const char *end, *glob; size_t cmplen; end = strchrnul(iter, ','); @@ -2023,15 +2021,21 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -#ifndef CONFIG_SLUB_TINY static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, void **freelist, void *nextfree) { return false; } -#endif #endif /* CONFIG_SLUB_DEBUG */ +/* + * The allocated objcg pointers array is not accounted directly. + * Moreover, it should not come from DMA buffer and is not readily + * reclaimable. So those GFP bits should be masked off. + */ +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ + __GFP_ACCOUNT | __GFP_NOFAIL) + #ifdef CONFIG_SLAB_OBJ_EXT #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG @@ -2086,14 +2090,6 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts, #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -/* - * The allocated objcg pointers array is not accounted directly. - * Moreover, it should not come from DMA buffer and is not readily - * reclaimable. So those GFP bits should be masked off. - */ -#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ - __GFP_ACCOUNT | __GFP_NOFAIL) - static inline void init_slab_obj_exts(struct slab *slab) { slab->obj_exts = 0; @@ -2373,33 +2369,34 @@ bool memcg_slab_post_charge(void *p, gfp_t flags) { struct slabobj_ext *slab_exts; struct kmem_cache *s; - struct folio *folio; + struct page *page; struct slab *slab; unsigned long off; - folio = virt_to_folio(p); - if (!folio_test_slab(folio)) { + page = virt_to_page(p); + if (PageLargeKmalloc(page)) { + unsigned int order; int size; - if (folio_memcg_kmem(folio)) + if (PageMemcgKmem(page)) return true; - if (__memcg_kmem_charge_page(folio_page(folio, 0), flags, - folio_order(folio))) + order = large_kmalloc_order(page); + if (__memcg_kmem_charge_page(page, flags, order)) return false; /* - * This folio has already been accounted in the global stats but + * This page has already been accounted in the global stats but * not in the memcg stats. So, subtract from the global and use * the interface which adds to both global and memcg stats. 
*/ - size = folio_size(folio); - node_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -size); - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, size); + size = PAGE_SIZE << order; + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, -size); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, size); return true; } - slab = folio_slab(folio); + slab = page_slab(page); s = slab->slab_cache; /* @@ -2601,8 +2598,24 @@ static void *setup_object(struct kmem_cache *s, void *object) static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) { - struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects, - s->sheaf_capacity), gfp); + struct slab_sheaf *sheaf; + size_t sheaf_size; + + if (gfp & __GFP_NO_OBJ_EXT) + return NULL; + + gfp &= ~OBJCGS_CLEAR_MASK; + + /* + * Prevent recursion to the same cache, or a deep stack of kmallocs of + * varying sizes (sheaf capacity might differ for each kmalloc size + * bucket) + */ + if (s->flags & SLAB_KMALLOC) + gfp |= __GFP_NO_OBJ_EXT; + + sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); + sheaf = kzalloc(sheaf_size, gfp); if (unlikely(!sheaf)) return NULL; @@ -2655,7 +2668,7 @@ static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) if (!sheaf) return NULL; - if (refill_sheaf(s, sheaf, gfp)) { + if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC)) { free_empty_sheaf(s, sheaf); return NULL; } @@ -2733,12 +2746,13 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) sheaf->size = 0; } -static void __rcu_free_sheaf_prepare(struct kmem_cache *s, +static bool __rcu_free_sheaf_prepare(struct kmem_cache *s, struct slab_sheaf *sheaf) { bool init = slab_want_init_on_free(s); void **p = &sheaf->objects[0]; unsigned int i = 0; + bool pfmemalloc = false; while (i < sheaf->size) { struct slab *slab = virt_to_slab(p[i]); @@ -2751,8 +2765,13 @@ static void __rcu_free_sheaf_prepare(struct kmem_cache *s, continue; } + if (slab_test_pfmemalloc(slab)) + pfmemalloc = true; + i++; } + + return pfmemalloc; } static void rcu_free_sheaf_nobarn(struct rcu_head *head) @@ -3015,14 +3034,11 @@ static void barn_init(struct node_barn *barn) static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) { - struct list_head empty_list; - struct list_head full_list; + LIST_HEAD(empty_list); + LIST_HEAD(full_list); struct slab_sheaf *sheaf, *sheaf2; unsigned long flags; - INIT_LIST_HEAD(&empty_list); - INIT_LIST_HEAD(&full_list); - spin_lock_irqsave(&barn->lock, flags); list_splice_init(&barn->sheaves_full, &full_list); @@ -3048,24 +3064,24 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, struct kmem_cache_order_objects oo, bool allow_spin) { - struct folio *folio; + struct page *page; struct slab *slab; unsigned int order = oo_order(oo); if (unlikely(!allow_spin)) - folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, + page = alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, node, order); else if (node == NUMA_NO_NODE) - folio = (struct folio *)alloc_frozen_pages(flags, order); + page = alloc_frozen_pages(flags, order); else - folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); + page = __alloc_frozen_pages(flags, order, node, NULL); - if (!folio) + if (!page) return NULL; - slab = folio_slab(folio); - __folio_set_slab(folio); - if (folio_is_pfmemalloc(folio)) + __SetPageSlab(page); + slab = page_slab(page); + if (page_is_pfmemalloc(page)) slab_set_pfmemalloc(slab); return slab; @@ -3289,16 +3305,16 @@ static 
struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node) static void __free_slab(struct kmem_cache *s, struct slab *slab) { - struct folio *folio = slab_folio(slab); - int order = folio_order(folio); + struct page *page = slab_page(slab); + int order = compound_order(page); int pages = 1 << order; __slab_clear_pfmemalloc(slab); - folio->mapping = NULL; - __folio_clear_slab(folio); + page->mapping = NULL; + __ClearPageSlab(page); mm_account_reclaimed_pages(pages); unaccount_slab(slab, order, s); - free_frozen_pages(&folio->page, order); + free_frozen_pages(page, order); } static void rcu_free_slab(struct rcu_head *h) @@ -3618,8 +3634,6 @@ static struct slab *get_partial(struct kmem_cache *s, int node, return get_any_partial(s, pc); } -#ifndef CONFIG_SLUB_TINY - #ifdef CONFIG_PREEMPTION /* * Calculate the next globally unique transaction for disambiguation @@ -3723,8 +3737,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; unsigned long flags = 0; - struct slab new; - struct slab old; + struct freelist_counters old, new; if (READ_ONCE(slab->freelist)) { stat(s, DEACTIVATE_REMOTE_FREES); @@ -3773,10 +3786,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, } else { new.freelist = old.freelist; } - } while (!slab_update_freelist(s, slab, - old.freelist, old.counters, - new.freelist, new.counters, - "unfreezing slab")); + } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); /* * Stage three: Manipulate the slab list based on the updated state. @@ -4019,12 +4029,6 @@ static bool has_cpu_slab(int cpu, struct kmem_cache *s) return c->slab || slub_percpu_partial(c); } -#else /* CONFIG_SLUB_TINY */ -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } -static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; } -static inline void flush_this_cpu_slab(struct kmem_cache *s) { } -#endif /* CONFIG_SLUB_TINY */ - static bool has_pcs_used(int cpu, struct kmem_cache *s) { struct slub_percpu_sheaves *pcs; @@ -4365,17 +4369,16 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) return true; } -#ifndef CONFIG_SLUB_TINY static inline bool __update_cpu_freelist_fast(struct kmem_cache *s, void *freelist_old, void *freelist_new, unsigned long tid) { - freelist_aba_t old = { .freelist = freelist_old, .counter = tid }; - freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) }; + struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; + struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; - return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full, - &old.full, new.full); + return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, + &old.freelist_tid, new.freelist_tid); } /* @@ -4388,27 +4391,24 @@ __update_cpu_freelist_fast(struct kmem_cache *s, */ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) { - struct slab new; - unsigned long counters; - void *freelist; + struct freelist_counters old, new; lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); do { - freelist = slab->freelist; - counters = slab->counters; + old.freelist = slab->freelist; + old.counters = slab->counters; + + new.freelist = NULL; + new.counters = old.counters; - new.counters = counters; + new.inuse = old.objects; + new.frozen = old.freelist != NULL; - new.inuse = slab->objects; - new.frozen = freelist != NULL; - } while 
(!__slab_update_freelist(s, slab, - freelist, counters, - NULL, new.counters, - "get_freelist")); + } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); - return freelist; + return old.freelist; } /* @@ -4416,26 +4416,22 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) */ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) { - struct slab new; - unsigned long counters; - void *freelist; + struct freelist_counters old, new; do { - freelist = slab->freelist; - counters = slab->counters; + old.freelist = slab->freelist; + old.counters = slab->counters; - new.counters = counters; + new.freelist = NULL; + new.counters = old.counters; VM_BUG_ON(new.frozen); - new.inuse = slab->objects; + new.inuse = old.objects; new.frozen = 1; - } while (!slab_update_freelist(s, slab, - freelist, counters, - NULL, new.counters, - "freeze_slab")); + } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); - return freelist; + return old.freelist; } /* @@ -4629,7 +4625,7 @@ new_objects: pc.orig_size = orig_size; slab = get_partial(s, node, &pc); if (slab) { - if (kmem_cache_debug(s)) { + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { freelist = pc.object; /* * For debug caches here we had to go through @@ -4667,7 +4663,7 @@ new_objects: stat(s, ALLOC_SLAB); - if (kmem_cache_debug(s)) { + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); if (unlikely(!freelist)) { @@ -4879,32 +4875,6 @@ redo: return object; } -#else /* CONFIG_SLUB_TINY */ -static void *__slab_alloc_node(struct kmem_cache *s, - gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) -{ - struct partial_context pc; - struct slab *slab; - void *object; - - pc.flags = gfpflags; - pc.orig_size = orig_size; - slab = get_partial(s, node, &pc); - - if (slab) - return pc.object; - - slab = new_slab(s, gfpflags, node); - if (unlikely(!slab)) { - slab_out_of_memory(s, gfpflags, node); - return NULL; - } - - object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); - - return object; -} -#endif /* CONFIG_SLUB_TINY */ /* * If the object has been wiped upon free, make sure it's fully initialized by @@ -5045,7 +5015,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, return NULL; if (empty) { - if (!refill_sheaf(s, empty, gfp)) { + if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC)) { full = empty; } else { /* @@ -5156,7 +5126,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) * be false because of cpu migration during an unlocked part of * the current allocation or previous freeing process. 
*/ - if (folio_nid(virt_to_folio(object)) != node) { + if (page_to_nid(virt_to_page(object)) != node) { local_unlock(&s->cpu_sheaves->lock); return NULL; } @@ -5345,6 +5315,26 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod } EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); +static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, + struct slab_sheaf *sheaf, gfp_t gfp) +{ + int ret = 0; + + ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC); + + if (likely(!ret || !gfp_pfmemalloc_allowed(gfp))) + return ret; + + /* + * if we are allowed to, refill sheaf with pfmemalloc but then remember + * it for when it's returned + */ + ret = refill_sheaf(s, sheaf, gfp); + sheaf->pfmemalloc = true; + + return ret; +} + /* * returns a sheaf that has at least the requested size * when prefilling is needed, do so with given gfp flags @@ -5379,6 +5369,10 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) sheaf->cache = s; sheaf->capacity = size; + /* + * we do not need to care about pfmemalloc here because oversize + * sheaves area always flushed and freed when returned + */ if (!__kmem_cache_alloc_bulk(s, gfp, size, &sheaf->objects[0])) { kfree(sheaf); @@ -5415,17 +5409,18 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) if (!sheaf) sheaf = alloc_empty_sheaf(s, gfp); - if (sheaf && sheaf->size < size) { - if (refill_sheaf(s, sheaf, gfp)) { + if (sheaf) { + sheaf->capacity = s->sheaf_capacity; + sheaf->pfmemalloc = false; + + if (sheaf->size < size && + __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) { sheaf_flush_unused(s, sheaf); free_empty_sheaf(s, sheaf); sheaf = NULL; } } - if (sheaf) - sheaf->capacity = s->sheaf_capacity; - return sheaf; } @@ -5445,7 +5440,8 @@ void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, struct slub_percpu_sheaves *pcs; struct node_barn *barn; - if (unlikely(sheaf->capacity != s->sheaf_capacity)) { + if (unlikely((sheaf->capacity != s->sheaf_capacity) + || sheaf->pfmemalloc)) { sheaf_flush_unused(s, sheaf); kfree(sheaf); return; @@ -5511,7 +5507,7 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, if (likely(sheaf->capacity >= size)) { if (likely(sheaf->capacity == s->sheaf_capacity)) - return refill_sheaf(s, sheaf, gfp); + return __prefill_sheaf_pfmemalloc(s, sheaf, gfp); if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, &sheaf->objects[sheaf->size])) { @@ -5544,6 +5540,9 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, * * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT * memcg charging is forced over limit if necessary, to avoid failure. + * + * It is possible that the allocation comes from kfence and then the sheaf + * size is not decreased. 
*/ void * kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, @@ -5555,7 +5554,10 @@ kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, if (sheaf->size == 0) goto out; - ret = sheaf->objects[--sheaf->size]; + ret = kfence_alloc(s, s->object_size, gfp); + + if (likely(!ret)) + ret = sheaf->objects[--sheaf->size]; init = slab_want_init_on_alloc(gfp, s); @@ -5578,7 +5580,7 @@ unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf) */ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) { - struct folio *folio; + struct page *page; void *ptr = NULL; unsigned int order = get_order(size); @@ -5588,15 +5590,15 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) flags |= __GFP_COMP; if (node == NUMA_NO_NODE) - folio = (struct folio *)alloc_frozen_pages_noprof(flags, order); + page = alloc_frozen_pages_noprof(flags, order); else - folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, node, NULL); + page = __alloc_frozen_pages_noprof(flags, order, node, NULL); - if (folio) { - ptr = folio_address(folio); - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, + if (page) { + ptr = page_address(page); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); - __folio_set_large_kmalloc(folio); + __SetPageLargeKmalloc(page); } ptr = kasan_kmalloc_large(ptr, size, flags); @@ -5723,9 +5725,7 @@ retry: * it did local_lock_irqsave(&s->cpu_slab->lock, flags). * In this case fast path with __update_cpu_freelist_fast() is not safe. */ -#ifndef CONFIG_SLUB_TINY if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) -#endif ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); if (PTR_ERR(ret) == -EBUSY) { @@ -5863,10 +5863,8 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, unsigned long addr) { - void *prior; - int was_frozen; - struct slab new; - unsigned long counters; + bool was_frozen, was_full; + struct freelist_counters old, new; struct kmem_cache_node *n = NULL; unsigned long flags; bool on_node_partial; @@ -5878,20 +5876,43 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, return; } + /* + * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below + * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) + * is the only other reason it can be false, and it is already handled + * above. + */ + do { if (unlikely(n)) { spin_unlock_irqrestore(&n->list_lock, flags); n = NULL; } - prior = slab->freelist; - counters = slab->counters; - set_freepointer(s, tail, prior); - new.counters = counters; - was_frozen = new.frozen; + + old.freelist = slab->freelist; + old.counters = slab->counters; + + was_full = (old.freelist == NULL); + was_frozen = old.frozen; + + set_freepointer(s, tail, old.freelist); + + new.freelist = head; + new.counters = old.counters; new.inuse -= cnt; - if ((!new.inuse || !prior) && !was_frozen) { - /* Needs to be taken off a list */ - if (!kmem_cache_has_cpu_partial(s) || prior) { + + /* + * Might need to be taken off (due to becoming empty) or added + * to (due to not being full anymore) the partial list. + * Unless it's frozen. + */ + if ((!new.inuse || was_full) && !was_frozen) { + /* + * If slab becomes non-full and we have cpu partial + * lists, we put it there unconditionally to avoid + * taking the list_lock. Otherwise we need it. 
+ */ + if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { n = get_node(s, slab_nid(slab)); /* @@ -5908,10 +5929,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, } } - } while (!slab_update_freelist(s, slab, - prior, counters, - head, new.counters, - "__slab_free")); + } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); if (likely(!n)) { @@ -5921,7 +5939,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * activity can be necessary. */ stat(s, FREE_FROZEN); - } else if (kmem_cache_has_cpu_partial(s) && !prior) { + } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { /* * If we started with a full slab then put it onto the * per cpu partial list. @@ -5930,6 +5948,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, stat(s, CPU_PARTIAL_FREE); } + /* + * In other cases we didn't take the list_lock because the slab + * was already on the partial list and will remain there. + */ + return; } @@ -5937,19 +5960,24 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * This slab was partially empty but not on the per-node partial list, * in which case we shouldn't manipulate its list, just return. */ - if (prior && !on_node_partial) { + if (!was_full && !on_node_partial) { spin_unlock_irqrestore(&n->list_lock, flags); return; } + /* + * If slab became empty, should we add/keep it on the partial list or we + * have enough? + */ if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; /* * Objects left in the slab. If it was not on the partial list before - * then add it. + * then add it. This can only happen when cache has no per cpu partial + * list otherwise we would have put it there. */ - if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { + if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { add_partial(n, slab, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -5957,10 +5985,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, return; slab_empty: - if (prior) { - /* - * Slab on the partial list. - */ + /* + * The slab could have a single object and thus go from full to empty in + * a single free, but more likely it was on the partial list. Remove it. + */ + if (likely(!was_full)) { remove_partial(n, slab); stat(s, FREE_REMOVE_PARTIAL); } @@ -6185,8 +6214,12 @@ static void rcu_free_sheaf(struct rcu_head *head) * handles it fine. The only downside is that sheaf will serve fewer * allocations when reused. It only happens due to debugging, which is a * performance hit anyway. + * + * If it returns true, there was at least one object from pfmemalloc + * slab so simply flush everything. 
*/ - __rcu_free_sheaf_prepare(s, sheaf); + if (__rcu_free_sheaf_prepare(s, sheaf)) + goto flush; n = get_node(s, sheaf->node); if (!n) @@ -6339,7 +6372,8 @@ next_remote_batch: continue; } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) { + if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node) + || slab_test_pfmemalloc(slab))) { remote_objects[remote_nr] = p[i]; p[i] = p[--size]; if (++remote_nr >= PCS_BATCH_MAX) @@ -6487,14 +6521,10 @@ static void free_deferred_objects(struct irq_work *work) llist_for_each_safe(pos, t, llnode) { struct slab *slab = container_of(pos, struct slab, llnode); -#ifdef CONFIG_SLUB_TINY - free_slab(slab->slab_cache, slab); -#else if (slab->frozen) deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); else free_slab(slab->slab_cache, slab); -#endif } } @@ -6530,7 +6560,6 @@ void defer_free_barrier(void) irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); } -#ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that * can perform fastpath freeing without additional function calls. @@ -6623,14 +6652,6 @@ redo: } stat_add(s, FREE_FASTPATH, cnt); } -#else /* CONFIG_SLUB_TINY */ -static void do_slab_free(struct kmem_cache *s, - struct slab *slab, void *head, void *tail, - int cnt, unsigned long addr) -{ - __slab_free(s, slab, head, tail, cnt, addr); -} -#endif /* CONFIG_SLUB_TINY */ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *object, @@ -6643,7 +6664,8 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, return; if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || - slab_nid(slab) == numa_mem_id())) { + slab_nid(slab) == numa_mem_id()) + && likely(!slab_test_pfmemalloc(slab))) { if (likely(free_to_pcs(s, object))) return; } @@ -6753,12 +6775,12 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); -static void free_large_kmalloc(struct folio *folio, void *object) +static void free_large_kmalloc(struct page *page, void *object) { - unsigned int order = folio_order(folio); + unsigned int order = compound_order(page); - if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) { - dump_page(&folio->page, "Not a kmalloc allocation"); + if (WARN_ON_ONCE(!PageLargeKmalloc(page))) { + dump_page(page, "Not a kmalloc allocation"); return; } @@ -6769,10 +6791,10 @@ static void free_large_kmalloc(struct folio *folio, void *object) kasan_kfree_large(object); kmsan_kfree_large(object); - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); - __folio_clear_large_kmalloc(folio); - free_frozen_pages(&folio->page, order); + __ClearPageLargeKmalloc(page); + free_frozen_pages(page, order); } /* @@ -6782,7 +6804,7 @@ static void free_large_kmalloc(struct folio *folio, void *object) void kvfree_rcu_cb(struct rcu_head *head) { void *obj = head; - struct folio *folio; + struct page *page; struct slab *slab; struct kmem_cache *s; void *slab_addr; @@ -6793,20 +6815,20 @@ void kvfree_rcu_cb(struct rcu_head *head) return; } - folio = virt_to_folio(obj); - if (!folio_test_slab(folio)) { + page = virt_to_page(obj); + slab = page_slab(page); + if (!slab) { /* * rcu_head offset can be only less than page size so no need to - * consider folio order + * consider allocation order */ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); - free_large_kmalloc(folio, obj); + free_large_kmalloc(page, obj); return; } - slab = folio_slab(folio); s 
= slab->slab_cache; - slab_addr = folio_address(folio); + slab_addr = slab_address(slab); if (is_kfence_address(obj)) { obj = kfence_object_start(obj); @@ -6828,7 +6850,7 @@ void kvfree_rcu_cb(struct rcu_head *head) */ void kfree(const void *object) { - struct folio *folio; + struct page *page; struct slab *slab; struct kmem_cache *s; void *x = (void *)object; @@ -6838,13 +6860,13 @@ void kfree(const void *object) if (unlikely(ZERO_OR_NULL_PTR(object))) return; - folio = virt_to_folio(object); - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, (void *)object); + page = virt_to_page(object); + slab = page_slab(page); + if (!slab) { + free_large_kmalloc(page, (void *)object); return; } - slab = folio_slab(folio); s = slab->slab_cache; slab_free(s, slab, x, _RET_IP_); } @@ -6861,7 +6883,6 @@ EXPORT_SYMBOL(kfree); */ void kfree_nolock(const void *object) { - struct folio *folio; struct slab *slab; struct kmem_cache *s; void *x = (void *)object; @@ -6869,13 +6890,12 @@ void kfree_nolock(const void *object) if (unlikely(ZERO_OR_NULL_PTR(object))) return; - folio = virt_to_folio(object); - if (unlikely(!folio_test_slab(folio))) { + slab = virt_to_slab(object); + if (unlikely(!slab)) { WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); return; } - slab = folio_slab(folio); s = slab->slab_cache; memcg_slab_free_hook(s, slab, &x, 1); @@ -6907,11 +6927,7 @@ void kfree_nolock(const void *object) * since kasan quarantine takes locks and not supported from NMI. */ kasan_slab_free(s, x, false, false, /* skip quarantine */true); -#ifndef CONFIG_SLUB_TINY do_slab_free(s, slab, x, x, 0, _RET_IP_); -#else - defer_free(s, x); -#endif } EXPORT_SYMBOL_GPL(kfree_nolock); @@ -6943,16 +6959,16 @@ __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, if (is_kfence_address(p)) { ks = orig_size = kfence_ksize(p); } else { - struct folio *folio; + struct page *page = virt_to_page(p); + struct slab *slab = page_slab(page); - folio = virt_to_folio(p); - if (unlikely(!folio_test_slab(folio))) { + if (!slab) { /* Big kmalloc object */ - WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE); - WARN_ON(p != folio_address(folio)); - ks = folio_size(folio); + ks = page_size(page); + WARN_ON(ks <= KMALLOC_MAX_CACHE_SIZE); + WARN_ON(p != page_address(page)); } else { - s = folio_slab(folio)->slab_cache; + s = slab->slab_cache; orig_size = get_orig_size(s, (void *)p); ks = s->object_size; } @@ -7256,23 +7272,25 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, { int lookahead = 3; void *object; - struct folio *folio; + struct page *page; + struct slab *slab; size_t same; object = p[--size]; - folio = virt_to_folio(object); + page = virt_to_page(object); + slab = page_slab(page); if (!s) { /* Handle kalloc'ed objects */ - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, object); + if (!slab) { + free_large_kmalloc(page, object); df->slab = NULL; return size; } /* Derive kmem_cache from object */ - df->slab = folio_slab(folio); - df->s = df->slab->slab_cache; + df->slab = slab; + df->s = slab->slab_cache; } else { - df->slab = folio_slab(folio); + df->slab = slab; df->s = cache_from_obj(s, object); /* Support for memcg */ } @@ -7361,7 +7379,6 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -#ifndef CONFIG_SLUB_TINY static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) @@ -7379,14 +7396,8 @@ int __kmem_cache_alloc_bulk(struct 
kmem_cache *s, gfp_t flags, size_t size, local_lock_irqsave(&s->cpu_slab->lock, irqflags); for (i = 0; i < size; i++) { - void *object = kfence_alloc(s, s->object_size, flags); - - if (unlikely(object)) { - p[i] = object; - continue; - } + void *object = c->freelist; - object = c->freelist; if (unlikely(!object)) { /* * We may have removed an object from c->freelist using @@ -7432,41 +7443,13 @@ error: return 0; } -#else /* CONFIG_SLUB_TINY */ -static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p) -{ - int i; - - for (i = 0; i < size; i++) { - void *object = kfence_alloc(s, s->object_size, flags); - - if (unlikely(object)) { - p[i] = object; - continue; - } - - p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE, - _RET_IP_, s->object_size); - if (unlikely(!p[i])) - goto error; - - maybe_wipe_obj_freeptr(s, p[i]); - } - - return i; - -error: - __kmem_cache_free_bulk(s, i, p); - return 0; -} -#endif /* CONFIG_SLUB_TINY */ /* Note that interrupts must be enabled when calling this function. */ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { unsigned int i = 0; + void *kfence_obj; if (!size) return 0; @@ -7475,6 +7458,20 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!s)) return 0; + /* + * to make things simpler, only assume at most once kfence allocated + * object per bulk allocation and choose its index randomly + */ + kfence_obj = kfence_alloc(s, s->object_size, flags); + + if (unlikely(kfence_obj)) { + if (unlikely(size == 1)) { + p[0] = kfence_obj; + goto out; + } + size--; + } + if (s->cpu_sheaves) i = alloc_from_pcs_bulk(s, size, p); @@ -7486,10 +7483,23 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { if (i > 0) __kmem_cache_free_bulk(s, i, p); + if (kfence_obj) + __kfence_free(kfence_obj); return 0; } } + if (unlikely(kfence_obj)) { + int idx = get_random_u32_below(size + 1); + + if (idx != size) + p[size] = p[idx]; + p[idx] = kfence_obj; + + size++; + } + +out: /* * memcg and kmem_cache debug support and memory initialization. * Done outside of the IRQ disabled fastpath loop. 
 @@ -7651,7 +7661,6 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) barn_init(barn); } -#ifndef CONFIG_SLUB_TINY static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < @@ -7672,12 +7681,6 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) return 1; } -#else -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) -{ - return 1; -} -#endif /* CONFIG_SLUB_TINY */ static int init_percpu_sheaves(struct kmem_cache *s) { @@ -7767,13 +7770,11 @@ void __kmem_cache_release(struct kmem_cache *s) cache_random_seq_destroy(s); if (s->cpu_sheaves) pcs_destroy(s); -#ifndef CONFIG_SLUB_TINY #ifdef CONFIG_PREEMPT_RT if (s->cpu_slab) lockdep_unregister_key(&s->lock_key); #endif free_percpu(s->cpu_slab); -#endif free_kmem_cache_nodes(s); } @@ -8139,46 +8140,53 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) * Kmalloc subsystem *******************************************************************/ -static int __init setup_slub_min_order(char *str) +static int __init setup_slub_min_order(const char *str, const struct kernel_param *kp) { - get_option(&str, (int *)&slub_min_order); + int ret; + + ret = kstrtouint(str, 0, &slub_min_order); + if (ret) + return ret; if (slub_min_order > slub_max_order) slub_max_order = slub_min_order; - return 1; + return 0; } -__setup("slab_min_order=", setup_slub_min_order); -__setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0); - +static const struct kernel_param_ops param_ops_slab_min_order __initconst = { + .set = setup_slub_min_order, +}; +__core_param_cb(slab_min_order, &param_ops_slab_min_order, &slub_min_order, 0); +__core_param_cb(slub_min_order, &param_ops_slab_min_order, &slub_min_order, 0); -static int __init setup_slub_max_order(char *str) +static int __init setup_slub_max_order(const char *str, const struct kernel_param *kp) { - get_option(&str, (int *)&slub_max_order); + int ret; + + ret = kstrtouint(str, 0, &slub_max_order); + if (ret) + return ret; + slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER); if (slub_min_order > slub_max_order) slub_min_order = slub_max_order; - return 1; + return 0; } -__setup("slab_max_order=", setup_slub_max_order); -__setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0); - -static int __init setup_slub_min_objects(char *str) -{ - get_option(&str, (int *)&slub_min_objects); - - return 1; -} +static const struct kernel_param_ops param_ops_slab_max_order __initconst = { + .set = setup_slub_max_order, +}; +__core_param_cb(slab_max_order, &param_ops_slab_max_order, &slub_max_order, 0); +__core_param_cb(slub_max_order, &param_ops_slab_max_order, &slub_max_order, 0); -__setup("slab_min_objects=", setup_slub_min_objects); -__setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0); +core_param(slab_min_objects, slub_min_objects, uint, 0); +core_param(slub_min_objects, slub_min_objects, uint, 0); #ifdef CONFIG_NUMA -static int __init setup_slab_strict_numa(char *str) +static int __init setup_slab_strict_numa(const char *str, const struct kernel_param *kp) { if (nr_node_ids > 1) { static_branch_enable(&strict_numa); @@ -8187,10 +8195,14 @@ static int __init setup_slab_strict_numa(char *str) pr_warn("slab_strict_numa parameter set on non NUMA system.\n"); } - return 1; + return 0; } -__setup("slab_strict_numa", setup_slab_strict_numa); +static const struct kernel_param_ops param_ops_slab_strict_numa __initconst = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + 
 .set = setup_slab_strict_numa, +}; +__core_param_cb(slab_strict_numa, &param_ops_slab_strict_numa, NULL, 0); #endif @@ -8516,10 +8528,8 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { -#ifndef CONFIG_SLUB_TINY flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); WARN_ON(!flushwq); -#endif } struct kmem_cache * diff --git a/mm/sparse.c b/mm/sparse.c index 17c50a6415c2..b5b2b6f7041b 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -951,8 +951,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - if (!altmap || !altmap->inaccessible) - page_init_poison(memmap, sizeof(struct page) * nr_pages); + page_init_poison(memmap, sizeof(struct page) * nr_pages); ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); diff --git a/mm/truncate.c b/mm/truncate.c index 3c5a50ae3274..12467c1bd711 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -46,7 +46,7 @@ static void clear_shadow_entries(struct address_space *mapping, xas_unlock_irq(&xas); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); } @@ -111,7 +111,7 @@ static void truncate_folio_batch_exceptionals(struct address_space *mapping, xas_unlock_irq(&xas); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); out: folio_batch_remove_exceptionals(fbatch); @@ -364,7 +364,7 @@ long mapping_evict_folio(struct address_space *mapping, struct folio *folio) * page aligned properly. */ void truncate_inode_pages_range(struct address_space *mapping, - loff_t lstart, loff_t lend) + loff_t lstart, uoff_t lend) { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ @@ -412,7 +412,7 @@ void truncate_inode_pages_range(struct address_space *mapping, same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); if (!IS_ERR(folio)) { - same_folio = lend < folio_pos(folio) + folio_size(folio); + same_folio = lend < folio_next_pos(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); if (same_folio) @@ -647,7 +647,7 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); diff --git a/mm/usercopy.c b/mm/usercopy.c index dbdcc43964fb..5de7a518b1b1 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -164,7 +164,8 @@ static inline void check_heap_object(const void *ptr, unsigned long n, { unsigned long addr = (unsigned long)ptr; unsigned long offset; - struct folio *folio; + struct page *page; + struct slab *slab; if (is_kmap_addr(ptr)) { offset = offset_in_page(ptr); @@ -189,16 +190,23 @@ static inline void check_heap_object(const void *ptr, unsigned long n, if (!virt_addr_valid(ptr)) return; - folio = virt_to_folio(ptr); - - if (folio_test_slab(folio)) { + page = virt_to_page(ptr); + slab = page_slab(page); + if (slab) { /* Check slab allocator for flags and size. 
 */ - __check_heap_object(ptr, n, folio_slab(folio), to_user); - } else if (folio_test_large(folio)) { - offset = ptr - folio_address(folio); - if (n > folio_size(folio) - offset) + __check_heap_object(ptr, n, slab, to_user); + } else if (PageCompound(page)) { + page = compound_head(page); + offset = ptr - page_address(page); + if (n > page_size(page) - offset) usercopy_abort("page alloc", NULL, to_user, offset, n); } + + /* + * We cannot check non-compound pages. They might be part of + * a large allocation, in which case crossing a page boundary + * is fine. + */ } DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, diff --git a/mm/vmscan.c b/mm/vmscan.c index b2fc8b626d3d..bb4a96c7b682 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -811,7 +811,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, __filemap_remove_folio(folio, shadow); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); if (free_folio) diff --git a/mm/workingset.c b/mm/workingset.c index 68a76a91111f..d32dc2e02a61 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -755,7 +755,7 @@ out_invalid: xa_unlock_irq(&mapping->i_pages); if (mapping->host != NULL) { if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); } ret = LRU_REMOVED_RETRY;
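
(Illustrative sketch, not part of the commit above; the helper names are invented for illustration.) The freelist_tid and __update_cpu_freelist_fast() hunks pair the per-cpu freelist pointer with a transaction id and commit both with a single wide compare-and-swap, so an update computed from a stale snapshot fails even if the freelist pointer has meanwhile returned to its old value. A minimal user-space sketch of the same value-plus-generation-tag idea, assuming a toy index-based freelist packed into one 64-bit atomic word rather than the kernel's pointer-plus-tid layout:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* 32-bit list head index and 32-bit generation tag packed into one word. */
static inline uint64_t pack(uint32_t head, uint32_t tag)
{
	return ((uint64_t)tag << 32) | head;
}

/*
 * Pop the head of the freelist.  The CAS succeeds only if neither the head
 * index nor the tag changed since the snapshot, which rules out the ABA case
 * where the head was popped and pushed back unnoticed.
 */
static bool pop_head(_Atomic uint64_t *word, const uint32_t next[], uint32_t *out)
{
	uint64_t old = atomic_load(word);

	for (;;) {
		uint32_t head = (uint32_t)old;
		uint32_t tag = (uint32_t)(old >> 32);

		if (head == UINT32_MAX)		/* empty list */
			return false;

		uint64_t new = pack(next[head], tag + 1);

		if (atomic_compare_exchange_weak(word, &old, new)) {
			*out = head;
			return true;
		}
		/* 'old' was refreshed by the failed CAS; retry. */
	}
}

int main(void)
{
	uint32_t next[3] = { 1, 2, UINT32_MAX };	/* 0 -> 1 -> 2 -> end */
	_Atomic uint64_t head = pack(0, 0);
	uint32_t obj;

	while (pop_head(&head, next, &obj))
		printf("popped %u\n", obj);
	return 0;
}

The hunks above apply the same idea to the per-cpu {freelist, tid} pair via this_cpu_try_cmpxchg_freelist() and to the slab's {freelist, counters} pair via try_cmpxchg_freelist(), falling back to an update under slab_lock() when no wide cmpxchg is available.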

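(Illustrative sketch, not part of the commit above; insert_at_random_index() is a made-up helper.) The kmem_cache_alloc_bulk_noprof() hunk allows at most one kfence-backed object per bulk allocation: the bulk fast path fills one slot fewer, and the kfence object is then swapped into a uniformly chosen index, with the empty tail slot as a valid target. A stand-alone sketch of that placement step, using the C library's rand() in place of get_random_u32_below() and ignoring modulo bias:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Insert 'extra' into p[0..n-1] at a uniformly chosen index, growing the
 * array to n + 1 entries.  Choosing among n + 1 positions (including the new
 * tail slot) makes every final position equally likely; the element displaced
 * from the chosen index moves to the tail.
 */
static void insert_at_random_index(long p[], size_t n, long extra)
{
	size_t idx = (size_t)rand() % (n + 1);	/* 0 .. n */

	if (idx != n)
		p[n] = p[idx];
	p[idx] = extra;
}

int main(void)
{
	long p[5] = { 10, 11, 12, 13 };		/* four "bulk allocated" objects */

	srand((unsigned int)time(NULL));
	insert_at_random_index(p, 4, 99);	/* the single special object */

	for (size_t i = 0; i < 5; i++)
		printf("%ld ", p[i]);
	printf("\n");
	return 0;
}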