From e36488c83b6d871b35dd555afb2d434bd61687cf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Aug 2018 15:43:36 -0700 Subject: bitfield: avoid gcc-8 -Wint-in-bool-context warning Passing an enum into FIELD_GET() produces a long but harmless warning on newer compilers: from include/linux/linkage.h:7, from include/linux/kernel.h:7, from include/linux/skbuff.h:17, from include/linux/if_ether.h:23, from include/linux/etherdevice.h:25, from drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c:63: drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c: In function 'iwl_mvm_rx_mpdu_mq': include/linux/bitfield.h:56:20: error: enum constant in boolean context [-Werror=int-in-bool-context] BUILD_BUG_ON_MSG(!(_mask), _pfx "mask is zero"); \ ^ ... include/linux/bitfield.h:103:3: note: in expansion of macro '__BF_FIELD_CHECK' __BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: "); \ ^~~~~~~~~~~~~~~~ drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c:1025:21: note: in expansion of macro 'FIELD_GET' le16_encode_bits(FIELD_GET(IWL_RX_HE_PHY_SIBG_SYM_OR_USER_NUM_MASK, The problem here is that the caller has no idea how the macro gets expanding, leading to a false-positive. It can be trivially avoided by doing a comparison against zero. This only recently started appearing as the iwlwifi driver was patched to use FIELD_GET. Link: http://lkml.kernel.org/r/20180813220950.194841-1-arnd@arndb.de Fixes: 514c30696fbc ("iwlwifi: add support for IEEE802.11ax") Signed-off-by: Arnd Bergmann Cc: Masahiro Yamada Cc: Johannes Berg Cc: Jakub Kicinski Cc: Andy Shevchenko Cc: Al Viro Cc: David Laight Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitfield.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index 65a6981eef7b..3f1ef4450a7c 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -53,7 +53,7 @@ ({ \ BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \ _pfx "mask is not constant"); \ - BUILD_BUG_ON_MSG(!(_mask), _pfx "mask is zero"); \ + BUILD_BUG_ON_MSG((_mask) == 0, _pfx "mask is zero"); \ BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \ _pfx "value too large for the field"); \ -- cgit v1.2.3 From 4cdfffc8722e99be8d400d8fa1fcd615d078ad43 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 17 Aug 2018 15:44:37 -0700 Subject: vfs: discard ATTR_ATTR_FLAG This flag was introduce in 2.1.37pre1 and the only place it was tested was removed in 2.1.43pre1. The flag was never set. Let's discard it properly. Link: http://lkml.kernel.org/r/877en0hewz.fsf@notabene.neil.brown.name Signed-off-by: NeilBrown Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hostfs/hostfs.h | 2 +- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index cb8374af08a6..33b8423ef0c9 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -19,7 +19,7 @@ #define HOSTFS_ATTR_ATIME_SET 128 #define HOSTFS_ATTR_MTIME_SET 256 -/* These two are unused by hostfs. */ +/* This one is unused by hostfs. 
*/ #define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */ #define HOSTFS_ATTR_ATTR_FLAG 1024 diff --git a/include/linux/fs.h b/include/linux/fs.h index 1ec33fd0423f..9d319f1f66f6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -179,7 +179,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_ATIME_SET (1 << 7) #define ATTR_MTIME_SET (1 << 8) #define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ -#define ATTR_ATTR_FLAG (1 << 10) #define ATTR_KILL_SUID (1 << 11) #define ATTR_KILL_SGID (1 << 12) #define ATTR_FILE (1 << 13) -- cgit v1.2.3 From b3a2369692fedcc439dfaa4f215cf5323a886c88 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 17 Aug 2018 15:45:12 -0700 Subject: include/linux/page_ext.h: drop definition of unused PAGE_EXT_DEBUG_POISON After commit bd33ef368135 ("mm: enable page poisoning early at boot") PAGE_EXT_DEBUG_POISON is no longer used. Remove it. Link: http://lkml.kernel.org/r/20180531135457.20167-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reviewed-by: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index ca5461efae2f..bbec618a614b 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -16,18 +16,7 @@ struct page_ext_operations { #ifdef CONFIG_PAGE_EXTENSION -/* - * page_ext->flags bits: - * - * PAGE_EXT_DEBUG_POISON is set for poisoned pages. This is used to - * implement generic debug pagealloc feature. The pages are filled with - * poison patterns and set this flag after free_pages(). The poisoned - * pages are verified whether the patterns are not corrupted and clear - * the flag before alloc_pages(). - */ - enum page_ext_flags { - PAGE_EXT_DEBUG_POISON, /* Page is poisoned */ PAGE_EXT_DEBUG_GUARD, PAGE_EXT_OWNER, #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) -- cgit v1.2.3 From 10ed63415223b16d7e80ba528556500231814232 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 17 Aug 2018 15:45:15 -0700 Subject: mm/page_ext.c: constify lookup_page_ext() argument lookup_page_ext() finds 'struct page_ext' for a given page. It requires only read access to the given struct page. The current implementation takes 'struct page *' as an argument, which makes the compiler complain when a 'const struct page *' is passed. Change the argument to 'const struct page *'. Link: http://lkml.kernel.org/r/20180531135457.20167-3-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A.
Shutemov Reviewed-by: Andrew Morton Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 4 ++-- mm/page_ext.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index bbec618a614b..f84f167ec04c 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -50,7 +50,7 @@ static inline void page_ext_init(void) } #endif -struct page_ext *lookup_page_ext(struct page *page); +struct page_ext *lookup_page_ext(const struct page *page); #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; @@ -59,7 +59,7 @@ static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } -static inline struct page_ext *lookup_page_ext(struct page *page) +static inline struct page_ext *lookup_page_ext(const struct page *page) { return NULL; } diff --git a/mm/page_ext.c b/mm/page_ext.c index 5295ef331165..a9826da84ccb 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -120,7 +120,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) pgdat->node_page_ext = NULL; } -struct page_ext *lookup_page_ext(struct page *page) +struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned long index; @@ -195,7 +195,7 @@ fail: #else /* CONFIG_FLAT_NODE_MEM_MAP */ -struct page_ext *lookup_page_ext(struct page *page) +struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -- cgit v1.2.3 From 74c8164e1cdb1eb22f1d49d54e515e81821a8ad0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Aug 2018 15:45:36 -0700 Subject: mpage: mpage_readpages() should submit IO as read-ahead a_ops->readpages() is only ever used for read-ahead, yet we don't flag the IO being submitted as such. Fix that up. Any file system that uses mpage_readpages() as its ->readpages() implementation will now get this right. Since we're passing in whether the IO is read-ahead or not, we don't need to pass in the 'gfp' separately, as it is dependent on the IO being read-ahead. Kill off that member. Add some documentation notes on ->readpages() being purely for read-ahead. Link: http://lkml.kernel.org/r/20180621010725.17813-3-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Andrew Morton Cc: Al Viro Cc: Chris Mason Cc: Christoph Hellwig Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/data.c | 5 +++++ fs/mpage.c | 29 +++++++++++++++++++---------- include/linux/fs.h | 4 ++++ 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8f931d699287..b7c9b58acf3e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1421,6 +1421,11 @@ out: /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. + * + * Note that the aops->readpages() function is ONLY used for read-ahead. If + * this function ever deviates from doing just read-ahead, it should either + * use ->readpage() or do the necessary surgery to decouple ->readpages() + * readom read-ahead. 
*/ static int f2fs_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, diff --git a/fs/mpage.c b/fs/mpage.c index 6dc90e456abf..c820dc9bebab 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -137,11 +137,11 @@ struct mpage_readpage_args { struct bio *bio; struct page *page; unsigned int nr_pages; + bool is_readahead; sector_t last_block_in_bio; struct buffer_head map_bh; unsigned long first_logical_block; get_block_t *get_block; - gfp_t gfp; }; /* @@ -170,8 +170,18 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) struct block_device *bdev = NULL; int length; int fully_mapped = 1; + int op_flags; unsigned nblocks; unsigned relative_block; + gfp_t gfp; + + if (args->is_readahead) { + op_flags = REQ_RAHEAD; + gfp = readahead_gfp_mask(page->mapping); + } else { + op_flags = 0; + gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + } if (page_has_buffers(page)) goto confused; @@ -284,7 +294,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) * This page will go to BIO. Do we need to send this BIO off first? */ if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) - args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); + args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); alloc_new: if (args->bio == NULL) { @@ -296,14 +306,14 @@ alloc_new: args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), min_t(int, args->nr_pages, BIO_MAX_PAGES), - args->gfp); + gfp); if (args->bio == NULL) goto confused; } length = first_hole << blkbits; if (bio_add_page(args->bio, page, length, 0) < length) { - args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); + args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); goto alloc_new; } @@ -311,7 +321,7 @@ alloc_new: nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_page)) - args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); + args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); else args->last_block_in_bio = blocks[blocks_per_page - 1]; out: @@ -319,7 +329,7 @@ out: confused: if (args->bio) - args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); + args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); if (!PageUptodate(page)) block_read_full_page(page, args->get_block); else @@ -377,7 +387,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, { struct mpage_readpage_args args = { .get_block = get_block, - .gfp = readahead_gfp_mask(mapping), + .is_readahead = true, }; unsigned page_idx; @@ -388,7 +398,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, list_del(&page->lru); if (!add_to_page_cache_lru(page, mapping, page->index, - args.gfp)) { + readahead_gfp_mask(mapping))) { args.page = page; args.nr_pages = nr_pages - page_idx; args.bio = do_mpage_readpage(&args); @@ -397,7 +407,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, } BUG_ON(!list_empty(pages)); if (args.bio) - mpage_bio_submit(REQ_OP_READ, 0, args.bio); + mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio); return 0; } EXPORT_SYMBOL(mpage_readpages); @@ -411,7 +421,6 @@ int mpage_readpage(struct page *page, get_block_t get_block) .page = page, .nr_pages = 1, .get_block = get_block, - .gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL), }; args.bio = do_mpage_readpage(&args); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9d319f1f66f6..a9242f336f02 100644 --- a/include/linux/fs.h +++ 
b/include/linux/fs.h @@ -344,6 +344,10 @@ struct address_space_operations { /* Set a page dirty. Return true if this dirtied it */ int (*set_page_dirty)(struct page *page); + /* + * Reads in the requested pages. Unlike ->readpage(), this is + * PURELY used for read-ahead!. + */ int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); -- cgit v1.2.3 From c9f4cd71383576a916e7fca99c490fc92a289f5a Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 17 Aug 2018 15:45:49 -0700 Subject: mm, huge page: copy target sub-page last when copy huge page Huge page helps to reduce TLB miss rate, but it has higher cache footprint, sometimes this may cause some issue. For example, when copying huge page on x86_64 platform, the cache footprint is 4M. But on a Xeon E5 v3 2699 CPU, there are 18 cores, 36 threads, and only 45M LLC (last level cache). That is, in average, there are 2.5M LLC for each core and 1.25M LLC for each thread. If the cache contention is heavy when copying the huge page, and we copy the huge page from the begin to the end, it is possible that the begin of huge page is evicted from the cache after we finishing copying the end of the huge page. And it is possible for the application to access the begin of the huge page after copying the huge page. In c79b57e462b5d ("mm: hugetlb: clear target sub-page last when clearing huge page"), to keep the cache lines of the target subpage hot, the order to clear the subpages in the huge page in clear_huge_page() is changed to clearing the subpage which is furthest from the target subpage firstly, and the target subpage last. The similar order changing helps huge page copying too. That is implemented in this patch. Because we have put the order algorithm into a separate function, the implementation is quite simple. The patch is a generic optimization which should benefit quite some workloads, not for a specific use case. To demonstrate the performance benefit of the patch, we tested it with vm-scalability run on transparent huge page. With this patch, the throughput increases ~16.6% in vm-scalability anon-cow-seq test case with 36 processes on a 2 socket Xeon E5 v3 2699 system (36 cores, 72 threads). The test case set /sys/kernel/mm/transparent_hugepage/enabled to be always, mmap() a big anonymous memory area and populate it, then forked 36 child processes, each writes to the anonymous memory area from the begin to the end, so cause copy on write. For each child process, other child processes could be seen as other workloads which generate heavy cache pressure. At the same time, the IPC (instruction per cycle) increased from 0.63 to 0.78, and the time spent in user space is reduced ~7.2%. Link: http://lkml.kernel.org/r/20180524005851.4079-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Mike Kravetz Cc: Andi Kleen Cc: Jan Kara Cc: Michal Hocko Cc: Andrea Arcangeli Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Minchan Kim Cc: Shaohua Li Cc: Christopher Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- mm/huge_memory.c | 3 ++- mm/memory.c | 30 +++++++++++++++++++++++------- 3 files changed, 27 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 68a5121694ef..2fb32d1561eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2752,7 +2752,8 @@ extern void clear_huge_page(struct page *page, unsigned long addr_hint, unsigned int pages_per_huge_page); extern void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma, + unsigned long addr_hint, + struct vm_area_struct *vma, unsigned int pages_per_huge_page); extern long copy_huge_page_from_user(struct page *dst_page, const void __user *usr_src, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 064a9d78879d..78427af91de9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1328,7 +1328,8 @@ alloc: if (!page) clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); else - copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + copy_user_huge_page(new_page, page, vmf->address, + vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); mmun_start = haddr; diff --git a/mm/memory.c b/mm/memory.c index 65bb59e031c9..175f344e1523 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4705,11 +4705,31 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src, } } +struct copy_subpage_arg { + struct page *dst; + struct page *src; + struct vm_area_struct *vma; +}; + +static void copy_subpage(unsigned long addr, int idx, void *arg) +{ + struct copy_subpage_arg *copy_arg = arg; + + copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx, + addr, copy_arg->vma); +} + void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma, + unsigned long addr_hint, struct vm_area_struct *vma, unsigned int pages_per_huge_page) { - int i; + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); + struct copy_subpage_arg arg = { + .dst = dst, + .src = src, + .vma = vma, + }; if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { copy_user_gigantic_page(dst, src, addr, vma, @@ -4717,11 +4737,7 @@ void copy_user_huge_page(struct page *dst, struct page *src, return; } - might_sleep(); - for (i = 0; i < pages_per_huge_page; i++) { - cond_resched(); - copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); - } + process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg); } long copy_huge_page_from_user(struct page *dst_page, -- cgit v1.2.3 From 4fbce633910ed80b135b84160a22b219080c8082 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 17 Aug 2018 15:46:22 -0700 Subject: mm/memory_hotplug.c: make register_mem_sect_under_node() a callback of walk_memory_range() link_mem_sections() and walk_memory_range() share most of the code, so we can use convert link_mem_sections() into a dummy function that calls walk_memory_range() with a callback to register_mem_sect_under_node(). This patch converts register_mem_sect_under_node() in order to match a walk_memory_range's callback, getting rid of the check_nid argument and checking instead if the system is still boothing, since we only have to check for the nid if the system is in such state. 
Link: http://lkml.kernel.org/r/20180622111839.10071-4-osalvador@techadventures.net Signed-off-by: Oscar Salvador Suggested-by: Pavel Tatashin Tested-by: Reza Arbab Tested-by: Jonathan Cameron Reviewed-by: Pavel Tatashin Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 44 ++++++-------------------------------------- include/linux/node.h | 12 +++++++----- mm/memory_hotplug.c | 5 +---- 3 files changed, 14 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index a5e821d09656..845d5523812b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -399,10 +399,9 @@ static int __ref get_nid_for_pfn(unsigned long pfn) } /* register memory section under specified node if it spans that node */ -int register_mem_sect_under_node(struct memory_block *mem_blk, int nid, - bool check_nid) +int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) { - int ret; + int ret, nid = *(int *)arg; unsigned long pfn, sect_start_pfn, sect_end_pfn; if (!mem_blk) @@ -433,7 +432,7 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid, * case, during hotplug we know that all pages in the memory * block belong to the same node. */ - if (check_nid) { + if (system_state == SYSTEM_BOOTING) { page_nid = get_nid_for_pfn(pfn); if (page_nid < 0) continue; @@ -490,41 +489,10 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, return 0; } -int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages, - bool check_nid) +int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long end_pfn = start_pfn + nr_pages; - unsigned long pfn; - struct memory_block *mem_blk = NULL; - int err = 0; - - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(pfn); - struct mem_section *mem_sect; - int ret; - - if (!present_section_nr(section_nr)) - continue; - mem_sect = __nr_to_section(section_nr); - - /* same memblock ? 
*/ - if (mem_blk) - if ((section_nr >= mem_blk->start_section_nr) && - (section_nr <= mem_blk->end_section_nr)) - continue; - - mem_blk = find_memory_block_hinted(mem_sect, mem_blk); - - ret = register_mem_sect_under_node(mem_blk, nid, check_nid); - if (!err) - err = ret; - - /* discard ref obtained in find_memory_block() */ - } - - if (mem_blk) - kobject_put(&mem_blk->dev.kobj); - return err; + return walk_memory_range(start_pfn, end_pfn, (void *)&nid, + register_mem_sect_under_node); } #ifdef CONFIG_HUGETLBFS diff --git a/include/linux/node.h b/include/linux/node.h index 6d336e38d155..257bb3d6d014 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -33,10 +33,10 @@ typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) extern int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long nr_pages, bool check_nid); + unsigned long end_pfn); #else static inline int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long nr_pages, bool check_nid) + unsigned long end_pfn) { return 0; } @@ -54,12 +54,14 @@ static inline int register_one_node(int nid) if (node_online(nid)) { struct pglist_data *pgdat = NODE_DATA(nid); + unsigned long start_pfn = pgdat->node_start_pfn; + unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; error = __register_one_node(nid); if (error) return error; /* link memory sections under this node */ - error = link_mem_sections(nid, pgdat->node_start_pfn, pgdat->node_spanned_pages, true); + error = link_mem_sections(nid, start_pfn, end_pfn); } return error; @@ -69,7 +71,7 @@ extern void unregister_one_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); extern int register_mem_sect_under_node(struct memory_block *mem_blk, - int nid, bool check_nid); + void *arg); extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, unsigned long phys_index); @@ -99,7 +101,7 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) return 0; } static inline int register_mem_sect_under_node(struct memory_block *mem_blk, - int nid, bool check_nid) + void *arg) { return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e2ed64b994e5..4eb6e824a80c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1123,7 +1123,6 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) u64 start, size; bool new_node = false; int ret; - unsigned long start_pfn, nr_pages; start = res->start; size = resource_size(res); @@ -1164,9 +1163,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) } /* link memory sections under this node.*/ - start_pfn = start >> PAGE_SHIFT; - nr_pages = size >> PAGE_SHIFT; - ret = link_mem_sections(nid, start_pfn, nr_pages, false); + ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1)); BUG_ON(ret); /* create new memmap entry */ -- cgit v1.2.3 From dc0b58643aff8b378086f25cce6789ccba68cbcb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 17 Aug 2018 15:46:36 -0700 Subject: mm: introduce mem_cgroup_put() helper Introduce the mem_cgroup_put() helper, which helps to eliminate guarding memcg css release with "#ifdef CONFIG_MEMCG" in multiple places. 
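As a rough illustration of what the helper buys its callers, a minimal sketch of a release path that previously needed an "#ifdef CONFIG_MEMCG" guard around the css_put(); struct example_group and example_release_group() are hypothetical, only mem_cgroup_put() comes from this patch:

struct example_group {
	struct mem_cgroup *memcg;	/* memcg the group charges against */
};

static void example_release_group(struct example_group *group)
{
	/*
	 * Previously: #ifdef CONFIG_MEMCG ... css_put(&group->memcg->css) ... #endif
	 * Now: mem_cgroup_put() is an empty stub when CONFIG_MEMCG is off,
	 * so the caller can drop the guard entirely.
	 */
	mem_cgroup_put(group->memcg);
	kfree(group);
}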
Link: http://lkml.kernel.org/r/20180623000600.5818-2-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Reviewed-by: Andrew Morton Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 680d3395fc83..42f4719def32 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -378,6 +378,11 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; } +static inline void mem_cgroup_put(struct mem_cgroup *memcg) +{ + css_put(&memcg->css); +} + #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -850,6 +855,10 @@ static inline bool task_in_mem_cgroup(struct task_struct *task, return true; } +static inline void mem_cgroup_put(struct mem_cgroup *memcg) +{ +} + static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, -- cgit v1.2.3 From d46eb14b735b11927d4bdc2d1854c311af19de6d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 17 Aug 2018 15:46:39 -0700 Subject: fs: fsnotify: account fsnotify metadata to kmemcg Patch series "Directed kmem charging", v8. The Linux kernel's memory cgroup allows limiting the memory usage of the jobs running on the system to provide isolation between the jobs. All the kernel memory allocated in the context of the job and marked with __GFP_ACCOUNT will also be included in the memory usage and be limited by the job's limit. The kernel memory can only be charged to the memcg of the process in whose context kernel memory was allocated. However there are cases where the allocated kernel memory should be charged to the memcg different from the current processes's memcg. This patch series contains two such concrete use-cases i.e. fsnotify and buffer_head. The fsnotify event objects can consume a lot of system memory for large or unlimited queues if there is either no or slow listener. The events are allocated in the context of the event producer. However they should be charged to the event consumer. Similarly the buffer_head objects can be allocated in a memcg different from the memcg of the page for which buffer_head objects are being allocated. To solve this issue, this patch series introduces mechanism to charge kernel memory to a given memcg. In case of fsnotify events, the memcg of the consumer can be used for charging and for buffer_head, the memcg of the page can be charged. For directed charging, the caller can use the scope API memalloc_[un]use_memcg() to specify the memcg to charge for all the __GFP_ACCOUNT allocations within the scope. This patch (of 2): A lot of memory can be consumed by the events generated for the huge or unlimited queues if there is either no or slow listener. This can cause system level memory pressure or OOMs. So, it's better to account the fsnotify kmem caches to the memcg of the listener. However the listener can be in a different memcg than the memcg of the producer and these allocations happen in the context of the event producer. This patch introduces remote memcg charging API which the producer can use to charge the allocations to the memcg of the listener. 
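At a producer-side allocation site, the scope API described above ends up looking roughly like the following; the event structure and function name are hypothetical, while memalloc_use_memcg()/memalloc_unuse_memcg() and GFP_KERNEL_ACCOUNT are the interfaces this patch adds and uses:

static struct example_event *example_alloc_event(struct fsnotify_group *group)
{
	struct example_event *event;

	/*
	 * Every __GFP_ACCOUNT allocation in this scope is charged to the
	 * listener's memcg (group->memcg), not to the producing task.
	 */
	memalloc_use_memcg(group->memcg);
	event = kmalloc(sizeof(*event), GFP_KERNEL_ACCOUNT);
	memalloc_unuse_memcg();

	return event;
}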
There are seven fsnotify kmem caches and among them allocations from dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and inotify_inode_mark_cachep happens in the context of syscall from the listener. So, SLAB_ACCOUNT is enough for these caches. The objects from fsnotify_mark_connector_cachep are not accounted as they are small compared to the notification mark or events and it is unclear whom to account connector to since it is shared by all events attached to the inode. The allocations from the event caches happen in the context of the event producer. For such caches we will need to remote charge the allocations to the listener's memcg. Thus we save the memcg reference in the fsnotify_group structure of the listener. This patch has also moved the members of fsnotify_group to keep the size same, at least for 64 bit build, even with additional member by filling the holes. [shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it] Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Jan Kara Cc: Amir Goldstein Cc: Greg Thelen Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 5 +++-- fs/notify/fanotify/fanotify.c | 14 ++++++++++---- fs/notify/fanotify/fanotify_user.c | 5 ++++- fs/notify/group.c | 3 +++ fs/notify/inotify/inotify_fsnotify.c | 7 ++++++- fs/notify/inotify/inotify_user.c | 5 ++++- include/linux/fsnotify_backend.h | 12 ++++++++---- include/linux/memcontrol.h | 10 +++++++++- include/linux/sched.h | 3 +++ include/linux/sched/mm.h | 37 ++++++++++++++++++++++++++++++++++++ kernel/fork.c | 3 +++ mm/memcontrol.c | 37 ++++++++++++++++++++++++++++++++---- 12 files changed, 123 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index e2bea2ac5dfb..a6365e6bc047 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -384,8 +384,9 @@ out_err: static int __init dnotify_init(void) { - dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); - dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC); + dnotify_struct_cache = KMEM_CACHE(dnotify_struct, + SLAB_PANIC|SLAB_ACCOUNT); + dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); if (IS_ERR(dnotify_group)) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index f90842efea13..eb4e75175cfb 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "fanotify.h" @@ -140,8 +141,8 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, struct inode *inode, u32 mask, const struct path *path) { - struct fanotify_event_info *event; - gfp_t gfp = GFP_KERNEL; + struct fanotify_event_info *event = NULL; + gfp_t gfp = GFP_KERNEL_ACCOUNT; /* * For queues with unlimited length lost events are not expected and @@ -151,19 +152,22 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, if (group->max_events == UINT_MAX) gfp |= __GFP_NOFAIL; + /* Whoever is interested in the event, pays for the allocation. 
*/ + memalloc_use_memcg(group->memcg); + if (fanotify_is_perm_event(mask)) { struct fanotify_perm_event_info *pevent; pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); if (!pevent) - return NULL; + goto out; event = &pevent->fae; pevent->response = 0; goto init; } event = kmem_cache_alloc(fanotify_event_cachep, gfp); if (!event) - return NULL; + goto out; init: __maybe_unused fsnotify_init_event(&event->fse, inode, mask); event->tgid = get_pid(task_tgid(current)); @@ -174,6 +178,8 @@ init: __maybe_unused event->path.mnt = NULL; event->path.dentry = NULL; } +out: + memalloc_unuse_memcg(); return event; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index ec4d8c59d0e3..0cf45041dc32 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -756,6 +757,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); + group->memcg = get_mem_cgroup_from_mm(current->mm); oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL); if (unlikely(!oevent)) { @@ -957,7 +959,8 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { - fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); + fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, + SLAB_PANIC|SLAB_ACCOUNT); fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { fanotify_perm_event_cachep = diff --git a/fs/notify/group.c b/fs/notify/group.c index aa5468f23e45..c03b83662876 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "fsnotify.h" @@ -36,6 +37,8 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) if (group->ops->free_group_priv) group->ops->free_group_priv(group); + mem_cgroup_put(group->memcg); + kfree(group); } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 9ab6dde38a14..f4184b4f3815 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "inotify.h" @@ -98,7 +99,11 @@ int inotify_handle_event(struct fsnotify_group *group, i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); - event = kmalloc(alloc_len, GFP_KERNEL); + /* Whoever is interested in the event, pays for the allocation. 
*/ + memalloc_use_memcg(group->memcg); + event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT); + memalloc_unuse_memcg(); + if (unlikely(!event)) { /* * Treat lost event due to ENOMEM the same way as queue diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1cf5b779d862..749c46ababa0 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "inotify.h" #include "../fdinfo.h" @@ -636,6 +637,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) oevent->name_len = 0; group->max_events = max_events; + group->memcg = get_mem_cgroup_from_mm(current->mm); spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); @@ -808,7 +810,8 @@ static int __init inotify_user_setup(void) BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); - inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); + inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, + SLAB_PANIC|SLAB_ACCOUNT); inotify_max_queued_events = 16384; init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index b38964a7a521..a0c4790c5302 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -84,6 +84,8 @@ struct fsnotify_event_private_data; struct fsnotify_fname; struct fsnotify_iter_info; +struct mem_cgroup; + /* * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. @@ -127,6 +129,8 @@ struct fsnotify_event { * everything will be cleaned up. */ struct fsnotify_group { + const struct fsnotify_ops *ops; /* how this group handles things */ + /* * How the refcnt is used is up to each group. When the refcnt hits 0 * fsnotify will clean up all of the resources associated with this group. 
@@ -137,8 +141,6 @@ struct fsnotify_group { */ refcount_t refcnt; /* things with interest in this group */ - const struct fsnotify_ops *ops; /* how this group handles things */ - /* needed to send notification to userspace */ spinlock_t notification_lock; /* protect the notification_list */ struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ @@ -160,6 +162,8 @@ struct fsnotify_group { atomic_t num_marks; /* 1 for each mark and 1 for not being * past the point of no return when freeing * a group */ + atomic_t user_waits; /* Number of tasks waiting for user + * response */ struct list_head marks_list; /* all inode marks for this group */ struct fasync_struct *fsn_fa; /* async notification */ @@ -167,8 +171,8 @@ struct fsnotify_group { struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ - atomic_t user_waits; /* Number of tasks waiting for user - * response */ + + struct mem_cgroup *memcg; /* memcg to charge allocations */ /* groups can define private fields here or use the void *private */ union { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 42f4719def32..121e218d2a21 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -373,6 +373,8 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; @@ -380,7 +382,8 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ static inline void mem_cgroup_put(struct mem_cgroup *memcg) { - css_put(&memcg->css); + if (memcg) + css_put(&memcg->css); } #define mem_cgroup_from_counter(counter, member) \ @@ -855,6 +858,11 @@ static inline bool task_in_mem_cgroup(struct task_struct *task, return true; } +static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) +{ + return NULL; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 95a5018c338e..1827f4a7a6de 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1152,6 +1152,9 @@ struct task_struct { /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; + + /* Used by memcontrol for targeted memcg charge: */ + struct mem_cgroup *active_memcg; #endif #ifdef CONFIG_BLK_CGROUP diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 44d356f5e47c..aebb370a0006 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -248,6 +248,43 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } +#ifdef CONFIG_MEMCG +/** + * memalloc_use_memcg - Starts the remote memcg charging scope. + * @memcg: memcg to charge. + * + * This function marks the beginning of the remote memcg charging scope. All the + * __GFP_ACCOUNT allocations till the end of the scope will be charged to the + * given memcg. + * + * NOTE: This function is not nesting safe. 
+ */ +static inline void memalloc_use_memcg(struct mem_cgroup *memcg) +{ + WARN_ON_ONCE(current->active_memcg); + current->active_memcg = memcg; +} + +/** + * memalloc_unuse_memcg - Ends the remote memcg charging scope. + * + * This function marks the end of the remote memcg charging scope started by + * memalloc_use_memcg(). + */ +static inline void memalloc_unuse_memcg(void) +{ + current->active_memcg = NULL; +} +#else +static inline void memalloc_use_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void memalloc_unuse_memcg(void) +{ +} +#endif + #ifdef CONFIG_MEMBARRIER enum { MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), diff --git a/kernel/fork.c b/kernel/fork.c index 33112315b5c0..5ee74c113381 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -871,6 +871,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->use_memdelay = 0; #endif +#ifdef CONFIG_MEMCG + tsk->active_memcg = NULL; +#endif return tsk; free_stack: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b836e7f00309..bf9cf738c836 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -678,9 +678,20 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) } EXPORT_SYMBOL(mem_cgroup_from_task); -static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) +/** + * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg. + * @mm: mm from which memcg should be extracted. It can be NULL. + * + * Obtain a reference on mm->memcg and returns it if successful. Otherwise + * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is + * returned. + */ +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { - struct mem_cgroup *memcg = NULL; + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return NULL; rcu_read_lock(); do { @@ -700,6 +711,24 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) rcu_read_unlock(); return memcg; } +EXPORT_SYMBOL(get_mem_cgroup_from_mm); + +/** + * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg. + */ +static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) +{ + if (unlikely(current->active_memcg)) { + struct mem_cgroup *memcg = root_mem_cgroup; + + rcu_read_lock(); + if (css_tryget_online(¤t->active_memcg->css)) + memcg = current->active_memcg; + rcu_read_unlock(); + return memcg; + } + return get_mem_cgroup_from_mm(current->mm); +} /** * mem_cgroup_iter - iterate over memory cgroup hierarchy @@ -2261,7 +2290,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (current->memcg_kmem_skip_account) return cachep; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup_from_current(); kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) goto out; @@ -2345,7 +2374,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) if (memcg_kmem_bypass()) return 0; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup_from_current(); if (!mem_cgroup_is_root(memcg)) { ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); if (!ret) -- cgit v1.2.3 From f745c6f5fe75734f3b35d9d4e6ebe2a7d010ddda Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 17 Aug 2018 15:46:44 -0700 Subject: fs, mm: account buffer_head to kmemcg The buffer_head can consume a significant amount of system memory and is directly related to the amount of page cache. 
In our production environment we have observed that a lot of machines are spending a significant amount of memory as buffer_head and can not be left as system memory overhead. Charging buffer_head is not as simple as adding __GFP_ACCOUNT to the allocation. The buffer_heads can be allocated in a memcg different from the memcg of the page for which buffer_heads are being allocated. One concrete example is memory reclaim. The reclaim can trigger I/O of pages of any memcg on the system. So, the right way to charge buffer_head is to extract the memcg from the page for which buffer_heads are being allocated and then use targeted memcg charging API. [shakeelb@google.com: use __GFP_ACCOUNT for directed memcg charging] Link: http://lkml.kernel.org/r/20180702220208.213380-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20180627191250.209150-3-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Jan Kara Cc: Amir Goldstein Cc: Greg Thelen Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 12 ++++++++++-- include/linux/memcontrol.h | 7 +++++++ mm/memcontrol.c | 22 ++++++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index c8c2b7d8b8d6..4cc679d5bf58 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -45,6 +45,7 @@ #include #include #include +#include #include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -813,12 +814,16 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry) { struct buffer_head *bh, *head; - gfp_t gfp = GFP_NOFS; + gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; long offset; + struct mem_cgroup *memcg; if (retry) gfp |= __GFP_NOFAIL; + memcg = get_mem_cgroup_from_page(page); + memalloc_use_memcg(memcg); + head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { @@ -835,6 +840,9 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, /* Link the buffer to its page */ set_bh_page(bh, page, offset); } +out: + memalloc_unuse_memcg(); + mem_cgroup_put(memcg); return head; /* * In case anything failed, we just free everything we got. @@ -848,7 +856,7 @@ no_grow: } while (head); } - return NULL; + goto out; } EXPORT_SYMBOL_GPL(alloc_page_buffers); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 121e218d2a21..50e3e807b427 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -375,6 +375,8 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); +struct mem_cgroup *get_mem_cgroup_from_page(struct page *page); + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; @@ -863,6 +865,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) +{ + return NULL; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bf9cf738c836..c071af193986 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -713,6 +713,28 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) } EXPORT_SYMBOL(get_mem_cgroup_from_mm); +/** + * get_mem_cgroup_from_page: Obtain a reference on given page's memcg. 
+ * @page: page from which memcg should be extracted. + * + * Obtain a reference on page->memcg and returns it if successful. Otherwise + * root_mem_cgroup is returned. + */ +struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) +{ + struct mem_cgroup *memcg = page->mem_cgroup; + + if (mem_cgroup_disabled()) + return NULL; + + rcu_read_lock(); + if (!memcg || !css_tryget_online(&memcg->css)) + memcg = root_mem_cgroup; + rcu_read_unlock(); + return memcg; +} +EXPORT_SYMBOL(get_mem_cgroup_from_page); + /** * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg. */ -- cgit v1.2.3 From 0207df4fa1a869281ddbf72db6203dbf036b3e1a Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 17 Aug 2018 15:47:04 -0700 Subject: kernel/memremap, kasan: make ZONE_DEVICE with work with KASAN KASAN learns about hotadded memory via the memory hotplug notifier. devm_memremap_pages() intentionally skips calling memory hotplug notifiers. So KASAN doesn't know anything about new memory added by devm_memremap_pages(). This causes a crash when KASAN tries to access non-existent shadow memory: BUG: unable to handle kernel paging request at ffffed0078000000 RIP: 0010:check_memory_region+0x82/0x1e0 Call Trace: memcpy+0x1f/0x50 pmem_do_bvec+0x163/0x720 pmem_make_request+0x305/0xac0 generic_make_request+0x54f/0xcf0 submit_bio+0x9c/0x370 submit_bh_wbc+0x4c7/0x700 block_read_full_page+0x5ef/0x870 do_read_cache_page+0x2b8/0xb30 read_dev_sector+0xbd/0x3f0 read_lba.isra.0+0x277/0x670 efi_partition+0x41a/0x18f0 check_partition+0x30d/0x5e9 rescan_partitions+0x18c/0x840 __blkdev_get+0x859/0x1060 blkdev_get+0x23f/0x810 __device_add_disk+0x9c8/0xde0 pmem_attach_disk+0x9a8/0xf50 nvdimm_bus_probe+0xf3/0x3c0 driver_probe_device+0x493/0xbd0 bus_for_each_drv+0x118/0x1b0 __device_attach+0x1cd/0x2b0 bus_probe_device+0x1ac/0x260 device_add+0x90d/0x1380 nd_async_device_register+0xe/0x50 async_run_entry_fn+0xc3/0x5d0 process_one_work+0xa0a/0x1810 worker_thread+0x87/0xe80 kthread+0x2d7/0x390 ret_from_fork+0x3a/0x50 Add kasan_add_zero_shadow()/kasan_remove_zero_shadow() - post mm_init() interface to map/unmap kasan_zero_page at requested virtual addresses. And use it to add/remove the shadow memory for hotplugged/unplugged device memory. 
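To show how the new interface is meant to be paired on a hot-add style path, a minimal sketch follows; the surrounding add function and example_arch_add_memory() are hypothetical, and only the kasan_* calls are the interface added here:

static int example_add_device_memory(unsigned long phys_start, unsigned long size)
{
	int err;

	/* Map the zero shadow before the new range becomes visible ... */
	err = kasan_add_zero_shadow(__va(phys_start), size);
	if (err)
		return err;

	err = example_arch_add_memory(phys_start, size);	/* hypothetical */
	if (err)
		/* ... and unwind it if adding the memory fails. */
		kasan_remove_zero_shadow(__va(phys_start), size);
	return err;
}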
Link: http://lkml.kernel.org/r/20180629164932.740-1-aryabinin@virtuozzo.com Fixes: 41e94a851304 ("add devm_memremap_pages") Signed-off-by: Andrey Ryabinin Reported-by: Dave Chinner Reviewed-by: Dan Williams Tested-by: Dan Williams Cc: Dmitry Vyukov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 13 ++- kernel/memremap.c | 10 ++ mm/kasan/kasan_init.c | 316 +++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 325 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index de784fd11d12..46aae129917c 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -20,7 +20,7 @@ extern pmd_t kasan_zero_pmd[PTRS_PER_PMD]; extern pud_t kasan_zero_pud[PTRS_PER_PUD]; extern p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D]; -void kasan_populate_zero_shadow(const void *shadow_start, +int kasan_populate_zero_shadow(const void *shadow_start, const void *shadow_end); static inline void *kasan_mem_to_shadow(const void *addr) @@ -71,6 +71,9 @@ struct kasan_cache { int kasan_module_alloc(void *addr, size_t size); void kasan_free_shadow(const struct vm_struct *vm); +int kasan_add_zero_shadow(void *start, unsigned long size); +void kasan_remove_zero_shadow(void *start, unsigned long size); + size_t ksize(const void *); static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); } size_t kasan_metadata_size(struct kmem_cache *cache); @@ -124,6 +127,14 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} +static inline int kasan_add_zero_shadow(void *start, unsigned long size) +{ + return 0; +} +static inline void kasan_remove_zero_shadow(void *start, + unsigned long size) +{} + static inline void kasan_unpoison_slab(const void *ptr) { } static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } diff --git a/kernel/memremap.c b/kernel/memremap.c index 38283363da06..1f87ea6b6545 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -137,6 +138,7 @@ static void devm_memremap_pages_release(void *data) mem_hotplug_begin(); arch_remove_memory(align_start, align_size, pgmap->altmap_valid ? 
&pgmap->altmap : NULL); + kasan_remove_zero_shadow(__va(align_start), align_size); mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); @@ -239,6 +241,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_pfn_remap; mem_hotplug_begin(); + error = kasan_add_zero_shadow(__va(align_start), align_size); + if (error) { + mem_hotplug_done(); + goto err_kasan; + } + error = arch_add_memory(nid, align_start, align_size, altmap, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], @@ -267,6 +275,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) return __va(res->start); err_add_memory: + kasan_remove_zero_shadow(__va(align_start), align_size); + err_kasan: untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: err_radix: diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index f436246ccc79..7a2a2f13f86f 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -17,10 +17,13 @@ #include #include #include +#include #include #include +#include "kasan.h" + /* * This page serves two purposes: * - It used as early shadow memory. The entire shadow region populated @@ -32,22 +35,59 @@ unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; #if CONFIG_PGTABLE_LEVELS > 4 p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d)); +} +#else +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return 0; +} #endif #if CONFIG_PGTABLE_LEVELS > 3 pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; +static inline bool kasan_pud_table(p4d_t p4d) +{ + return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud)); +} +#else +static inline bool kasan_pud_table(p4d_t p4d) +{ + return 0; +} #endif #if CONFIG_PGTABLE_LEVELS > 2 pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; +static inline bool kasan_pmd_table(pud_t pud) +{ + return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd)); +} +#else +static inline bool kasan_pmd_table(pud_t pud) +{ + return 0; +} #endif pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; +static inline bool kasan_pte_table(pmd_t pmd) +{ + return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte)); +} + +static inline bool kasan_zero_page_entry(pte_t pte) +{ + return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page)); +} + static __init void *early_alloc(size_t size, int node) { return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, node); } -static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, +static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, addr); @@ -63,7 +103,7 @@ static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, } } -static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, +static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd = pmd_offset(pud, addr); @@ -78,14 +118,24 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, } if (pmd_none(*pmd)) { - pmd_populate_kernel(&init_mm, pmd, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pte_t *p; + + if (slab_is_available()) + p = pte_alloc_one_kernel(&init_mm, addr); + else + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + + pmd_populate_kernel(&init_mm, pmd, p); } zero_pte_populate(pmd, addr, next); } while (pmd++, 
addr = next, addr != end); + + return 0; } -static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr, +static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end) { pud_t *pud = pud_offset(p4d, addr); @@ -103,14 +153,24 @@ static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr, } if (pud_none(*pud)) { - pud_populate(&init_mm, pud, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pmd_t *p; + + if (slab_is_available()) { + p = pmd_alloc(&init_mm, pud, addr); + if (!p) + return -ENOMEM; + } else { + pud_populate(&init_mm, pud, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_pmd_populate(pud, addr, next); } while (pud++, addr = next, addr != end); + + return 0; } -static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, +static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end) { p4d_t *p4d = p4d_offset(pgd, addr); @@ -132,11 +192,21 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, } if (p4d_none(*p4d)) { - p4d_populate(&init_mm, p4d, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pud_t *p; + + if (slab_is_available()) { + p = pud_alloc(&init_mm, p4d, addr); + if (!p) + return -ENOMEM; + } else { + p4d_populate(&init_mm, p4d, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_pud_populate(p4d, addr, next); } while (p4d++, addr = next, addr != end); + + return 0; } /** @@ -145,7 +215,7 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, * @shadow_start - start of the memory range to populate * @shadow_end - end of the memory range to populate */ -void __init kasan_populate_zero_shadow(const void *shadow_start, +int __ref kasan_populate_zero_shadow(const void *shadow_start, const void *shadow_end) { unsigned long addr = (unsigned long)shadow_start; @@ -191,9 +261,229 @@ void __init kasan_populate_zero_shadow(const void *shadow_start, } if (pgd_none(*pgd)) { - pgd_populate(&init_mm, pgd, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + p4d_t *p; + + if (slab_is_available()) { + p = p4d_alloc(&init_mm, pgd, addr); + if (!p) + return -ENOMEM; + } else { + pgd_populate(&init_mm, pgd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_p4d_populate(pgd, addr, next); } while (pgd++, addr = next, addr != end); + + return 0; +} + +static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pte_free_kernel(&init_mm, (pte_t *)page_to_virt(pmd_page(*pmd))); + pmd_clear(pmd); +} + +static void kasan_free_pmd(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pmd_free(&init_mm, (pmd_t *)page_to_virt(pud_page(*pud))); + pud_clear(pud); +} + +static void kasan_free_pud(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + pud_free(&init_mm, (pud_t *)page_to_virt(p4d_page(*p4d))); + p4d_clear(p4d); +} + +static void kasan_free_p4d(p4d_t *p4d_start, pgd_t *pgd) +{ + p4d_t *p4d; + int i; + + for (i = 0; i < PTRS_PER_P4D; i++) { + p4d = p4d_start + i; + if (!p4d_none(*p4d)) + return; + } + + p4d_free(&init_mm, (p4d_t *)page_to_virt(pgd_page(*pgd))); + pgd_clear(pgd); +} + +static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) 
& PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + if (WARN_ON(!kasan_zero_page_entry(*pte))) + continue; + pte_clear(&init_mm, addr, pte); + } +} + +static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pmd++) { + pte_t *pte; + + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (kasan_pte_table(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) + pmd_clear(pmd); + continue; + } + pte = pte_offset_kernel(pmd, addr); + kasan_remove_pte_table(pte, addr, next); + kasan_free_pte(pte_offset_kernel(pmd, 0), pmd); + } +} + +static void kasan_remove_pud_table(pud_t *pud, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pud++) { + pmd_t *pmd, *pmd_base; + + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (kasan_pmd_table(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) + pud_clear(pud); + continue; + } + pmd = pmd_offset(pud, addr); + pmd_base = pmd_offset(pud, 0); + kasan_remove_pmd_table(pmd, addr, next); + kasan_free_pmd(pmd_base, pud); + } +} + +static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, p4d++) { + pud_t *pud; + + next = p4d_addr_end(addr, end); + + if (!p4d_present(*p4d)) + continue; + + if (kasan_pud_table(*p4d)) { + if (IS_ALIGNED(addr, P4D_SIZE) && + IS_ALIGNED(next, P4D_SIZE)) + p4d_clear(p4d); + continue; + } + pud = pud_offset(p4d, addr); + kasan_remove_pud_table(pud, addr, next); + kasan_free_pud(pud_offset(p4d, 0), p4d); + } +} + +void kasan_remove_zero_shadow(void *start, unsigned long size) +{ + unsigned long addr, end, next; + pgd_t *pgd; + + addr = (unsigned long)kasan_mem_to_shadow(start); + end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % + (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) || + WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + return; + + for (; addr < end; addr = next) { + p4d_t *p4d; + + next = pgd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + continue; + + if (kasan_p4d_table(*pgd)) { + if (IS_ALIGNED(addr, PGDIR_SIZE) && + IS_ALIGNED(next, PGDIR_SIZE)) + pgd_clear(pgd); + continue; + } + + p4d = p4d_offset(pgd, addr); + kasan_remove_p4d_table(p4d, addr, next); + kasan_free_p4d(p4d_offset(pgd, 0), pgd); + } +} + +int kasan_add_zero_shadow(void *start, unsigned long size) +{ + int ret; + void *shadow_start, *shadow_end; + + shadow_start = kasan_mem_to_shadow(start); + shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % + (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) || + WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + return -EINVAL; + + ret = kasan_populate_zero_shadow(shadow_start, shadow_end); + if (ret) + kasan_remove_zero_shadow(shadow_start, + size >> KASAN_SHADOW_SCALE_SHIFT); + return ret; } -- cgit v1.2.3 From 29ef680ae7c21110af8e6416d84d8a72fc147b14 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 17 Aug 2018 15:47:11 -0700 Subject: memcg, oom: move out_of_memory back to the charge path Commit 3812c8c8f395 ("mm: memcg: do not trap chargers with full callstack on OOM") has changed the ENOMEM semantic of memcg charges. 
Rather than invoking the oom killer from the charging context it delays the oom killer to the page fault path (pagefault_out_of_memory). This in turn means that many users (e.g. slab or g-u-p) will get ENOMEM when the corresponding memcg hits the hard limit and the memcg is OOM. This behavior is inconsistent with the !memcg case, where the oom killer is invoked from the allocation context and the allocator keeps retrying until it succeeds.

The difference in behavior is user visible. mmap(MAP_POPULATE) might result in not fully populated ranges while the mmap return code doesn't tell that to userspace. Random syscalls might fail with ENOMEM etc.

The primary motivation for the different memcg oom semantic was deadlock avoidance. Things have changed since then, though. We now have an async oom teardown by the oom reaper, so we no longer have to rely on the victim to tear down its own memory. Therefore we can return to the original semantic as long as the memcg oom killer is not handed over to user space.

There is still one thing to be careful about here, though. If the oom killer is not able to make any forward progress - e.g. because there is no eligible task to kill - then we have to bail out of the charge path to prevent the same class of deadlocks. We have basically two options here. Either we fail the charge with ENOMEM or we force the charge and allow overcharge. The first option has been considered more harmful than useful because rare inconsistencies in the ENOMEM behavior are hard to test for and error prone. This is basically the same reason why the page allocator doesn't fail allocations under such conditions. The latter might allow runaways, but those should be really unlikely unless somebody misconfigures the system, e.g. by allowing tasks to migrate away from the memcg to a different unlimited memcg with move_charge_at_immigrate disabled.
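For orientation, the new charge-side decision can be condensed into the following sketch; it paraphrases the hunks below rather than reproducing them, and the helper name memcg_oom_sketch() is invented for illustration (enum oom_status is the one added by this patch):

	enum oom_status { OOM_SUCCESS, OOM_FAILED, OOM_ASYNC, OOM_SKIPPED };

	/* Condensed paraphrase of the new mem_cgroup_oom() logic. */
	static enum oom_status memcg_oom_sketch(struct mem_cgroup *memcg,
						gfp_t mask, int order)
	{
		if (order > PAGE_ALLOC_COSTLY_ORDER)
			return OOM_SKIPPED;	/* costly charges simply fail */

		if (memcg->oom_kill_disable) {
			if (!current->in_user_fault)
				return OOM_SKIPPED;
			return OOM_ASYNC;	/* cgroup1: defer to the end of the page fault */
		}

		if (mem_cgroup_out_of_memory(memcg, mask, order))
			return OOM_SUCCESS;	/* a victim was killed, retry the charge */

		return OOM_FAILED;		/* no eligible victim, force the charge */
	}

In try_charge(), OOM_SUCCESS restarts the reclaim/charge retry loop, OOM_FAILED forces the charge instead of returning ENOMEM, and OOM_ASYNC preserves the old deferred cgroup1 behavior.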
Link: http://lkml.kernel.org/r/20180628151101.25307-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Greg Thelen Cc: Johannes Weiner Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 16 +++++----- include/linux/sched.h | 2 +- mm/memcontrol.c | 75 ++++++++++++++++++++++++++++++++++++---------- mm/memory.c | 4 +-- 4 files changed, 71 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 50e3e807b427..57a202f31683 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -507,16 +507,16 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); -static inline void mem_cgroup_oom_enable(void) +static inline void mem_cgroup_enter_user_fault(void) { - WARN_ON(current->memcg_may_oom); - current->memcg_may_oom = 1; + WARN_ON(current->in_user_fault); + current->in_user_fault = 1; } -static inline void mem_cgroup_oom_disable(void) +static inline void mem_cgroup_exit_user_fault(void) { - WARN_ON(!current->memcg_may_oom); - current->memcg_may_oom = 0; + WARN_ON(!current->in_user_fault); + current->in_user_fault = 0; } static inline bool task_in_memcg_oom(struct task_struct *p) @@ -961,11 +961,11 @@ static inline void mem_cgroup_handle_over_high(void) { } -static inline void mem_cgroup_oom_enable(void) +static inline void mem_cgroup_enter_user_fault(void) { } -static inline void mem_cgroup_oom_disable(void) +static inline void mem_cgroup_exit_user_fault(void) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 1827f4a7a6de..066a2c328653 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -722,7 +722,7 @@ struct task_struct { unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG - unsigned memcg_may_oom:1; + unsigned in_user_fault:1; #ifndef CONFIG_SLOB unsigned memcg_kmem_skip_account:1; #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c071af193986..d6724bed57d8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1534,28 +1534,53 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } -static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) +enum oom_status { + OOM_SUCCESS, + OOM_FAILED, + OOM_ASYNC, + OOM_SKIPPED +}; + +static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER) - return; + if (order > PAGE_ALLOC_COSTLY_ORDER) + return OOM_SKIPPED; + /* * We are in the middle of the charge context here, so we * don't want to block when potentially sitting on a callstack * that holds all kinds of filesystem and mm locks. * - * Also, the caller may handle a failed allocation gracefully - * (like optional page cache readahead) and so an OOM killer - * invocation might not even be necessary. + * cgroup1 allows disabling the OOM killer and waiting for outside + * handling until the charge can succeed; remember the context and put + * the task to sleep at the end of the page fault when all locks are + * released. + * + * On the other hand, in-kernel OOM killer allows for an async victim + * memory reclaim (oom_reaper) and that means that we are not solely + * relying on the oom victim to make a forward progress and we can + * invoke the oom killer here. 
* - * That's why we don't do anything here except remember the - * OOM context and then deal with it at the end of the page - * fault when the stack is unwound, the locks are released, - * and when we know whether the fault was overall successful. + * Please note that mem_cgroup_out_of_memory might fail to find a + * victim and then we have to bail out from the charge path. */ - css_get(&memcg->css); - current->memcg_in_oom = memcg; - current->memcg_oom_gfp_mask = mask; - current->memcg_oom_order = order; + if (memcg->oom_kill_disable) { + if (!current->in_user_fault) + return OOM_SKIPPED; + css_get(&memcg->css); + current->memcg_in_oom = memcg; + current->memcg_oom_gfp_mask = mask; + current->memcg_oom_order = order; + + return OOM_ASYNC; + } + + if (mem_cgroup_out_of_memory(memcg, mask, order)) + return OOM_SUCCESS; + + WARN(1,"Memory cgroup charge failed because of no reclaimable memory! " + "This looks like a misconfiguration or a kernel bug."); + return OOM_FAILED; } /** @@ -1950,6 +1975,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; + bool oomed = false; + enum oom_status oom_status; if (mem_cgroup_is_root(memcg)) return 0; @@ -2037,6 +2064,9 @@ retry: if (nr_retries--) goto retry; + if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed) + goto nomem; + if (gfp_mask & __GFP_NOFAIL) goto force; @@ -2045,8 +2075,23 @@ retry: memcg_memory_event(mem_over_limit, MEMCG_OOM); - mem_cgroup_oom(mem_over_limit, gfp_mask, + /* + * keep retrying as long as the memcg oom killer is able to make + * a forward progress or bypass the charge if the oom killer + * couldn't make any progress. + */ + oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); + switch (oom_status) { + case OOM_SUCCESS: + nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + oomed = true; + goto retry; + case OOM_FAILED: + goto force; + default: + goto nomem; + } nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; diff --git a/mm/memory.c b/mm/memory.c index 175f344e1523..ae2ec887508b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4153,7 +4153,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, * space. Kernel faults are handled more gracefully. */ if (flags & FAULT_FLAG_USER) - mem_cgroup_oom_enable(); + mem_cgroup_enter_user_fault(); if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); @@ -4161,7 +4161,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, ret = __handle_mm_fault(vma, address, flags); if (flags & FAULT_FLAG_USER) { - mem_cgroup_oom_disable(); + mem_cgroup_exit_user_fault(); /* * The task may have entered a memcg OOM situation but * if the allocation error was handled gracefully (no -- cgit v1.2.3 From 84c07d11aa619c6d24c682f469b10f344f0c02aa Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:25 -0700 Subject: mm: introduce CONFIG_MEMCG_KMEM as combination of CONFIG_MEMCG && !CONFIG_SLOB Introduce new config option, which is used to replace repeating CONFIG_MEMCG && !CONFIG_SLOB pattern. Next patches add a little more memcg+kmem related code, so let's keep the defines more clearly. 
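The conversion itself is mechanical; once the option exists, memcg+kmem code needs only a single guard instead of the compound test. A minimal illustration, with a placeholder function that is not part of the patch:

	#ifdef CONFIG_MEMCG_KMEM	/* was: #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) */
	static inline bool demo_kmem_accounting_active(void)
	{
		return true;
	}
	#else
	static inline bool demo_kmem_accounting_active(void)
	{
		return false;
	}
	#endif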
Link: http://lkml.kernel.org/r/153063053670.1818.15013136946600481138.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 4 ++-- include/linux/memcontrol.h | 6 +++--- include/linux/sched.h | 2 +- include/linux/slab.h | 2 +- init/Kconfig | 5 +++++ mm/list_lru.c | 8 ++++---- mm/memcontrol.c | 16 ++++++++-------- mm/slab.h | 6 +++--- mm/slab_common.c | 8 ++++---- 9 files changed, 31 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 96def9d15b1b..2d23b5b745be 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -42,7 +42,7 @@ struct list_lru_node { spinlock_t lock; /* global list, used for the root cgroup in cgroup aware lrus */ struct list_lru_one lru; -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ struct list_lru_memcg __rcu *memcg_lrus; #endif @@ -51,7 +51,7 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM struct list_head list; #endif }; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 57a202f31683..f3c026df7443 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -271,7 +271,7 @@ struct mem_cgroup { bool tcpmem_active; int tcpmem_pressure; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; enum memcg_kmem_state kmem_state; @@ -1231,7 +1231,7 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); void memcg_kmem_uncharge(struct page *page, int order); -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM extern struct static_key_false memcg_kmem_enabled_key; extern struct workqueue_struct *memcg_kmem_cache_wq; @@ -1284,6 +1284,6 @@ static inline void memcg_put_cache_ids(void) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 066a2c328653..789923fbee3a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -723,7 +723,7 @@ struct task_struct { #endif #ifdef CONFIG_MEMCG unsigned in_user_fault:1; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM unsigned memcg_kmem_skip_account:1; #endif #endif diff --git a/include/linux/slab.h b/include/linux/slab.h index 14e3fe4bd6a1..ed9cbddeb4a6 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -97,7 +97,7 @@ # define SLAB_FAILSLAB 0 #endif /* Account to memcg */ -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM # define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000U) #else # define SLAB_ACCOUNT 0 diff --git a/init/Kconfig b/init/Kconfig index 4dc783023e43..9bd50ba8253f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -708,6 +708,11 @@ 
config MEMCG_SWAP_ENABLED select this option (if, for some reason, they need to disable it then swapaccount=0 does the trick). +config MEMCG_KMEM + bool + depends on MEMCG && !SLOB + default y + config BLK_CGROUP bool "IO controller" depends on BLOCK diff --git a/mm/list_lru.c b/mm/list_lru.c index b65e0b9b0646..c5217d84c6e1 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -12,7 +12,7 @@ #include #include -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM static LIST_HEAD(list_lrus); static DEFINE_MUTEX(list_lrus_mutex); @@ -103,7 +103,7 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) { return &nlru->lru; } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ bool list_lru_add(struct list_lru *lru, struct list_head *item) { @@ -284,7 +284,7 @@ static void init_one_lru(struct list_lru_one *l) l->nr_items = 0; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, int begin, int end) { @@ -543,7 +543,7 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) static void memcg_destroy_list_lru(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d6724bed57d8..2f00b455080f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -251,7 +251,7 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. * The main reason for not using cgroup id for this: @@ -305,7 +305,7 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); struct workqueue_struct *memcg_kmem_cache_wq; -#endif /* !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ /** * mem_cgroup_css_from_page - css of the memcg associated with a page @@ -2215,7 +2215,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, unlock_page_lru(page, isolated); } -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM static int memcg_alloc_cache_id(void) { int id, size; @@ -2480,7 +2480,7 @@ void memcg_kmem_uncharge(struct page *page, int order) css_put_many(&memcg->css, nr_pages); } -#endif /* !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2875,7 +2875,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { int memcg_id; @@ -2975,7 +2975,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) static void memcg_free_kmem(struct mem_cgroup *memcg) { } -#endif /* !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ static int memcg_update_kmem_max(struct mem_cgroup *memcg, unsigned long max) @@ -4279,7 +4279,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); memcg->socket_pressure = jiffies; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -6119,7 +6119,7 @@ static int __init mem_cgroup_init(void) { int cpu, node; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM /* * Kmem cache creation is mostly done with the slab_mutex held, * so use a workqueue with limited concurrency to avoid stalling diff --git a/mm/slab.h b/mm/slab.h index 
68bdf498da3b..58c6c1c2a78e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -203,7 +203,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM /* List of all root caches. */ extern struct list_head slab_root_caches; @@ -296,7 +296,7 @@ extern void memcg_link_cache(struct kmem_cache *s); extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, void (*deact_fn)(struct kmem_cache *)); -#else /* CONFIG_MEMCG && !CONFIG_SLOB */ +#else /* CONFIG_MEMCG_KMEM */ /* If !memcg, all caches are root. */ #define slab_root_caches slab_caches @@ -351,7 +351,7 @@ static inline void memcg_link_cache(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 2296caf87bfb..fea3376f9816 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -127,7 +127,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, return i; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM LIST_HEAD(slab_root_caches); @@ -256,7 +256,7 @@ static inline void destroy_memcg_params(struct kmem_cache *s) static inline void memcg_unlink_cache(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ /* * Figure out what the alignment of the objects will be given a set of @@ -584,7 +584,7 @@ static int shutdown_cache(struct kmem_cache *s) return 0; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. @@ -861,7 +861,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s) static inline void flush_memcg_workqueue(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) { -- cgit v1.2.3 From b4c2b231c3ba155623591fb6301ed97b95e1c039 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:29 -0700 Subject: mm: assign id to every memcg-aware shrinker Introduce shrinker::id number, which is used to enumerate memcg-aware shrinkers. The numbers start from 0, and the code tries to keep them as small as possible. This will be used to represent memcg-aware shrinkers in the memcg shrinkers map. Since all memcg-aware shrinkers are based on list_lru, which is per-memcg in case of CONFIG_MEMCG_KMEM only, the new functionality will be under this config option.
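For context, registering a memcg-aware shrinker does not change from the caller's point of view; the id is assigned internally during preallocation. A sketch under that assumption, with hypothetical names for the callbacks and the shrinker itself (only the flags and the prealloc_shrinker()/register_shrinker_prepared() calls are existing kernel API):

	#include <linux/init.h>
	#include <linux/shrinker.h>

	static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
	{
		return 0;		/* nothing to reclaim in this sketch */
	}

	static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
	{
		return SHRINK_STOP;
	}

	static struct shrinker demo_shrinker = {
		.count_objects	= demo_count,
		.scan_objects	= demo_scan,
		.seeks		= DEFAULT_SEEKS,
		.flags		= SHRINKER_MEMCG_AWARE | SHRINKER_NUMA_AWARE,
	};

	static int __init demo_init(void)
	{
		int err = prealloc_shrinker(&demo_shrinker);	/* assigns demo_shrinker.id here */

		if (err)
			return err;
		register_shrinker_prepared(&demo_shrinker);
		return 0;
	}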
[ktkhai@virtuozzo.com: v9] Link: http://lkml.kernel.org/r/153112546435.4097.10607140323811756557.stgit@localhost.localdomain Link: http://lkml.kernel.org/r/153063054586.1818.6041047871606697364.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shrinker.h | 4 +++ mm/vmscan.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) (limited to 'include/linux') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 6794490f25b2..7ca9c18cf130 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -66,6 +66,10 @@ struct shrinker { /* These are for internal use */ struct list_head list; +#ifdef CONFIG_MEMCG_KMEM + /* ID in shrinker_idr */ + int id; +#endif /* objs pending delete, per node */ atomic_long_t *nr_deferred; }; diff --git a/mm/vmscan.c b/mm/vmscan.c index a00d94530e57..5cb4f779ea4a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -169,6 +169,50 @@ unsigned long vm_total_pages; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); +#ifdef CONFIG_MEMCG_KMEM +static DEFINE_IDR(shrinker_idr); +static int shrinker_nr_max; + +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + int id, ret = -ENOMEM; + + down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ + id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); + if (id < 0) + goto unlock; + + if (id >= shrinker_nr_max) + shrinker_nr_max = id + 1; + shrinker->id = id; + ret = 0; +unlock: + up_write(&shrinker_rwsem); + return ret; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ + int id = shrinker->id; + + BUG_ON(id < 0); + + down_write(&shrinker_rwsem); + idr_remove(&shrinker_idr, id); + up_write(&shrinker_rwsem); +} +#else /* CONFIG_MEMCG_KMEM */ +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + return 0; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_MEMCG static bool global_reclaim(struct scan_control *sc) { @@ -313,11 +357,28 @@ int prealloc_shrinker(struct shrinker *shrinker) shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); if (!shrinker->nr_deferred) return -ENOMEM; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + if (prealloc_memcg_shrinker(shrinker)) + goto free_deferred; + } + return 0; + +free_deferred: + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; + return -ENOMEM; } void free_prealloced_shrinker(struct shrinker *shrinker) { + if (!shrinker->nr_deferred) + return; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); + kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } @@ -347,6 +408,8 @@ void unregister_shrinker(struct shrinker *shrinker) { if (!shrinker->nr_deferred) return; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); -- cgit v1.2.3 From 
0a4465d340282f92719f4e3a56545a848e638d15 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:37 -0700 Subject: mm, memcg: assign memcg-aware shrinkers bitmap to memcg Imagine a big node with many cpus, memory cgroups and containers. Let we have 200 containers, every container has 10 mounts, and 10 cgroups. All container tasks don't touch foreign containers mounts. If there is intensive pages write, and global reclaim happens, a writing task has to iterate over all memcgs to shrink slab, before it's able to go to shrink_page_list(). Iteration over all the memcg slabs is very expensive: the task has to visit 200 * 10 = 2000 shrinkers for every memcg, and since there are 2000 memcgs, the total calls are 2000 * 2000 = 4000000. So, the shrinker makes 4 million do_shrink_slab() calls just to try to isolate SWAP_CLUSTER_MAX pages in one of the actively writing memcg via shrink_page_list(). I've observed a node spending almost 100% in kernel, making useless iteration over already shrinked slab. This patch adds bitmap of memcg-aware shrinkers to memcg. The size of the bitmap depends on bitmap_nr_ids, and during memcg life it's maintained to be enough to fit bitmap_nr_ids shrinkers. Every bit in the map is related to corresponding shrinker id. Next patches will maintain set bit only for really charged memcg. This will allow shrink_slab() to increase its performance in significant way. See the last patch for the numbers. [ktkhai@virtuozzo.com: v9] Link: http://lkml.kernel.org/r/153112549031.4097.3576147070498769979.stgit@localhost.localdomain [ktkhai@virtuozzo.com: add comment to mem_cgroup_css_online()] Link: http://lkml.kernel.org/r/521f9e5f-c436-b388-fe83-4dc870bfb489@virtuozzo.com Link: http://lkml.kernel.org/r/153063056619.1818.12550500883688681076.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 14 +++++ mm/memcontrol.c | 124 +++++++++++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 8 ++- 3 files changed, 145 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f3c026df7443..2cccbb9e1b3e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -111,6 +111,15 @@ struct lruvec_stat { long count[NR_VM_NODE_STAT_ITEMS]; }; +/* + * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, + * which have elements charged to this memcg. + */ +struct memcg_shrinker_map { + struct rcu_head rcu; + unsigned long map[0]; +}; + /* * per-zone information in memory controller. */ @@ -124,6 +133,9 @@ struct mem_cgroup_per_node { struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; +#ifdef CONFIG_MEMCG_KMEM + struct memcg_shrinker_map __rcu *shrinker_map; +#endif struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ @@ -1262,6 +1274,8 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? 
memcg->kmemcg_id : -1; } +extern int memcg_expand_shrinker_maps(int new_id); + #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 313355dddf66..827c9e87ca08 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -320,6 +320,119 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); struct workqueue_struct *memcg_kmem_cache_wq; +static int memcg_shrinker_map_size; +static DEFINE_MUTEX(memcg_shrinker_map_mutex); + +static void memcg_free_shrinker_map_rcu(struct rcu_head *head) +{ + kvfree(container_of(head, struct memcg_shrinker_map, rcu)); +} + +static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, + int size, int old_size) +{ + struct memcg_shrinker_map *new, *old; + int nid; + + lockdep_assert_held(&memcg_shrinker_map_mutex); + + for_each_node(nid) { + old = rcu_dereference_protected( + mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true); + /* Not yet online memcg */ + if (!old) + return 0; + + new = kvmalloc(sizeof(*new) + size, GFP_KERNEL); + if (!new) + return -ENOMEM; + + /* Set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_size); + memset((void *)new->map + old_size, 0, size - old_size); + + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new); + call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); + } + + return 0; +} + +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *pn; + struct memcg_shrinker_map *map; + int nid; + + if (mem_cgroup_is_root(memcg)) + return; + + for_each_node(nid) { + pn = mem_cgroup_nodeinfo(memcg, nid); + map = rcu_dereference_protected(pn->shrinker_map, true); + if (map) + kvfree(map); + rcu_assign_pointer(pn->shrinker_map, NULL); + } +} + +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) +{ + struct memcg_shrinker_map *map; + int nid, size, ret = 0; + + if (mem_cgroup_is_root(memcg)) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + size = memcg_shrinker_map_size; + for_each_node(nid) { + map = kvzalloc(sizeof(*map) + size, GFP_KERNEL); + if (!map) { + memcg_free_shrinker_maps(memcg); + ret = -ENOMEM; + break; + } + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); + } + mutex_unlock(&memcg_shrinker_map_mutex); + + return ret; +} + +int memcg_expand_shrinker_maps(int new_id) +{ + int size, old_size, ret = 0; + struct mem_cgroup *memcg; + + size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); + old_size = memcg_shrinker_map_size; + if (size <= old_size) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + if (!root_mem_cgroup) + goto unlock; + + for_each_mem_cgroup(memcg) { + if (mem_cgroup_is_root(memcg)) + continue; + ret = memcg_expand_one_shrinker_map(memcg, size, old_size); + if (ret) + goto unlock; + } +unlock: + if (!ret) + memcg_shrinker_map_size = size; + mutex_unlock(&memcg_shrinker_map_mutex); + return ret; +} +#else /* CONFIG_MEMCG_KMEM */ +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) +{ + return 0; +} +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { } #endif /* CONFIG_MEMCG_KMEM */ /** @@ -4356,6 +4469,16 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + /* + * A memcg must be visible for memcg_expand_shrinker_maps() + * by the time the maps are allocated. So, we allocate maps + * here, when for_each_mem_cgroup() can't skip it. 
+ */ + if (memcg_alloc_shrinker_maps(memcg)) { + mem_cgroup_id_remove(memcg); + return -ENOMEM; + } + /* Online state pins memcg ID, memcg ID pins CSS */ atomic_set(&memcg->id.ref, 1); css_get(css); @@ -4408,6 +4531,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); + memcg_free_shrinker_maps(memcg); memcg_free_kmem(memcg); mem_cgroup_free(memcg); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 5cb4f779ea4a..db0970ba340d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -183,8 +183,14 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) if (id < 0) goto unlock; - if (id >= shrinker_nr_max) + if (id >= shrinker_nr_max) { + if (memcg_expand_shrinker_maps(id)) { + idr_remove(&shrinker_idr, id); + goto unlock; + } + shrinker_nr_max = id + 1; + } shrinker->id = id; ret = 0; unlock: -- cgit v1.2.3 From c92e8e10cafeaaedc84f23fed1bfcf9cf07399c2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:50 -0700 Subject: fs: propagate shrinker::id to list_lru Add list_lru::shrinker_id field and populate it by registered shrinker id. This will be used to set correct bit in memcg shrinkers map by lru code in next patches, after there appeared the first related to memcg element in list_lru. Link: http://lkml.kernel.org/r/153063059758.1818.14866596416857717800.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 4 ++-- include/linux/list_lru.h | 14 +++++++++----- mm/list_lru.c | 11 ++++++++++- mm/workingset.c | 3 ++- 4 files changed, 23 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 78227c4ddb21..f5f96e52e0cd 100644 --- a/fs/super.c +++ b/fs/super.c @@ -261,9 +261,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; if (prealloc_shrinker(&s->s_shrink)) goto fail; - if (list_lru_init_memcg(&s->s_dentry_lru)) + if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) goto fail; - if (list_lru_init_memcg(&s->s_inode_lru)) + if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink)) goto fail; return s; diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 2d23b5b745be..9e75bb33766b 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -53,16 +53,20 @@ struct list_lru { struct list_lru_node *node; #ifdef CONFIG_MEMCG_KMEM struct list_head list; + int shrinker_id; #endif }; void list_lru_destroy(struct list_lru *lru); int __list_lru_init(struct list_lru *lru, bool memcg_aware, - struct lock_class_key *key); - -#define list_lru_init(lru) __list_lru_init((lru), false, NULL) -#define list_lru_init_key(lru, key) __list_lru_init((lru), false, (key)) -#define list_lru_init_memcg(lru) __list_lru_init((lru), true, NULL) + struct lock_class_key *key, struct shrinker *shrinker); + +#define list_lru_init(lru) \ + __list_lru_init((lru), false, NULL, NULL) +#define 
list_lru_init_key(lru, key) \ + __list_lru_init((lru), false, (key), NULL) +#define list_lru_init_memcg(lru, shrinker) \ + __list_lru_init((lru), true, NULL, shrinker) int memcg_update_all_list_lrus(int num_memcgs); void memcg_drain_all_list_lrus(int src_idx, int dst_idx); diff --git a/mm/list_lru.c b/mm/list_lru.c index c5217d84c6e1..5aebbb9b2f5b 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -546,12 +546,18 @@ static void memcg_destroy_list_lru(struct list_lru *lru) #endif /* CONFIG_MEMCG_KMEM */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, - struct lock_class_key *key) + struct lock_class_key *key, struct shrinker *shrinker) { int i; size_t size = sizeof(*lru->node) * nr_node_ids; int err = -ENOMEM; +#ifdef CONFIG_MEMCG_KMEM + if (shrinker) + lru->shrinker_id = shrinker->id; + else + lru->shrinker_id = -1; +#endif memcg_get_cache_ids(); lru->node = kzalloc(size, GFP_KERNEL); @@ -594,6 +600,9 @@ void list_lru_destroy(struct list_lru *lru) kfree(lru->node); lru->node = NULL; +#ifdef CONFIG_MEMCG_KMEM + lru->shrinker_id = -1; +#endif memcg_put_cache_ids(); } EXPORT_SYMBOL_GPL(list_lru_destroy); diff --git a/mm/workingset.c b/mm/workingset.c index 4e0b2523aae2..cd0b2ae615e4 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -526,7 +526,8 @@ static int __init workingset_init(void) ret = prealloc_shrinker(&workingset_shadow_shrinker); if (ret) goto err; - ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key); + ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, + &workingset_shadow_shrinker); if (ret) goto err_list_lru; register_shrinker_prepared(&workingset_shadow_shrinker); -- cgit v1.2.3 From 9bec5c35bfa3d41b046594b5890f772ed737f1fd Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:58 -0700 Subject: mm/list_lru: pass dst_memcg argument to memcg_drain_list_lru_node() This is just refactoring to allow the next patches to have dst_memcg pointer in memcg_drain_list_lru_node(). 
Link: http://lkml.kernel.org/r/153063062118.1818.2761273817739499749.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 2 +- mm/list_lru.c | 11 ++++++----- mm/memcontrol.c | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 9e75bb33766b..d9c16f2f2f00 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -69,7 +69,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, __list_lru_init((lru), true, NULL, shrinker) int memcg_update_all_list_lrus(int num_memcgs); -void memcg_drain_all_list_lrus(int src_idx, int dst_idx); +void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg); /** * list_lru_add: add an element to the lru list's tail diff --git a/mm/list_lru.c b/mm/list_lru.c index 1fc5be746e69..5384cda08984 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -502,8 +502,9 @@ fail: } static void memcg_drain_list_lru_node(struct list_lru_node *nlru, - int src_idx, int dst_idx) + int src_idx, struct mem_cgroup *dst_memcg) { + int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *src, *dst; /* @@ -523,7 +524,7 @@ static void memcg_drain_list_lru_node(struct list_lru_node *nlru, } static void memcg_drain_list_lru(struct list_lru *lru, - int src_idx, int dst_idx) + int src_idx, struct mem_cgroup *dst_memcg) { int i; @@ -531,16 +532,16 @@ static void memcg_drain_list_lru(struct list_lru *lru, return; for_each_node(i) - memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); + memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_memcg); } -void memcg_drain_all_list_lrus(int src_idx, int dst_idx) +void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg) { struct list_lru *lru; mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &list_lrus, list) - memcg_drain_list_lru(lru, src_idx, dst_idx); + memcg_drain_list_lru(lru, src_idx, dst_memcg); mutex_unlock(&list_lrus_mutex); } #else diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 827c9e87ca08..a35dc901424d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3060,7 +3060,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) } rcu_read_unlock(); - memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); + memcg_drain_all_list_lrus(kmemcg_id, parent); memcg_free_cache_id(kmemcg_id); } -- cgit v1.2.3 From dfd2f10ccfd7e6bd2a096eaf42e76a7229776322 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:48:06 -0700 Subject: mm/memcontrol.c: export mem_cgroup_is_root() This will be used in next patch. 
Link: http://lkml.kernel.org/r/153063064347.1818.1987011484100392706.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++++ mm/memcontrol.c | 5 ----- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2cccbb9e1b3e..258c8a46959a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -318,6 +318,11 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) +{ + return (memcg == root_mem_cgroup); +} + static inline bool mem_cgroup_disabled(void) { return !cgroup_subsys_enabled(memory_cgrp_subsys); @@ -784,6 +789,11 @@ void mem_cgroup_split_huge_fixup(struct page *head); struct mem_cgroup; +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) +{ + return true; +} + static inline bool mem_cgroup_disabled(void) { return true; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a35dc901424d..d8cd6c39eca5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -261,11 +261,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; } -static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) -{ - return (memcg == root_mem_cgroup); -} - #ifdef CONFIG_MEMCG_KMEM /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. -- cgit v1.2.3 From fae91d6d8be5e20c47e459dbeb3d43bd5f9486f4 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:48:10 -0700 Subject: mm/list_lru.c: set bit in memcg shrinker bitmap on first list_lru item appearance Introduce set_shrinker_bit() function to set shrinker-related bit in memcg shrinker bitmap, and set the bit after the first item is added and in case of reparenting destroyed memcg's items. This will allow next patch to make shrinkers be called only, in case of they have charged objects at the moment, and to improve shrink_slab() performance. 
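The point of setting the bit at the moment the first element appears is that a later reclaim walk can skip shrinkers whose bit is clear. A rough sketch of how such a walk might consult the map once the follow-up patch lands; this is an assumption about that patch, not code contained here, and locking and accounting details are simplified:

	static unsigned long shrink_slab_memcg_sketch(int nid, struct mem_cgroup *memcg,
						      int priority)
	{
		struct memcg_shrinker_map *map;
		unsigned long freed = 0;
		int i;

		if (!down_read_trylock(&shrinker_rwsem))
			return 0;

		map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map, true);
		if (!map)
			goto unlock;

		/* Visit only shrinkers whose bit was set by memcg_set_shrinker_bit(). */
		for_each_set_bit(i, map->map, shrinker_nr_max) {
			struct shrink_control sc = {
				.gfp_mask = GFP_KERNEL,
				.nid = nid,
				.memcg = memcg,
			};
			struct shrinker *shrinker = idr_find(&shrinker_idr, i);

			if (shrinker)
				freed += do_shrink_slab(&sc, shrinker, priority);
		}
	unlock:
		up_read(&shrinker_rwsem);
		return freed;
	}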
[ktkhai@virtuozzo.com: v9] Link: http://lkml.kernel.org/r/153112557572.4097.17315791419810749985.stgit@localhost.localdomain Link: http://lkml.kernel.org/r/153063065671.1818.15914674956134687268.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 4 ++++ mm/list_lru.c | 22 ++++++++++++++++++++-- mm/memcontrol.c | 13 +++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 258c8a46959a..0e6c515fb698 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1286,6 +1286,8 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) extern int memcg_expand_shrinker_maps(int new_id); +extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id); #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -1308,6 +1310,8 @@ static inline void memcg_put_cache_ids(void) { } +static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id) { } #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/list_lru.c b/mm/list_lru.c index c6131925ec76..c9bdde9c03d1 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -30,6 +30,11 @@ static void list_lru_unregister(struct list_lru *lru) mutex_unlock(&list_lrus_mutex); } +static int lru_shrinker_id(struct list_lru *lru) +{ + return lru->shrinker_id; +} + static inline bool list_lru_memcg_aware(struct list_lru *lru) { /* @@ -93,6 +98,11 @@ static void list_lru_unregister(struct list_lru *lru) { } +static int lru_shrinker_id(struct list_lru *lru) +{ + return -1; +} + static inline bool list_lru_memcg_aware(struct list_lru *lru) { return false; @@ -118,13 +128,17 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; + struct mem_cgroup *memcg; struct list_lru_one *l; spin_lock(&nlru->lock); if (list_empty(item)) { - l = list_lru_from_kmem(nlru, item, NULL); + l = list_lru_from_kmem(nlru, item, &memcg); list_add_tail(item, &l->list); - l->nr_items++; + /* Set shrinker bit if the first element was added */ + if (!l->nr_items++) + memcg_set_shrinker_bit(memcg, nid, + lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; @@ -507,6 +521,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, struct list_lru_node *nlru = &lru->node[nid]; int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *src, *dst; + bool set; /* * Since list_lru_{add,del} may be called under an IRQ-safe lock, @@ -518,7 +533,10 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, dst = list_lru_from_memcg_idx(nlru, dst_idx); list_splice_init(&src->list, &dst->list); + set = (!dst->nr_items && src->nr_items); dst->nr_items += src->nr_items; + if (set) + memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); src->nr_items = 0; spin_unlock_irq(&nlru->lock); diff --git a/mm/memcontrol.c 
b/mm/memcontrol.c index d8cd6c39eca5..55c010a58535 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -422,6 +422,19 @@ unlock: mutex_unlock(&memcg_shrinker_map_mutex); return ret; } + +void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) +{ + if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { + struct memcg_shrinker_map *map; + + rcu_read_lock(); + map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); + set_bit(shrinker_id, map->map); + rcu_read_unlock(); + } +} + #else /* CONFIG_MEMCG_KMEM */ static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) { -- cgit v1.2.3 From 9b996468cfdba09f688f52dba4287de596194613 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:48:21 -0700 Subject: mm: add SHRINK_EMPTY shrinker methods return value We need to distinguish the situations when shrinker has very small amount of objects (see vfs_pressure_ratio() called from super_cache_count()), and when it has no objects at all. Currently, in the both of these cases, shrinker::count_objects() returns 0. The patch introduces new SHRINK_EMPTY return value, which will be used for "no objects at all" case. It's is a refactoring mostly, as SHRINK_EMPTY is replaced by 0 by all callers of do_shrink_slab() in this patch, and all the magic will happen in further. Link: http://lkml.kernel.org/r/153063069574.1818.11037751256699341813.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 3 +++ include/linux/shrinker.h | 7 +++++-- mm/vmscan.c | 12 +++++++++--- mm/workingset.c | 3 +++ 4 files changed, 20 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index f5f96e52e0cd..7429588d6b49 100644 --- a/fs/super.c +++ b/fs/super.c @@ -144,6 +144,9 @@ static unsigned long super_cache_count(struct shrinker *shrink, total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); + if (!total_objects) + return SHRINK_EMPTY; + total_objects = vfs_pressure_ratio(total_objects); return total_objects; } diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 7ca9c18cf130..b154fd2b084c 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -34,12 +34,15 @@ struct shrink_control { }; #define SHRINK_STOP (~0UL) +#define SHRINK_EMPTY (~0UL - 1) /* * A callback you can register to apply pressure to ageable caches. * * @count_objects should return the number of freeable items in the cache. If - * there are no objects to free or the number of freeable items cannot be - * determined, it should return 0. No deadlock checks should be done during the + * there are no objects to free, it should return SHRINK_EMPTY, while 0 is + * returned in cases of the number of freeable items cannot be determined + * or shrinker should skip this cache for this time (e.g., their number + * is below shrinkable limit). 
No deadlock checks should be done during the * count callback - the shrinker relies on aggregating scan counts that couldn't * be executed due to potential deadlocks to be run at a later call when the * deadlock condition is no longer pending. diff --git a/mm/vmscan.c b/mm/vmscan.c index 2aa3cb760189..8199e1b9a204 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -456,8 +456,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, long scanned = 0, next_deferred; freeable = shrinker->count_objects(shrinker, shrinkctl); - if (freeable == 0) - return 0; + if (freeable == 0 || freeable == SHRINK_EMPTY) + return freeable; /* * copy the current shrinker scan count into a local variable @@ -596,6 +596,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, continue; ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; freed += ret; if (rwsem_is_contended(&shrinker_rwsem)) { @@ -641,6 +643,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, { struct shrinker *shrinker; unsigned long freed = 0; + int ret; if (!mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); @@ -658,7 +661,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) sc.nid = 0; - freed += do_shrink_slab(&sc, shrinker, priority); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + freed += ret; /* * Bail out if someone want to register a new shrinker to * prevent the regsitration from being stalled for long periods diff --git a/mm/workingset.c b/mm/workingset.c index cd0b2ae615e4..bc72ad029b3e 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -399,6 +399,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, } max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); + if (!nodes) + return SHRINK_EMPTY; + if (nodes <= max_nodes) return 0; return nodes - max_nodes; -- cgit v1.2.3 From 6518202970c1052148daaef9a8096711775e43a2 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 17 Aug 2018 15:48:57 -0700 Subject: mm/cma: remove unsupported gfp_mask parameter from cma_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cma_alloc() doesn't really support gfp flags other than __GFP_NOWARN, so convert gfp_mask parameter to boolean no_warn parameter. This will help to avoid giving false feeling that this function supports standard gfp flags and callers can pass __GFP_ZERO to get zeroed buffer, what has already been an issue: see commit dd65a941f6ba ("arm64: dma-mapping: clear buffers allocated with FORCE_CONTIGUOUS flag"). 
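A caller-side view of the change, using a hypothetical wrapper name: the allocation is always GFP_KERNEL inside cma_alloc() now, and the boolean only controls whether a failure message is printed.

	#include <linux/cma.h>

	static struct page *demo_cma_get(struct cma *area, size_t nr_pages,
					 unsigned int align)
	{
		/* was: cma_alloc(area, nr_pages, align, GFP_KERNEL | __GFP_NOWARN) */
		return cma_alloc(area, nr_pages, align, true /* no_warn */);
	}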
Link: http://lkml.kernel.org/r/20180709122019eucas1p2340da484acfcc932537e6014f4fd2c29~-sqTPJKij2939229392eucas1p2j@eucas1p2.samsung.com Signed-off-by: Marek Szyprowski Acked-by: Michal Hocko Acked-by: Michał Nazarewicz Acked-by: Laura Abbott Acked-by: Vlastimil Babka Reviewed-by: Christoph Hellwig Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_hv_builtin.c | 2 +- drivers/s390/char/vmcp.c | 2 +- drivers/staging/android/ion/ion_cma_heap.c | 2 +- include/linux/cma.h | 2 +- kernel/dma/contiguous.c | 3 ++- mm/cma.c | 8 ++++---- mm/cma_debug.c | 2 +- 7 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index d4a3f4da409b..fc6bb9630a9c 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -77,7 +77,7 @@ struct page *kvm_alloc_hpt_cma(unsigned long nr_pages) VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES), - GFP_KERNEL); + false); } EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma); diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 948ce82a7725..0fa1b6b1491a 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -68,7 +68,7 @@ static void vmcp_response_alloc(struct vmcp_session *session) * anymore the system won't work anyway. */ if (order > 2) - page = cma_alloc(vmcp_cma, nr_pages, 0, GFP_KERNEL); + page = cma_alloc(vmcp_cma, nr_pages, 0, false); if (page) { session->response = (char *)page_to_phys(page); session->cma_alloc = 1; diff --git a/drivers/staging/android/ion/ion_cma_heap.c b/drivers/staging/android/ion/ion_cma_heap.c index 49718c96bf9e..3fafd013d80a 100644 --- a/drivers/staging/android/ion/ion_cma_heap.c +++ b/drivers/staging/android/ion/ion_cma_heap.c @@ -39,7 +39,7 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer, if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - pages = cma_alloc(cma_heap->cma, nr_pages, align, GFP_KERNEL); + pages = cma_alloc(cma_heap->cma, nr_pages, align, false); if (!pages) return -ENOMEM; diff --git a/include/linux/cma.h b/include/linux/cma.h index bf90f0bb42bd..190184b5ff32 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -33,7 +33,7 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, const char *name, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, - gfp_t gfp_mask); + bool no_warn); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index d987dcd1bd56..19ea5d70150c 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -191,7 +191,8 @@ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask); + return cma_alloc(dev_get_cma_area(dev), count, align, + gfp_mask & __GFP_NOWARN); } /** diff --git a/mm/cma.c b/mm/cma.c index 5809bbe360d7..4cb76121a3ab 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -395,13 +395,13 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * @cma: Contiguous memory region for which the allocation is performed. 
* @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). - * @gfp_mask: GFP mask to use during compaction + * @no_warn: Avoid printing message about failed allocation * * This function allocates part of contiguous memory on specific * contiguous memory area. */ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, - gfp_t gfp_mask) + bool no_warn) { unsigned long mask, offset; unsigned long pfn = -1; @@ -447,7 +447,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, - gfp_mask); + GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); @@ -466,7 +466,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, trace_cma_alloc(pfn, page, count, align); - if (ret && !(gfp_mask & __GFP_NOWARN)) { + if (ret && !no_warn) { pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", __func__, count, ret); cma_debug_show_areas(cma); diff --git a/mm/cma_debug.c b/mm/cma_debug.c index f23467291cfb..ad6723e9d110 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -139,7 +139,7 @@ static int cma_alloc_mem(struct cma *cma, int count) if (!mem) return -ENOMEM; - p = cma_alloc(cma, count, 0, GFP_KERNEL); + p = cma_alloc(cma, count, 0, false); if (!p) { kfree(mem); return -ENOMEM; -- cgit v1.2.3 From d834c5ab83febf9624ad3b16c3c348aa1e02014c Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 17 Aug 2018 15:49:00 -0700 Subject: kernel/dma: remove unsupported gfp_mask parameter from dma_alloc_from_contiguous() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CMA memory allocator doesn't support standard gfp flags for memory allocation, so there is no point having it as a parameter for dma_alloc_from_contiguous() function. Replace it by a boolean no_warn argument, which covers all the underlaying cma_alloc() function supports. This will help to avoid giving false feeling that this function supports standard gfp flags and callers can pass __GFP_ZERO to get zeroed buffer, what has already been an issue: see commit dd65a941f6ba ("arm64: dma-mapping: clear buffers allocated with FORCE_CONTIGUOUS flag"). 
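Callers follow the same pattern as with cma_alloc(): whatever gfp mask they hold is reduced to the warning preference at the call site. A hypothetical wrapper as a sketch:

	#include <linux/dma-contiguous.h>
	#include <linux/gfp.h>

	static struct page *demo_alloc_cma_pages(struct device *dev, size_t count,
						 unsigned int order, gfp_t gfp)
	{
		/* Only __GFP_NOWARN is honoured; other flags were never supported. */
		return dma_alloc_from_contiguous(dev, count, order,
						 gfp & __GFP_NOWARN);
	}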
Link: http://lkml.kernel.org/r/20180709122020eucas1p21a71b092975cb4a3b9954ffc63f699d1~-sqUFoa-h2939329393eucas1p2Y@eucas1p2.samsung.com Signed-off-by: Marek Szyprowski Acked-by: Michał Nazarewicz Acked-by: Vlastimil Babka Reviewed-by: Christoph Hellwig Cc: Laura Abbott Cc: Michal Hocko Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/dma-mapping.c | 5 +++-- arch/arm64/mm/dma-mapping.c | 4 ++-- arch/xtensa/kernel/pci-dma.c | 2 +- drivers/iommu/amd_iommu.c | 2 +- drivers/iommu/intel-iommu.c | 3 ++- include/linux/dma-contiguous.h | 4 ++-- kernel/dma/contiguous.c | 7 +++---- kernel/dma/direct.c | 3 ++- 8 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ba0e786c952e..66566472c153 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -594,7 +594,7 @@ static void *__alloc_from_contiguous(struct device *dev, size_t size, struct page *page; void *ptr = NULL; - page = dma_alloc_from_contiguous(dev, count, order, gfp); + page = dma_alloc_from_contiguous(dev, count, order, gfp & __GFP_NOWARN); if (!page) return NULL; @@ -1299,7 +1299,8 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size, unsigned long order = get_order(size); struct page *page; - page = dma_alloc_from_contiguous(dev, count, order, gfp); + page = dma_alloc_from_contiguous(dev, count, order, + gfp & __GFP_NOWARN); if (!page) goto error; diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 61e93f0b5482..072c51fb07d7 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -355,7 +355,7 @@ static int __init atomic_pool_init(void) if (dev_get_cma_area(NULL)) page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, GFP_KERNEL); + pool_size_order, false); else page = alloc_pages(GFP_DMA32, pool_size_order); @@ -573,7 +573,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size, struct page *page; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, - get_order(size), gfp); + get_order(size), gfp & __GFP_NOWARN); if (!page) return NULL; diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c index 392b4a80ebc2..a02dc563d290 100644 --- a/arch/xtensa/kernel/pci-dma.c +++ b/arch/xtensa/kernel/pci-dma.c @@ -137,7 +137,7 @@ static void *xtensa_dma_alloc(struct device *dev, size_t size, if (gfpflags_allow_blocking(flag)) page = dma_alloc_from_contiguous(dev, count, get_order(size), - flag); + flag & __GFP_NOWARN); if (!page) page = alloc_pages(flag, get_order(size)); diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 596b95c50051..60b2eab29cd8 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2620,7 +2620,7 @@ static void *alloc_coherent(struct device *dev, size_t size, return NULL; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, - get_order(size), flag); + get_order(size), flag & __GFP_NOWARN); if (!page) return NULL; } diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 115ff26e9ced..6a237d18fabf 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3758,7 +3758,8 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, if (gfpflags_allow_blocking(flags)) { unsigned int count = size >> PAGE_SHIFT; - page = dma_alloc_from_contiguous(dev, count, order, flags); + page = dma_alloc_from_contiguous(dev, count, order, + flags & __GFP_NOWARN); if (page && 
iommu_no_mapping(dev) && page_to_phys(page) + size > dev->coherent_dma_mask) { dma_release_from_contiguous(dev, page, count); diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index 3c5a4cb3eb95..f247e8aa5e3d 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -112,7 +112,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size, } struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int order, gfp_t gfp_mask); + unsigned int order, bool no_warn); bool dma_release_from_contiguous(struct device *dev, struct page *pages, int count); @@ -145,7 +145,7 @@ int dma_declare_contiguous(struct device *dev, phys_addr_t size, static inline struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int order, gfp_t gfp_mask) + unsigned int order, bool no_warn) { return NULL; } diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 19ea5d70150c..286d82329eb0 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -178,7 +178,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * @dev: Pointer to device for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). - * @gfp_mask: GFP flags to use for this allocation. + * @no_warn: Avoid printing message about failed allocation. * * This function allocates memory buffer for specified device. It uses * device specific contiguous memory area if available or the default @@ -186,13 +186,12 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * function. */ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int align, gfp_t gfp_mask) + unsigned int align, bool no_warn) { if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return cma_alloc(dev_get_cma_area(dev), count, align, - gfp_mask & __GFP_NOWARN); + return cma_alloc(dev_get_cma_area(dev), count, align, no_warn); } /** diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index c2860c5a9e96..1c35b7b945d0 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -78,7 +78,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, again: /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(gfp)) { - page = dma_alloc_from_contiguous(dev, count, page_order, gfp); + page = dma_alloc_from_contiguous(dev, count, page_order, + gfp & __GFP_NOWARN); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_release_from_contiguous(dev, page, count); page = NULL; -- cgit v1.2.3 From 40d18ebffb3974272a920c41f2d74431152cae98 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 17 Aug 2018 15:49:07 -0700 Subject: mm/hugetlb: remove gigantic page support for HIGHMEM This reverts ee8f248d266e ("hugetlb: add phys addr to struct huge_bootmem_page"). At one time powerpc used this field and supporting code. However that was removed with commit 79cc38ded1e1 ("powerpc/mm/hugetlb: Add support for reserving gigantic huge pages via kernel command line"). There are no users of this field and supporting code, so remove it. Link: http://lkml.kernel.org/r/20180711195913.1294-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: "Aneesh Kumar K . 
V" Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Cannon Matthews Cc: Becky Bruce Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 --- mm/hugetlb.c | 9 +-------- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 36fa6a2a82e3..c39d9170a8a0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -348,9 +348,6 @@ struct hstate { struct huge_bootmem_page { struct list_head list; struct hstate *hstate; -#ifdef CONFIG_HIGHMEM - phys_addr_t phys; -#endif }; struct page *alloc_huge_page(struct vm_area_struct *vma, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f1bcaae0d73a..4cea30ac5033 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2139,16 +2139,9 @@ static void __init gather_bootmem_prealloc(void) struct huge_bootmem_page *m; list_for_each_entry(m, &huge_boot_pages, list) { + struct page *page = virt_to_page(m); struct hstate *h = m->hstate; - struct page *page; -#ifdef CONFIG_HIGHMEM - page = pfn_to_page(m->phys >> PAGE_SHIFT); - memblock_free_late(__pa(m), - sizeof(struct huge_bootmem_page)); -#else - page = virt_to_page(m); -#endif WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); WARN_ON(PageReserved(page)); -- cgit v1.2.3 From 35fd1eb1e8212c02f6eae24335a9e5b80f9519b4 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 17 Aug 2018 15:49:21 -0700 Subject: mm/sparse: abstract sparse buffer allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "sparse_init rewrite", v6. In sparse_init() we allocate two large buffers to temporarily hold the usemap and memmap for the whole machine. However, we can avoid doing that if we change sparse_init() to operate on a per-node basis instead of doing it for the whole machine beforehand. As shown by Baoquan (http://lkml.kernel.org/r/20180628062857.29658-1-bhe@redhat.com), the buffers are large enough to stop the machine from booting on small-memory systems. Another benefit of these changes is that they also obsolete CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER. This patch (of 5): When struct pages are allocated for the sparse-vmemmap VA layout, we first try to allocate one large buffer and then, if that fails, allocate struct pages for each section as we go. The code that allocates the buffer uses global variables and is spread across several call sites. Clean up the code by introducing three functions to handle the global buffer: sparse_buffer_init() initializes the buffer, sparse_buffer_fini() frees the remaining part of the buffer, and sparse_buffer_alloc() allocates from the buffer, returning NULL if the buffer is empty. Define these functions in sparse.c instead of sparse-vmemmap.c because later we will use them for non-vmemmap sparse allocations as well. [akpm@linux-foundation.org: use PTR_ALIGN()] [akpm@linux-foundation.org: s/BUG_ON/WARN_ON/] Link: http://lkml.kernel.org/r/20180712203730.8703-2-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Tested-by: Michael Ellerman [powerpc] Reviewed-by: Oscar Salvador Tested-by: Oscar Salvador Cc: Pasha Tatashin Cc: Steven Sistare Cc: Daniel Jordan Cc: "Kirill A.
Shutemov" Cc: Michal Hocko Cc: Dan Williams Cc: Jan Kara Cc: Jérôme Glisse Cc: Souptick Joarder Cc: Baoquan He Cc: Greg Kroah-Hartman Cc: Vlastimil Babka Cc: Wei Yang Cc: Dave Hansen Cc: David Rientjes Cc: Ingo Molnar Cc: Abdul Haleem Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ mm/sparse-vmemmap.c | 40 ++++++---------------------------------- mm/sparse.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 54 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2fb32d1561eb..4ace5d50a892 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2671,6 +2671,10 @@ void sparse_mem_maps_populate_node(struct page **map_map, unsigned long map_count, int nodeid); +unsigned long __init section_map_size(void); +void sparse_buffer_init(unsigned long size, int nid); +void sparse_buffer_fini(void); +void *sparse_buffer_alloc(unsigned long size); struct page *sparse_mem_map_populate(unsigned long pnum, int nid, struct vmem_altmap *altmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 95e2c7638a5c..b05c7663c640 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -43,12 +43,9 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long goal) { return memblock_virt_alloc_try_nid_raw(size, align, goal, - BOOTMEM_ALLOC_ACCESSIBLE, node); + BOOTMEM_ALLOC_ACCESSIBLE, node); } -static void *vmemmap_buf; -static void *vmemmap_buf_end; - void * __meminit vmemmap_alloc_block(unsigned long size, int node) { /* If the main allocator is up use that, fallback to bootmem. */ @@ -76,18 +73,10 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) /* need to make sure size is all the same during early stage */ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) { - void *ptr; - - if (!vmemmap_buf) - return vmemmap_alloc_block(size, node); - - /* take the from buf */ - ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); - if (ptr + size > vmemmap_buf_end) - return vmemmap_alloc_block(size, node); - - vmemmap_buf = ptr + size; + void *ptr = sparse_buffer_alloc(size); + if (!ptr) + ptr = vmemmap_alloc_block(size, node); return ptr; } @@ -279,19 +268,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, unsigned long map_count, int nodeid) { unsigned long pnum; - unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; - void *vmemmap_buf_start; int nr_consumed_maps = 0; - size = ALIGN(size, PMD_SIZE); - vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, - PMD_SIZE, __pa(MAX_DMA_ADDRESS)); - - if (vmemmap_buf_start) { - vmemmap_buf = vmemmap_buf_start; - vmemmap_buf_end = vmemmap_buf_start + size * map_count; - } - + sparse_buffer_init(section_map_size() * map_count, nodeid); for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) continue; @@ -303,12 +282,5 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); } - - if (vmemmap_buf_start) { - /* need to free left buf */ - memblock_free_early(__pa(vmemmap_buf), - vmemmap_buf_end - vmemmap_buf); - vmemmap_buf = NULL; - vmemmap_buf_end = NULL; - } + sparse_buffer_fini(); } diff --git a/mm/sparse.c b/mm/sparse.c index 2ea8b3dbd0df..9a0a5f598469 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -400,7 +400,14 @@ static void __init 
sparse_early_usemaps_alloc_node(void *data, } } -#ifndef CONFIG_SPARSEMEM_VMEMMAP +#ifdef CONFIG_SPARSEMEM_VMEMMAP +unsigned long __init section_map_size(void) + +{ + return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE); +} + +#else struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, struct vmem_altmap *altmap) { @@ -457,6 +464,42 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ +static void *sparsemap_buf __meminitdata; +static void *sparsemap_buf_end __meminitdata; + +void __init sparse_buffer_init(unsigned long size, int nid) +{ + WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */ + sparsemap_buf = + memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); + sparsemap_buf_end = sparsemap_buf + size; +} + +void __init sparse_buffer_fini(void) +{ + unsigned long size = sparsemap_buf_end - sparsemap_buf; + + if (sparsemap_buf && size > 0) + memblock_free_early(__pa(sparsemap_buf), size); + sparsemap_buf = NULL; +} + +void * __meminit sparse_buffer_alloc(unsigned long size) +{ + void *ptr = NULL; + + if (sparsemap_buf) { + ptr = PTR_ALIGN(sparsemap_buf, size); + if (ptr + size > sparsemap_buf_end) + ptr = NULL; + else + sparsemap_buf = ptr + size; + } + return ptr; +} + #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER static void __init sparse_early_mem_maps_alloc_node(void *data, unsigned long pnum_begin, -- cgit v1.2.3 From afda57bc13410459fc957e93341ade7bebca36e2 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 17 Aug 2018 15:49:30 -0700 Subject: mm/sparse: move buffer init/fini to the common place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that both variants of sparse memory use the same buffers to populate memory map, we can move sparse_buffer_init()/sparse_buffer_fini() to the common place. Link: http://lkml.kernel.org/r/20180712203730.8703-4-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Tested-by: Michael Ellerman [powerpc] Tested-by: Oscar Salvador Reviewed-by: Andrew Morton Cc: Pasha Tatashin Cc: Abdul Haleem Cc: Baoquan He Cc: Daniel Jordan Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jan Kara Cc: Jérôme Glisse Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Souptick Joarder Cc: Steven Sistare Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 --- mm/sparse-vmemmap.c | 2 -- mm/sparse.c | 14 +++++++------- 3 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ace5d50a892..48040510df05 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2671,9 +2671,6 @@ void sparse_mem_maps_populate_node(struct page **map_map, unsigned long map_count, int nodeid); -unsigned long __init section_map_size(void); -void sparse_buffer_init(unsigned long size, int nid); -void sparse_buffer_fini(void); void *sparse_buffer_alloc(unsigned long size); struct page *sparse_mem_map_populate(unsigned long pnum, int nid, struct vmem_altmap *altmap); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index b05c7663c640..cd15f3d252c3 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -270,7 +270,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, unsigned long pnum; int nr_consumed_maps = 0; - sparse_buffer_init(section_map_size() * map_count, nodeid); for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) continue; @@ -282,5 +281,4 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); } - sparse_buffer_fini(); } diff --git a/mm/sparse.c b/mm/sparse.c index db4867b62fff..20ca292d8f11 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -401,14 +401,14 @@ static void __init sparse_early_usemaps_alloc_node(void *data, } #ifdef CONFIG_SPARSEMEM_VMEMMAP -unsigned long __init section_map_size(void) +static unsigned long __init section_map_size(void) { return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE); } #else -unsigned long __init section_map_size(void) +static unsigned long __init section_map_size(void) { return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); } @@ -433,10 +433,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, unsigned long map_count, int nodeid) { unsigned long pnum; - unsigned long size = section_map_size(); int nr_consumed_maps = 0; - sparse_buffer_init(size * map_count, nodeid); for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) continue; @@ -447,14 +445,13 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); } - sparse_buffer_fini(); } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ static void *sparsemap_buf __meminitdata; static void *sparsemap_buf_end __meminitdata; -void __init sparse_buffer_init(unsigned long size, int nid) +static void __init sparse_buffer_init(unsigned long size, int nid) { WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? 
*/ sparsemap_buf = @@ -464,7 +461,7 @@ void __init sparse_buffer_init(unsigned long size, int nid) sparsemap_buf_end = sparsemap_buf + size; } -void __init sparse_buffer_fini(void) +static void __init sparse_buffer_fini(void) { unsigned long size = sparsemap_buf_end - sparsemap_buf; @@ -494,8 +491,11 @@ static void __init sparse_early_mem_maps_alloc_node(void *data, unsigned long map_count, int nodeid) { struct page **map_map = (struct page **)data; + + sparse_buffer_init(section_map_size() * map_count, nodeid); sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, map_count, nodeid); + sparse_buffer_fini(); } #else static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) -- cgit v1.2.3 From 2a3cb8baef71e4dad4a6ec17f5f0db9e05f46a01 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 17 Aug 2018 15:49:37 -0700 Subject: mm/sparse: delete old sparse_init and enable new one MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename new_sparse_init() to sparse_init(), which enables it. Delete the old sparse_init() and all the code that became obsolete with it. [pasha.tatashin@oracle.com: remove unused sparse_mem_maps_populate_node()] Link: http://lkml.kernel.org/r/20180716174447.14529-6-pasha.tatashin@oracle.com Link: http://lkml.kernel.org/r/20180712203730.8703-6-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Tested-by: Michael Ellerman [powerpc] Tested-by: Oscar Salvador Reviewed-by: Oscar Salvador Cc: Pasha Tatashin Cc: Abdul Haleem Cc: Baoquan He Cc: Daniel Jordan Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jan Kara Cc: Jérôme Glisse Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Souptick Joarder Cc: Steven Sistare Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 -- mm/Kconfig | 4 - mm/sparse-vmemmap.c | 21 ----- mm/sparse.c | 237 +--------------------------------------------------- 4 files changed, 1 insertion(+), 267 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 48040510df05..a3cae495f9ce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2665,12 +2665,6 @@ extern int randomize_va_space; const char * arch_vma_name(struct vm_area_struct *vma); void print_vma_addr(char *prefix, unsigned long rip); -void sparse_mem_maps_populate_node(struct page **map_map, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, - int nodeid); - void *sparse_buffer_alloc(unsigned long size); struct page *sparse_mem_map_populate(unsigned long pnum, int nid, struct vmem_altmap *altmap); diff --git a/mm/Kconfig b/mm/Kconfig index 08d8399bb93b..adfeae4decb4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -118,10 +118,6 @@ config SPARSEMEM_EXTREME config SPARSEMEM_VMEMMAP_ENABLE bool -config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - def_bool y - depends on SPARSEMEM && X86_64 - config SPARSEMEM_VMEMMAP bool "Sparse Memory virtual memmap" depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index cd15f3d252c3..8301293331a2 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -261,24 +261,3 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid, return map; } -
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - - map_map[nr_consumed_maps] = - sparse_mem_map_populate(pnum, nodeid, NULL); - if (map_map[nr_consumed_maps++]) - continue; - pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", - __func__); - } -} diff --git a/mm/sparse.c b/mm/sparse.c index 248d5d7bbf55..10b07eea9a6e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -205,12 +205,6 @@ static inline unsigned long first_present_section_nr(void) return next_present_section_nr(-1); } -/* - * Record how many memory sections are marked as present - * during system bootup. - */ -static int __initdata nr_present_sections; - /* Record a memory area against a node. */ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -240,7 +234,6 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) ms->section_mem_map = sparse_encode_early_nid(nid) | SECTION_IS_ONLINE; section_mark_present(ms); - nr_present_sections++; } } } @@ -377,37 +370,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static void __init sparse_early_usemaps_alloc_node(void *data, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long usemap_count, int nodeid) -{ - void *usemap; - unsigned long pnum; - unsigned long **usemap_map = (unsigned long **)data; - int size = usemap_size(); - int nr_consumed_maps = 0; - - usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), - size * usemap_count); - if (!usemap) { - pr_warn("%s: allocation failed\n", __func__); - return; - } - - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - usemap_map[nr_consumed_maps] = usemap; - usemap += size; - check_usemap_section_nr(nodeid, usemap_map[nr_consumed_maps]); - nr_consumed_maps++; - } -} - #ifdef CONFIG_SPARSEMEM_VMEMMAP static unsigned long __init section_map_size(void) - { return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE); } @@ -432,25 +396,6 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, BOOTMEM_ALLOC_ACCESSIBLE, nid); return map; } -void __init sparse_mem_maps_populate_node(struct page **map_map, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, int nodeid) -{ - unsigned long pnum; - int nr_consumed_maps = 0; - - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - map_map[nr_consumed_maps] = - sparse_mem_map_populate(pnum, nodeid, NULL); - if (map_map[nr_consumed_maps++]) - continue; - pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", - __func__); - } -} #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ static void *sparsemap_buf __meminitdata; @@ -489,190 +434,10 @@ void * __meminit sparse_buffer_alloc(unsigned long size) return ptr; } -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER -static void __init sparse_early_mem_maps_alloc_node(void *data, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, int nodeid) -{ - struct page **map_map = (struct page **)data; - - sparse_buffer_init(section_map_size() * map_count, nodeid); - sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, - map_count, nodeid); - sparse_buffer_fini(); -} -#else -static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) -{ - struct page *map; - struct mem_section *ms = __nr_to_section(pnum); - int nid = 
sparse_early_nid(ms); - - map = sparse_mem_map_populate(pnum, nid, NULL); - if (map) - return map; - - pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", - __func__); - return NULL; -} -#endif - void __weak __meminit vmemmap_populate_print_last(void) { } -/** - * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap - * @map: usemap_map for pageblock flags or mmap_map for vmemmap - * @unit_size: size of map unit - */ -static void __init alloc_usemap_and_memmap(void (*alloc_func) - (void *, unsigned long, unsigned long, - unsigned long, int), void *data, - int data_unit_size) -{ - unsigned long pnum; - unsigned long map_count; - int nodeid_begin = 0; - unsigned long pnum_begin = 0; - - for_each_present_section_nr(0, pnum) { - struct mem_section *ms; - - ms = __nr_to_section(pnum); - nodeid_begin = sparse_early_nid(ms); - pnum_begin = pnum; - break; - } - map_count = 1; - for_each_present_section_nr(pnum_begin + 1, pnum) { - struct mem_section *ms; - int nodeid; - - ms = __nr_to_section(pnum); - nodeid = sparse_early_nid(ms); - if (nodeid == nodeid_begin) { - map_count++; - continue; - } - /* ok, we need to take cake of from pnum_begin to pnum - 1*/ - alloc_func(data, pnum_begin, pnum, - map_count, nodeid_begin); - /* new start, update count etc*/ - nodeid_begin = nodeid; - pnum_begin = pnum; - data += map_count * data_unit_size; - map_count = 1; - } - /* ok, last chunk */ - alloc_func(data, pnum_begin, __highest_present_section_nr+1, - map_count, nodeid_begin); -} - -/* - * Allocate the accumulated non-linear sections, allocate a mem_map - * for each and record the physical to section mapping. - */ -void __init sparse_init(void) -{ - unsigned long pnum; - struct page *map; - unsigned long *usemap; - unsigned long **usemap_map; - int size; - int nr_consumed_maps = 0; -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - int size2; - struct page **map_map; -#endif - - /* see include/linux/mmzone.h 'struct mem_section' definition */ - BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); - - /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ - set_pageblock_order(); - - /* - * map is using big page (aka 2M in x86 64 bit) - * usemap is less one page (aka 24 bytes) - * so alloc 2M (with 2M align) and 24 bytes in turn will - * make next 2M slip to one more 2M later. - * then in big system, the memory will have a lot of holes... - * here try to allocate 2M pages continuously. - * - * powerpc need to call sparse_init_one_section right after each - * sparse_early_mem_map_alloc, so allocate usemap_map at first. - */ - size = sizeof(unsigned long *) * nr_present_sections; - usemap_map = memblock_virt_alloc(size, 0); - if (!usemap_map) - panic("can not allocate usemap_map\n"); - alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, - (void *)usemap_map, - sizeof(usemap_map[0])); - -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - size2 = sizeof(struct page *) * nr_present_sections; - map_map = memblock_virt_alloc(size2, 0); - if (!map_map) - panic("can not allocate map_map\n"); - alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, - (void *)map_map, - sizeof(map_map[0])); -#endif - - /* - * The number of present sections stored in nr_present_sections - * are kept the same since mem sections are marked as present in - * memory_present(). In this for loop, we need check which sections - * failed to allocate memmap or usemap, then clear its - * ->section_mem_map accordingly. 
During this process, we need - * increase 'nr_consumed_maps' whether its allocation of memmap - * or usemap failed or not, so that after we handle the i-th - * memory section, can get memmap and usemap of (i+1)-th section - * correctly. - */ - for_each_present_section_nr(0, pnum) { - struct mem_section *ms; - - if (nr_consumed_maps >= nr_present_sections) { - pr_err("nr_consumed_maps goes beyond nr_present_sections\n"); - break; - } - ms = __nr_to_section(pnum); - usemap = usemap_map[nr_consumed_maps]; - if (!usemap) { - ms->section_mem_map = 0; - nr_consumed_maps++; - continue; - } - -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - map = map_map[nr_consumed_maps]; -#else - map = sparse_early_mem_map_alloc(pnum); -#endif - if (!map) { - ms->section_mem_map = 0; - nr_consumed_maps++; - continue; - } - - sparse_init_one_section(__nr_to_section(pnum), pnum, map, - usemap); - nr_consumed_maps++; - } - - vmemmap_populate_print_last(); - -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - memblock_free_early(__pa(map_map), size2); -#endif - memblock_free_early(__pa(usemap_map), size); -} - /* * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end) * And number of present sections in this node is map_count. @@ -726,7 +491,7 @@ failed: * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. */ -void __init new_sparse_init(void) +void __init sparse_init(void) { unsigned long pnum_begin = first_present_section_nr(); int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin)); -- cgit v1.2.3 From 6b51e88199ca4f75ff647eff28efd30bfcb08dc4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 Aug 2018 15:49:55 -0700 Subject: mm/list_lru: introduce list_lru_shrink_walk_irq() Provide list_lru_shrink_walk_irq() and let it behave like list_lru_walk_one() except that it locks the spinlock with spin_lock_irq(). This is used by scan_shadow_nodes() because its lock nests within the IRQ-safe i_pages lock. This change allows the use of proper locking primitives instead of a hand-crafted local_irq_disable() plus spin_lock(). There is no EXPORT_SYMBOL provided because the current user is in-kernel only. Add list_lru_shrink_walk_irq() which acquires the spinlock with the proper locking primitives. Link: http://lkml.kernel.org/r/20180716111921.5365-5-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Vladimir Davydov Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 25 +++++++++++++++++++++++++ mm/list_lru.c | 15 +++++++++++++++ mm/workingset.c | 8 ++------ 3 files changed, 42 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index d9c16f2f2f00..aa5efd9351eb 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -166,6 +166,23 @@ unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); +/** + * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items. + * @lru: the lru pointer. + * @nid: the node id to scan from. + * @memcg: the cgroup to scan from. + * @isolate: callback function that is resposible for deciding what to do with + * the item currently being scanned + * @cb_arg: opaque type that will be passed to @isolate + * @nr_to_walk: how many items to scan.
+ * + * Same as @list_lru_walk_one except that the spinlock is acquired with + * spin_lock_irq(). + */ +unsigned long list_lru_walk_one_irq(struct list_lru *lru, + int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk); unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); @@ -178,6 +195,14 @@ list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc, &sc->nr_to_scan); } +static inline unsigned long +list_lru_shrink_walk_irq(struct list_lru *lru, struct shrink_control *sc, + list_lru_walk_cb isolate, void *cb_arg) +{ + return list_lru_walk_one_irq(lru, sc->nid, sc->memcg, isolate, cb_arg, + &sc->nr_to_scan); +} + static inline unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate, void *cb_arg, unsigned long nr_to_walk) diff --git a/mm/list_lru.c b/mm/list_lru.c index f5c6a2d1ea66..5b30625fd365 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -282,6 +282,21 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, } EXPORT_SYMBOL_GPL(list_lru_walk_one); +unsigned long +list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + struct list_lru_node *nlru = &lru->node[nid]; + unsigned long ret; + + spin_lock_irq(&nlru->lock); + ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg, + nr_to_walk); + spin_unlock_irq(&nlru->lock); + return ret; +} + unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) diff --git a/mm/workingset.c b/mm/workingset.c index bc72ad029b3e..4516dd790129 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -483,13 +483,9 @@ out: static unsigned long scan_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { - unsigned long ret; - /* list_lru lock nests inside the IRQ-safe i_pages lock */ - local_irq_disable(); - ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); - local_irq_enable(); - return ret; + return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, + NULL); } static struct shrinker workingset_shadow_shrinker = { -- cgit v1.2.3 From ddbf369c0a33924f76d092985bd20d9310f43d7f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 17 Aug 2018 15:49:58 -0700 Subject: mm, vmacache: hash addresses based on pmd When perf profiling a wide variety of different workloads, it was found that vmacache_find() had higher than expected cost: up to 0.08% of cpu utilization in some cases. This was found to rival other core VM functions such as alloc_pages_vma() with thp enabled and default mempolicy, and the conditionals in __get_vma_policy(). VMACACHE_HASH() determines which of the four per-task_struct slots a vma is cached for a particular address. This currently depends on the pfn, so pfn 5212 occupies a different vmacache slot than its neighboring pfn 5213. vmacache_find() iterates through all four of current's vmacache slots when looking up an address. Hashing based on pfn, an address has ~1/VMACACHE_SIZE chance of being cached in the first vmacache slot, or about 25%, *if* the vma is cached. This patch hashes an address by its pmd instead of pte to optimize for workloads with good spatial locality. This results in a higher probability of vmas being cached in the first slot that is checked: normally ~70% on the same workloads instead of 25%. 
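A small, self-contained illustration of the hashing argument above (an editorial sketch in plain user-space C, not code from the patch). The shift and cache-size values are assumptions matching a common x86-64 configuration (PAGE_SHIFT 12, PMD_SHIFT 21, VMACACHE_SIZE 4): two addresses one page apart land in different slots with a page-granular hash but in the same slot with a pmd-granular one, which is why the first probed slot hits more often for workloads with spatial locality.

    /*
     * Stand-alone demo of the slot selection, with assumed constants.
     * A page-granular hash spreads neighbouring pages across slots;
     * a pmd-granular hash keeps a whole 2MB region in one slot.
     */
    #include <stdio.h>

    #define VMACACHE_SIZE   4                       /* assumption */
    #define VMACACHE_MASK   (VMACACHE_SIZE - 1)
    #define PAGE_SHIFT      12                      /* typical x86-64 value */
    #define PMD_SHIFT       21                      /* typical x86-64 value */

    static unsigned int slot(unsigned long addr, unsigned int shift)
    {
            return (addr >> shift) & VMACACHE_MASK;
    }

    int main(void)
    {
            unsigned long a = 0x7f1234561000UL;     /* arbitrary address */
            unsigned long b = a + 4096;             /* the next page */

            printf("page-based slots: %u and %u\n",
                   slot(a, PAGE_SHIFT), slot(b, PAGE_SHIFT));
            printf("pmd-based slots:  %u and %u\n",
                   slot(a, PMD_SHIFT), slot(b, PMD_SHIFT));
            return 0;
    }
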
[rientjes@google.com: various updates] Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1807231532290.109445@chino.kir.corp.google.com Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1807091749150.114630@chino.kir.corp.google.com Signed-off-by: David Rientjes Reviewed-by: Andrew Morton Cc: Davidlohr Bueso Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmacache.h | 6 ------ mm/vmacache.c | 38 +++++++++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h index a5b3aa8d281f..3e9a963edd6a 100644 --- a/include/linux/vmacache.h +++ b/include/linux/vmacache.h @@ -5,12 +5,6 @@ #include #include -/* - * Hash based on the page number. Provides a good hit rate for - * workloads with good locality and those with random accesses as well. - */ -#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK) - static inline void vmacache_flush(struct task_struct *tsk) { memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas)); diff --git a/mm/vmacache.c b/mm/vmacache.c index db7596eb6132..ea517bef7dc5 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -6,6 +6,18 @@ #include #include #include +#include + +/* + * Hash based on the pmd of addr if configured with MMU, which provides a good + * hit rate for workloads with spatial locality. Otherwise, use pages. + */ +#ifdef CONFIG_MMU +#define VMACACHE_SHIFT PMD_SHIFT +#else +#define VMACACHE_SHIFT PAGE_SHIFT +#endif +#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK) /* * Flush vma caches for threads that share a given mm. @@ -87,6 +99,7 @@ static bool vmacache_valid(struct mm_struct *mm) struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) { + int idx = VMACACHE_HASH(addr); int i; count_vm_vmacache_event(VMACACHE_FIND_CALLS); @@ -95,16 +108,20 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) return NULL; for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[i]; + struct vm_area_struct *vma = current->vmacache.vmas[idx]; - if (!vma) - continue; - if (WARN_ON_ONCE(vma->vm_mm != mm)) - break; - if (vma->vm_start <= addr && vma->vm_end > addr) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; + if (vma) { +#ifdef CONFIG_DEBUG_VM_VMACACHE + if (WARN_ON_ONCE(vma->vm_mm != mm)) + break; +#endif + if (vma->vm_start <= addr && vma->vm_end > addr) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); + return vma; + } } + if (++idx == VMACACHE_SIZE) + idx = 0; } return NULL; @@ -115,6 +132,7 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, unsigned long start, unsigned long end) { + int idx = VMACACHE_HASH(start); int i; count_vm_vmacache_event(VMACACHE_FIND_CALLS); @@ -123,12 +141,14 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, return NULL; for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[i]; + struct vm_area_struct *vma = current->vmacache.vmas[idx]; if (vma && vma->vm_start == start && vma->vm_end == end) { count_vm_vmacache_event(VMACACHE_FIND_HITS); return vma; } + if (++idx == VMACACHE_SIZE) + idx = 0; } return NULL; -- cgit v1.2.3
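Tying off the list_lru change a few commits above: a hedged sketch of the calling pattern list_lru_shrink_walk_irq() enables for a shrinker whose list_lru lock nests inside an IRQ-safe lock, mirroring the scan_shadow_nodes() conversion shown in the workingset.c hunk. The lru, callback and shrinker names below are placeholders, not symbols from the patch.

    /*
     * Illustrative only: a shrinker scan callback that lets the helper take
     * the list_lru lock with spin_lock_irq() instead of open-coding
     * local_irq_disable() + spin_lock().
     */
    #include <linux/list_lru.h>
    #include <linux/shrinker.h>
    #include <linux/spinlock.h>

    static struct list_lru my_lru;          /* placeholder list_lru */

    static enum lru_status my_isolate(struct list_head *item,
                                      struct list_lru_one *lru,
                                      spinlock_t *lru_lock, void *arg)
    {
            /* runs with lru_lock held and interrupts disabled */
            return LRU_ROTATE;              /* keep the item for now */
    }

    static unsigned long my_scan(struct shrinker *shrinker,
                                 struct shrink_control *sc)
    {
            /* list_lru lock nests inside an IRQ-safe lock, so use the _irq walk */
            return list_lru_shrink_walk_irq(&my_lru, sc, my_isolate, NULL);
    }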