From c66b0a052c6448d05289b82d789304f1361e995d Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 20 May 2024 11:52:48 -0700 Subject: selftests: mm: check return values Check return value and return error/skip the tests. Link: https://lkml.kernel.org/r/20240520185248.1801945-1-usama.anjum@collabora.com Fixes: 46fd75d4a3c9 ("selftests: mm: add pagemap ioctl tests") Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 2d785aca72a5..bcc73b4e805c 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1567,8 +1567,10 @@ int main(int argc, char *argv[]) /* 7. File Hugetlb testing */ mem_size = 2*1024*1024; fd = memfd_create("uffd-test", MFD_HUGETLB | MFD_NOEXEC_SEAL); + if (fd < 0) + ksft_exit_fail_msg("uffd-test creation failed %d %s\n", errno, strerror(errno)); mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (mem) { + if (mem != MAP_FAILED) { wp_init(mem, mem_size); wp_addr_range(mem, mem_size); -- cgit v1.2.3 From 85e8bcb4190efb0b4443b637fd7a62ca2f05de6f Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Wed, 22 May 2024 12:34:34 +0530 Subject: selftests/mm: va_high_addr_switch: reduce test noise Patch series "Restructure va_high_addr_switch". The va_high_addr_switch memory selftest tests out some corner cases related to allocation and page/hugepage faulting around the switch boundary. Currently, the page size and hugepage size have been statically defined. Post FEAT_LPA2, the Aarch64 Linux kernel adds support for 4k and 16k translation granules on higher addresses; we restructure the test to support the same. In addition, we avoid invocation of the binary twice, in the shell script, to reduce test noise. This patch (of 2): When invoking the binary with "--run-hugetlb" flag, the testcases involving the base page are anyways going to be run. Therefore, remove duplication by invoking the binary only once. Link: https://lkml.kernel.org/r/20240522070435.773918-1-dev.jain@arm.com Link: https://lkml.kernel.org/r/20240522070435.773918-2-dev.jain@arm.com Signed-off-by: Dev Jain Cc: Anshuman Khandual Cc: Kirill A. Shutemov Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.sh | 4 ---- 1 file changed, 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index a0a75f302904..2c725773cd79 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -57,8 +57,4 @@ check_test_requirements() } check_test_requirements -./va_high_addr_switch - -# In order to run hugetlb testcases, "--run-hugetlb" must be appended -# to the binary. ./va_high_addr_switch --run-hugetlb -- cgit v1.2.3 From e4a4ba4154199ca9728c85589e2889389ef33c2a Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Wed, 22 May 2024 12:34:35 +0530 Subject: selftests/mm: va_high_addr_switch: dynamically initialize testcases to enable LPA2 testing Post FEAT_LPA2, the Aarch64 Linux kernel extends higher address support to 4K and 16K translation granules. To support testing this out, we need to do away with static initialization of page size, while still maintaining the nice array of testcases; this can be achieved by initializing and populating the array as a stack variable, and filling in the page size and hugepage size at runtime. Link: https://lkml.kernel.org/r/20240522070435.773918-3-dev.jain@arm.com Signed-off-by: Dev Jain Suggested-by: Andrew Morton Cc: Anshuman Khandual Cc: Kirill A. Shutemov Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.c | 454 ++++++++++++----------- 1 file changed, 232 insertions(+), 222 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c index cfbc501290d3..fa7eabfaf841 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.c +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -9,26 +9,9 @@ #include #include +#include "vm_util.h" #include "../kselftest.h" -#ifdef __powerpc64__ -#define PAGE_SIZE (64 << 10) -/* - * This will work with 16M and 2M hugepage size - */ -#define HUGETLB_SIZE (16 << 20) -#elif __aarch64__ -/* - * The default hugepage size for 64k base pagesize - * is 512MB. - */ -#define PAGE_SIZE (64 << 10) -#define HUGETLB_SIZE (512 << 20) -#else -#define PAGE_SIZE (4 << 10) -#define HUGETLB_SIZE (2 << 20) -#endif - /* * The hint addr value is used to allocate addresses * beyond the high address switch boundary. @@ -37,18 +20,8 @@ #define ADDR_MARK_128TB (1UL << 47) #define ADDR_MARK_256TB (1UL << 48) -#define HIGH_ADDR_128TB ((void *) (1UL << 48)) -#define HIGH_ADDR_256TB ((void *) (1UL << 49)) - -#define LOW_ADDR ((void *) (1UL << 30)) - -#ifdef __aarch64__ -#define ADDR_SWITCH_HINT ADDR_MARK_256TB -#define HIGH_ADDR HIGH_ADDR_256TB -#else -#define ADDR_SWITCH_HINT ADDR_MARK_128TB -#define HIGH_ADDR HIGH_ADDR_128TB -#endif +#define HIGH_ADDR_128TB (1UL << 48) +#define HIGH_ADDR_256TB (1UL << 49) struct testcase { void *addr; @@ -59,195 +32,230 @@ struct testcase { unsigned int keep_mapped:1; }; -static struct testcase testcases[] = { - { - /* - * If stack is moved, we could possibly allocate - * this at the requested address. - */ - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", - .low_addr_required = 1, - }, - { - /* - * Unless MAP_FIXED is specified, allocation based on hint - * addr is never at requested address or above it, which is - * beyond high address switch boundary in this case. Instead, - * a suitable allocation is found in lower address space. - */ - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))", - .low_addr_required = 1, - }, - { - /* - * Exact mapping at high address switch boundary, should - * be obtained even without MAP_FIXED as area is free. - */ - .addr = ((void *)(ADDR_SWITCH_HINT)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", - }, - { - .addr = NULL, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED)", - }, - { - .addr = (void *) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1)", - .keep_mapped = 1, - }, - { - .addr = (void *) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1) again", - }, - { - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", - .low_addr_required = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = ((void *)(ADDR_SWITCH_HINT)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", - }, -}; +static struct testcase *testcases; +static struct testcase *hugetlb_testcases; +static int sz_testcases, sz_hugetlb_testcases; +static unsigned long switch_hint; -static struct testcase hugetlb_testcases[] = { - { - .addr = NULL, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", - }, - { - .addr = (void *) -1, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = (void *) -1, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB) again", - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), - .size = 2 * HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)", - }, -}; +/* Initialize testcases inside a function to compute parameters at runtime */ +void testcases_init(void) +{ + unsigned long pagesize = getpagesize(); + unsigned long hugepagesize = default_huge_page_size(); + unsigned long low_addr = (1UL << 30); + unsigned long addr_switch_hint = ADDR_MARK_128TB; + unsigned long high_addr = HIGH_ADDR_128TB; + +#ifdef __aarch64__ + + /* Post LPA2, the lower userspace VA on a 16K pagesize is 47 bits. */ + if (pagesize != (16UL << 10)) { + addr_switch_hint = ADDR_MARK_256TB; + high_addr = HIGH_ADDR_256TB; + } +#endif + + struct testcase t[] = { + { + /* + * If stack is moved, we could possibly allocate + * this at the requested address. + */ + .addr = ((void *)(addr_switch_hint - pagesize)), + .size = pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(addr_switch_hint - pagesize, pagesize)", + .low_addr_required = 1, + }, + { + /* + * Unless MAP_FIXED is specified, allocation based on hint + * addr is never at requested address or above it, which is + * beyond high address switch boundary in this case. Instead, + * a suitable allocation is found in lower address space. + */ + .addr = ((void *)(addr_switch_hint - pagesize)), + .size = 2 * pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(addr_switch_hint - pagesize, (2 * pagesize))", + .low_addr_required = 1, + }, + { + /* + * Exact mapping at high address switch boundary, should + * be obtained even without MAP_FIXED as area is free. + */ + .addr = ((void *)(addr_switch_hint)), + .size = pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(addr_switch_hint, pagesize)", + .keep_mapped = 1, + }, + { + .addr = (void *)(addr_switch_hint), + .size = 2 * pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(addr_switch_hint, 2 * pagesize, MAP_FIXED)", + }, + { + .addr = NULL, + .size = 2 * pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL)", + .low_addr_required = 1, + }, + { + .addr = (void *)low_addr, + .size = 2 * pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(low_addr)", + .low_addr_required = 1, + }, + { + .addr = (void *)high_addr, + .size = 2 * pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(high_addr)", + .keep_mapped = 1, + }, + { + .addr = (void *)high_addr, + .size = 2 * pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(high_addr) again", + .keep_mapped = 1, + }, + { + .addr = (void *)high_addr, + .size = 2 * pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(high_addr, MAP_FIXED)", + }, + { + .addr = (void *) -1, + .size = 2 * pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = 2 * pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1) again", + }, + { + .addr = ((void *)(addr_switch_hint - pagesize)), + .size = pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(addr_switch_hint - pagesize, pagesize)", + .low_addr_required = 1, + }, + { + .addr = (void *)(addr_switch_hint - pagesize), + .size = 2 * pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(addr_switch_hint - pagesize, 2 * pagesize)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(addr_switch_hint - pagesize / 2), + .size = 2 * pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(addr_switch_hint - pagesize/2 , 2 * pagesize)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = ((void *)(addr_switch_hint)), + .size = pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(addr_switch_hint, pagesize)", + }, + { + .addr = (void *)(addr_switch_hint), + .size = 2 * pagesize, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(addr_switch_hint, 2 * pagesize, MAP_FIXED)", + }, + }; + + struct testcase ht[] = { + { + .addr = NULL, + .size = hugepagesize, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = (void *)low_addr, + .size = hugepagesize, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(low_addr, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = (void *)high_addr, + .size = hugepagesize, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(high_addr, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = (void *)high_addr, + .size = hugepagesize, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(high_addr, MAP_HUGETLB) again", + .keep_mapped = 1, + }, + { + .addr = (void *)high_addr, + .size = hugepagesize, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(high_addr, MAP_FIXED | MAP_HUGETLB)", + }, + { + .addr = (void *) -1, + .size = hugepagesize, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = hugepagesize, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB) again", + }, + { + .addr = (void *)(addr_switch_hint - pagesize), + .size = 2 * hugepagesize, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(addr_switch_hint - pagesize, 2*hugepagesize, MAP_HUGETLB)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(addr_switch_hint), + .size = 2 * hugepagesize, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(addr_switch_hint , 2*hugepagesize, MAP_FIXED | MAP_HUGETLB)", + }, + }; + + testcases = malloc(sizeof(t)); + hugetlb_testcases = malloc(sizeof(ht)); + + /* Copy into global arrays */ + memcpy(testcases, t, sizeof(t)); + memcpy(hugetlb_testcases, ht, sizeof(ht)); + + sz_testcases = ARRAY_SIZE(t); + sz_hugetlb_testcases = ARRAY_SIZE(ht); + switch_hint = addr_switch_hint; +} static int run_test(struct testcase *test, int count) { @@ -267,7 +275,7 @@ static int run_test(struct testcase *test, int count) continue; } - if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { + if (t->low_addr_required && p >= (void *)(switch_hint)) { printf("FAILED\n"); ret = KSFT_FAIL; } else { @@ -292,7 +300,7 @@ static int supported_arch(void) #elif defined(__x86_64__) return 1; #elif defined(__aarch64__) - return getpagesize() == PAGE_SIZE; + return 1; #else return 0; #endif @@ -305,8 +313,10 @@ int main(int argc, char **argv) if (!supported_arch()) return KSFT_SKIP; - ret = run_test(testcases, ARRAY_SIZE(testcases)); + testcases_init(); + + ret = run_test(testcases, sz_testcases); if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) - ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases)); + ret = run_test(hugetlb_testcases, sz_hugetlb_testcases); return ret; } -- cgit v1.2.3 From 3a103b5315b78a0d58fa8dc95d70d9b907400f5e Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Thu, 23 May 2024 01:39:05 -0500 Subject: selftest: mm: Test if hugepage does not get leaked during __bio_release_pages() Commit 1b151e2435fc ("block: Remove special-casing of compound pages") caused a change in behaviour when releasing the pages if the buffer does not start at the beginning of the page. This was because the calculation of the number of pages to release was incorrect. This was fixed by commit 38b43539d64b ("block: Fix page refcounts for unaligned buffers in __bio_release_pages()"). We pin the user buffer during direct I/O writes. If this buffer is a hugepage, bio_release_page() will unpin it and decrement all references and pin counts at ->bi_end_io. However, if any references to the hugepage remain post-I/O, the hugepage will not be freed upon unmap, leading to a memory leak. This patch verifies that a hugepage, used as a user buffer for DIO operations, is correctly freed upon unmapping, regardless of whether the offsets are aligned or unaligned w.r.t page boundary. Test Result Fail Scenario (Without the fix) -------------------------------------------------------- []# ./hugetlb_dio TAP version 13 1..4 No. Free pages before allocation : 7 No. Free pages after munmap : 7 ok 1 : Huge pages freed successfully ! No. Free pages before allocation : 7 No. Free pages after munmap : 7 ok 2 : Huge pages freed successfully ! No. Free pages before allocation : 7 No. Free pages after munmap : 7 ok 3 : Huge pages freed successfully ! No. Free pages before allocation : 7 No. Free pages after munmap : 6 not ok 4 : Huge pages not freed! Totals: pass:3 fail:1 xfail:0 xpass:0 skip:0 error:0 Test Result PASS Scenario (With the fix) --------------------------------------------------------- []#./hugetlb_dio TAP version 13 1..4 No. Free pages before allocation : 7 No. Free pages after munmap : 7 ok 1 : Huge pages freed successfully ! No. Free pages before allocation : 7 No. Free pages after munmap : 7 ok 2 : Huge pages freed successfully ! No. Free pages before allocation : 7 No. Free pages after munmap : 7 ok 3 : Huge pages freed successfully ! No. Free pages before allocation : 7 No. Free pages after munmap : 7 ok 4 : Huge pages freed successfully ! Totals: pass:4 fail:0 xfail:0 xpass:0 skip:0 error:0 [donettom@linux.ibm.com: address review comments from Muhammad] Link: https://lkml.kernel.org/r/20240604132801.23377-1-donettom@linux.ibm.com [donettom@linux.ibm.com: add this test to run_vmtests.sh] Link: https://lkml.kernel.org/r/20240607182000.6494-1-donettom@linux.ibm.com Link: https://lkml.kernel.org/r/20240523063905.3173-1-donettom@linux.ibm.com Fixes: 38b43539d64b ("block: Fix page refcounts for unaligned buffers in __bio_release_pages()") Signed-off-by: Donet Tom Signed-off-by: Ritesh Harjani (IBM) Co-developed-by: Ritesh Harjani (IBM) Reviewed-by: Muhammad Usama Anjum Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Ritesh Harjani (IBM) Cc: Shuah Khan Cc: Tony Battersby Cc: Jens Axboe Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/hugetlb_dio.c | 117 ++++++++++++++++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 1 + 3 files changed, 119 insertions(+) create mode 100644 tools/testing/selftests/mm/hugetlb_dio.c (limited to 'tools') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 3b49bc3d0a3b..a1748a4c7df1 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -73,6 +73,7 @@ TEST_GEN_FILES += ksm_functional_tests TEST_GEN_FILES += mdwe_test TEST_GEN_FILES += hugetlb_fault_after_madv TEST_GEN_FILES += hugetlb_madv_vs_map +TEST_GEN_FILES += hugetlb_dio ifneq ($(ARCH),arm64) TEST_GEN_FILES += soft-dirty diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c new file mode 100644 index 000000000000..f9ac20c657ec --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb_dio.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This program tests for hugepage leaks after DIO writes to a file using a + * hugepage as the user buffer. During DIO, the user buffer is pinned and + * should be properly unpinned upon completion. This patch verifies that the + * kernel correctly unpins the buffer at DIO completion for both aligned and + * unaligned user buffer offsets (w.r.t page boundary), ensuring the hugepage + * is freed upon unmapping. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include "vm_util.h" +#include "../kselftest.h" + +void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off) +{ + int fd; + char *buffer = NULL; + char *orig_buffer = NULL; + size_t h_pagesize = 0; + size_t writesize; + int free_hpage_b = 0; + int free_hpage_a = 0; + const int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; + const int mmap_prot = PROT_READ | PROT_WRITE; + + writesize = end_off - start_off; + + /* Get the default huge page size */ + h_pagesize = default_huge_page_size(); + if (!h_pagesize) + ksft_exit_fail_msg("Unable to determine huge page size\n"); + + /* Open the file to DIO */ + fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664); + if (fd < 0) + ksft_exit_fail_perror("Error opening file\n"); + + /* Get the free huge pages before allocation */ + free_hpage_b = get_free_hugepages(); + if (free_hpage_b == 0) { + close(fd); + ksft_exit_skip("No free hugepage, exiting!\n"); + } + + /* Allocate a hugetlb page */ + orig_buffer = mmap(NULL, h_pagesize, mmap_prot, mmap_flags, -1, 0); + if (orig_buffer == MAP_FAILED) { + close(fd); + ksft_exit_fail_perror("Error mapping memory\n"); + } + buffer = orig_buffer; + buffer += start_off; + + memset(buffer, 'A', writesize); + + /* Write the buffer to the file */ + if (write(fd, buffer, writesize) != (writesize)) { + munmap(orig_buffer, h_pagesize); + close(fd); + ksft_exit_fail_perror("Error writing to file\n"); + } + + /* unmap the huge page */ + munmap(orig_buffer, h_pagesize); + close(fd); + + /* Get the free huge pages after unmap*/ + free_hpage_a = get_free_hugepages(); + + /* + * If the no. of free hugepages before allocation and after unmap does + * not match - that means there could still be a page which is pinned. + */ + if (free_hpage_a != free_hpage_b) { + ksft_print_msg("No. Free pages before allocation : %d\n", free_hpage_b); + ksft_print_msg("No. Free pages after munmap : %d\n", free_hpage_a); + ksft_test_result_fail(": Huge pages not freed!\n"); + } else { + ksft_print_msg("No. Free pages before allocation : %d\n", free_hpage_b); + ksft_print_msg("No. Free pages after munmap : %d\n", free_hpage_a); + ksft_test_result_pass(": Huge pages freed successfully !\n"); + } +} + +int main(void) +{ + size_t pagesize = 0; + + ksft_print_header(); + ksft_set_plan(4); + + /* Get base page size */ + pagesize = psize(); + + /* start and end is aligned to pagesize */ + run_dio_using_hugetlb(0, (pagesize * 3)); + + /* start is aligned but end is not aligned */ + run_dio_using_hugetlb(0, (pagesize * 3) - (pagesize / 2)); + + /* start is unaligned and end is aligned */ + run_dio_using_hugetlb(pagesize / 2, (pagesize * 3)); + + /* both start and end are unaligned */ + run_dio_using_hugetlb(pagesize / 2, (pagesize * 3) + (pagesize / 2)); + + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 3157204b9047..5698d519170d 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -265,6 +265,7 @@ CATEGORY="hugetlb" run_test ./map_hugetlb CATEGORY="hugetlb" run_test ./hugepage-mremap CATEGORY="hugetlb" run_test ./hugepage-vmemmap CATEGORY="hugetlb" run_test ./hugetlb-madvise +CATEGORY="hugetlb" run_test ./hugetlb_dio nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages) # For this test, we need one and just one huge page -- cgit v1.2.3 From 504d8a5e0fd4def825a58d69ca783dcfa424d012 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 17 Jun 2024 19:24:17 -0700 Subject: selftests/mm: mseal, self_elf: fix missing __NR_mseal Patch series "cleanups, fixes, and progress towards avoiding "make headers"", v3. Eventually, once the build succeeds on a sufficiently old distro, the idea is to delete $(KHDR_INCLUDES) from the selftests/mm build, and then after that, from selftests/lib.mk and all of the other selftest builds. For now, this series merely achieves a clean build of selftests/mm on a not-so-old distro: Ubuntu 23.04. In other words, after this series is applied, it is possible to delete $(KHDR_INCLUDES) from selftests/mm/Makefile and the build will still succeed. 1. Add tools/uapi/asm/unistd_[32|x32|64].h files, which include definitions of __NR_mseal, and include them (indirectly) from the files that use __NR_mseal. The new files are copied from ./usr/include/asm, which is how we have agreed to do this sort of thing, see [1]. 2. Add fs.h, similarly created: it was copied directly from a snapshot of ./usr/include/linux/fs.h after running "make headers". 3. Add a few selected prctl.h values that the ksm and mdwe tests require. 4. Factor out some common code from mseal_test.c and seal_elf.c, into a new mseal_helpers.h file. 5. Remove local __NR_* definitions and checks. [1] commit e076eaca5906 ("selftests: break the dependency upon local header files") This patch (of 6): The selftests/mm build isn't exactly "broken", according to the current documentation, which still claims that one must run "make headers", before building the kselftests. However, according to the new plan to get rid of that requirement [1], they are future-broken: attempting to build selftests/mm *without* first running "make headers" will fail due to not finding __NR_mseal. Therefore, include asm-generic/unistd.h, which has all of the system call numbers that are needed, abstracted across the various CPU arches. Some explanation in support of this "asm-generic" approach: For most user space programs, the header file inclusion behaves as per this microblaze example, which comes from David Hildenbrand (thanks!): arch/microblaze/include/asm/unistd.h -> #include arch/microblaze/include/uapi/asm/unistd.h -> #include -> Generated during "make headers" usr/include/asm/unistd_32.h is generated via arch/microblaze/kernel/syscalls/Makefile with the syshdr command. So we never end up including asm-generic/unistd.h directly on microblaze... [2] However, those programs are installed on a single computer that has a single set of asm and kernel headers installed. In contrast, the kselftests are quite special, because they must provide a set of user space programs that: a) Mostly avoid using the installed (distro) system header files. b) Build (and run) on all supported CPU architectures c) Occasionally use symbols that have so new that they have not yet been included in the distro's header files. Doing (a) creates a new problem: how to get a set of cross-platform headers that works in all cases. Fortunately, asm-generic headers solve that one. Which is why we need to use them here--at least, for particularly difficult headers such as unistd.h. The reason this hasn't really come up yet, is that until now, the kselftests requirement (which I'm trying to eventually remove) was that "make headers" must first be run. That allowed the selftests to get a snapshot of sufficiently new header files that looked just like (and conflict with) the installed system headers. And as an aside, this is also an improvement over past practices of simply open-coding in a single (not per-arch) definition of a new symbol, directly into the selftest code. [1] commit e076eaca5906 ("selftests: break the dependency upon local header files") [2] https://lore.kernel.org/all/0b152bea-ccb6-403e-9c57-08ed5e828135@redhat.com/ Link: https://lkml.kernel.org/r/20240618022422.804305-1-jhubbard@nvidia.com Link: https://lkml.kernel.org/r/20240618022422.804305-2-jhubbard@nvidia.com Fixes: 4926c7a52de7 ("selftest mm/mseal memory sealing") Signed-off-by: John Hubbard Acked-by: David Hildenbrand Cc: Jeff Xu Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Muhammad Usama Anjum Cc: Peter Xu Cc: Rich Felker Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mseal_test.c | 2 +- tools/testing/selftests/mm/seal_elf.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index 41998cf1dcf5..58c888529f42 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c index f2babec79bb6..27bf2f84231d 100644 --- a/tools/testing/selftests/mm/seal_elf.c +++ b/tools/testing/selftests/mm/seal_elf.c @@ -2,7 +2,7 @@ #define _GNU_SOURCE #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 5f9b7511b21e88bfaf4a469becc7fc2c0c74f597 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 17 Jun 2024 19:24:18 -0700 Subject: selftests/mm: mseal, self_elf: factor out test macros and other duplicated items Clean up and move some copy-pasted items into a new mseal_helpers.h. 1. The test macros can be made safer and simpler, by observing that they are invariably called when about to return. This means that the macros do not need an intrusive label to goto; they can simply return. 2. PKEY* items. We cannot, unfortunately use pkey-helpers.h. The best we can do is to factor out these few items into mseal_helpers.h. 3. These tests still need their own definition of u64, so also move that to the header file. 4. Be sure to include the new mseal_helpers.h in the Makefile dependencies. [jhubbard@nvidia.com: include the new mseal_helpers.h in Makefile dependencies] Link: https://lkml.kernel.org/r/01685978-f6b1-4c24-8397-22cd3c24b91a@nvidia.com Link: https://lkml.kernel.org/r/20240618022422.804305-3-jhubbard@nvidia.com Signed-off-by: John Hubbard Acked-by: David Hildenbrand Cc: Jeff Xu Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Muhammad Usama Anjum Cc: Peter Xu Cc: Rich Felker Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/mseal_helpers.h | 41 +++++++++++++++++++++++++ tools/testing/selftests/mm/mseal_test.c | 49 +----------------------------- tools/testing/selftests/mm/seal_elf.c | 33 +------------------- 4 files changed, 44 insertions(+), 80 deletions(-) create mode 100644 tools/testing/selftests/mm/mseal_helpers.h (limited to 'tools') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index a1748a4c7df1..0d96c6123636 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -2,6 +2,7 @@ # Makefile for mm selftests LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h +LOCAL_HDRS += $(selfdir)/mm/mseal_helpers.h include local_config.mk diff --git a/tools/testing/selftests/mm/mseal_helpers.h b/tools/testing/selftests/mm/mseal_helpers.h new file mode 100644 index 000000000000..108d3fd0becb --- /dev/null +++ b/tools/testing/selftests/mm/mseal_helpers.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define FAIL_TEST_IF_FALSE(test_passed) \ + do { \ + if (!(test_passed)) { \ + ksft_test_result_fail("%s: line:%d\n", \ + __func__, __LINE__); \ + return; \ + } \ + } while (0) + +#define SKIP_TEST_IF_FALSE(test_passed) \ + do { \ + if (!(test_passed)) { \ + ksft_test_result_skip("%s: line:%d\n", \ + __func__, __LINE__); \ + return; \ + } \ + } while (0) + +#define TEST_END_CHECK() ksft_test_result_pass("%s\n", __func__) + +#ifndef PKEY_DISABLE_ACCESS +#define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +#define PKEY_DISABLE_WRITE 0x2 +#endif + +#ifndef PKEY_BITS_PER_PKEY +#define PKEY_BITS_PER_PKEY 2 +#endif + +#ifndef PKEY_MASK +#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) +#endif + +#ifndef u64 +#define u64 unsigned long long +#endif diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index 58c888529f42..d4d6ae42f502 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -17,54 +17,7 @@ #include #include #include - -/* - * need those definition for manually build using gcc. - * gcc -I ../../../../usr/include -DDEBUG -O3 -DDEBUG -O3 mseal_test.c -o mseal_test - */ -#ifndef PKEY_DISABLE_ACCESS -# define PKEY_DISABLE_ACCESS 0x1 -#endif - -#ifndef PKEY_DISABLE_WRITE -# define PKEY_DISABLE_WRITE 0x2 -#endif - -#ifndef PKEY_BITS_PER_PKEY -#define PKEY_BITS_PER_PKEY 2 -#endif - -#ifndef PKEY_MASK -#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) -#endif - -#define FAIL_TEST_IF_FALSE(c) do {\ - if (!(c)) {\ - ksft_test_result_fail("%s, line:%d\n", __func__, __LINE__);\ - goto test_end;\ - } \ - } \ - while (0) - -#define SKIP_TEST_IF_FALSE(c) do {\ - if (!(c)) {\ - ksft_test_result_skip("%s, line:%d\n", __func__, __LINE__);\ - goto test_end;\ - } \ - } \ - while (0) - - -#define TEST_END_CHECK() {\ - ksft_test_result_pass("%s\n", __func__);\ - return;\ -test_end:\ - return;\ -} - -#ifndef u64 -#define u64 unsigned long long -#endif +#include "mseal_helpers.h" static unsigned long get_vma_size(void *addr, int *prot) { diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c index 27bf2f84231d..45c73213775b 100644 --- a/tools/testing/selftests/mm/seal_elf.c +++ b/tools/testing/selftests/mm/seal_elf.c @@ -16,38 +16,7 @@ #include #include #include - -/* - * need those definition for manually build using gcc. - * gcc -I ../../../../usr/include -DDEBUG -O3 -DDEBUG -O3 seal_elf.c -o seal_elf - */ -#define FAIL_TEST_IF_FALSE(c) do {\ - if (!(c)) {\ - ksft_test_result_fail("%s, line:%d\n", __func__, __LINE__);\ - goto test_end;\ - } \ - } \ - while (0) - -#define SKIP_TEST_IF_FALSE(c) do {\ - if (!(c)) {\ - ksft_test_result_skip("%s, line:%d\n", __func__, __LINE__);\ - goto test_end;\ - } \ - } \ - while (0) - - -#define TEST_END_CHECK() {\ - ksft_test_result_pass("%s\n", __func__);\ - return;\ -test_end:\ - return;\ -} - -#ifndef u64 -#define u64 unsigned long long -#endif +#include "mseal_helpers.h" /* * define sys_xyx to call syscall directly. -- cgit v1.2.3 From eef07d69d3a4c2002f3d49149296c56f2abccf89 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 17 Jun 2024 19:24:19 -0700 Subject: selftests/mm: mseal, self_elf: rename TEST_END_CHECK to REPORT_TEST_PASS Now that the test macros are factored out into their final location, and simplified, it's time to rename TEST_END_CHECK to something that represents its new functionality: REPORT_TEST_PASS. Link: https://lkml.kernel.org/r/20240618022422.804305-4-jhubbard@nvidia.com Signed-off-by: John Hubbard Acked-by: David Hildenbrand Reviewed-by: Jeff Xu Tested-by: Jeff Xu Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Muhammad Usama Anjum Cc: Peter Xu Cc: Rich Felker Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mseal_helpers.h | 2 +- tools/testing/selftests/mm/mseal_test.c | 92 +++++++++++++++--------------- tools/testing/selftests/mm/seal_elf.c | 2 +- 3 files changed, 48 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mseal_helpers.h b/tools/testing/selftests/mm/mseal_helpers.h index 108d3fd0becb..0cfce31c76d2 100644 --- a/tools/testing/selftests/mm/mseal_helpers.h +++ b/tools/testing/selftests/mm/mseal_helpers.h @@ -18,7 +18,7 @@ } \ } while (0) -#define TEST_END_CHECK() ksft_test_result_pass("%s\n", __func__) +#define REPORT_TEST_PASS() ksft_test_result_pass("%s\n", __func__) #ifndef PKEY_DISABLE_ACCESS #define PKEY_DISABLE_ACCESS 0x1 diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index d4d6ae42f502..a818f010de47 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -240,7 +240,7 @@ static void test_seal_addseal(void) ret = sys_mseal(ptr, size); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_unmapped_start(void) @@ -268,7 +268,7 @@ static void test_seal_unmapped_start(void) ret = sys_mseal(ptr + 2 * page_size, 2 * page_size); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_unmapped_middle(void) @@ -300,7 +300,7 @@ static void test_seal_unmapped_middle(void) ret = sys_mseal(ptr + 3 * page_size, page_size); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_unmapped_end(void) @@ -329,7 +329,7 @@ static void test_seal_unmapped_end(void) ret = sys_mseal(ptr, 2 * page_size); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_multiple_vmas(void) @@ -360,7 +360,7 @@ static void test_seal_multiple_vmas(void) ret = sys_mseal(ptr, size); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_split_start(void) @@ -385,7 +385,7 @@ static void test_seal_split_start(void) ret = sys_mseal(ptr + page_size, 3 * page_size); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_split_end(void) @@ -410,7 +410,7 @@ static void test_seal_split_end(void) ret = sys_mseal(ptr, 3 * page_size); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_invalid_input(void) @@ -445,7 +445,7 @@ static void test_seal_invalid_input(void) ret = sys_mseal(ptr - page_size, 5 * page_size); FAIL_TEST_IF_FALSE(ret < 0); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_zero_length(void) @@ -469,7 +469,7 @@ static void test_seal_zero_length(void) ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_zero_address(void) @@ -495,7 +495,7 @@ static void test_seal_zero_address(void) ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); FAIL_TEST_IF_FALSE(ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_twice(void) @@ -515,7 +515,7 @@ static void test_seal_twice(void) ret = sys_mseal(ptr, size); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mprotect(bool seal) @@ -539,7 +539,7 @@ static void test_seal_mprotect(bool seal) else FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_start_mprotect(bool seal) @@ -569,7 +569,7 @@ static void test_seal_start_mprotect(bool seal) PROT_READ | PROT_WRITE); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_end_mprotect(bool seal) @@ -599,7 +599,7 @@ static void test_seal_end_mprotect(bool seal) else FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mprotect_unalign_len(bool seal) @@ -628,7 +628,7 @@ static void test_seal_mprotect_unalign_len(bool seal) PROT_READ | PROT_WRITE); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mprotect_unalign_len_variant_2(bool seal) @@ -656,7 +656,7 @@ static void test_seal_mprotect_unalign_len_variant_2(bool seal) PROT_READ | PROT_WRITE); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mprotect_two_vma(bool seal) @@ -691,7 +691,7 @@ static void test_seal_mprotect_two_vma(bool seal) else FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mprotect_two_vma_with_split(bool seal) @@ -738,7 +738,7 @@ static void test_seal_mprotect_two_vma_with_split(bool seal) PROT_READ | PROT_WRITE); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mprotect_partial_mprotect(bool seal) @@ -764,7 +764,7 @@ static void test_seal_mprotect_partial_mprotect(bool seal) else FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mprotect_two_vma_with_gap(bool seal) @@ -807,7 +807,7 @@ static void test_seal_mprotect_two_vma_with_gap(bool seal) ret = sys_mprotect(ptr + 3 * page_size, page_size, PROT_READ); FAIL_TEST_IF_FALSE(ret == 0); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mprotect_split(bool seal) @@ -844,7 +844,7 @@ static void test_seal_mprotect_split(bool seal) else FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mprotect_merge(bool seal) @@ -878,7 +878,7 @@ static void test_seal_mprotect_merge(bool seal) ret = sys_mprotect(ptr + 2 * page_size, 2 * page_size, PROT_READ); FAIL_TEST_IF_FALSE(ret == 0); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_munmap(bool seal) @@ -903,7 +903,7 @@ static void test_seal_munmap(bool seal) else FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } /* @@ -943,7 +943,7 @@ static void test_seal_munmap_two_vma(bool seal) else FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } /* @@ -981,7 +981,7 @@ static void test_seal_munmap_vma_with_gap(bool seal) ret = sys_munmap(ptr, size); FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_munmap_start_freed(bool seal) @@ -1021,7 +1021,7 @@ static void test_munmap_start_freed(bool seal) FAIL_TEST_IF_FALSE(size == 0); } - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_munmap_end_freed(bool seal) @@ -1051,7 +1051,7 @@ static void test_munmap_end_freed(bool seal) else FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_munmap_middle_freed(bool seal) @@ -1095,7 +1095,7 @@ static void test_munmap_middle_freed(bool seal) FAIL_TEST_IF_FALSE(size == 0); } - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mremap_shrink(bool seal) @@ -1124,7 +1124,7 @@ static void test_seal_mremap_shrink(bool seal) } - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mremap_expand(bool seal) @@ -1156,7 +1156,7 @@ static void test_seal_mremap_expand(bool seal) } - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mremap_move(bool seal) @@ -1189,7 +1189,7 @@ static void test_seal_mremap_move(bool seal) } - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mmap_overwrite_prot(bool seal) @@ -1217,7 +1217,7 @@ static void test_seal_mmap_overwrite_prot(bool seal) } else FAIL_TEST_IF_FALSE(ret2 == ptr); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mmap_expand(bool seal) @@ -1248,7 +1248,7 @@ static void test_seal_mmap_expand(bool seal) } else FAIL_TEST_IF_FALSE(ret2 == ptr); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mmap_shrink(bool seal) @@ -1276,7 +1276,7 @@ static void test_seal_mmap_shrink(bool seal) } else FAIL_TEST_IF_FALSE(ret2 == ptr); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mremap_shrink_fixed(bool seal) @@ -1307,7 +1307,7 @@ static void test_seal_mremap_shrink_fixed(bool seal) } else FAIL_TEST_IF_FALSE(ret2 == newAddr); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mremap_expand_fixed(bool seal) @@ -1338,7 +1338,7 @@ static void test_seal_mremap_expand_fixed(bool seal) } else FAIL_TEST_IF_FALSE(ret2 == newAddr); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mremap_move_fixed(bool seal) @@ -1368,7 +1368,7 @@ static void test_seal_mremap_move_fixed(bool seal) } else FAIL_TEST_IF_FALSE(ret2 == newAddr); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mremap_move_fixed_zero(bool seal) @@ -1400,7 +1400,7 @@ static void test_seal_mremap_move_fixed_zero(bool seal) } - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mremap_move_dontunmap(bool seal) @@ -1429,7 +1429,7 @@ static void test_seal_mremap_move_dontunmap(bool seal) } - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_mremap_move_dontunmap_anyaddr(bool seal) @@ -1463,7 +1463,7 @@ static void test_seal_mremap_move_dontunmap_anyaddr(bool seal) } - TEST_END_CHECK(); + REPORT_TEST_PASS(); } @@ -1556,7 +1556,7 @@ static void test_seal_merge_and_split(void) FAIL_TEST_IF_FALSE(size == 22 * page_size); FAIL_TEST_IF_FALSE(prot == 0x4); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_discard_ro_anon_on_rw(bool seal) @@ -1585,7 +1585,7 @@ static void test_seal_discard_ro_anon_on_rw(bool seal) else FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_discard_ro_anon_on_pkey(bool seal) @@ -1632,7 +1632,7 @@ static void test_seal_discard_ro_anon_on_pkey(bool seal) else FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_discard_ro_anon_on_filebacked(bool seal) @@ -1669,7 +1669,7 @@ static void test_seal_discard_ro_anon_on_filebacked(bool seal) FAIL_TEST_IF_FALSE(!ret); close(fd); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_discard_ro_anon_on_shared(bool seal) @@ -1698,7 +1698,7 @@ static void test_seal_discard_ro_anon_on_shared(bool seal) else FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } static void test_seal_discard_ro_anon(bool seal) @@ -1728,7 +1728,7 @@ static void test_seal_discard_ro_anon(bool seal) else FAIL_TEST_IF_FALSE(!ret); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c index 45c73213775b..7aa1366063e4 100644 --- a/tools/testing/selftests/mm/seal_elf.c +++ b/tools/testing/selftests/mm/seal_elf.c @@ -127,7 +127,7 @@ static void test_seal_elf(void) FAIL_TEST_IF_FALSE(ret < 0); ksft_print_msg("somestr is sealed, mprotect is rejected\n"); - TEST_END_CHECK(); + REPORT_TEST_PASS(); } int main(int argc, char **argv) -- cgit v1.2.3 From e20194725bb25bb94c238fd9056108ea50a760c0 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 17 Jun 2024 19:24:20 -0700 Subject: selftests/mm: fix vm_util.c build failures: add snapshot of fs.h On Ubuntu 23.04, on a clean git tree, the selftests/mm build fails due 10 or 20 missing items, all of which are found in fs.h, which is created via "make headers". However, as per [1], the idea is to stop requiring "make headers", and instead, take a snapshot of the files and check them in. Here are a few of the build errors: vm_util.c:34:21: error: variable has incomplete type 'struct pm_scan_arg' struct pm_scan_arg arg; ... vm_util.c:45:28: error: use of undeclared identifier 'PAGE_IS_WPALLOWED' ... vm_util.c:55:21: error: variable has incomplete type 'struct page_region' ... vm_util.c:105:20: error: use of undeclared identifier 'PAGE_IS_SOFT_DIRTY' To fix this, add fs.h, taken from a snapshot of ./usr/include/linux/fs.h after running "make headers". [1] commit e076eaca5906 ("selftests: break the dependency upon local header files") Link: https://lkml.kernel.org/r/20240618022422.804305-5-jhubbard@nvidia.com Signed-off-by: John Hubbard Acked-by: David Hildenbrand Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Jeff Xu Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Muhammad Usama Anjum Cc: Peter Xu Cc: Rich Felker Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/include/uapi/linux/fs.h | 392 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 392 insertions(+) create mode 100644 tools/include/uapi/linux/fs.h (limited to 'tools') diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h new file mode 100644 index 000000000000..cc3fea99fd43 --- /dev/null +++ b/tools/include/uapi/linux/fs.h @@ -0,0 +1,392 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_FS_H +#define _LINUX_FS_H + +/* + * This file has definitions for some important file table structures + * and constants and structures used by various generic file system + * ioctl's. Please do not make any changes in this file before + * sending patches for review to linux-fsdevel@vger.kernel.org and + * linux-api@vger.kernel.org. + */ + +#include +#include +#include +#include + +/* Use of MS_* flags within the kernel is restricted to core mount(2) code. */ +#include + +/* + * It's silly to have NR_OPEN bigger than NR_FILE, but you can change + * the file limit at runtime and only root can increase the per-process + * nr_file rlimit, so it's safe to set up a ridiculously high absolute + * upper limit on files-per-process. + * + * Some programs (notably those using select()) may have to be + * recompiled to take full advantage of the new limits.. + */ + +/* Fixed constants first: */ +#undef NR_OPEN +#define INR_OPEN_CUR 1024 /* Initial setting for nfile rlimits */ +#define INR_OPEN_MAX 4096 /* Hard limit for nfile rlimits */ + +#define BLOCK_SIZE_BITS 10 +#define BLOCK_SIZE (1< Date: Mon, 17 Jun 2024 19:24:21 -0700 Subject: selftests/mm: kvm, mdwe fixes to avoid requiring "make headers" On Ubuntu 23.04, the kvm and mdwe selftests/mm build fails due to missing a few items that are found in prctl.h. Here is an excerpt of the build failures: ksm_tests.c:252:13: error: use of undeclared identifier 'PR_SET_MEMORY_MERGE' ... mdwe_test.c:26:18: error: use of undeclared identifier 'PR_SET_MDWE' mdwe_test.c:38:18: error: use of undeclared identifier 'PR_GET_MDWE' Fix these errors by adding a new tools/include/uapi/linux/prctl.h . This file was created by running "make headers", and then copying a snapshot over from ./usr/include/linux/prctl.h, as per the approach we settled on in [1]. [1] commit e076eaca5906 ("selftests: break the dependency upon local header files") Link: https://lkml.kernel.org/r/20240618022422.804305-6-jhubbard@nvidia.com Signed-off-by: John Hubbard Acked-by: David Hildenbrand Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Jeff Xu Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Muhammad Usama Anjum Cc: Peter Xu Cc: Rich Felker Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/include/uapi/linux/prctl.h | 331 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 331 insertions(+) create mode 100644 tools/include/uapi/linux/prctl.h (limited to 'tools') diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h new file mode 100644 index 000000000000..35791791a879 --- /dev/null +++ b/tools/include/uapi/linux/prctl.h @@ -0,0 +1,331 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_PRCTL_H +#define _LINUX_PRCTL_H + +#include + +/* Values to pass as first argument to prctl() */ + +#define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ +#define PR_GET_PDEATHSIG 2 /* Second arg is a ptr to return the signal */ + +/* Get/set current->mm->dumpable */ +#define PR_GET_DUMPABLE 3 +#define PR_SET_DUMPABLE 4 + +/* Get/set unaligned access control bits (if meaningful) */ +#define PR_GET_UNALIGN 5 +#define PR_SET_UNALIGN 6 +# define PR_UNALIGN_NOPRINT 1 /* silently fix up unaligned user accesses */ +# define PR_UNALIGN_SIGBUS 2 /* generate SIGBUS on unaligned user access */ + +/* Get/set whether or not to drop capabilities on setuid() away from + * uid 0 (as per security/commoncap.c) */ +#define PR_GET_KEEPCAPS 7 +#define PR_SET_KEEPCAPS 8 + +/* Get/set floating-point emulation control bits (if meaningful) */ +#define PR_GET_FPEMU 9 +#define PR_SET_FPEMU 10 +# define PR_FPEMU_NOPRINT 1 /* silently emulate fp operations accesses */ +# define PR_FPEMU_SIGFPE 2 /* don't emulate fp operations, send SIGFPE instead */ + +/* Get/set floating-point exception mode (if meaningful) */ +#define PR_GET_FPEXC 11 +#define PR_SET_FPEXC 12 +# define PR_FP_EXC_SW_ENABLE 0x80 /* Use FPEXC for FP exception enables */ +# define PR_FP_EXC_DIV 0x010000 /* floating point divide by zero */ +# define PR_FP_EXC_OVF 0x020000 /* floating point overflow */ +# define PR_FP_EXC_UND 0x040000 /* floating point underflow */ +# define PR_FP_EXC_RES 0x080000 /* floating point inexact result */ +# define PR_FP_EXC_INV 0x100000 /* floating point invalid operation */ +# define PR_FP_EXC_DISABLED 0 /* FP exceptions disabled */ +# define PR_FP_EXC_NONRECOV 1 /* async non-recoverable exc. mode */ +# define PR_FP_EXC_ASYNC 2 /* async recoverable exception mode */ +# define PR_FP_EXC_PRECISE 3 /* precise exception mode */ + +/* Get/set whether we use statistical process timing or accurate timestamp + * based process timing */ +#define PR_GET_TIMING 13 +#define PR_SET_TIMING 14 +# define PR_TIMING_STATISTICAL 0 /* Normal, traditional, + statistical process timing */ +# define PR_TIMING_TIMESTAMP 1 /* Accurate timestamp based + process timing */ + +#define PR_SET_NAME 15 /* Set process name */ +#define PR_GET_NAME 16 /* Get process name */ + +/* Get/set process endian */ +#define PR_GET_ENDIAN 19 +#define PR_SET_ENDIAN 20 +# define PR_ENDIAN_BIG 0 +# define PR_ENDIAN_LITTLE 1 /* True little endian mode */ +# define PR_ENDIAN_PPC_LITTLE 2 /* "PowerPC" pseudo little endian */ + +/* Get/set process seccomp mode */ +#define PR_GET_SECCOMP 21 +#define PR_SET_SECCOMP 22 + +/* Get/set the capability bounding set (as per security/commoncap.c) */ +#define PR_CAPBSET_READ 23 +#define PR_CAPBSET_DROP 24 + +/* Get/set the process' ability to use the timestamp counter instruction */ +#define PR_GET_TSC 25 +#define PR_SET_TSC 26 +# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */ +# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */ + +/* Get/set securebits (as per security/commoncap.c) */ +#define PR_GET_SECUREBITS 27 +#define PR_SET_SECUREBITS 28 + +/* + * Get/set the timerslack as used by poll/select/nanosleep + * A value of 0 means "use default" + */ +#define PR_SET_TIMERSLACK 29 +#define PR_GET_TIMERSLACK 30 + +#define PR_TASK_PERF_EVENTS_DISABLE 31 +#define PR_TASK_PERF_EVENTS_ENABLE 32 + +/* + * Set early/late kill mode for hwpoison memory corruption. + * This influences when the process gets killed on a memory corruption. + */ +#define PR_MCE_KILL 33 +# define PR_MCE_KILL_CLEAR 0 +# define PR_MCE_KILL_SET 1 + +# define PR_MCE_KILL_LATE 0 +# define PR_MCE_KILL_EARLY 1 +# define PR_MCE_KILL_DEFAULT 2 + +#define PR_MCE_KILL_GET 34 + +/* + * Tune up process memory map specifics. + */ +#define PR_SET_MM 35 +# define PR_SET_MM_START_CODE 1 +# define PR_SET_MM_END_CODE 2 +# define PR_SET_MM_START_DATA 3 +# define PR_SET_MM_END_DATA 4 +# define PR_SET_MM_START_STACK 5 +# define PR_SET_MM_START_BRK 6 +# define PR_SET_MM_BRK 7 +# define PR_SET_MM_ARG_START 8 +# define PR_SET_MM_ARG_END 9 +# define PR_SET_MM_ENV_START 10 +# define PR_SET_MM_ENV_END 11 +# define PR_SET_MM_AUXV 12 +# define PR_SET_MM_EXE_FILE 13 +# define PR_SET_MM_MAP 14 +# define PR_SET_MM_MAP_SIZE 15 + +/* + * This structure provides new memory descriptor + * map which mostly modifies /proc/pid/stat[m] + * output for a task. This mostly done in a + * sake of checkpoint/restore functionality. + */ +struct prctl_mm_map { + __u64 start_code; /* code section bounds */ + __u64 end_code; + __u64 start_data; /* data section bounds */ + __u64 end_data; + __u64 start_brk; /* heap for brk() syscall */ + __u64 brk; + __u64 start_stack; /* stack starts at */ + __u64 arg_start; /* command line arguments bounds */ + __u64 arg_end; + __u64 env_start; /* environment variables bounds */ + __u64 env_end; + __u64 *auxv; /* auxiliary vector */ + __u32 auxv_size; /* vector size */ + __u32 exe_fd; /* /proc/$pid/exe link file */ +}; + +/* + * Set specific pid that is allowed to ptrace the current task. + * A value of 0 mean "no process". + */ +#define PR_SET_PTRACER 0x59616d61 +# define PR_SET_PTRACER_ANY ((unsigned long)-1) + +#define PR_SET_CHILD_SUBREAPER 36 +#define PR_GET_CHILD_SUBREAPER 37 + +/* + * If no_new_privs is set, then operations that grant new privileges (i.e. + * execve) will either fail or not grant them. This affects suid/sgid, + * file capabilities, and LSMs. + * + * Operations that merely manipulate or drop existing privileges (setresuid, + * capset, etc.) will still work. Drop those privileges if you want them gone. + * + * Changing LSM security domain is considered a new privilege. So, for example, + * asking selinux for a specific new context (e.g. with runcon) will result + * in execve returning -EPERM. + * + * See Documentation/userspace-api/no_new_privs.rst for more details. + */ +#define PR_SET_NO_NEW_PRIVS 38 +#define PR_GET_NO_NEW_PRIVS 39 + +#define PR_GET_TID_ADDRESS 40 + +#define PR_SET_THP_DISABLE 41 +#define PR_GET_THP_DISABLE 42 + +/* + * No longer implemented, but left here to ensure the numbers stay reserved: + */ +#define PR_MPX_ENABLE_MANAGEMENT 43 +#define PR_MPX_DISABLE_MANAGEMENT 44 + +#define PR_SET_FP_MODE 45 +#define PR_GET_FP_MODE 46 +# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */ +# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */ + +/* Control the ambient capability set */ +#define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +# define PR_CAP_AMBIENT_CLEAR_ALL 4 + +/* arm64 Scalable Vector Extension controls */ +/* Flag values must be kept in sync with ptrace NT_ARM_SVE interface */ +#define PR_SVE_SET_VL 50 /* set task vector length */ +# define PR_SVE_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */ +#define PR_SVE_GET_VL 51 /* get task vector length */ +/* Bits common to PR_SVE_SET_VL and PR_SVE_GET_VL */ +# define PR_SVE_VL_LEN_MASK 0xffff +# define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */ + +/* Per task speculation control */ +#define PR_GET_SPECULATION_CTRL 52 +#define PR_SET_SPECULATION_CTRL 53 +/* Speculation control variants */ +# define PR_SPEC_STORE_BYPASS 0 +# define PR_SPEC_INDIRECT_BRANCH 1 +# define PR_SPEC_L1D_FLUSH 2 +/* Return and control values for PR_SET/GET_SPECULATION_CTRL */ +# define PR_SPEC_NOT_AFFECTED 0 +# define PR_SPEC_PRCTL (1UL << 0) +# define PR_SPEC_ENABLE (1UL << 1) +# define PR_SPEC_DISABLE (1UL << 2) +# define PR_SPEC_FORCE_DISABLE (1UL << 3) +# define PR_SPEC_DISABLE_NOEXEC (1UL << 4) + +/* Reset arm64 pointer authentication keys */ +#define PR_PAC_RESET_KEYS 54 +# define PR_PAC_APIAKEY (1UL << 0) +# define PR_PAC_APIBKEY (1UL << 1) +# define PR_PAC_APDAKEY (1UL << 2) +# define PR_PAC_APDBKEY (1UL << 3) +# define PR_PAC_APGAKEY (1UL << 4) + +/* Tagged user address controls for arm64 */ +#define PR_SET_TAGGED_ADDR_CTRL 55 +#define PR_GET_TAGGED_ADDR_CTRL 56 +# define PR_TAGGED_ADDR_ENABLE (1UL << 0) +/* MTE tag check fault modes */ +# define PR_MTE_TCF_NONE 0UL +# define PR_MTE_TCF_SYNC (1UL << 1) +# define PR_MTE_TCF_ASYNC (1UL << 2) +# define PR_MTE_TCF_MASK (PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC) +/* MTE tag inclusion mask */ +# define PR_MTE_TAG_SHIFT 3 +# define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) +/* Unused; kept only for source compatibility */ +# define PR_MTE_TCF_SHIFT 1 + +/* Control reclaim behavior when allocating memory */ +#define PR_SET_IO_FLUSHER 57 +#define PR_GET_IO_FLUSHER 58 + +/* Dispatch syscalls to a userspace handler */ +#define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 +/* The control values for the user space selector when dispatch is enabled */ +# define SYSCALL_DISPATCH_FILTER_ALLOW 0 +# define SYSCALL_DISPATCH_FILTER_BLOCK 1 + +/* Set/get enabled arm64 pointer authentication keys */ +#define PR_PAC_SET_ENABLED_KEYS 60 +#define PR_PAC_GET_ENABLED_KEYS 61 + +/* Request the scheduler to share a core */ +#define PR_SCHED_CORE 62 +# define PR_SCHED_CORE_GET 0 +# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ +# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ +# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ +# define PR_SCHED_CORE_MAX 4 +# define PR_SCHED_CORE_SCOPE_THREAD 0 +# define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 +# define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 + +/* arm64 Scalable Matrix Extension controls */ +/* Flag values must be in sync with SVE versions */ +#define PR_SME_SET_VL 63 /* set task vector length */ +# define PR_SME_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */ +#define PR_SME_GET_VL 64 /* get task vector length */ +/* Bits common to PR_SME_SET_VL and PR_SME_GET_VL */ +# define PR_SME_VL_LEN_MASK 0xffff +# define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ + +/* Memory deny write / execute */ +#define PR_SET_MDWE 65 +# define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0) +# define PR_MDWE_NO_INHERIT (1UL << 1) + +#define PR_GET_MDWE 66 + +#define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 + +#define PR_GET_AUXV 0x41555856 + +#define PR_SET_MEMORY_MERGE 67 +#define PR_GET_MEMORY_MERGE 68 + +#define PR_RISCV_V_SET_CONTROL 69 +#define PR_RISCV_V_GET_CONTROL 70 +# define PR_RISCV_V_VSTATE_CTRL_DEFAULT 0 +# define PR_RISCV_V_VSTATE_CTRL_OFF 1 +# define PR_RISCV_V_VSTATE_CTRL_ON 2 +# define PR_RISCV_V_VSTATE_CTRL_INHERIT (1 << 4) +# define PR_RISCV_V_VSTATE_CTRL_CUR_MASK 0x3 +# define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK 0xc +# define PR_RISCV_V_VSTATE_CTRL_MASK 0x1f + +#define PR_RISCV_SET_ICACHE_FLUSH_CTX 71 +# define PR_RISCV_CTX_SW_FENCEI_ON 0 +# define PR_RISCV_CTX_SW_FENCEI_OFF 1 +# define PR_RISCV_SCOPE_PER_PROCESS 0 +# define PR_RISCV_SCOPE_PER_THREAD 1 + +/* PowerPC Dynamic Execution Control Register (DEXCR) controls */ +#define PR_PPC_GET_DEXCR 72 +#define PR_PPC_SET_DEXCR 73 +/* DEXCR aspect to act on */ +# define PR_PPC_DEXCR_SBHE 0 /* Speculative branch hint enable */ +# define PR_PPC_DEXCR_IBRTPD 1 /* Indirect branch recurrent target prediction disable */ +# define PR_PPC_DEXCR_SRAPD 2 /* Subroutine return address prediction disable */ +# define PR_PPC_DEXCR_NPHIE 3 /* Non-privileged hash instruction enable */ +/* Action to apply / return */ +# define PR_PPC_DEXCR_CTRL_EDITABLE 0x1 /* Aspect can be modified with PR_PPC_SET_DEXCR */ +# define PR_PPC_DEXCR_CTRL_SET 0x2 /* Set the aspect for this process */ +# define PR_PPC_DEXCR_CTRL_CLEAR 0x4 /* Clear the aspect for this process */ +# define PR_PPC_DEXCR_CTRL_SET_ONEXEC 0x8 /* Set the aspect on exec */ +# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */ +# define PR_PPC_DEXCR_CTRL_MASK 0x1f + +#endif /* _LINUX_PRCTL_H */ -- cgit v1.2.3 From ecc1793b2d08efeb2d1795814d2ea350c9d6992b Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 6 Jun 2024 20:36:19 +0000 Subject: selftests/mm: use asm volatile to not optimize mmap read variable create_pagecache_thp_and_fd() in split_huge_page_test.c used the variable dummy to perform mmap read. However, this test was skipped even on XFS which has large folio support. The issue was compiler (gcc 13.2.0) was optimizing out the dummy variable, therefore, not creating huge page in the page cache. Use asm volatile() trick to force the compiler not to optimize out the loop where we read from the mmaped addr. This is similar to what is being done in other tests (cow.c, etc) As the variable is now used in the asm statement, remove the unused attribute. Link: https://lkml.kernel.org/r/20240606203619.677276-1-kernel@pankajraghav.com Signed-off-by: Pankaj Raghav Reviewed-by: Zi Yan Acked-by: David Hildenbrand Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Pankaj Raghav Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/split_huge_page_test.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index d3c7f5fb3e7b..e5e8dafc9d94 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -300,7 +300,7 @@ int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd, char **addr) { size_t i; - int __attribute__((unused)) dummy = 0; + int dummy = 0; srand(time(NULL)); @@ -341,6 +341,7 @@ int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd, for (size_t i = 0; i < fd_size; i++) dummy += *(*addr + i); + asm volatile("" : "+r" (dummy)); if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) { ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n"); -- cgit v1.2.3 From 05f3f75cf13faf4f16bbb86709ea4d482de2573d Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Wed, 5 Jun 2024 22:36:34 +0000 Subject: selftests/mm: include linux/mman.h thuge-gen defines MAP_HUGE_* macros that are provided by linux/mman.h since 4.15. Removes the macros and includes linux/mman.h instead. Link: https://lkml.kernel.org/r/20240605223637.1374969-2-edliaw@google.com Signed-off-by: Edward Liaw Reviewed-by: Carlos Llamas Reviewed-by: Muhammad Usama Anjum Cc: Bill Wendling Cc: Justin Stitt Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/thuge-gen.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index ea7fd8fe2876..034635317935 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -15,6 +15,7 @@ #define _GNU_SOURCE 1 #include +#include #include #include #include @@ -28,10 +29,6 @@ #include "vm_util.h" #include "../kselftest.h" -#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) -#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f #if !defined(MAP_HUGETLB) #define MAP_HUGETLB 0x40000 #endif -- cgit v1.2.3 From 8192bc03d908c60c95b72dd0acfc8e37a33ab2f7 Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Wed, 5 Jun 2024 22:36:35 +0000 Subject: selftests/mm: guard defines from shm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit thuge-gen.c defines SHM_HUGE_* macros that are provided by the uapi since 4.14. These macros get redefined when compiling with Android's bionic because its sys/shm.h will import the uapi definitions. However if linux/shm.h is included, with glibc, sys/shm.h will clash on some struct definitions: /usr/include/linux/shm.h:26:8: error: redefinition of ‘struct shmid_ds’ 26 | struct shmid_ds { | ^~~~~~~~ In file included from /usr/include/x86_64-linux-gnu/bits/shm.h:45, from /usr/include/x86_64-linux-gnu/sys/shm.h:30: /usr/include/x86_64-linux-gnu/bits/types/struct_shmid_ds.h:24:8: note: originally defined here 24 | struct shmid_ds | ^~~~~~~~ For now, guard the SHM_HUGE_* defines with ifndef to prevent redefinition warnings on Android bionic. Link: https://lkml.kernel.org/r/20240605223637.1374969-3-edliaw@google.com Signed-off-by: Edward Liaw Reviewed-by: Carlos Llamas Reviewed-by: Muhammad Usama Anjum Cc: Bill Wendling Cc: Carlos Llamas Cc: Justin Stitt Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/thuge-gen.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index 034635317935..d50dc71cac32 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -34,10 +34,18 @@ #endif #define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ +#ifndef SHM_HUGE_SHIFT #define SHM_HUGE_SHIFT 26 +#endif +#ifndef SHM_HUGE_MASK #define SHM_HUGE_MASK 0x3f +#endif +#ifndef SHM_HUGE_2MB #define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) +#endif +#ifndef SHM_HUGE_1GB #define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) +#endif #define NUM_PAGESIZES 5 #define NUM_PAGES 4 -- cgit v1.2.3 From a5c6bc590094a1a73cf6fa3f505e1945d2bf2461 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 17 Jun 2024 19:24:22 -0700 Subject: selftests/mm: remove local __NR_* definitions This continues the work on getting the selftests to build without requiring people to first run "make headers" [1]. Now that the system call numbers are in the correct, checked-in locations in the kernel tree (./tools/include/uapi/asm/unistd*.h), make sure that the mm selftests include that file (indirectly). Doing so provides guaranteed definitions at build time, so remove all of the checks for "ifdef __NR_xxx" in the mm selftests, because they will always be true (defined). [1] commit e076eaca5906 ("selftests: break the dependency upon local header files") Link: https://lkml.kernel.org/r/20240618022422.804305-7-jhubbard@nvidia.com Signed-off-by: John Hubbard Acked-by: David Hildenbrand Cc: Jeff Xu Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Muhammad Usama Anjum Cc: Peter Xu Cc: Rich Felker Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugepage-mremap.c | 2 +- tools/testing/selftests/mm/ksm_functional_tests.c | 8 +------- tools/testing/selftests/mm/memfd_secret.c | 14 +------------- tools/testing/selftests/mm/mkdirty.c | 8 +------- tools/testing/selftests/mm/mlock2.h | 1 + tools/testing/selftests/mm/pagemap_ioctl.c | 2 +- tools/testing/selftests/mm/protection_keys.c | 2 +- tools/testing/selftests/mm/uffd-common.c | 4 ---- tools/testing/selftests/mm/uffd-stress.c | 16 +--------------- tools/testing/selftests/mm/uffd-unit-tests.c | 14 +------------- 10 files changed, 9 insertions(+), 62 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c index c463d1c09c9b..ada9156cc497 100644 --- a/tools/testing/selftests/mm/hugepage-mremap.c +++ b/tools/testing/selftests/mm/hugepage-mremap.c @@ -15,7 +15,7 @@ #define _GNU_SOURCE #include #include -#include +#include #include #include #include /* Definition of O_* constants */ diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index b61803e36d1c..66b4e111b5a2 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include @@ -369,7 +369,6 @@ unmap: munmap(map, size); } -#ifdef __NR_userfaultfd static void test_unmerge_uffd_wp(void) { struct uffdio_writeprotect uffd_writeprotect; @@ -430,7 +429,6 @@ close_uffd: unmap: munmap(map, size); } -#endif /* Verify that KSM can be enabled / queried with prctl. */ static void test_prctl(void) @@ -686,9 +684,7 @@ int main(int argc, char **argv) exit(test_child_ksm()); } -#ifdef __NR_userfaultfd tests++; -#endif ksft_print_header(); ksft_set_plan(tests); @@ -700,9 +696,7 @@ int main(int argc, char **argv) test_unmerge(); test_unmerge_zero_pages(); test_unmerge_discarded(); -#ifdef __NR_userfaultfd test_unmerge_uffd_wp(); -#endif test_prot_none(); diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c index 9a0597310a76..74c911aa3aea 100644 --- a/tools/testing/selftests/mm/memfd_secret.c +++ b/tools/testing/selftests/mm/memfd_secret.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include @@ -28,8 +28,6 @@ #define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__) #define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__) -#ifdef __NR_memfd_secret - #define PATTERN 0x55 static const int prot = PROT_READ | PROT_WRITE; @@ -334,13 +332,3 @@ int main(int argc, char *argv[]) ksft_finished(); } - -#else /* __NR_memfd_secret */ - -int main(int argc, char *argv[]) -{ - printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n"); - return KSFT_SKIP; -} - -#endif /* __NR_memfd_secret */ diff --git a/tools/testing/selftests/mm/mkdirty.c b/tools/testing/selftests/mm/mkdirty.c index b8a7efe9204e..1db134063c38 100644 --- a/tools/testing/selftests/mm/mkdirty.c +++ b/tools/testing/selftests/mm/mkdirty.c @@ -9,7 +9,7 @@ */ #include #include -#include +#include #include #include #include @@ -265,7 +265,6 @@ munmap: munmap(mmap_mem, mmap_size); } -#ifdef __NR_userfaultfd static void test_uffdio_copy(void) { struct uffdio_register uffdio_register; @@ -322,7 +321,6 @@ munmap: munmap(dst, pagesize); free(src); } -#endif /* __NR_userfaultfd */ int main(void) { @@ -335,9 +333,7 @@ int main(void) thpsize / 1024); tests += 3; } -#ifdef __NR_userfaultfd tests += 1; -#endif /* __NR_userfaultfd */ ksft_print_header(); ksft_set_plan(tests); @@ -367,9 +363,7 @@ int main(void) if (thpsize) test_pte_mapped_thp(); /* Placing a fresh page via userfaultfd may set the PTE dirty. */ -#ifdef __NR_userfaultfd test_uffdio_copy(); -#endif /* __NR_userfaultfd */ err = ksft_get_fail_cnt(); if (err) diff --git a/tools/testing/selftests/mm/mlock2.h b/tools/testing/selftests/mm/mlock2.h index 4417eaa5cfb7..1e5731bab499 100644 --- a/tools/testing/selftests/mm/mlock2.h +++ b/tools/testing/selftests/mm/mlock2.h @@ -3,6 +3,7 @@ #include #include #include +#include static int mlock2_(void *start, size_t len, int flags) { diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index bcc73b4e805c..fc90af2a97b8 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index 48dc151f8fca..eaa6d1fc5328 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 7ad6ba660c7d..717539eddf98 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -673,11 +673,7 @@ int uffd_open_dev(unsigned int flags) int uffd_open_sys(unsigned int flags) { -#ifdef __NR_userfaultfd return syscall(__NR_userfaultfd, flags); -#else - return -1; -#endif } int uffd_open(unsigned int flags) diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index f78bab0f3d45..0abb9af0fc7f 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -33,11 +33,9 @@ * pthread_mutex_lock will also verify the atomicity of the memory * transfer (UFFDIO_COPY). */ - +#include #include "uffd-common.h" -#ifdef __NR_userfaultfd - #define BOUNCE_RANDOM (1<<0) #define BOUNCE_RACINGFAULTS (1<<1) #define BOUNCE_VERIFY (1<<2) @@ -466,15 +464,3 @@ int main(int argc, char **argv) nr_pages, nr_pages_per_cpu); return userfaultfd_stress(); } - -#else /* __NR_userfaultfd */ - -#warning "missing __NR_userfaultfd definition" - -int main(void) -{ - printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); - return KSFT_SKIP; -} - -#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 21ec23206ab4..b3d21eed203d 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -5,12 +5,11 @@ * Copyright (C) 2015-2023 Red Hat, Inc. */ +#include #include "uffd-common.h" #include "../../../../mm/gup_test.h" -#ifdef __NR_userfaultfd - /* The unit test doesn't need a large or random size, make it 32MB for now */ #define UFFD_TEST_MEM_SIZE (32UL << 20) @@ -1554,14 +1553,3 @@ int main(int argc, char *argv[]) return ksft_get_fail_cnt() ? KSFT_FAIL : KSFT_PASS; } -#else /* __NR_userfaultfd */ - -#warning "missing __NR_userfaultfd definition" - -int main(void) -{ - printf("Skipping %s (missing __NR_userfaultfd)\n", __file__); - return KSFT_SKIP; -} - -#endif /* __NR_userfaultfd */ -- cgit v1.2.3 From ec3e837d8fd96c68599b2861dd412094b7bc335c Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 21 Jun 2024 13:34:55 +0200 Subject: kmsan: allow disabling KMSAN checks for the current task Like for KASAN, it's useful to temporarily disable KMSAN checks around, e.g., redzone accesses. Introduce kmsan_disable_current() and kmsan_enable_current(), which are similar to their KASAN counterparts. Make them reentrant in order to handle memory allocations in interrupt context. Repurpose the allow_reporting field for this. Link: https://lkml.kernel.org/r/20240621113706.315500-12-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Reviewed-by: Alexander Potapenko Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Cc: Marco Elver Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt (Google) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/dev-tools/kmsan.rst | 11 +++++++++-- include/linux/kmsan.h | 24 ++++++++++++++++++++++++ include/linux/kmsan_types.h | 2 +- mm/kmsan/core.c | 1 - mm/kmsan/hooks.c | 18 +++++++++++++++--- mm/kmsan/report.c | 7 ++++--- tools/objtool/check.c | 2 ++ 7 files changed, 55 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/Documentation/dev-tools/kmsan.rst b/Documentation/dev-tools/kmsan.rst index 323eedad53cd..6a48d96c5c85 100644 --- a/Documentation/dev-tools/kmsan.rst +++ b/Documentation/dev-tools/kmsan.rst @@ -110,6 +110,13 @@ in the Makefile. Think of this as applying ``__no_sanitize_memory`` to every function in the file or directory. Most users won't need KMSAN_SANITIZE, unless their code gets broken by KMSAN (e.g. runs at early boot time). +KMSAN checks can also be temporarily disabled for the current task using +``kmsan_disable_current()`` and ``kmsan_enable_current()`` calls. Each +``kmsan_enable_current()`` call must be preceded by a +``kmsan_disable_current()`` call; these call pairs may be nested. One needs to +be careful with these calls, keeping the regions short and preferring other +ways to disable instrumentation, where possible. + Support ======= @@ -338,11 +345,11 @@ Per-task KMSAN state ~~~~~~~~~~~~~~~~~~~~ Every task_struct has an associated KMSAN task state that holds the KMSAN -context (see above) and a per-task flag disallowing KMSAN reports:: +context (see above) and a per-task counter disallowing KMSAN reports:: struct kmsan_context { ... - bool allow_reporting; + unsigned int depth; struct kmsan_context_state cstate; ... } diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index fe6c2212bdb1..14b5ea6d3a43 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -239,6 +239,22 @@ void kmsan_unpoison_entry_regs(const struct pt_regs *regs); */ void *kmsan_get_metadata(void *addr, bool is_origin); +/** + * kmsan_enable_current(): Enable KMSAN for the current task. + * + * Each kmsan_enable_current() current call must be preceded by a + * kmsan_disable_current() call. These call pairs may be nested. + */ +void kmsan_enable_current(void); + +/** + * kmsan_disable_current(): Disable KMSAN for the current task. + * + * Each kmsan_disable_current() current call must be followed by a + * kmsan_enable_current() call. These call pairs may be nested. + */ +void kmsan_disable_current(void); + #else static inline void kmsan_init_shadow(void) @@ -338,6 +354,14 @@ static inline void kmsan_unpoison_entry_regs(const struct pt_regs *regs) { } +static inline void kmsan_enable_current(void) +{ +} + +static inline void kmsan_disable_current(void) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h index 929287981afe..dfc59918b3c0 100644 --- a/include/linux/kmsan_types.h +++ b/include/linux/kmsan_types.h @@ -31,7 +31,7 @@ struct kmsan_context_state { struct kmsan_ctx { struct kmsan_context_state cstate; int kmsan_in_runtime; - bool allow_reporting; + unsigned int depth; }; #endif /* _LINUX_KMSAN_TYPES_H */ diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 95f859e38c53..81b22220711a 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -43,7 +43,6 @@ void kmsan_internal_task_create(struct task_struct *task) struct thread_info *info = current_thread_info(); __memset(ctx, 0, sizeof(*ctx)); - ctx->allow_reporting = true; kmsan_internal_unpoison_memory(info, sizeof(*info), false); } diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index b408714f9ba3..267d0afa2e8b 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -39,12 +39,10 @@ void kmsan_task_create(struct task_struct *task) void kmsan_task_exit(struct task_struct *task) { - struct kmsan_ctx *ctx = &task->kmsan_ctx; - if (!kmsan_enabled || kmsan_in_runtime()) return; - ctx->allow_reporting = false; + kmsan_disable_current(); } void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) @@ -424,3 +422,17 @@ void kmsan_check_memory(const void *addr, size_t size) REASON_ANY); } EXPORT_SYMBOL(kmsan_check_memory); + +void kmsan_enable_current(void) +{ + KMSAN_WARN_ON(current->kmsan_ctx.depth == 0); + current->kmsan_ctx.depth--; +} +EXPORT_SYMBOL(kmsan_enable_current); + +void kmsan_disable_current(void) +{ + current->kmsan_ctx.depth++; + KMSAN_WARN_ON(current->kmsan_ctx.depth == 0); +} +EXPORT_SYMBOL(kmsan_disable_current); diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c index c79d3b0d2d0d..92e73ec61435 100644 --- a/mm/kmsan/report.c +++ b/mm/kmsan/report.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -158,12 +159,12 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size, if (!kmsan_enabled) return; - if (!current->kmsan_ctx.allow_reporting) + if (current->kmsan_ctx.depth) return; if (!origin) return; - current->kmsan_ctx.allow_reporting = false; + kmsan_disable_current(); ua_flags = user_access_save(); raw_spin_lock(&kmsan_report_lock); pr_err("=====================================================\n"); @@ -216,5 +217,5 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size, if (panic_on_kmsan) panic("kmsan.panic set ...\n"); user_access_restore(ua_flags); - current->kmsan_ctx.allow_reporting = true; + kmsan_enable_current(); } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0a33d9195b7a..01237d167223 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1202,6 +1202,8 @@ static const char *uaccess_safe_builtin[] = { "__sanitizer_cov_trace_switch", /* KMSAN */ "kmsan_copy_to_user", + "kmsan_disable_current", + "kmsan_enable_current", "kmsan_report", "kmsan_unpoison_entry_regs", "kmsan_unpoison_memory", -- cgit v1.2.3 From 34ec4344a5dabbb39e23e8daf30779892c0211a6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Jun 2024 11:05:31 -0700 Subject: selftests/damon/access_memory: use user-defined region size Patch series "selftests/damon: test DAMOS tried regions and {min,max}_nr_regions". This patch series fix a minor issue in a program for DAMON selftest, and implement new functionality selftests for DAMOS tried regions and {min,max}_nr_regions. The test for max_nr_regions also test the recovery from online tuning-caused limit violation, which was fixed by a previous patch [1] titled "mm/damon/core: merge regions aggressively when max_nr_regions is unmet". The first patch fixes a minor problem in the articial memory access pattern generator for tests. Following 3 patches (2-4) implement schemes tried regions test. Then a couple of patches (5-6) implementing static setup based {min,max}_nr_regions functionality test follows. Final two patches (7-8) implement dynamic max_nr_regions update test. [1] https://lore.kernel.org/20240624210650.53960C2BBFC@smtp.kernel.org This patch (of 8): 'access_memory' is an artificial memory access pattern generator for DAMON tests. It creates and accesses memory regions that the user specified the number and size via the command line. However, real access part of the program ignores the user-specified size of each region. Instead, it uses a hard-coded value, 10 MiB. Fix it to use user-defined size. Note that all existing 'access_memory' users are setting the region size as 10 MiB. Hence no real problem has happened so far. Link: https://lkml.kernel.org/r/20240625180538.73134-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240625180538.73134-2-sj@kernel.org Fixes: b5906f5f7359 ("selftests/damon: add a test for update_schemes_tried_regions sysfs command") Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/access_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/damon/access_memory.c b/tools/testing/selftests/damon/access_memory.c index 585a2fa54329..56b17e8fe1be 100644 --- a/tools/testing/selftests/damon/access_memory.c +++ b/tools/testing/selftests/damon/access_memory.c @@ -35,7 +35,7 @@ int main(int argc, char *argv[]) start_clock = clock(); while ((clock() - start_clock) * 1000 / CLOCKS_PER_SEC < access_time_ms) - memset(regions[i], i, 1024 * 1024 * 10); + memset(regions[i], i, sz_region); } return 0; } -- cgit v1.2.3 From 209e6313fb17dbc66cec7230902ba4427717e6ee Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Jun 2024 11:05:32 -0700 Subject: selftests/damon/_damon_sysfs: support schemes_update_tried_regions Implement schemes_update_tried_regions DAMON sysfs command on _damon_sysfs.py, to use on implementations of future tests for the feature. Link: https://lkml.kernel.org/r/20240625180538.73134-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 35 ++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 2bd44c32be1b..f975742f29b3 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -175,16 +175,24 @@ class DamosStats: self.sz_applied = sz_applied self.qt_exceeds = qt_exceeds +class DamosTriedRegion: + def __init__(self, start, end, nr_accesses, age): + self.start = start + self.end = end + self.nr_accesses = nr_accesses + self.age = age + class Damos: action = None access_pattern = None quota = None apply_interval_us = None - # todo: Support watermarks, stats, tried_regions + # todo: Support watermarks, stats idx = None context = None tried_bytes = None stats = None + tried_regions = None def __init__(self, action='stat', access_pattern=DamosAccessPattern(), quota=DamosQuota(), apply_interval_us=0): @@ -398,6 +406,31 @@ class Kdamond: err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'on') return err + def update_schemes_tried_regions(self): + err = write_file(os.path.join(self.sysfs_dir(), 'state'), + 'update_schemes_tried_regions') + if err is not None: + return err + for context in self.contexts: + for scheme in context.schemes: + tried_regions = [] + tried_regions_dir = os.path.join( + scheme.sysfs_dir(), 'tried_regions') + for filename in os.listdir( + os.path.join(scheme.sysfs_dir(), 'tried_regions')): + tried_region_dir = os.path.join(tried_regions_dir, filename) + if not os.path.isdir(tried_region_dir): + continue + region_values = [] + for f in ['start', 'end', 'nr_accesses', 'age']: + content, err = read_file( + os.path.join(tried_region_dir, f)) + if err is not None: + return err + region_values.append(int(content)) + tried_regions.append(DamosTriedRegion(*region_values)) + scheme.tried_regions = tried_regions + def update_schemes_tried_bytes(self): err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'update_schemes_tried_bytes') -- cgit v1.2.3 From c94df805c774ff87022396bb8120d5872cd7e5ed Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Jun 2024 11:05:33 -0700 Subject: selftests/damon: implement a program for even-numbered memory regions access To test schemes_tried_regions feature, we need to have a program having specific number of regions that having different access pattern. Existing artificial access pattern generator, 'access_memory', cannot be used for the purpose, since it accesses only one region at a given time. Extending it could be an option, but since the purpose and the implementation are pretty simple, implementing another one from the scratch is better. Implement such another artificial memory access program that allocates user-defined number/size regions and accesses even-numbered regions. Link: https://lkml.kernel.org/r/20240625180538.73134-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 +- tools/testing/selftests/damon/access_memory_even.c | 42 ++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/damon/access_memory_even.c (limited to 'tools') diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 29a22f50e762..7b972b5cf487 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -4,7 +4,7 @@ TEST_GEN_FILES += huge_count_read_write TEST_GEN_FILES += debugfs_target_ids_read_before_terminate_race TEST_GEN_FILES += debugfs_target_ids_pid_leak -TEST_GEN_FILES += access_memory +TEST_GEN_FILES += access_memory access_memory_even TEST_FILES = _chk_dependency.sh _debugfs_common.sh diff --git a/tools/testing/selftests/damon/access_memory_even.c b/tools/testing/selftests/damon/access_memory_even.c new file mode 100644 index 000000000000..3be121487432 --- /dev/null +++ b/tools/testing/selftests/damon/access_memory_even.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Artificial memory access program for testing DAMON. + * + * Receives number of regions and size of each region from user. Allocate the + * regions and repeatedly access even numbered (starting from zero) regions. + */ + +#include +#include +#include +#include + +int main(int argc, char *argv[]) +{ + char **regions; + clock_t start_clock; + int nr_regions; + int sz_region; + int access_time_ms; + int i; + + if (argc != 3) { + printf("Usage: %s \n", argv[0]); + return -1; + } + + nr_regions = atoi(argv[1]); + sz_region = atoi(argv[2]); + + regions = malloc(sizeof(*regions) * nr_regions); + for (i = 0; i < nr_regions; i++) + regions[i] = malloc(sz_region); + + while (1) { + for (i = 0; i < nr_regions; i++) { + if (i % 2 == 0) + memset(regions[i], i, sz_region); + } + } + return 0; +} -- cgit v1.2.3 From c9a3003a358df357b1eba44b05a5bb164621d5f5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Jun 2024 11:05:34 -0700 Subject: selftests/damon: implement DAMOS tried regions test Implement a test for DAMOS tried regions command of DAMON sysfs interface. It ensures the expected number of monitoring regions are created using an artificial memory access pattern generator program. Link: https://lkml.kernel.org/r/20240625180538.73134-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 1 + .../testing/selftests/damon/damos_tried_regions.py | 65 ++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 tools/testing/selftests/damon/damos_tried_regions.py (limited to 'tools') diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 7b972b5cf487..356b4e9a515d 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -13,6 +13,7 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += sysfs.sh TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py TEST_PROGS += damos_quota.py damos_quota_goal.py damos_apply_interval.py +TEST_PROGS += damos_tried_regions.py TEST_PROGS += reclaim.sh lru_sort.sh # regression tests (reproducers of previously found bugs) diff --git a/tools/testing/selftests/damon/damos_tried_regions.py b/tools/testing/selftests/damon/damos_tried_regions.py new file mode 100644 index 000000000000..3b347eb28bd2 --- /dev/null +++ b/tools/testing/selftests/damon/damos_tried_regions.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +import time + +import _damon_sysfs + +def main(): + # repeatedly access even-numbered ones in 14 regions of 10 MiB size + sz_region = 10 * 1024 * 1024 + proc = subprocess.Popen(['./access_memory_even', '14', '%d' % sz_region]) + + # stat every monitored regions + kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + ops='vaddr', + targets=[_damon_sysfs.DamonTarget(pid=proc.pid)], + schemes=[_damon_sysfs.Damos(action='stat', + )] # schemes + )] # contexts + )]) # kdamonds + + err = kdamonds.start() + if err is not None: + proc.terminate() + print('kdamond start failed: %s' % err) + exit(1) + + collected_nr_regions = [] + while proc.poll() is None: + time.sleep(0.1) + err = kdamonds.kdamonds[0].update_schemes_tried_regions() + if err is not None: + proc.terminate() + print('tried regions update failed: %s' % err) + exit(1) + + scheme = kdamonds.kdamonds[0].contexts[0].schemes[0] + if scheme.tried_regions is None: + proc.terminate() + print('tried regions is not collected') + exit(1) + + nr_tried_regions = len(scheme.tried_regions) + if nr_tried_regions <= 0: + proc.terminate() + print('tried regions is not created') + exit(1) + collected_nr_regions.append(nr_tried_regions) + if len(collected_nr_regions) > 10: + break + proc.terminate() + + collected_nr_regions.sort() + sample = collected_nr_regions[4] + print('50-th percentile nr_regions: %d' % sample) + print('expectation (>= 14) is %s' % 'met' if sample >= 14 else 'not met') + if collected_nr_regions[4] < 14: + print('full nr_regions:') + print('\n'.join(collected_nr_regions)) + exit(1) + +if __name__ == '__main__': + main() -- cgit v1.2.3 From f60636047a6cc2a3076cb202b462cff4347c07d6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Jun 2024 11:05:35 -0700 Subject: selftests/damon/_damon_sysfs: implement kdamonds stop function Implement DAMON stop function on the test-purpose DAMON sysfs interface wrapper Python module, _damon_sysfs.py. This feature will be used by future DAMON tests that need to start/stop DAMON multiple times. Link: https://lkml.kernel.org/r/20240625180538.73134-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index f975742f29b3..17352b9d204d 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -406,6 +406,10 @@ class Kdamond: err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'on') return err + def stop(self): + err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'off') + return err + def update_schemes_tried_regions(self): err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'update_schemes_tried_regions') @@ -511,3 +515,10 @@ class Kdamonds: if err is not None: return err return None + + def stop(self): + for kdamond in self.kdamonds: + err = kdamond.stop() + if err is not None: + return err + return None -- cgit v1.2.3 From 781497347d1bcb8c95fca8356575cfd42fff1354 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Jun 2024 11:05:36 -0700 Subject: selftests/damon: implement test for min/max_nr_regions Implement a kselftest for DAMON's {min,max}_nr_regions' parameters. The test ensures both the minimum and the maximum number of regions limit is respected even if the workload's real number of regions is less than the minimum or larger than the maximum limits. Link: https://lkml.kernel.org/r/20240625180538.73134-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 +- tools/testing/selftests/damon/damon_nr_regions.py | 85 +++++++++++++++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/damon/damon_nr_regions.py (limited to 'tools') diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 356b4e9a515d..1e2e98cc809d 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -13,7 +13,7 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += sysfs.sh TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py TEST_PROGS += damos_quota.py damos_quota_goal.py damos_apply_interval.py -TEST_PROGS += damos_tried_regions.py +TEST_PROGS += damos_tried_regions.py damon_nr_regions.py TEST_PROGS += reclaim.sh lru_sort.sh # regression tests (reproducers of previously found bugs) diff --git a/tools/testing/selftests/damon/damon_nr_regions.py b/tools/testing/selftests/damon/damon_nr_regions.py new file mode 100644 index 000000000000..dd2735923c59 --- /dev/null +++ b/tools/testing/selftests/damon/damon_nr_regions.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +import time + +import _damon_sysfs + +def test_nr_regions(real_nr_regions, min_nr_regions, max_nr_regions): + ''' + Create process of the given 'real_nr_regions' regions, monitor it using + DAMON with given '{min,max}_nr_regions' monitoring parameter. + + Exit with non-zero return code if the given {min,max}_nr_regions is not + kept. + ''' + sz_region = 10 * 1024 * 1024 + proc = subprocess.Popen(['./access_memory_even', '%d' % real_nr_regions, + '%d' % sz_region]) + + # stat every monitored regions + kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + monitoring_attrs=_damon_sysfs.DamonAttrs( + min_nr_regions=min_nr_regions, + max_nr_regions=max_nr_regions), + ops='vaddr', + targets=[_damon_sysfs.DamonTarget(pid=proc.pid)], + schemes=[_damon_sysfs.Damos(action='stat', + )] # schemes + )] # contexts + )]) # kdamonds + + err = kdamonds.start() + if err is not None: + proc.terminate() + print('kdamond start failed: %s' % err) + exit(1) + + collected_nr_regions = [] + while proc.poll() is None: + time.sleep(0.1) + err = kdamonds.kdamonds[0].update_schemes_tried_regions() + if err is not None: + proc.terminate() + print('tried regions update failed: %s' % err) + exit(1) + + scheme = kdamonds.kdamonds[0].contexts[0].schemes[0] + if scheme.tried_regions is None: + proc.terminate() + print('tried regions is not collected') + exit(1) + + nr_tried_regions = len(scheme.tried_regions) + if nr_tried_regions <= 0: + proc.terminate() + print('tried regions is not created') + exit(1) + collected_nr_regions.append(nr_tried_regions) + if len(collected_nr_regions) > 10: + break + proc.terminate() + kdamonds.stop() + + test_name = 'nr_regions test with %d/%d/%d real/min/max nr_regions' % ( + real_nr_regions, min_nr_regions, max_nr_regions) + if (collected_nr_regions[0] < min_nr_regions or + collected_nr_regions[-1] > max_nr_regions): + print('fail %s' % test_name) + print('number of regions that collected are:') + for nr in collected_nr_regions: + print(nr) + exit(1) + print('pass %s ' % test_name) + +def main(): + # test min_nr_regions larger than real nr regions + test_nr_regions(10, 20, 100) + + # test max_nr_regions smaller than real nr regions + test_nr_regions(15, 3, 10) + +if __name__ == '__main__': + main() -- cgit v1.2.3 From 5ac9adecf0cf07b99d4eeda24a89d27447acd860 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Jun 2024 11:05:37 -0700 Subject: _damon_sysfs: implement commit() for online parameters update Users can update DAMON parameters while it is running, using 'commit' DAMON sysfs interface command. For testing the feature in future tests, implement a function for doing that on the test-purpose DAMON sysfs interface wrapper Python module. Link: https://lkml.kernel.org/r/20240625180538.73134-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 17352b9d204d..6e136dc3df19 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -481,6 +481,25 @@ class Kdamond: goal.effective_bytes = int(content) return None + def commit(self): + nr_contexts_file = os.path.join(self.sysfs_dir(), + 'contexts', 'nr_contexts') + content, err = read_file(nr_contexts_file) + if err is not None: + return err + if int(content) != len(self.contexts): + err = write_file(nr_contexts_file, '%d' % len(self.contexts)) + if err is not None: + return err + + for context in self.contexts: + err = context.stage() + if err is not None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'commit') + return err + + def commit_schemes_quota_goals(self): for context in self.contexts: for scheme in context.schemes: -- cgit v1.2.3 From 8bf890c8161215bd1fbd1a6d9b61acbb8eac165b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Jun 2024 11:05:38 -0700 Subject: selftests/damon/damon_nr_regions: test online-tuned max_nr_regions User could update max_nr_regions parameter while DAMON is running to a value that smaller than the current number of regions that DAMON is seeing. Such update could be done for reducing the monitoring overhead. In the case, DAMON should merge regions aggressively more than normal situation to ensure the new limit is successfully applied. Implement a kselftest to ensure that. Link: https://lkml.kernel.org/r/20240625180538.73134-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/damon_nr_regions.py | 60 +++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/damon/damon_nr_regions.py b/tools/testing/selftests/damon/damon_nr_regions.py index dd2735923c59..2e8a74aff543 100644 --- a/tools/testing/selftests/damon/damon_nr_regions.py +++ b/tools/testing/selftests/damon/damon_nr_regions.py @@ -81,5 +81,65 @@ def main(): # test max_nr_regions smaller than real nr regions test_nr_regions(15, 3, 10) + # test online-tuned max_nr_regions that smaller than real nr regions + sz_region = 10 * 1024 * 1024 + proc = subprocess.Popen(['./access_memory_even', '14', '%d' % sz_region]) + + # stat every monitored regions + kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + monitoring_attrs=_damon_sysfs.DamonAttrs( + min_nr_regions=10, max_nr_regions=1000), + ops='vaddr', + targets=[_damon_sysfs.DamonTarget(pid=proc.pid)], + schemes=[_damon_sysfs.Damos(action='stat', + )] # schemes + )] # contexts + )]) # kdamonds + + err = kdamonds.start() + if err is not None: + proc.terminate() + print('kdamond start failed: %s' % err) + exit(1) + + # wait until the real regions are found + time.sleep(3) + + attrs = kdamonds.kdamonds[0].contexts[0].monitoring_attrs + attrs.min_nr_regions = 3 + attrs.max_nr_regions = 7 + err = kdamonds.kdamonds[0].commit() + if err is not None: + proc.terminate() + print('commit failed: %s' % err) + exit(1) + # wait for next merge operation is executed + time.sleep(0.3) + + err = kdamonds.kdamonds[0].update_schemes_tried_regions() + if err is not None: + proc.terminate() + print('tried regions update failed: %s' % err) + exit(1) + + scheme = kdamonds.kdamonds[0].contexts[0].schemes[0] + if scheme.tried_regions is None: + proc.terminate() + print('tried regions is not collected') + exit(1) + + nr_tried_regions = len(scheme.tried_regions) + if nr_tried_regions <= 0: + proc.terminate() + print('tried regions is not created') + exit(1) + proc.terminate() + + if nr_tried_regions > 7: + print('fail online-tuned max_nr_regions: %d > 7' % nr_tried_regions) + exit(1) + print('pass online-tuned max_nr_regions') + if __name__ == '__main__': main() -- cgit v1.2.3 From a47a7af9b5118442dcfd2214a810033948a579ec Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Wed, 26 Jun 2024 09:05:12 -0400 Subject: mm: update uffd-stress to handle EINVAL for unset config features Now that we have updated userfaultfd_api to correctly return EINVAL when a feature is requested but not available, let's fix the uffd-stress test to only set the UFFD_FEATURE_WP_UNPOPULATED feature when the config is set. In addition, still run the test if the CONFIG_PTE_MARKER_UFFD_WP is not set, just dont use the corresponding UFFD_FEATURE_WP_UNPOPULATED feature. Link: https://lkml.kernel.org/r/20240626130513.120193-2-audra@redhat.com Signed-off-by: Audra Mitchell Cc: Al Viro Cc: Andrea Arcangeli Cc: Christian Brauner Cc: Jan Kara Cc: Mike Rapoport Cc: Peter Xu Cc: Rafael Aquini Cc: Shaohua Li Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-stress.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index 0abb9af0fc7f..cc8e0fd0da1c 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -36,6 +36,8 @@ #include #include "uffd-common.h" +uint64_t features; + #define BOUNCE_RANDOM (1<<0) #define BOUNCE_RACINGFAULTS (1<<1) #define BOUNCE_VERIFY (1<<2) @@ -245,10 +247,14 @@ static int userfaultfd_stress(void) unsigned long nr; struct uffd_args args[nr_cpus]; uint64_t mem_size = nr_pages * page_size; + int flags = 0; memset(args, 0, sizeof(struct uffd_args) * nr_cpus); - if (uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED, NULL)) + if (features & UFFD_FEATURE_WP_UNPOPULATED && test_type == TEST_ANON) + flags = UFFD_FEATURE_WP_UNPOPULATED; + + if (uffd_test_ctx_init(flags, NULL)) err("context init failed"); if (posix_memalign(&area, page_size, page_size)) @@ -383,8 +389,6 @@ static void set_test_type(const char *type) static void parse_test_type_arg(const char *raw_type) { - uint64_t features = UFFD_API_FEATURES; - set_test_type(raw_type); if (!test_type) @@ -407,8 +411,8 @@ static void parse_test_type_arg(const char *raw_type) * feature. */ - if (userfaultfd_open(&features)) - err("Userfaultfd open failed"); + if (uffd_get_features(&features)) + err("failed to get available features"); test_uffdio_wp = test_uffdio_wp && (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); -- cgit v1.2.3 From a591d35c40237df735e8ffb4792acb6e56bbca48 Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Wed, 26 Jun 2024 09:05:13 -0400 Subject: mm: turn off test_uffdio_wp if CONFIG_PTE_MARKER_UFFD_WP is not configured. If CONFIG_PTE_MARKER_UFFD_WP is disabled, then we turn off three features in userfaultfd_api (UFFD_FEATURE_WP_HUGETLBFS_SHMEM, UFFD_FEATURE_WP_UNPOPULATED, and UFFD_FEATURE_WP_ASYNC). Currently this test always will call uffdio_regsiter with the flag UFFDIO_REGISTER_MODE_WP. However, the kernel ensures in vma_can_userfault that if the feature UFFD_FEATURE_WP_HUGETLBFS_SHMEM is disabled, only allow the VM_UFFD_WP on anonymous vmas, meaning our call to uffdio_regsiter will fail. We still want to be able to run the test even if we have CONFIG_PTE_MARKER_UFFD_WP disabled, so check to see if the feature UFFD_FEATURE_WP_HUGETLBFS_SHMEM has been turned off in the test and if so, disable us from calling uffdio_regsiter with the flag UFFDIO_REGISTER_MODE_WP. Link: https://lkml.kernel.org/r/20240626130513.120193-3-audra@redhat.com Signed-off-by: Audra Mitchell Reviewed-by: Peter Xu Cc: Al Viro Cc: Andrea Arcangeli Cc: Christian Brauner Cc: Jan Kara Cc: Mike Rapoport Cc: Rafael Aquini Cc: Shaohua Li Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-stress.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index cc8e0fd0da1c..a4b83280998a 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -417,6 +417,9 @@ static void parse_test_type_arg(const char *raw_type) test_uffdio_wp = test_uffdio_wp && (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); + if (test_type != TEST_ANON && !(features & UFFD_FEATURE_WP_HUGETLBFS_SHMEM)) + test_uffdio_wp = false; + close(uffd); uffd = -1; } -- cgit v1.2.3 From 72ead83dad5c9f7a3ac00d4ee5f92a0794836bb1 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Wed, 26 Jun 2024 05:08:17 +0000 Subject: selftest/mm: test enable_soft_offline behaviors Add regression and new tests when hugepage has correctable memory errors, and how userspace wants to deal with it: * if enable_soft_offline=1, mapped hugepage is soft offlined * if enable_soft_offline=0, mapped hugepage is intact Free hugepages case is not explicitly covered by the tests. Hugepage having corrected memory errors is emulated with MADV_SOFT_OFFLINE. [jiaqiyan@google.com: v7] Link: https://lkml.kernel.org/r/20240628205958.2845610-4-jiaqiyan@google.com Link: https://lkml.kernel.org/r/20240626050818.2277273-4-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Acked-by: Miaohe Lin Acked-by: David Rientjes Cc: Frank van der Linden Cc: Jane Chu Cc: Jonathan Corbet Cc: Lance Yang Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Randy Dunlap Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/hugetlb-soft-offline.c | 228 ++++++++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 6 + 4 files changed, 236 insertions(+) create mode 100644 tools/testing/selftests/mm/hugetlb-soft-offline.c (limited to 'tools') diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 0b9ab987601c..064e7b125643 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -6,6 +6,7 @@ hugepage-shm hugepage-vmemmap hugetlb-madvise hugetlb-read-hwpoison +hugetlb-soft-offline khugepaged map_hugetlb map_populate diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 0d96c6123636..e1aa09ddaa3d 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -43,6 +43,7 @@ TEST_GEN_FILES += gup_test TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugetlb-madvise TEST_GEN_FILES += hugetlb-read-hwpoison +TEST_GEN_FILES += hugetlb-soft-offline TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-mremap TEST_GEN_FILES += hugepage-shm diff --git a/tools/testing/selftests/mm/hugetlb-soft-offline.c b/tools/testing/selftests/mm/hugetlb-soft-offline.c new file mode 100644 index 000000000000..f086f0e04756 --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb-soft-offline.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test soft offline behavior for HugeTLB pages: + * - if enable_soft_offline = 0, hugepages should stay intact and soft + * offlining failed with EOPNOTSUPP. + * - if enable_soft_offline = 1, a hugepage should be dissolved and + * nr_hugepages/free_hugepages should be reduced by 1. + * + * Before running, make sure more than 2 hugepages of default_hugepagesz + * are allocated. For example, if /proc/meminfo/Hugepagesize is 2048kB: + * echo 8 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#ifndef MADV_SOFT_OFFLINE +#define MADV_SOFT_OFFLINE 101 +#endif + +#define EPREFIX " !!! " + +static int do_soft_offline(int fd, size_t len, int expect_errno) +{ + char *filemap = NULL; + char *hwp_addr = NULL; + const unsigned long pagesize = getpagesize(); + int ret = 0; + + if (ftruncate(fd, len) < 0) { + ksft_perror(EPREFIX "ftruncate to len failed"); + return -1; + } + + filemap = mmap(NULL, len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (filemap == MAP_FAILED) { + ksft_perror(EPREFIX "mmap failed"); + ret = -1; + goto untruncate; + } + + memset(filemap, 0xab, len); + ksft_print_msg("Allocated %#lx bytes of hugetlb pages\n", len); + + hwp_addr = filemap + len / 2; + ret = madvise(hwp_addr, pagesize, MADV_SOFT_OFFLINE); + ksft_print_msg("MADV_SOFT_OFFLINE %p ret=%d, errno=%d\n", + hwp_addr, ret, errno); + if (ret != 0) + ksft_perror(EPREFIX "madvise failed"); + + if (errno == expect_errno) + ret = 0; + else { + ksft_print_msg("MADV_SOFT_OFFLINE should ret %d\n", + expect_errno); + ret = -1; + } + + munmap(filemap, len); +untruncate: + if (ftruncate(fd, 0) < 0) + ksft_perror(EPREFIX "ftruncate back to 0 failed"); + + return ret; +} + +static int set_enable_soft_offline(int value) +{ + char cmd[256] = {0}; + FILE *cmdfile = NULL; + + if (value != 0 && value != 1) + return -EINVAL; + + sprintf(cmd, "echo %d > /proc/sys/vm/enable_soft_offline", value); + cmdfile = popen(cmd, "r"); + + if (cmdfile) + ksft_print_msg("enable_soft_offline => %d\n", value); + else { + ksft_perror(EPREFIX "failed to set enable_soft_offline"); + return errno; + } + + pclose(cmdfile); + return 0; +} + +static int read_nr_hugepages(unsigned long hugepage_size, + unsigned long *nr_hugepages) +{ + char buffer[256] = {0}; + char cmd[256] = {0}; + + sprintf(cmd, "cat /sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages", + hugepage_size); + FILE *cmdfile = popen(cmd, "r"); + + if (cmdfile == NULL) { + ksft_perror(EPREFIX "failed to popen nr_hugepages"); + return -1; + } + + if (!fgets(buffer, sizeof(buffer), cmdfile)) { + ksft_perror(EPREFIX "failed to read nr_hugepages"); + pclose(cmdfile); + return -1; + } + + *nr_hugepages = atoll(buffer); + pclose(cmdfile); + return 0; +} + +static int create_hugetlbfs_file(struct statfs *file_stat) +{ + int fd; + + fd = memfd_create("hugetlb_tmp", MFD_HUGETLB); + if (fd < 0) { + ksft_perror(EPREFIX "could not open hugetlbfs file"); + return -1; + } + + memset(file_stat, 0, sizeof(*file_stat)); + if (fstatfs(fd, file_stat)) { + ksft_perror(EPREFIX "fstatfs failed"); + goto close; + } + if (file_stat->f_type != HUGETLBFS_MAGIC) { + ksft_print_msg(EPREFIX "not hugetlbfs file\n"); + goto close; + } + + return fd; +close: + close(fd); + return -1; +} + +static void test_soft_offline_common(int enable_soft_offline) +{ + int fd; + int expect_errno = enable_soft_offline ? 0 : EOPNOTSUPP; + struct statfs file_stat; + unsigned long hugepagesize_kb = 0; + unsigned long nr_hugepages_before = 0; + unsigned long nr_hugepages_after = 0; + int ret; + + ksft_print_msg("Test soft-offline when enabled_soft_offline=%d\n", + enable_soft_offline); + + fd = create_hugetlbfs_file(&file_stat); + if (fd < 0) + ksft_exit_fail_msg("Failed to create hugetlbfs file\n"); + + hugepagesize_kb = file_stat.f_bsize / 1024; + ksft_print_msg("Hugepagesize is %ldkB\n", hugepagesize_kb); + + if (set_enable_soft_offline(enable_soft_offline) != 0) { + close(fd); + ksft_exit_fail_msg("Failed to set enable_soft_offline\n"); + } + + if (read_nr_hugepages(hugepagesize_kb, &nr_hugepages_before) != 0) { + close(fd); + ksft_exit_fail_msg("Failed to read nr_hugepages\n"); + } + + ksft_print_msg("Before MADV_SOFT_OFFLINE nr_hugepages=%ld\n", + nr_hugepages_before); + + ret = do_soft_offline(fd, 2 * file_stat.f_bsize, expect_errno); + + if (read_nr_hugepages(hugepagesize_kb, &nr_hugepages_after) != 0) { + close(fd); + ksft_exit_fail_msg("Failed to read nr_hugepages\n"); + } + + ksft_print_msg("After MADV_SOFT_OFFLINE nr_hugepages=%ld\n", + nr_hugepages_after); + + // No need for the hugetlbfs file from now on. + close(fd); + + if (enable_soft_offline) { + if (nr_hugepages_before != nr_hugepages_after + 1) { + ksft_test_result_fail("MADV_SOFT_OFFLINE should reduced 1 hugepage\n"); + return; + } + } else { + if (nr_hugepages_before != nr_hugepages_after) { + ksft_test_result_fail("MADV_SOFT_OFFLINE reduced %lu hugepages\n", + nr_hugepages_before - nr_hugepages_after); + return; + } + } + + ksft_test_result(ret == 0, + "Test soft-offline when enabled_soft_offline=%d\n", + enable_soft_offline); +} + +int main(int argc, char **argv) +{ + ksft_print_header(); + ksft_set_plan(2); + + test_soft_offline_common(1); + test_soft_offline_common(0); + + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 5698d519170d..03ac4f2e1cce 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -332,6 +332,12 @@ CATEGORY="hugetlb" run_test ./thuge-gen CATEGORY="hugetlb" run_test ./charge_reserved_hugetlb.sh -cgroup-v2 CATEGORY="hugetlb" run_test ./hugetlb_reparenting_test.sh -cgroup-v2 if $RUN_DESTRUCTIVE; then +nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages) +enable_soft_offline=$(cat /proc/sys/vm/enable_soft_offline) +echo 8 > /proc/sys/vm/nr_hugepages +CATEGORY="hugetlb" run_test ./hugetlb-soft-offline +echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages +echo "$enable_soft_offline" > /proc/sys/vm/enable_soft_offline CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison fi -- cgit v1.2.3 From 95139d9408453df05dc4dfde37a0eb70afae0f81 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 22 Jun 2024 19:12:31 +1200 Subject: tools/mm: introduce a tool to assess swap entry allocation for thp_swapout Both Ryan and Chris have been utilizing the small test program to aid in debugging and identifying issues with swap entry allocation. While a real or intricate workload might be more suitable for assessing the correctness and effectiveness of the swap allocation policy, a small test program presents a simpler means of understanding the problem and initially verifying the improvements being made. Let's endeavor to integrate it into tools/mm. Although it presently only accommodates 64KB and 4KB, I'm optimistic that we can expand its capabilities to support multiple sizes and simulate more complex systems in the future as required. Basically, we have 1. Use MADV_PAGEPUT for rapid swap-out, putting the swap allocation code under high exercise in a short time. 2. Use MADV_DONTNEED to simulate the behavior of libc and Java heap in freeing memory, as well as for munmap, app exits, or OOM killer scenarios. This ensures new mTHP is always generated, released or swapped out, similar to the behavior on a PC or Android phone where many applications are frequently started and terminated. 3. Swap in with or without the "-a" option to observe how fragments due to swap-in and the incoming swap-in of large folios will impact swap-out fallback. Due to 2, we ensure a certain proportion of mTHP. Similarly, because of 3, we maintain a certain proportion of small folios, as we don't support large folios swap-in, meaning any swap-in will immediately result in small folios. Therefore, with both 2 and 3, we automatically achieve a system containing both mTHP and small folios. Additionally, 1 provides the ability to continuously swap them out. We can also use "-s" to add a dedicated small folios memory area. [akpm@linux-foundation.org: thp_swap_allocator_test.c needs mman.h, per Kairui Song] Link: https://lkml.kernel.org/r/20240622071231.576056-2-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: Chris Li Tested-by: Chris Li Reviewed-by: Ryan Roberts Tested-by: Ryan Roberts Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kairui Song Cc: Kalesh Singh Signed-off-by: Andrew Morton --- tools/mm/Makefile | 2 +- tools/mm/thp_swap_allocator_test.c | 234 +++++++++++++++++++++++++++++++++++++ 2 files changed, 235 insertions(+), 1 deletion(-) create mode 100644 tools/mm/thp_swap_allocator_test.c (limited to 'tools') diff --git a/tools/mm/Makefile b/tools/mm/Makefile index 7bb03606b9ea..15791c1c5b28 100644 --- a/tools/mm/Makefile +++ b/tools/mm/Makefile @@ -3,7 +3,7 @@ # include ../scripts/Makefile.include -BUILD_TARGETS=page-types slabinfo page_owner_sort +BUILD_TARGETS=page-types slabinfo page_owner_sort thp_swap_allocator_test INSTALL_TARGETS = $(BUILD_TARGETS) thpmaps LIB_DIR = ../lib/api diff --git a/tools/mm/thp_swap_allocator_test.c b/tools/mm/thp_swap_allocator_test.c new file mode 100644 index 000000000000..83afc52275a5 --- /dev/null +++ b/tools/mm/thp_swap_allocator_test.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * thp_swap_allocator_test + * + * The purpose of this test program is helping check if THP swpout + * can correctly get swap slots to swap out as a whole instead of + * being split. It randomly releases swap entries through madvise + * DONTNEED and swapin/out on two memory areas: a memory area for + * 64KB THP and the other area for small folios. The second memory + * can be enabled by "-s". + * Before running the program, we need to setup a zRAM or similar + * swap device by: + * echo lzo > /sys/block/zram0/comp_algorithm + * echo 64M > /sys/block/zram0/disksize + * echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled + * echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled + * mkswap /dev/zram0 + * swapon /dev/zram0 + * The expected result should be 0% anon swpout fallback ratio w/ or + * w/o "-s". + * + * Author(s): Barry Song + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEMSIZE_MTHP (60 * 1024 * 1024) +#define MEMSIZE_SMALLFOLIO (4 * 1024 * 1024) +#define ALIGNMENT_MTHP (64 * 1024) +#define ALIGNMENT_SMALLFOLIO (4 * 1024) +#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024) +#define TOTAL_DONTNEED_SMALLFOLIO (1 * 1024 * 1024) +#define MTHP_FOLIO_SIZE (64 * 1024) + +#define SWPOUT_PATH \ + "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout" +#define SWPOUT_FALLBACK_PATH \ + "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback" + +static void *aligned_alloc_mem(size_t size, size_t alignment) +{ + void *mem = NULL; + + if (posix_memalign(&mem, alignment, size) != 0) { + perror("posix_memalign"); + return NULL; + } + return mem; +} + +/* + * This emulates the behavior of native libc and Java heap, + * as well as process exit and munmap. It helps generate mTHP + * and ensures that iterations can proceed with mTHP, as we + * currently don't support large folios swap-in. + */ +static void random_madvise_dontneed(void *mem, size_t mem_size, + size_t align_size, size_t total_dontneed_size) +{ + size_t num_pages = total_dontneed_size / align_size; + size_t i; + size_t offset; + void *addr; + + for (i = 0; i < num_pages; ++i) { + offset = (rand() % (mem_size / align_size)) * align_size; + addr = (char *)mem + offset; + if (madvise(addr, align_size, MADV_DONTNEED) != 0) + perror("madvise dontneed"); + + memset(addr, 0x11, align_size); + } +} + +static void random_swapin(void *mem, size_t mem_size, + size_t align_size, size_t total_swapin_size) +{ + size_t num_pages = total_swapin_size / align_size; + size_t i; + size_t offset; + void *addr; + + for (i = 0; i < num_pages; ++i) { + offset = (rand() % (mem_size / align_size)) * align_size; + addr = (char *)mem + offset; + memset(addr, 0x11, align_size); + } +} + +static unsigned long read_stat(const char *path) +{ + FILE *file; + unsigned long value; + + file = fopen(path, "r"); + if (!file) { + perror("fopen"); + return 0; + } + + if (fscanf(file, "%lu", &value) != 1) { + perror("fscanf"); + fclose(file); + return 0; + } + + fclose(file); + return value; +} + +int main(int argc, char *argv[]) +{ + int use_small_folio = 0, aligned_swapin = 0; + void *mem1 = NULL, *mem2 = NULL; + int i; + + for (i = 1; i < argc; ++i) { + if (strcmp(argv[i], "-s") == 0) + use_small_folio = 1; + else if (strcmp(argv[i], "-a") == 0) + aligned_swapin = 1; + } + + mem1 = aligned_alloc_mem(MEMSIZE_MTHP, ALIGNMENT_MTHP); + if (mem1 == NULL) { + fprintf(stderr, "Failed to allocate large folios memory\n"); + return EXIT_FAILURE; + } + + if (madvise(mem1, MEMSIZE_MTHP, MADV_HUGEPAGE) != 0) { + perror("madvise hugepage for mem1"); + free(mem1); + return EXIT_FAILURE; + } + + if (use_small_folio) { + mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_MTHP); + if (mem2 == NULL) { + fprintf(stderr, "Failed to allocate small folios memory\n"); + free(mem1); + return EXIT_FAILURE; + } + + if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_NOHUGEPAGE) != 0) { + perror("madvise nohugepage for mem2"); + free(mem1); + free(mem2); + return EXIT_FAILURE; + } + } + + /* warm-up phase to occupy the swapfile */ + memset(mem1, 0x11, MEMSIZE_MTHP); + madvise(mem1, MEMSIZE_MTHP, MADV_PAGEOUT); + if (use_small_folio) { + memset(mem2, 0x11, MEMSIZE_SMALLFOLIO); + madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_PAGEOUT); + } + + /* iterations with newly created mTHP, swap-in, and swap-out */ + for (i = 0; i < 100; ++i) { + unsigned long initial_swpout; + unsigned long initial_swpout_fallback; + unsigned long final_swpout; + unsigned long final_swpout_fallback; + unsigned long swpout_inc; + unsigned long swpout_fallback_inc; + double fallback_percentage; + + initial_swpout = read_stat(SWPOUT_PATH); + initial_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH); + + /* + * The following setup creates a 1:1 ratio of mTHP to small folios + * since large folio swap-in isn't supported yet. Once we support + * mTHP swap-in, we'll likely need to reduce MEMSIZE_MTHP and + * increase MEMSIZE_SMALLFOLIO to maintain the ratio. + */ + random_swapin(mem1, MEMSIZE_MTHP, + aligned_swapin ? ALIGNMENT_MTHP : ALIGNMENT_SMALLFOLIO, + TOTAL_DONTNEED_MTHP); + random_madvise_dontneed(mem1, MEMSIZE_MTHP, ALIGNMENT_MTHP, + TOTAL_DONTNEED_MTHP); + + if (use_small_folio) { + random_swapin(mem2, MEMSIZE_SMALLFOLIO, + ALIGNMENT_SMALLFOLIO, + TOTAL_DONTNEED_SMALLFOLIO); + } + + if (madvise(mem1, MEMSIZE_MTHP, MADV_PAGEOUT) != 0) { + perror("madvise pageout for mem1"); + free(mem1); + if (mem2 != NULL) + free(mem2); + return EXIT_FAILURE; + } + + if (use_small_folio) { + if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_PAGEOUT) != 0) { + perror("madvise pageout for mem2"); + free(mem1); + free(mem2); + return EXIT_FAILURE; + } + } + + final_swpout = read_stat(SWPOUT_PATH); + final_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH); + + swpout_inc = final_swpout - initial_swpout; + swpout_fallback_inc = final_swpout_fallback - initial_swpout_fallback; + + fallback_percentage = (double)swpout_fallback_inc / + (swpout_fallback_inc + swpout_inc) * 100; + + printf("Iteration %d: swpout inc: %lu, swpout fallback inc: %lu, Fallback percentage: %.2f%%\n", + i + 1, swpout_inc, swpout_fallback_inc, fallback_percentage); + } + + free(mem1); + if (mem2 != NULL) + free(mem2); + + return EXIT_SUCCESS; +} -- cgit v1.2.3 From cc937dad85aea4ab9e4f9827d7ea55932c86906b Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Tue, 25 Jun 2024 22:34:45 +0000 Subject: selftests: centralize -D_GNU_SOURCE= to CFLAGS in lib.mk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Centralize the _GNU_SOURCE definition to CFLAGS in lib.mk. Remove redundant defines from Makefiles that import lib.mk. Convert any usage of "#define _GNU_SOURCE 1" to "#define _GNU_SOURCE". This uses the form "-D_GNU_SOURCE=", which is equivalent to "#define _GNU_SOURCE". Otherwise using "-D_GNU_SOURCE" is equivalent to "-D_GNU_SOURCE=1" and "#define _GNU_SOURCE 1", which is less commonly seen in source code and would require many changes in selftests to avoid redefinition warnings. Link: https://lkml.kernel.org/r/20240625223454.1586259-2-edliaw@google.com Signed-off-by: Edward Liaw Suggested-by: John Hubbard Acked-by: Shuah Khan Reviewed-by: Muhammad Usama Anjum Cc: Albert Ou Cc: AndrĂ© Almeida Cc: Darren Hart Cc: Dave Hansen Cc: Davidlohr Bueso Cc: David S. Miller Cc: Eric Dumazet Cc: Eric W. Biederman Cc: Fenghua Yu Cc: Ingo Molnar Cc: Jakub Kicinski Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Kees Cook Cc: Kevin Tian Cc: Palmer Dabbelt Cc: Paolo Abeni Cc: Paolo Bonzini Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Reinette Chatre Cc: Sean Christopherson Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- tools/testing/selftests/exec/Makefile | 1 - tools/testing/selftests/futex/functional/Makefile | 2 +- tools/testing/selftests/intel_pstate/Makefile | 2 +- tools/testing/selftests/iommu/Makefile | 2 -- tools/testing/selftests/kvm/Makefile | 2 +- tools/testing/selftests/lib.mk | 3 +++ tools/testing/selftests/mm/thuge-gen.c | 2 +- tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/tcp_ao/Makefile | 2 +- tools/testing/selftests/proc/Makefile | 1 - tools/testing/selftests/resctrl/Makefile | 2 +- tools/testing/selftests/ring-buffer/Makefile | 1 - tools/testing/selftests/riscv/mm/Makefile | 2 +- tools/testing/selftests/sgx/Makefile | 2 +- tools/testing/selftests/tmpfs/Makefile | 1 - 15 files changed, 12 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index fb4472ddffd8..c5bdb653422b 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS = -Wall CFLAGS += -Wno-nonnull -CFLAGS += -D_GNU_SOURCE TEST_PROGS := binfmt_script.py TEST_GEN_PROGS := execveat load_address_4096 load_address_2097152 load_address_16777216 non-regular diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 994fa3468f17..f79f9bac7918 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 INCLUDES := -I../include -I../../ $(KHDR_INCLUDES) -CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE= -pthread $(INCLUDES) $(KHDR_INCLUDES) +CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread $(INCLUDES) $(KHDR_INCLUDES) LDLIBS := -lpthread -lrt LOCAL_HDRS := \ diff --git a/tools/testing/selftests/intel_pstate/Makefile b/tools/testing/selftests/intel_pstate/Makefile index 05d66ef50c97..f45372cb00fe 100644 --- a/tools/testing/selftests/intel_pstate/Makefile +++ b/tools/testing/selftests/intel_pstate/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS := $(CFLAGS) -Wall -D_GNU_SOURCE +CFLAGS := $(CFLAGS) -Wall LDLIBS += -lm ARCH ?= $(shell uname -m 2>/dev/null || echo not) diff --git a/tools/testing/selftests/iommu/Makefile b/tools/testing/selftests/iommu/Makefile index 32c5fdfd0eef..fd6477911f24 100644 --- a/tools/testing/selftests/iommu/Makefile +++ b/tools/testing/selftests/iommu/Makefile @@ -2,8 +2,6 @@ CFLAGS += -Wall -O2 -Wno-unused-function CFLAGS += $(KHDR_INCLUDES) -CFLAGS += -D_GNU_SOURCE - TEST_GEN_PROGS := TEST_GEN_PROGS += iommufd TEST_GEN_PROGS += iommufd_fail_nth diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index ac280dcba996..4ee37abf70ff 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -231,7 +231,7 @@ LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include endif CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ - -D_GNU_SOURCE -fno-builtin-memcmp -fno-builtin-memcpy \ + -fno-builtin-memcmp -fno-builtin-memcpy \ -fno-builtin-memset -fno-builtin-strnlen \ -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \ diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 429535816dbd..4984c5d8b6ac 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -188,6 +188,9 @@ endef clean: $(if $(TEST_GEN_MODS_DIR),clean_mods_dir) $(CLEAN) +# Build with _GNU_SOURCE by default +CFLAGS += -D_GNU_SOURCE= + # Enables to extend CFLAGS and LDFLAGS from command line, e.g. # make USERCFLAGS=-Werror USERLDFLAGS=-static CFLAGS += $(USERCFLAGS) diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index d50dc71cac32..e4370b79b62f 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -13,7 +13,7 @@ sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m (warning this will remove all if someone else uses them) */ -#define _GNU_SOURCE 1 +#define _GNU_SOURCE #include #include #include diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index d9393569d03a..2ec3154c607a 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for net selftests -CFLAGS = -Wall -Wl,--no-as-needed -O2 -g +CFLAGS += -Wall -Wl,--no-as-needed -O2 -g CFLAGS += -I../../../../usr/include/ $(KHDR_INCLUDES) # Additional include paths needed by kselftest.h CFLAGS += -I../ diff --git a/tools/testing/selftests/net/tcp_ao/Makefile b/tools/testing/selftests/net/tcp_ao/Makefile index 522d991e310e..bd88b90b902b 100644 --- a/tools/testing/selftests/net/tcp_ao/Makefile +++ b/tools/testing/selftests/net/tcp_ao/Makefile @@ -26,7 +26,7 @@ LIB := $(LIBDIR)/libaotst.a LDLIBS += $(LIB) -pthread LIBDEPS := lib/aolib.h Makefile -CFLAGS := -Wall -O2 -g -D_GNU_SOURCE -fno-strict-aliasing +CFLAGS += -Wall -O2 -g -fno-strict-aliasing CFLAGS += $(KHDR_INCLUDES) CFLAGS += -iquote ./lib/ -I ../../../../include/ diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index cd95369254c0..25c34cc9238e 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O2 -Wno-unused-function -CFLAGS += -D_GNU_SOURCE LDFLAGS += -pthread TEST_GEN_PROGS := diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile index 021863f86053..f408bd6bfc3d 100644 --- a/tools/testing/selftests/resctrl/Makefile +++ b/tools/testing/selftests/resctrl/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS = -g -Wall -O2 -D_FORTIFY_SOURCE=2 -D_GNU_SOURCE +CFLAGS = -g -Wall -O2 -D_FORTIFY_SOURCE=2 CFLAGS += $(KHDR_INCLUDES) TEST_GEN_PROGS := resctrl_tests diff --git a/tools/testing/selftests/ring-buffer/Makefile b/tools/testing/selftests/ring-buffer/Makefile index 627c5fa6d1ab..23605782639e 100644 --- a/tools/testing/selftests/ring-buffer/Makefile +++ b/tools/testing/selftests/ring-buffer/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += -Wl,-no-as-needed -Wall CFLAGS += $(KHDR_INCLUDES) -CFLAGS += -D_GNU_SOURCE TEST_GEN_PROGS = map_test diff --git a/tools/testing/selftests/riscv/mm/Makefile b/tools/testing/selftests/riscv/mm/Makefile index c333263f2b27..4664ed79e20b 100644 --- a/tools/testing/selftests/riscv/mm/Makefile +++ b/tools/testing/selftests/riscv/mm/Makefile @@ -3,7 +3,7 @@ # Originally tools/testing/arm64/abi/Makefile # Additional include paths needed by kselftest.h and local headers -CFLAGS += -D_GNU_SOURCE -std=gnu99 -I. +CFLAGS += -std=gnu99 -I. TEST_GEN_FILES := mmap_default mmap_bottomup diff --git a/tools/testing/selftests/sgx/Makefile b/tools/testing/selftests/sgx/Makefile index 867f88ce2570..03b5e13b872b 100644 --- a/tools/testing/selftests/sgx/Makefile +++ b/tools/testing/selftests/sgx/Makefile @@ -12,7 +12,7 @@ OBJCOPY := $(CROSS_COMPILE)objcopy endif INCLUDES := -I$(top_srcdir)/tools/include -HOST_CFLAGS := -Wall -Werror -g $(INCLUDES) -fPIC +HOST_CFLAGS := -Wall -Werror -g $(INCLUDES) -fPIC $(CFLAGS) HOST_LDFLAGS := -z noexecstack -lcrypto ENCL_CFLAGS += -Wall -Werror -static-pie -nostdlib -ffreestanding -fPIE \ -fno-stack-protector -mrdrnd $(INCLUDES) diff --git a/tools/testing/selftests/tmpfs/Makefile b/tools/testing/selftests/tmpfs/Makefile index aa11ccc92e5b..3be931e1193f 100644 --- a/tools/testing/selftests/tmpfs/Makefile +++ b/tools/testing/selftests/tmpfs/Makefile @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O2 -CFLAGS += -D_GNU_SOURCE TEST_GEN_PROGS := TEST_GEN_PROGS += bug-link-o-tmpfile -- cgit v1.2.3 From 3a3b7fec3974f954600844e41d773c00857ef48a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 1 Jul 2024 11:31:15 -0400 Subject: mm: remove CONFIG_MEMCG_KMEM CONFIG_MEMCG_KMEM used to be a user-visible option for whether slab tracking is enabled. It has been default-enabled and equivalent to CONFIG_MEMCG for almost a decade. We've only grown more kernel memory accounting sites since, and there is no imaginable cgroup usecase going forward that wants to track user pages but not the multitude of user-drivable kernel allocations. Link: https://lkml.kernel.org/r/20240701153148.452230-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: David Hildenbrand Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/bpf.h | 4 +-- include/linux/list_lru.h | 2 +- include/linux/memcontrol.h | 22 +++---------- include/linux/sched.h | 3 +- include/linux/slab.h | 12 +++---- include/trace/events/kmem.h | 4 +-- init/Kconfig | 5 --- kernel/bpf/memalloc.c | 9 ++---- kernel/bpf/syscall.c | 6 ++-- mm/kfence/core.c | 6 ++-- mm/kfence/kfence.h | 2 +- mm/list_lru.c | 14 ++++---- mm/memcontrol-v1.c | 6 ++-- mm/memcontrol.c | 60 ++++------------------------------- mm/percpu-internal.h | 6 ++-- mm/percpu.c | 6 ++-- mm/slab.h | 2 +- mm/slab_common.c | 10 +++--- mm/slub.c | 10 +++--- tools/testing/selftests/cgroup/config | 1 - 20 files changed, 59 insertions(+), 131 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5e694a308081..b8637555c9c2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -275,7 +275,7 @@ struct bpf_map { u32 btf_value_type_id; u32 btf_vmlinux_value_type_id; struct btf *btf; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; @@ -2252,7 +2252,7 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 792b67ceb631..5099a8ccd5f4 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -50,7 +50,7 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct list_head list; int shrinker_id; bool memcg_aware; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 60418934827c..7e2eb091049a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -195,7 +195,7 @@ struct mem_cgroup { /* Range enforcement for interrupt charges */ struct work_struct high_work; -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP unsigned long zswap_max; /* @@ -236,7 +236,6 @@ struct mem_cgroup { */ unsigned long socket_pressure; -#ifdef CONFIG_MEMCG_KMEM int kmemcg_id; /* * memcg->objcg is wiped out as a part of the objcg repaprenting @@ -247,7 +246,6 @@ struct mem_cgroup { struct obj_cgroup *orig_objcg; /* list of inherited objcgs, protected by objcg_lock */ struct list_head objcg_list; -#endif struct memcg_vmstats_percpu __percpu *vmstats_percpu; @@ -532,7 +530,6 @@ retry: return memcg; } -#ifdef CONFIG_MEMCG_KMEM /* * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. * @folio: Pointer to the folio. @@ -548,15 +545,6 @@ static inline bool folio_memcg_kmem(struct folio *folio) return folio->memcg_data & MEMCG_DATA_KMEM; } - -#else -static inline bool folio_memcg_kmem(struct folio *folio) -{ - return false; -} - -#endif - static inline bool PageMemcgKmem(struct page *page) { return folio_memcg_kmem(page_folio(page)); @@ -1488,7 +1476,7 @@ static inline void split_page_memcg(struct page *head, int old_order, int new_or * if MEMCG_DATA_OBJEXTS is set. */ struct slabobj_ext { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING @@ -1663,7 +1651,7 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg, } #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG bool mem_cgroup_kmem_disabled(void); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); @@ -1806,9 +1794,9 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); diff --git a/include/linux/sched.h b/include/linux/sched.h index a7770c566c4d..82da65131a6b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1457,9 +1457,8 @@ struct task_struct { /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; -#endif -#ifdef CONFIG_MEMCG_KMEM + /* Cache for current->cgroups->memcg->objcg lookups: */ struct obj_cgroup *objcg; #endif diff --git a/include/linux/slab.h b/include/linux/slab.h index 7247e217e21b..a332dd2fa6cd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -41,7 +41,7 @@ enum _slab_flag_bits { #ifdef CONFIG_FAILSLAB _SLAB_FAILSLAB, #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG _SLAB_ACCOUNT, #endif #ifdef CONFIG_KASAN_GENERIC @@ -171,7 +171,7 @@ enum _slab_flag_bits { # define SLAB_FAILSLAB __SLAB_FLAG_UNUSED #endif /* Account to memcg */ -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG # define SLAB_ACCOUNT __SLAB_FLAG_BIT(_SLAB_ACCOUNT) #else # define SLAB_ACCOUNT __SLAB_FLAG_UNUSED @@ -407,7 +407,7 @@ enum kmalloc_cache_type { #ifndef CONFIG_ZONE_DMA KMALLOC_DMA = KMALLOC_NORMAL, #endif -#ifndef CONFIG_MEMCG_KMEM +#ifndef CONFIG_MEMCG KMALLOC_CGROUP = KMALLOC_NORMAL, #endif KMALLOC_RANDOM_START = KMALLOC_NORMAL, @@ -420,7 +420,7 @@ enum kmalloc_cache_type { #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG KMALLOC_CGROUP, #endif NR_KMALLOC_TYPES @@ -435,7 +435,7 @@ kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; #define KMALLOC_NOT_NORMAL_BITS \ (__GFP_RECLAIMABLE | \ (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ - (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0)) + (IS_ENABLED(CONFIG_MEMCG) ? __GFP_ACCOUNT : 0)) extern unsigned long random_kmalloc_seed; @@ -463,7 +463,7 @@ static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigne */ if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA)) return KMALLOC_DMA; - if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE)) + if (!IS_ENABLED(CONFIG_MEMCG) || (flags & __GFP_RECLAIMABLE)) return KMALLOC_RECLAIM; else return KMALLOC_CGROUP; diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 8a829e0f6e55..b37eb0a7060f 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -36,7 +36,7 @@ TRACE_EVENT(kmem_cache_alloc, __entry->bytes_alloc = s->size; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->node = node; - __entry->accounted = IS_ENABLED(CONFIG_MEMCG_KMEM) ? + __entry->accounted = IS_ENABLED(CONFIG_MEMCG) ? ((gfp_flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)) : false; ), @@ -87,7 +87,7 @@ TRACE_EVENT(kmalloc, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), __entry->node, - (IS_ENABLED(CONFIG_MEMCG_KMEM) && + (IS_ENABLED(CONFIG_MEMCG) && (__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false") ); diff --git a/init/Kconfig b/init/Kconfig index aca0ae9be04f..26bf8bb0a7ce 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -986,11 +986,6 @@ config MEMCG_V1 San N is unsure. -config MEMCG_KMEM - bool - depends on MEMCG - default y - config BLK_CGROUP bool "IO controller" depends on BLOCK diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index a546aba46d5d..dec892ded031 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -155,12 +155,9 @@ static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (c->objcg) return get_mem_cgroup_from_objcg(c->objcg); -#endif - -#ifdef CONFIG_MEMCG return root_mem_cgroup; #else return NULL; @@ -534,7 +531,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) size += LLIST_NODE_SZ; /* room for llist_node */ unit_size = size; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (memcg_bpf_enabled()) objcg = get_obj_cgroup_from_current(); #endif @@ -556,7 +553,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG objcg = get_obj_cgroup_from_current(); #endif ma->objcg = objcg; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f45ed6adc092..8f716f06c345 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -385,7 +385,7 @@ void bpf_map_free_id(struct bpf_map *map) spin_unlock_irqrestore(&map_idr_lock, flags); } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static void bpf_map_save_memcg(struct bpf_map *map) { /* Currently if a map is created by a process belonging to the root @@ -486,7 +486,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, unsigned long i, j; struct page *pg; int ret = 0; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct mem_cgroup *memcg, *old_memcg; memcg = bpf_map_get_memcg(map); @@ -505,7 +505,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, break; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG set_active_memcg(old_memcg); mem_cgroup_put(memcg); #endif diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 83f8e78827c0..c5cb54fc696d 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -602,7 +602,7 @@ static unsigned long kfence_init_pool(void) continue; __folio_set_slab(slab_folio(slab)); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | MEMCG_DATA_OBJEXTS; #endif @@ -652,7 +652,7 @@ reset_slab: if (!i || (i % 2)) continue; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG slab->obj_exts = 0; #endif __folio_clear_slab(slab_folio(slab)); @@ -1146,7 +1146,7 @@ void __kfence_free(void *addr) { struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG KFENCE_WARN_ON(meta->obj_exts.objcg); #endif /* diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 084f5f36e8e7..db87a05047bd 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -97,7 +97,7 @@ struct kfence_metadata { struct kfence_track free_track; /* For updating alloc_covered on frees. */ u32 alloc_stack_hash; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct slabobj_ext obj_exts; #endif }; diff --git a/mm/list_lru.c b/mm/list_lru.c index 3fd64736bc45..a29d96929d7c 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -15,7 +15,7 @@ #include "slab.h" #include "internal.h" -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); @@ -83,7 +83,7 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) @@ -294,7 +294,7 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, nr_to_walk); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { struct list_lru_memcg *mlru; unsigned long index; @@ -324,7 +324,7 @@ static void init_one_lru(struct list_lru_one *l) l->nr_items = 0; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) { int nid; @@ -544,14 +544,14 @@ static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) static void memcg_destroy_list_lru(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key, struct shrinker *shrinker) { int i; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (shrinker) lru->shrinker_id = shrinker->id; else @@ -591,7 +591,7 @@ void list_lru_destroy(struct list_lru *lru) kfree(lru->node); lru->node = NULL; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG lru->shrinker_id = -1; #endif } diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 7218884bc3c9..6b3e56e88a8a 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -2756,7 +2756,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, return 0; } -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) +#ifdef CONFIG_SLUB_DEBUG static int mem_cgroup_slab_show(struct seq_file *m, void *p) { /* @@ -2863,7 +2863,7 @@ struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) +#ifdef CONFIG_SLUB_DEBUG { .name = "kmem.slabinfo", .seq_show = mem_cgroup_slab_show, @@ -2922,7 +2922,6 @@ struct cftype memsw_files[] = { { }, /* terminate */ }; -#ifdef CONFIG_MEMCG_KMEM void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) { if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { @@ -2932,7 +2931,6 @@ void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) page_counter_uncharge(&memcg->kmem, -nr_pages); } } -#endif /* CONFIG_MEMCG_KMEM */ bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, gfp_t gfp_mask) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a2339396cfcb..9ddce038ddda 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -118,7 +118,6 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) #define CURRENT_OBJCG_UPDATE_BIT 0 #define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT) -#ifdef CONFIG_MEMCG_KMEM static DEFINE_SPINLOCK(objcg_lock); bool mem_cgroup_kmem_disabled(void) @@ -223,7 +222,6 @@ EXPORT_SYMBOL(memcg_kmem_online_key); DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key); EXPORT_SYMBOL(memcg_bpf_enabled_key); -#endif /** * mem_cgroup_css_from_folio - css of the memcg associated with a folio @@ -423,7 +421,7 @@ static const unsigned int memcg_vm_event_stat[] = { PGDEACTIVATE, PGLAZYFREE, PGLAZYFREED, -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP ZSWPIN, ZSWPOUT, ZSWPWB, @@ -1346,7 +1344,7 @@ static const struct memory_stat memory_stats[] = { { "sock", MEMCG_SOCK }, { "vmalloc", MEMCG_VMALLOC }, { "shmem", NR_SHMEM }, -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP { "zswap", MEMCG_ZSWAP_B }, { "zswapped", MEMCG_ZSWAPPED }, #endif @@ -1700,13 +1698,11 @@ struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; -#ifdef CONFIG_MEMCG_KMEM struct obj_cgroup *cached_objcg; struct pglist_data *cached_pgdat; unsigned int nr_bytes; int nr_slab_reclaimable_b; int nr_slab_unreclaimable_b; -#endif struct work_struct work; unsigned long flags; @@ -1717,23 +1713,10 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { }; static DEFINE_MUTEX(percpu_charge_mutex); -#ifdef CONFIG_MEMCG_KMEM static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); -#else -static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) -{ - return NULL; -} -static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, - struct mem_cgroup *root_memcg) -{ - return false; -} -#endif - /** * consume_stock: Try to consume stocked charge on this cpu. * @memcg: memcg to consume from. @@ -2412,8 +2395,6 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) local_irq_enable(); } -#ifdef CONFIG_MEMCG_KMEM - static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) @@ -3069,7 +3050,6 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, obj_cgroup_put(objcg); } } -#endif /* CONFIG_MEMCG_KMEM */ /* * Because folio_memcg(head) is not set on tails, set it now. @@ -3116,7 +3096,6 @@ unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return val; } -#ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { struct obj_cgroup *objcg; @@ -3167,15 +3146,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) */ memcg_reparent_list_lrus(memcg, parent); } -#else -static int memcg_online_kmem(struct mem_cgroup *memcg) -{ - return 0; -} -static void memcg_offline_kmem(struct mem_cgroup *memcg) -{ -} -#endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_CGROUP_WRITEBACK @@ -3590,10 +3560,8 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) vmpressure_init(&memcg->vmpressure); memcg->socket_pressure = jiffies; memcg1_memcg_init(memcg); -#ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; INIT_LIST_HEAD(&memcg->objcg_list); -#endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) @@ -3627,7 +3595,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg1_soft_limit_reset(memcg); -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP memcg->zswap_max = PAGE_COUNTER_MAX; WRITE_ONCE(memcg->zswap_writeback, !parent || READ_ONCE(parent->zswap_writeback)); @@ -3659,10 +3627,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); -#if defined(CONFIG_MEMCG_KMEM) if (!cgroup_memory_nobpf) static_branch_inc(&memcg_bpf_enabled_key); -#endif return &memcg->css; } @@ -3755,10 +3721,8 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg1_tcpmem_active(memcg)) static_branch_dec(&memcg_sockets_enabled_key); -#if defined(CONFIG_MEMCG_KMEM) if (!cgroup_memory_nobpf) static_branch_dec(&memcg_bpf_enabled_key); -#endif vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); @@ -3901,7 +3865,6 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) atomic64_set(&memcg->vmstats->stats_updates, 0); } -#ifdef CONFIG_MEMCG_KMEM static void mem_cgroup_fork(struct task_struct *task) { /* @@ -3929,7 +3892,6 @@ static void mem_cgroup_exit(struct task_struct *task) */ task->objcg = NULL; } -#endif #ifdef CONFIG_LRU_GEN static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) @@ -3953,7 +3915,6 @@ static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {} #endif /* CONFIG_LRU_GEN */ -#ifdef CONFIG_MEMCG_KMEM static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) { struct task_struct *task; @@ -3964,17 +3925,12 @@ static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg); } } -#else -static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) {} -#endif /* CONFIG_MEMCG_KMEM */ -#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM) static void mem_cgroup_attach(struct cgroup_taskset *tset) { mem_cgroup_lru_gen_attach(tset); mem_cgroup_kmem_attach(tset); } -#endif static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { @@ -4421,13 +4377,9 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_free = mem_cgroup_css_free, .css_reset = mem_cgroup_css_reset, .css_rstat_flush = mem_cgroup_css_rstat_flush, -#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM) .attach = mem_cgroup_attach, -#endif -#ifdef CONFIG_MEMCG_KMEM .fork = mem_cgroup_fork, .exit = mem_cgroup_exit, -#endif .dfl_cftypes = memory_files, #ifdef CONFIG_MEMCG_V1 .can_attach = memcg1_can_attach, @@ -5395,7 +5347,7 @@ static struct cftype swap_files[] = { { } /* terminate */ }; -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP /** * obj_cgroup_may_zswap - check if this cgroup can zswap * @objcg: the object cgroup @@ -5577,7 +5529,7 @@ static struct cftype zswap_files[] = { }, { } /* terminate */ }; -#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ +#endif /* CONFIG_ZSWAP */ static int __init mem_cgroup_swap_init(void) { @@ -5588,7 +5540,7 @@ static int __init mem_cgroup_swap_init(void) #ifdef CONFIG_MEMCG_V1 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); #endif -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files)); #endif return 0; diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 7e42f0ca3b7b..4b3d6ec43703 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -33,7 +33,7 @@ struct pcpu_block_md { }; struct pcpuobj_ext { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct obj_cgroup *cgroup; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING @@ -41,7 +41,7 @@ struct pcpuobj_ext { #endif }; -#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MEM_ALLOC_PROFILING) +#if defined(CONFIG_MEMCG) || defined(CONFIG_MEM_ALLOC_PROFILING) #define NEED_PCPUOBJ_EXT #endif @@ -154,7 +154,7 @@ static inline size_t pcpu_obj_full_size(size_t size) { size_t extra_size = 0; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (!mem_cgroup_kmem_disabled()) extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); #endif diff --git a/mm/percpu.c b/mm/percpu.c index 474e3683b74d..20d91af8c033 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1619,7 +1619,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { @@ -1681,7 +1681,7 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) obj_cgroup_put(objcg); } -#else /* CONFIG_MEMCG_KMEM */ +#else /* CONFIG_MEMCG */ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { @@ -1697,7 +1697,7 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ #ifdef CONFIG_MEM_ALLOC_PROFILING static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, diff --git a/mm/slab.h b/mm/slab.h index 5f8f47c5bee0..3586e6183224 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -573,7 +573,7 @@ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, void **p); void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, diff --git a/mm/slab_common.c b/mm/slab_common.c index 1560a1546bb1..60268bb258fc 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -725,7 +725,7 @@ EXPORT_SYMBOL(kmalloc_size_roundup); #define KMALLOC_DMA_NAME(sz) #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG #define KMALLOC_CGROUP_NAME(sz) .name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz, #else #define KMALLOC_CGROUP_NAME(sz) @@ -867,7 +867,7 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type) if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) { flags |= SLAB_RECLAIM_ACCOUNT; - } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) { + } else if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_CGROUP)) { if (mem_cgroup_kmem_disabled()) { kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx]; return; @@ -883,10 +883,10 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type) #endif /* - * If CONFIG_MEMCG_KMEM is enabled, disable cache merging for + * If CONFIG_MEMCG is enabled, disable cache merging for * KMALLOC_NORMAL caches. */ - if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL)) + if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_NORMAL)) flags |= SLAB_NO_MERGE; if (minalign > ARCH_KMALLOC_MINALIGN) { @@ -913,7 +913,7 @@ void __init create_kmalloc_caches(void) enum kmalloc_cache_type type; /* - * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined + * Including KMALLOC_CGROUP if CONFIG_MEMCG defined */ for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) { /* Caches that are NOT of the two-to-the-power-of size. */ diff --git a/mm/slub.c b/mm/slub.c index 177ad7d3288b..cc11f3869cc6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2020,7 +2020,7 @@ static inline bool need_slab_obj_ext(void) return true; /* - * CONFIG_MEMCG_KMEM creates vector of obj_cgroup objects conditionally + * CONFIG_MEMCG creates vector of obj_cgroup objects conditionally * inside memcg_slab_post_alloc_hook. No other users for now. */ return false; @@ -2104,7 +2104,7 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, #endif /* CONFIG_SLAB_OBJ_EXT */ -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static void memcg_alloc_abort_single(struct kmem_cache *s, void *object); @@ -2146,7 +2146,7 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, __memcg_slab_free_hook(s, slab, p, objects, obj_exts); } -#else /* CONFIG_MEMCG_KMEM */ +#else /* CONFIG_MEMCG */ static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, @@ -2159,7 +2159,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ /* * Hooks for other subsystems that check memory allocations. In a typical @@ -4456,7 +4456,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, do_slab_free(s, slab, object, object, 1, addr); } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG /* Do not inline the rare memcg charging failed path into the allocation path */ static noinline void memcg_alloc_abort_single(struct kmem_cache *s, void *object) diff --git a/tools/testing/selftests/cgroup/config b/tools/testing/selftests/cgroup/config index 97d549ee894f..39f979690dd3 100644 --- a/tools/testing/selftests/cgroup/config +++ b/tools/testing/selftests/cgroup/config @@ -3,5 +3,4 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_MEMCG=y -CONFIG_MEMCG_KMEM=y CONFIG_PAGE_COUNTER=y -- cgit v1.2.3 From 8d42e2a91dcf86b34461cd7f709797805afa9f43 Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Sun, 23 Jun 2024 23:36:17 -0700 Subject: selftests/udmabuf: add tests to verify data after page migration Since the memfd pages associated with a udmabuf may be migrated as part of udmabuf create, we need to verify the data coherency after successful migration. The new tests added in this patch try to do just that using 4k sized pages and also 2 MB sized huge pages for the memfd. Successful completion of the tests would mean that there is no disconnect between the memfd pages and the ones associated with a udmabuf. And, these tests can also be augmented in the future to test newer udmabuf features (such as handling memfd hole punch). The idea for these tests comes from a patch by Mike Kravetz here: https://lists.freedesktop.org/archives/dri-devel/2023-June/410623.html v1->v2: (suggestions from Shuah) - Use ksft_* functions to print and capture results of tests - Use appropriate KSFT_* status codes for exit() - Add Mike Kravetz's suggested-by tag Link: https://lkml.kernel.org/r/20240624063952.1572359-10-vivek.kasireddy@intel.com Signed-off-by: Vivek Kasireddy Suggested-by: Mike Kravetz Acked-by: Dave Airlie Acked-by: Gerd Hoffmann Cc: Shuah Khan Cc: David Hildenbrand Cc: Daniel Vetter Cc: Hugh Dickins Cc: Peter Xu Cc: Jason Gunthorpe Cc: Dongwon Kim Cc: Junxiao Chang Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Signed-off-by: Andrew Morton --- tools/testing/selftests/drivers/dma-buf/udmabuf.c | 214 ++++++++++++++++++---- 1 file changed, 183 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/dma-buf/udmabuf.c b/tools/testing/selftests/drivers/dma-buf/udmabuf.c index c812080e304e..6062723a172e 100644 --- a/tools/testing/selftests/drivers/dma-buf/udmabuf.c +++ b/tools/testing/selftests/drivers/dma-buf/udmabuf.c @@ -9,52 +9,162 @@ #include #include #include +#include #include #include +#include #include #include +#include "../../kselftest.h" #define TEST_PREFIX "drivers/dma-buf/udmabuf" #define NUM_PAGES 4 +#define NUM_ENTRIES 4 +#define MEMFD_SIZE 1024 /* in pages */ -static int memfd_create(const char *name, unsigned int flags) +static unsigned int page_size; + +static int create_memfd_with_seals(off64_t size, bool hpage) +{ + int memfd, ret; + unsigned int flags = MFD_ALLOW_SEALING; + + if (hpage) + flags |= MFD_HUGETLB; + + memfd = memfd_create("udmabuf-test", flags); + if (memfd < 0) { + ksft_print_msg("%s: [skip,no-memfd]\n", TEST_PREFIX); + exit(KSFT_SKIP); + } + + ret = fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK); + if (ret < 0) { + ksft_print_msg("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); + exit(KSFT_SKIP); + } + + ret = ftruncate(memfd, size); + if (ret == -1) { + ksft_print_msg("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); + exit(KSFT_FAIL); + } + + return memfd; +} + +static int create_udmabuf_list(int devfd, int memfd, off64_t memfd_size) +{ + struct udmabuf_create_list *list; + int ubuf_fd, i; + + list = malloc(sizeof(struct udmabuf_create_list) + + sizeof(struct udmabuf_create_item) * NUM_ENTRIES); + if (!list) { + ksft_print_msg("%s: [FAIL, udmabuf-malloc]\n", TEST_PREFIX); + exit(KSFT_FAIL); + } + + for (i = 0; i < NUM_ENTRIES; i++) { + list->list[i].memfd = memfd; + list->list[i].offset = i * (memfd_size / NUM_ENTRIES); + list->list[i].size = getpagesize() * NUM_PAGES; + } + + list->count = NUM_ENTRIES; + list->flags = UDMABUF_FLAGS_CLOEXEC; + ubuf_fd = ioctl(devfd, UDMABUF_CREATE_LIST, list); + free(list); + if (ubuf_fd < 0) { + ksft_print_msg("%s: [FAIL, udmabuf-create]\n", TEST_PREFIX); + exit(KSFT_FAIL); + } + + return ubuf_fd; +} + +static void write_to_memfd(void *addr, off64_t size, char chr) +{ + int i; + + for (i = 0; i < size / page_size; i++) { + *((char *)addr + (i * page_size)) = chr; + } +} + +static void *mmap_fd(int fd, off64_t size) +{ + void *addr; + + addr = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + ksft_print_msg("%s: ubuf_fd mmap fail\n", TEST_PREFIX); + exit(KSFT_FAIL); + } + + return addr; +} + +static int compare_chunks(void *addr1, void *addr2, off64_t memfd_size) { - return syscall(__NR_memfd_create, name, flags); + off64_t off; + int i = 0, j, k = 0, ret = 0; + char char1, char2; + + while (i < NUM_ENTRIES) { + off = i * (memfd_size / NUM_ENTRIES); + for (j = 0; j < NUM_PAGES; j++, k++) { + char1 = *((char *)addr1 + off + (j * getpagesize())); + char2 = *((char *)addr2 + (k * getpagesize())); + if (char1 != char2) { + ret = -1; + goto err; + } + } + i++; + } +err: + munmap(addr1, memfd_size); + munmap(addr2, NUM_ENTRIES * NUM_PAGES * getpagesize()); + return ret; } int main(int argc, char *argv[]) { struct udmabuf_create create; int devfd, memfd, buf, ret; - off_t size; - void *mem; + off64_t size; + void *addr1, *addr2; + + ksft_print_header(); + ksft_set_plan(6); devfd = open("/dev/udmabuf", O_RDWR); if (devfd < 0) { - printf("%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n", - TEST_PREFIX); - exit(77); + ksft_print_msg( + "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n", + TEST_PREFIX); + exit(KSFT_SKIP); } memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); if (memfd < 0) { - printf("%s: [skip,no-memfd]\n", TEST_PREFIX); - exit(77); + ksft_print_msg("%s: [skip,no-memfd]\n", TEST_PREFIX); + exit(KSFT_SKIP); } ret = fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK); if (ret < 0) { - printf("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); - exit(77); + ksft_print_msg("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); + exit(KSFT_SKIP); } - size = getpagesize() * NUM_PAGES; ret = ftruncate(memfd, size); if (ret == -1) { - printf("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); - exit(1); + ksft_print_msg("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); + exit(KSFT_FAIL); } memset(&create, 0, sizeof(create)); @@ -64,44 +174,86 @@ int main(int argc, char *argv[]) create.offset = getpagesize()/2; create.size = getpagesize(); buf = ioctl(devfd, UDMABUF_CREATE, &create); - if (buf >= 0) { - printf("%s: [FAIL,test-1]\n", TEST_PREFIX); - exit(1); - } + if (buf >= 0) + ksft_test_result_fail("%s: [FAIL,test-1]\n", TEST_PREFIX); + else + ksft_test_result_pass("%s: [PASS,test-1]\n", TEST_PREFIX); /* should fail (size not multiple of page) */ create.memfd = memfd; create.offset = 0; create.size = getpagesize()/2; buf = ioctl(devfd, UDMABUF_CREATE, &create); - if (buf >= 0) { - printf("%s: [FAIL,test-2]\n", TEST_PREFIX); - exit(1); - } + if (buf >= 0) + ksft_test_result_fail("%s: [FAIL,test-2]\n", TEST_PREFIX); + else + ksft_test_result_pass("%s: [PASS,test-2]\n", TEST_PREFIX); /* should fail (not memfd) */ create.memfd = 0; /* stdin */ create.offset = 0; create.size = size; buf = ioctl(devfd, UDMABUF_CREATE, &create); - if (buf >= 0) { - printf("%s: [FAIL,test-3]\n", TEST_PREFIX); - exit(1); - } + if (buf >= 0) + ksft_test_result_fail("%s: [FAIL,test-3]\n", TEST_PREFIX); + else + ksft_test_result_pass("%s: [PASS,test-3]\n", TEST_PREFIX); /* should work */ + page_size = getpagesize(); + addr1 = mmap_fd(memfd, size); + write_to_memfd(addr1, size, 'a'); create.memfd = memfd; create.offset = 0; create.size = size; buf = ioctl(devfd, UDMABUF_CREATE, &create); - if (buf < 0) { - printf("%s: [FAIL,test-4]\n", TEST_PREFIX); - exit(1); - } + if (buf < 0) + ksft_test_result_fail("%s: [FAIL,test-4]\n", TEST_PREFIX); + else + ksft_test_result_pass("%s: [PASS,test-4]\n", TEST_PREFIX); + + munmap(addr1, size); + close(buf); + close(memfd); + + /* should work (migration of 4k size pages)*/ + size = MEMFD_SIZE * page_size; + memfd = create_memfd_with_seals(size, false); + addr1 = mmap_fd(memfd, size); + write_to_memfd(addr1, size, 'a'); + buf = create_udmabuf_list(devfd, memfd, size); + addr2 = mmap_fd(buf, NUM_PAGES * NUM_ENTRIES * getpagesize()); + write_to_memfd(addr1, size, 'b'); + ret = compare_chunks(addr1, addr2, size); + if (ret < 0) + ksft_test_result_fail("%s: [FAIL,test-5]\n", TEST_PREFIX); + else + ksft_test_result_pass("%s: [PASS,test-5]\n", TEST_PREFIX); + + close(buf); + close(memfd); + + /* should work (migration of 2MB size huge pages)*/ + page_size = getpagesize() * 512; /* 2 MB */ + size = MEMFD_SIZE * page_size; + memfd = create_memfd_with_seals(size, true); + addr1 = mmap_fd(memfd, size); + write_to_memfd(addr1, size, 'a'); + buf = create_udmabuf_list(devfd, memfd, size); + addr2 = mmap_fd(buf, NUM_PAGES * NUM_ENTRIES * getpagesize()); + write_to_memfd(addr1, size, 'b'); + ret = compare_chunks(addr1, addr2, size); + if (ret < 0) + ksft_test_result_fail("%s: [FAIL,test-6]\n", TEST_PREFIX); + else + ksft_test_result_pass("%s: [PASS,test-6]\n", TEST_PREFIX); - fprintf(stderr, "%s: ok\n", TEST_PREFIX); close(buf); close(memfd); close(devfd); + + ksft_print_msg("%s: ok\n", TEST_PREFIX); + ksft_print_cnts(); + return 0; } -- cgit v1.2.3 From 77179b6f30811bf0bd2f62fcdf235997123e70dc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 27 Jun 2024 10:08:57 -0700 Subject: tools: sync uapi/linux/fs.h header into tools subdir We need this UAPI header in tools/include subdirectory for using it from BPF selftests. Link: https://lkml.kernel.org/r/20240627170900.1672542-6-andrii@kernel.org Signed-off-by: Andrii Nakryiko Acked-by: Liam R. Howlett Cc: Alexey Dobriyan Cc: Al Viro Cc: Christian Brauner Cc: Greg Kroah-Hartman Cc: Mike Rapoport (IBM) Cc: Suren Baghdasaryan Cc: Andi Kleen Cc: Arnd Bergmann Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- tools/include/uapi/linux/fs.h | 180 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 170 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index cc3fea99fd43..8a27bc5c7a7f 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _LINUX_FS_H -#define _LINUX_FS_H +#ifndef _UAPI_LINUX_FS_H +#define _UAPI_LINUX_FS_H /* * This file has definitions for some important file table structures @@ -13,10 +13,14 @@ #include #include #include +#ifndef __KERNEL__ #include +#endif /* Use of MS_* flags within the kernel is restricted to core mount(2) code. */ +#if !defined(__KERNEL__) #include +#endif /* * It's silly to have NR_OPEN bigger than NR_FILE, but you can change @@ -308,29 +312,31 @@ struct fsxattr { typedef int __bitwise __kernel_rwf_t; /* high priority request, poll if possible */ -#define RWF_HIPRI ((__kernel_rwf_t)0x00000001) +#define RWF_HIPRI ((__force __kernel_rwf_t)0x00000001) /* per-IO O_DSYNC */ -#define RWF_DSYNC ((__kernel_rwf_t)0x00000002) +#define RWF_DSYNC ((__force __kernel_rwf_t)0x00000002) /* per-IO O_SYNC */ -#define RWF_SYNC ((__kernel_rwf_t)0x00000004) +#define RWF_SYNC ((__force __kernel_rwf_t)0x00000004) /* per-IO, return -EAGAIN if operation would block */ -#define RWF_NOWAIT ((__kernel_rwf_t)0x00000008) +#define RWF_NOWAIT ((__force __kernel_rwf_t)0x00000008) /* per-IO O_APPEND */ -#define RWF_APPEND ((__kernel_rwf_t)0x00000010) +#define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) /* per-IO negation of O_APPEND */ -#define RWF_NOAPPEND ((__kernel_rwf_t)0x00000020) +#define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND | RWF_NOAPPEND) +#define PROCFS_IOCTL_MAGIC 'f' + /* Pagemap ioctl */ -#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) +#define PAGEMAP_SCAN _IOWR(PROCFS_IOCTL_MAGIC, 16, struct pm_scan_arg) /* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */ #define PAGE_IS_WPALLOWED (1 << 0) @@ -389,4 +395,158 @@ struct pm_scan_arg { __u64 return_mask; }; -#endif /* _LINUX_FS_H */ +/* /proc//maps ioctl */ +#define PROCMAP_QUERY _IOWR(PROCFS_IOCTL_MAGIC, 17, struct procmap_query) + +enum procmap_query_flags { + /* + * VMA permission flags. + * + * Can be used as part of procmap_query.query_flags field to look up + * only VMAs satisfying specified subset of permissions. E.g., specifying + * PROCMAP_QUERY_VMA_READABLE only will return both readable and read/write VMAs, + * while having PROCMAP_QUERY_VMA_READABLE | PROCMAP_QUERY_VMA_WRITABLE will only + * return read/write VMAs, though both executable/non-executable and + * private/shared will be ignored. + * + * PROCMAP_QUERY_VMA_* flags are also returned in procmap_query.vma_flags + * field to specify actual VMA permissions. + */ + PROCMAP_QUERY_VMA_READABLE = 0x01, + PROCMAP_QUERY_VMA_WRITABLE = 0x02, + PROCMAP_QUERY_VMA_EXECUTABLE = 0x04, + PROCMAP_QUERY_VMA_SHARED = 0x08, + /* + * Query modifier flags. + * + * By default VMA that covers provided address is returned, or -ENOENT + * is returned. With PROCMAP_QUERY_COVERING_OR_NEXT_VMA flag set, closest + * VMA with vma_start > addr will be returned if no covering VMA is + * found. + * + * PROCMAP_QUERY_FILE_BACKED_VMA instructs query to consider only VMAs that + * have file backing. Can be combined with PROCMAP_QUERY_COVERING_OR_NEXT_VMA + * to iterate all VMAs with file backing. + */ + PROCMAP_QUERY_COVERING_OR_NEXT_VMA = 0x10, + PROCMAP_QUERY_FILE_BACKED_VMA = 0x20, +}; + +/* + * Input/output argument structured passed into ioctl() call. It can be used + * to query a set of VMAs (Virtual Memory Areas) of a process. + * + * Each field can be one of three kinds, marked in a short comment to the + * right of the field: + * - "in", input argument, user has to provide this value, kernel doesn't modify it; + * - "out", output argument, kernel sets this field with VMA data; + * - "in/out", input and output argument; user provides initial value (used + * to specify maximum allowable buffer size), and kernel sets it to actual + * amount of data written (or zero, if there is no data). + * + * If matching VMA is found (according to criterias specified by + * query_addr/query_flags, all the out fields are filled out, and ioctl() + * returns 0. If there is no matching VMA, -ENOENT will be returned. + * In case of any other error, negative error code other than -ENOENT is + * returned. + * + * Most of the data is similar to the one returned as text in /proc//maps + * file, but procmap_query provides more querying flexibility. There are no + * consistency guarantees between subsequent ioctl() calls, but data returned + * for matched VMA is self-consistent. + */ +struct procmap_query { + /* Query struct size, for backwards/forward compatibility */ + __u64 size; + /* + * Query flags, a combination of enum procmap_query_flags values. + * Defines query filtering and behavior, see enum procmap_query_flags. + * + * Input argument, provided by user. Kernel doesn't modify it. + */ + __u64 query_flags; /* in */ + /* + * Query address. By default, VMA that covers this address will + * be looked up. PROCMAP_QUERY_* flags above modify this default + * behavior further. + * + * Input argument, provided by user. Kernel doesn't modify it. + */ + __u64 query_addr; /* in */ + /* VMA starting (inclusive) and ending (exclusive) address, if VMA is found. */ + __u64 vma_start; /* out */ + __u64 vma_end; /* out */ + /* VMA permissions flags. A combination of PROCMAP_QUERY_VMA_* flags. */ + __u64 vma_flags; /* out */ + /* VMA backing page size granularity. */ + __u64 vma_page_size; /* out */ + /* + * VMA file offset. If VMA has file backing, this specifies offset + * within the file that VMA's start address corresponds to. + * Is set to zero if VMA has no backing file. + */ + __u64 vma_offset; /* out */ + /* Backing file's inode number, or zero, if VMA has no backing file. */ + __u64 inode; /* out */ + /* Backing file's device major/minor number, or zero, if VMA has no backing file. */ + __u32 dev_major; /* out */ + __u32 dev_minor; /* out */ + /* + * If set to non-zero value, signals the request to return VMA name + * (i.e., VMA's backing file's absolute path, with " (deleted)" suffix + * appended, if file was unlinked from FS) for matched VMA. VMA name + * can also be some special name (e.g., "[heap]", "[stack]") or could + * be even user-supplied with prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME). + * + * Kernel will set this field to zero, if VMA has no associated name. + * Otherwise kernel will return actual amount of bytes filled in + * user-supplied buffer (see vma_name_addr field below), including the + * terminating zero. + * + * If VMA name is longer that user-supplied maximum buffer size, + * -E2BIG error is returned. + * + * If this field is set to non-zero value, vma_name_addr should point + * to valid user space memory buffer of at least vma_name_size bytes. + * If set to zero, vma_name_addr should be set to zero as well + */ + __u32 vma_name_size; /* in/out */ + /* + * If set to non-zero value, signals the request to extract and return + * VMA's backing file's build ID, if the backing file is an ELF file + * and it contains embedded build ID. + * + * Kernel will set this field to zero, if VMA has no backing file, + * backing file is not an ELF file, or ELF file has no build ID + * embedded. + * + * Build ID is a binary value (not a string). Kernel will set + * build_id_size field to exact number of bytes used for build ID. + * If build ID is requested and present, but needs more bytes than + * user-supplied maximum buffer size (see build_id_addr field below), + * -E2BIG error will be returned. + * + * If this field is set to non-zero value, build_id_addr should point + * to valid user space memory buffer of at least build_id_size bytes. + * If set to zero, build_id_addr should be set to zero as well + */ + __u32 build_id_size; /* in/out */ + /* + * User-supplied address of a buffer of at least vma_name_size bytes + * for kernel to fill with matched VMA's name (see vma_name_size field + * description above for details). + * + * Should be set to zero if VMA name should not be returned. + */ + __u64 vma_name_addr; /* in */ + /* + * User-supplied address of a buffer of at least build_id_size bytes + * for kernel to fill with matched VMA's ELF build ID, if available + * (see build_id_size field description above for details). + * + * Should be set to zero if build ID should not be returned. + */ + __u64 build_id_addr; /* in */ +}; + +#endif /* _UAPI_LINUX_FS_H */ -- cgit v1.2.3 From 81510a0eaa6916c2fbb0b2639f3e617a296979a3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 27 Jun 2024 10:08:58 -0700 Subject: selftests/proc: add PROCMAP_QUERY ioctl tests Extend existing proc-pid-vm.c tests with PROCMAP_QUERY ioctl() API. Test a few successful and negative cases, validating querying filtering and exact vs next VMA logic works as expected. Link: https://lkml.kernel.org/r/20240627170900.1672542-7-andrii@kernel.org Signed-off-by: Andrii Nakryiko Acked-by: Liam R. Howlett Cc: Alexey Dobriyan Cc: Al Viro Cc: Christian Brauner Cc: Greg Kroah-Hartman Cc: Mike Rapoport (IBM) Cc: Suren Baghdasaryan Cc: Andi Kleen Cc: Arnd Bergmann Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/Makefile | 1 + tools/testing/selftests/proc/proc-pid-vm.c | 86 ++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 25c34cc9238e..4d57f2a87510 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O2 -Wno-unused-function +CFLAGS += $(TOOLS_INCLUDES) LDFLAGS += -pthread TEST_GEN_PROGS := diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c index cacbd2a4aec9..d04685771952 100644 --- a/tools/testing/selftests/proc/proc-pid-vm.c +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "../kselftest.h" @@ -492,6 +493,91 @@ int main(void) assert(buf[13] == '\n'); } + /* Test PROCMAP_QUERY ioctl() for /proc/$PID/maps */ + { + char path_buf[256], exp_path_buf[256]; + struct procmap_query q; + int fd, err; + + snprintf(path_buf, sizeof(path_buf), "/proc/%u/maps", pid); + fd = open(path_buf, O_RDONLY); + if (fd == -1) + return 1; + + /* CASE 1: exact MATCH at VADDR */ + memset(&q, 0, sizeof(q)); + q.size = sizeof(q); + q.query_addr = VADDR; + q.query_flags = 0; + q.vma_name_addr = (__u64)(unsigned long)path_buf; + q.vma_name_size = sizeof(path_buf); + + err = ioctl(fd, PROCMAP_QUERY, &q); + assert(err == 0); + + assert(q.query_addr == VADDR); + assert(q.query_flags == 0); + + assert(q.vma_flags == (PROCMAP_QUERY_VMA_READABLE | PROCMAP_QUERY_VMA_EXECUTABLE)); + assert(q.vma_start == VADDR); + assert(q.vma_end == VADDR + PAGE_SIZE); + assert(q.vma_page_size == PAGE_SIZE); + + assert(q.vma_offset == 0); + assert(q.inode == st.st_ino); + assert(q.dev_major == MAJOR(st.st_dev)); + assert(q.dev_minor == MINOR(st.st_dev)); + + snprintf(exp_path_buf, sizeof(exp_path_buf), + "/tmp/#%llu (deleted)", (unsigned long long)st.st_ino); + assert(q.vma_name_size == strlen(exp_path_buf) + 1); + assert(strcmp(path_buf, exp_path_buf) == 0); + + /* CASE 2: NO MATCH at VADDR-1 */ + memset(&q, 0, sizeof(q)); + q.size = sizeof(q); + q.query_addr = VADDR - 1; + q.query_flags = 0; /* exact match */ + + err = ioctl(fd, PROCMAP_QUERY, &q); + err = err < 0 ? -errno : 0; + assert(err == -ENOENT); + + /* CASE 3: MATCH COVERING_OR_NEXT_VMA at VADDR - 1 */ + memset(&q, 0, sizeof(q)); + q.size = sizeof(q); + q.query_addr = VADDR - 1; + q.query_flags = PROCMAP_QUERY_COVERING_OR_NEXT_VMA; + + err = ioctl(fd, PROCMAP_QUERY, &q); + assert(err == 0); + + assert(q.query_addr == VADDR - 1); + assert(q.query_flags == PROCMAP_QUERY_COVERING_OR_NEXT_VMA); + assert(q.vma_start == VADDR); + assert(q.vma_end == VADDR + PAGE_SIZE); + + /* CASE 4: NO MATCH at VADDR + PAGE_SIZE */ + memset(&q, 0, sizeof(q)); + q.size = sizeof(q); + q.query_addr = VADDR + PAGE_SIZE; /* point right after the VMA */ + q.query_flags = PROCMAP_QUERY_COVERING_OR_NEXT_VMA; + + err = ioctl(fd, PROCMAP_QUERY, &q); + err = err < 0 ? -errno : 0; + assert(err == -ENOENT); + + /* CASE 5: NO MATCH WRITABLE at VADDR */ + memset(&q, 0, sizeof(q)); + q.size = sizeof(q); + q.query_addr = VADDR; + q.query_flags = PROCMAP_QUERY_VMA_WRITABLE; + + err = ioctl(fd, PROCMAP_QUERY, &q); + err = err < 0 ? -errno : 0; + assert(err == -ENOENT); + } + return 0; } #else -- cgit v1.2.3