From d48ad080ec0101c2cca92926bed64993ab565c3d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 7 Jun 2017 22:47:13 -0400 Subject: rhashtable: use get_random_u32 for hash_rnd This is much faster and just as secure. It also has the added benefit of probably returning better randomness at early-boot on systems with architectural RNGs. Signed-off-by: Jason A. Donenfeld Cc: Thomas Graf Cc: Herbert Xu Signed-off-by: Theodore Ts'o --- lib/rhashtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index d9e7274a04cd..a1eb7c947f46 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -235,7 +235,7 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, INIT_LIST_HEAD(&tbl->walkers); - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + tbl->hash_rnd = get_random_u32(); for (i = 0; i < nbuckets; i++) INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); -- cgit v1.2.3 From d06bfd1989fe97623b32d6df4ffa6e4338c99dc8 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 7 Jun 2017 23:06:55 -0400 Subject: random: warn when kernel uses unseeded randomness This enables an important dmesg notification about when drivers have used the crng without it being seeded first. Prior, these errors would occur silently, and so there hasn't been a great way of diagnosing these types of bugs for obscure setups. By adding this as a config option, we can leave it on by default, so that we learn where these issues happen, in the field, will still allowing some people to turn it off, if they really know what they're doing and do not want the log entries. However, we don't leave it _completely_ by default. An earlier version of this patch simply had `default y`. I'd really love that, but it turns out, this problem with unseeded randomness being used is really quite present and is going to take a long time to fix. Thus, as a compromise between log-messages-for-all and nobody-knows, this is `default y`, except it is also `depends on DEBUG_KERNEL`. This will ensure that the curious see the messages while others don't have to. Signed-off-by: Jason A. Donenfeld Signed-off-by: Theodore Ts'o --- drivers/char/random.c | 15 +++++++++++++-- lib/Kconfig.debug | 16 ++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/drivers/char/random.c b/drivers/char/random.c index 3853dd4f92e7..fa5bbd5a7ca0 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -288,7 +288,6 @@ #define SEC_XFER_SIZE 512 #define EXTRACT_SIZE 10 -#define DEBUG_RANDOM_BOOT 0 #define LONGS(x) (((x) + sizeof(unsigned long) - 1)/sizeof(unsigned long)) @@ -1481,7 +1480,7 @@ void get_random_bytes(void *buf, int nbytes) { __u8 tmp[CHACHA20_BLOCK_SIZE]; -#if DEBUG_RANDOM_BOOT > 0 +#ifdef CONFIG_WARN_UNSEEDED_RANDOM if (!crng_ready()) printk(KERN_NOTICE "random: %pF get_random_bytes called " "with crng_init = %d\n", (void *) _RET_IP_, crng_init); @@ -2075,6 +2074,12 @@ u64 get_random_u64(void) return ret; #endif +#ifdef CONFIG_WARN_UNSEEDED_RANDOM + if (!crng_ready()) + printk(KERN_NOTICE "random: %pF get_random_u64 called " + "with crng_init = %d\n", (void *) _RET_IP_, crng_init); +#endif + batch = &get_cpu_var(batched_entropy_u64); if (use_lock) read_lock_irqsave(&batched_entropy_reset_lock, flags); @@ -2101,6 +2106,12 @@ u32 get_random_u32(void) if (arch_get_random_int(&ret)) return ret; +#ifdef CONFIG_WARN_UNSEEDED_RANDOM + if (!crng_ready()) + printk(KERN_NOTICE "random: %pF get_random_u32 called " + "with crng_init = %d\n", (void *) _RET_IP_, crng_init); +#endif + batch = &get_cpu_var(batched_entropy_u32); if (use_lock) read_lock_irqsave(&batched_entropy_reset_lock, flags); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e4587ebe52c7..c4159605bfbf 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1209,6 +1209,22 @@ config STACKTRACE It is also used by various kernel debugging features that require stack trace generation. +config WARN_UNSEEDED_RANDOM + bool "Warn when kernel uses unseeded randomness" + default y + depends on DEBUG_KERNEL + help + Some parts of the kernel contain bugs relating to their use of + cryptographically secure random numbers before it's actually possible + to generate those numbers securely. This setting ensures that these + flaws don't go unnoticed, by enabling a message, should this ever + occur. This will allow people with obscure setups to know when things + are going wrong, so that they might contact developers about fixing + it. + + Say Y here, unless you simply do not care about using unseeded + randomness and do not want a potential warning message in your logs. + config DEBUG_KOBJECT bool "kobject debugging" depends on DEBUG_KERNEL -- cgit v1.2.3 From 3cc78125a081bb79eb38f3e9a585e5a81bb81cb1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 10 Jul 2017 15:51:26 -0700 Subject: lib/test_bitmap.c: add optimisation tests Patch series "Bitmap optimisations", v2. These three bitmap patches use more efficient specialisations when the compiler can figure out that it's safe to do so. Thanks to Rasmus's eagle eyes, a nasty bug in v1 was avoided, and I've added a test case which would have caught it. This patch (of 4): This version of the test is actually a no-op; the next patch will enable it. Link: http://lkml.kernel.org/r/20170628153221.11322-2-willy@infradead.org Signed-off-by: Matthew Wilcox Cc: Rasmus Villemoes Cc: Martin Schwidefsky Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_bitmap.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'lib') diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index e2cbd43d193c..252d3bddbe7d 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -333,10 +333,42 @@ static void __init test_bitmap_u32_array_conversions(void) } } +#define __bitmap_set(a, b, c) bitmap_set(a, b, c) +#define __bitmap_clear(a, b, c) bitmap_clear(a, b, c) + +static void noinline __init test_mem_optimisations(void) +{ + DECLARE_BITMAP(bmap1, 1024); + DECLARE_BITMAP(bmap2, 1024); + unsigned int start, nbits; + + for (start = 0; start < 1024; start += 8) { + memset(bmap1, 0x5a, sizeof(bmap1)); + memset(bmap2, 0x5a, sizeof(bmap2)); + for (nbits = 0; nbits < 1024 - start; nbits += 8) { + bitmap_set(bmap1, start, nbits); + __bitmap_set(bmap2, start, nbits); + if (!bitmap_equal(bmap1, bmap2, 1024)) + printk("set not equal %d %d\n", start, nbits); + if (!__bitmap_equal(bmap1, bmap2, 1024)) + printk("set not __equal %d %d\n", start, nbits); + + bitmap_clear(bmap1, start, nbits); + __bitmap_clear(bmap2, start, nbits); + if (!bitmap_equal(bmap1, bmap2, 1024)) + printk("clear not equal %d %d\n", start, nbits); + if (!__bitmap_equal(bmap1, bmap2, 1024)) + printk("clear not __equal %d %d\n", start, + nbits); + } + } +} + static int __init test_bitmap_init(void) { test_zero_fill_copy(); test_bitmap_u32_array_conversions(); + test_mem_optimisations(); if (failed_tests == 0) pr_info("all %u tests passed\n", total_tests); -- cgit v1.2.3 From e5af323c9badd5dc09af7ccf9d45616ebffc623c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 10 Jul 2017 15:51:29 -0700 Subject: bitmap: optimise bitmap_set and bitmap_clear of a single bit We have eight users calling bitmap_clear for a single bit and seventeen calling bitmap_set for a single bit. Rather than fix all of them to call __clear_bit or __set_bit, turn bitmap_clear and bitmap_set into inline functions and make this special case efficient. Link: http://lkml.kernel.org/r/20170628153221.11322-3-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Rasmus Villemoes Cc: Martin Schwidefsky Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 23 ++++++++++++++++++++--- lib/bitmap.c | 8 ++++---- lib/test_bitmap.c | 3 --- 3 files changed, 24 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 3b77588a9360..4e0f0c8167af 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -112,9 +112,8 @@ extern int __bitmap_intersects(const unsigned long *bitmap1, extern int __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); - -extern void bitmap_set(unsigned long *map, unsigned int start, int len); -extern void bitmap_clear(unsigned long *map, unsigned int start, int len); +extern void __bitmap_set(unsigned long *map, unsigned int start, int len); +extern void __bitmap_clear(unsigned long *map, unsigned int start, int len); extern unsigned long bitmap_find_next_zero_area_off(unsigned long *map, unsigned long size, @@ -315,6 +314,24 @@ static __always_inline int bitmap_weight(const unsigned long *src, unsigned int return __bitmap_weight(src, nbits); } +static __always_inline void bitmap_set(unsigned long *map, unsigned int start, + unsigned int nbits) +{ + if (__builtin_constant_p(nbits) && nbits == 1) + __set_bit(start, map); + else + __bitmap_set(map, start, nbits); +} + +static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, + unsigned int nbits) +{ + if (__builtin_constant_p(nbits) && nbits == 1) + __clear_bit(start, map); + else + __bitmap_clear(map, start, nbits); +} + static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, int nbits) { diff --git a/lib/bitmap.c b/lib/bitmap.c index 08c6ef3a2b6f..9a532805364b 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -251,7 +251,7 @@ int __bitmap_weight(const unsigned long *bitmap, unsigned int bits) } EXPORT_SYMBOL(__bitmap_weight); -void bitmap_set(unsigned long *map, unsigned int start, int len) +void __bitmap_set(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len; @@ -270,9 +270,9 @@ void bitmap_set(unsigned long *map, unsigned int start, int len) *p |= mask_to_set; } } -EXPORT_SYMBOL(bitmap_set); +EXPORT_SYMBOL(__bitmap_set); -void bitmap_clear(unsigned long *map, unsigned int start, int len) +void __bitmap_clear(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len; @@ -291,7 +291,7 @@ void bitmap_clear(unsigned long *map, unsigned int start, int len) *p &= ~mask_to_clear; } } -EXPORT_SYMBOL(bitmap_clear); +EXPORT_SYMBOL(__bitmap_clear); /** * bitmap_find_next_zero_area_off - find a contiguous aligned zero area diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 252d3bddbe7d..2526a2975c51 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -333,9 +333,6 @@ static void __init test_bitmap_u32_array_conversions(void) } } -#define __bitmap_set(a, b, c) bitmap_set(a, b, c) -#define __bitmap_clear(a, b, c) bitmap_clear(a, b, c) - static void noinline __init test_mem_optimisations(void) { DECLARE_BITMAP(bmap1, 1024); -- cgit v1.2.3 From 512750ef8b06290a55d749239f956f9c21d7daca Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 10 Jul 2017 15:51:38 -0700 Subject: lib/kstrtox.c: delete end-of-string test Standard "while (*s)" test is unnecessary because NUL won't pass valid-digit test anyway. Save one branch per parsed character. Link: http://lkml.kernel.org/r/20170514193756.GA32563@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/kstrtox.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kstrtox.c b/lib/kstrtox.c index bf85e05ce858..90013f4841c7 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -51,7 +51,7 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long res = 0; rv = 0; - while (*s) { + while (1) { unsigned int val; if ('0' <= *s && *s <= '9') -- cgit v1.2.3 From be5f3c7774a158c5bd08de22d54b0612f954dfa8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 10 Jul 2017 15:51:41 -0700 Subject: lib/kstrtox.c: use "unsigned int" more gcc does generates stupid code sign extending data back and forth. Help by using "unsigned int". add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-61 (-61) function old new delta _parse_integer 128 123 -5 It _still_ does generate useless MOVSX but I don't know how to delete it: 0000000000000070 <_parse_integer>: ... a0: 89 c2 mov edx,eax a2: 83 e8 30 sub eax,0x30 a5: 83 f8 09 cmp eax,0x9 a8: 76 11 jbe bb <_parse_integer+0x4b> aa: 83 ca 20 or edx,0x20 ad: 0f be c2 ===> movsx eax,dl <=== useless b0: 8d 50 9f lea edx,[rax-0x61] b3: 83 fa 05 cmp edx,0x5 Patch also helps on embedded archs which generally only like "int". On arm "and 0xff" is generated which is waste because all values used in comparisons are positive. Link: http://lkml.kernel.org/r/20170514194720.GB32563@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/kstrtox.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/kstrtox.c b/lib/kstrtox.c index 90013f4841c7..720144075c1e 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -52,12 +52,14 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long res = 0; rv = 0; while (1) { + unsigned int c = *s; + unsigned int lc = c | 0x20; /* don't tolower() this line */ unsigned int val; - if ('0' <= *s && *s <= '9') - val = *s - '0'; - else if ('a' <= _tolower(*s) && _tolower(*s) <= 'f') - val = _tolower(*s) - 'a' + 10; + if ('0' <= c && c <= '9') + val = c - '0'; + else if ('a' <= lc && lc <= 'f') + val = lc - 'a' + 10; else break; -- cgit v1.2.3 From 0f789b67647205b77dee56fcc27a7d8de3fcd52e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 10 Jul 2017 15:51:43 -0700 Subject: lib/interval_tree_test.c: allow the module to be compiled-in Patch series "lib/interval_tree_test: some debugging improvements". Here are some patches that update the interval_tree_test module allowing users to pass finer grained options to run the actual test. This patch (of 4): It is a tristate after all, and also serves well for quick debugging. Link: http://lkml.kernel.org/r/20170518174936.20265-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ca9460f049b8..e20fc079bebd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1594,7 +1594,7 @@ config RBTREE_TEST config INTERVAL_TREE_TEST tristate "Interval tree test" - depends on m && DEBUG_KERNEL + depends on DEBUG_KERNEL select INTERVAL_TREE help A benchmark measuring the performance of the interval tree library -- cgit v1.2.3 From a54dae0338b7f01eb0f9c7571fb9b74f791d1c6b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 10 Jul 2017 15:51:46 -0700 Subject: lib/interval_tree_test.c: make test options module parameters Allows for more flexible debugging. Link: http://lkml.kernel.org/r/20170518174936.20265-3-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/interval_tree_test.c | 57 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 17 deletions(-) (limited to 'lib') diff --git a/lib/interval_tree_test.c b/lib/interval_tree_test.c index 245900b98c8e..1093f0496d5e 100644 --- a/lib/interval_tree_test.c +++ b/lib/interval_tree_test.c @@ -1,16 +1,25 @@ #include +#include #include #include +#include #include -#define NODES 100 -#define PERF_LOOPS 100000 -#define SEARCHES 100 -#define SEARCH_LOOPS 10000 +#define __param(type, name, init, msg) \ + static type name = init; \ + module_param(name, type, 0444); \ + MODULE_PARM_DESC(name, msg); + +__param(int, nnodes, 100, "Number of nodes in the interval tree"); +__param(int, perf_loops, 100000, "Number of iterations modifying the tree"); + +__param(int, nsearches, 100, "Number of searches to the interval tree"); +__param(int, search_loops, 10000, "Number of iterations searching the tree"); + static struct rb_root root = RB_ROOT; -static struct interval_tree_node nodes[NODES]; -static u32 queries[SEARCHES]; +static struct interval_tree_node *nodes = NULL; +static u32 *queries = NULL; static struct rnd_state rnd; @@ -29,7 +38,8 @@ search(unsigned long query, struct rb_root *root) static void init(void) { int i; - for (i = 0; i < NODES; i++) { + + for (i = 0; i < nnodes; i++) { u32 a = prandom_u32_state(&rnd); u32 b = prandom_u32_state(&rnd); if (a <= b) { @@ -40,7 +50,7 @@ static void init(void) nodes[i].last = a; } } - for (i = 0; i < SEARCHES; i++) + for (i = 0; i < nsearches; i++) queries[i] = prandom_u32_state(&rnd); } @@ -50,6 +60,16 @@ static int interval_tree_test_init(void) unsigned long results; cycles_t time1, time2, time; + nodes = kmalloc(nnodes * sizeof(struct interval_tree_node), GFP_KERNEL); + if (!nodes) + return -ENOMEM; + + queries = kmalloc(nsearches * sizeof(int), GFP_KERNEL); + if (!queries) { + kfree(nodes); + return -ENOMEM; + } + printk(KERN_ALERT "interval tree insert/remove"); prandom_seed_state(&rnd, 3141592653589793238ULL); @@ -57,39 +77,42 @@ static int interval_tree_test_init(void) time1 = get_cycles(); - for (i = 0; i < PERF_LOOPS; i++) { - for (j = 0; j < NODES; j++) + for (i = 0; i < perf_loops; i++) { + for (j = 0; j < nnodes; j++) interval_tree_insert(nodes + j, &root); - for (j = 0; j < NODES; j++) + for (j = 0; j < nnodes; j++) interval_tree_remove(nodes + j, &root); } time2 = get_cycles(); time = time2 - time1; - time = div_u64(time, PERF_LOOPS); + time = div_u64(time, perf_loops); printk(" -> %llu cycles\n", (unsigned long long)time); printk(KERN_ALERT "interval tree search"); - for (j = 0; j < NODES; j++) + for (j = 0; j < nnodes; j++) interval_tree_insert(nodes + j, &root); time1 = get_cycles(); results = 0; - for (i = 0; i < SEARCH_LOOPS; i++) - for (j = 0; j < SEARCHES; j++) + for (i = 0; i < search_loops; i++) + for (j = 0; j < nsearches; j++) results += search(queries[j], &root); time2 = get_cycles(); time = time2 - time1; - time = div_u64(time, SEARCH_LOOPS); - results = div_u64(results, SEARCH_LOOPS); + time = div_u64(time, search_loops); + results = div_u64(results, search_loops); printk(" -> %llu cycles (%lu results)\n", (unsigned long long)time, results); + kfree(queries); + kfree(nodes); + return -EAGAIN; /* Fail will directly unload the module */ } -- cgit v1.2.3 From a8ec14d4f6aa8e245efacc992c8ee6ea0464ce2a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 10 Jul 2017 15:51:49 -0700 Subject: lib/interval_tree_test.c: allow users to limit scope of endpoint Add a 'max_endpoint' parameter such that users may easily limit the size of the intervals that are randomly generated. Link: http://lkml.kernel.org/r/20170518174936.20265-4-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/interval_tree_test.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/interval_tree_test.c b/lib/interval_tree_test.c index 1093f0496d5e..0fef6364a958 100644 --- a/lib/interval_tree_test.c +++ b/lib/interval_tree_test.c @@ -16,6 +16,7 @@ __param(int, perf_loops, 100000, "Number of iterations modifying the tree"); __param(int, nsearches, 100, "Number of searches to the interval tree"); __param(int, search_loops, 10000, "Number of iterations searching the tree"); +__param(uint, max_endpoint, ~0, "Largest value for the interval's endpoint"); static struct rb_root root = RB_ROOT; static struct interval_tree_node *nodes = NULL; @@ -40,18 +41,20 @@ static void init(void) int i; for (i = 0; i < nnodes; i++) { - u32 a = prandom_u32_state(&rnd); - u32 b = prandom_u32_state(&rnd); - if (a <= b) { - nodes[i].start = a; - nodes[i].last = b; - } else { - nodes[i].start = b; - nodes[i].last = a; - } + u32 b = (prandom_u32_state(&rnd) >> 4) % max_endpoint; + u32 a = (prandom_u32_state(&rnd) >> 4) % b; + + nodes[i].start = a; + nodes[i].last = b; } + + /* + * Limit the search scope to what the user defined. + * Otherwise we are merely measuring empty walks, + * which is pointless. + */ for (i = 0; i < nsearches; i++) - queries[i] = prandom_u32_state(&rnd); + queries[i] = (prandom_u32_state(&rnd) >> 4) % max_endpoint; } static int interval_tree_test_init(void) -- cgit v1.2.3 From c46ecce431ebe6b1a9551d1f530eb432dae5c39b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 10 Jul 2017 15:51:52 -0700 Subject: lib/interval_tree_test.c: allow full tree search ... such that a user can specify visiting all the nodes in the tree (intersects with the world). This is a nice opposite from the very basic default query which is a single point. Link: http://lkml.kernel.org/r/20170518174936.20265-5-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/interval_tree_test.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/interval_tree_test.c b/lib/interval_tree_test.c index 0fef6364a958..df495fe81421 100644 --- a/lib/interval_tree_test.c +++ b/lib/interval_tree_test.c @@ -15,6 +15,7 @@ __param(int, perf_loops, 100000, "Number of iterations modifying the tree"); __param(int, nsearches, 100, "Number of searches to the interval tree"); __param(int, search_loops, 10000, "Number of iterations searching the tree"); +__param(bool, search_all, false, "Searches will iterate all nodes in the tree"); __param(uint, max_endpoint, ~0, "Largest value for the interval's endpoint"); @@ -25,13 +26,13 @@ static u32 *queries = NULL; static struct rnd_state rnd; static inline unsigned long -search(unsigned long query, struct rb_root *root) +search(struct rb_root *root, unsigned long start, unsigned long last) { struct interval_tree_node *node; unsigned long results = 0; - for (node = interval_tree_iter_first(root, query, query); node; - node = interval_tree_iter_next(node, query, query)) + for (node = interval_tree_iter_first(root, start, last); node; + node = interval_tree_iter_next(node, start, last)) results++; return results; } @@ -102,8 +103,12 @@ static int interval_tree_test_init(void) results = 0; for (i = 0; i < search_loops; i++) - for (j = 0; j < nsearches; j++) - results += search(queries[j], &root); + for (j = 0; j < nsearches; j++) { + unsigned long start = search_all ? 0 : queries[j]; + unsigned long last = search_all ? max_endpoint : queries[j]; + + results += search(&root, start, last); + } time2 = get_cycles(); time = time2 - time1; -- cgit v1.2.3 From 12e8fd6fd380261fd200d2e8f7a519ade73ea05b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:51:55 -0700 Subject: lib/rhashtable.c: use kvzalloc() in bucket_table_alloc() when possible bucket_table_alloc() can be currently called with GFP_KERNEL or GFP_ATOMIC. For the former we basically have an open coded kvzalloc() while the later only uses kzalloc(). Let's simplify the code a bit by the dropping the open coded path and replace it with kvzalloc(). Link: http://lkml.kernel.org/r/20170531155145.17111-3-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Thomas Graf Cc: Herbert Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rhashtable.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index d9e7274a04cd..42466c167257 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -211,11 +211,10 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, int i; size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || - gfp != GFP_KERNEL) + if (gfp != GFP_KERNEL) tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); - if (tbl == NULL && gfp == GFP_KERNEL) - tbl = vzalloc(size); + else + tbl = kvzalloc(size, gfp); size = nbuckets; -- cgit v1.2.3 From a94c33dd1f677d16c4f1a162b4b3e9eba1b07c24 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Mon, 10 Jul 2017 15:51:58 -0700 Subject: lib/extable.c: use bsearch() library function in search_extable() [thomas@m3y3r.de: v3: fix arch specific implementations] Link: http://lkml.kernel.org/r/1497890858.12931.7.camel@m3y3r.de Signed-off-by: Thomas Meyer Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/module.c | 3 ++- arch/mips/kernel/traps.c | 3 ++- arch/sh/mm/extable_64.c | 34 ++++++++++++++++++---------------- arch/sparc/mm/extable.c | 28 ++++++++++++++-------------- include/linux/extable.h | 5 +++-- kernel/extable.c | 3 ++- kernel/module.c | 2 +- lib/extable.c | 41 +++++++++++++++++++++-------------------- 8 files changed, 63 insertions(+), 56 deletions(-) (limited to 'lib') diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 94627a3a6a0d..50c020c47e54 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -317,7 +317,8 @@ const struct exception_table_entry *search_module_dbetables(unsigned long addr) spin_lock_irqsave(&dbe_lock, flags); list_for_each_entry(dbe, &dbe_list, dbe_list) { - e = search_extable(dbe->dbe_start, dbe->dbe_end - 1, addr); + e = search_extable(dbe->dbe_start, + dbe->dbe_end - dbe->dbe_start, addr); if (e) break; } diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 38dfa27730ff..b68b4d0726d3 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -429,7 +429,8 @@ static const struct exception_table_entry *search_dbe_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___dbe_table, __stop___dbe_table - 1, addr); + e = search_extable(__start___dbe_table, + __stop___dbe_table - __start___dbe_table, addr); if (!e) e = search_module_dbetables(addr); return e; diff --git a/arch/sh/mm/extable_64.c b/arch/sh/mm/extable_64.c index b90cdfad2c78..7a3b4d33d2e7 100644 --- a/arch/sh/mm/extable_64.c +++ b/arch/sh/mm/extable_64.c @@ -10,6 +10,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. */ +#include #include #include #include @@ -40,10 +41,23 @@ static const struct exception_table_entry *check_exception_ranges(unsigned long return NULL; } +static int cmp_ex_search(const void *key, const void *elt) +{ + const struct exception_table_entry *_elt = elt; + unsigned long _key = *(unsigned long *)key; + + /* avoid overflow */ + if (_key > _elt->insn) + return 1; + if (_key < _elt->insn) + return -1; + return 0; +} + /* Simple binary search */ const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { const struct exception_table_entry *mid; @@ -52,20 +66,8 @@ search_extable(const struct exception_table_entry *first, if (mid) return mid; - while (first <= last) { - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid+1; - else - last = mid-1; - } - - return NULL; + return bsearch(&value, base, num, + sizeof(struct exception_table_entry), cmp_ex_search); } int fixup_exception(struct pt_regs *regs) diff --git a/arch/sparc/mm/extable.c b/arch/sparc/mm/extable.c index db214e9931d9..2422511dc8c5 100644 --- a/arch/sparc/mm/extable.c +++ b/arch/sparc/mm/extable.c @@ -13,11 +13,11 @@ void sort_extable(struct exception_table_entry *start, /* Caller knows they are in a range if ret->fixup == 0 */ const struct exception_table_entry * -search_extable(const struct exception_table_entry *start, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { - const struct exception_table_entry *walk; + int i; /* Single insn entries are encoded as: * word 1: insn address @@ -37,30 +37,30 @@ search_extable(const struct exception_table_entry *start, */ /* 1. Try to find an exact match. */ - for (walk = start; walk <= last; walk++) { - if (walk->fixup == 0) { + for (i = 0; i < num; i++) { + if (base[i].fixup == 0) { /* A range entry, skip both parts. */ - walk++; + i++; continue; } /* A deleted entry; see trim_init_extable */ - if (walk->fixup == -1) + if (base[i].fixup == -1) continue; - if (walk->insn == value) - return walk; + if (base[i].insn == value) + return &base[i]; } /* 2. Try to find a range match. */ - for (walk = start; walk <= (last - 1); walk++) { - if (walk->fixup) + for (i = 0; i < (num - 1); i++) { + if (base[i].fixup) continue; - if (walk[0].insn <= value && walk[1].insn > value) - return walk; + if (base[i].insn <= value && base[i + 1].insn > value) + return &base[i]; - walk++; + i++; } return NULL; diff --git a/include/linux/extable.h b/include/linux/extable.h index 7effea4b257d..28addad0dda7 100644 --- a/include/linux/extable.h +++ b/include/linux/extable.h @@ -2,13 +2,14 @@ #define _LINUX_EXTABLE_H #include /* for NULL */ +#include struct module; struct exception_table_entry; const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value); void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish); diff --git a/kernel/extable.c b/kernel/extable.c index 223df4a328a4..38c2412401a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___ex_table, __stop___ex_table-1, addr); + e = search_extable(__start___ex_table, + __stop___ex_table - __start___ex_table, addr); if (!e) e = search_module_extables(addr); return e; diff --git a/kernel/module.c b/kernel/module.c index b3dbdde82e80..b0f92a365140 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4196,7 +4196,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) goto out; e = search_extable(mod->extable, - mod->extable + mod->num_exentries - 1, + mod->num_exentries, addr); out: preempt_enable(); diff --git a/lib/extable.c b/lib/extable.c index 62968daa66a9..f54996fdd0b8 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -9,6 +9,7 @@ * 2 of the License, or (at your option) any later version. */ +#include #include #include #include @@ -51,7 +52,7 @@ static void swap_ex(void *a, void *b, int size) * This is used both for the kernel exception table and for * the exception tables of modules that get loaded. */ -static int cmp_ex(const void *a, const void *b) +static int cmp_ex_sort(const void *a, const void *b) { const struct exception_table_entry *x = a, *y = b; @@ -67,7 +68,7 @@ void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish) { sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, swap_ex); + cmp_ex_sort, swap_ex); } #ifdef CONFIG_MODULES @@ -93,6 +94,20 @@ void trim_init_extable(struct module *m) #endif /* !ARCH_HAS_SORT_EXTABLE */ #ifndef ARCH_HAS_SEARCH_EXTABLE + +static int cmp_ex_search(const void *key, const void *elt) +{ + const struct exception_table_entry *_elt = elt; + unsigned long _key = *(unsigned long *)key; + + /* avoid overflow */ + if (_key > ex_to_insn(_elt)) + return 1; + if (_key < ex_to_insn(_elt)) + return -1; + return 0; +} + /* * Search one exception table for an entry corresponding to the * given instruction address, and return the address of the entry, @@ -101,25 +116,11 @@ void trim_init_extable(struct module *m) * already sorted. */ const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { - while (first <= last) { - const struct exception_table_entry *mid; - - mid = ((last - first) >> 1) + first; - /* - * careful, the distance between value and insn - * can be larger than MAX_LONG: - */ - if (ex_to_insn(mid) < value) - first = mid + 1; - else if (ex_to_insn(mid) > value) - last = mid - 1; - else - return mid; - } - return NULL; + return bsearch(&value, base, num, + sizeof(struct exception_table_entry), cmp_ex_search); } #endif -- cgit v1.2.3 From 166a0f780a8fdb67f232c85e9905ba84f7247da9 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 10 Jul 2017 15:52:01 -0700 Subject: lib/bsearch.c: micro-optimize pivot position calculation There is a slightly faster way (in terms of the number of instructions being used) to calculate the position of a middle element, preserving integer overflow safeness. ./scripts/bloat-o-meter lib/bsearch.o.old lib/bsearch.o.new add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-24 (-24) function old new delta bsearch 122 98 -24 TEST INT array of size 100001, elements [0..100000]. gcc 7.1, Os, x86_64. a) bsearch() of existing key "100001 - 2": BASE ==== $ perf stat ./a.out Performance counter stats for './a.out': 619.445196 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 133 page-faults:u # 0.215 K/sec 1,949,517,279 cycles:u # 3.147 GHz (83.06%) 181,017,938 stalled-cycles-frontend:u # 9.29% frontend cycles idle (83.05%) 82,959,265 stalled-cycles-backend:u # 4.26% backend cycles idle (67.02%) 4,355,706,383 instructions:u # 2.23 insn per cycle # 0.04 stalled cycles per insn (83.54%) 1,051,539,242 branches:u # 1697.550 M/sec (83.54%) 15,263,381 branch-misses:u # 1.45% of all branches (83.43%) 0.620082548 seconds time elapsed PATCHED ======= $ perf stat ./a.out Performance counter stats for './a.out': 475.097316 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 135 page-faults:u # 0.284 K/sec 1,487,467,717 cycles:u # 3.131 GHz (82.95%) 186,537,162 stalled-cycles-frontend:u # 12.54% frontend cycles idle (82.93%) 28,797,869 stalled-cycles-backend:u # 1.94% backend cycles idle (67.10%) 3,807,564,203 instructions:u # 2.56 insn per cycle # 0.05 stalled cycles per insn (83.57%) 1,049,344,291 branches:u # 2208.693 M/sec (83.60%) 5,485 branch-misses:u # 0.00% of all branches (83.58%) 0.475760235 seconds time elapsed b) bsearch() of un-existing key "100001 + 2": BASE ==== $ perf stat ./a.out Performance counter stats for './a.out': 499.244480 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 132 page-faults:u # 0.264 K/sec 1,571,194,855 cycles:u # 3.147 GHz (83.18%) 13,450,980 stalled-cycles-frontend:u # 0.86% frontend cycles idle (83.18%) 21,256,072 stalled-cycles-backend:u # 1.35% backend cycles idle (66.78%) 4,171,197,909 instructions:u # 2.65 insn per cycle # 0.01 stalled cycles per insn (83.68%) 1,009,175,281 branches:u # 2021.405 M/sec (83.79%) 3,122 branch-misses:u # 0.00% of all branches (83.37%) 0.499871144 seconds time elapsed PATCHED ======= $ perf stat ./a.out Performance counter stats for './a.out': 399.023499 task-clock:u (msec) # 0.998 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 134 page-faults:u # 0.336 K/sec 1,245,793,991 cycles:u # 3.122 GHz (83.39%) 11,529,273 stalled-cycles-frontend:u # 0.93% frontend cycles idle (83.46%) 12,116,311 stalled-cycles-backend:u # 0.97% backend cycles idle (66.92%) 3,679,710,005 instructions:u # 2.95 insn per cycle # 0.00 stalled cycles per insn (83.47%) 1,009,792,625 branches:u # 2530.660 M/sec (83.46%) 2,590 branch-misses:u # 0.00% of all branches (83.12%) 0.399733539 seconds time elapsed Link: http://lkml.kernel.org/r/20170607150457.5905-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/bsearch.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/bsearch.c b/lib/bsearch.c index e33c179089db..18b445b010c3 100644 --- a/lib/bsearch.c +++ b/lib/bsearch.c @@ -33,19 +33,21 @@ void *bsearch(const void *key, const void *base, size_t num, size_t size, int (*cmp)(const void *key, const void *elt)) { - size_t start = 0, end = num; + const char *pivot; int result; - while (start < end) { - size_t mid = start + (end - start) / 2; + while (num > 0) { + pivot = base + (num >> 1) * size; + result = cmp(key, pivot); - result = cmp(key, base + mid * size); - if (result < 0) - end = mid; - else if (result > 0) - start = mid + 1; - else - return (void *)base + mid * size; + if (result == 0) + return (void *)pivot; + + if (result > 0) { + base = pivot + size; + num--; + } + num >>= 1; } return NULL; -- cgit v1.2.3 From 9308f2f9e7f055cf3934645ec622bb5259dc1c14 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:43 -0700 Subject: test_sysctl: add dedicated proc sysctl test driver The existing tools/testing/selftests/sysctl/ tests include two test cases, but these use existing production kernel sysctl interfaces. We want to expand test coverage but we can't just be looking for random safe production values to poke at, that's just insane! Instead just dedicate a test driver for debugging purposes and port the existing scripts to use it. This will make it easier for further tests to be added. Subsequent patches will extend our test coverage for sysctl. The stress test driver uses a new license (GPL on Linux, copyleft-next outside of Linux). Linus was fine with this [0] and later due to Ted's and Alans's request ironed out an "or" language clause to use [1] which is already present upstream. [0] https://lkml.kernel.org/r/CA+55aFyhxcvD+q7tp+-yrSFDKfR0mOHgyEAe=f_94aKLsOu0Og@mail.gmail.com [1] https://lkml.kernel.org/r/1495234558.7848.122.camel@linux.intel.com Link: http://lkml.kernel.org/r/20170630224431.17374-2-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Acked-by: Kees Cook Cc: "Eric W. Biederman" Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 11 +++ lib/Makefile | 1 + lib/test_sysctl.c | 113 ++++++++++++++++++++++++ tools/testing/selftests/sysctl/config | 1 + tools/testing/selftests/sysctl/run_numerictests | 4 +- tools/testing/selftests/sysctl/run_stringtests | 4 +- 6 files changed, 130 insertions(+), 4 deletions(-) create mode 100644 lib/test_sysctl.c create mode 100644 tools/testing/selftests/sysctl/config (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e20fc079bebd..f28f4252e54a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1785,6 +1785,17 @@ config TEST_FIRMWARE If unsure, say N. +config TEST_SYSCTL + tristate "sysctl test driver" + default n + depends on PROC_SYSCTL + help + This builds the "test_sysctl" module. This driver enables to test the + proc sysctl interfaces available to drivers safely without affecting + production knobs which might alter system functionality. + + If unsure, say N. + config TEST_UDELAY tristate "udelay test driver" default n diff --git a/lib/Makefile b/lib/Makefile index 5a008329324e..85e91e51a9fe 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o obj-$(CONFIG_TEST_BPF) += test_bpf.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o +obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c new file mode 100644 index 000000000000..b2163bfb6eb2 --- /dev/null +++ b/lib/test_sysctl.c @@ -0,0 +1,113 @@ +/* + * proc sysctl test driver + * + * Copyright (C) 2017 Luis R. Rodriguez + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or at your option any + * later version; or, when distributed separately from the Linux kernel or + * when incorporated into other software packages, subject to the following + * license: + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of copyleft-next (version 0.3.1 or later) as published + * at http://copyleft-next.org/. + */ + +/* + * This module provides an interface to the the proc sysctl interfaces. This + * driver requires CONFIG_PROC_SYSCTL. It will not normally be loaded by the + * system unless explicitly requested by name. You can also build this driver + * into your kernel. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int i_zero; +static int i_one_hundred = 100; + +struct test_sysctl_data { + int int_0001; + char string_0001[65]; +}; + +static struct test_sysctl_data test_data = { + .int_0001 = 60, + .string_0001 = "(none)", +}; + +/* These are all under /proc/sys/debug/test_sysctl/ */ +static struct ctl_table test_table[] = { + { + .procname = "int_0001", + .data = &test_data.int_0001, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }, + { + .procname = "string_0001", + .data = &test_data.string_0001, + .maxlen = sizeof(test_data.string_0001), + .mode = 0644, + .proc_handler = proc_dostring, + }, + { } +}; + +static struct ctl_table test_sysctl_table[] = { + { + .procname = "test_sysctl", + .maxlen = 0, + .mode = 0555, + .child = test_table, + }, + { } +}; + +static struct ctl_table test_sysctl_root_table[] = { + { + .procname = "debug", + .maxlen = 0, + .mode = 0555, + .child = test_sysctl_table, + }, + { } +}; + +static struct ctl_table_header *test_sysctl_header; + +static int __init test_sysctl_init(void) +{ + test_sysctl_header = register_sysctl_table(test_sysctl_root_table); + if (!test_sysctl_header) + return -ENOMEM; + return 0; +} +late_initcall(test_sysctl_init); + +static void __exit test_sysctl_exit(void) +{ + if (test_sysctl_header) + unregister_sysctl_table(test_sysctl_header); +} + +module_exit(test_sysctl_exit); + +MODULE_AUTHOR("Luis R. Rodriguez "); +MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/sysctl/config b/tools/testing/selftests/sysctl/config new file mode 100644 index 000000000000..6ca14800d755 --- /dev/null +++ b/tools/testing/selftests/sysctl/config @@ -0,0 +1 @@ +CONFIG_TEST_SYSCTL=y diff --git a/tools/testing/selftests/sysctl/run_numerictests b/tools/testing/selftests/sysctl/run_numerictests index e6e76c93d948..c375ce0f4c15 100755 --- a/tools/testing/selftests/sysctl/run_numerictests +++ b/tools/testing/selftests/sysctl/run_numerictests @@ -1,7 +1,7 @@ #!/bin/sh -SYSCTL="/proc/sys" -TARGET="${SYSCTL}/vm/swappiness" +SYSCTL="/proc/sys/debug/test_sysctl/" +TARGET="${SYSCTL}/int_0001" ORIG=$(cat "${TARGET}") TEST_STR=$(( $ORIG + 1 )) diff --git a/tools/testing/selftests/sysctl/run_stringtests b/tools/testing/selftests/sysctl/run_stringtests index 857ec667fb02..a6f2618afeaa 100755 --- a/tools/testing/selftests/sysctl/run_stringtests +++ b/tools/testing/selftests/sysctl/run_stringtests @@ -1,7 +1,7 @@ #!/bin/sh -SYSCTL="/proc/sys" -TARGET="${SYSCTL}/kernel/domainname" +SYSCTL="/proc/sys/debug/test_sysctl/" +TARGET="${SYSCTL}/string_0001" ORIG=$(cat "${TARGET}") TEST_STR="Testing sysctl" -- cgit v1.2.3 From eb965eda1cabf26e62afc07a06cdf2fd5aaa2906 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:52 -0700 Subject: test_sysctl: add simple proc_dointvec() case Test against a simple proc_dointvec() case. While at it, add a test against INT_MAX. Make sure INT_MAX works, and INT_MAX+1 will fail. Also test negative values work. Link: http://lkml.kernel.org/r/20170630224431.17374-5-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Kees Cook Cc: "Eric W. Biederman" Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_sysctl.c | 11 ++++++ tools/testing/selftests/sysctl/sysctl.sh | 62 ++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) (limited to 'lib') diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index b2163bfb6eb2..1472e1ae4931 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -41,11 +41,15 @@ static int i_one_hundred = 100; struct test_sysctl_data { int int_0001; + int int_0002; + char string_0001[65]; }; static struct test_sysctl_data test_data = { .int_0001 = 60, + .int_0002 = 1, + .string_0001 = "(none)", }; @@ -60,6 +64,13 @@ static struct ctl_table test_table[] = { .extra1 = &i_zero, .extra2 = &i_one_hundred, }, + { + .procname = "int_0002", + .data = &test_data.int_0002, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "string_0001", .data = &test_data.string_0001, diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index 6ec807576f7c..7ba3fa2bbd54 100644 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -31,6 +31,7 @@ TEST_FILE=$(mktemp) # we have tons of space. ALL_TESTS="0001:1:1" ALL_TESTS="$ALL_TESTS 0002:1:1" +ALL_TESTS="$ALL_TESTS 0003:1:1" test_modprobe() { @@ -82,6 +83,9 @@ function check_production_sysctl_writes_strict() if [ -z $MAX_DIGITS ]; then MAX_DIGITS=$(($PAGE_SIZE/8)) fi + if [ -z $INT_MAX ]; then + INT_MAX=$(getconf INT_MAX) + fi } test_reqs() @@ -122,6 +126,9 @@ reset_vals() int_0001) VAL="60" ;; + int_0002) + VAL="1" + ;; string_0001) VAL="(none)" ;; @@ -296,6 +303,48 @@ run_limit_digit() test_rc } +# You are using an int +run_limit_digit_int() +{ + echo -n "Testing INT_MAX works ..." + reset_vals + TEST_STR="$INT_MAX" + echo -n $TEST_STR > $TARGET + + if ! verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing INT_MAX + 1 will fail as expected..." + reset_vals + let TEST_STR=$INT_MAX+1 + echo -n $TEST_STR > $TARGET 2> /dev/null + + if verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing negative values will work as expected..." + reset_vals + TEST_STR="-3" + echo -n $TEST_STR > $TARGET 2> /dev/null + if ! verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc +} + run_stringtests() { echo -n "Writing entire sysctl in short writes ... " @@ -389,6 +438,18 @@ sysctl_test_0002() run_stringtests } +sysctl_test_0003() +{ + TARGET="${SYSCTL}/int_0002" + reset_vals + ORIG=$(cat "${TARGET}") + TEST_STR=$(( $ORIG + 1 )) + + run_numerictests + run_limit_digit + run_limit_digit_int +} + list_tests() { echo "Test ID list:" @@ -399,6 +460,7 @@ list_tests() echo echo "0001 x $(get_test_count 0001) - tests proc_dointvec_minmax()" echo "0002 x $(get_test_count 0002) - tests proc_dostring()" + echo "0003 x $(get_test_count 0003) - tests proc_dointvec()" } test_reqs -- cgit v1.2.3 From 2920fad3a5d394b66011c7f35c7b05278354055e Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:55 -0700 Subject: test_sysctl: add simple proc_douintvec() case Test against a simple proc_douintvec() case. While at it, add a test against UINT_MAX. Make sure UINT_MAX works, and UINT_MAX+1 will fail and that negative values are not accepted. Link: http://lkml.kernel.org/r/20170630224431.17374-6-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Kees Cook Cc: "Eric W. Biederman" Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_sysctl.c | 11 ++++++ tools/testing/selftests/sysctl/sysctl.sh | 63 ++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) (limited to 'lib') diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index 1472e1ae4931..53db3513ab08 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -43,6 +43,8 @@ struct test_sysctl_data { int int_0001; int int_0002; + unsigned int uint_0001; + char string_0001[65]; }; @@ -50,6 +52,8 @@ static struct test_sysctl_data test_data = { .int_0001 = 60, .int_0002 = 1, + .uint_0001 = 314, + .string_0001 = "(none)", }; @@ -71,6 +75,13 @@ static struct ctl_table test_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "uint_0001", + .data = &test_data.uint_0001, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec, + }, { .procname = "string_0001", .data = &test_data.string_0001, diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index 7ba3fa2bbd54..abeef675a884 100644 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -32,6 +32,7 @@ TEST_FILE=$(mktemp) ALL_TESTS="0001:1:1" ALL_TESTS="$ALL_TESTS 0002:1:1" ALL_TESTS="$ALL_TESTS 0003:1:1" +ALL_TESTS="$ALL_TESTS 0004:1:1" test_modprobe() { @@ -86,6 +87,9 @@ function check_production_sysctl_writes_strict() if [ -z $INT_MAX ]; then INT_MAX=$(getconf INT_MAX) fi + if [ -z $UINT_MAX ]; then + UINT_MAX=$(getconf UINT_MAX) + fi } test_reqs() @@ -129,6 +133,9 @@ reset_vals() int_0002) VAL="1" ;; + uint_0001) + VAL="314" + ;; string_0001) VAL="(none)" ;; @@ -345,6 +352,49 @@ run_limit_digit_int() test_rc } +# You are using an unsigned int +run_limit_digit_uint() +{ + echo -n "Testing UINT_MAX works ..." + reset_vals + TEST_STR="$UINT_MAX" + echo -n $TEST_STR > $TARGET + + if ! verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing UINT_MAX + 1 will fail as expected..." + reset_vals + TEST_STR=$(($UINT_MAX+1)) + echo -n $TEST_STR > $TARGET 2> /dev/null + + if verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing negative values will not work as expected ..." + reset_vals + TEST_STR="-3" + echo -n $TEST_STR > $TARGET 2> /dev/null + + if verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc +} + run_stringtests() { echo -n "Writing entire sysctl in short writes ... " @@ -450,6 +500,18 @@ sysctl_test_0003() run_limit_digit_int } +sysctl_test_0004() +{ + TARGET="${SYSCTL}/uint_0001" + reset_vals + ORIG=$(cat "${TARGET}") + TEST_STR=$(( $ORIG + 1 )) + + run_numerictests + run_limit_digit + run_limit_digit_uint +} + list_tests() { echo "Test ID list:" @@ -461,6 +523,7 @@ list_tests() echo "0001 x $(get_test_count 0001) - tests proc_dointvec_minmax()" echo "0002 x $(get_test_count 0002) - tests proc_dostring()" echo "0003 x $(get_test_count 0003) - tests proc_dointvec()" + echo "0004 x $(get_test_count 0004) - tests proc_douintvec()" } test_reqs -- cgit v1.2.3 From 7c43a657a4beadeb6d2fe1a00732261e313a807f Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:58 -0700 Subject: test_sysctl: test against int proc_dointvec() array support Add a few initial respective tests for an array: o Echoing values separated by spaces works o Echoing only first elements will set first elements o Confirm PAGE_SIZE limit still applies even if an array is used Link: http://lkml.kernel.org/r/20170630224431.17374-7-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Kees Cook Cc: "Eric W. Biederman" Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_sysctl.c | 13 +++++ tools/testing/selftests/sysctl/sysctl.sh | 89 ++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) (limited to 'lib') diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index 53db3513ab08..3dd801c1c85b 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -42,6 +42,7 @@ static int i_one_hundred = 100; struct test_sysctl_data { int int_0001; int int_0002; + int int_0003[4]; unsigned int uint_0001; @@ -52,6 +53,11 @@ static struct test_sysctl_data test_data = { .int_0001 = 60, .int_0002 = 1, + .int_0003[0] = 0, + .int_0003[1] = 1, + .int_0003[2] = 2, + .int_0003[3] = 3, + .uint_0001 = 314, .string_0001 = "(none)", @@ -75,6 +81,13 @@ static struct ctl_table test_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "int_0003", + .data = &test_data.int_0003, + .maxlen = sizeof(test_data.int_0003), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "uint_0001", .data = &test_data.uint_0001, diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index abeef675a884..ec232c3cfcaa 100644 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -33,6 +33,7 @@ ALL_TESTS="0001:1:1" ALL_TESTS="$ALL_TESTS 0002:1:1" ALL_TESTS="$ALL_TESTS 0003:1:1" ALL_TESTS="$ALL_TESTS 0004:1:1" +ALL_TESTS="$ALL_TESTS 0005:3:1" test_modprobe() { @@ -108,6 +109,10 @@ test_reqs() echo "$0: You need getconf installed" exit 1 fi + if ! which diff 2> /dev/null > /dev/null; then + echo "$0: You need diff installed" + exit 1 + fi } function load_req_mod() @@ -167,6 +172,12 @@ verify() return 0 } +verify_diff_w() +{ + echo "$TEST_STR" | diff -q -w -u - $1 + return $? +} + test_rc() { if [[ $rc != 0 ]]; then @@ -352,6 +363,74 @@ run_limit_digit_int() test_rc } +# You used an int array +run_limit_digit_int_array() +{ + echo -n "Testing array works as expected ... " + TEST_STR="4 3 2 1" + echo -n $TEST_STR > $TARGET + + if ! verify_diff_w "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing skipping trailing array elements works ... " + # Do not reset_vals, carry on the values from the last test. + # If we only echo in two digits the last two are left intact + TEST_STR="100 101" + echo -n $TEST_STR > $TARGET + # After we echo in, to help diff we need to set on TEST_STR what + # we expect the result to be. + TEST_STR="100 101 2 1" + + if ! verify_diff_w "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing PAGE_SIZE limit on array works ... " + # Do not reset_vals, carry on the values from the last test. + # Even if you use an int array, you are still restricted to + # MAX_DIGITS, this is a known limitation. Test limit works. + LIMIT=$((MAX_DIGITS -1)) + TEST_STR="9" + (perl -e 'print " " x '$LIMIT';'; echo "${TEST_STR}") | \ + dd of="${TARGET}" 2>/dev/null + + TEST_STR="9 101 2 1" + if ! verify_diff_w "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing exceeding PAGE_SIZE limit fails as expected ... " + # Do not reset_vals, carry on the values from the last test. + # Now go over limit. + LIMIT=$((MAX_DIGITS)) + TEST_STR="7" + (perl -e 'print " " x '$LIMIT';'; echo "${TEST_STR}") | \ + dd of="${TARGET}" 2>/dev/null + + TEST_STR="7 101 2 1" + if verify_diff_w "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc +} + # You are using an unsigned int run_limit_digit_uint() { @@ -512,6 +591,15 @@ sysctl_test_0004() run_limit_digit_uint } +sysctl_test_0005() +{ + TARGET="${SYSCTL}/int_0003" + reset_vals + ORIG=$(cat "${TARGET}") + + run_limit_digit_int_array +} + list_tests() { echo "Test ID list:" @@ -524,6 +612,7 @@ list_tests() echo "0002 x $(get_test_count 0002) - tests proc_dostring()" echo "0003 x $(get_test_count 0003) - tests proc_dointvec()" echo "0004 x $(get_test_count 0004) - tests proc_douintvec()" + echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array" } test_reqs -- cgit v1.2.3 From e41d58185f1444368873d4d7422f7664a68be61d Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 12 Jul 2017 14:34:35 -0700 Subject: fault-inject: support systematic fault injection Add /proc/self/task//fail-nth file that allows failing 0-th, 1-st, 2-nd and so on calls systematically. Excerpt from the added documentation: "Write to this file of integer N makes N-th call in the current task fail (N is 0-based). Read from this file returns a single char 'Y' or 'N' that says if the fault setup with a previous write to this file was injected or not, and disables the fault if it wasn't yet injected. Note that this file enables all types of faults (slab, futex, etc). This setting takes precedence over all other generic settings like probability, interval, times, etc. But per-capability settings (e.g. fail_futex/ignore-private) take precedence over it. This feature is intended for systematic testing of faults in a single system call. See an example below" Why add a new setting: 1. Existing settings are global rather than per-task. So parallel testing is not possible. 2. attr->interval is close but it depends on attr->count which is non reset to 0, so interval does not work as expected. 3. Trying to model this with existing settings requires manipulations of all of probability, interval, times, space, task-filter and unexposed count and per-task make-it-fail files. 4. Existing settings are per-failure-type, and the set of failure types is potentially expanding. 5. make-it-fail can't be changed by unprivileged user and aggressive stress testing better be done from an unprivileged user. Similarly, this would require opening the debugfs files to the unprivileged user, as he would need to reopen at least times file (not possible to pre-open before dropping privs). The proposed interface solves all of the above (see the example). We want to integrate this into syzkaller fuzzer. A prototype has found 10 bugs in kernel in first day of usage: https://groups.google.com/forum/#!searchin/syzkaller/%22FAULT_INJECTION%22%7Csort:relevance I've made the current interface work with all types of our sandboxes. For setuid the secret sauce was prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) to make /proc entries non-root owned. So I am fine with the current version of the code. [akpm@linux-foundation.org: fix build] Link: http://lkml.kernel.org/r/20170328130128.101773-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Akinobu Mita Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/fault-injection/fault-injection.txt | 78 +++++++++++++++++++++++ fs/proc/base.c | 52 +++++++++++++++ include/linux/sched.h | 1 + kernel/fork.c | 4 ++ lib/fault-inject.c | 7 ++ 5 files changed, 142 insertions(+) (limited to 'lib') diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt index 415484f3d59a..192d8cbcc5f9 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.txt @@ -134,6 +134,22 @@ use the boot option: fail_futex= mmc_core.fail_request=,,, +o proc entries + +- /proc/self/task//fail-nth: + + Write to this file of integer N makes N-th call in the current task fail + (N is 0-based). Read from this file returns a single char 'Y' or 'N' + that says if the fault setup with a previous write to this file was + injected or not, and disables the fault if it wasn't yet injected. + Note that this file enables all types of faults (slab, futex, etc). + This setting takes precedence over all other generic debugfs settings + like probability, interval, times, etc. But per-capability settings + (e.g. fail_futex/ignore-private) take precedence over it. + + This feature is intended for systematic testing of faults in a single + system call. See an example below. + How to add new fault injection capability ----------------------------------------- @@ -278,3 +294,65 @@ allocation failure. # env FAILCMD_TYPE=fail_page_alloc \ ./tools/testing/fault-injection/failcmd.sh --times=100 \ -- make -C tools/testing/selftests/ run_tests + +Systematic faults using fail-nth +--------------------------------- + +The following code systematically faults 0-th, 1-st, 2-nd and so on +capabilities in the socketpair() system call. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main() +{ + int i, err, res, fail_nth, fds[2]; + char buf[128]; + + system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait"); + sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid)); + fail_nth = open(buf, O_RDWR); + for (i = 0;; i++) { + sprintf(buf, "%d", i); + write(fail_nth, buf, strlen(buf)); + res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); + err = errno; + read(fail_nth, buf, 1); + if (res == 0) { + close(fds[0]); + close(fds[1]); + } + printf("%d-th fault %c: res=%d/%d\n", i, buf[0], res, err); + if (buf[0] != 'Y') + break; + } + return 0; +} + +An example output: + +0-th fault Y: res=-1/23 +1-th fault Y: res=-1/23 +2-th fault Y: res=-1/23 +3-th fault Y: res=-1/12 +4-th fault Y: res=-1/12 +5-th fault Y: res=-1/23 +6-th fault Y: res=-1/23 +7-th fault Y: res=-1/23 +8-th fault Y: res=-1/12 +9-th fault Y: res=-1/12 +10-th fault Y: res=-1/12 +11-th fault Y: res=-1/12 +12-th fault Y: res=-1/12 +13-th fault Y: res=-1/12 +14-th fault Y: res=-1/12 +15-th fault Y: res=-1/12 +16-th fault N: res=0/12 diff --git a/fs/proc/base.c b/fs/proc/base.c index f1e1927ccd48..88b773f318cd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1355,6 +1355,53 @@ static const struct file_operations proc_fault_inject_operations = { .write = proc_fault_inject_write, .llseek = generic_file_llseek, }; + +static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err, n; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + put_task_struct(task); + if (task != current) + return -EPERM; + err = kstrtoint_from_user(buf, count, 10, &n); + if (err) + return err; + if (n < 0 || n == INT_MAX) + return -EINVAL; + current->fail_nth = n + 1; + return count; +} + +static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + put_task_struct(task); + if (task != current) + return -EPERM; + if (count < 1) + return -EINVAL; + err = put_user((char)(current->fail_nth ? 'N' : 'Y'), buf); + if (err) + return err; + current->fail_nth = 0; + return 1; +} + +static const struct file_operations proc_fail_nth_operations = { + .read = proc_fail_nth_read, + .write = proc_fail_nth_write, +}; #endif @@ -3311,6 +3358,11 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), + /* + * Operations on the file check that the task is current, + * so we create it with 0666 to support testing under unprivileged user. + */ + REG("fail-nth", 0666, proc_fail_nth_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), diff --git a/include/linux/sched.h b/include/linux/sched.h index 20814b7d7d70..3822d749fc9e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -974,6 +974,7 @@ struct task_struct { #ifdef CONFIG_FAULT_INJECTION int make_it_fail; + int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call diff --git a/kernel/fork.c b/kernel/fork.c index d2b9d7c31eaf..ade237a96308 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -573,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) kcov_task_init(tsk); +#ifdef CONFIG_FAULT_INJECTION + tsk->fail_nth = 0; +#endif + return tsk; free_stack: diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 4ff157159a0d..09ac73c177fd 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -107,6 +107,12 @@ static inline bool fail_stacktrace(struct fault_attr *attr) bool should_fail(struct fault_attr *attr, ssize_t size) { + if (in_task() && current->fail_nth) { + if (--current->fail_nth == 0) + goto fail; + return false; + } + /* No need to check any other properties if the probability is 0 */ if (attr->probability == 0) return false; @@ -134,6 +140,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) if (!fail_stacktrace(attr)) return false; +fail: fail_dump(attr); if (atomic_read(&attr->times) != -1) -- cgit v1.2.3 From 05a4a95279311c3a4633b4277a5d21cfd616c6c7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:46 -0700 Subject: kernel/watchdog: split up config options Split SOFTLOCKUP_DETECTOR from LOCKUP_DETECTOR, and split HARDLOCKUP_DETECTOR_PERF from HARDLOCKUP_DETECTOR. LOCKUP_DETECTOR implies the general boot, sysctl, and programming interfaces for the lockup detectors. An architecture that wants to use a hard lockup detector must define HAVE_HARDLOCKUP_DETECTOR_PERF or HAVE_HARDLOCKUP_DETECTOR_ARCH. Alternatively an arch can define HAVE_NMI_WATCHDOG, which provides the minimum arch_touch_nmi_watchdog, and it otherwise does its own thing and does not implement the LOCKUP_DETECTOR interfaces. sparc is unusual in that it has started to implement some of the interfaces, but not fully yet. It should probably be converted to a full HAVE_HARDLOCKUP_DETECTOR_ARCH. [npiggin@gmail.com: fix] Link: http://lkml.kernel.org/r/20170617223522.66c0ad88@roar.ozlabs.ibm.com Link: http://lkml.kernel.org/r/20170616065715.18390-4-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Reviewed-by: Babu Moger Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 25 ++++- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/setup_64.c | 2 +- arch/x86/Kconfig | 1 + arch/x86/kernel/apic/hw_nmi.c | 2 +- include/linux/nmi.h | 29 +++-- kernel/Makefile | 2 +- kernel/sysctl.c | 31 +++--- kernel/watchdog.c | 243 +++++++++++++++++++++++++++-------------- kernel/watchdog_hld.c | 32 ------ lib/Kconfig.debug | 45 +++++--- 11 files changed, 251 insertions(+), 162 deletions(-) (limited to 'lib') diff --git a/arch/Kconfig b/arch/Kconfig index cae0958a2298..fb9bd7d36b05 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -198,9 +198,6 @@ config HAVE_KPROBES_ON_FTRACE config HAVE_NMI bool -config HAVE_NMI_WATCHDOG - depends on HAVE_NMI - bool # # An arch should select this if it provides all these things: # @@ -288,6 +285,28 @@ config HAVE_PERF_EVENTS_NMI subsystem. Also has support for calculating CPU cycle events to determine how many clock cycles in a given period. +config HAVE_HARDLOCKUP_DETECTOR_PERF + bool + depends on HAVE_PERF_EVENTS_NMI + help + The arch chooses to use the generic perf-NMI-based hardlockup + detector. Must define HAVE_PERF_EVENTS_NMI. + +config HAVE_NMI_WATCHDOG + depends on HAVE_NMI + bool + help + The arch provides a low level NMI watchdog. It provides + asm/nmi.h, and defines its own arch_touch_nmi_watchdog(). + +config HAVE_HARDLOCKUP_DETECTOR_ARCH + bool + select HAVE_NMI_WATCHDOG + help + The arch chooses to provide its own hardlockup detector, which is + a superset of the HAVE_NMI_WATCHDOG. It also conforms to config + interfaces and parameters provided by hardlockup detector subsystem. + config HAVE_PERF_REGS bool help diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7177a3f4f418..63ed758e1d20 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -197,6 +197,7 @@ config PPC select HAVE_OPTPROBES if PPC64 select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 4640f6d64f8b..074a075a9cdb 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -752,7 +752,7 @@ struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return ppc_proc_freq * watchdog_thresh; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 94a18681353d..3d2b8ce54e00 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -162,6 +162,7 @@ config X86 select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c73c9fb281e1..d6f387780849 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -19,7 +19,7 @@ #include #include -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index bd387ef8bccd..8aa01fd859fb 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -11,13 +11,21 @@ #endif #ifdef CONFIG_LOCKUP_DETECTOR +void lockup_detector_init(void); +#else +static inline void lockup_detector_init(void) +{ +} +#endif + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR extern void touch_softlockup_watchdog_sched(void); extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern unsigned int softlockup_panic; -extern unsigned int hardlockup_panic; -void lockup_detector_init(void); +extern int soft_watchdog_enabled; +extern atomic_t watchdog_park_in_progress; #else static inline void touch_softlockup_watchdog_sched(void) { @@ -31,9 +39,6 @@ static inline void touch_softlockup_watchdog_sync(void) static inline void touch_all_softlockup_watchdogs(void) { } -static inline void lockup_detector_init(void) -{ -} #endif #ifdef CONFIG_DETECT_HUNG_TASK @@ -63,15 +68,18 @@ static inline void reset_hung_task_detector(void) #if defined(CONFIG_HARDLOCKUP_DETECTOR) extern void hardlockup_detector_disable(void); +extern unsigned int hardlockup_panic; #else static inline void hardlockup_detector_disable(void) {} #endif -#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) extern void arch_touch_nmi_watchdog(void); #else +#if !defined(CONFIG_HAVE_NMI_WATCHDOG) static inline void arch_touch_nmi_watchdog(void) {} #endif +#endif /** * touch_nmi_watchdog - restart NMI watchdog timeout. @@ -141,15 +149,18 @@ static inline bool trigger_single_cpu_backtrace(int cpu) } #endif -#ifdef CONFIG_LOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh); +#endif + +#ifdef CONFIG_LOCKUP_DETECTOR extern int nmi_watchdog_enabled; -extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; extern unsigned long watchdog_enabled; +extern struct cpumask watchdog_cpumask; extern unsigned long *watchdog_cpumask_bits; -extern atomic_t watchdog_park_in_progress; +extern int __read_mostly watchdog_suspended; #ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; diff --git a/kernel/Makefile b/kernel/Makefile index 72aa080f91f0..4cb8e8b23c6e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index df9f2a367882..6648fbbb8157 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -900,6 +900,14 @@ static struct ctl_table kern_table[] = { .extra2 = &zero, #endif }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", .data = &soft_watchdog_enabled, @@ -909,13 +917,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, { .procname = "softlockup_panic", .data = &softlockup_panic, @@ -925,27 +926,29 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_SMP { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#endif /* CONFIG_SMP */ #endif -#ifdef CONFIG_SMP +#ifdef CONFIG_HARDLOCKUP_DETECTOR { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, + .procname = "hardlockup_panic", + .data = &hardlockup_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", .data = &sysctl_hardlockup_all_cpu_backtrace, @@ -957,6 +960,8 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_SMP */ #endif +#endif + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 03e0b69bb5bf..1fba9c3d66dc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,15 +29,58 @@ #include #include +/* Watchdog configuration */ static DEFINE_MUTEX(watchdog_proc_mutex); -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +int __read_mostly nmi_watchdog_enabled; + +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | + NMI_WATCHDOG_ENABLED; #else unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif -int __read_mostly nmi_watchdog_enabled; + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +unsigned int __read_mostly hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); + +#endif + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR int __read_mostly soft_watchdog_enabled; +#endif + int __read_mostly watchdog_user_enabled; int __read_mostly watchdog_thresh = 10; @@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10; int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; #endif -static struct cpumask watchdog_cpumask __read_mostly; +struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); -/* Helper for online, unparked cpus. */ -#define for_each_watchdog_cpu(cpu) \ - for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) - -atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); - /* * The 'watchdog_running' variable is set to 1 when the watchdog threads * are registered/started and is set to 0 when the watchdog threads are @@ -72,7 +109,27 @@ static int __read_mostly watchdog_running; * of 'watchdog_running' cannot change while the watchdog is deactivated * temporarily (see related code in 'proc' handlers). */ -static int __read_mostly watchdog_suspended; +int __read_mostly watchdog_suspended; + +/* + * These functions can be overridden if an architecture implements its + * own hardlockup detector. + */ +int __weak watchdog_nmi_enable(unsigned int cpu) +{ + return 0; +} +void __weak watchdog_nmi_disable(unsigned int cpu) +{ +} + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + +/* Helper for online, unparked cpus. */ +#define for_each_watchdog_cpu(cpu) \ + for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) + +atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); static u64 __read_mostly sample_period; @@ -120,6 +177,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#ifdef CONFIG_HARDLOCKUP_DETECTOR static int __init hardlockup_all_cpu_backtrace_setup(char *str) { sysctl_hardlockup_all_cpu_backtrace = @@ -128,6 +186,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str) } __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- @@ -213,18 +272,6 @@ void touch_softlockup_watchdog_sync(void) __this_cpu_write(watchdog_touch_ts, 0); } -/* watchdog detector functions */ -bool is_hardlockup(void) -{ - unsigned long hrint = __this_cpu_read(hrtimer_interrupts); - - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return true; - - __this_cpu_write(hrtimer_interrupts_saved, hrint); - return false; -} - static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); @@ -237,21 +284,21 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -static void watchdog_interrupt_count(void) +/* watchdog detector functions */ +bool is_hardlockup(void) { - __this_cpu_inc(hrtimer_interrupts); -} + unsigned long hrint = __this_cpu_read(hrtimer_interrupts); -/* - * These two functions are mostly architecture specific - * defining them as weak here. - */ -int __weak watchdog_nmi_enable(unsigned int cpu) -{ - return 0; + if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) + return true; + + __this_cpu_write(hrtimer_interrupts_saved, hrint); + return false; } -void __weak watchdog_nmi_disable(unsigned int cpu) + +static void watchdog_interrupt_count(void) { + __this_cpu_inc(hrtimer_interrupts); } static int watchdog_enable_all_cpus(void); @@ -502,57 +549,6 @@ static void watchdog_unpark_threads(void) kthread_unpark(per_cpu(softlockup_watchdog, cpu)); } -/* - * Suspend the hard and soft lockup detector by parking the watchdog threads. - */ -int lockup_detector_suspend(void) -{ - int ret = 0; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - /* - * Multiple suspend requests can be active in parallel (counted by - * the 'watchdog_suspended' variable). If the watchdog threads are - * running, the first caller takes care that they will be parked. - * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related code in 'proc' handlers). - */ - if (watchdog_running && !watchdog_suspended) - ret = watchdog_park_threads(); - - if (ret == 0) - watchdog_suspended++; - else { - watchdog_disable_all_cpus(); - pr_err("Failed to suspend lockup detectors, disabled\n"); - watchdog_enabled = 0; - } - - mutex_unlock(&watchdog_proc_mutex); - - return ret; -} - -/* - * Resume the hard and soft lockup detector by unparking the watchdog threads. - */ -void lockup_detector_resume(void) -{ - mutex_lock(&watchdog_proc_mutex); - - watchdog_suspended--; - /* - * The watchdog threads are unparked if they were previously running - * and if there is no more active suspend request. - */ - if (watchdog_running && !watchdog_suspended) - watchdog_unpark_threads(); - - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); -} - static int update_watchdog_all_cpus(void) { int ret; @@ -604,6 +600,81 @@ static void watchdog_disable_all_cpus(void) } } +#else /* SOFTLOCKUP */ +static int watchdog_park_threads(void) +{ + return 0; +} + +static void watchdog_unpark_threads(void) +{ +} + +static int watchdog_enable_all_cpus(void) +{ + return 0; +} + +static void watchdog_disable_all_cpus(void) +{ +} + +static void set_sample_period(void) +{ +} +#endif /* SOFTLOCKUP */ + +/* + * Suspend the hard and soft lockup detector by parking the watchdog threads. + */ +int lockup_detector_suspend(void) +{ + int ret = 0; + + get_online_cpus(); + mutex_lock(&watchdog_proc_mutex); + /* + * Multiple suspend requests can be active in parallel (counted by + * the 'watchdog_suspended' variable). If the watchdog threads are + * running, the first caller takes care that they will be parked. + * The state of 'watchdog_running' cannot change while a suspend + * request is active (see related code in 'proc' handlers). + */ + if (watchdog_running && !watchdog_suspended) + ret = watchdog_park_threads(); + + if (ret == 0) + watchdog_suspended++; + else { + watchdog_disable_all_cpus(); + pr_err("Failed to suspend lockup detectors, disabled\n"); + watchdog_enabled = 0; + } + + mutex_unlock(&watchdog_proc_mutex); + + return ret; +} + +/* + * Resume the hard and soft lockup detector by unparking the watchdog threads. + */ +void lockup_detector_resume(void) +{ + mutex_lock(&watchdog_proc_mutex); + + watchdog_suspended--; + /* + * The watchdog threads are unparked if they were previously running + * and if there is no more active suspend request. + */ + if (watchdog_running && !watchdog_suspended) + watchdog_unpark_threads(); + + mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); +} + #ifdef CONFIG_SYSCTL /* @@ -810,9 +881,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, * a temporary cpumask, so we are likely not in a * position to do much else to make things better. */ +#ifdef CONFIG_SOFTLOCKUP_DETECTOR if (smpboot_update_cpumask_percpu_thread( &watchdog_threads, &watchdog_cpumask) != 0) pr_err("cpumask update failed\n"); +#endif } } out: diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 90d688df6ce1..295a0d84934c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -22,39 +22,7 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static unsigned long hardlockup_allcpu_dumped; -/* - * We may not want to enable hard lockup detection by default in all cases, - * for example when running the kernel as a guest on a hypervisor. In these - * cases this function can be called to disable hard lockup detection. This - * function should only be executed once by the boot processor before the - * kernel command line parameters are parsed, because otherwise it is not - * possible to override this in hardlockup_panic_setup(). - */ -void hardlockup_detector_disable(void) -{ - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; -} - -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; - else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); void arch_touch_nmi_watchdog(void) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f28f4252e54a..b0d01c6d4e03 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -778,34 +778,45 @@ config DEBUG_SHIRQ menu "Debug Lockups and Hangs" config LOCKUP_DETECTOR - bool "Detect Hard and Soft Lockups" + bool + +config SOFTLOCKUP_DETECTOR + bool "Detect Soft Lockups" depends on DEBUG_KERNEL && !S390 + select LOCKUP_DETECTOR help Say Y here to enable the kernel to act as a watchdog to detect - hard and soft lockups. + soft lockups. Softlockups are bugs that cause the kernel to loop in kernel mode for more than 20 seconds, without giving other tasks a chance to run. The current stack trace is displayed upon detection and the system will stay locked up. +config HARDLOCKUP_DETECTOR_PERF + bool + select SOFTLOCKUP_DETECTOR + +# +# arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard +# lockup detector rather than the perf based detector. +# +config HARDLOCKUP_DETECTOR + bool "Detect Hard Lockups" + depends on DEBUG_KERNEL && !S390 + depends on HAVE_HARDLOCKUP_DETECTOR_PERF || HAVE_HARDLOCKUP_DETECTOR_ARCH + select LOCKUP_DETECTOR + select HARDLOCKUP_DETECTOR_PERF if HAVE_HARDLOCKUP_DETECTOR_PERF + select HARDLOCKUP_DETECTOR_ARCH if HAVE_HARDLOCKUP_DETECTOR_ARCH + help + Say Y here to enable the kernel to act as a watchdog to detect + hard lockups. + Hardlockups are bugs that cause the CPU to loop in kernel mode for more than 10 seconds, without letting other interrupts have a chance to run. The current stack trace is displayed upon detection and the system will stay locked up. - The overhead should be minimal. A periodic hrtimer runs to - generate interrupts and kick the watchdog task every 4 seconds. - An NMI is generated every 10 seconds or so to check for hardlockups. - - The frequency of hrtimer and NMI events and the soft and hard lockup - thresholds can be controlled through the sysctl watchdog_thresh. - -config HARDLOCKUP_DETECTOR - def_bool y - depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG - depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI - config BOOTPARAM_HARDLOCKUP_PANIC bool "Panic (Reboot) On Hard Lockups" depends on HARDLOCKUP_DETECTOR @@ -826,7 +837,7 @@ config BOOTPARAM_HARDLOCKUP_PANIC_VALUE config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" - depends on LOCKUP_DETECTOR + depends on SOFTLOCKUP_DETECTOR help Say Y here to enable the kernel to panic on "soft lockups", which are bugs that cause the kernel to loop in kernel @@ -843,7 +854,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE int - depends on LOCKUP_DETECTOR + depends on SOFTLOCKUP_DETECTOR range 0 1 default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC default 1 if BOOTPARAM_SOFTLOCKUP_PANIC @@ -851,7 +862,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE config DETECT_HUNG_TASK bool "Detect Hung Tasks" depends on DEBUG_KERNEL - default LOCKUP_DETECTOR + default SOFTLOCKUP_DETECTOR help Say Y here to enable the kernel to detect "hung tasks", which are bugs that cause the task to be stuck in -- cgit v1.2.3 From 6974f0c4555e285ab217cee58b6e874f776ff409 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Wed, 12 Jul 2017 14:36:10 -0700 Subject: include/linux/string.h: add the option of fortified string.h functions This adds support for compiling with a rough equivalent to the glibc _FORTIFY_SOURCE=1 feature, providing compile-time and runtime buffer overflow checks for string.h functions when the compiler determines the size of the source or destination buffer at compile-time. Unlike glibc, it covers buffer reads in addition to writes. GNU C __builtin_*_chk intrinsics are avoided because they would force a much more complex implementation. They aren't designed to detect read overflows and offer no real benefit when using an implementation based on inline checks. Inline checks don't add up to much code size and allow full use of the regular string intrinsics while avoiding the need for a bunch of _chk functions and per-arch assembly to avoid wrapper overhead. This detects various overflows at compile-time in various drivers and some non-x86 core kernel code. There will likely be issues caught in regular use at runtime too. Future improvements left out of initial implementation for simplicity, as it's all quite optional and can be done incrementally: * Some of the fortified string functions (strncpy, strcat), don't yet place a limit on reads from the source based on __builtin_object_size of the source buffer. * Extending coverage to more string functions like strlcat. * It should be possible to optionally use __builtin_object_size(x, 1) for some functions (C strings) to detect intra-object overflows (like glibc's _FORTIFY_SOURCE=2), but for now this takes the conservative approach to avoid likely compatibility issues. * The compile-time checks should be made available via a separate config option which can be enabled by default (or always enabled) once enough time has passed to get the issues it catches fixed. Kees said: "This is great to have. While it was out-of-tree code, it would have blocked at least CVE-2016-3858 from being exploitable (improper size argument to strlcpy()). I've sent a number of fixes for out-of-bounds-reads that this detected upstream already" [arnd@arndb.de: x86: fix fortified memcpy] Link: http://lkml.kernel.org/r/20170627150047.660360-1-arnd@arndb.de [keescook@chromium.org: avoid panic() in favor of BUG()] Link: http://lkml.kernel.org/r/20170626235122.GA25261@beast [keescook@chromium.org: move from -mm, add ARCH_HAS_FORTIFY_SOURCE, tweak Kconfig help] Link: http://lkml.kernel.org/r/20170526095404.20439-1-danielmicay@gmail.com Link: http://lkml.kernel.org/r/1497903987-21002-8-git-send-email-keescook@chromium.org Signed-off-by: Daniel Micay Signed-off-by: Kees Cook Signed-off-by: Arnd Bergmann Acked-by: Kees Cook Cc: Mark Rutland Cc: Daniel Axtens Cc: Rasmus Villemoes Cc: Andy Shevchenko Cc: Chris Metcalf Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 6 ++ arch/arm64/Kconfig | 1 + arch/arm64/include/asm/string.h | 5 + arch/powerpc/Kconfig | 1 + arch/x86/Kconfig | 1 + arch/x86/boot/compressed/misc.c | 5 + arch/x86/include/asm/string_32.h | 9 ++ arch/x86/include/asm/string_64.h | 7 ++ arch/x86/lib/memcpy_32.c | 2 +- include/linux/string.h | 200 +++++++++++++++++++++++++++++++++++++++ lib/string.c | 7 ++ security/Kconfig | 7 ++ 12 files changed, 250 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/arch/Kconfig b/arch/Kconfig index fb9bd7d36b05..21d0089117fe 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -223,6 +223,12 @@ config GENERIC_SMP_IDLE_THREAD config GENERIC_IDLE_POLL_SETUP bool +config ARCH_HAS_FORTIFY_SOURCE + bool + help + An architecture should select this when it can successfully + build and run with CONFIG_FORTIFY_SOURCE. + # Select if arch has all set_memory_ro/rw/x/nx() functions in asm/cacheflush.h config ARCH_HAS_SET_MEMORY bool diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 8addb851ab5e..dfd908630631 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -12,6 +12,7 @@ config ARM64 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_ELF_RANDOMIZE + select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index 2eb714c4639f..d0aa42907569 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -63,6 +63,11 @@ extern int memcmp(const void *, const void *, size_t); #define memcpy(dst, src, len) __memcpy(dst, src, len) #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) + +#ifndef __NO_FORTIFY +#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ +#endif + #endif #endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index fce2f4f20891..36f858c37ca7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -125,6 +125,7 @@ config PPC select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_DMA_SET_COHERENT_MASK select ARCH_HAS_ELF_RANDOMIZE + select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select ARCH_HAS_SG_CHAIN diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3d2b8ce54e00..781521b7cf9e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -50,6 +50,7 @@ config X86 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MMIO_FLUSH diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 00241c815524..a0838ab929f2 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -411,3 +411,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, debug_putstr("done.\nBooting the kernel.\n"); return output; } + +void fortify_panic(const char *name) +{ + error("detected buffer overflow"); +} diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 3d3e8353ee5c..e9ee84873de5 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -142,7 +142,9 @@ static __always_inline void *__constant_memcpy(void *to, const void *from, } #define __HAVE_ARCH_MEMCPY +extern void *memcpy(void *, const void *, size_t); +#ifndef CONFIG_FORTIFY_SOURCE #ifdef CONFIG_X86_USE_3DNOW #include @@ -195,11 +197,15 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) #endif #endif +#endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMMOVE void *memmove(void *dest, const void *src, size_t n); +extern int memcmp(const void *, const void *, size_t); +#ifndef CONFIG_FORTIFY_SOURCE #define memcmp __builtin_memcmp +#endif #define __HAVE_ARCH_MEMCHR extern void *memchr(const void *cs, int c, size_t count); @@ -321,6 +327,8 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, : __memset_generic((s), (c), (count))) #define __HAVE_ARCH_MEMSET +extern void *memset(void *, int, size_t); +#ifndef CONFIG_FORTIFY_SOURCE #if (__GNUC__ >= 4) #define memset(s, c, count) __builtin_memset(s, c, count) #else @@ -330,6 +338,7 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, (count)) \ : __memset((s), (c), (count))) #endif +#endif /* !CONFIG_FORTIFY_SOURCE */ /* * find the first occurrence of byte 'c', or 1 past the area if none diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 1f22bc277c45..2a8c822de1fc 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -31,6 +31,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); +#ifndef CONFIG_FORTIFY_SOURCE #ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 #define memcpy(dst, src, len) \ @@ -51,6 +52,7 @@ extern void *__memcpy(void *to, const void *from, size_t len); */ #define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len)) #endif +#endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); @@ -77,6 +79,11 @@ int strcmp(const char *cs, const char *ct); #define memcpy(dst, src, len) __memcpy(dst, src, len) #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) + +#ifndef __NO_FORTIFY +#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ +#endif + #endif #define __HAVE_ARCH_MEMCPY_MCSAFE 1 diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index cad12634d6bd..2eab7d0bfedd 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -6,7 +6,7 @@ __visible void *memcpy(void *to, const void *from, size_t n) { -#ifdef CONFIG_X86_USE_3DNOW +#if defined(CONFIG_X86_USE_3DNOW) && !defined(CONFIG_FORTIFY_SOURCE) return __memcpy3d(to, from, n); #else return __memcpy(to, from, n); diff --git a/include/linux/string.h b/include/linux/string.h index 7439d83eaa33..96f5a5fd0377 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -193,4 +193,204 @@ static inline const char *kbasename(const char *path) return tail ? tail + 1 : path; } +#define __FORTIFY_INLINE extern __always_inline __attribute__((gnu_inline)) +#define __RENAME(x) __asm__(#x) + +void fortify_panic(const char *name) __noreturn __cold; +void __read_overflow(void) __compiletime_error("detected read beyond size of object passed as 1st parameter"); +void __read_overflow2(void) __compiletime_error("detected read beyond size of object passed as 2nd parameter"); +void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); + +#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) +__FORTIFY_INLINE char *strcpy(char *p, const char *q) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strcpy(p, q); + if (strscpy(p, q, p_size < q_size ? p_size : q_size) < 0) + fortify_panic(__func__); + return p; +} + +__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __builtin_strncpy(p, q, size); +} + +__FORTIFY_INLINE char *strcat(char *p, const char *q) +{ + size_t p_size = __builtin_object_size(p, 0); + if (p_size == (size_t)-1) + return __builtin_strcat(p, q); + if (strlcat(p, q, p_size) >= p_size) + fortify_panic(__func__); + return p; +} + +__FORTIFY_INLINE __kernel_size_t strlen(const char *p) +{ + __kernel_size_t ret; + size_t p_size = __builtin_object_size(p, 0); + if (p_size == (size_t)-1) + return __builtin_strlen(p); + ret = strnlen(p, p_size); + if (p_size <= ret) + fortify_panic(__func__); + return ret; +} + +extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); +__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) +{ + size_t p_size = __builtin_object_size(p, 0); + __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); + if (p_size <= ret && maxlen != ret) + fortify_panic(__func__); + return ret; +} + +/* defined after fortified strlen to reuse it */ +extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); +__FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) +{ + size_t ret; + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __real_strlcpy(p, q, size); + ret = strlen(q); + if (size) { + size_t len = (ret >= size) ? size - 1 : ret; + if (__builtin_constant_p(len) && len >= p_size) + __write_overflow(); + if (len >= p_size) + fortify_panic(__func__); + __builtin_memcpy(p, q, len); + p[len] = '\0'; + } + return ret; +} + +/* defined after fortified strlen and strnlen to reuse them */ +__FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) +{ + size_t p_len, copy_len; + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strncat(p, q, count); + p_len = strlen(p); + copy_len = strnlen(q, count); + if (p_size < p_len + copy_len + 1) + fortify_panic(__func__); + __builtin_memcpy(p + p_len, q, copy_len); + p[p_len + copy_len] = '\0'; + return p; +} + +__FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __builtin_memset(p, c, size); +} + +__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + if (__builtin_constant_p(size)) { + if (p_size < size) + __write_overflow(); + if (q_size < size) + __read_overflow2(); + } + if (p_size < size || q_size < size) + fortify_panic(__func__); + return __builtin_memcpy(p, q, size); +} + +__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + if (__builtin_constant_p(size)) { + if (p_size < size) + __write_overflow(); + if (q_size < size) + __read_overflow2(); + } + if (p_size < size || q_size < size) + fortify_panic(__func__); + return __builtin_memmove(p, q, size); +} + +extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); +__FORTIFY_INLINE void *memscan(void *p, int c, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __real_memscan(p, c, size); +} + +__FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + if (__builtin_constant_p(size)) { + if (p_size < size) + __read_overflow(); + if (q_size < size) + __read_overflow2(); + } + if (p_size < size || q_size < size) + fortify_panic(__func__); + return __builtin_memcmp(p, q, size); +} + +__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __builtin_memchr(p, c, size); +} + +void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); +__FORTIFY_INLINE void *memchr_inv(const void *p, int c, size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __real_memchr_inv(p, c, size); +} + +extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup); +__FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) +{ + size_t p_size = __builtin_object_size(p, 0); + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __real_kmemdup(p, size, gfp); +} +#endif + #endif /* _LINUX_STRING_H_ */ diff --git a/lib/string.c b/lib/string.c index 1c1fc9187b05..ebbb99c775bd 100644 --- a/lib/string.c +++ b/lib/string.c @@ -978,3 +978,10 @@ char *strreplace(char *s, char old, char new) return s; } EXPORT_SYMBOL(strreplace); + +void fortify_panic(const char *name) +{ + pr_emerg("detected buffer overflow in %s\n", name); + BUG(); +} +EXPORT_SYMBOL(fortify_panic); diff --git a/security/Kconfig b/security/Kconfig index d540bfe73190..e8e449444e65 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -163,6 +163,13 @@ config HARDENED_USERCOPY_PAGESPAN been removed. This config is intended to be used only while trying to find such users. +config FORTIFY_SOURCE + bool "Harden common str/mem functions against buffer overflows" + depends on ARCH_HAS_FORTIFY_SOURCE + help + Detect overflows of buffers in common string and memory functions + where the compiler can determine and validate the buffer sizes. + config STATIC_USERMODEHELPER bool "Force all usermode helper calls through a single binary" help -- cgit v1.2.3 From 3e8f399da490e6ac20a3cfd6aa404c9aa961a9a2 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 12 Jul 2017 14:37:51 -0700 Subject: writeback: rework wb_[dec|inc]_stat family of functions Currently the writeback statistics code uses a percpu counters to hold various statistics. Furthermore we have 2 families of functions - those which disable local irq and those which doesn't and whose names begin with double underscore. However, they both end up calling __add_wb_stats which in turn calls percpu_counter_add_batch which is already irq-safe. Exploiting this fact allows to eliminated the __wb_* functions since they don't add any further protection than we already have. Furthermore, refactor the wb_* function to call __add_wb_stat directly without the irq-disabling dance. This will likely result in better runtime of code which deals with modifying the stat counters. While at it also document why percpu_counter_add_batch is in fact preempt and irq-safe since at least 3 people got confused. Link: http://lkml.kernel.org/r/1498029937-27293-1-git-send-email-nborisov@suse.com Signed-off-by: Nikolay Borisov Acked-by: Tejun Heo Reviewed-by: Jan Kara Cc: Josef Bacik Cc: Mel Gorman Cc: Jeff Layton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 8 ++++---- include/linux/backing-dev.h | 24 ++---------------------- lib/percpu_counter.c | 7 +++++++ mm/page-writeback.c | 10 +++++----- 4 files changed, 18 insertions(+), 31 deletions(-) (limited to 'lib') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8b426f83909f..245c430a2e41 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -380,8 +380,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) struct page *page = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); if (likely(page) && PageDirty(page)) { - __dec_wb_stat(old_wb, WB_RECLAIMABLE); - __inc_wb_stat(new_wb, WB_RECLAIMABLE); + dec_wb_stat(old_wb, WB_RECLAIMABLE); + inc_wb_stat(new_wb, WB_RECLAIMABLE); } } @@ -391,8 +391,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) &mapping->tree_lock); if (likely(page)) { WARN_ON_ONCE(!PageWriteback(page)); - __dec_wb_stat(old_wb, WB_WRITEBACK); - __inc_wb_stat(new_wb, WB_WRITEBACK); + dec_wb_stat(old_wb, WB_WRITEBACK); + inc_wb_stat(new_wb, WB_WRITEBACK); } } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 334165c911f0..854e1bdd0b2a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -69,34 +69,14 @@ static inline void __add_wb_stat(struct bdi_writeback *wb, percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); } -static inline void __inc_wb_stat(struct bdi_writeback *wb, - enum wb_stat_item item) -{ - __add_wb_stat(wb, item, 1); -} - static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - unsigned long flags; - - local_irq_save(flags); - __inc_wb_stat(wb, item); - local_irq_restore(flags); -} - -static inline void __dec_wb_stat(struct bdi_writeback *wb, - enum wb_stat_item item) -{ - __add_wb_stat(wb, item, -1); + __add_wb_stat(wb, item, 1); } static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - unsigned long flags; - - local_irq_save(flags); - __dec_wb_stat(wb, item); - local_irq_restore(flags); + __add_wb_stat(wb, item, -1); } static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 8ee7e5ec21be..3bf4a9984f4c 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -72,6 +72,13 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount) } EXPORT_SYMBOL(percpu_counter_set); +/** + * This function is both preempt and irq safe. The former is due to explicit + * preemption disable. The latter is guaranteed by the fact that the slow path + * is explicitly protected by an irq-safe spinlock whereas the fast patch uses + * this_cpu_add which is irq-safe by definition. Hence there is no need muck + * with irq state before calling this one + */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b60cc7ddac2..96e93b214d31 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -601,7 +601,7 @@ static inline void __wb_writeout_inc(struct bdi_writeback *wb) { struct wb_domain *cgdom; - __inc_wb_stat(wb, WB_WRITTEN); + inc_wb_stat(wb, WB_WRITTEN); wb_domain_writeout_inc(&global_wb_domain, &wb->completions, wb->bdi->max_prop_frac); @@ -2435,8 +2435,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) __inc_lruvec_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); - __inc_wb_stat(wb, WB_RECLAIMABLE); - __inc_wb_stat(wb, WB_DIRTIED); + inc_wb_stat(wb, WB_RECLAIMABLE); + inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); @@ -2741,7 +2741,7 @@ int test_clear_page_writeback(struct page *page) if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); - __dec_wb_stat(wb, WB_WRITEBACK); + dec_wb_stat(wb, WB_WRITEBACK); __wb_writeout_inc(wb); } } @@ -2786,7 +2786,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) - __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); + inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); /* * We can come through here when swapping anonymous -- cgit v1.2.3 From ffba19ccae8d98beb0a17345a0b1ee9e415b23b8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 14 Jul 2017 14:49:41 -0700 Subject: lib/atomic64_test.c: add a test that atomic64_inc_not_zero() returns an int atomic64_inc_not_zero() returns a "truth value" which in C is traditionally an int. That means callers are likely to expect the result will fit in an int. If an implementation returns a "true" value which does not fit in an int, then there's a possibility that callers will truncate it when they store it in an int. In fact this happened in practice, see commit 966d2b04e070 ("percpu-refcount: fix reference leak during percpu-atomic transition"). So add a test that the result fits in an int, even when the input doesn't. This catches the case where an implementation just passes the non-zero input value out as the result. Link: http://lkml.kernel.org/r/1499775133-1231-1-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Cc: Douglas Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/atomic64_test.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'lib') diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index fd70c0e0e673..62ab629f51ca 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -153,8 +153,10 @@ static __init void test_atomic64(void) long long v0 = 0xaaa31337c001d00dLL; long long v1 = 0xdeadbeefdeafcafeLL; long long v2 = 0xfaceabadf00df001LL; + long long v3 = 0x8000000000000000LL; long long onestwos = 0x1111111122222222LL; long long one = 1LL; + int r_int; atomic64_t v = ATOMIC64_INIT(v0); long long r = v0; @@ -240,6 +242,11 @@ static __init void test_atomic64(void) BUG_ON(!atomic64_inc_not_zero(&v)); r += one; BUG_ON(v.counter != r); + + /* Confirm the return value fits in an int, even if the value doesn't */ + INIT(v3); + r_int = atomic64_inc_not_zero(&v); + BUG_ON(!r_int); } static __init int test_atomics_init(void) -- cgit v1.2.3 From 1203c8e6fb0aa1e9c39d2323607a74c3adc34fd8 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 14 Jul 2017 14:49:57 -0700 Subject: fault-inject: simplify access check for fail-nth The fail-nth file is created with 0666 and the access is permitted if and only if the task is current. This file is owned by the currnet user. So we can create it with 0644 and allow the owner to write it. This enables to watch the status of task->fail_nth from another processes. [akinobu.mita@gmail.com: don't convert unsigned type value as signed int] Link: http://lkml.kernel.org/r/1492444483-9239-1-git-send-email-akinobu.mita@gmail.com [akinobu.mita@gmail.com: avoid unwanted data race to task->fail_nth] Link: http://lkml.kernel.org/r/1499962492-8931-1-git-send-email-akinobu.mita@gmail.com Link: http://lkml.kernel.org/r/1491490561-10485-5-git-send-email-akinobu.mita@gmail.com Signed-off-by: Akinobu Mita Acked-by: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 25 ++++++++++--------------- lib/fault-inject.c | 7 +++++-- 2 files changed, 15 insertions(+), 17 deletions(-) (limited to 'lib') diff --git a/fs/proc/base.c b/fs/proc/base.c index 7d795d28dd02..872a3f28bfe4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1363,16 +1363,16 @@ static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, int err; unsigned int n; + err = kstrtouint_from_user(buf, count, 0, &n); + if (err) + return err; + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; + WRITE_ONCE(task->fail_nth, n); put_task_struct(task); - if (task != current) - return -EPERM; - err = kstrtouint_from_user(buf, count, 0, &n); - if (err) - return err; - current->fail_nth = n; + return count; } @@ -1386,11 +1386,10 @@ static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; - put_task_struct(task); - if (task != current) - return -EPERM; - len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); + len = snprintf(numbuf, sizeof(numbuf), "%u\n", + READ_ONCE(task->fail_nth)); len = simple_read_from_buffer(buf, count, ppos, numbuf, len); + put_task_struct(task); return len; } @@ -3355,11 +3354,7 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), - /* - * Operations on the file check that the task is current, - * so we create it with 0666 to support testing under unprivileged user. - */ - REG("fail-nth", 0666, proc_fail_nth_operations), + REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 09ac73c177fd..7d315fdb9f13 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -107,9 +107,12 @@ static inline bool fail_stacktrace(struct fault_attr *attr) bool should_fail(struct fault_attr *attr, ssize_t size) { - if (in_task() && current->fail_nth) { - if (--current->fail_nth == 0) + if (in_task()) { + unsigned int fail_nth = READ_ONCE(current->fail_nth); + + if (fail_nth && !WRITE_ONCE(current->fail_nth, fail_nth - 1)) goto fail; + return false; } -- cgit v1.2.3 From d9c6a72d6fa29d3a7999dda726577e5d1fccafa5 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 14 Jul 2017 14:50:08 -0700 Subject: kmod: add test driver to stress test the module loader This adds a new stress test driver for kmod: the kernel module loader. The new stress test driver, test_kmod, is only enabled as a module right now. It should be possible to load this as built-in and load tests early (refer to the force_init_test module parameter), however since a lot of test can get a system out of memory fast we leave this disabled for now. Using a system with 1024 MiB of RAM can *easily* get your kernel OOM fast with this test driver. The test_kmod driver exposes API knobs for us to fine tune simple request_module() and get_fs_type() calls. Since these API calls only allow each one parameter a test driver for these is rather simple. Other factors that can help out test driver though are the number of calls we issue and knowing current limitations of each. This exposes configuration as much as possible through userspace to be able to build tests directly from userspace. Since it allows multiple misc devices its will eventually (once we add a knob to let us create new devices at will) also be possible to perform more tests in parallel, provided you have enough memory. We only enable tests we know work as of right now. Demo screenshots: # tools/testing/selftests/kmod/kmod.sh kmod_test_0001_driver: OK! - loading kmod test kmod_test_0001_driver: OK! - Return value: 256 (MODULE_NOT_FOUND), expected MODULE_NOT_FOUND kmod_test_0001_fs: OK! - loading kmod test kmod_test_0001_fs: OK! - Return value: -22 (-EINVAL), expected -EINVAL kmod_test_0002_driver: OK! - loading kmod test kmod_test_0002_driver: OK! - Return value: 256 (MODULE_NOT_FOUND), expected MODULE_NOT_FOUND kmod_test_0002_fs: OK! - loading kmod test kmod_test_0002_fs: OK! - Return value: -22 (-EINVAL), expected -EINVAL kmod_test_0003: OK! - loading kmod test kmod_test_0003: OK! - Return value: 0 (SUCCESS), expected SUCCESS kmod_test_0004: OK! - loading kmod test kmod_test_0004: OK! - Return value: 0 (SUCCESS), expected SUCCESS kmod_test_0005: OK! - loading kmod test kmod_test_0005: OK! - Return value: 0 (SUCCESS), expected SUCCESS kmod_test_0006: OK! - loading kmod test kmod_test_0006: OK! - Return value: 0 (SUCCESS), expected SUCCESS kmod_test_0005: OK! - loading kmod test kmod_test_0005: OK! - Return value: 0 (SUCCESS), expected SUCCESS kmod_test_0006: OK! - loading kmod test kmod_test_0006: OK! - Return value: 0 (SUCCESS), expected SUCCESS XXX: add test restult for 0007 Test completed You can also request for specific tests: # tools/testing/selftests/kmod/kmod.sh -t 0001 kmod_test_0001_driver: OK! - loading kmod test kmod_test_0001_driver: OK! - Return value: 256 (MODULE_NOT_FOUND), expected MODULE_NOT_FOUND kmod_test_0001_fs: OK! - loading kmod test kmod_test_0001_fs: OK! - Return value: -22 (-EINVAL), expected -EINVAL Test completed Lastly, the current available number of tests: # tools/testing/selftests/kmod/kmod.sh --help Usage: tools/testing/selftests/kmod/kmod.sh [ -t <4-number-digit> ] Valid tests: 0001-0009 0001 - Simple test - 1 thread for empty string 0002 - Simple test - 1 thread for modules/filesystems that do not exist 0003 - Simple test - 1 thread for get_fs_type() only 0004 - Simple test - 2 threads for get_fs_type() only 0005 - multithreaded tests with default setup - request_module() only 0006 - multithreaded tests with default setup - get_fs_type() only 0007 - multithreaded tests with default setup test request_module() and get_fs_type() 0008 - multithreaded - push kmod_concurrent over max_modprobes for request_module() 0009 - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type() The following test cases currently fail, as such they are not currently enabled by default: # tools/testing/selftests/kmod/kmod.sh -t 0008 # tools/testing/selftests/kmod/kmod.sh -t 0009 To be sure to run them as intended please unload both of the modules: o test_module o xfs And ensure they are not loaded on your system prior to testing them. If you use these paritions for your rootfs you can change the default test driver used for get_fs_type() by exporting it into your environment. For example of other test defaults you can override refer to kmod.sh allow_user_defaults(). Behind the scenes this is how we fine tune at a test case prior to hitting a trigger to run it: cat /sys/devices/virtual/misc/test_kmod0/config echo -n "2" > /sys/devices/virtual/misc/test_kmod0/config_test_case echo -n "ext4" > /sys/devices/virtual/misc/test_kmod0/config_test_fs echo -n "80" > /sys/devices/virtual/misc/test_kmod0/config_num_threads cat /sys/devices/virtual/misc/test_kmod0/config echo -n "1" > /sys/devices/virtual/misc/test_kmod0/config_num_threads Finally to trigger: echo -n "1" > /sys/devices/virtual/misc/test_kmod0/trigger_config The kmod.sh script uses the above constructs to build different test cases. A bit of interpretation of the current failures follows, first two premises: a) When request_module() is used userspace figures out an optimized version of module order for us. Once it finds the modules it needs, as per depmod symbol dep map, it will finit_module() the respective modules which are needed for the original request_module() request. b) We have an optimization in place whereby if a kernel uses request_module() on a module already loaded we never bother userspace as the module already is loaded. This is all handled by kernel/kmod.c. A few things to consider to help identify root causes of issues: 0) kmod 19 has a broken heuristic for modules being assumed to be built-in to your kernel and will return 0 even though request_module() failed. Upgrade to a newer version of kmod. 1) A get_fs_type() call for "xfs" will request_module() for "fs-xfs", not for "xfs". The optimization in kernel described in b) fails to catch if we have a lot of consecutive get_fs_type() calls. The reason is the optimization in place does not look for aliases. This means two consecutive get_fs_type() calls will bump kmod_concurrent, whereas request_module() will not. This one explanation why test case 0009 fails at least once for get_fs_type(). 2) If a module fails to load --- for whatever reason (kmod_concurrent limit reached, file not yet present due to rootfs switch, out of memory) we have a period of time during which module request for the same name either with request_module() or get_fs_type() will *also* fail to load even if the file for the module is ready. This explains why *multiple* NULLs are possible on test 0009. 3) finit_module() consumes quite a bit of memory. 4) Filesystems typically also have more dependent modules than other modules, its important to note though that even though a get_fs_type() call does not incur additional kmod_concurrent bumps, since userspace loads dependencies it finds it needs via finit_module_fd(), it *will* take much more memory to load a module with a lot of dependencies. Because of 3) and 4) we will easily run into out of memory failures with certain tests. For instance test 0006 fails on qemu with 1024 MiB of RAM. It panics a box after reaping all userspace processes and still not having enough memory to reap. [arnd@arndb.de: add dependencies for test module] Link: http://lkml.kernel.org/r/20170630154834.3689272-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20170628223155.26472-3-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Jessica Yu Cc: Shuah Khan Cc: Rusty Russell Cc: Michal Marek Cc: Petr Mladek Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 + lib/Kconfig.debug | 27 + lib/Makefile | 1 + lib/test_kmod.c | 1246 +++++++++++++++++++++++++++++++++ tools/testing/selftests/kmod/Makefile | 11 + tools/testing/selftests/kmod/config | 7 + tools/testing/selftests/kmod/kmod.sh | 635 +++++++++++++++++ 7 files changed, 1929 insertions(+) create mode 100644 lib/test_kmod.c create mode 100644 tools/testing/selftests/kmod/Makefile create mode 100644 tools/testing/selftests/kmod/config create mode 100644 tools/testing/selftests/kmod/kmod.sh (limited to 'lib') diff --git a/MAINTAINERS b/MAINTAINERS index c917d4161427..0876084a97da 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7560,6 +7560,8 @@ L: linux-kernel@vger.kernel.org S: Maintained F: kernel/kmod.c F: include/linux/kmod.h +F: lib/test_kmod.c +F: tools/testing/selftests/kmod/ KPROBES M: Ananth N Mavinakayanahalli diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b0d01c6d4e03..789c6e9e5e01 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1847,6 +1847,33 @@ config BUG_ON_DATA_CORRUPTION If unsure, say N. +config TEST_KMOD + tristate "kmod stress tester" + default n + depends on m + depends on BLOCK && (64BIT || LBDAF) # for XFS, BTRFS + depends on NETDEVICES && NET_CORE && INET # for TUN + select TEST_LKM + select XFS_FS + select TUN + select BTRFS_FS + help + Test the kernel's module loading mechanism: kmod. kmod implements + support to load modules using the Linux kernel's usermode helper. + This test provides a series of tests against kmod. + + Although technically you can either build test_kmod as a module or + into the kernel we disallow building it into the kernel since + it stress tests request_module() and this will very likely cause + some issues by taking over precious threads available from other + module load requests, ultimately this could be fatal. + + To run tests run: + + tools/testing/selftests/kmod/kmod.sh --help + + If unsure, say N. + source "samples/Kconfig" source "lib/Kconfig.kgdb" diff --git a/lib/Makefile b/lib/Makefile index 85e91e51a9fe..40c18372b301 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -61,6 +61,7 @@ obj-$(CONFIG_TEST_PRINTF) += test_printf.o obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o obj-$(CONFIG_TEST_UUID) += test_uuid.o obj-$(CONFIG_TEST_PARMAN) += test_parman.o +obj-$(CONFIG_TEST_KMOD) += test_kmod.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG diff --git a/lib/test_kmod.c b/lib/test_kmod.c new file mode 100644 index 000000000000..6c1d678bcf8b --- /dev/null +++ b/lib/test_kmod.c @@ -0,0 +1,1246 @@ +/* + * kmod stress test driver + * + * Copyright (C) 2017 Luis R. Rodriguez + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or at your option any + * later version; or, when distributed separately from the Linux kernel or + * when incorporated into other software packages, subject to the following + * license: + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of copyleft-next (version 0.3.1 or later) as published + * at http://copyleft-next.org/. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +/* + * This driver provides an interface to trigger and test the kernel's + * module loader through a series of configurations and a few triggers. + * To test this driver use the following script as root: + * + * tools/testing/selftests/kmod/kmod.sh --help + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TEST_START_NUM_THREADS 50 +#define TEST_START_DRIVER "test_module" +#define TEST_START_TEST_FS "xfs" +#define TEST_START_TEST_CASE TEST_KMOD_DRIVER + + +static bool force_init_test = false; +module_param(force_init_test, bool_enable_only, 0644); +MODULE_PARM_DESC(force_init_test, + "Force kicking a test immediately after driver loads"); + +/* + * For device allocation / registration + */ +static DEFINE_MUTEX(reg_dev_mutex); +static LIST_HEAD(reg_test_devs); + +/* + * num_test_devs actually represents the *next* ID of the next + * device we will allow to create. + */ +static int num_test_devs; + +/** + * enum kmod_test_case - linker table test case + * + * If you add a test case, please be sure to review if you need to se + * @need_mod_put for your tests case. + * + * @TEST_KMOD_DRIVER: stress tests request_module() + * @TEST_KMOD_FS_TYPE: stress tests get_fs_type() + */ +enum kmod_test_case { + __TEST_KMOD_INVALID = 0, + + TEST_KMOD_DRIVER, + TEST_KMOD_FS_TYPE, + + __TEST_KMOD_MAX, +}; + +struct test_config { + char *test_driver; + char *test_fs; + unsigned int num_threads; + enum kmod_test_case test_case; + int test_result; +}; + +struct kmod_test_device; + +/** + * kmod_test_device_info - thread info + * + * @ret_sync: return value if request_module() is used, sync request for + * @TEST_KMOD_DRIVER + * @fs_sync: return value of get_fs_type() for @TEST_KMOD_FS_TYPE + * @thread_idx: thread ID + * @test_dev: test device test is being performed under + * @need_mod_put: Some tests (get_fs_type() is one) requires putting the module + * (module_put(fs_sync->owner)) when done, otherwise you will not be able + * to unload the respective modules and re-test. We use this to keep + * accounting of when we need this and to help out in case we need to + * error out and deal with module_put() on error. + */ +struct kmod_test_device_info { + int ret_sync; + struct file_system_type *fs_sync; + struct task_struct *task_sync; + unsigned int thread_idx; + struct kmod_test_device *test_dev; + bool need_mod_put; +}; + +/** + * kmod_test_device - test device to help test kmod + * + * @dev_idx: unique ID for test device + * @config: configuration for the test + * @misc_dev: we use a misc device under the hood + * @dev: pointer to misc_dev's own struct device + * @config_mutex: protects configuration of test + * @trigger_mutex: the test trigger can only be fired once at a time + * @thread_lock: protects @done count, and the @info per each thread + * @done: number of threads which have completed or failed + * @test_is_oom: when we run out of memory, use this to halt moving forward + * @kthreads_done: completion used to signal when all work is done + * @list: needed to be part of the reg_test_devs + * @info: array of info for each thread + */ +struct kmod_test_device { + int dev_idx; + struct test_config config; + struct miscdevice misc_dev; + struct device *dev; + struct mutex config_mutex; + struct mutex trigger_mutex; + struct mutex thread_mutex; + + unsigned int done; + + bool test_is_oom; + struct completion kthreads_done; + struct list_head list; + + struct kmod_test_device_info *info; +}; + +static const char *test_case_str(enum kmod_test_case test_case) +{ + switch (test_case) { + case TEST_KMOD_DRIVER: + return "TEST_KMOD_DRIVER"; + case TEST_KMOD_FS_TYPE: + return "TEST_KMOD_FS_TYPE"; + default: + return "invalid"; + } +} + +static struct miscdevice *dev_to_misc_dev(struct device *dev) +{ + return dev_get_drvdata(dev); +} + +static struct kmod_test_device *misc_dev_to_test_dev(struct miscdevice *misc_dev) +{ + return container_of(misc_dev, struct kmod_test_device, misc_dev); +} + +static struct kmod_test_device *dev_to_test_dev(struct device *dev) +{ + struct miscdevice *misc_dev; + + misc_dev = dev_to_misc_dev(dev); + + return misc_dev_to_test_dev(misc_dev); +} + +/* Must run with thread_mutex held */ +static void kmod_test_done_check(struct kmod_test_device *test_dev, + unsigned int idx) +{ + struct test_config *config = &test_dev->config; + + test_dev->done++; + dev_dbg(test_dev->dev, "Done thread count: %u\n", test_dev->done); + + if (test_dev->done == config->num_threads) { + dev_info(test_dev->dev, "Done: %u threads have all run now\n", + test_dev->done); + dev_info(test_dev->dev, "Last thread to run: %u\n", idx); + complete(&test_dev->kthreads_done); + } +} + +static void test_kmod_put_module(struct kmod_test_device_info *info) +{ + struct kmod_test_device *test_dev = info->test_dev; + struct test_config *config = &test_dev->config; + + if (!info->need_mod_put) + return; + + switch (config->test_case) { + case TEST_KMOD_DRIVER: + break; + case TEST_KMOD_FS_TYPE: + if (info && info->fs_sync && info->fs_sync->owner) + module_put(info->fs_sync->owner); + break; + default: + BUG(); + } + + info->need_mod_put = true; +} + +static int run_request(void *data) +{ + struct kmod_test_device_info *info = data; + struct kmod_test_device *test_dev = info->test_dev; + struct test_config *config = &test_dev->config; + + switch (config->test_case) { + case TEST_KMOD_DRIVER: + info->ret_sync = request_module("%s", config->test_driver); + break; + case TEST_KMOD_FS_TYPE: + info->fs_sync = get_fs_type(config->test_fs); + info->need_mod_put = true; + break; + default: + /* __trigger_config_run() already checked for test sanity */ + BUG(); + return -EINVAL; + } + + dev_dbg(test_dev->dev, "Ran thread %u\n", info->thread_idx); + + test_kmod_put_module(info); + + mutex_lock(&test_dev->thread_mutex); + info->task_sync = NULL; + kmod_test_done_check(test_dev, info->thread_idx); + mutex_unlock(&test_dev->thread_mutex); + + return 0; +} + +static int tally_work_test(struct kmod_test_device_info *info) +{ + struct kmod_test_device *test_dev = info->test_dev; + struct test_config *config = &test_dev->config; + int err_ret = 0; + + switch (config->test_case) { + case TEST_KMOD_DRIVER: + /* + * Only capture errors, if one is found that's + * enough, for now. + */ + if (info->ret_sync != 0) + err_ret = info->ret_sync; + dev_info(test_dev->dev, + "Sync thread %d return status: %d\n", + info->thread_idx, info->ret_sync); + break; + case TEST_KMOD_FS_TYPE: + /* For now we make this simple */ + if (!info->fs_sync) + err_ret = -EINVAL; + dev_info(test_dev->dev, "Sync thread %u fs: %s\n", + info->thread_idx, info->fs_sync ? config->test_fs : + "NULL"); + break; + default: + BUG(); + } + + return err_ret; +} + +/* + * XXX: add result option to display if all errors did not match. + * For now we just keep any error code if one was found. + * + * If this ran it means *all* tasks were created fine and we + * are now just collecting results. + * + * Only propagate errors, do not override with a subsequent sucess case. + */ +static void tally_up_work(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + struct kmod_test_device_info *info; + unsigned int idx; + int err_ret = 0; + int ret = 0; + + mutex_lock(&test_dev->thread_mutex); + + dev_info(test_dev->dev, "Results:\n"); + + for (idx=0; idx < config->num_threads; idx++) { + info = &test_dev->info[idx]; + ret = tally_work_test(info); + if (ret) + err_ret = ret; + } + + /* + * Note: request_module() returns 256 for a module not found even + * though modprobe itself returns 1. + */ + config->test_result = err_ret; + + mutex_unlock(&test_dev->thread_mutex); +} + +static int try_one_request(struct kmod_test_device *test_dev, unsigned int idx) +{ + struct kmod_test_device_info *info = &test_dev->info[idx]; + int fail_ret = -ENOMEM; + + mutex_lock(&test_dev->thread_mutex); + + info->thread_idx = idx; + info->test_dev = test_dev; + info->task_sync = kthread_run(run_request, info, "%s-%u", + KBUILD_MODNAME, idx); + + if (!info->task_sync || IS_ERR(info->task_sync)) { + test_dev->test_is_oom = true; + dev_err(test_dev->dev, "Setting up thread %u failed\n", idx); + info->task_sync = NULL; + goto err_out; + } else + dev_dbg(test_dev->dev, "Kicked off thread %u\n", idx); + + mutex_unlock(&test_dev->thread_mutex); + + return 0; + +err_out: + info->ret_sync = fail_ret; + mutex_unlock(&test_dev->thread_mutex); + + return fail_ret; +} + +static void test_dev_kmod_stop_tests(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + struct kmod_test_device_info *info; + unsigned int i; + + dev_info(test_dev->dev, "Ending request_module() tests\n"); + + mutex_lock(&test_dev->thread_mutex); + + for (i=0; i < config->num_threads; i++) { + info = &test_dev->info[i]; + if (info->task_sync && !IS_ERR(info->task_sync)) { + dev_info(test_dev->dev, + "Stopping still-running thread %i\n", i); + kthread_stop(info->task_sync); + } + + /* + * info->task_sync is well protected, it can only be + * NULL or a pointer to a struct. If its NULL we either + * never ran, or we did and we completed the work. Completed + * tasks *always* put the module for us. This is a sanity + * check -- just in case. + */ + if (info->task_sync && info->need_mod_put) + test_kmod_put_module(info); + } + + mutex_unlock(&test_dev->thread_mutex); +} + +/* + * Only wait *iff* we did not run into any errors during all of our thread + * set up. If run into any issues we stop threads and just bail out with + * an error to the trigger. This also means we don't need any tally work + * for any threads which fail. + */ +static int try_requests(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + unsigned int idx; + int ret; + bool any_error = false; + + for (idx=0; idx < config->num_threads; idx++) { + if (test_dev->test_is_oom) { + any_error = true; + break; + } + + ret = try_one_request(test_dev, idx); + if (ret) { + any_error = true; + break; + } + } + + if (!any_error) { + test_dev->test_is_oom = false; + dev_info(test_dev->dev, + "No errors were found while initializing threads\n"); + wait_for_completion(&test_dev->kthreads_done); + tally_up_work(test_dev); + } else { + test_dev->test_is_oom = true; + dev_info(test_dev->dev, + "At least one thread failed to start, stop all work\n"); + test_dev_kmod_stop_tests(test_dev); + return -ENOMEM; + } + + return 0; +} + +static int run_test_driver(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + + dev_info(test_dev->dev, "Test case: %s (%u)\n", + test_case_str(config->test_case), + config->test_case); + dev_info(test_dev->dev, "Test driver to load: %s\n", + config->test_driver); + dev_info(test_dev->dev, "Number of threads to run: %u\n", + config->num_threads); + dev_info(test_dev->dev, "Thread IDs will range from 0 - %u\n", + config->num_threads - 1); + + return try_requests(test_dev); +} + +static int run_test_fs_type(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + + dev_info(test_dev->dev, "Test case: %s (%u)\n", + test_case_str(config->test_case), + config->test_case); + dev_info(test_dev->dev, "Test filesystem to load: %s\n", + config->test_fs); + dev_info(test_dev->dev, "Number of threads to run: %u\n", + config->num_threads); + dev_info(test_dev->dev, "Thread IDs will range from 0 - %u\n", + config->num_threads - 1); + + return try_requests(test_dev); +} + +static ssize_t config_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + int len = 0; + + mutex_lock(&test_dev->config_mutex); + + len += snprintf(buf, PAGE_SIZE, + "Custom trigger configuration for: %s\n", + dev_name(dev)); + + len += snprintf(buf+len, PAGE_SIZE - len, + "Number of threads:\t%u\n", + config->num_threads); + + len += snprintf(buf+len, PAGE_SIZE - len, + "Test_case:\t%s (%u)\n", + test_case_str(config->test_case), + config->test_case); + + if (config->test_driver) + len += snprintf(buf+len, PAGE_SIZE - len, + "driver:\t%s\n", + config->test_driver); + else + len += snprintf(buf+len, PAGE_SIZE - len, + "driver:\tEMTPY\n"); + + if (config->test_fs) + len += snprintf(buf+len, PAGE_SIZE - len, + "fs:\t%s\n", + config->test_fs); + else + len += snprintf(buf+len, PAGE_SIZE - len, + "fs:\tEMTPY\n"); + + mutex_unlock(&test_dev->config_mutex); + + return len; +} +static DEVICE_ATTR_RO(config); + +/* + * This ensures we don't allow kicking threads through if our configuration + * is faulty. + */ +static int __trigger_config_run(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + + test_dev->done = 0; + + switch (config->test_case) { + case TEST_KMOD_DRIVER: + return run_test_driver(test_dev); + case TEST_KMOD_FS_TYPE: + return run_test_fs_type(test_dev); + default: + dev_warn(test_dev->dev, + "Invalid test case requested: %u\n", + config->test_case); + return -EINVAL; + } +} + +static int trigger_config_run(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + int ret; + + mutex_lock(&test_dev->trigger_mutex); + mutex_lock(&test_dev->config_mutex); + + ret = __trigger_config_run(test_dev); + if (ret < 0) + goto out; + dev_info(test_dev->dev, "General test result: %d\n", + config->test_result); + + /* + * We must return 0 after a trigger even unless something went + * wrong with the setup of the test. If the test setup went fine + * then userspace must just check the result of config->test_result. + * One issue with relying on the return from a call in the kernel + * is if the kernel returns a possitive value using this trigger + * will not return the value to userspace, it would be lost. + * + * By not relying on capturing the return value of tests we are using + * through the trigger it also us to run tests with set -e and only + * fail when something went wrong with the driver upon trigger + * requests. + */ + ret = 0; + +out: + mutex_unlock(&test_dev->config_mutex); + mutex_unlock(&test_dev->trigger_mutex); + + return ret; +} + +static ssize_t +trigger_config_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + int ret; + + if (test_dev->test_is_oom) + return -ENOMEM; + + /* For all intents and purposes we don't care what userspace + * sent this trigger, we care only that we were triggered. + * We treat the return value only for caputuring issues with + * the test setup. At this point all the test variables should + * have been allocated so typically this should never fail. + */ + ret = trigger_config_run(test_dev); + if (unlikely(ret < 0)) + goto out; + + /* + * Note: any return > 0 will be treated as success + * and the error value will not be available to userspace. + * Do not rely on trying to send to userspace a test value + * return value as possitive return errors will be lost. + */ + if (WARN_ON(ret > 0)) + return -EINVAL; + + ret = count; +out: + return ret; +} +static DEVICE_ATTR_WO(trigger_config); + +/* + * XXX: move to kstrncpy() once merged. + * + * Users should use kfree_const() when freeing these. + */ +static int __kstrncpy(char **dst, const char *name, size_t count, gfp_t gfp) +{ + *dst = kstrndup(name, count, gfp); + if (!*dst) + return -ENOSPC; + return count; +} + +static int config_copy_test_driver_name(struct test_config *config, + const char *name, + size_t count) +{ + return __kstrncpy(&config->test_driver, name, count, GFP_KERNEL); +} + + +static int config_copy_test_fs(struct test_config *config, const char *name, + size_t count) +{ + return __kstrncpy(&config->test_fs, name, count, GFP_KERNEL); +} + +static void __kmod_config_free(struct test_config *config) +{ + if (!config) + return; + + kfree_const(config->test_driver); + config->test_driver = NULL; + + kfree_const(config->test_fs); + config->test_driver = NULL; +} + +static void kmod_config_free(struct kmod_test_device *test_dev) +{ + struct test_config *config; + + if (!test_dev) + return; + + config = &test_dev->config; + + mutex_lock(&test_dev->config_mutex); + __kmod_config_free(config); + mutex_unlock(&test_dev->config_mutex); +} + +static ssize_t config_test_driver_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + int copied; + + mutex_lock(&test_dev->config_mutex); + + kfree_const(config->test_driver); + config->test_driver = NULL; + + copied = config_copy_test_driver_name(config, buf, count); + mutex_unlock(&test_dev->config_mutex); + + return copied; +} + +/* + * As per sysfs_kf_seq_show() the buf is max PAGE_SIZE. + */ +static ssize_t config_test_show_str(struct mutex *config_mutex, + char *dst, + char *src) +{ + int len; + + mutex_lock(config_mutex); + len = snprintf(dst, PAGE_SIZE, "%s\n", src); + mutex_unlock(config_mutex); + + return len; +} + +static ssize_t config_test_driver_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return config_test_show_str(&test_dev->config_mutex, buf, + config->test_driver); +} +static DEVICE_ATTR(config_test_driver, 0644, config_test_driver_show, + config_test_driver_store); + +static ssize_t config_test_fs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + int copied; + + mutex_lock(&test_dev->config_mutex); + + kfree_const(config->test_fs); + config->test_fs = NULL; + + copied = config_copy_test_fs(config, buf, count); + mutex_unlock(&test_dev->config_mutex); + + return copied; +} + +static ssize_t config_test_fs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return config_test_show_str(&test_dev->config_mutex, buf, + config->test_fs); +} +static DEVICE_ATTR(config_test_fs, 0644, config_test_fs_show, + config_test_fs_store); + +static int trigger_config_run_type(struct kmod_test_device *test_dev, + enum kmod_test_case test_case, + const char *test_str) +{ + int copied = 0; + struct test_config *config = &test_dev->config; + + mutex_lock(&test_dev->config_mutex); + + switch (test_case) { + case TEST_KMOD_DRIVER: + kfree_const(config->test_driver); + config->test_driver = NULL; + copied = config_copy_test_driver_name(config, test_str, + strlen(test_str)); + break; + case TEST_KMOD_FS_TYPE: + break; + kfree_const(config->test_fs); + config->test_driver = NULL; + copied = config_copy_test_fs(config, test_str, + strlen(test_str)); + default: + mutex_unlock(&test_dev->config_mutex); + return -EINVAL; + } + + config->test_case = test_case; + + mutex_unlock(&test_dev->config_mutex); + + if (copied <= 0 || copied != strlen(test_str)) { + test_dev->test_is_oom = true; + return -ENOMEM; + } + + test_dev->test_is_oom = false; + + return trigger_config_run(test_dev); +} + +static void free_test_dev_info(struct kmod_test_device *test_dev) +{ + vfree(test_dev->info); + test_dev->info = NULL; +} + +static int kmod_config_sync_info(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + + free_test_dev_info(test_dev); + test_dev->info = vzalloc(config->num_threads * + sizeof(struct kmod_test_device_info)); + if (!test_dev->info) { + dev_err(test_dev->dev, "Cannot alloc test_dev info\n"); + return -ENOMEM; + } + + return 0; +} + +/* + * Old kernels may not have this, if you want to port this code to + * test it on older kernels. + */ +#ifdef get_kmod_umh_limit +static unsigned int kmod_init_test_thread_limit(void) +{ + return get_kmod_umh_limit(); +} +#else +static unsigned int kmod_init_test_thread_limit(void) +{ + return TEST_START_NUM_THREADS; +} +#endif + +static int __kmod_config_init(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + int ret = -ENOMEM, copied; + + __kmod_config_free(config); + + copied = config_copy_test_driver_name(config, TEST_START_DRIVER, + strlen(TEST_START_DRIVER)); + if (copied != strlen(TEST_START_DRIVER)) + goto err_out; + + copied = config_copy_test_fs(config, TEST_START_TEST_FS, + strlen(TEST_START_TEST_FS)); + if (copied != strlen(TEST_START_TEST_FS)) + goto err_out; + + config->num_threads = kmod_init_test_thread_limit(); + config->test_result = 0; + config->test_case = TEST_START_TEST_CASE; + + ret = kmod_config_sync_info(test_dev); + if (ret) + goto err_out; + + test_dev->test_is_oom = false; + + return 0; + +err_out: + test_dev->test_is_oom = true; + WARN_ON(test_dev->test_is_oom); + + __kmod_config_free(config); + + return ret; +} + +static ssize_t reset_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + int ret; + + mutex_lock(&test_dev->trigger_mutex); + mutex_lock(&test_dev->config_mutex); + + ret = __kmod_config_init(test_dev); + if (ret < 0) { + ret = -ENOMEM; + dev_err(dev, "could not alloc settings for config trigger: %d\n", + ret); + goto out; + } + + dev_info(dev, "reset\n"); + ret = count; + +out: + mutex_unlock(&test_dev->config_mutex); + mutex_unlock(&test_dev->trigger_mutex); + + return ret; +} +static DEVICE_ATTR_WO(reset); + +static int test_dev_config_update_uint_sync(struct kmod_test_device *test_dev, + const char *buf, size_t size, + unsigned int *config, + int (*test_sync)(struct kmod_test_device *test_dev)) +{ + int ret; + long new; + unsigned int old_val; + + ret = kstrtol(buf, 10, &new); + if (ret) + return ret; + + if (new > UINT_MAX) + return -EINVAL; + + mutex_lock(&test_dev->config_mutex); + + old_val = *config; + *(unsigned int *)config = new; + + ret = test_sync(test_dev); + if (ret) { + *(unsigned int *)config = old_val; + + ret = test_sync(test_dev); + WARN_ON(ret); + + mutex_unlock(&test_dev->config_mutex); + return -EINVAL; + } + + mutex_unlock(&test_dev->config_mutex); + /* Always return full write size even if we didn't consume all */ + return size; +} + +static int test_dev_config_update_uint_range(struct kmod_test_device *test_dev, + const char *buf, size_t size, + unsigned int *config, + unsigned int min, + unsigned int max) +{ + int ret; + long new; + + ret = kstrtol(buf, 10, &new); + if (ret) + return ret; + + if (new < min || new > max || new > UINT_MAX) + return -EINVAL; + + mutex_lock(&test_dev->config_mutex); + *config = new; + mutex_unlock(&test_dev->config_mutex); + + /* Always return full write size even if we didn't consume all */ + return size; +} + +static int test_dev_config_update_int(struct kmod_test_device *test_dev, + const char *buf, size_t size, + int *config) +{ + int ret; + long new; + + ret = kstrtol(buf, 10, &new); + if (ret) + return ret; + + if (new > INT_MAX || new < INT_MIN) + return -EINVAL; + + mutex_lock(&test_dev->config_mutex); + *config = new; + mutex_unlock(&test_dev->config_mutex); + /* Always return full write size even if we didn't consume all */ + return size; +} + +static ssize_t test_dev_config_show_int(struct kmod_test_device *test_dev, + char *buf, + int config) +{ + int val; + + mutex_lock(&test_dev->config_mutex); + val = config; + mutex_unlock(&test_dev->config_mutex); + + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} + +static ssize_t test_dev_config_show_uint(struct kmod_test_device *test_dev, + char *buf, + unsigned int config) +{ + unsigned int val; + + mutex_lock(&test_dev->config_mutex); + val = config; + mutex_unlock(&test_dev->config_mutex); + + return snprintf(buf, PAGE_SIZE, "%u\n", val); +} + +static ssize_t test_result_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return test_dev_config_update_int(test_dev, buf, count, + &config->test_result); +} + +static ssize_t config_num_threads_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return test_dev_config_update_uint_sync(test_dev, buf, count, + &config->num_threads, + kmod_config_sync_info); +} + +static ssize_t config_num_threads_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return test_dev_config_show_int(test_dev, buf, config->num_threads); +} +static DEVICE_ATTR(config_num_threads, 0644, config_num_threads_show, + config_num_threads_store); + +static ssize_t config_test_case_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return test_dev_config_update_uint_range(test_dev, buf, count, + &config->test_case, + __TEST_KMOD_INVALID + 1, + __TEST_KMOD_MAX - 1); +} + +static ssize_t config_test_case_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return test_dev_config_show_uint(test_dev, buf, config->test_case); +} +static DEVICE_ATTR(config_test_case, 0644, config_test_case_show, + config_test_case_store); + +static ssize_t test_result_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return test_dev_config_show_int(test_dev, buf, config->test_result); +} +static DEVICE_ATTR(test_result, 0644, test_result_show, test_result_store); + +#define TEST_KMOD_DEV_ATTR(name) &dev_attr_##name.attr + +static struct attribute *test_dev_attrs[] = { + TEST_KMOD_DEV_ATTR(trigger_config), + TEST_KMOD_DEV_ATTR(config), + TEST_KMOD_DEV_ATTR(reset), + + TEST_KMOD_DEV_ATTR(config_test_driver), + TEST_KMOD_DEV_ATTR(config_test_fs), + TEST_KMOD_DEV_ATTR(config_num_threads), + TEST_KMOD_DEV_ATTR(config_test_case), + TEST_KMOD_DEV_ATTR(test_result), + + NULL, +}; + +ATTRIBUTE_GROUPS(test_dev); + +static int kmod_config_init(struct kmod_test_device *test_dev) +{ + int ret; + + mutex_lock(&test_dev->config_mutex); + ret = __kmod_config_init(test_dev); + mutex_unlock(&test_dev->config_mutex); + + return ret; +} + +static struct kmod_test_device *alloc_test_dev_kmod(int idx) +{ + int ret; + struct kmod_test_device *test_dev; + struct miscdevice *misc_dev; + + test_dev = vzalloc(sizeof(struct kmod_test_device)); + if (!test_dev) { + pr_err("Cannot alloc test_dev\n"); + goto err_out; + } + + mutex_init(&test_dev->config_mutex); + mutex_init(&test_dev->trigger_mutex); + mutex_init(&test_dev->thread_mutex); + + init_completion(&test_dev->kthreads_done); + + ret = kmod_config_init(test_dev); + if (ret < 0) { + pr_err("Cannot alloc kmod_config_init()\n"); + goto err_out_free; + } + + test_dev->dev_idx = idx; + misc_dev = &test_dev->misc_dev; + + misc_dev->minor = MISC_DYNAMIC_MINOR; + misc_dev->name = kasprintf(GFP_KERNEL, "test_kmod%d", idx); + if (!misc_dev->name) { + pr_err("Cannot alloc misc_dev->name\n"); + goto err_out_free_config; + } + misc_dev->groups = test_dev_groups; + + return test_dev; + +err_out_free_config: + free_test_dev_info(test_dev); + kmod_config_free(test_dev); +err_out_free: + vfree(test_dev); + test_dev = NULL; +err_out: + return NULL; +} + +static void free_test_dev_kmod(struct kmod_test_device *test_dev) +{ + if (test_dev) { + kfree_const(test_dev->misc_dev.name); + test_dev->misc_dev.name = NULL; + free_test_dev_info(test_dev); + kmod_config_free(test_dev); + vfree(test_dev); + test_dev = NULL; + } +} + +static struct kmod_test_device *register_test_dev_kmod(void) +{ + struct kmod_test_device *test_dev = NULL; + int ret; + + mutex_unlock(®_dev_mutex); + + /* int should suffice for number of devices, test for wrap */ + if (unlikely(num_test_devs + 1) < 0) { + pr_err("reached limit of number of test devices\n"); + goto out; + } + + test_dev = alloc_test_dev_kmod(num_test_devs); + if (!test_dev) + goto out; + + ret = misc_register(&test_dev->misc_dev); + if (ret) { + pr_err("could not register misc device: %d\n", ret); + free_test_dev_kmod(test_dev); + goto out; + } + + test_dev->dev = test_dev->misc_dev.this_device; + list_add_tail(&test_dev->list, ®_test_devs); + dev_info(test_dev->dev, "interface ready\n"); + + num_test_devs++; + +out: + mutex_unlock(®_dev_mutex); + + return test_dev; + +} + +static int __init test_kmod_init(void) +{ + struct kmod_test_device *test_dev; + int ret; + + test_dev = register_test_dev_kmod(); + if (!test_dev) { + pr_err("Cannot add first test kmod device\n"); + return -ENODEV; + } + + /* + * With some work we might be able to gracefully enable + * testing with this driver built-in, for now this seems + * rather risky. For those willing to try have at it, + * and enable the below. Good luck! If that works, try + * lowering the init level for more fun. + */ + if (force_init_test) { + ret = trigger_config_run_type(test_dev, + TEST_KMOD_DRIVER, "tun"); + if (WARN_ON(ret)) + return ret; + ret = trigger_config_run_type(test_dev, + TEST_KMOD_FS_TYPE, "btrfs"); + if (WARN_ON(ret)) + return ret; + } + + return 0; +} +late_initcall(test_kmod_init); + +static +void unregister_test_dev_kmod(struct kmod_test_device *test_dev) +{ + mutex_lock(&test_dev->trigger_mutex); + mutex_lock(&test_dev->config_mutex); + + test_dev_kmod_stop_tests(test_dev); + + dev_info(test_dev->dev, "removing interface\n"); + misc_deregister(&test_dev->misc_dev); + kfree(&test_dev->misc_dev.name); + + mutex_unlock(&test_dev->config_mutex); + mutex_unlock(&test_dev->trigger_mutex); + + free_test_dev_kmod(test_dev); +} + +static void __exit test_kmod_exit(void) +{ + struct kmod_test_device *test_dev, *tmp; + + mutex_lock(®_dev_mutex); + list_for_each_entry_safe(test_dev, tmp, ®_test_devs, list) { + list_del(&test_dev->list); + unregister_test_dev_kmod(test_dev); + } + mutex_unlock(®_dev_mutex); +} +module_exit(test_kmod_exit); + +MODULE_AUTHOR("Luis R. Rodriguez "); +MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/kmod/Makefile b/tools/testing/selftests/kmod/Makefile new file mode 100644 index 000000000000..fa2ccc5fb3de --- /dev/null +++ b/tools/testing/selftests/kmod/Makefile @@ -0,0 +1,11 @@ +# Makefile for kmod loading selftests + +# No binaries, but make sure arg-less "make" doesn't trigger "run_tests" +all: + +TEST_PROGS := kmod.sh + +include ../lib.mk + +# Nothing to clean up. +clean: diff --git a/tools/testing/selftests/kmod/config b/tools/testing/selftests/kmod/config new file mode 100644 index 000000000000..259f4fd6b5e2 --- /dev/null +++ b/tools/testing/selftests/kmod/config @@ -0,0 +1,7 @@ +CONFIG_TEST_KMOD=m +CONFIG_TEST_LKM=m +CONFIG_XFS_FS=m + +# For the module parameter force_init_test is used +CONFIG_TUN=m +CONFIG_BTRFS_FS=m diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh new file mode 100644 index 000000000000..10196a62ed09 --- /dev/null +++ b/tools/testing/selftests/kmod/kmod.sh @@ -0,0 +1,635 @@ +#!/bin/bash +# +# Copyright (C) 2017 Luis R. Rodriguez +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 2 of the License, or at your option any +# later version; or, when distributed separately from the Linux kernel or +# when incorporated into other software packages, subject to the following +# license: +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of copyleft-next (version 0.3.1 or later) as published +# at http://copyleft-next.org/. + +# This is a stress test script for kmod, the kernel module loader. It uses +# test_kmod which exposes a series of knobs for the API for us so we can +# tweak each test in userspace rather than in kernelspace. +# +# The way kmod works is it uses the kernel's usermode helper API to eventually +# call /sbin/modprobe. It has a limit of the number of concurrent calls +# possible. The kernel interface to load modules is request_module(), however +# mount uses get_fs_type(). Both behave slightly differently, but the +# differences are important enough to test each call separately. For this +# reason test_kmod starts by providing tests for both calls. +# +# The test driver test_kmod assumes a series of defaults which you can +# override by exporting to your environment prior running this script. +# For instance this script assumes you do not have xfs loaded upon boot. +# If this is false, export DEFAULT_KMOD_FS="ext4" prior to running this +# script if the filesyste module you don't have loaded upon bootup +# is ext4 instead. Refer to allow_user_defaults() for a list of user +# override variables possible. +# +# You'll want at least 4 GiB of RAM to expect to run these tests +# without running out of memory on them. For other requirements refer +# to test_reqs() + +set -e + +TEST_NAME="kmod" +TEST_DRIVER="test_${TEST_NAME}" +TEST_DIR=$(dirname $0) + +# This represents +# +# TEST_ID:TEST_COUNT:ENABLED +# +# TEST_ID: is the test id number +# TEST_COUNT: number of times we should run the test +# ENABLED: 1 if enabled, 0 otherwise +# +# Once these are enabled please leave them as-is. Write your own test, +# we have tons of space. +ALL_TESTS="0001:3:1" +ALL_TESTS="$ALL_TESTS 0002:3:1" +ALL_TESTS="$ALL_TESTS 0003:1:1" +ALL_TESTS="$ALL_TESTS 0004:1:1" +ALL_TESTS="$ALL_TESTS 0005:10:1" +ALL_TESTS="$ALL_TESTS 0006:10:1" +ALL_TESTS="$ALL_TESTS 0007:5:1" + +# Disabled tests: +# +# 0008 x 150 - multithreaded - push kmod_concurrent over max_modprobes for request_module()" +# Current best-effort failure interpretation: +# Enough module requests get loaded in place fast enough to reach over the +# max_modprobes limit and trigger a failure -- before we're even able to +# start processing pending requests. +ALL_TESTS="$ALL_TESTS 0008:150:0" + +# 0009 x 150 - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type()" +# Current best-effort failure interpretation: +# +# get_fs_type() requests modules using aliases as such the optimization in +# place today to look for already loaded modules will not take effect and +# we end up requesting a new module to load, this bumps the kmod_concurrent, +# and in certain circumstances can lead to pushing the kmod_concurrent over +# the max_modprobe limit. +# +# This test fails much easier than test 0008 since the alias optimizations +# are not in place. +ALL_TESTS="$ALL_TESTS 0009:150:0" + +test_modprobe() +{ + if [ ! -d $DIR ]; then + echo "$0: $DIR not present" >&2 + echo "You must have the following enabled in your kernel:" >&2 + cat $TEST_DIR/config >&2 + exit 1 + fi +} + +function allow_user_defaults() +{ + if [ -z $DEFAULT_KMOD_DRIVER ]; then + DEFAULT_KMOD_DRIVER="test_module" + fi + + if [ -z $DEFAULT_KMOD_FS ]; then + DEFAULT_KMOD_FS="xfs" + fi + + if [ -z $PROC_DIR ]; then + PROC_DIR="/proc/sys/kernel/" + fi + + if [ -z $MODPROBE_LIMIT ]; then + MODPROBE_LIMIT=50 + fi + + if [ -z $DIR ]; then + DIR="/sys/devices/virtual/misc/${TEST_DRIVER}0/" + fi + + if [ -z $DEFAULT_NUM_TESTS ]; then + DEFAULT_NUM_TESTS=150 + fi + + MODPROBE_LIMIT_FILE="${PROC_DIR}/kmod-limit" +} + +test_reqs() +{ + if ! which modprobe 2> /dev/null > /dev/null; then + echo "$0: You need modprobe installed" >&2 + exit 1 + fi + + if ! which kmod 2> /dev/null > /dev/null; then + echo "$0: You need kmod installed" >&2 + exit 1 + fi + + # kmod 19 has a bad bug where it returns 0 when modprobe + # gets called *even* if the module was not loaded due to + # some bad heuristics. For details see: + # + # A work around is possible in-kernel but its rather + # complex. + KMOD_VERSION=$(kmod --version | awk '{print $3}') + if [[ $KMOD_VERSION -le 19 ]]; then + echo "$0: You need at least kmod 20" >&2 + echo "kmod <= 19 is buggy, for details see:" >&2 + echo "http://git.kernel.org/cgit/utils/kernel/kmod/kmod.git/commit/libkmod/libkmod-module.c?id=fd44a98ae2eb5eb32161088954ab21e58e19dfc4" >&2 + exit 1 + fi + + uid=$(id -u) + if [ $uid -ne 0 ]; then + echo $msg must be run as root >&2 + exit 0 + fi +} + +function load_req_mod() +{ + trap "test_modprobe" EXIT + + if [ ! -d $DIR ]; then + # Alanis: "Oh isn't it ironic?" + modprobe $TEST_DRIVER + fi +} + +test_finish() +{ + echo "Test completed" +} + +errno_name_to_val() +{ + case "$1" in + # kmod calls modprobe and upon of a module not found + # modprobe returns just 1... However in the kernel we + # *sometimes* see 256... + MODULE_NOT_FOUND) + echo 256;; + SUCCESS) + echo 0;; + -EPERM) + echo -1;; + -ENOENT) + echo -2;; + -EINVAL) + echo -22;; + -ERR_ANY) + echo -123456;; + *) + echo invalid;; + esac +} + +errno_val_to_name() + case "$1" in + 256) + echo MODULE_NOT_FOUND;; + 0) + echo SUCCESS;; + -1) + echo -EPERM;; + -2) + echo -ENOENT;; + -22) + echo -EINVAL;; + -123456) + echo -ERR_ANY;; + *) + echo invalid;; + esac + +config_set_test_case_driver() +{ + if ! echo -n 1 >$DIR/config_test_case; then + echo "$0: Unable to set to test case to driver" >&2 + exit 1 + fi +} + +config_set_test_case_fs() +{ + if ! echo -n 2 >$DIR/config_test_case; then + echo "$0: Unable to set to test case to fs" >&2 + exit 1 + fi +} + +config_num_threads() +{ + if ! echo -n $1 >$DIR/config_num_threads; then + echo "$0: Unable to set to number of threads" >&2 + exit 1 + fi +} + +config_get_modprobe_limit() +{ + if [[ -f ${MODPROBE_LIMIT_FILE} ]] ; then + MODPROBE_LIMIT=$(cat $MODPROBE_LIMIT_FILE) + fi + echo $MODPROBE_LIMIT +} + +config_num_thread_limit_extra() +{ + MODPROBE_LIMIT=$(config_get_modprobe_limit) + let EXTRA_LIMIT=$MODPROBE_LIMIT+$1 + config_num_threads $EXTRA_LIMIT +} + +# For special characters use printf directly, +# refer to kmod_test_0001 +config_set_driver() +{ + if ! echo -n $1 >$DIR/config_test_driver; then + echo "$0: Unable to set driver" >&2 + exit 1 + fi +} + +config_set_fs() +{ + if ! echo -n $1 >$DIR/config_test_fs; then + echo "$0: Unable to set driver" >&2 + exit 1 + fi +} + +config_get_driver() +{ + cat $DIR/config_test_driver +} + +config_get_test_result() +{ + cat $DIR/test_result +} + +config_reset() +{ + if ! echo -n "1" >"$DIR"/reset; then + echo "$0: reset shuld have worked" >&2 + exit 1 + fi +} + +config_show_config() +{ + echo "----------------------------------------------------" + cat "$DIR"/config + echo "----------------------------------------------------" +} + +config_trigger() +{ + if ! echo -n "1" >"$DIR"/trigger_config 2>/dev/null; then + echo "$1: FAIL - loading should have worked" + config_show_config + exit 1 + fi + echo "$1: OK! - loading kmod test" +} + +config_trigger_want_fail() +{ + if echo "1" > $DIR/trigger_config 2>/dev/null; then + echo "$1: FAIL - test case was expected to fail" + config_show_config + exit 1 + fi + echo "$1: OK! - kmod test case failed as expected" +} + +config_expect_result() +{ + RC=$(config_get_test_result) + RC_NAME=$(errno_val_to_name $RC) + + ERRNO_NAME=$2 + ERRNO=$(errno_name_to_val $ERRNO_NAME) + + if [[ $ERRNO_NAME = "-ERR_ANY" ]]; then + if [[ $RC -ge 0 ]]; then + echo "$1: FAIL, test expects $ERRNO_NAME - got $RC_NAME ($RC)" >&2 + config_show_config + exit 1 + fi + elif [[ $RC != $ERRNO ]]; then + echo "$1: FAIL, test expects $ERRNO_NAME ($ERRNO) - got $RC_NAME ($RC)" >&2 + config_show_config + exit 1 + fi + echo "$1: OK! - Return value: $RC ($RC_NAME), expected $ERRNO_NAME" +} + +kmod_defaults_driver() +{ + config_reset + modprobe -r $DEFAULT_KMOD_DRIVER + config_set_driver $DEFAULT_KMOD_DRIVER +} + +kmod_defaults_fs() +{ + config_reset + modprobe -r $DEFAULT_KMOD_FS + config_set_fs $DEFAULT_KMOD_FS + config_set_test_case_fs +} + +kmod_test_0001_driver() +{ + NAME='\000' + + kmod_defaults_driver + config_num_threads 1 + printf '\000' >"$DIR"/config_test_driver + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} MODULE_NOT_FOUND +} + +kmod_test_0001_fs() +{ + NAME='\000' + + kmod_defaults_fs + config_num_threads 1 + printf '\000' >"$DIR"/config_test_fs + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} -EINVAL +} + +kmod_test_0001() +{ + kmod_test_0001_driver + kmod_test_0001_fs +} + +kmod_test_0002_driver() +{ + NAME="nope-$DEFAULT_KMOD_DRIVER" + + kmod_defaults_driver + config_set_driver $NAME + config_num_threads 1 + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} MODULE_NOT_FOUND +} + +kmod_test_0002_fs() +{ + NAME="nope-$DEFAULT_KMOD_FS" + + kmod_defaults_fs + config_set_fs $NAME + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} -EINVAL +} + +kmod_test_0002() +{ + kmod_test_0002_driver + kmod_test_0002_fs +} + +kmod_test_0003() +{ + kmod_defaults_fs + config_num_threads 1 + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} SUCCESS +} + +kmod_test_0004() +{ + kmod_defaults_fs + config_num_threads 2 + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} SUCCESS +} + +kmod_test_0005() +{ + kmod_defaults_driver + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} SUCCESS +} + +kmod_test_0006() +{ + kmod_defaults_fs + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} SUCCESS +} + +kmod_test_0007() +{ + kmod_test_0005 + kmod_test_0006 +} + +kmod_test_0008() +{ + kmod_defaults_driver + MODPROBE_LIMIT=$(config_get_modprobe_limit) + let EXTRA=$MODPROBE_LIMIT/6 + config_num_thread_limit_extra $EXTRA + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} SUCCESS +} + +kmod_test_0009() +{ + kmod_defaults_fs + MODPROBE_LIMIT=$(config_get_modprobe_limit) + let EXTRA=$MODPROBE_LIMIT/4 + config_num_thread_limit_extra $EXTRA + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} SUCCESS +} + +list_tests() +{ + echo "Test ID list:" + echo + echo "TEST_ID x NUM_TEST" + echo "TEST_ID: Test ID" + echo "NUM_TESTS: Number of recommended times to run the test" + echo + echo "0001 x $(get_test_count 0001) - Simple test - 1 thread for empty string" + echo "0002 x $(get_test_count 0002) - Simple test - 1 thread for modules/filesystems that do not exist" + echo "0003 x $(get_test_count 0003) - Simple test - 1 thread for get_fs_type() only" + echo "0004 x $(get_test_count 0004) - Simple test - 2 threads for get_fs_type() only" + echo "0005 x $(get_test_count 0005) - multithreaded tests with default setup - request_module() only" + echo "0006 x $(get_test_count 0006) - multithreaded tests with default setup - get_fs_type() only" + echo "0007 x $(get_test_count 0007) - multithreaded tests with default setup test request_module() and get_fs_type()" + echo "0008 x $(get_test_count 0008) - multithreaded - push kmod_concurrent over max_modprobes for request_module()" + echo "0009 x $(get_test_count 0009) - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type()" +} + +usage() +{ + NUM_TESTS=$(grep -o ' ' <<<"$ALL_TESTS" | grep -c .) + let NUM_TESTS=$NUM_TESTS+1 + MAX_TEST=$(printf "%04d\n" $NUM_TESTS) + echo "Usage: $0 [ -t <4-number-digit> ] | [ -w <4-number-digit> ] |" + echo " [ -s <4-number-digit> ] | [ -c <4-number-digit> " + echo " [ all ] [ -h | --help ] [ -l ]" + echo "" + echo "Valid tests: 0001-$MAX_TEST" + echo "" + echo " all Runs all tests (default)" + echo " -t Run test ID the number amount of times is recommended" + echo " -w Watch test ID run until it runs into an error" + echo " -c Run test ID once" + echo " -s Run test ID x test-count number of times" + echo " -l List all test ID list" + echo " -h|--help Help" + echo + echo "If an error every occurs execution will immediately terminate." + echo "If you are adding a new test try using -w first to" + echo "make sure the test passes a series of tests." + echo + echo Example uses: + echo + echo "${TEST_NAME}.sh -- executes all tests" + echo "${TEST_NAME}.sh -t 0008 -- Executes test ID 0008 number of times is recomended" + echo "${TEST_NAME}.sh -w 0008 -- Watch test ID 0008 run until an error occurs" + echo "${TEST_NAME}.sh -s 0008 -- Run test ID 0008 once" + echo "${TEST_NAME}.sh -c 0008 3 -- Run test ID 0008 three times" + echo + list_tests + exit 1 +} + +function test_num() +{ + re='^[0-9]+$' + if ! [[ $1 =~ $re ]]; then + usage + fi +} + +function get_test_count() +{ + test_num $1 + TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') + LAST_TWO=${TEST_DATA#*:*} + echo ${LAST_TWO%:*} +} + +function get_test_enabled() +{ + test_num $1 + TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') + echo ${TEST_DATA#*:*:} +} + +function run_all_tests() +{ + for i in $ALL_TESTS ; do + TEST_ID=${i%:*:*} + ENABLED=$(get_test_enabled $TEST_ID) + TEST_COUNT=$(get_test_count $TEST_ID) + if [[ $ENABLED -eq "1" ]]; then + test_case $TEST_ID $TEST_COUNT + fi + done +} + +function watch_log() +{ + if [ $# -ne 3 ]; then + clear + fi + date + echo "Running test: $2 - run #$1" +} + +function watch_case() +{ + i=0 + while [ 1 ]; do + + if [ $# -eq 1 ]; then + test_num $1 + watch_log $i ${TEST_NAME}_test_$1 + ${TEST_NAME}_test_$1 + else + watch_log $i all + run_all_tests + fi + let i=$i+1 + done +} + +function test_case() +{ + NUM_TESTS=$DEFAULT_NUM_TESTS + if [ $# -eq 2 ]; then + NUM_TESTS=$2 + fi + + i=0 + while [ $i -lt $NUM_TESTS ]; do + test_num $1 + watch_log $i ${TEST_NAME}_test_$1 noclear + RUN_TEST=${TEST_NAME}_test_$1 + $RUN_TEST + let i=$i+1 + done +} + +function parse_args() +{ + if [ $# -eq 0 ]; then + run_all_tests + else + if [[ "$1" = "all" ]]; then + run_all_tests + elif [[ "$1" = "-w" ]]; then + shift + watch_case $@ + elif [[ "$1" = "-t" ]]; then + shift + test_num $1 + test_case $1 $(get_test_count $1) + elif [[ "$1" = "-c" ]]; then + shift + test_num $1 + test_num $2 + test_case $1 $2 + elif [[ "$1" = "-s" ]]; then + shift + test_case $1 1 + elif [[ "$1" = "-l" ]]; then + list_tests + elif [[ "$1" = "-h" || "$1" = "--help" ]]; then + usage + else + usage + fi + fi +} + +test_reqs +allow_user_defaults +load_req_mod + +trap "test_finish" EXIT + +parse_args $@ + +exit 0 -- cgit v1.2.3 From eecabf567422eda02bd179f2707d8fe24f52d888 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 8 Jun 2017 04:16:59 -0400 Subject: random: suppress spammy warnings about unseeded randomness Unfortunately, on some models of some architectures getting a fully seeded CRNG is extremely difficult, and so this can result in dmesg getting spammed for a surprisingly long time. This is really bad from a security perspective, and so architecture maintainers really need to do what they can to get the CRNG seeded sooner after the system is booted. However, users can't do anything actionble to address this, and spamming the kernel messages log will only just annoy people. For developers who want to work on improving this situation, CONFIG_WARN_UNSEEDED_RANDOM has been renamed to CONFIG_WARN_ALL_UNSEEDED_RANDOM. By default the kernel will always print the first use of unseeded randomness. This way, hopefully the security obsessed will be happy that there is _some_ indication when the kernel boots there may be a potential issue with that architecture or subarchitecture. To see all uses of unseeded randomness, developers can enable CONFIG_WARN_ALL_UNSEEDED_RANDOM. Signed-off-by: Theodore Ts'o --- drivers/char/random.c | 56 +++++++++++++++++++++++++++++++++++---------------- lib/Kconfig.debug | 24 ++++++++++++++++------ 2 files changed, 57 insertions(+), 23 deletions(-) (limited to 'lib') diff --git a/drivers/char/random.c b/drivers/char/random.c index fa5bbd5a7ca0..799d37981d99 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -436,6 +436,7 @@ static void _extract_crng(struct crng_state *crng, static void _crng_backtrack_protect(struct crng_state *crng, __u8 tmp[CHACHA20_BLOCK_SIZE], int used); static void process_random_ready_list(void); +static void _get_random_bytes(void *buf, int nbytes); /********************************************************************** * @@ -776,7 +777,7 @@ static void crng_initialize(struct crng_state *crng) _extract_entropy(&input_pool, &crng->state[4], sizeof(__u32) * 12, 0); else - get_random_bytes(&crng->state[4], sizeof(__u32) * 12); + _get_random_bytes(&crng->state[4], sizeof(__u32) * 12); for (i = 4; i < 16; i++) { if (!arch_get_random_seed_long(&rv) && !arch_get_random_long(&rv)) @@ -1466,6 +1467,30 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, return ret; } +#define warn_unseeded_randomness(previous) \ + _warn_unseeded_randomness(__func__, (void *) _RET_IP_, (previous)) + +static void _warn_unseeded_randomness(const char *func_name, void *caller, + void **previous) +{ +#ifdef CONFIG_WARN_ALL_UNSEEDED_RANDOM + const bool print_once = false; +#else + static bool print_once __read_mostly; +#endif + + if (print_once || + crng_ready() || + (previous && (caller == READ_ONCE(*previous)))) + return; + WRITE_ONCE(*previous, caller); +#ifndef CONFIG_WARN_ALL_UNSEEDED_RANDOM + print_once = true; +#endif + pr_notice("random: %s called from %pF with crng_init=%d\n", + func_name, caller, crng_init); +} + /* * This function is the exported kernel interface. It returns some * number of good random numbers, suitable for key generation, seeding @@ -1476,15 +1501,10 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, * wait_for_random_bytes() should be called and return 0 at least once * at any point prior. */ -void get_random_bytes(void *buf, int nbytes) +static void _get_random_bytes(void *buf, int nbytes) { __u8 tmp[CHACHA20_BLOCK_SIZE]; -#ifdef CONFIG_WARN_UNSEEDED_RANDOM - if (!crng_ready()) - printk(KERN_NOTICE "random: %pF get_random_bytes called " - "with crng_init = %d\n", (void *) _RET_IP_, crng_init); -#endif trace_get_random_bytes(nbytes, _RET_IP_); while (nbytes >= CHACHA20_BLOCK_SIZE) { @@ -1501,6 +1521,14 @@ void get_random_bytes(void *buf, int nbytes) crng_backtrack_protect(tmp, CHACHA20_BLOCK_SIZE); memzero_explicit(tmp, sizeof(tmp)); } + +void get_random_bytes(void *buf, int nbytes) +{ + static void *previous; + + warn_unseeded_randomness(&previous); + _get_random_bytes(buf, nbytes); +} EXPORT_SYMBOL(get_random_bytes); /* @@ -2064,6 +2092,7 @@ u64 get_random_u64(void) bool use_lock = READ_ONCE(crng_init) < 2; unsigned long flags = 0; struct batched_entropy *batch; + static void *previous; #if BITS_PER_LONG == 64 if (arch_get_random_long((unsigned long *)&ret)) @@ -2074,11 +2103,7 @@ u64 get_random_u64(void) return ret; #endif -#ifdef CONFIG_WARN_UNSEEDED_RANDOM - if (!crng_ready()) - printk(KERN_NOTICE "random: %pF get_random_u64 called " - "with crng_init = %d\n", (void *) _RET_IP_, crng_init); -#endif + warn_unseeded_randomness(&previous); batch = &get_cpu_var(batched_entropy_u64); if (use_lock) @@ -2102,15 +2127,12 @@ u32 get_random_u32(void) bool use_lock = READ_ONCE(crng_init) < 2; unsigned long flags = 0; struct batched_entropy *batch; + static void *previous; if (arch_get_random_int(&ret)) return ret; -#ifdef CONFIG_WARN_UNSEEDED_RANDOM - if (!crng_ready()) - printk(KERN_NOTICE "random: %pF get_random_u32 called " - "with crng_init = %d\n", (void *) _RET_IP_, crng_init); -#endif + warn_unseeded_randomness(&previous); batch = &get_cpu_var(batched_entropy_u32); if (use_lock) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c4159605bfbf..9d0a244074b9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1209,10 +1209,9 @@ config STACKTRACE It is also used by various kernel debugging features that require stack trace generation. -config WARN_UNSEEDED_RANDOM - bool "Warn when kernel uses unseeded randomness" - default y - depends on DEBUG_KERNEL +config WARN_ALL_UNSEEDED_RANDOM + bool "Warn for all uses of unseeded randomness" + default n help Some parts of the kernel contain bugs relating to their use of cryptographically secure random numbers before it's actually possible @@ -1222,8 +1221,21 @@ config WARN_UNSEEDED_RANDOM are going wrong, so that they might contact developers about fixing it. - Say Y here, unless you simply do not care about using unseeded - randomness and do not want a potential warning message in your logs. + Unfortunately, on some models of some architectures getting + a fully seeded CRNG is extremely difficult, and so this can + result in dmesg getting spammed for a surprisingly long + time. This is really bad from a security perspective, and + so architecture maintainers really need to do what they can + to get the CRNG seeded sooner after the system is booted. + However, since users can not do anything actionble to + address this, by default the kernel will issue only a single + warning for the first use of unseeded randomness. + + Say Y here if you want to receive warnings for all uses of + unseeded randomness. This will be of use primarily for + those developers interersted in improving the security of + Linux kernels running on their architecture (or + subarchitecture). config DEBUG_KOBJECT bool "kobject debugging" -- cgit v1.2.3