From d46150d6fd1068e39e47a0c6e1f437ea71d83928 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 30 Sep 2024 14:33:21 +0200 Subject: lib/interval_tree_test.c: Include instead of Substitute the inclusion of header with to allow the removal of legacy inclusion of from . Signed-off-by: Uros Bizjak Cc: Andrew Morton Signed-off-by: Jason A. Donenfeld --- lib/interval_tree_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/interval_tree_test.c b/lib/interval_tree_test.c index f37f4d44faa9..837064b83a6c 100644 --- a/lib/interval_tree_test.c +++ b/lib/interval_tree_test.c @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From 9127ad42420dda0c54ddc00826242c46a918c116 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 30 Sep 2024 14:33:22 +0200 Subject: kunit: string-stream-test: Include Include header to allow the removal of legacy inclusion of from . Signed-off-by: Uros Bizjak Cc: Brendan Higgins Cc: David Gow Cc: Rae Moar Signed-off-by: Jason A. Donenfeld --- lib/kunit/string-stream-test.c | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/kunit/string-stream-test.c b/lib/kunit/string-stream-test.c index 7511442ea98f..7734e33156f9 100644 --- a/lib/kunit/string-stream-test.c +++ b/lib/kunit/string-stream-test.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From baacb8b41308988f42cdbfa27991b1b0014d0228 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 30 Sep 2024 14:33:23 +0200 Subject: random32: Include instead of Substitute the inclusion of header with to allow the removal of legacy inclusion of from . Signed-off-by: Uros Bizjak Cc: Andrew Morton Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Signed-off-by: Jason A. Donenfeld --- lib/random32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/random32.c b/lib/random32.c index 32060b852668..31fc2ca68856 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From a7e74510e03d6254791f4a8e3eabf5f231c771a8 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 30 Sep 2024 14:33:24 +0200 Subject: lib/rbtree-test: Include instead of Substitute the inclusion of header with to allow the removal of legacy inclusion of from . Signed-off-by: Uros Bizjak Cc: Andrew Morton Signed-off-by: Jason A. Donenfeld --- lib/rbtree_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 41ae3c7570d3..8655a76d29a1 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From 2e2fe47182fcc006c3e1054d8d67e312af359843 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 30 Sep 2024 14:33:25 +0200 Subject: bpf/tests: Include instead of Substitute the inclusion of header with to allow the removal of legacy inclusion of from . Signed-off-by: Uros Bizjak Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Eduard Zingerman Cc: Song Liu Cc: Yonghong Song Cc: John Fastabend Cc: KP Singh Cc: Stanislav Fomichev Cc: Hao Luo Cc: Jiri Olsa Signed-off-by: Jason A. Donenfeld --- lib/test_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test_bpf.c b/lib/test_bpf.c index fa5edd6ef7f7..2eed1ad958e9 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From 1da74f9050a14b6e7f22e6f4d60bf62e517970cd Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 30 Sep 2024 14:33:26 +0200 Subject: lib/test_parman: Include instead of Substitute the inclusion of header with to allow the removal of legacy inclusion of from . Signed-off-by: Uros Bizjak Cc: Andrew Morton Cc: Jiri Pirko Signed-off-by: Jason A. Donenfeld --- lib/test_parman.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test_parman.c b/lib/test_parman.c index 35e32243693c..f9b97426a337 100644 --- a/lib/test_parman.c +++ b/lib/test_parman.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #define TEST_PARMAN_PRIO_SHIFT 7 /* defines number of prios for testing */ -- cgit v1.2.3 From 0402779aae14d56a7972a83250fd3fe636abaf12 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 30 Sep 2024 14:33:27 +0200 Subject: lib/test_scanf: Include instead of Substitute the inclusion of header with to allow the removal of legacy inclusion of from . Signed-off-by: Uros Bizjak Acked-by: Petr Mladek Cc: Steven Rostedt Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Sergey Senozhatsky Cc: Andrew Morton Signed-off-by: Jason A. Donenfeld --- lib/test_scanf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test_scanf.c b/lib/test_scanf.c index 7257b1768545..44f8508c9d88 100644 --- a/lib/test_scanf.c +++ b/lib/test_scanf.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From 8b3e26677bc64d42d2f38d9abc8dccc09d8a4259 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Oct 2024 14:51:50 -0700 Subject: lib: packing: refuse operating on bit indices which exceed size of buffer While reworking the implementation, it became apparent that this check does not exist. There is no functional issue yet, because at call sites, "startbit" and "endbit" are always hardcoded to correct values, and never come from the user. Even with the upcoming support of arbitrary buffer lengths, the "startbit >= 8 * pbuflen" check will remain correct. This is because we intend to always interpret the packed buffer in a way that avoids discontinuities in the available bit indices. Signed-off-by: Vladimir Oltean Tested-by: Jacob Keller Signed-off-by: Jacob Keller Reviewed-by: Przemek Kitszel Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-1-8373e551eae3@intel.com Signed-off-by: Jakub Kicinski --- lib/packing.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/packing.c b/lib/packing.c index 3f656167c17e..439125286d2b 100644 --- a/lib/packing.c +++ b/lib/packing.c @@ -86,8 +86,10 @@ int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, */ int plogical_first_u8, plogical_last_u8, box; - /* startbit is expected to be larger than endbit */ - if (startbit < endbit) + /* startbit is expected to be larger than endbit, and both are + * expected to be within the logically addressable range of the buffer. + */ + if (unlikely(startbit < endbit || startbit >= 8 * pbuflen || endbit < 0)) /* Invalid function call */ return -EINVAL; -- cgit v1.2.3 From a636ba5e8682ae3b0c80efa2485cf6c1d4ff3a51 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Oct 2024 14:51:51 -0700 Subject: lib: packing: adjust definitions and implementation for arbitrary buffer lengths Jacob Keller has a use case for packing() in the intel/ice networking driver, but it cannot be used as-is. Simply put, the API quirks for LSW32_IS_FIRST and LITTLE_ENDIAN are naively implemented with the undocumented assumption that the buffer length must be a multiple of 4. All calculations of group offsets and offsets of bytes within groups assume that this is the case. But in the ice case, this does not hold true. For example, packing into a buffer of 22 bytes would yield wrong results, but pretending it was a 24 byte buffer would work. Rather than requiring such hacks, and leaving a big question mark when it comes to discontinuities in the accessible bit fields of such buffer, we should extend the packing API to support this use case. It turns out that we can keep the design in terms of groups of 4 bytes, but also make it work if the total length is not a multiple of 4. Just like before, imagine the buffer as a big number, and its most significant bytes (the ones that would make up to a multiple of 4) are missing. Thus, with a big endian (no quirks) interpretation of the buffer, those most significant bytes would be absent from the beginning of the buffer, and with a LSW32_IS_FIRST interpretation, they would be absent from the end of the buffer. The LITTLE_ENDIAN quirk, in the packing() API world, only affects byte ordering within groups of 4. Thus, it does not change which bytes are missing. Only the significance of the remaining bytes within the (smaller) group. No change intended for buffer sizes which are multiples of 4. Tested with the sja1105 driver and with downstream unit tests. Link: https://lore.kernel.org/netdev/a0338310-e66c-497c-bc1f-a597e50aa3ff@intel.com/ Signed-off-by: Vladimir Oltean Tested-by: Jacob Keller Signed-off-by: Jacob Keller Reviewed-by: Przemek Kitszel Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-2-8373e551eae3@intel.com Signed-off-by: Jakub Kicinski --- Documentation/core-api/packing.rst | 71 ++++++++++++++++++++++++++++++++++++++ lib/packing.c | 70 ++++++++++++++++++++++--------------- 2 files changed, 114 insertions(+), 27 deletions(-) (limited to 'lib') diff --git a/Documentation/core-api/packing.rst b/Documentation/core-api/packing.rst index 3ed13bc9a195..821691f23c54 100644 --- a/Documentation/core-api/packing.rst +++ b/Documentation/core-api/packing.rst @@ -151,6 +151,77 @@ the more significant 4-byte word. We always think of our offsets as if there were no quirk, and we translate them afterwards, before accessing the memory region. +Note on buffer lengths not multiple of 4 +---------------------------------------- + +To deal with memory layout quirks where groups of 4 bytes are laid out "little +endian" relative to each other, but "big endian" within the group itself, the +concept of groups of 4 bytes is intrinsic to the packing API (not to be +confused with the memory access, which is performed byte by byte, though). + +With buffer lengths not multiple of 4, this means one group will be incomplete. +Depending on the quirks, this may lead to discontinuities in the bit fields +accessible through the buffer. The packing API assumes discontinuities were not +the intention of the memory layout, so it avoids them by effectively logically +shortening the most significant group of 4 octets to the number of octets +actually available. + +Example with a 31 byte sized buffer given below. Physical buffer offsets are +implicit, and increase from left to right within a group, and from top to +bottom within a column. + +No quirks: + +:: + + 31 29 28 | Group 7 (most significant) + 27 26 25 24 | Group 6 + 23 22 21 20 | Group 5 + 19 18 17 16 | Group 4 + 15 14 13 12 | Group 3 + 11 10 9 8 | Group 2 + 7 6 5 4 | Group 1 + 3 2 1 0 | Group 0 (least significant) + +QUIRK_LSW32_IS_FIRST: + +:: + + 3 2 1 0 | Group 0 (least significant) + 7 6 5 4 | Group 1 + 11 10 9 8 | Group 2 + 15 14 13 12 | Group 3 + 19 18 17 16 | Group 4 + 23 22 21 20 | Group 5 + 27 26 25 24 | Group 6 + 30 29 28 | Group 7 (most significant) + +QUIRK_LITTLE_ENDIAN: + +:: + + 30 28 29 | Group 7 (most significant) + 24 25 26 27 | Group 6 + 20 21 22 23 | Group 5 + 16 17 18 19 | Group 4 + 12 13 14 15 | Group 3 + 8 9 10 11 | Group 2 + 4 5 6 7 | Group 1 + 0 1 2 3 | Group 0 (least significant) + +QUIRK_LITTLE_ENDIAN | QUIRK_LSW32_IS_FIRST: + +:: + + 0 1 2 3 | Group 0 (least significant) + 4 5 6 7 | Group 1 + 8 9 10 11 | Group 2 + 12 13 14 15 | Group 3 + 16 17 18 19 | Group 4 + 20 21 22 23 | Group 5 + 24 25 26 27 | Group 6 + 28 29 30 | Group 7 (most significant) + Intended use ------------ diff --git a/lib/packing.c b/lib/packing.c index 439125286d2b..435236a914fe 100644 --- a/lib/packing.c +++ b/lib/packing.c @@ -9,27 +9,6 @@ #include #include -static int get_le_offset(int offset) -{ - int closest_multiple_of_4; - - closest_multiple_of_4 = (offset / 4) * 4; - offset -= closest_multiple_of_4; - return closest_multiple_of_4 + (3 - offset); -} - -static int get_reverse_lsw32_offset(int offset, size_t len) -{ - int closest_multiple_of_4; - int word_index; - - word_index = offset / 4; - closest_multiple_of_4 = word_index * 4; - offset -= closest_multiple_of_4; - word_index = (len / 4) - word_index - 1; - return word_index * 4 + offset; -} - static void adjust_for_msb_right_quirk(u64 *to_write, int *box_start_bit, int *box_end_bit, u8 *box_mask) { @@ -47,6 +26,48 @@ static void adjust_for_msb_right_quirk(u64 *to_write, int *box_start_bit, *box_end_bit = new_box_end_bit; } +/** + * calculate_box_addr - Determine physical location of byte in buffer + * @box: Index of byte within buffer seen as a logical big-endian big number + * @len: Size of buffer in bytes + * @quirks: mask of QUIRK_LSW32_IS_FIRST and QUIRK_LITTLE_ENDIAN + * + * Function interprets the buffer as a @len byte sized big number, and returns + * the physical offset of the @box logical octet within it. Internally, it + * treats the big number as groups of 4 bytes. If @len is not a multiple of 4, + * the last group may be shorter. + * + * @QUIRK_LSW32_IS_FIRST gives the ordering of groups of 4 octets relative to + * each other. If set, the most significant group of 4 octets is last in the + * buffer (and may be truncated if @len is not a multiple of 4). + * + * @QUIRK_LITTLE_ENDIAN gives the ordering of bytes within each group of 4. + * If set, the most significant byte is last in the group. If @len takes the + * form of 4k+3, the last group will only be able to represent 24 bits, and its + * most significant octet is byte 2. + * + * Return: the physical offset into the buffer corresponding to the logical box. + */ +static int calculate_box_addr(int box, size_t len, u8 quirks) +{ + size_t offset_of_group, offset_in_group, this_group = box / 4; + size_t group_size; + + if (quirks & QUIRK_LSW32_IS_FIRST) + offset_of_group = this_group * 4; + else + offset_of_group = len - ((this_group + 1) * 4); + + group_size = min(4, len - offset_of_group); + + if (quirks & QUIRK_LITTLE_ENDIAN) + offset_in_group = box - this_group * 4; + else + offset_in_group = group_size - (box - this_group * 4) - 1; + + return offset_of_group + offset_in_group; +} + /** * packing - Convert numbers (currently u64) between a packed and an unpacked * format. Unpacked means laid out in memory in the CPU's native @@ -157,12 +178,7 @@ int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, * effective addressing inside the pbuf (so it's not * logical any longer). */ - box_addr = pbuflen - box - 1; - if (quirks & QUIRK_LITTLE_ENDIAN) - box_addr = get_le_offset(box_addr); - if (quirks & QUIRK_LSW32_IS_FIRST) - box_addr = get_reverse_lsw32_offset(box_addr, - pbuflen); + box_addr = calculate_box_addr(box, pbuflen, quirks); if (op == UNPACK) { u64 pval; -- cgit v1.2.3 From 7263f64e16d90ded44ce211381b2def83db32fd9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Oct 2024 14:51:53 -0700 Subject: lib: packing: add pack() and unpack() wrappers over packing() Geert Uytterhoeven described packing() as "really bad API" because of not being able to enforce const correctness. The same function is used both when "pbuf" is input and "uval" is output, as in the other way around. Create 2 wrapper functions where const correctness can be ensured. Do ugly type casts inside, to be able to reuse packing() as currently implemented - which will _not_ modify the input argument. Also, take the opportunity to change the type of startbit and endbit to size_t - an unsigned type - in these new function prototypes. When int, an extra check for negative values is necessary. Hopefully, when packing() goes away completely, that check can be dropped. My concern is that code which does rely on the conditional directionality of packing() is harder to refactor without blowing up in size. So it may take a while to completely eliminate packing(). But let's make alternatives available for those who do not need that. Link: https://lore.kernel.org/netdev/20210223112003.2223332-1-geert+renesas@glider.be/ Signed-off-by: Vladimir Oltean Signed-off-by: Jacob Keller Reviewed-by: Przemek Kitszel Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-4-8373e551eae3@intel.com Signed-off-by: Jakub Kicinski --- include/linux/packing.h | 6 ++++++ lib/packing.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'lib') diff --git a/include/linux/packing.h b/include/linux/packing.h index 69baefebcd02..ea25cb93cc70 100644 --- a/include/linux/packing.h +++ b/include/linux/packing.h @@ -20,4 +20,10 @@ enum packing_op { int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, enum packing_op op, u8 quirks); +int pack(void *pbuf, const u64 *uval, size_t startbit, size_t endbit, + size_t pbuflen, u8 quirks); + +int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, + size_t pbuflen, u8 quirks); + #endif diff --git a/lib/packing.c b/lib/packing.c index 435236a914fe..2922db8a528c 100644 --- a/lib/packing.c +++ b/lib/packing.c @@ -90,6 +90,8 @@ static int calculate_box_addr(int box, size_t len, u8 quirks) * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and * QUIRK_MSB_ON_THE_RIGHT. * + * Note: this is deprecated, prefer to use pack() or unpack() in new code. + * * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming * correct usage, return code may be discarded. * If op is PACK, pbuf is modified. @@ -216,4 +218,56 @@ int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, } EXPORT_SYMBOL(packing); +/** + * pack - Pack u64 number into bitfield of buffer. + * + * @pbuf: Pointer to a buffer holding the packed value. + * @uval: Pointer to an u64 holding the unpacked value. + * @startbit: The index (in logical notation, compensated for quirks) where + * the packed value starts within pbuf. Must be larger than, or + * equal to, endbit. + * @endbit: The index (in logical notation, compensated for quirks) where + * the packed value ends within pbuf. Must be smaller than, or equal + * to, startbit. + * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming + * correct usage, return code may be discarded. The @pbuf memory will + * be modified on success. + */ +int pack(void *pbuf, const u64 *uval, size_t startbit, size_t endbit, + size_t pbuflen, u8 quirks) +{ + return packing(pbuf, (u64 *)uval, startbit, endbit, pbuflen, PACK, quirks); +} +EXPORT_SYMBOL(pack); + +/** + * unpack - Unpack u64 number from packed buffer. + * + * @pbuf: Pointer to a buffer holding the packed value. + * @uval: Pointer to an u64 holding the unpacked value. + * @startbit: The index (in logical notation, compensated for quirks) where + * the packed value starts within pbuf. Must be larger than, or + * equal to, endbit. + * @endbit: The index (in logical notation, compensated for quirks) where + * the packed value ends within pbuf. Must be smaller than, or equal + * to, startbit. + * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming + * correct usage, return code may be discarded. The @uval will be + * modified on success. + */ +int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, + size_t pbuflen, u8 quirks) +{ + return packing((void *)pbuf, uval, startbit, endbit, pbuflen, UNPACK, quirks); +} +EXPORT_SYMBOL(unpack); + MODULE_DESCRIPTION("Generic bitfield packing and unpacking"); -- cgit v1.2.3 From 28aec9ca29f05dabb835293acc6e202bf51b8092 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Oct 2024 14:51:54 -0700 Subject: lib: packing: duplicate pack() and unpack() implementations packing() is now used in some hot paths, and it would be good to get rid of some ifs and buts that depend on "op", to speed things up a little bit. With the main implementations now taking size_t endbit, we no longer have to check for negative values. Update the local integer variables to also be size_t to match. Signed-off-by: Vladimir Oltean Signed-off-by: Jacob Keller Reviewed-by: Przemek Kitszel Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-5-8373e551eae3@intel.com Signed-off-by: Jakub Kicinski --- include/linux/packing.h | 4 +- lib/packing.c | 221 ++++++++++++++++++++++++++++++------------------ 2 files changed, 143 insertions(+), 82 deletions(-) (limited to 'lib') diff --git a/include/linux/packing.h b/include/linux/packing.h index ea25cb93cc70..5d36dcd06f60 100644 --- a/include/linux/packing.h +++ b/include/linux/packing.h @@ -20,8 +20,8 @@ enum packing_op { int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, enum packing_op op, u8 quirks); -int pack(void *pbuf, const u64 *uval, size_t startbit, size_t endbit, - size_t pbuflen, u8 quirks); +int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, + u8 quirks); int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, size_t pbuflen, u8 quirks); diff --git a/lib/packing.c b/lib/packing.c index 2922db8a528c..500184530d07 100644 --- a/lib/packing.c +++ b/lib/packing.c @@ -9,11 +9,11 @@ #include #include -static void adjust_for_msb_right_quirk(u64 *to_write, int *box_start_bit, - int *box_end_bit, u8 *box_mask) +static void adjust_for_msb_right_quirk(u64 *to_write, size_t *box_start_bit, + size_t *box_end_bit, u8 *box_mask) { - int box_bit_width = *box_start_bit - *box_end_bit + 1; - int new_box_start_bit, new_box_end_bit; + size_t box_bit_width = *box_start_bit - *box_end_bit + 1; + size_t new_box_start_bit, new_box_end_bit; *to_write >>= *box_end_bit; *to_write = bitrev8(*to_write) >> (8 - box_bit_width); @@ -48,7 +48,7 @@ static void adjust_for_msb_right_quirk(u64 *to_write, int *box_start_bit, * * Return: the physical offset into the buffer corresponding to the logical box. */ -static int calculate_box_addr(int box, size_t len, u8 quirks) +static size_t calculate_box_addr(size_t box, size_t len, u8 quirks) { size_t offset_of_group, offset_in_group, this_group = box / 4; size_t group_size; @@ -69,13 +69,10 @@ static int calculate_box_addr(int box, size_t len, u8 quirks) } /** - * packing - Convert numbers (currently u64) between a packed and an unpacked - * format. Unpacked means laid out in memory in the CPU's native - * understanding of integers, while packed means anything else that - * requires translation. + * pack - Pack u64 number into bitfield of buffer. * * @pbuf: Pointer to a buffer holding the packed value. - * @uval: Pointer to an u64 holding the unpacked value. + * @uval: CPU-readable unpacked value to pack. * @startbit: The index (in logical notation, compensated for quirks) where * the packed value starts within pbuf. Must be larger than, or * equal to, endbit. @@ -83,58 +80,45 @@ static int calculate_box_addr(int box, size_t len, u8 quirks) * the packed value ends within pbuf. Must be smaller than, or equal * to, startbit. * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. - * @op: If PACK, then uval will be treated as const pointer and copied (packed) - * into pbuf, between startbit and endbit. - * If UNPACK, then pbuf will be treated as const pointer and the logical - * value between startbit and endbit will be copied (unpacked) to uval. * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and * QUIRK_MSB_ON_THE_RIGHT. * - * Note: this is deprecated, prefer to use pack() or unpack() in new code. - * * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming - * correct usage, return code may be discarded. - * If op is PACK, pbuf is modified. - * If op is UNPACK, uval is modified. + * correct usage, return code may be discarded. The @pbuf memory will + * be modified on success. */ -int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, - enum packing_op op, u8 quirks) +int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, + u8 quirks) { - /* Number of bits for storing "uval" - * also width of the field to access in the pbuf - */ - u64 value_width; /* Logical byte indices corresponding to the * start and end of the field. */ int plogical_first_u8, plogical_last_u8, box; + /* width of the field to access in the pbuf */ + u64 value_width; /* startbit is expected to be larger than endbit, and both are * expected to be within the logically addressable range of the buffer. */ - if (unlikely(startbit < endbit || startbit >= 8 * pbuflen || endbit < 0)) + if (unlikely(startbit < endbit || startbit >= 8 * pbuflen)) /* Invalid function call */ return -EINVAL; value_width = startbit - endbit + 1; - if (value_width > 64) + if (unlikely(value_width > 64)) return -ERANGE; /* Check if "uval" fits in "value_width" bits. * If value_width is 64, the check will fail, but any * 64-bit uval will surely fit. */ - if (op == PACK && value_width < 64 && (*uval >= (1ull << value_width))) + if (unlikely(value_width < 64 && uval >= (1ull << value_width))) /* Cannot store "uval" inside "value_width" bits. * Truncating "uval" is most certainly not desirable, * so simply erroring out is appropriate. */ return -ERANGE; - /* Initialize parameter */ - if (op == UNPACK) - *uval = 0; - /* Iterate through an idealistic view of the pbuf as an u64 with * no quirks, u8 by u8 (aligned at u8 boundaries), from high to low * logical bit significance. "box" denotes the current logical u8. @@ -144,11 +128,12 @@ int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, for (box = plogical_first_u8; box >= plogical_last_u8; box--) { /* Bit indices into the currently accessed 8-bit box */ - int box_start_bit, box_end_bit, box_addr; + size_t box_start_bit, box_end_bit, box_addr; u8 box_mask; /* Corresponding bits from the unpacked u64 parameter */ - int proj_start_bit, proj_end_bit; + size_t proj_start_bit, proj_end_bit; u64 proj_mask; + u64 pval; /* This u8 may need to be accessed in its entirety * (from bit 7 to bit 0), or not, depending on the @@ -182,44 +167,25 @@ int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, */ box_addr = calculate_box_addr(box, pbuflen, quirks); - if (op == UNPACK) { - u64 pval; - - /* Read from pbuf, write to uval */ - pval = ((u8 *)pbuf)[box_addr] & box_mask; - if (quirks & QUIRK_MSB_ON_THE_RIGHT) - adjust_for_msb_right_quirk(&pval, - &box_start_bit, - &box_end_bit, - &box_mask); - - pval >>= box_end_bit; - pval <<= proj_end_bit; - *uval &= ~proj_mask; - *uval |= pval; - } else { - u64 pval; - - /* Write to pbuf, read from uval */ - pval = (*uval) & proj_mask; - pval >>= proj_end_bit; - if (quirks & QUIRK_MSB_ON_THE_RIGHT) - adjust_for_msb_right_quirk(&pval, - &box_start_bit, - &box_end_bit, - &box_mask); - - pval <<= box_end_bit; - ((u8 *)pbuf)[box_addr] &= ~box_mask; - ((u8 *)pbuf)[box_addr] |= pval; - } + /* Write to pbuf, read from uval */ + pval = uval & proj_mask; + pval >>= proj_end_bit; + if (quirks & QUIRK_MSB_ON_THE_RIGHT) + adjust_for_msb_right_quirk(&pval, + &box_start_bit, + &box_end_bit, + &box_mask); + + pval <<= box_end_bit; + ((u8 *)pbuf)[box_addr] &= ~box_mask; + ((u8 *)pbuf)[box_addr] |= pval; } return 0; } -EXPORT_SYMBOL(packing); +EXPORT_SYMBOL(pack); /** - * pack - Pack u64 number into bitfield of buffer. + * unpack - Unpack u64 number from packed buffer. * * @pbuf: Pointer to a buffer holding the packed value. * @uval: Pointer to an u64 holding the unpacked value. @@ -234,18 +200,103 @@ EXPORT_SYMBOL(packing); * QUIRK_MSB_ON_THE_RIGHT. * * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming - * correct usage, return code may be discarded. The @pbuf memory will - * be modified on success. + * correct usage, return code may be discarded. The @uval will be + * modified on success. */ -int pack(void *pbuf, const u64 *uval, size_t startbit, size_t endbit, - size_t pbuflen, u8 quirks) +int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, + size_t pbuflen, u8 quirks) { - return packing(pbuf, (u64 *)uval, startbit, endbit, pbuflen, PACK, quirks); + /* Logical byte indices corresponding to the + * start and end of the field. + */ + int plogical_first_u8, plogical_last_u8, box; + /* width of the field to access in the pbuf */ + u64 value_width; + + /* startbit is expected to be larger than endbit, and both are + * expected to be within the logically addressable range of the buffer. + */ + if (unlikely(startbit < endbit || startbit >= 8 * pbuflen)) + /* Invalid function call */ + return -EINVAL; + + value_width = startbit - endbit + 1; + if (unlikely(value_width > 64)) + return -ERANGE; + + /* Initialize parameter */ + *uval = 0; + + /* Iterate through an idealistic view of the pbuf as an u64 with + * no quirks, u8 by u8 (aligned at u8 boundaries), from high to low + * logical bit significance. "box" denotes the current logical u8. + */ + plogical_first_u8 = startbit / 8; + plogical_last_u8 = endbit / 8; + + for (box = plogical_first_u8; box >= plogical_last_u8; box--) { + /* Bit indices into the currently accessed 8-bit box */ + size_t box_start_bit, box_end_bit, box_addr; + u8 box_mask; + /* Corresponding bits from the unpacked u64 parameter */ + size_t proj_start_bit, proj_end_bit; + u64 proj_mask; + u64 pval; + + /* This u8 may need to be accessed in its entirety + * (from bit 7 to bit 0), or not, depending on the + * input arguments startbit and endbit. + */ + if (box == plogical_first_u8) + box_start_bit = startbit % 8; + else + box_start_bit = 7; + if (box == plogical_last_u8) + box_end_bit = endbit % 8; + else + box_end_bit = 0; + + /* We have determined the box bit start and end. + * Now we calculate where this (masked) u8 box would fit + * in the unpacked (CPU-readable) u64 - the u8 box's + * projection onto the unpacked u64. Though the + * box is u8, the projection is u64 because it may fall + * anywhere within the unpacked u64. + */ + proj_start_bit = ((box * 8) + box_start_bit) - endbit; + proj_end_bit = ((box * 8) + box_end_bit) - endbit; + proj_mask = GENMASK_ULL(proj_start_bit, proj_end_bit); + box_mask = GENMASK_ULL(box_start_bit, box_end_bit); + + /* Determine the offset of the u8 box inside the pbuf, + * adjusted for quirks. The adjusted box_addr will be used for + * effective addressing inside the pbuf (so it's not + * logical any longer). + */ + box_addr = calculate_box_addr(box, pbuflen, quirks); + + /* Read from pbuf, write to uval */ + pval = ((u8 *)pbuf)[box_addr] & box_mask; + if (quirks & QUIRK_MSB_ON_THE_RIGHT) + adjust_for_msb_right_quirk(&pval, + &box_start_bit, + &box_end_bit, + &box_mask); + + pval >>= box_end_bit; + pval <<= proj_end_bit; + *uval &= ~proj_mask; + *uval |= pval; + } + return 0; } -EXPORT_SYMBOL(pack); +EXPORT_SYMBOL(unpack); /** - * unpack - Unpack u64 number from packed buffer. + * packing - Convert numbers (currently u64) between a packed and an unpacked + * format. Unpacked means laid out in memory in the CPU's native + * understanding of integers, while packed means anything else that + * requires translation. * * @pbuf: Pointer to a buffer holding the packed value. * @uval: Pointer to an u64 holding the unpacked value. @@ -256,18 +307,28 @@ EXPORT_SYMBOL(pack); * the packed value ends within pbuf. Must be smaller than, or equal * to, startbit. * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. + * @op: If PACK, then uval will be treated as const pointer and copied (packed) + * into pbuf, between startbit and endbit. + * If UNPACK, then pbuf will be treated as const pointer and the logical + * value between startbit and endbit will be copied (unpacked) to uval. * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and * QUIRK_MSB_ON_THE_RIGHT. * + * Note: this is deprecated, prefer to use pack() or unpack() in new code. + * * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming - * correct usage, return code may be discarded. The @uval will be - * modified on success. + * correct usage, return code may be discarded. + * If op is PACK, pbuf is modified. + * If op is UNPACK, uval is modified. */ -int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, - size_t pbuflen, u8 quirks) +int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, + enum packing_op op, u8 quirks) { - return packing((void *)pbuf, uval, startbit, endbit, pbuflen, UNPACK, quirks); + if (op == PACK) + return pack(pbuf, *uval, startbit, endbit, pbuflen, quirks); + + return unpack(pbuf, uval, startbit, endbit, pbuflen, quirks); } -EXPORT_SYMBOL(unpack); +EXPORT_SYMBOL(packing); MODULE_DESCRIPTION("Generic bitfield packing and unpacking"); -- cgit v1.2.3 From e9502ea6db8a457f0bc28a4a1f03517ad0547911 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 2 Oct 2024 14:51:55 -0700 Subject: lib: packing: add KUnit tests adapted from selftests Add 24 simple KUnit tests for the lib/packing.c pack() and unpack() APIs. The first 16 tests exercise all combinations of quirks with a simple magic number value on a 16-byte buffer. The remaining 8 tests cover non-multiple-of-4 buffer sizes. These tests were originally written by Vladimir as simple selftest functions. I adapted them to KUnit, refactoring them into a table driven approach. This will aid in adding additional tests in the future. Co-developed-by: Vladimir Oltean Signed-off-by: Vladimir Oltean Signed-off-by: Jacob Keller Reviewed-by: Przemek Kitszel Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-6-8373e551eae3@intel.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + lib/Kconfig | 12 +++ lib/Makefile | 1 + lib/packing_test.c | 258 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 272 insertions(+) create mode 100644 lib/packing_test.c (limited to 'lib') diff --git a/MAINTAINERS b/MAINTAINERS index c27f3190737f..af635dc60cfe 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17471,6 +17471,7 @@ S: Supported F: Documentation/core-api/packing.rst F: include/linux/packing.h F: lib/packing.c +F: lib/packing_test.c PADATA PARALLEL EXECUTION MECHANISM M: Steffen Klassert diff --git a/lib/Kconfig b/lib/Kconfig index b38849af6f13..50d85f38b569 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -40,6 +40,18 @@ config PACKING When in doubt, say N. +config PACKING_KUNIT_TEST + tristate "KUnit tests for packing library" if !KUNIT_ALL_TESTS + depends on PACKING && KUNIT + default KUNIT_ALL_TESTS + help + This builds KUnit tests for the packing library. + + For more information on KUnit and unit tests in general, + please refer to the KUnit documentation in Documentation/dev-tools/kunit/. + + When in doubt, say N. + config BITREVERSE tristate diff --git a/lib/Makefile b/lib/Makefile index 773adf88af41..811ba12c8cd0 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -154,6 +154,7 @@ obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o obj-$(CONFIG_BITREVERSE) += bitrev.o obj-$(CONFIG_LINEAR_RANGES) += linear_ranges.o obj-$(CONFIG_PACKING) += packing.o +obj-$(CONFIG_PACKING_KUNIT_TEST) += packing_test.o obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o obj-$(CONFIG_CRC16) += crc16.o obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o diff --git a/lib/packing_test.c b/lib/packing_test.c new file mode 100644 index 000000000000..4d07523dba29 --- /dev/null +++ b/lib/packing_test.c @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024, Vladimir Oltean + * Copyright (c) 2024, Intel Corporation. + */ +#include +#include + +struct packing_test_case { + const char *desc; + const u8 *pbuf; + size_t pbuf_size; + u64 uval; + size_t start_bit; + size_t end_bit; + u8 quirks; +}; + +#define NO_QUIRKS 0 + +/** + * PBUF - Initialize .pbuf and .pbuf_size + * @array: elements of constant physical buffer + * + * Initializes the .pbuf and .pbuf_size fields of a struct packing_test_case + * with a constant array of the specified elements. + */ +#define PBUF(array...) \ + .pbuf = (const u8[]){ array }, \ + .pbuf_size = sizeof((const u8 []){ array }) + +static const struct packing_test_case cases[] = { + /* These tests pack and unpack a magic 64-bit value + * (0xcafedeadbeefcafe) at a fixed logical offset (32) within an + * otherwise zero array of 128 bits (16 bytes). They test all possible + * bit layouts of the 128 bit buffer. + */ + { + .desc = "no quirks, 16 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0xca, 0xfe, 0xde, 0xad, + 0xbe, 0xef, 0xca, 0xfe, 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = NO_QUIRKS, + }, + { + .desc = "lsw32 first, 16 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0xbe, 0xef, 0xca, 0xfe, + 0xca, 0xfe, 0xde, 0xad, 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = QUIRK_LSW32_IS_FIRST, + }, + { + .desc = "little endian words, 16 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0xad, 0xde, 0xfe, 0xca, + 0xfe, 0xca, 0xef, 0xbe, 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = QUIRK_LITTLE_ENDIAN, + }, + { + .desc = "lsw32 first + little endian words, 16 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0xfe, 0xca, 0xef, 0xbe, + 0xad, 0xde, 0xfe, 0xca, 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, + }, + { + .desc = "msb right, 16 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0x53, 0x7f, 0x7b, 0xb5, + 0x7d, 0xf7, 0x53, 0x7f, 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = QUIRK_MSB_ON_THE_RIGHT, + }, + { + .desc = "msb right + lsw32 first, 16 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0x7d, 0xf7, 0x53, 0x7f, + 0x53, 0x7f, 0x7b, 0xb5, 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = QUIRK_MSB_ON_THE_RIGHT | QUIRK_LSW32_IS_FIRST, + }, + { + .desc = "msb right + little endian words, 16 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0xb5, 0x7b, 0x7f, 0x53, + 0x7f, 0x53, 0xf7, 0x7d, 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = QUIRK_MSB_ON_THE_RIGHT | QUIRK_LITTLE_ENDIAN, + }, + { + .desc = "msb right + lsw32 first + little endian words, 16 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0x7f, 0x53, 0xf7, 0x7d, + 0xb5, 0x7b, 0x7f, 0x53, 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = QUIRK_MSB_ON_THE_RIGHT | QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, + }, + /* These tests pack and unpack a magic 64-bit value + * (0xcafedeadbeefcafe) at a fixed logical offset (32) within an + * otherwise zero array of varying size from 18 bytes to 24 bytes. + */ + { + .desc = "no quirks, 18 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xca, 0xfe, + 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, 0x00, 0x00, + 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = NO_QUIRKS, + }, + { + .desc = "no quirks, 19 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xca, + 0xfe, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, 0x00, + 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = NO_QUIRKS, + }, + { + .desc = "no quirks, 20 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xca, 0xfe, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, + 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = NO_QUIRKS, + }, + { + .desc = "no quirks, 22 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xca, 0xfe, 0xde, 0xad, 0xbe, 0xef, + 0xca, 0xfe, 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = NO_QUIRKS, + }, + { + .desc = "no quirks, 24 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xca, 0xfe, 0xde, 0xad, + 0xbe, 0xef, 0xca, 0xfe, 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = NO_QUIRKS, + }, + { + .desc = "lsw32 first + little endian words, 18 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0xfe, 0xca, 0xef, 0xbe, + 0xad, 0xde, 0xfe, 0xca, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, + }, + { + .desc = "lsw32 first + little endian words, 19 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0xfe, 0xca, 0xef, 0xbe, + 0xad, 0xde, 0xfe, 0xca, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, + }, + { + .desc = "lsw32 first + little endian words, 20 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0xfe, 0xca, 0xef, 0xbe, + 0xad, 0xde, 0xfe, 0xca, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, + }, + { + .desc = "lsw32 first + little endian words, 22 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0xfe, 0xca, 0xef, 0xbe, + 0xad, 0xde, 0xfe, 0xca, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, + }, + { + .desc = "lsw32 first + little endian words, 24 bytes", + PBUF(0x00, 0x00, 0x00, 0x00, 0xfe, 0xca, 0xef, 0xbe, + 0xad, 0xde, 0xfe, 0xca, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), + .uval = 0xcafedeadbeefcafe, + .start_bit = 95, + .end_bit = 32, + .quirks = QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, + }, +}; + +KUNIT_ARRAY_PARAM_DESC(packing, cases, desc); + +static void packing_test_pack(struct kunit *test) +{ + const struct packing_test_case *params = test->param_value; + u8 *pbuf; + int err; + + pbuf = kunit_kzalloc(test, params->pbuf_size, GFP_KERNEL); + + err = pack(pbuf, params->uval, params->start_bit, params->end_bit, + params->pbuf_size, params->quirks); + + KUNIT_EXPECT_EQ_MSG(test, err, 0, "pack() returned %pe\n", ERR_PTR(err)); + KUNIT_EXPECT_MEMEQ(test, pbuf, params->pbuf, params->pbuf_size); +} + +static void packing_test_unpack(struct kunit *test) +{ + const struct packing_test_case *params = test->param_value; + u64 uval; + int err; + + err = unpack(params->pbuf, &uval, params->start_bit, params->end_bit, + params->pbuf_size, params->quirks); + KUNIT_EXPECT_EQ_MSG(test, err, 0, "unpack() returned %pe\n", ERR_PTR(err)); + KUNIT_EXPECT_EQ(test, uval, params->uval); +} + +static struct kunit_case packing_test_cases[] = { + KUNIT_CASE_PARAM(packing_test_pack, packing_gen_params), + KUNIT_CASE_PARAM(packing_test_unpack, packing_gen_params), + {}, +}; + +static struct kunit_suite packing_test_suite = { + .name = "packing", + .test_cases = packing_test_cases, +}; + +kunit_test_suite(packing_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("KUnit tests for packing library"); -- cgit v1.2.3 From fcd6dd91d0e8b151210c9532309e451c12e386e1 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 2 Oct 2024 14:51:56 -0700 Subject: lib: packing: add additional KUnit tests While reviewing the initial KUnit tests for lib/packing, Przemek pointed out that the test values have duplicate bytes in the input sequence. In addition, I noticed that the unit tests pack and unpack on a byte boundary, instead of crossing bytes. Thus, we lack good coverage of the corner cases of the API. Add additional unit tests to cover packing and unpacking byte buffers which do not have duplicate bytes in the unpacked value, and which pack and unpack to an unaligned offset. A careful reviewer may note the lack tests for QUIRK_MSB_ON_THE_RIGHT. This is because I found issues with that quirk during test implementation. This quirk will be fixed and the tests will be included in a future change. Signed-off-by: Jacob Keller Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-7-8373e551eae3@intel.com Signed-off-by: Jakub Kicinski --- lib/packing_test.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'lib') diff --git a/lib/packing_test.c b/lib/packing_test.c index 4d07523dba29..5d533e633e06 100644 --- a/lib/packing_test.c +++ b/lib/packing_test.c @@ -210,6 +210,88 @@ static const struct packing_test_case cases[] = { .end_bit = 32, .quirks = QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, }, + /* These tests pack and unpack a magic 64-bit value + * (0x1122334455667788) at an odd starting bit (43) within an + * otherwise zero array of 128 bits (16 bytes). They test all possible + * bit layouts of the 128 bit buffer. + */ + { + .desc = "no quirks, 16 bytes, non-aligned", + PBUF(0x00, 0x00, 0x00, 0x89, 0x11, 0x9a, 0x22, 0xab, + 0x33, 0xbc, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), + .uval = 0x1122334455667788, + .start_bit = 106, + .end_bit = 43, + .quirks = NO_QUIRKS, + }, + { + .desc = "lsw32 first, 16 bytes, non-aligned", + PBUF(0x00, 0x00, 0x00, 0x00, 0x33, 0xbc, 0x40, 0x00, + 0x11, 0x9a, 0x22, 0xab, 0x00, 0x00, 0x00, 0x89), + .uval = 0x1122334455667788, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_LSW32_IS_FIRST, + }, + { + .desc = "little endian words, 16 bytes, non-aligned", + PBUF(0x89, 0x00, 0x00, 0x00, 0xab, 0x22, 0x9a, 0x11, + 0x00, 0x40, 0xbc, 0x33, 0x00, 0x00, 0x00, 0x00), + .uval = 0x1122334455667788, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_LITTLE_ENDIAN, + }, + { + .desc = "lsw32 first + little endian words, 16 bytes, non-aligned", + PBUF(0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0xbc, 0x33, + 0xab, 0x22, 0x9a, 0x11, 0x89, 0x00, 0x00, 0x00), + .uval = 0x1122334455667788, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, + }, + /* These tests pack and unpack a u64 with all bits set + * (0xffffffffffffffff) at an odd starting bit (43) within an + * otherwise zero array of 128 bits (16 bytes). They test all possible + * bit layouts of the 128 bit buffer. + */ + { + .desc = "no quirks, 16 bytes, non-aligned, 0xff", + PBUF(0x00, 0x00, 0x07, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00), + .uval = 0xffffffffffffffff, + .start_bit = 106, + .end_bit = 43, + .quirks = NO_QUIRKS, + }, + { + .desc = "lsw32 first, 16 bytes, non-aligned, 0xff", + PBUF(0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xf8, 0x00, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x07, 0xff), + .uval = 0xffffffffffffffff, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_LSW32_IS_FIRST, + }, + { + .desc = "little endian words, 16 bytes, non-aligned, 0xff", + PBUF(0xff, 0x07, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0x00, 0xf8, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00), + .uval = 0xffffffffffffffff, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_LITTLE_ENDIAN, + }, + { + .desc = "lsw32 first + little endian words, 16 bytes, non-aligned, 0xff", + PBUF(0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00), + .uval = 0xffffffffffffffff, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, + }, }; KUNIT_ARRAY_PARAM_DESC(packing, cases, desc); -- cgit v1.2.3 From e7fdf5dddce5a4f4608787225973d38efb406425 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 2 Oct 2024 14:51:57 -0700 Subject: lib: packing: fix QUIRK_MSB_ON_THE_RIGHT behavior The QUIRK_MSB_ON_THE_RIGHT quirk is intended to modify pack() and unpack() so that the most significant bit of each byte in the packed layout is on the right. The way the quirk is currently implemented is broken whenever the packing code packs or unpacks any value that is not exactly a full byte. The broken behavior can occur when packing any values smaller than one byte, when packing any value that is not exactly a whole number of bytes, or when the packing is not aligned to a byte boundary. This quirk is documented in the following way: 1. Normally (no quirks), we would do it like this: :: 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 7 6 5 4 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 3 2 1 0 2. If QUIRK_MSB_ON_THE_RIGHT is set, we do it like this: :: 56 57 58 59 60 61 62 63 48 49 50 51 52 53 54 55 40 41 42 43 44 45 46 47 32 33 34 35 36 37 38 39 7 6 5 4 24 25 26 27 28 29 30 31 16 17 18 19 20 21 22 23 8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7 3 2 1 0 That is, QUIRK_MSB_ON_THE_RIGHT does not affect byte positioning, but inverts bit offsets inside a byte. Essentially, the mapping for physical bit offsets should be reserved for a given byte within the payload. This reversal should be fixed to the bytes in the packing layout. The logic to implement this quirk is handled within the adjust_for_msb_right_quirk() function. This function does not work properly when dealing with the bytes that contain only a partial amount of data. In particular, consider trying to pack or unpack the range 53-44. We should always be mapping the bits from the logical ordering to their physical ordering in the same way, regardless of what sequence of bits we are unpacking. This, we should grab the following logical bits: Logical: 55 54 53 52 51 50 49 48 47 45 44 43 42 41 40 39 ^ ^ ^ ^ ^ ^ ^ ^ ^ And pack them into the physical bits: Physical: 48 49 50 51 52 53 54 55 40 41 42 43 44 45 46 47 Logical: 48 49 50 51 52 53 44 45 46 47 ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ The current logic in adjust_for_msb_right_quirk is broken. I believe it is intending to map according to the following: Physical: 48 49 50 51 52 53 54 55 40 41 42 43 44 45 46 47 Logical: 48 49 50 51 52 53 44 45 46 47 ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ That is, it tries to keep the bits at the start and end of a packing together. This is wrong, as it makes the packing change what bit is being mapped to what based on which bits you're currently packing or unpacking. Worse, the actual calculations within adjust_for_msb_right_quirk don't make sense. Consider the case when packing the last byte of an unaligned packing. It might have a start bit of 7 and an end bit of 5. This would have a width of 3 bits. The new_start_bit will be calculated as the width - the box_end_bit - 1. This will underflow and produce a negative value, which will ultimate result in generating a new box_mask of all 0s. For any other values, the result of the calculations of the new_box_end_bit, new_box_start_bit, and the new box_mask will result in the exact same values for the box_end_bit, box_start_bit, and box_mask. This makes the calculations completely irrelevant. If box_end_bit is 0, and box_start_bit is 7, then the entire function of adjust_for_msb_right_quirk will boil down to just: *to_write = bitrev8(*to_write) The other adjustments are attempting (incorrectly) to keep the bits in the same place but just reversed. This is not the right behavior even if implemented correctly, as it leaves the mapping dependent on the bit values being packed or unpacked. Remove adjust_for_msb_right_quirk() and just use bitrev8 to reverse the byte order when interacting with the packed data. In particular, for packing, we need to reverse both the box_mask and the physical value being packed. This is done after shifting the value by box_end_bit so that the reversed mapping is always aligned to the physical buffer byte boundary. The box_mask is reversed as we're about to use it to clear any stale bits in the physical buffer at this block. For unpacking, we need to reverse the contents of the physical buffer *before* masking with the box_mask. This is critical, as the box_mask is a logical mask of the bit layout before handling the QUIRK_MSB_ON_THE_RIGHT. Add several new tests which cover this behavior. These tests will fail without the fix and pass afterwards. Note that no current drivers make use of QUIRK_MSB_ON_THE_RIGHT. I suspect this is why there have been no reports of this inconsistency before. Signed-off-by: Jacob Keller Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-8-8373e551eae3@intel.com Signed-off-by: Jakub Kicinski --- lib/packing.c | 39 +++++++++-------------------- lib/packing_test.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 28 deletions(-) (limited to 'lib') diff --git a/lib/packing.c b/lib/packing.c index 500184530d07..9efe57d347c7 100644 --- a/lib/packing.c +++ b/lib/packing.c @@ -9,23 +9,6 @@ #include #include -static void adjust_for_msb_right_quirk(u64 *to_write, size_t *box_start_bit, - size_t *box_end_bit, u8 *box_mask) -{ - size_t box_bit_width = *box_start_bit - *box_end_bit + 1; - size_t new_box_start_bit, new_box_end_bit; - - *to_write >>= *box_end_bit; - *to_write = bitrev8(*to_write) >> (8 - box_bit_width); - *to_write <<= *box_end_bit; - - new_box_end_bit = box_bit_width - *box_start_bit - 1; - new_box_start_bit = box_bit_width - *box_end_bit - 1; - *box_mask = GENMASK_ULL(new_box_start_bit, new_box_end_bit); - *box_start_bit = new_box_start_bit; - *box_end_bit = new_box_end_bit; -} - /** * calculate_box_addr - Determine physical location of byte in buffer * @box: Index of byte within buffer seen as a logical big-endian big number @@ -170,13 +153,13 @@ int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, /* Write to pbuf, read from uval */ pval = uval & proj_mask; pval >>= proj_end_bit; - if (quirks & QUIRK_MSB_ON_THE_RIGHT) - adjust_for_msb_right_quirk(&pval, - &box_start_bit, - &box_end_bit, - &box_mask); - pval <<= box_end_bit; + + if (quirks & QUIRK_MSB_ON_THE_RIGHT) { + pval = bitrev8(pval); + box_mask = bitrev8(box_mask); + } + ((u8 *)pbuf)[box_addr] &= ~box_mask; ((u8 *)pbuf)[box_addr] |= pval; } @@ -276,12 +259,12 @@ int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, box_addr = calculate_box_addr(box, pbuflen, quirks); /* Read from pbuf, write to uval */ - pval = ((u8 *)pbuf)[box_addr] & box_mask; + pval = ((u8 *)pbuf)[box_addr]; + if (quirks & QUIRK_MSB_ON_THE_RIGHT) - adjust_for_msb_right_quirk(&pval, - &box_start_bit, - &box_end_bit, - &box_mask); + pval = bitrev8(pval); + + pval &= box_mask; pval >>= box_end_bit; pval <<= proj_end_bit; diff --git a/lib/packing_test.c b/lib/packing_test.c index 5d533e633e06..015ad1180d23 100644 --- a/lib/packing_test.c +++ b/lib/packing_test.c @@ -251,6 +251,42 @@ static const struct packing_test_case cases[] = { .end_bit = 43, .quirks = QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, }, + { + .desc = "msb right, 16 bytes, non-aligned", + PBUF(0x00, 0x00, 0x00, 0x91, 0x88, 0x59, 0x44, 0xd5, + 0xcc, 0x3d, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00), + .uval = 0x1122334455667788, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_MSB_ON_THE_RIGHT, + }, + { + .desc = "msb right + lsw32 first, 16 bytes, non-aligned", + PBUF(0x00, 0x00, 0x00, 0x00, 0xcc, 0x3d, 0x02, 0x00, + 0x88, 0x59, 0x44, 0xd5, 0x00, 0x00, 0x00, 0x91), + .uval = 0x1122334455667788, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_MSB_ON_THE_RIGHT | QUIRK_LSW32_IS_FIRST, + }, + { + .desc = "msb right + little endian words, 16 bytes, non-aligned", + PBUF(0x91, 0x00, 0x00, 0x00, 0xd5, 0x44, 0x59, 0x88, + 0x00, 0x02, 0x3d, 0xcc, 0x00, 0x00, 0x00, 0x00), + .uval = 0x1122334455667788, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_MSB_ON_THE_RIGHT | QUIRK_LITTLE_ENDIAN, + }, + { + .desc = "msb right + lsw32 first + little endian words, 16 bytes, non-aligned", + PBUF(0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x3d, 0xcc, + 0xd5, 0x44, 0x59, 0x88, 0x91, 0x00, 0x00, 0x00), + .uval = 0x1122334455667788, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_MSB_ON_THE_RIGHT | QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, + }, /* These tests pack and unpack a u64 with all bits set * (0xffffffffffffffff) at an odd starting bit (43) within an * otherwise zero array of 128 bits (16 bytes). They test all possible @@ -292,6 +328,42 @@ static const struct packing_test_case cases[] = { .end_bit = 43, .quirks = QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, }, + { + .desc = "msb right, 16 bytes, non-aligned, 0xff", + PBUF(0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00), + .uval = 0xffffffffffffffff, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_MSB_ON_THE_RIGHT, + }, + { + .desc = "msb right + lsw32 first, 16 bytes, non-aligned, 0xff", + PBUF(0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x1f, 0x00, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0xe0, 0xff), + .uval = 0xffffffffffffffff, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_MSB_ON_THE_RIGHT | QUIRK_LSW32_IS_FIRST, + }, + { + .desc = "msb right + little endian words, 16 bytes, non-aligned, 0xff", + PBUF(0xff, 0xe0, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x1f, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00), + .uval = 0xffffffffffffffff, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_MSB_ON_THE_RIGHT | QUIRK_LITTLE_ENDIAN, + }, + { + .desc = "msb right + lsw32 first + little endian words, 16 bytes, non-aligned, 0xff", + PBUF(0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xe0, 0x00, 0x00), + .uval = 0xffffffffffffffff, + .start_bit = 106, + .end_bit = 43, + .quirks = QUIRK_MSB_ON_THE_RIGHT | QUIRK_LSW32_IS_FIRST | QUIRK_LITTLE_ENDIAN, + }, }; KUNIT_ARRAY_PARAM_DESC(packing, cases, desc); -- cgit v1.2.3 From fb02c7c8a5775456698851185882f542debd8350 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Oct 2024 14:51:58 -0700 Subject: lib: packing: use BITS_PER_BYTE instead of 8 This helps clarify what the 8 is for. Signed-off-by: Vladimir Oltean Signed-off-by: Jacob Keller Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-9-8373e551eae3@intel.com Signed-off-by: Jakub Kicinski --- lib/packing.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/lib/packing.c b/lib/packing.c index 9efe57d347c7..ac1f36c56a77 100644 --- a/lib/packing.c +++ b/lib/packing.c @@ -83,7 +83,7 @@ int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, /* startbit is expected to be larger than endbit, and both are * expected to be within the logically addressable range of the buffer. */ - if (unlikely(startbit < endbit || startbit >= 8 * pbuflen)) + if (unlikely(startbit < endbit || startbit >= BITS_PER_BYTE * pbuflen)) /* Invalid function call */ return -EINVAL; @@ -106,8 +106,8 @@ int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, * no quirks, u8 by u8 (aligned at u8 boundaries), from high to low * logical bit significance. "box" denotes the current logical u8. */ - plogical_first_u8 = startbit / 8; - plogical_last_u8 = endbit / 8; + plogical_first_u8 = startbit / BITS_PER_BYTE; + plogical_last_u8 = endbit / BITS_PER_BYTE; for (box = plogical_first_u8; box >= plogical_last_u8; box--) { /* Bit indices into the currently accessed 8-bit box */ @@ -123,11 +123,11 @@ int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, * input arguments startbit and endbit. */ if (box == plogical_first_u8) - box_start_bit = startbit % 8; + box_start_bit = startbit % BITS_PER_BYTE; else box_start_bit = 7; if (box == plogical_last_u8) - box_end_bit = endbit % 8; + box_end_bit = endbit % BITS_PER_BYTE; else box_end_bit = 0; @@ -138,8 +138,8 @@ int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, * box is u8, the projection is u64 because it may fall * anywhere within the unpacked u64. */ - proj_start_bit = ((box * 8) + box_start_bit) - endbit; - proj_end_bit = ((box * 8) + box_end_bit) - endbit; + proj_start_bit = ((box * BITS_PER_BYTE) + box_start_bit) - endbit; + proj_end_bit = ((box * BITS_PER_BYTE) + box_end_bit) - endbit; proj_mask = GENMASK_ULL(proj_start_bit, proj_end_bit); box_mask = GENMASK_ULL(box_start_bit, box_end_bit); @@ -199,7 +199,7 @@ int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, /* startbit is expected to be larger than endbit, and both are * expected to be within the logically addressable range of the buffer. */ - if (unlikely(startbit < endbit || startbit >= 8 * pbuflen)) + if (unlikely(startbit < endbit || startbit >= BITS_PER_BYTE * pbuflen)) /* Invalid function call */ return -EINVAL; @@ -214,8 +214,8 @@ int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, * no quirks, u8 by u8 (aligned at u8 boundaries), from high to low * logical bit significance. "box" denotes the current logical u8. */ - plogical_first_u8 = startbit / 8; - plogical_last_u8 = endbit / 8; + plogical_first_u8 = startbit / BITS_PER_BYTE; + plogical_last_u8 = endbit / BITS_PER_BYTE; for (box = plogical_first_u8; box >= plogical_last_u8; box--) { /* Bit indices into the currently accessed 8-bit box */ @@ -231,11 +231,11 @@ int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, * input arguments startbit and endbit. */ if (box == plogical_first_u8) - box_start_bit = startbit % 8; + box_start_bit = startbit % BITS_PER_BYTE; else box_start_bit = 7; if (box == plogical_last_u8) - box_end_bit = endbit % 8; + box_end_bit = endbit % BITS_PER_BYTE; else box_end_bit = 0; @@ -246,8 +246,8 @@ int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, * box is u8, the projection is u64 because it may fall * anywhere within the unpacked u64. */ - proj_start_bit = ((box * 8) + box_start_bit) - endbit; - proj_end_bit = ((box * 8) + box_end_bit) - endbit; + proj_start_bit = ((box * BITS_PER_BYTE) + box_start_bit) - endbit; + proj_end_bit = ((box * BITS_PER_BYTE) + box_end_bit) - endbit; proj_mask = GENMASK_ULL(proj_start_bit, proj_end_bit); box_mask = GENMASK_ULL(box_start_bit, box_end_bit); -- cgit v1.2.3 From 46e784e94b82d1d23a67530cfee7de883f5c65fc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Oct 2024 14:51:59 -0700 Subject: lib: packing: use GENMASK() for box_mask This is an u8, so using GENMASK_ULL() for unsigned long long is unnecessary. Signed-off-by: Vladimir Oltean Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-10-8373e551eae3@intel.com Signed-off-by: Jakub Kicinski --- lib/packing.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/packing.c b/lib/packing.c index ac1f36c56a77..793942745e34 100644 --- a/lib/packing.c +++ b/lib/packing.c @@ -141,7 +141,7 @@ int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, proj_start_bit = ((box * BITS_PER_BYTE) + box_start_bit) - endbit; proj_end_bit = ((box * BITS_PER_BYTE) + box_end_bit) - endbit; proj_mask = GENMASK_ULL(proj_start_bit, proj_end_bit); - box_mask = GENMASK_ULL(box_start_bit, box_end_bit); + box_mask = GENMASK(box_start_bit, box_end_bit); /* Determine the offset of the u8 box inside the pbuf, * adjusted for quirks. The adjusted box_addr will be used for @@ -249,7 +249,7 @@ int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, proj_start_bit = ((box * BITS_PER_BYTE) + box_start_bit) - endbit; proj_end_bit = ((box * BITS_PER_BYTE) + box_end_bit) - endbit; proj_mask = GENMASK_ULL(proj_start_bit, proj_end_bit); - box_mask = GENMASK_ULL(box_start_bit, box_end_bit); + box_mask = GENMASK(box_start_bit, box_end_bit); /* Determine the offset of the u8 box inside the pbuf, * adjusted for quirks. The adjusted box_addr will be used for -- cgit v1.2.3 From 6100da511bd21d3ccb0a350c429579e8995a830e Mon Sep 17 00:00:00 2001 From: Qianqiang Liu Date: Fri, 13 Sep 2024 22:07:42 +0800 Subject: crypto: lib/mpi - Fix an "Uninitialized scalar variable" issue The "err" variable may be returned without an initialized value. Fixes: 8e3a67f2de87 ("crypto: lib/mpi - Add error checks to extension") Signed-off-by: Qianqiang Liu Signed-off-by: Herbert Xu --- lib/crypto/mpi/mpi-mul.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/crypto/mpi/mpi-mul.c b/lib/crypto/mpi/mpi-mul.c index 892a246216b9..7e6ff1ce3e9b 100644 --- a/lib/crypto/mpi/mpi-mul.c +++ b/lib/crypto/mpi/mpi-mul.c @@ -21,7 +21,7 @@ int mpi_mul(MPI w, MPI u, MPI v) int usign, vsign, sign_product; int assign_wp = 0; mpi_ptr_t tmp_limb = NULL; - int err; + int err = 0; if (u->nlimbs < v->nlimbs) { /* Swap U and V. */ -- cgit v1.2.3 From 1405981bbba0796530311d07a67bf58228cc0fcc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 4 Oct 2024 14:00:12 +0300 Subject: lib: packing: catch kunit_kzalloc() failure in the pack() test kunit_kzalloc() may fail. Other call sites verify that this is the case, either using a direct comparison with the NULL pointer, or the KUNIT_ASSERT_NOT_NULL() or KUNIT_ASSERT_NOT_ERR_OR_NULL(). Pick KUNIT_ASSERT_NOT_NULL() as the error handling method that made most sense to me. It's an unlikely thing to happen, but at least we call __kunit_abort() instead of dereferencing this NULL pointer. Fixes: e9502ea6db8a ("lib: packing: add KUnit tests adapted from selftests") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241004110012.1323427-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- lib/packing_test.c | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/packing_test.c b/lib/packing_test.c index 015ad1180d23..b38ea43c03fd 100644 --- a/lib/packing_test.c +++ b/lib/packing_test.c @@ -375,6 +375,7 @@ static void packing_test_pack(struct kunit *test) int err; pbuf = kunit_kzalloc(test, params->pbuf_size, GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, pbuf); err = pack(pbuf, params->uval, params->start_bit, params->end_bit, params->pbuf_size, params->quirks); -- cgit v1.2.3 From 0ee4dcafda9576910559f0471a3d6891daf9ab92 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Wed, 18 Sep 2024 22:48:13 +0800 Subject: lib: devres: Simplify API devm_iounmap() implementation Simplify devm_iounmap() implementation by dedicated API devres_release() compared with current solution, namely, devres_destroy() + iounmap() devres_release() has the following advantages: - it is simpler if devm_iounmap()'s parameter @addr is valid, namely @addr was ever returned by one of devm_ioremap() variants. - it can avoid unnecessary iounmap(@addr) if @addr is not valid. Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20240918-fix_lib_devres-v1-1-e696ab5486e6@quicinc.com Signed-off-by: Greg Kroah-Hartman --- lib/devres.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/devres.c b/lib/devres.c index 4fc152de6d8b..68ffcd5d9358 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -115,9 +115,8 @@ EXPORT_SYMBOL(devm_ioremap_wc); */ void devm_iounmap(struct device *dev, void __iomem *addr) { - WARN_ON(devres_destroy(dev, devm_ioremap_release, devm_ioremap_match, + WARN_ON(devres_release(dev, devm_ioremap_release, devm_ioremap_match, (__force void *)addr)); - iounmap(addr); } EXPORT_SYMBOL(devm_iounmap); -- cgit v1.2.3 From 9bd133f05b1dca5ca4399a76d04d0f6f4d454e44 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Wed, 18 Sep 2024 22:48:14 +0800 Subject: lib: devres: Simplify API devm_ioport_unmap() implementation Simplify devm_ioport_unmap() implementation by dedicated API devres_release(), compared with current solution, namely ioport_unmap() + devres_destroy(), devres_release() has below advantages: - it is simpler if devm_ioport_unmap()'s parameter @addr was ever returned by devm_ioport_map(). - it can avoid unnecessary ioport_unmap(@addr) if @addr was not ever returned by devm_ioport_map(). Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20240918-fix_lib_devres-v1-2-e696ab5486e6@quicinc.com Signed-off-by: Greg Kroah-Hartman --- lib/devres.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/devres.c b/lib/devres.c index 68ffcd5d9358..73901160197e 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -307,8 +307,7 @@ EXPORT_SYMBOL(devm_ioport_map); */ void devm_ioport_unmap(struct device *dev, void __iomem *addr) { - ioport_unmap(addr); - WARN_ON(devres_destroy(dev, devm_ioport_map_release, + WARN_ON(devres_release(dev, devm_ioport_map_release, devm_ioport_map_match, (__force void *)addr)); } EXPORT_SYMBOL(devm_ioport_unmap); -- cgit v1.2.3 From 6ba55951e70bf7a8d89c03aa5612d2e0c81ded6d Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 10 Oct 2024 11:27:15 -0500 Subject: logic_pio: Constify fwnode_handle The fwnode_handle passed into find_io_range_by_fwnode() and logic_pio_trans_hwaddr() are not modified, so make them const. Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241010-dt-const-v1-2-87a51f558425@kernel.org Signed-off-by: Rob Herring (Arm) --- include/linux/logic_pio.h | 6 +++--- lib/logic_pio.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/include/linux/logic_pio.h b/include/linux/logic_pio.h index babf4e3c28ba..8f1a9408302f 100644 --- a/include/linux/logic_pio.h +++ b/include/linux/logic_pio.h @@ -17,7 +17,7 @@ enum { struct logic_pio_hwaddr { struct list_head list; - struct fwnode_handle *fwnode; + const struct fwnode_handle *fwnode; resource_size_t hw_start; resource_size_t io_start; resource_size_t size; /* range size populated */ @@ -110,8 +110,8 @@ void logic_outsl(unsigned long addr, const void *buffer, unsigned int count); #endif /* CONFIG_INDIRECT_PIO */ #define MMIO_UPPER_LIMIT (IO_SPACE_LIMIT - PIO_INDIRECT_SIZE) -struct logic_pio_hwaddr *find_io_range_by_fwnode(struct fwnode_handle *fwnode); -unsigned long logic_pio_trans_hwaddr(struct fwnode_handle *fwnode, +struct logic_pio_hwaddr *find_io_range_by_fwnode(const struct fwnode_handle *fwnode); +unsigned long logic_pio_trans_hwaddr(const struct fwnode_handle *fwnode, resource_size_t hw_addr, resource_size_t size); int logic_pio_register_range(struct logic_pio_hwaddr *newrange); void logic_pio_unregister_range(struct logic_pio_hwaddr *range); diff --git a/lib/logic_pio.c b/lib/logic_pio.c index 2ea564a40064..e29496a38d06 100644 --- a/lib/logic_pio.c +++ b/lib/logic_pio.c @@ -122,7 +122,7 @@ void logic_pio_unregister_range(struct logic_pio_hwaddr *range) * * Traverse the io_range_list to find the registered node for @fwnode. */ -struct logic_pio_hwaddr *find_io_range_by_fwnode(struct fwnode_handle *fwnode) +struct logic_pio_hwaddr *find_io_range_by_fwnode(const struct fwnode_handle *fwnode) { struct logic_pio_hwaddr *range, *found_range = NULL; @@ -186,7 +186,7 @@ resource_size_t logic_pio_to_hwaddr(unsigned long pio) * * Returns Logical PIO value if successful, ~0UL otherwise */ -unsigned long logic_pio_trans_hwaddr(struct fwnode_handle *fwnode, +unsigned long logic_pio_trans_hwaddr(const struct fwnode_handle *fwnode, resource_size_t addr, resource_size_t size) { struct logic_pio_hwaddr *range; -- cgit v1.2.3 From a0ae95040853aa05dc006f4b16f8c82c6f9dd9e4 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 7 Oct 2024 18:49:52 +0200 Subject: debugobjects: Delete a piece of redundant code The statically allocated objects are all located in obj_static_pool[], the whole memory of obj_static_pool[] will be reclaimed later. Therefore, there is no need to split the remaining statically nodes in list obj_pool into isolated ones, no one will use them anymore. Just write INIT_HLIST_HEAD(&obj_pool) is enough. Since hlist_move_list() directly discards the old list, even this can be omitted. Signed-off-by: Zhen Lei Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240911083521.2257-2-thunder.leizhen@huawei.com Link: https://lore.kernel.org/all/20241007164913.009849239@linutronix.de --- lib/debugobjects.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 5ce473ad499b..df48acc5d4b3 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -1325,10 +1325,10 @@ static int __init debug_objects_replace_static_objects(void) * active object references. */ - /* Remove the statically allocated objects from the pool */ - hlist_for_each_entry_safe(obj, tmp, &obj_pool, node) - hlist_del(&obj->node); - /* Move the allocated objects to the pool */ + /* + * Replace the statically allocated objects list with the allocated + * objects list. + */ hlist_move_list(&objects, &obj_pool); /* Replace the active object references */ -- cgit v1.2.3 From 813fd07858cfb410bc9574c05b7922185f65989b Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 7 Oct 2024 18:49:53 +0200 Subject: debugobjects: Collect newly allocated objects in a list to reduce lock contention Collect the newly allocated debug objects in a list outside the lock, so that the lock held time and the potential lock contention is reduced. Signed-off-by: Zhen Lei Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240911083521.2257-3-thunder.leizhen@huawei.com Link: https://lore.kernel.org/all/20241007164913.073653668@linutronix.de --- lib/debugobjects.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index df48acc5d4b3..798ce5ac1ffe 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -161,23 +161,25 @@ static void fill_pool(void) return; while (READ_ONCE(obj_pool_free) < debug_objects_pool_min_level) { - struct debug_obj *new[ODEBUG_BATCH_SIZE]; + struct debug_obj *new, *last = NULL; + HLIST_HEAD(head); int cnt; for (cnt = 0; cnt < ODEBUG_BATCH_SIZE; cnt++) { - new[cnt] = kmem_cache_zalloc(obj_cache, gfp); - if (!new[cnt]) + new = kmem_cache_zalloc(obj_cache, gfp); + if (!new) break; + hlist_add_head(&new->node, &head); + if (!last) + last = new; } if (!cnt) return; raw_spin_lock_irqsave(&pool_lock, flags); - while (cnt) { - hlist_add_head(&new[--cnt]->node, &obj_pool); - debug_objects_allocated++; - WRITE_ONCE(obj_pool_free, obj_pool_free + 1); - } + hlist_splice_init(&head, &last->node, &obj_pool); + debug_objects_allocated += cnt; + WRITE_ONCE(obj_pool_free, obj_pool_free + cnt); raw_spin_unlock_irqrestore(&pool_lock, flags); } } -- cgit v1.2.3 From 55fb412ef7d0c33226fcac4ebc68c60282e5f150 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:49:54 +0200 Subject: debugobjects: Dont destroy kmem cache in init() debug_objects_mem_init() is invoked from mm_core_init() before work queues are available. If debug_objects_mem_init() destroys the kmem cache in the error path it causes an Oops in __queue_work(): Oops: Oops: 0000 [#1] PREEMPT SMP PTI RIP: 0010:__queue_work+0x35/0x6a0 queue_work_on+0x66/0x70 flush_all_cpus_locked+0xdf/0x1a0 __kmem_cache_shutdown+0x2f/0x340 kmem_cache_destroy+0x4e/0x150 mm_core_init+0x9e/0x120 start_kernel+0x298/0x800 x86_64_start_reservations+0x18/0x30 x86_64_start_kernel+0xc5/0xe0 common_startup_64+0x12c/0x138 Further the object cache pointer is used in various places to check for early boot operation. It is exposed before the replacments for the static boot time objects are allocated and the self test operates on it. This can be avoided by: 1) Running the self test with the static boot objects 2) Exposing it only after the replacement objects have been added to the pool. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241007164913.137021337@linutronix.de --- lib/debugobjects.c | 68 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 798ce5ac1ffe..1f6bf0fb0b20 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -1211,7 +1211,7 @@ static __initconst const struct debug_obj_descr descr_type_test = { static __initdata struct self_test obj = { .static_init = 0 }; -static void __init debug_objects_selftest(void) +static bool __init debug_objects_selftest(void) { int fixups, oldfixups, warnings, oldwarnings; unsigned long flags; @@ -1280,9 +1280,10 @@ out: descr_test = NULL; local_irq_restore(flags); + return !!debug_objects_enabled; } #else -static inline void debug_objects_selftest(void) { } +static inline bool debug_objects_selftest(void) { return true; } #endif /* @@ -1302,18 +1303,21 @@ void __init debug_objects_early_init(void) } /* - * Convert the statically allocated objects to dynamic ones: + * Convert the statically allocated objects to dynamic ones. + * debug_objects_mem_init() is called early so only one CPU is up and + * interrupts are disabled, which means it is safe to replace the active + * object references. */ -static int __init debug_objects_replace_static_objects(void) +static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache) { struct debug_bucket *db = obj_hash; - struct hlist_node *tmp; struct debug_obj *obj, *new; + struct hlist_node *tmp; HLIST_HEAD(objects); int i, cnt = 0; for (i = 0; i < ODEBUG_POOL_SIZE; i++) { - obj = kmem_cache_zalloc(obj_cache, GFP_KERNEL); + obj = kmem_cache_zalloc(cache, GFP_KERNEL); if (!obj) goto free; hlist_add_head(&obj->node, &objects); @@ -1321,12 +1325,6 @@ static int __init debug_objects_replace_static_objects(void) debug_objects_allocated += i; - /* - * debug_objects_mem_init() is now called early that only one CPU is up - * and interrupts have been disabled, so it is safe to replace the - * active object references. - */ - /* * Replace the statically allocated objects list with the allocated * objects list. @@ -1347,15 +1345,14 @@ static int __init debug_objects_replace_static_objects(void) } } - pr_debug("%d of %d active objects replaced\n", - cnt, obj_pool_used); - return 0; + pr_debug("%d of %d active objects replaced\n", cnt, obj_pool_used); + return true; free: hlist_for_each_entry_safe(obj, tmp, &objects, node) { hlist_del(&obj->node); - kmem_cache_free(obj_cache, obj); + kmem_cache_free(cache, obj); } - return -ENOMEM; + return false; } /* @@ -1366,6 +1363,7 @@ free: */ void __init debug_objects_mem_init(void) { + struct kmem_cache *cache; int cpu, extras; if (!debug_objects_enabled) @@ -1380,29 +1378,33 @@ void __init debug_objects_mem_init(void) for_each_possible_cpu(cpu) INIT_HLIST_HEAD(&per_cpu(percpu_obj_pool.free_objs, cpu)); - obj_cache = kmem_cache_create("debug_objects_cache", - sizeof (struct debug_obj), 0, - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE, - NULL); + if (!debug_objects_selftest()) + return; - if (!obj_cache || debug_objects_replace_static_objects()) { + cache = kmem_cache_create("debug_objects_cache", sizeof (struct debug_obj), 0, + SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE, NULL); + + if (!cache || !debug_objects_replace_static_objects(cache)) { debug_objects_enabled = 0; - kmem_cache_destroy(obj_cache); - pr_warn("out of memory.\n"); + pr_warn("Out of memory.\n"); return; - } else - debug_objects_selftest(); - -#ifdef CONFIG_HOTPLUG_CPU - cpuhp_setup_state_nocalls(CPUHP_DEBUG_OBJ_DEAD, "object:offline", NULL, - object_cpu_offline); -#endif + } /* - * Increase the thresholds for allocating and freeing objects - * according to the number of possible CPUs available in the system. + * Adjust the thresholds for allocating and freeing objects + * according to the number of possible CPUs available in the + * system. */ extras = num_possible_cpus() * ODEBUG_BATCH_SIZE; debug_objects_pool_size += extras; debug_objects_pool_min_level += extras; + + /* Everything worked. Expose the cache */ + obj_cache = cache; + +#ifdef CONFIG_HOTPLUG_CPU + cpuhp_setup_state_nocalls(CPUHP_DEBUG_OBJ_DEAD, "object:offline", NULL, + object_cpu_offline); +#endif + return; } -- cgit v1.2.3 From 3f397bf9553d9f142fbfaa19713e0350803fcc31 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:49:56 +0200 Subject: debugobjects: Remove pointless hlist initialization It's BSS zero initialized. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.200379308@linutronix.de --- lib/debugobjects.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 1f6bf0fb0b20..9867412d7946 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -1364,20 +1364,11 @@ free: void __init debug_objects_mem_init(void) { struct kmem_cache *cache; - int cpu, extras; + int extras; if (!debug_objects_enabled) return; - /* - * Initialize the percpu object pools - * - * Initialization is not strictly necessary, but was done for - * completeness. - */ - for_each_possible_cpu(cpu) - INIT_HLIST_HEAD(&per_cpu(percpu_obj_pool.free_objs, cpu)); - if (!debug_objects_selftest()) return; -- cgit v1.2.3 From a2a702383e8baa22ee66ee60f1e036835a1ef42e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:49:57 +0200 Subject: debugobjects: Dont free objects directly on CPU hotplug Freeing the per CPU pool of the unplugged CPU directly is suboptimal as the objects can be reused in the real pool if there is room. Aside of that this gets the accounting wrong. Use the regular free path, which allows reuse and has the accounting correct. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.263960570@linutronix.de --- lib/debugobjects.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 9867412d7946..a3d4c54f0839 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -430,27 +430,28 @@ static void free_object(struct debug_obj *obj) } #ifdef CONFIG_HOTPLUG_CPU -static int object_cpu_offline(unsigned int cpu) +static void put_objects(struct hlist_head *list) { - struct debug_percpu_free *percpu_pool; struct hlist_node *tmp; struct debug_obj *obj; - unsigned long flags; - /* Remote access is safe as the CPU is dead already */ - percpu_pool = per_cpu_ptr(&percpu_obj_pool, cpu); - hlist_for_each_entry_safe(obj, tmp, &percpu_pool->free_objs, node) { + /* + * Using free_object() puts the objects into reuse or schedules + * them for freeing and it get's all the accounting correct. + */ + hlist_for_each_entry_safe(obj, tmp, list, node) { hlist_del(&obj->node); - kmem_cache_free(obj_cache, obj); + free_object(obj); } +} - raw_spin_lock_irqsave(&pool_lock, flags); - obj_pool_used -= percpu_pool->obj_free; - debug_objects_freed += percpu_pool->obj_free; - raw_spin_unlock_irqrestore(&pool_lock, flags); - - percpu_pool->obj_free = 0; +static int object_cpu_offline(unsigned int cpu) +{ + /* Remote access is safe as the CPU is dead already */ + struct debug_percpu_free *pcp = per_cpu_ptr(&percpu_obj_pool, cpu); + put_objects(&pcp->free_objs); + pcp->obj_free = 0; return 0; } #endif -- cgit v1.2.3 From 49968cf18154d6391e84c68520149232057ca62c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:49:58 +0200 Subject: debugobjects: Reuse put_objects() on OOM Reuse the helper function instead of having a open coded copy. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.326834268@linutronix.de --- lib/debugobjects.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index a3d4c54f0839..2c866d0bdd68 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -429,7 +429,6 @@ static void free_object(struct debug_obj *obj) } } -#ifdef CONFIG_HOTPLUG_CPU static void put_objects(struct hlist_head *list) { struct hlist_node *tmp; @@ -445,6 +444,7 @@ static void put_objects(struct hlist_head *list) } } +#ifdef CONFIG_HOTPLUG_CPU static int object_cpu_offline(unsigned int cpu) { /* Remote access is safe as the CPU is dead already */ @@ -456,31 +456,19 @@ static int object_cpu_offline(unsigned int cpu) } #endif -/* - * We run out of memory. That means we probably have tons of objects - * allocated. - */ +/* Out of memory. Free all objects from hash */ static void debug_objects_oom(void) { struct debug_bucket *db = obj_hash; - struct hlist_node *tmp; HLIST_HEAD(freelist); - struct debug_obj *obj; - unsigned long flags; - int i; pr_warn("Out of memory. ODEBUG disabled\n"); - for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { - raw_spin_lock_irqsave(&db->lock, flags); - hlist_move_list(&db->list, &freelist); - raw_spin_unlock_irqrestore(&db->lock, flags); + for (int i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { + scoped_guard(raw_spinlock_irqsave, &db->lock) + hlist_move_list(&db->list, &freelist); - /* Now free them */ - hlist_for_each_entry_safe(obj, tmp, &freelist, node) { - hlist_del(&obj->node); - free_object(obj); - } + put_objects(&freelist); } } -- cgit v1.2.3 From 241463f4fdcc845fa4a174e63a23940305cb6691 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:49:59 +0200 Subject: debugobjects: Remove pointless debug printk It has zero value. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.390511021@linutronix.de --- lib/debugobjects.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 2c866d0bdd68..fc54115818a1 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -1303,7 +1303,7 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache struct debug_obj *obj, *new; struct hlist_node *tmp; HLIST_HEAD(objects); - int i, cnt = 0; + int i; for (i = 0; i < ODEBUG_POOL_SIZE; i++) { obj = kmem_cache_zalloc(cache, GFP_KERNEL); @@ -1330,11 +1330,8 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache /* copy object data */ *new = *obj; hlist_add_head(&new->node, &db->list); - cnt++; } } - - pr_debug("%d of %d active objects replaced\n", cnt, obj_pool_used); return true; free: hlist_for_each_entry_safe(obj, tmp, &objects, node) { -- cgit v1.2.3 From 49a5cb827d3d625944d48518acec4e4b9d61e1da Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:01 +0200 Subject: debugobjects: Provide and use free_object_list() Move the loop to free a list of objects into a helper function so it can be reused later. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241007164913.453912357@linutronix.de --- lib/debugobjects.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index fc54115818a1..6ccdfebcd807 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -125,6 +125,20 @@ static const char *obj_states[ODEBUG_STATE_MAX] = { [ODEBUG_STATE_NOTAVAILABLE] = "not available", }; +static void free_object_list(struct hlist_head *head) +{ + struct hlist_node *tmp; + struct debug_obj *obj; + int cnt = 0; + + hlist_for_each_entry_safe(obj, tmp, head, node) { + hlist_del(&obj->node); + kmem_cache_free(obj_cache, obj); + cnt++; + } + debug_objects_freed += cnt; +} + static void fill_pool(void) { gfp_t gfp = __GFP_HIGH | __GFP_NOWARN; @@ -286,7 +300,6 @@ init_obj: */ static void free_obj_work(struct work_struct *work) { - struct hlist_node *tmp; struct debug_obj *obj; unsigned long flags; HLIST_HEAD(tofree); @@ -323,15 +336,11 @@ free_objs: */ if (obj_nr_tofree) { hlist_move_list(&obj_to_free, &tofree); - debug_objects_freed += obj_nr_tofree; WRITE_ONCE(obj_nr_tofree, 0); } raw_spin_unlock_irqrestore(&pool_lock, flags); - hlist_for_each_entry_safe(obj, tmp, &tofree, node) { - hlist_del(&obj->node); - kmem_cache_free(obj_cache, obj); - } + free_object_list(&tofree); } static void __free_object(struct debug_obj *obj) @@ -1334,6 +1343,7 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache } return true; free: + /* Can't use free_object_list() as the cache is not populated yet */ hlist_for_each_entry_safe(obj, tmp, &objects, node) { hlist_del(&obj->node); kmem_cache_free(cache, obj); -- cgit v1.2.3 From 661cc28b523d4616a322c8f82f06ec7880192060 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:02 +0200 Subject: debugobjects: Make debug_objects_enabled bool Make it what it is. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.518175013@linutronix.de --- lib/debugobjects.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 6ccdfebcd807..0d69095394ee 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -82,7 +82,7 @@ static int __data_racy debug_objects_maxchain __read_mostly; static int __data_racy __maybe_unused debug_objects_maxchecked __read_mostly; static int __data_racy debug_objects_fixups __read_mostly; static int __data_racy debug_objects_warnings __read_mostly; -static int __data_racy debug_objects_enabled __read_mostly +static bool __data_racy debug_objects_enabled __read_mostly = CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT; static int debug_objects_pool_size __ro_after_init = ODEBUG_POOL_SIZE; @@ -103,17 +103,16 @@ static DECLARE_DELAYED_WORK(debug_obj_work, free_obj_work); static int __init enable_object_debug(char *str) { - debug_objects_enabled = 1; + debug_objects_enabled = true; return 0; } +early_param("debug_objects", enable_object_debug); static int __init disable_object_debug(char *str) { - debug_objects_enabled = 0; + debug_objects_enabled = false; return 0; } - -early_param("debug_objects", enable_object_debug); early_param("no_debug_objects", disable_object_debug); static const char *obj_states[ODEBUG_STATE_MAX] = { @@ -592,7 +591,7 @@ static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket } /* Out of memory. Do the cleanup outside of the locked region */ - debug_objects_enabled = 0; + debug_objects_enabled = false; return NULL; } @@ -1194,7 +1193,7 @@ check_results(void *addr, enum debug_obj_state state, int fixups, int warnings) out: raw_spin_unlock_irqrestore(&db->lock, flags); if (res) - debug_objects_enabled = 0; + debug_objects_enabled = false; return res; } @@ -1278,7 +1277,7 @@ out: descr_test = NULL; local_irq_restore(flags); - return !!debug_objects_enabled; + return debug_objects_enabled; } #else static inline bool debug_objects_selftest(void) { return true; } @@ -1372,7 +1371,7 @@ void __init debug_objects_mem_init(void) SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE, NULL); if (!cache || !debug_objects_replace_static_objects(cache)) { - debug_objects_enabled = 0; + debug_objects_enabled = false; pr_warn("Out of memory.\n"); return; } -- cgit v1.2.3 From d8c6cd3a5c8008f5d42c7763a93b43d7f3a40e94 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 7 Oct 2024 18:50:03 +0200 Subject: debugobjects: Reduce parallel pool fill attempts The contention on the global pool_lock can be massive when the global pool needs to be refilled and many CPUs try to handle this. Address this by: - splitting the refill from free list and allocation. Refill from free list has no constraints vs. the context on RT, so it can be tried outside of the RT specific preemptible() guard - Let only one CPU handle the free list - Let only one CPU do allocations unless the pool level is below half of the minimum fill level. Suggested-by: Thomas Gleixner Signed-off-by: Zhen Lei Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240911083521.2257-4-thunder.leizhen@huawei.com- Link: https://lore.kernel.org/all/20241007164913.582118421@linutronix.de -- lib/debugobjects.c | 84 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 25 deletions(-) --- lib/debugobjects.c | 84 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 25 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 0d69095394ee..64a72d419136 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -138,14 +138,10 @@ static void free_object_list(struct hlist_head *head) debug_objects_freed += cnt; } -static void fill_pool(void) +static void fill_pool_from_freelist(void) { - gfp_t gfp = __GFP_HIGH | __GFP_NOWARN; + static unsigned long state; struct debug_obj *obj; - unsigned long flags; - - if (likely(READ_ONCE(obj_pool_free) >= debug_objects_pool_min_level)) - return; /* * Reuse objs from the global obj_to_free list; they will be @@ -154,32 +150,58 @@ static void fill_pool(void) * obj_nr_tofree is checked locklessly; the READ_ONCE() pairs with * the WRITE_ONCE() in pool_lock critical sections. */ - if (READ_ONCE(obj_nr_tofree)) { - raw_spin_lock_irqsave(&pool_lock, flags); - /* - * Recheck with the lock held as the worker thread might have - * won the race and freed the global free list already. - */ - while (obj_nr_tofree && (obj_pool_free < debug_objects_pool_min_level)) { - obj = hlist_entry(obj_to_free.first, typeof(*obj), node); - hlist_del(&obj->node); - WRITE_ONCE(obj_nr_tofree, obj_nr_tofree - 1); - hlist_add_head(&obj->node, &obj_pool); - WRITE_ONCE(obj_pool_free, obj_pool_free + 1); - } - raw_spin_unlock_irqrestore(&pool_lock, flags); + if (!READ_ONCE(obj_nr_tofree)) + return; + + /* + * Prevent the context from being scheduled or interrupted after + * setting the state flag; + */ + guard(irqsave)(); + + /* + * Avoid lock contention on &pool_lock and avoid making the cache + * line exclusive by testing the bit before attempting to set it. + */ + if (test_bit(0, &state) || test_and_set_bit(0, &state)) + return; + + guard(raw_spinlock)(&pool_lock); + /* + * Recheck with the lock held as the worker thread might have + * won the race and freed the global free list already. + */ + while (obj_nr_tofree && (obj_pool_free < debug_objects_pool_min_level)) { + obj = hlist_entry(obj_to_free.first, typeof(*obj), node); + hlist_del(&obj->node); + WRITE_ONCE(obj_nr_tofree, obj_nr_tofree - 1); + hlist_add_head(&obj->node, &obj_pool); + WRITE_ONCE(obj_pool_free, obj_pool_free + 1); } + clear_bit(0, &state); +} - if (unlikely(!obj_cache)) +static void fill_pool(void) +{ + static atomic_t cpus_allocating; + + /* + * Avoid allocation and lock contention when: + * - One other CPU is already allocating + * - the global pool has not reached the critical level yet + */ + if (READ_ONCE(obj_pool_free) > (debug_objects_pool_min_level / 2) && + atomic_read(&cpus_allocating)) return; + atomic_inc(&cpus_allocating); while (READ_ONCE(obj_pool_free) < debug_objects_pool_min_level) { struct debug_obj *new, *last = NULL; HLIST_HEAD(head); int cnt; for (cnt = 0; cnt < ODEBUG_BATCH_SIZE; cnt++) { - new = kmem_cache_zalloc(obj_cache, gfp); + new = kmem_cache_zalloc(obj_cache, __GFP_HIGH | __GFP_NOWARN); if (!new) break; hlist_add_head(&new->node, &head); @@ -187,14 +209,14 @@ static void fill_pool(void) last = new; } if (!cnt) - return; + break; - raw_spin_lock_irqsave(&pool_lock, flags); + guard(raw_spinlock_irqsave)(&pool_lock); hlist_splice_init(&head, &last->node, &obj_pool); debug_objects_allocated += cnt; WRITE_ONCE(obj_pool_free, obj_pool_free + cnt); - raw_spin_unlock_irqrestore(&pool_lock, flags); } + atomic_dec(&cpus_allocating); } /* @@ -597,6 +619,18 @@ static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket static void debug_objects_fill_pool(void) { + if (unlikely(!obj_cache)) + return; + + if (likely(READ_ONCE(obj_pool_free) >= debug_objects_pool_min_level)) + return; + + /* Try reusing objects from obj_to_free_list */ + fill_pool_from_freelist(); + + if (likely(READ_ONCE(obj_pool_free) >= debug_objects_pool_min_level)) + return; + /* * On RT enabled kernels the pool refill must happen in preemptible * context -- for !RT kernels we rely on the fact that spinlock_t and -- cgit v1.2.3 From e18328ff705270d1e53889ea9d79dce86d1b8786 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:04 +0200 Subject: debugobjects: Move pools into a datastructure The contention on the global pool lock can be reduced by strict batch processing where batches of objects are moved from one list head to another instead of moving them object by object. This also reduces the cache footprint because it avoids the list walk and dirties at maximum three cache lines instead of potentially up to eighteen. To prepare for that, move the hlist head and related counters into a struct. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.646171170@linutronix.de --- lib/debugobjects.c | 140 +++++++++++++++++++++++++++++------------------------ 1 file changed, 78 insertions(+), 62 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 64a72d419136..fcba13db90ed 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -52,6 +52,11 @@ struct debug_percpu_free { int obj_free; }; +struct obj_pool { + struct hlist_head objects; + unsigned int cnt; +} ____cacheline_aligned; + static DEFINE_PER_CPU(struct debug_percpu_free, percpu_obj_pool); static struct debug_bucket obj_hash[ODEBUG_HASH_SIZE]; @@ -60,8 +65,8 @@ static struct debug_obj obj_static_pool[ODEBUG_POOL_SIZE] __initdata; static DEFINE_RAW_SPINLOCK(pool_lock); -static HLIST_HEAD(obj_pool); -static HLIST_HEAD(obj_to_free); +static struct obj_pool pool_global; +static struct obj_pool pool_to_free; /* * Because of the presence of percpu free pools, obj_pool_free will @@ -71,12 +76,9 @@ static HLIST_HEAD(obj_to_free); * can be off. */ static int __data_racy obj_pool_min_free = ODEBUG_POOL_SIZE; -static int __data_racy obj_pool_free = ODEBUG_POOL_SIZE; static int obj_pool_used; static int __data_racy obj_pool_max_used; static bool obj_freeing; -/* The number of objs on the global free list */ -static int obj_nr_tofree; static int __data_racy debug_objects_maxchain __read_mostly; static int __data_racy __maybe_unused debug_objects_maxchecked __read_mostly; @@ -124,6 +126,21 @@ static const char *obj_states[ODEBUG_STATE_MAX] = { [ODEBUG_STATE_NOTAVAILABLE] = "not available", }; +static __always_inline unsigned int pool_count(struct obj_pool *pool) +{ + return READ_ONCE(pool->cnt); +} + +static inline bool pool_global_should_refill(void) +{ + return READ_ONCE(pool_global.cnt) < debug_objects_pool_min_level; +} + +static inline bool pool_global_must_refill(void) +{ + return READ_ONCE(pool_global.cnt) < (debug_objects_pool_min_level / 2); +} + static void free_object_list(struct hlist_head *head) { struct hlist_node *tmp; @@ -146,11 +163,8 @@ static void fill_pool_from_freelist(void) /* * Reuse objs from the global obj_to_free list; they will be * reinitialized when allocating. - * - * obj_nr_tofree is checked locklessly; the READ_ONCE() pairs with - * the WRITE_ONCE() in pool_lock critical sections. */ - if (!READ_ONCE(obj_nr_tofree)) + if (!pool_count(&pool_to_free)) return; /* @@ -171,12 +185,12 @@ static void fill_pool_from_freelist(void) * Recheck with the lock held as the worker thread might have * won the race and freed the global free list already. */ - while (obj_nr_tofree && (obj_pool_free < debug_objects_pool_min_level)) { - obj = hlist_entry(obj_to_free.first, typeof(*obj), node); + while (pool_to_free.cnt && (pool_global.cnt < debug_objects_pool_min_level)) { + obj = hlist_entry(pool_to_free.objects.first, typeof(*obj), node); hlist_del(&obj->node); - WRITE_ONCE(obj_nr_tofree, obj_nr_tofree - 1); - hlist_add_head(&obj->node, &obj_pool); - WRITE_ONCE(obj_pool_free, obj_pool_free + 1); + WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt - 1); + hlist_add_head(&obj->node, &pool_global.objects); + WRITE_ONCE(pool_global.cnt, pool_global.cnt + 1); } clear_bit(0, &state); } @@ -190,12 +204,11 @@ static void fill_pool(void) * - One other CPU is already allocating * - the global pool has not reached the critical level yet */ - if (READ_ONCE(obj_pool_free) > (debug_objects_pool_min_level / 2) && - atomic_read(&cpus_allocating)) + if (!pool_global_must_refill() && atomic_read(&cpus_allocating)) return; atomic_inc(&cpus_allocating); - while (READ_ONCE(obj_pool_free) < debug_objects_pool_min_level) { + while (pool_global_should_refill()) { struct debug_obj *new, *last = NULL; HLIST_HEAD(head); int cnt; @@ -212,9 +225,9 @@ static void fill_pool(void) break; guard(raw_spinlock_irqsave)(&pool_lock); - hlist_splice_init(&head, &last->node, &obj_pool); + hlist_splice_init(&head, &last->node, &pool_global.objects); debug_objects_allocated += cnt; - WRITE_ONCE(obj_pool_free, obj_pool_free + cnt); + WRITE_ONCE(pool_global.cnt, pool_global.cnt + cnt); } atomic_dec(&cpus_allocating); } @@ -268,10 +281,10 @@ alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *d } raw_spin_lock(&pool_lock); - obj = __alloc_object(&obj_pool); + obj = __alloc_object(&pool_global.objects); if (obj) { obj_pool_used++; - WRITE_ONCE(obj_pool_free, obj_pool_free - 1); + WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); /* * Looking ahead, allocate one batch of debug objects and @@ -283,22 +296,21 @@ alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *d for (i = 0; i < ODEBUG_BATCH_SIZE; i++) { struct debug_obj *obj2; - obj2 = __alloc_object(&obj_pool); + obj2 = __alloc_object(&pool_global.objects); if (!obj2) break; - hlist_add_head(&obj2->node, - &percpu_pool->free_objs); + hlist_add_head(&obj2->node, &percpu_pool->free_objs); percpu_pool->obj_free++; obj_pool_used++; - WRITE_ONCE(obj_pool_free, obj_pool_free - 1); + WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); } } if (obj_pool_used > obj_pool_max_used) obj_pool_max_used = obj_pool_used; - if (obj_pool_free < obj_pool_min_free) - obj_pool_min_free = obj_pool_free; + if (pool_global.cnt < obj_pool_min_free) + obj_pool_min_free = pool_global.cnt; } raw_spin_unlock(&pool_lock); @@ -329,7 +341,7 @@ static void free_obj_work(struct work_struct *work) if (!raw_spin_trylock_irqsave(&pool_lock, flags)) return; - if (obj_pool_free >= debug_objects_pool_size) + if (pool_global.cnt >= debug_objects_pool_size) goto free_objs; /* @@ -339,12 +351,12 @@ static void free_obj_work(struct work_struct *work) * may be gearing up to use more and more objects, don't free any * of them until the next round. */ - while (obj_nr_tofree && obj_pool_free < debug_objects_pool_size) { - obj = hlist_entry(obj_to_free.first, typeof(*obj), node); + while (pool_to_free.cnt && pool_global.cnt < debug_objects_pool_size) { + obj = hlist_entry(pool_to_free.objects.first, typeof(*obj), node); hlist_del(&obj->node); - hlist_add_head(&obj->node, &obj_pool); - WRITE_ONCE(obj_pool_free, obj_pool_free + 1); - WRITE_ONCE(obj_nr_tofree, obj_nr_tofree - 1); + hlist_add_head(&obj->node, &pool_global.objects); + WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt - 1); + WRITE_ONCE(pool_global.cnt, pool_global.cnt + 1); } raw_spin_unlock_irqrestore(&pool_lock, flags); return; @@ -355,9 +367,9 @@ free_objs: * list. Move remaining free objs to a temporary list to free the * memory outside the pool_lock held region. */ - if (obj_nr_tofree) { - hlist_move_list(&obj_to_free, &tofree); - WRITE_ONCE(obj_nr_tofree, 0); + if (pool_to_free.cnt) { + hlist_move_list(&pool_to_free.objects, &tofree); + WRITE_ONCE(pool_to_free.cnt, 0); } raw_spin_unlock_irqrestore(&pool_lock, flags); @@ -400,45 +412,45 @@ static void __free_object(struct debug_obj *obj) free_to_obj_pool: raw_spin_lock(&pool_lock); - work = (obj_pool_free > debug_objects_pool_size) && obj_cache && - (obj_nr_tofree < ODEBUG_FREE_WORK_MAX); + work = (pool_global.cnt > debug_objects_pool_size) && obj_cache && + (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX); obj_pool_used--; if (work) { - WRITE_ONCE(obj_nr_tofree, obj_nr_tofree + 1); - hlist_add_head(&obj->node, &obj_to_free); + WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt + 1); + hlist_add_head(&obj->node, &pool_to_free.objects); if (lookahead_count) { - WRITE_ONCE(obj_nr_tofree, obj_nr_tofree + lookahead_count); + WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt + lookahead_count); obj_pool_used -= lookahead_count; while (lookahead_count) { hlist_add_head(&objs[--lookahead_count]->node, - &obj_to_free); + &pool_to_free.objects); } } - if ((obj_pool_free > debug_objects_pool_size) && - (obj_nr_tofree < ODEBUG_FREE_WORK_MAX)) { + if ((pool_global.cnt > debug_objects_pool_size) && + (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX)) { int i; /* * Free one more batch of objects from obj_pool. */ for (i = 0; i < ODEBUG_BATCH_SIZE; i++) { - obj = __alloc_object(&obj_pool); - hlist_add_head(&obj->node, &obj_to_free); - WRITE_ONCE(obj_pool_free, obj_pool_free - 1); - WRITE_ONCE(obj_nr_tofree, obj_nr_tofree + 1); + obj = __alloc_object(&pool_global.objects); + hlist_add_head(&obj->node, &pool_to_free.objects); + WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); + WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt + 1); } } } else { - WRITE_ONCE(obj_pool_free, obj_pool_free + 1); - hlist_add_head(&obj->node, &obj_pool); + WRITE_ONCE(pool_global.cnt, pool_global.cnt + 1); + hlist_add_head(&obj->node, &pool_global.objects); if (lookahead_count) { - WRITE_ONCE(obj_pool_free, obj_pool_free + lookahead_count); + WRITE_ONCE(pool_global.cnt, pool_global.cnt + lookahead_count); obj_pool_used -= lookahead_count; while (lookahead_count) { hlist_add_head(&objs[--lookahead_count]->node, - &obj_pool); + &pool_global.objects); } } } @@ -453,7 +465,7 @@ free_to_obj_pool: static void free_object(struct debug_obj *obj) { __free_object(obj); - if (!READ_ONCE(obj_freeing) && READ_ONCE(obj_nr_tofree)) { + if (!READ_ONCE(obj_freeing) && pool_count(&pool_to_free)) { WRITE_ONCE(obj_freeing, true); schedule_delayed_work(&debug_obj_work, ODEBUG_FREE_WORK_DELAY); } @@ -622,13 +634,13 @@ static void debug_objects_fill_pool(void) if (unlikely(!obj_cache)) return; - if (likely(READ_ONCE(obj_pool_free) >= debug_objects_pool_min_level)) + if (likely(!pool_global_should_refill())) return; /* Try reusing objects from obj_to_free_list */ fill_pool_from_freelist(); - if (likely(READ_ONCE(obj_pool_free) >= debug_objects_pool_min_level)) + if (likely(!pool_global_should_refill())) return; /* @@ -1040,7 +1052,7 @@ repeat: debug_objects_maxchecked = objs_checked; /* Schedule work to actually kmem_cache_free() objects */ - if (!READ_ONCE(obj_freeing) && READ_ONCE(obj_nr_tofree)) { + if (!READ_ONCE(obj_freeing) && pool_count(&pool_to_free)) { WRITE_ONCE(obj_freeing, true); schedule_delayed_work(&debug_obj_work, ODEBUG_FREE_WORK_DELAY); } @@ -1066,12 +1078,12 @@ static int debug_stats_show(struct seq_file *m, void *v) seq_printf(m, "max_checked :%d\n", debug_objects_maxchecked); seq_printf(m, "warnings :%d\n", debug_objects_warnings); seq_printf(m, "fixups :%d\n", debug_objects_fixups); - seq_printf(m, "pool_free :%d\n", READ_ONCE(obj_pool_free) + obj_percpu_free); + seq_printf(m, "pool_free :%d\n", pool_count(&pool_global) + obj_percpu_free); seq_printf(m, "pool_pcp_free :%d\n", obj_percpu_free); seq_printf(m, "pool_min_free :%d\n", obj_pool_min_free); seq_printf(m, "pool_used :%d\n", obj_pool_used - obj_percpu_free); seq_printf(m, "pool_max_used :%d\n", obj_pool_max_used); - seq_printf(m, "on_free_list :%d\n", READ_ONCE(obj_nr_tofree)); + seq_printf(m, "on_free_list :%d\n", pool_count(&pool_to_free)); seq_printf(m, "objs_allocated:%d\n", debug_objects_allocated); seq_printf(m, "objs_freed :%d\n", debug_objects_freed); return 0; @@ -1330,7 +1342,9 @@ void __init debug_objects_early_init(void) raw_spin_lock_init(&obj_hash[i].lock); for (i = 0; i < ODEBUG_POOL_SIZE; i++) - hlist_add_head(&obj_static_pool[i].node, &obj_pool); + hlist_add_head(&obj_static_pool[i].node, &pool_global.objects); + + pool_global.cnt = ODEBUG_POOL_SIZE; } /* @@ -1354,21 +1368,23 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache hlist_add_head(&obj->node, &objects); } - debug_objects_allocated += i; + debug_objects_allocated = ODEBUG_POOL_SIZE; + pool_global.cnt = ODEBUG_POOL_SIZE; /* * Replace the statically allocated objects list with the allocated * objects list. */ - hlist_move_list(&objects, &obj_pool); + hlist_move_list(&objects, &pool_global.objects); /* Replace the active object references */ for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { hlist_move_list(&db->list, &objects); hlist_for_each_entry(obj, &objects, node) { - new = hlist_entry(obj_pool.first, typeof(*obj), node); + new = hlist_entry(pool_global.objects.first, typeof(*obj), node); hlist_del(&new->node); + pool_global.cnt--; /* copy object data */ *new = *obj; hlist_add_head(&new->node, &db->list); -- cgit v1.2.3 From cb58d190843059d5dc50d6ac483647ba61001e8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:05 +0200 Subject: debugobjects: Use separate list head for boot pool There is no point to handle the statically allocated objects during early boot in the actual pool list. This phase does not require accounting, so all of the related complexity can be avoided. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.708939081@linutronix.de --- lib/debugobjects.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index fcba13db90ed..0b29a25dc3f9 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -68,6 +68,8 @@ static DEFINE_RAW_SPINLOCK(pool_lock); static struct obj_pool pool_global; static struct obj_pool pool_to_free; +static HLIST_HEAD(pool_boot); + /* * Because of the presence of percpu free pools, obj_pool_free will * under-count those in the percpu free pools. Similarly, obj_pool_used @@ -278,6 +280,9 @@ alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *d percpu_pool->obj_free--; goto init_obj; } + } else { + obj = __alloc_object(&pool_boot); + goto init_obj; } raw_spin_lock(&pool_lock); @@ -381,12 +386,14 @@ static void __free_object(struct debug_obj *obj) struct debug_obj *objs[ODEBUG_BATCH_SIZE]; struct debug_percpu_free *percpu_pool; int lookahead_count = 0; - unsigned long flags; bool work; - local_irq_save(flags); - if (!obj_cache) - goto free_to_obj_pool; + guard(irqsave)(); + + if (unlikely(!obj_cache)) { + hlist_add_head(&obj->node, &pool_boot); + return; + } /* * Try to free it into the percpu pool first. @@ -395,7 +402,6 @@ static void __free_object(struct debug_obj *obj) if (percpu_pool->obj_free < ODEBUG_POOL_PERCPU_SIZE) { hlist_add_head(&obj->node, &percpu_pool->free_objs); percpu_pool->obj_free++; - local_irq_restore(flags); return; } @@ -410,7 +416,6 @@ static void __free_object(struct debug_obj *obj) percpu_pool->obj_free--; } -free_to_obj_pool: raw_spin_lock(&pool_lock); work = (pool_global.cnt > debug_objects_pool_size) && obj_cache && (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX); @@ -455,7 +460,6 @@ free_to_obj_pool: } } raw_spin_unlock(&pool_lock); - local_irq_restore(flags); } /* @@ -1341,10 +1345,9 @@ void __init debug_objects_early_init(void) for (i = 0; i < ODEBUG_HASH_SIZE; i++) raw_spin_lock_init(&obj_hash[i].lock); + /* Keep early boot simple and add everything to the boot list */ for (i = 0; i < ODEBUG_POOL_SIZE; i++) - hlist_add_head(&obj_static_pool[i].node, &pool_global.objects); - - pool_global.cnt = ODEBUG_POOL_SIZE; + hlist_add_head(&obj_static_pool[i].node, &pool_boot); } /* @@ -1372,10 +1375,11 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache pool_global.cnt = ODEBUG_POOL_SIZE; /* - * Replace the statically allocated objects list with the allocated - * objects list. + * Move the allocated objects to the global pool and disconnect the + * boot pool. */ hlist_move_list(&objects, &pool_global.objects); + pool_boot.first = NULL; /* Replace the active object references */ for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { -- cgit v1.2.3 From 18b8afcb37d8a72479892e080e4d37890f2bf353 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:07 +0200 Subject: debugobjects: Rename and tidy up per CPU pools No point in having a separate data structure. Reuse struct obj_pool and tidy up the code. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.770595795@linutronix.de --- lib/debugobjects.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 0b29a25dc3f9..3d1d9733676e 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -43,21 +43,12 @@ struct debug_bucket { raw_spinlock_t lock; }; -/* - * Debug object percpu free list - * Access is protected by disabling irq - */ -struct debug_percpu_free { - struct hlist_head free_objs; - int obj_free; -}; - struct obj_pool { struct hlist_head objects; unsigned int cnt; } ____cacheline_aligned; -static DEFINE_PER_CPU(struct debug_percpu_free, percpu_obj_pool); +static DEFINE_PER_CPU(struct obj_pool, pool_pcpu); static struct debug_bucket obj_hash[ODEBUG_HASH_SIZE]; @@ -271,13 +262,13 @@ static struct debug_obj *__alloc_object(struct hlist_head *list) static struct debug_obj * alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *descr) { - struct debug_percpu_free *percpu_pool = this_cpu_ptr(&percpu_obj_pool); + struct obj_pool *percpu_pool = this_cpu_ptr(&pool_pcpu); struct debug_obj *obj; if (likely(obj_cache)) { - obj = __alloc_object(&percpu_pool->free_objs); + obj = __alloc_object(&percpu_pool->objects); if (obj) { - percpu_pool->obj_free--; + percpu_pool->cnt--; goto init_obj; } } else { @@ -304,8 +295,8 @@ alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *d obj2 = __alloc_object(&pool_global.objects); if (!obj2) break; - hlist_add_head(&obj2->node, &percpu_pool->free_objs); - percpu_pool->obj_free++; + hlist_add_head(&obj2->node, &percpu_pool->objects); + percpu_pool->cnt++; obj_pool_used++; WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); } @@ -384,7 +375,7 @@ free_objs: static void __free_object(struct debug_obj *obj) { struct debug_obj *objs[ODEBUG_BATCH_SIZE]; - struct debug_percpu_free *percpu_pool; + struct obj_pool *percpu_pool; int lookahead_count = 0; bool work; @@ -398,10 +389,10 @@ static void __free_object(struct debug_obj *obj) /* * Try to free it into the percpu pool first. */ - percpu_pool = this_cpu_ptr(&percpu_obj_pool); - if (percpu_pool->obj_free < ODEBUG_POOL_PERCPU_SIZE) { - hlist_add_head(&obj->node, &percpu_pool->free_objs); - percpu_pool->obj_free++; + percpu_pool = this_cpu_ptr(&pool_pcpu); + if (percpu_pool->cnt < ODEBUG_POOL_PERCPU_SIZE) { + hlist_add_head(&obj->node, &percpu_pool->objects); + percpu_pool->cnt++; return; } @@ -410,10 +401,10 @@ static void __free_object(struct debug_obj *obj) * of objects from the percpu pool and free them as well. */ for (; lookahead_count < ODEBUG_BATCH_SIZE; lookahead_count++) { - objs[lookahead_count] = __alloc_object(&percpu_pool->free_objs); + objs[lookahead_count] = __alloc_object(&percpu_pool->objects); if (!objs[lookahead_count]) break; - percpu_pool->obj_free--; + percpu_pool->cnt--; } raw_spin_lock(&pool_lock); @@ -494,10 +485,10 @@ static void put_objects(struct hlist_head *list) static int object_cpu_offline(unsigned int cpu) { /* Remote access is safe as the CPU is dead already */ - struct debug_percpu_free *pcp = per_cpu_ptr(&percpu_obj_pool, cpu); + struct obj_pool *pcp = per_cpu_ptr(&pool_pcpu, cpu); - put_objects(&pcp->free_objs); - pcp->obj_free = 0; + put_objects(&pcp->objects); + pcp->cnt = 0; return 0; } #endif @@ -1076,7 +1067,7 @@ static int debug_stats_show(struct seq_file *m, void *v) int cpu, obj_percpu_free = 0; for_each_possible_cpu(cpu) - obj_percpu_free += per_cpu(percpu_obj_pool.obj_free, cpu); + obj_percpu_free += per_cpu(pool_pcpu.cnt, cpu); seq_printf(m, "max_chain :%d\n", debug_objects_maxchain); seq_printf(m, "max_checked :%d\n", debug_objects_maxchecked); -- cgit v1.2.3 From 96a9a0421c77301f9b551f3460ac04471a3c0612 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:08 +0200 Subject: debugobjects: Move min/max count into pool struct Having the accounting in the datastructure is better in terms of cache lines and allows more optimizations later on. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.831908427@linutronix.de --- lib/debugobjects.c | 55 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 24 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 3d1d9733676e..fbe8f2625082 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -46,9 +46,14 @@ struct debug_bucket { struct obj_pool { struct hlist_head objects; unsigned int cnt; + unsigned int min_cnt; + unsigned int max_cnt; } ____cacheline_aligned; -static DEFINE_PER_CPU(struct obj_pool, pool_pcpu); + +static DEFINE_PER_CPU_ALIGNED(struct obj_pool, pool_pcpu) = { + .max_cnt = ODEBUG_POOL_PERCPU_SIZE, +}; static struct debug_bucket obj_hash[ODEBUG_HASH_SIZE]; @@ -56,8 +61,14 @@ static struct debug_obj obj_static_pool[ODEBUG_POOL_SIZE] __initdata; static DEFINE_RAW_SPINLOCK(pool_lock); -static struct obj_pool pool_global; -static struct obj_pool pool_to_free; +static struct obj_pool pool_global = { + .min_cnt = ODEBUG_POOL_MIN_LEVEL, + .max_cnt = ODEBUG_POOL_SIZE, +}; + +static struct obj_pool pool_to_free = { + .max_cnt = UINT_MAX, +}; static HLIST_HEAD(pool_boot); @@ -79,13 +90,9 @@ static int __data_racy debug_objects_fixups __read_mostly; static int __data_racy debug_objects_warnings __read_mostly; static bool __data_racy debug_objects_enabled __read_mostly = CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT; -static int debug_objects_pool_size __ro_after_init - = ODEBUG_POOL_SIZE; -static int debug_objects_pool_min_level __ro_after_init - = ODEBUG_POOL_MIN_LEVEL; -static const struct debug_obj_descr *descr_test __read_mostly; -static struct kmem_cache *obj_cache __ro_after_init; +static const struct debug_obj_descr *descr_test __read_mostly; +static struct kmem_cache *obj_cache __ro_after_init; /* * Track numbers of kmem_cache_alloc()/free() calls done. @@ -124,14 +131,14 @@ static __always_inline unsigned int pool_count(struct obj_pool *pool) return READ_ONCE(pool->cnt); } -static inline bool pool_global_should_refill(void) +static __always_inline bool pool_should_refill(struct obj_pool *pool) { - return READ_ONCE(pool_global.cnt) < debug_objects_pool_min_level; + return pool_count(pool) < pool->min_cnt; } -static inline bool pool_global_must_refill(void) +static __always_inline bool pool_must_refill(struct obj_pool *pool) { - return READ_ONCE(pool_global.cnt) < (debug_objects_pool_min_level / 2); + return pool_count(pool) < pool->min_cnt / 2; } static void free_object_list(struct hlist_head *head) @@ -178,7 +185,7 @@ static void fill_pool_from_freelist(void) * Recheck with the lock held as the worker thread might have * won the race and freed the global free list already. */ - while (pool_to_free.cnt && (pool_global.cnt < debug_objects_pool_min_level)) { + while (pool_to_free.cnt && (pool_global.cnt < pool_global.min_cnt)) { obj = hlist_entry(pool_to_free.objects.first, typeof(*obj), node); hlist_del(&obj->node); WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt - 1); @@ -197,11 +204,11 @@ static void fill_pool(void) * - One other CPU is already allocating * - the global pool has not reached the critical level yet */ - if (!pool_global_must_refill() && atomic_read(&cpus_allocating)) + if (!pool_must_refill(&pool_global) && atomic_read(&cpus_allocating)) return; atomic_inc(&cpus_allocating); - while (pool_global_should_refill()) { + while (pool_should_refill(&pool_global)) { struct debug_obj *new, *last = NULL; HLIST_HEAD(head); int cnt; @@ -337,7 +344,7 @@ static void free_obj_work(struct work_struct *work) if (!raw_spin_trylock_irqsave(&pool_lock, flags)) return; - if (pool_global.cnt >= debug_objects_pool_size) + if (pool_global.cnt >= pool_global.max_cnt) goto free_objs; /* @@ -347,7 +354,7 @@ static void free_obj_work(struct work_struct *work) * may be gearing up to use more and more objects, don't free any * of them until the next round. */ - while (pool_to_free.cnt && pool_global.cnt < debug_objects_pool_size) { + while (pool_to_free.cnt && pool_global.cnt < pool_global.max_cnt) { obj = hlist_entry(pool_to_free.objects.first, typeof(*obj), node); hlist_del(&obj->node); hlist_add_head(&obj->node, &pool_global.objects); @@ -408,7 +415,7 @@ static void __free_object(struct debug_obj *obj) } raw_spin_lock(&pool_lock); - work = (pool_global.cnt > debug_objects_pool_size) && obj_cache && + work = (pool_global.cnt > pool_global.max_cnt) && obj_cache && (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX); obj_pool_used--; @@ -424,7 +431,7 @@ static void __free_object(struct debug_obj *obj) } } - if ((pool_global.cnt > debug_objects_pool_size) && + if ((pool_global.cnt > pool_global.max_cnt) && (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX)) { int i; @@ -629,13 +636,13 @@ static void debug_objects_fill_pool(void) if (unlikely(!obj_cache)) return; - if (likely(!pool_global_should_refill())) + if (likely(!pool_should_refill(&pool_global))) return; /* Try reusing objects from obj_to_free_list */ fill_pool_from_freelist(); - if (likely(!pool_global_should_refill())) + if (likely(!pool_should_refill(&pool_global))) return; /* @@ -1427,8 +1434,8 @@ void __init debug_objects_mem_init(void) * system. */ extras = num_possible_cpus() * ODEBUG_BATCH_SIZE; - debug_objects_pool_size += extras; - debug_objects_pool_min_level += extras; + pool_global.max_cnt += extras; + pool_global.min_cnt += extras; /* Everything worked. Expose the cache */ obj_cache = cache; -- cgit v1.2.3 From fb60c004f33e0fa2e87b9456b87f1b2709436b88 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:09 +0200 Subject: debugobjects: Rework object allocation The current allocation scheme tries to allocate from the per CPU pool first. If that fails it allocates one object from the global pool and then refills the per CPU pool from the global pool. That is in the way of switching the pool management to batch mode as the global pool needs to be a strict stack of batches, which does not allow to allocate single objects. Rework the code to refill the per CPU pool first and then allocate the object from the refilled batch. Also try to allocate from the to free pool first to avoid freeing and reallocating objects. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.893554162@linutronix.de --- lib/debugobjects.c | 144 +++++++++++++++++++++++++---------------------------- 1 file changed, 69 insertions(+), 75 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index fbe8f2625082..3b18d5dcbf6e 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -141,6 +141,64 @@ static __always_inline bool pool_must_refill(struct obj_pool *pool) return pool_count(pool) < pool->min_cnt / 2; } +static bool pool_move_batch(struct obj_pool *dst, struct obj_pool *src) +{ + if (dst->cnt + ODEBUG_BATCH_SIZE > dst->max_cnt || !src->cnt) + return false; + + for (int i = 0; i < ODEBUG_BATCH_SIZE && src->cnt; i++) { + struct hlist_node *node = src->objects.first; + + WRITE_ONCE(src->cnt, src->cnt - 1); + WRITE_ONCE(dst->cnt, dst->cnt + 1); + + hlist_del(node); + hlist_add_head(node, &dst->objects); + } + return true; +} + +static struct debug_obj *__alloc_object(struct hlist_head *list) +{ + struct debug_obj *obj; + + if (unlikely(!list->first)) + return NULL; + + obj = hlist_entry(list->first, typeof(*obj), node); + hlist_del(&obj->node); + return obj; +} + +static struct debug_obj *pcpu_alloc(void) +{ + struct obj_pool *pcp = this_cpu_ptr(&pool_pcpu); + + lockdep_assert_irqs_disabled(); + + for (;;) { + struct debug_obj *obj = __alloc_object(&pcp->objects); + + if (likely(obj)) { + pcp->cnt--; + return obj; + } + + guard(raw_spinlock)(&pool_lock); + if (!pool_move_batch(pcp, &pool_to_free)) { + if (!pool_move_batch(pcp, &pool_global)) + return NULL; + } + obj_pool_used += pcp->cnt; + + if (obj_pool_used > obj_pool_max_used) + obj_pool_max_used = obj_pool_used; + + if (pool_global.cnt < obj_pool_min_free) + obj_pool_min_free = pool_global.cnt; + } +} + static void free_object_list(struct hlist_head *head) { struct hlist_node *tmp; @@ -158,7 +216,6 @@ static void free_object_list(struct hlist_head *head) static void fill_pool_from_freelist(void) { static unsigned long state; - struct debug_obj *obj; /* * Reuse objs from the global obj_to_free list; they will be @@ -180,17 +237,11 @@ static void fill_pool_from_freelist(void) if (test_bit(0, &state) || test_and_set_bit(0, &state)) return; - guard(raw_spinlock)(&pool_lock); - /* - * Recheck with the lock held as the worker thread might have - * won the race and freed the global free list already. - */ - while (pool_to_free.cnt && (pool_global.cnt < pool_global.min_cnt)) { - obj = hlist_entry(pool_to_free.objects.first, typeof(*obj), node); - hlist_del(&obj->node); - WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt - 1); - hlist_add_head(&obj->node, &pool_global.objects); - WRITE_ONCE(pool_global.cnt, pool_global.cnt + 1); + /* Avoid taking the lock when there is no work to do */ + while (pool_should_refill(&pool_global) && pool_count(&pool_to_free)) { + guard(raw_spinlock)(&pool_lock); + /* Move a batch if possible */ + pool_move_batch(&pool_global, &pool_to_free); } clear_bit(0, &state); } @@ -251,74 +302,17 @@ static struct debug_obj *lookup_object(void *addr, struct debug_bucket *b) return NULL; } -/* - * Allocate a new object from the hlist - */ -static struct debug_obj *__alloc_object(struct hlist_head *list) -{ - struct debug_obj *obj = NULL; - - if (list->first) { - obj = hlist_entry(list->first, typeof(*obj), node); - hlist_del(&obj->node); - } - - return obj; -} - -static struct debug_obj * -alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *descr) +static struct debug_obj *alloc_object(void *addr, struct debug_bucket *b, + const struct debug_obj_descr *descr) { - struct obj_pool *percpu_pool = this_cpu_ptr(&pool_pcpu); struct debug_obj *obj; - if (likely(obj_cache)) { - obj = __alloc_object(&percpu_pool->objects); - if (obj) { - percpu_pool->cnt--; - goto init_obj; - } - } else { + if (likely(obj_cache)) + obj = pcpu_alloc(); + else obj = __alloc_object(&pool_boot); - goto init_obj; - } - - raw_spin_lock(&pool_lock); - obj = __alloc_object(&pool_global.objects); - if (obj) { - obj_pool_used++; - WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); - /* - * Looking ahead, allocate one batch of debug objects and - * put them into the percpu free pool. - */ - if (likely(obj_cache)) { - int i; - - for (i = 0; i < ODEBUG_BATCH_SIZE; i++) { - struct debug_obj *obj2; - - obj2 = __alloc_object(&pool_global.objects); - if (!obj2) - break; - hlist_add_head(&obj2->node, &percpu_pool->objects); - percpu_pool->cnt++; - obj_pool_used++; - WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); - } - } - - if (obj_pool_used > obj_pool_max_used) - obj_pool_max_used = obj_pool_used; - - if (pool_global.cnt < obj_pool_min_free) - obj_pool_min_free = pool_global.cnt; - } - raw_spin_unlock(&pool_lock); - -init_obj: - if (obj) { + if (likely(obj)) { obj->object = addr; obj->descr = descr; obj->state = ODEBUG_STATE_NONE; -- cgit v1.2.3 From a3b9e191f5fc11fa93176a4074a919d33d64c5fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:10 +0200 Subject: debugobjects: Rework object freeing __free_object() is uncomprehensibly complex. The same can be achieved by: 1) Adding the object to the per CPU pool 2) If that pool is full, move a batch of objects into the global pool or if the global pool is full into the to free pool This also prepares for batch processing. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.955542307@linutronix.de --- lib/debugobjects.c | 99 +++++++++++++----------------------------------------- 1 file changed, 24 insertions(+), 75 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 3b18d5dcbf6e..3700ddfcf76a 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -199,6 +199,27 @@ static struct debug_obj *pcpu_alloc(void) } } +static void pcpu_free(struct debug_obj *obj) +{ + struct obj_pool *pcp = this_cpu_ptr(&pool_pcpu); + + lockdep_assert_irqs_disabled(); + + hlist_add_head(&obj->node, &pcp->objects); + pcp->cnt++; + + /* Pool full ? */ + if (pcp->cnt < ODEBUG_POOL_PERCPU_SIZE) + return; + + /* Remove a batch from the per CPU pool */ + guard(raw_spinlock)(&pool_lock); + /* Try to fit the batch into the pool_global first */ + if (!pool_move_batch(&pool_global, pcp)) + pool_move_batch(&pool_to_free, pcp); + obj_pool_used -= ODEBUG_BATCH_SIZE; +} + static void free_object_list(struct hlist_head *head) { struct hlist_node *tmp; @@ -375,83 +396,11 @@ free_objs: static void __free_object(struct debug_obj *obj) { - struct debug_obj *objs[ODEBUG_BATCH_SIZE]; - struct obj_pool *percpu_pool; - int lookahead_count = 0; - bool work; - guard(irqsave)(); - - if (unlikely(!obj_cache)) { + if (likely(obj_cache)) + pcpu_free(obj); + else hlist_add_head(&obj->node, &pool_boot); - return; - } - - /* - * Try to free it into the percpu pool first. - */ - percpu_pool = this_cpu_ptr(&pool_pcpu); - if (percpu_pool->cnt < ODEBUG_POOL_PERCPU_SIZE) { - hlist_add_head(&obj->node, &percpu_pool->objects); - percpu_pool->cnt++; - return; - } - - /* - * As the percpu pool is full, look ahead and pull out a batch - * of objects from the percpu pool and free them as well. - */ - for (; lookahead_count < ODEBUG_BATCH_SIZE; lookahead_count++) { - objs[lookahead_count] = __alloc_object(&percpu_pool->objects); - if (!objs[lookahead_count]) - break; - percpu_pool->cnt--; - } - - raw_spin_lock(&pool_lock); - work = (pool_global.cnt > pool_global.max_cnt) && obj_cache && - (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX); - obj_pool_used--; - - if (work) { - WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt + 1); - hlist_add_head(&obj->node, &pool_to_free.objects); - if (lookahead_count) { - WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt + lookahead_count); - obj_pool_used -= lookahead_count; - while (lookahead_count) { - hlist_add_head(&objs[--lookahead_count]->node, - &pool_to_free.objects); - } - } - - if ((pool_global.cnt > pool_global.max_cnt) && - (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX)) { - int i; - - /* - * Free one more batch of objects from obj_pool. - */ - for (i = 0; i < ODEBUG_BATCH_SIZE; i++) { - obj = __alloc_object(&pool_global.objects); - hlist_add_head(&obj->node, &pool_to_free.objects); - WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); - WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt + 1); - } - } - } else { - WRITE_ONCE(pool_global.cnt, pool_global.cnt + 1); - hlist_add_head(&obj->node, &pool_global.objects); - if (lookahead_count) { - WRITE_ONCE(pool_global.cnt, pool_global.cnt + lookahead_count); - obj_pool_used -= lookahead_count; - while (lookahead_count) { - hlist_add_head(&objs[--lookahead_count]->node, - &pool_global.objects); - } - } - } - raw_spin_unlock(&pool_lock); } /* -- cgit v1.2.3 From 9ce99c6d7bfbca71f1e5fa34045ea48cb768f54a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:12 +0200 Subject: debugobjects: Rework free_object_work() Convert it to batch processing with intermediate helper functions. This reduces the final changes for batch processing. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.015906394@linutronix.de --- lib/debugobjects.c | 82 ++++++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 43 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 3700ddfcf76a..d5a853851482 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -35,7 +35,7 @@ * frequency of 10Hz and about 1024 objects for each freeing operation. * So it is freeing at most 10k debug objects per second. */ -#define ODEBUG_FREE_WORK_MAX 1024 +#define ODEBUG_FREE_WORK_MAX (1024 / ODEBUG_BATCH_SIZE) #define ODEBUG_FREE_WORK_DELAY DIV_ROUND_UP(HZ, 10) struct debug_bucket { @@ -158,6 +158,21 @@ static bool pool_move_batch(struct obj_pool *dst, struct obj_pool *src) return true; } +static bool pool_pop_batch(struct hlist_head *head, struct obj_pool *src) +{ + if (!src->cnt) + return false; + + for (int i = 0; src->cnt && i < ODEBUG_BATCH_SIZE; i++) { + struct hlist_node *node = src->objects.first; + + WRITE_ONCE(src->cnt, src->cnt - 1); + hlist_del(node); + hlist_add_head(node, head); + } + return true; +} + static struct debug_obj *__alloc_object(struct hlist_head *list) { struct debug_obj *obj; @@ -343,55 +358,36 @@ static struct debug_obj *alloc_object(void *addr, struct debug_bucket *b, return obj; } -/* - * workqueue function to free objects. - * - * To reduce contention on the global pool_lock, the actual freeing of - * debug objects will be delayed if the pool_lock is busy. - */ +/* workqueue function to free objects. */ static void free_obj_work(struct work_struct *work) { - struct debug_obj *obj; - unsigned long flags; - HLIST_HEAD(tofree); + bool free = true; WRITE_ONCE(obj_freeing, false); - if (!raw_spin_trylock_irqsave(&pool_lock, flags)) - return; - if (pool_global.cnt >= pool_global.max_cnt) - goto free_objs; - - /* - * The objs on the pool list might be allocated before the work is - * run, so recheck if pool list it full or not, if not fill pool - * list from the global free list. As it is likely that a workload - * may be gearing up to use more and more objects, don't free any - * of them until the next round. - */ - while (pool_to_free.cnt && pool_global.cnt < pool_global.max_cnt) { - obj = hlist_entry(pool_to_free.objects.first, typeof(*obj), node); - hlist_del(&obj->node); - hlist_add_head(&obj->node, &pool_global.objects); - WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt - 1); - WRITE_ONCE(pool_global.cnt, pool_global.cnt + 1); - } - raw_spin_unlock_irqrestore(&pool_lock, flags); - return; + if (!pool_count(&pool_to_free)) + return; -free_objs: - /* - * Pool list is already full and there are still objs on the free - * list. Move remaining free objs to a temporary list to free the - * memory outside the pool_lock held region. - */ - if (pool_to_free.cnt) { - hlist_move_list(&pool_to_free.objects, &tofree); - WRITE_ONCE(pool_to_free.cnt, 0); + for (unsigned int cnt = 0; cnt < ODEBUG_FREE_WORK_MAX; cnt++) { + HLIST_HEAD(tofree); + + /* Acquire and drop the lock for each batch */ + scoped_guard(raw_spinlock_irqsave, &pool_lock) { + if (!pool_to_free.cnt) + return; + + /* Refill the global pool if possible */ + if (pool_move_batch(&pool_global, &pool_to_free)) { + /* Don't free as there seems to be demand */ + free = false; + } else if (free) { + pool_pop_batch(&tofree, &pool_to_free); + } else { + return; + } + } + free_object_list(&tofree); } - raw_spin_unlock_irqrestore(&pool_lock, flags); - - free_object_list(&tofree); } static void __free_object(struct debug_obj *obj) -- cgit v1.2.3 From 14077b9e583bbafc9a02734beab99c37bff68644 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:13 +0200 Subject: debugobjects: Use static key for boot pool selection Get rid of the conditional in the hot path. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.077247071@linutronix.de --- lib/debugobjects.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index d5a853851482..65cce4cd9643 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -7,16 +7,16 @@ #define pr_fmt(fmt) "ODEBUG: " fmt +#include #include -#include +#include +#include +#include #include #include #include -#include #include -#include -#include -#include +#include #define ODEBUG_HASH_BITS 14 #define ODEBUG_HASH_SIZE (1 << ODEBUG_HASH_BITS) @@ -103,6 +103,8 @@ static int __data_racy debug_objects_freed; static void free_obj_work(struct work_struct *work); static DECLARE_DELAYED_WORK(debug_obj_work, free_obj_work); +static DEFINE_STATIC_KEY_FALSE(obj_cache_enabled); + static int __init enable_object_debug(char *str) { debug_objects_enabled = true; @@ -343,7 +345,7 @@ static struct debug_obj *alloc_object(void *addr, struct debug_bucket *b, { struct debug_obj *obj; - if (likely(obj_cache)) + if (static_branch_likely(&obj_cache_enabled)) obj = pcpu_alloc(); else obj = __alloc_object(&pool_boot); @@ -393,7 +395,7 @@ static void free_obj_work(struct work_struct *work) static void __free_object(struct debug_obj *obj) { guard(irqsave)(); - if (likely(obj_cache)) + if (static_branch_likely(&obj_cache_enabled)) pcpu_free(obj); else hlist_add_head(&obj->node, &pool_boot); @@ -572,7 +574,7 @@ static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket static void debug_objects_fill_pool(void) { - if (unlikely(!obj_cache)) + if (!static_branch_likely(&obj_cache_enabled)) return; if (likely(!pool_should_refill(&pool_global))) @@ -1378,6 +1380,7 @@ void __init debug_objects_mem_init(void) /* Everything worked. Expose the cache */ obj_cache = cache; + static_branch_enable(&obj_cache_enabled); #ifdef CONFIG_HOTPLUG_CPU cpuhp_setup_state_nocalls(CPUHP_DEBUG_OBJ_DEAD, "object:offline", NULL, -- cgit v1.2.3 From 74fe1ad4132234f04fcc75e16600449496a67b5b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:14 +0200 Subject: debugobjects: Prepare for batching Move the debug_obj::object pointer into a union and add a pointer to the last node in a batch. That allows to implement batch processing efficiently by utilizing the stack property of hlist: When the first object of a batch is added to the list, then the batch pointer is set to the hlist node of the object itself. Any subsequent add retrieves the pointer to the last node from the first object in the list and uses that for storing the last node pointer in the newly added object. Add the pointer to the data structure and ensure that all relevant pool sizes are strictly batch sized. The actual batching implementation follows in subsequent changes. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.139204961@linutronix.de --- include/linux/debugobjects.h | 12 ++++++++---- lib/debugobjects.c | 10 +++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 32444686b6ff..8b95545e7924 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -23,13 +23,17 @@ struct debug_obj_descr; * @state: tracked object state * @astate: current active state * @object: pointer to the real object + * @batch_last: pointer to the last hlist node in a batch * @descr: pointer to an object type specific debug description structure */ struct debug_obj { - struct hlist_node node; - enum debug_obj_state state; - unsigned int astate; - void *object; + struct hlist_node node; + enum debug_obj_state state; + unsigned int astate; + union { + void *object; + struct hlist_node *batch_last; + }; const struct debug_obj_descr *descr; }; diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 65cce4cd9643..2fed7b863827 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -21,11 +21,15 @@ #define ODEBUG_HASH_BITS 14 #define ODEBUG_HASH_SIZE (1 << ODEBUG_HASH_BITS) -#define ODEBUG_POOL_SIZE 1024 -#define ODEBUG_POOL_MIN_LEVEL 256 -#define ODEBUG_POOL_PERCPU_SIZE 64 +/* Must be power of two */ #define ODEBUG_BATCH_SIZE 16 +/* Initial values. Must all be a multiple of batch size */ +#define ODEBUG_POOL_SIZE (64 * ODEBUG_BATCH_SIZE) +#define ODEBUG_POOL_MIN_LEVEL (ODEBUG_POOL_SIZE / 4) + +#define ODEBUG_POOL_PERCPU_SIZE (4 * ODEBUG_BATCH_SIZE) + #define ODEBUG_CHUNK_SHIFT PAGE_SHIFT #define ODEBUG_CHUNK_SIZE (1 << ODEBUG_CHUNK_SHIFT) #define ODEBUG_CHUNK_MASK (~(ODEBUG_CHUNK_SIZE - 1)) -- cgit v1.2.3 From aebbfe0779b271c099cc80c5e2995c2087b28dcf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:15 +0200 Subject: debugobjects: Prepare kmem_cache allocations for batching Allocate a batch and then push it into the pool. Utilize the debug_obj::last_node pointer for keeping track of the batch boundary. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241007164914.198647184@linutronix.de --- lib/debugobjects.c | 80 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 31 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 2fed7b863827..cdd5d238eea7 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -164,6 +164,22 @@ static bool pool_move_batch(struct obj_pool *dst, struct obj_pool *src) return true; } +static bool pool_push_batch(struct obj_pool *dst, struct hlist_head *head) +{ + struct hlist_node *last; + struct debug_obj *obj; + + if (dst->cnt >= dst->max_cnt) + return false; + + obj = hlist_entry(head->first, typeof(*obj), node); + last = obj->batch_last; + + hlist_splice_init(head, last, &dst->objects); + WRITE_ONCE(dst->cnt, dst->cnt + ODEBUG_BATCH_SIZE); + return true; +} + static bool pool_pop_batch(struct hlist_head *head, struct obj_pool *src) { if (!src->cnt) @@ -288,6 +304,28 @@ static void fill_pool_from_freelist(void) clear_bit(0, &state); } +static bool kmem_alloc_batch(struct hlist_head *head, struct kmem_cache *cache, gfp_t gfp) +{ + struct hlist_node *last = NULL; + struct debug_obj *obj; + + for (int cnt = 0; cnt < ODEBUG_BATCH_SIZE; cnt++) { + obj = kmem_cache_zalloc(cache, gfp); + if (!obj) { + free_object_list(head); + return false; + } + debug_objects_allocated++; + + if (!last) + last = &obj->node; + obj->batch_last = last; + + hlist_add_head(&obj->node, head); + } + return true; +} + static void fill_pool(void) { static atomic_t cpus_allocating; @@ -302,25 +340,14 @@ static void fill_pool(void) atomic_inc(&cpus_allocating); while (pool_should_refill(&pool_global)) { - struct debug_obj *new, *last = NULL; HLIST_HEAD(head); - int cnt; - for (cnt = 0; cnt < ODEBUG_BATCH_SIZE; cnt++) { - new = kmem_cache_zalloc(obj_cache, __GFP_HIGH | __GFP_NOWARN); - if (!new) - break; - hlist_add_head(&new->node, &head); - if (!last) - last = new; - } - if (!cnt) + if (!kmem_alloc_batch(&head, obj_cache, __GFP_HIGH | __GFP_NOWARN)) break; guard(raw_spinlock_irqsave)(&pool_lock); - hlist_splice_init(&head, &last->node, &pool_global.objects); - debug_objects_allocated += cnt; - WRITE_ONCE(pool_global.cnt, pool_global.cnt + cnt); + if (!pool_push_batch(&pool_global, &head)) + pool_push_batch(&pool_to_free, &head); } atomic_dec(&cpus_allocating); } @@ -1302,26 +1329,18 @@ void __init debug_objects_early_init(void) static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache) { struct debug_bucket *db = obj_hash; - struct debug_obj *obj, *new; struct hlist_node *tmp; + struct debug_obj *obj; HLIST_HEAD(objects); int i; - for (i = 0; i < ODEBUG_POOL_SIZE; i++) { - obj = kmem_cache_zalloc(cache, GFP_KERNEL); - if (!obj) + for (i = 0; i < ODEBUG_POOL_SIZE; i += ODEBUG_BATCH_SIZE) { + if (!kmem_alloc_batch(&objects, cache, GFP_KERNEL)) goto free; - hlist_add_head(&obj->node, &objects); + pool_push_batch(&pool_global, &objects); } - debug_objects_allocated = ODEBUG_POOL_SIZE; - pool_global.cnt = ODEBUG_POOL_SIZE; - - /* - * Move the allocated objects to the global pool and disconnect the - * boot pool. - */ - hlist_move_list(&objects, &pool_global.objects); + /* Disconnect the boot pool. */ pool_boot.first = NULL; /* Replace the active object references */ @@ -1329,9 +1348,8 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache hlist_move_list(&db->list, &objects); hlist_for_each_entry(obj, &objects, node) { - new = hlist_entry(pool_global.objects.first, typeof(*obj), node); - hlist_del(&new->node); - pool_global.cnt--; + struct debug_obj *new = pcpu_alloc(); + /* copy object data */ *new = *obj; hlist_add_head(&new->node, &db->list); @@ -1340,7 +1358,7 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache return true; free: /* Can't use free_object_list() as the cache is not populated yet */ - hlist_for_each_entry_safe(obj, tmp, &objects, node) { + hlist_for_each_entry_safe(obj, tmp, &pool_global.objects, node) { hlist_del(&obj->node); kmem_cache_free(cache, obj); } -- cgit v1.2.3 From f57ebb92ba3e09a7e1082f147d6e1456d702d4b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:17 +0200 Subject: debugobjects: Implement batch processing Adding and removing single objects in a loop is bad in terms of lock contention and cache line accesses. To implement batching, record the last object in a batch in the object itself. This is trivialy possible as hlists are strictly stacks. At a batch boundary, when the first object is added to the list the object stores a pointer to itself in debug_obj::batch_last. When the next object is added to the list then the batch_last pointer is retrieved from the first object in the list and stored in the to be added one. That means for batch processing the first object always has a pointer to the last object in a batch, which allows to move batches in a cache line efficient way and reduces the lock held time. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.258995000@linutronix.de --- lib/debugobjects.c | 61 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index cdd5d238eea7..4e80c31f4349 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -149,18 +149,31 @@ static __always_inline bool pool_must_refill(struct obj_pool *pool) static bool pool_move_batch(struct obj_pool *dst, struct obj_pool *src) { - if (dst->cnt + ODEBUG_BATCH_SIZE > dst->max_cnt || !src->cnt) + struct hlist_node *last, *next_batch, *first_batch; + struct debug_obj *obj; + + if (dst->cnt >= dst->max_cnt || !src->cnt) return false; - for (int i = 0; i < ODEBUG_BATCH_SIZE && src->cnt; i++) { - struct hlist_node *node = src->objects.first; + first_batch = src->objects.first; + obj = hlist_entry(first_batch, typeof(*obj), node); + last = obj->batch_last; + next_batch = last->next; - WRITE_ONCE(src->cnt, src->cnt - 1); - WRITE_ONCE(dst->cnt, dst->cnt + 1); + /* Move the next batch to the front of the source pool */ + src->objects.first = next_batch; + if (next_batch) + next_batch->pprev = &src->objects.first; - hlist_del(node); - hlist_add_head(node, &dst->objects); - } + /* Add the extracted batch to the destination pool */ + last->next = dst->objects.first; + if (last->next) + last->next->pprev = &last->next; + first_batch->pprev = &dst->objects.first; + dst->objects.first = first_batch; + + WRITE_ONCE(src->cnt, src->cnt - ODEBUG_BATCH_SIZE); + WRITE_ONCE(dst->cnt, dst->cnt + ODEBUG_BATCH_SIZE); return true; } @@ -182,16 +195,27 @@ static bool pool_push_batch(struct obj_pool *dst, struct hlist_head *head) static bool pool_pop_batch(struct hlist_head *head, struct obj_pool *src) { + struct hlist_node *last, *next; + struct debug_obj *obj; + if (!src->cnt) return false; - for (int i = 0; src->cnt && i < ODEBUG_BATCH_SIZE; i++) { - struct hlist_node *node = src->objects.first; + /* Move the complete list to the head */ + hlist_move_list(&src->objects, head); - WRITE_ONCE(src->cnt, src->cnt - 1); - hlist_del(node); - hlist_add_head(node, head); - } + obj = hlist_entry(head->first, typeof(*obj), node); + last = obj->batch_last; + next = last->next; + /* Disconnect the batch from the list */ + last->next = NULL; + + /* Move the node after last back to the source pool. */ + src->objects.first = next; + if (next) + next->pprev = &src->objects.first; + + WRITE_ONCE(src->cnt, src->cnt - ODEBUG_BATCH_SIZE); return true; } @@ -226,7 +250,7 @@ static struct debug_obj *pcpu_alloc(void) if (!pool_move_batch(pcp, &pool_global)) return NULL; } - obj_pool_used += pcp->cnt; + obj_pool_used += ODEBUG_BATCH_SIZE; if (obj_pool_used > obj_pool_max_used) obj_pool_max_used = obj_pool_used; @@ -239,9 +263,16 @@ static struct debug_obj *pcpu_alloc(void) static void pcpu_free(struct debug_obj *obj) { struct obj_pool *pcp = this_cpu_ptr(&pool_pcpu); + struct debug_obj *first; lockdep_assert_irqs_disabled(); + if (!(pcp->cnt % ODEBUG_BATCH_SIZE)) { + obj->batch_last = &obj->node; + } else { + first = hlist_entry(pcp->objects.first, typeof(*first), node); + obj->batch_last = first->batch_last; + } hlist_add_head(&obj->node, &pcp->objects); pcp->cnt++; -- cgit v1.2.3 From 2638345d22529cdc964d20a617bfd32d87f27e0f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:18 +0200 Subject: debugobjects: Move pool statistics into global_pool struct Keep it along with the pool as that's a hot cache line anyway and it makes the code more comprehensible. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.318776207@linutronix.de --- lib/debugobjects.c | 87 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 35 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 4e80c31f4349..cf704e2bc301 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -47,11 +47,18 @@ struct debug_bucket { raw_spinlock_t lock; }; +struct pool_stats { + unsigned int cur_used; + unsigned int max_used; + unsigned int min_fill; +}; + struct obj_pool { struct hlist_head objects; unsigned int cnt; unsigned int min_cnt; unsigned int max_cnt; + struct pool_stats stats; } ____cacheline_aligned; @@ -66,8 +73,11 @@ static struct debug_obj obj_static_pool[ODEBUG_POOL_SIZE] __initdata; static DEFINE_RAW_SPINLOCK(pool_lock); static struct obj_pool pool_global = { - .min_cnt = ODEBUG_POOL_MIN_LEVEL, - .max_cnt = ODEBUG_POOL_SIZE, + .min_cnt = ODEBUG_POOL_MIN_LEVEL, + .max_cnt = ODEBUG_POOL_SIZE, + .stats = { + .min_fill = ODEBUG_POOL_SIZE, + }, }; static struct obj_pool pool_to_free = { @@ -76,16 +86,6 @@ static struct obj_pool pool_to_free = { static HLIST_HEAD(pool_boot); -/* - * Because of the presence of percpu free pools, obj_pool_free will - * under-count those in the percpu free pools. Similarly, obj_pool_used - * will over-count those in the percpu free pools. Adjustments will be - * made at debug_stats_show(). Both obj_pool_min_free and obj_pool_max_used - * can be off. - */ -static int __data_racy obj_pool_min_free = ODEBUG_POOL_SIZE; -static int obj_pool_used; -static int __data_racy obj_pool_max_used; static bool obj_freeing; static int __data_racy debug_objects_maxchain __read_mostly; @@ -231,6 +231,19 @@ static struct debug_obj *__alloc_object(struct hlist_head *list) return obj; } +static void pcpu_refill_stats(void) +{ + struct pool_stats *stats = &pool_global.stats; + + WRITE_ONCE(stats->cur_used, stats->cur_used + ODEBUG_BATCH_SIZE); + + if (stats->cur_used > stats->max_used) + stats->max_used = stats->cur_used; + + if (pool_global.cnt < stats->min_fill) + stats->min_fill = pool_global.cnt; +} + static struct debug_obj *pcpu_alloc(void) { struct obj_pool *pcp = this_cpu_ptr(&pool_pcpu); @@ -250,13 +263,7 @@ static struct debug_obj *pcpu_alloc(void) if (!pool_move_batch(pcp, &pool_global)) return NULL; } - obj_pool_used += ODEBUG_BATCH_SIZE; - - if (obj_pool_used > obj_pool_max_used) - obj_pool_max_used = obj_pool_used; - - if (pool_global.cnt < obj_pool_min_free) - obj_pool_min_free = pool_global.cnt; + pcpu_refill_stats(); } } @@ -285,7 +292,7 @@ static void pcpu_free(struct debug_obj *obj) /* Try to fit the batch into the pool_global first */ if (!pool_move_batch(&pool_global, pcp)) pool_move_batch(&pool_to_free, pcp); - obj_pool_used -= ODEBUG_BATCH_SIZE; + WRITE_ONCE(pool_global.stats.cur_used, pool_global.stats.cur_used - ODEBUG_BATCH_SIZE); } static void free_object_list(struct hlist_head *head) @@ -1074,23 +1081,33 @@ void debug_check_no_obj_freed(const void *address, unsigned long size) static int debug_stats_show(struct seq_file *m, void *v) { - int cpu, obj_percpu_free = 0; + unsigned int cpu, pool_used, pcp_free = 0; + /* + * pool_global.stats.cur_used is the number of batches currently + * handed out to per CPU pools. Convert it to number of objects + * and subtract the number of free objects in the per CPU pools. + * As this is lockless the number is an estimate. + */ for_each_possible_cpu(cpu) - obj_percpu_free += per_cpu(pool_pcpu.cnt, cpu); - - seq_printf(m, "max_chain :%d\n", debug_objects_maxchain); - seq_printf(m, "max_checked :%d\n", debug_objects_maxchecked); - seq_printf(m, "warnings :%d\n", debug_objects_warnings); - seq_printf(m, "fixups :%d\n", debug_objects_fixups); - seq_printf(m, "pool_free :%d\n", pool_count(&pool_global) + obj_percpu_free); - seq_printf(m, "pool_pcp_free :%d\n", obj_percpu_free); - seq_printf(m, "pool_min_free :%d\n", obj_pool_min_free); - seq_printf(m, "pool_used :%d\n", obj_pool_used - obj_percpu_free); - seq_printf(m, "pool_max_used :%d\n", obj_pool_max_used); - seq_printf(m, "on_free_list :%d\n", pool_count(&pool_to_free)); - seq_printf(m, "objs_allocated:%d\n", debug_objects_allocated); - seq_printf(m, "objs_freed :%d\n", debug_objects_freed); + pcp_free += per_cpu(pool_pcpu.cnt, cpu); + + pool_used = data_race(pool_global.stats.cur_used); + pcp_free = min(pool_used, pcp_free); + pool_used -= pcp_free; + + seq_printf(m, "max_chain : %d\n", debug_objects_maxchain); + seq_printf(m, "max_checked : %d\n", debug_objects_maxchecked); + seq_printf(m, "warnings : %d\n", debug_objects_warnings); + seq_printf(m, "fixups : %d\n", debug_objects_fixups); + seq_printf(m, "pool_free : %u\n", pool_count(&pool_global) + pcp_free); + seq_printf(m, "pool_pcp_free : %u\n", pcp_free); + seq_printf(m, "pool_min_free : %u\n", data_race(pool_global.stats.min_fill)); + seq_printf(m, "pool_used : %u\n", pool_used); + seq_printf(m, "pool_max_used : %u\n", data_race(pool_global.stats.max_used)); + seq_printf(m, "on_free_list : %u\n", pool_count(&pool_to_free)); + seq_printf(m, "objs_allocated: %d\n", debug_objects_allocated); + seq_printf(m, "objs_freed : %d\n", debug_objects_freed); return 0; } DEFINE_SHOW_ATTRIBUTE(debug_stats); -- cgit v1.2.3 From a201a96b9682e5b42ed93108c4aeb6135c909661 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:19 +0200 Subject: debugobjects: Double the per CPU slots In situations where objects are rapidly allocated from the pool and handed back, the size of the per CPU pool turns out to be too small. Double the size of the per CPU pool. This reduces the kmem cache allocation and free operations during a kernel compile: alloc free Baseline: 380k 330k Double size: 295k 245k Especially the reduction of allocations is important because that happens in the hot path when objects are initialized. The maximum increase in per CPU pool memory consumption is about 2.5K per online CPU, which is acceptable. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.378676302@linutronix.de --- lib/debugobjects.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index cf704e2bc301..fc9397de5534 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -28,7 +28,7 @@ #define ODEBUG_POOL_SIZE (64 * ODEBUG_BATCH_SIZE) #define ODEBUG_POOL_MIN_LEVEL (ODEBUG_POOL_SIZE / 4) -#define ODEBUG_POOL_PERCPU_SIZE (4 * ODEBUG_BATCH_SIZE) +#define ODEBUG_POOL_PERCPU_SIZE (8 * ODEBUG_BATCH_SIZE) #define ODEBUG_CHUNK_SHIFT PAGE_SHIFT #define ODEBUG_CHUNK_SIZE (1 << ODEBUG_CHUNK_SHIFT) -- cgit v1.2.3 From 13f9ca723900ae3ae8e0a1e76ba86e7786e60645 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:20 +0200 Subject: debugobjects: Refill per CPU pool more agressively Right now the per CPU pools are only refilled when they become empty. That's suboptimal especially when there are still non-freed objects in the to free list. Check whether an allocation from the per CPU pool emptied a batch and try to allocate from the free pool if that still has objects available. kmem_cache_alloc() kmem_cache_free() Baseline: 295k 245k Refill: 225k 173k Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.439053085@linutronix.de --- lib/debugobjects.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index fc9397de5534..cc32844ca18d 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -255,6 +255,24 @@ static struct debug_obj *pcpu_alloc(void) if (likely(obj)) { pcp->cnt--; + /* + * If this emptied a batch try to refill from the + * free pool. Don't do that if this was the top-most + * batch as pcpu_free() expects the per CPU pool + * to be less than ODEBUG_POOL_PERCPU_SIZE. + */ + if (unlikely(pcp->cnt < (ODEBUG_POOL_PERCPU_SIZE - ODEBUG_BATCH_SIZE) && + !(pcp->cnt % ODEBUG_BATCH_SIZE))) { + /* + * Don't try to allocate from the regular pool here + * to not exhaust it prematurely. + */ + if (pool_count(&pool_to_free)) { + guard(raw_spinlock)(&pool_lock); + pool_move_batch(pcp, &pool_to_free); + pcpu_refill_stats(); + } + } return obj; } -- cgit v1.2.3 From ff8d523cc4520a5ce86cde0fd57c304e2b4f61b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 13 Oct 2024 20:45:57 +0200 Subject: debugobjects: Track object usage to avoid premature freeing of objects The freelist is freed at a constant rate independent of the actual usage requirements. That's bad in scenarios where usage comes in bursts. The end of a burst puts the objects on the free list and freeing proceeds even when the next burst which requires objects started again. Keep track of the usage with a exponentially wheighted moving average and take that into account in the worker function which frees objects from the free list. This further reduces the kmem_cache allocation/free rate for a full kernel compile: kmem_cache_alloc() kmem_cache_free() Baseline: 225k 173k Usage: 170k 117k Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/87bjznhme2.ffs@tglx --- lib/debugobjects.c | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index cc32844ca18d..7f50c4480a4e 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -86,6 +87,7 @@ static struct obj_pool pool_to_free = { static HLIST_HEAD(pool_boot); +static unsigned long avg_usage; static bool obj_freeing; static int __data_racy debug_objects_maxchain __read_mostly; @@ -427,11 +429,31 @@ static struct debug_obj *lookup_object(void *addr, struct debug_bucket *b) return NULL; } +static void calc_usage(void) +{ + static DEFINE_RAW_SPINLOCK(avg_lock); + static unsigned long avg_period; + unsigned long cur, now = jiffies; + + if (!time_after_eq(now, READ_ONCE(avg_period))) + return; + + if (!raw_spin_trylock(&avg_lock)) + return; + + WRITE_ONCE(avg_period, now + msecs_to_jiffies(10)); + cur = READ_ONCE(pool_global.stats.cur_used) * ODEBUG_FREE_WORK_MAX; + WRITE_ONCE(avg_usage, calc_load(avg_usage, EXP_5, cur)); + raw_spin_unlock(&avg_lock); +} + static struct debug_obj *alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *descr) { struct debug_obj *obj; + calc_usage(); + if (static_branch_likely(&obj_cache_enabled)) obj = pcpu_alloc(); else @@ -450,14 +472,26 @@ static struct debug_obj *alloc_object(void *addr, struct debug_bucket *b, /* workqueue function to free objects. */ static void free_obj_work(struct work_struct *work) { - bool free = true; + static unsigned long last_use_avg; + unsigned long cur_used, last_used, delta; + unsigned int max_free = 0; WRITE_ONCE(obj_freeing, false); + /* Rate limit freeing based on current use average */ + cur_used = READ_ONCE(avg_usage); + last_used = last_use_avg; + last_use_avg = cur_used; + if (!pool_count(&pool_to_free)) return; - for (unsigned int cnt = 0; cnt < ODEBUG_FREE_WORK_MAX; cnt++) { + if (cur_used <= last_used) { + delta = (last_used - cur_used) / ODEBUG_FREE_WORK_MAX; + max_free = min(delta, ODEBUG_FREE_WORK_MAX); + } + + for (int cnt = 0; cnt < ODEBUG_FREE_WORK_MAX; cnt++) { HLIST_HEAD(tofree); /* Acquire and drop the lock for each batch */ @@ -468,9 +502,10 @@ static void free_obj_work(struct work_struct *work) /* Refill the global pool if possible */ if (pool_move_batch(&pool_global, &pool_to_free)) { /* Don't free as there seems to be demand */ - free = false; - } else if (free) { + max_free = 0; + } else if (max_free) { pool_pop_batch(&tofree, &pool_to_free); + max_free--; } else { return; } @@ -1110,7 +1145,7 @@ static int debug_stats_show(struct seq_file *m, void *v) for_each_possible_cpu(cpu) pcp_free += per_cpu(pool_pcpu.cnt, cpu); - pool_used = data_race(pool_global.stats.cur_used); + pool_used = READ_ONCE(pool_global.stats.cur_used); pcp_free = min(pool_used, pcp_free); pool_used -= pcp_free; -- cgit v1.2.3 From 5eadeb7b3bc206e2ac9494e9499e7c1f1e44eab7 Mon Sep 17 00:00:00 2001 From: Ahmed Ehab Date: Thu, 5 Sep 2024 04:12:20 +0300 Subject: locking/lockdep: Add a test for lockdep_set_subclass() Add a test case to ensure that no new name string literal will be created in lockdep_set_subclass(), otherwise a warning will be triggered in look_up_lock_class(). Add this to catch the problem in the future. [boqun: Reword the title, replace #if with #ifdef and rename functions and variables] Signed-off-by: Ahmed Ehab Signed-off-by: Boqun Feng Link: https://lore.kernel.org/lkml/20240905011220.356973-1-bottaawesome633@gmail.com/ --- lib/locking-selftest.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'lib') diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 6f6a5fc85b42..6e0c019f71b6 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -2710,6 +2710,43 @@ static void local_lock_3B(void) } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline const char *rw_semaphore_lockdep_name(struct rw_semaphore *rwsem) +{ + return rwsem->dep_map.name; +} +#else +static inline const char *rw_semaphore_lockdep_name(struct rw_semaphore *rwsem) +{ + return NULL; +} +#endif + +static void test_lockdep_set_subclass_name(void) +{ + const char *name_before = rw_semaphore_lockdep_name(&rwsem_X1); + const char *name_after; + + lockdep_set_subclass(&rwsem_X1, 1); + name_after = rw_semaphore_lockdep_name(&rwsem_X1); + DEBUG_LOCKS_WARN_ON(name_before != name_after); +} + +/* + * lockdep_set_subclass() should reuse the existing lock class name instead + * of creating a new one. + */ +static void lockdep_set_subclass_name_test(void) +{ + printk(" --------------------------------------------------------------------------\n"); + printk(" | lockdep_set_subclass() name test|\n"); + printk(" -----------------------------------\n"); + + print_testname("compare name before and after"); + dotest(test_lockdep_set_subclass_name, SUCCESS, LOCKTYPE_RWSEM); + pr_cont("\n"); +} + static void local_lock_tests(void) { printk(" --------------------------------------------------------------------------\n"); @@ -2920,6 +2957,8 @@ void locking_selftest(void) dotest(hardirq_deadlock_softirq_not_deadlock, FAILURE, LOCKTYPE_SPECIAL); pr_cont("\n"); + lockdep_set_subclass_name_test(); + if (unexpected_testcase_failures) { printk("-----------------------------------------------------------------\n"); debug_locks = 0; -- cgit v1.2.3 From 560af5dc839eef08a273908f390cfefefb82aa04 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 9 Oct 2024 17:45:03 +0200 Subject: lockdep: Enable PROVE_RAW_LOCK_NESTING with PROVE_LOCKING. With the printk issues solved, the last known splat created by PROVE_RAW_LOCK_NESTING is gone. Enable PROVE_RAW_LOCK_NESTING by default as part of PROVE_LOCKING. Keep the defines around in case something serious pops up and it needs to be disabled. Signed-off-by: Sebastian Andrzej Siewior Acked-by: Waiman Long Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20241009161041.1018375-2-bigeasy@linutronix.de --- lib/Kconfig.debug | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7315f643817a..5b67816f4a62 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1409,22 +1409,14 @@ config PROVE_LOCKING For more details, see Documentation/locking/lockdep-design.rst. config PROVE_RAW_LOCK_NESTING - bool "Enable raw_spinlock - spinlock nesting checks" + bool depends on PROVE_LOCKING - default n + default y help Enable the raw_spinlock vs. spinlock nesting checks which ensure that the lock nesting rules for PREEMPT_RT enabled kernels are not violated. - NOTE: There are known nesting problems. So if you enable this - option expect lockdep splats until these problems have been fully - addressed which is work in progress. This config switch allows to - identify and analyze these problems. It will be removed and the - check permanently enabled once the main issues have been fixed. - - If unsure, select N. - config LOCK_STAT bool "Lock usage statistics" depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT -- cgit v1.2.3 From 84b4a51fce4ccc6605113ed8af41a3d91609a756 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Mon, 21 Oct 2024 12:11:44 -0700 Subject: selftests: add new kallsyms selftests We lack find_symbol() selftests, so add one. This let's us stress test improvements easily on find_symbol() or optimizations. It also inherently allows us to test the limits of kallsyms on Linux today. We test a pathalogical use case for kallsyms by introducing modules which are automatically written for us with a larger number of symbols. We have 4 kallsyms test modules: A: has KALLSYSMS_NUMSYMS exported symbols B: uses one of A's symbols C: adds KALLSYMS_SCALE_FACTOR * KALLSYSMS_NUMSYMS exported D: adds 2 * the symbols than C By using anything much larger than KALLSYSMS_NUMSYMS as 10,000 and KALLSYMS_SCALE_FACTOR of 8 we segfault today. So we're capped at around 160000 symbols somehow today. We can inpsect that issue at our leasure later, but for now the real value to this test is that this will easily allow us to test improvements on find_symbol(). We want to enable this test on allyesmodconfig builds so we can't use this combination, so instead just use a safe value for now and be informative on the Kconfig symbol documentation about where our thresholds are for testers. We default then to KALLSYSMS_NUMSYMS of just 100 and KALLSYMS_SCALE_FACTOR of 8. On x86_64 we can use perf, for other architectures we just use 'time' and allow for customizations. For example a future enhancements could be done for parisc to check for unaligned accesses which triggers a special special exception handler assembler code inside the kernel. The negative impact on performance is so large on parisc that it keeps track of its accesses on /proc/cpuinfo as UAH: IRQ: CPU0 CPU1 3: 1332 0 SuperIO ttyS0 7: 1270013 0 SuperIO pata_ns87415 64: 320023012 320021431 CPU timer 65: 17080507 20624423 CPU IPI UAH: 10948640 58104 Unaligned access handler traps While at it, this tidies up lib/ test modules to allow us to have a new directory for them. The amount of test modules under lib/ is insane. This should also hopefully showcase how to start doing basic self module writing code, which may be more useful for more complex cases later in the future. Signed-off-by: Luis Chamberlain --- lib/Kconfig.debug | 105 +++++++++++++++++++++ lib/Makefile | 1 + lib/tests/Makefile | 1 + lib/tests/module/.gitignore | 4 + lib/tests/module/Makefile | 15 +++ lib/tests/module/gen_test_kallsyms.sh | 128 ++++++++++++++++++++++++++ tools/testing/selftests/module/Makefile | 12 +++ tools/testing/selftests/module/config | 3 + tools/testing/selftests/module/find_symbol.sh | 81 ++++++++++++++++ 9 files changed, 350 insertions(+) create mode 100644 lib/tests/Makefile create mode 100644 lib/tests/module/.gitignore create mode 100644 lib/tests/module/Makefile create mode 100755 lib/tests/module/gen_test_kallsyms.sh create mode 100644 tools/testing/selftests/module/Makefile create mode 100644 tools/testing/selftests/module/config create mode 100755 tools/testing/selftests/module/find_symbol.sh (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7315f643817a..b5929721fc63 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2903,6 +2903,111 @@ config TEST_KMOD If unsure, say N. +config TEST_RUNTIME + bool + +config TEST_RUNTIME_MODULE + bool + +config TEST_KALLSYMS + tristate "module kallsyms find_symbol() test" + depends on m + select TEST_RUNTIME + select TEST_RUNTIME_MODULE + select TEST_KALLSYMS_A + select TEST_KALLSYMS_B + select TEST_KALLSYMS_C + select TEST_KALLSYMS_D + help + This allows us to stress test find_symbol() through the kallsyms + used to place symbols on the kernel ELF kallsyms and modules kallsyms + where we place kernel symbols such as exported symbols. + + We have four test modules: + + A: has KALLSYSMS_NUMSYMS exported symbols + B: uses one of A's symbols + C: adds KALLSYMS_SCALE_FACTOR * KALLSYSMS_NUMSYMS exported + D: adds 2 * the symbols than C + + We stress test find_symbol() through two means: + + 1) Upon load of B it will trigger simplify_symbols() to look for the + one symbol it uses from the module A with tons of symbols. This is an + indirect way for us to have B call resolve_symbol_wait() upon module + load. This will eventually call find_symbol() which will eventually + try to find the symbols used with find_exported_symbol_in_section(). + find_exported_symbol_in_section() uses bsearch() so a binary search + for each symbol. Binary search will at worst be O(log(n)) so the + larger TEST_MODULE_KALLSYSMS the worse the search. + + 2) The selftests should load C first, before B. Upon B's load towards + the end right before we call module B's init routine we get + complete_formation() called on the module. That will first check + for duplicate symbols with the call to verify_exported_symbols(). + That is when we'll force iteration on module C's insane symbol list. + Since it has 10 * KALLSYMS_NUMSYMS it means we can first test + just loading B without C. The amount of time it takes to load C Vs + B can give us an idea of the impact growth of the symbol space and + give us projection. Module A only uses one symbol from B so to allow + this scaling in module C to be proportional, if it used more symbols + then the first test would be doing more and increasing just the + search space would be slightly different. The last module, module D + will just increase the search space by twice the number of symbols in + C so to allow for full projects. + + tools/testing/selftests/module/find_symbol.sh + + The current defaults will incur a build delay of about 7 minutes + on an x86_64 with only 8 cores. Enable this only if you want to + stress test find_symbol() with thousands of symbols. At the same + time this is also useful to test building modules with thousands of + symbols, and if BTF is enabled this also stress tests adding BTF + information for each module. Currently enabling many more symbols + will segfault the build system. + + If unsure, say N. + +if TEST_KALLSYMS + +config TEST_KALLSYMS_A + tristate + depends on m + +config TEST_KALLSYMS_B + tristate + depends on m + +config TEST_KALLSYMS_C + tristate + depends on m + +config TEST_KALLSYMS_D + tristate + depends on m + +config TEST_KALLSYMS_NUMSYMS + int "test kallsyms number of symbols" + default 100 + help + The number of symbols to create on TEST_KALLSYMS_A, only one of which + module TEST_KALLSYMS_B will use. This also will be used + for how many symbols TEST_KALLSYMS_C will have, scaled up by + TEST_KALLSYMS_SCALE_FACTOR. Note that setting this to 10,000 will + trigger a segfault today, don't use anything close to it unless + you are aware that this should not be used for automated build tests. + +config TEST_KALLSYMS_SCALE_FACTOR + int "test kallsyms scale factor" + default 8 + help + How many more unusued symbols will TEST_KALLSYSMS_C have than + TEST_KALLSYMS_A. If 8, then module C will have 8 * syms + than module A. Then TEST_KALLSYMS_D will have double the amount + of symbols than C so to allow projections. + +endif # TEST_KALLSYMS + config TEST_DEBUG_VIRTUAL tristate "Test CONFIG_DEBUG_VIRTUAL feature" depends on DEBUG_VIRTUAL diff --git a/lib/Makefile b/lib/Makefile index 773adf88af41..ae720c7eb996 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -96,6 +96,7 @@ obj-$(CONFIG_TEST_XARRAY) += test_xarray.o obj-$(CONFIG_TEST_MAPLE_TREE) += test_maple_tree.o obj-$(CONFIG_TEST_PARMAN) += test_parman.o obj-$(CONFIG_TEST_KMOD) += test_kmod.o +obj-$(CONFIG_TEST_RUNTIME) += tests/ obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o diff --git a/lib/tests/Makefile b/lib/tests/Makefile new file mode 100644 index 000000000000..8e4f42cb9c54 --- /dev/null +++ b/lib/tests/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_TEST_RUNTIME_MODULE) += module/ diff --git a/lib/tests/module/.gitignore b/lib/tests/module/.gitignore new file mode 100644 index 000000000000..8be7891b250f --- /dev/null +++ b/lib/tests/module/.gitignore @@ -0,0 +1,4 @@ +test_kallsyms_a.c +test_kallsyms_b.c +test_kallsyms_c.c +test_kallsyms_d.c diff --git a/lib/tests/module/Makefile b/lib/tests/module/Makefile new file mode 100644 index 000000000000..af5c27b996cb --- /dev/null +++ b/lib/tests/module/Makefile @@ -0,0 +1,15 @@ +obj-$(CONFIG_TEST_KALLSYMS_A) += test_kallsyms_a.o +obj-$(CONFIG_TEST_KALLSYMS_B) += test_kallsyms_b.o +obj-$(CONFIG_TEST_KALLSYMS_C) += test_kallsyms_c.o +obj-$(CONFIG_TEST_KALLSYMS_D) += test_kallsyms_d.o + +$(obj)/%.c: FORCE + @$(kecho) " GEN $@" + $(Q)$(srctree)/lib/tests/module/gen_test_kallsyms.sh $@\ + $(CONFIG_TEST_KALLSYMS_NUMSYMS) \ + $(CONFIG_TEST_KALLSYMS_SCALE_FACTOR) + +clean-files += test_kallsyms_a.c +clean-files += test_kallsyms_b.c +clean-files += test_kallsyms_c.c +clean-files += test_kallsyms_d.c diff --git a/lib/tests/module/gen_test_kallsyms.sh b/lib/tests/module/gen_test_kallsyms.sh new file mode 100755 index 000000000000..e85f10dc11bd --- /dev/null +++ b/lib/tests/module/gen_test_kallsyms.sh @@ -0,0 +1,128 @@ +#!/bin/bash + +TARGET=$(basename $1) +DIR=lib/tests/module +TARGET="$DIR/$TARGET" +NUM_SYMS=$2 +SCALE_FACTOR=$3 +TEST_TYPE=$(echo $TARGET | sed -e 's|lib/tests/module/test_kallsyms_||g') +TEST_TYPE=$(echo $TEST_TYPE | sed -e 's|.c||g') + +gen_template_module_header() +{ + cat <<____END_MODULE +// SPDX-License-Identifier: GPL-2.0-or-later OR copyleft-next-0.3.1 +/* + * Copyright (C) 2023 Luis Chamberlain + * + * Automatically generated code for testing, do not edit manually. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +____END_MODULE +} + +gen_num_syms() +{ + PREFIX=$1 + NUM=$2 + for i in $(seq 1 $NUM); do + printf "int auto_test_%s_%010d = 0xff;\n" $PREFIX $i + printf "EXPORT_SYMBOL_GPL(auto_test_%s_%010d);\n" $PREFIX $i + done + echo +} + +gen_template_module_data_a() +{ + gen_num_syms a $1 + cat <<____END_MODULE +static int auto_runtime_test(void) +{ + return 0; +} + +____END_MODULE +} + +gen_template_module_data_b() +{ + printf "\nextern int auto_test_a_%010d;\n\n" 28 + echo "static int auto_runtime_test(void)" + echo "{" + printf "\nreturn auto_test_a_%010d;\n" 28 + echo "}" +} + +gen_template_module_data_c() +{ + gen_num_syms c $1 + cat <<____END_MODULE +static int auto_runtime_test(void) +{ + return 0; +} + +____END_MODULE +} + +gen_template_module_data_d() +{ + gen_num_syms d $1 + cat <<____END_MODULE +static int auto_runtime_test(void) +{ + return 0; +} + +____END_MODULE +} + +gen_template_module_exit() +{ + cat <<____END_MODULE +static int __init auto_test_module_init(void) +{ + return auto_runtime_test(); +} +module_init(auto_test_module_init); + +static void __exit auto_test_module_exit(void) +{ +} +module_exit(auto_test_module_exit); + +MODULE_AUTHOR("Luis Chamberlain "); +MODULE_LICENSE("GPL"); +____END_MODULE +} + +case $TEST_TYPE in + a) + gen_template_module_header > $TARGET + gen_template_module_data_a $NUM_SYMS >> $TARGET + gen_template_module_exit >> $TARGET + ;; + b) + gen_template_module_header > $TARGET + gen_template_module_data_b >> $TARGET + gen_template_module_exit >> $TARGET + ;; + c) + gen_template_module_header > $TARGET + gen_template_module_data_c $((NUM_SYMS * SCALE_FACTOR)) >> $TARGET + gen_template_module_exit >> $TARGET + ;; + d) + gen_template_module_header > $TARGET + gen_template_module_data_d $((NUM_SYMS * SCALE_FACTOR * 2)) >> $TARGET + gen_template_module_exit >> $TARGET + ;; + *) + ;; +esac diff --git a/tools/testing/selftests/module/Makefile b/tools/testing/selftests/module/Makefile new file mode 100644 index 000000000000..6132d7ddb08b --- /dev/null +++ b/tools/testing/selftests/module/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Makefile for module loading selftests + +# No binaries, but make sure arg-less "make" doesn't trigger "run_tests" +all: + +TEST_PROGS := find_symbol.sh + +include ../lib.mk + +# Nothing to clean up. +clean: diff --git a/tools/testing/selftests/module/config b/tools/testing/selftests/module/config new file mode 100644 index 000000000000..b0c206b1ad47 --- /dev/null +++ b/tools/testing/selftests/module/config @@ -0,0 +1,3 @@ +CONFIG_TEST_RUNTIME=y +CONFIG_TEST_RUNTIME_MODULE=y +CONFIG_TEST_KALLSYMS=m diff --git a/tools/testing/selftests/module/find_symbol.sh b/tools/testing/selftests/module/find_symbol.sh new file mode 100755 index 000000000000..140364d3c49f --- /dev/null +++ b/tools/testing/selftests/module/find_symbol.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later OR copyleft-next-0.3.1 +# Copyright (C) 2023 Luis Chamberlain +# +# This is a stress test script for kallsyms through find_symbol() + +set -e + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +test_reqs() +{ + if ! which modprobe 2> /dev/null > /dev/null; then + echo "$0: You need modprobe installed" >&2 + exit $ksft_skip + fi + + if ! which kmod 2> /dev/null > /dev/null; then + echo "$0: You need kmod installed" >&2 + exit $ksft_skip + fi + + if ! which perf 2> /dev/null > /dev/null; then + echo "$0: You need perf installed" >&2 + exit $ksft_skip + fi + + uid=$(id -u) + if [ $uid -ne 0 ]; then + echo $msg must be run as root >&2 + exit $ksft_skip + fi +} + +load_mod() +{ + local STATS="-e duration_time" + STATS="$STATS -e user_time" + STATS="$STATS -e system_time" + STATS="$STATS -e page-faults" + local MOD=$1 + + local ARCH="$(uname -m)" + case "${ARCH}" in + x86_64) + perf stat $STATS $MODPROBE test_kallsyms_b + ;; + *) + time $MODPROBE test_kallsyms_b + exit 1 + ;; + esac +} + +remove_all() +{ + $MODPROBE -r test_kallsyms_b + for i in a b c d; do + $MODPROBE -r test_kallsyms_$i + done +} +test_reqs + +MODPROBE=$( Date: Wed, 16 Oct 2024 20:57:24 +0200 Subject: crypto: crc32 - Provide crc32-arch driver for accelerated library code crc32-generic is currently backed by the architecture's CRC-32 library code, which may offer a variety of implementations depending on the capabilities of the platform. These are not covered by the crypto subsystem's fuzz testing capabilities because crc32-generic is the reference driver that the fuzzing logic uses as a source of truth. Fix this by providing a crc32-arch implementation which is based on the arch library code if available, and modify crc32-generic so it is always based on the generic C implementation. If the arch has no CRC-32 library code, this change does nothing. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/Makefile | 1 + crypto/crc32_generic.c | 94 +++++++++++++++++++++++++++++++++++++------------- lib/crc32.c | 2 ++ 3 files changed, 73 insertions(+), 24 deletions(-) (limited to 'lib') diff --git a/crypto/Makefile b/crypto/Makefile index 81be78d39c2d..3e49a820f148 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -155,6 +155,7 @@ obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o obj-$(CONFIG_CRYPTO_CRC32C) += crc32c_generic.o obj-$(CONFIG_CRYPTO_CRC32) += crc32_generic.o +CFLAGS_crc32_generic.o += -DARCH=$(ARCH) obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif_common.o crct10dif_generic.o obj-$(CONFIG_CRYPTO_CRC64_ROCKSOFT) += crc64_rocksoft_generic.o obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o diff --git a/crypto/crc32_generic.c b/crypto/crc32_generic.c index a989cb44fd16..8d6e1baec509 100644 --- a/crypto/crc32_generic.c +++ b/crypto/crc32_generic.c @@ -59,6 +59,15 @@ static int crc32_update(struct shash_desc *desc, const u8 *data, { u32 *crcp = shash_desc_ctx(desc); + *crcp = crc32_le_base(*crcp, data, len); + return 0; +} + +static int crc32_update_arch(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + *crcp = crc32_le(*crcp, data, len); return 0; } @@ -66,6 +75,13 @@ static int crc32_update(struct shash_desc *desc, const u8 *data, /* No final XOR 0xFFFFFFFF, like crc32_le */ static int __crc32_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) +{ + put_unaligned_le32(crc32_le_base(*crcp, data, len), out); + return 0; +} + +static int __crc32_finup_arch(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) { put_unaligned_le32(crc32_le(*crcp, data, len), out); return 0; @@ -77,6 +93,12 @@ static int crc32_finup(struct shash_desc *desc, const u8 *data, return __crc32_finup(shash_desc_ctx(desc), data, len, out); } +static int crc32_finup_arch(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup_arch(shash_desc_ctx(desc), data, len, out); +} + static int crc32_final(struct shash_desc *desc, u8 *out) { u32 *crcp = shash_desc_ctx(desc); @@ -88,38 +110,62 @@ static int crc32_final(struct shash_desc *desc, u8 *out) static int crc32_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len, - out); + return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len, out); } -static struct shash_alg alg = { - .setkey = crc32_setkey, - .init = crc32_init, - .update = crc32_update, - .final = crc32_final, - .finup = crc32_finup, - .digest = crc32_digest, - .descsize = sizeof(u32), - .digestsize = CHKSUM_DIGEST_SIZE, - .base = { - .cra_name = "crc32", - .cra_driver_name = "crc32-generic", - .cra_priority = 100, - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, - .cra_init = crc32_cra_init, - } -}; + +static int crc32_digest_arch(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup_arch(crypto_shash_ctx(desc->tfm), data, len, out); +} + +static struct shash_alg algs[] = {{ + .setkey = crc32_setkey, + .init = crc32_init, + .update = crc32_update, + .final = crc32_final, + .finup = crc32_finup, + .digest = crc32_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + + .base.cra_name = "crc32", + .base.cra_driver_name = "crc32-generic", + .base.cra_priority = 100, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_blocksize = CHKSUM_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(u32), + .base.cra_module = THIS_MODULE, + .base.cra_init = crc32_cra_init, +}, { + .setkey = crc32_setkey, + .init = crc32_init, + .update = crc32_update_arch, + .final = crc32_final, + .finup = crc32_finup_arch, + .digest = crc32_digest_arch, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + + .base.cra_name = "crc32", + .base.cra_driver_name = "crc32-" __stringify(ARCH), + .base.cra_priority = 150, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_blocksize = CHKSUM_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(u32), + .base.cra_module = THIS_MODULE, + .base.cra_init = crc32_cra_init, +}}; static int __init crc32_mod_init(void) { - return crypto_register_shash(&alg); + /* register the arch flavor only if it differs from the generic one */ + return crypto_register_shashes(algs, 1 + (&crc32_le != &crc32_le_base)); } static void __exit crc32_mod_fini(void) { - crypto_unregister_shash(&alg); + crypto_unregister_shashes(algs, 1 + (&crc32_le != &crc32_le_base)); } subsys_initcall(crc32_mod_init); diff --git a/lib/crc32.c b/lib/crc32.c index 5649847d0a8d..a54ba87b7073 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -205,6 +205,8 @@ EXPORT_SYMBOL(crc32_le); EXPORT_SYMBOL(__crc32c_le); u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le); +EXPORT_SYMBOL(crc32_le_base); + u32 __pure __crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le); u32 __pure crc32_be_base(u32, unsigned char const *, size_t) __alias(crc32_be); -- cgit v1.2.3 From 16739efac6e1ea40df5ec7a263e664481840e73a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 16 Oct 2024 20:57:25 +0200 Subject: crypto: crc32c - Provide crc32c-arch driver for accelerated library code crc32c-generic is currently backed by the architecture's CRC-32c library code, which may offer a variety of implementations depending on the capabilities of the platform. These are not covered by the crypto subsystem's fuzz testing capabilities because crc32c-generic is the reference driver that the fuzzing logic uses as a source of truth. Fix this by providing a crc32c-arch implementation which is based on the arch library code if available, and modify crc32c-generic so it is always based on the generic C implementation. If the arch has no CRC-32c library code, this change does nothing. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/Makefile | 1 + crypto/crc32c_generic.c | 94 +++++++++++++++++++++++++++++++++++++------------ lib/crc32.c | 2 ++ 3 files changed, 75 insertions(+), 22 deletions(-) (limited to 'lib') diff --git a/crypto/Makefile b/crypto/Makefile index 3e49a820f148..77abca715445 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -155,6 +155,7 @@ obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o obj-$(CONFIG_CRYPTO_CRC32C) += crc32c_generic.o obj-$(CONFIG_CRYPTO_CRC32) += crc32_generic.o +CFLAGS_crc32c_generic.o += -DARCH=$(ARCH) CFLAGS_crc32_generic.o += -DARCH=$(ARCH) obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif_common.o crct10dif_generic.o obj-$(CONFIG_CRYPTO_CRC64_ROCKSOFT) += crc64_rocksoft_generic.o diff --git a/crypto/crc32c_generic.c b/crypto/crc32c_generic.c index 768614738541..3f928d7f4ade 100644 --- a/crypto/crc32c_generic.c +++ b/crypto/crc32c_generic.c @@ -85,6 +85,15 @@ static int chksum_update(struct shash_desc *desc, const u8 *data, { struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + ctx->crc = __crc32c_le_base(ctx->crc, data, length); + return 0; +} + +static int chksum_update_arch(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + ctx->crc = __crc32c_le(ctx->crc, data, length); return 0; } @@ -98,6 +107,13 @@ static int chksum_final(struct shash_desc *desc, u8 *out) } static int __chksum_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) +{ + put_unaligned_le32(~__crc32c_le_base(*crcp, data, len), out); + return 0; +} + +static int __chksum_finup_arch(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) { put_unaligned_le32(~__crc32c_le(*crcp, data, len), out); return 0; @@ -111,6 +127,14 @@ static int chksum_finup(struct shash_desc *desc, const u8 *data, return __chksum_finup(&ctx->crc, data, len, out); } +static int chksum_finup_arch(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup_arch(&ctx->crc, data, len, out); +} + static int chksum_digest(struct shash_desc *desc, const u8 *data, unsigned int length, u8 *out) { @@ -119,6 +143,14 @@ static int chksum_digest(struct shash_desc *desc, const u8 *data, return __chksum_finup(&mctx->key, data, length, out); } +static int chksum_digest_arch(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); + + return __chksum_finup_arch(&mctx->key, data, length, out); +} + static int crc32c_cra_init(struct crypto_tfm *tfm) { struct chksum_ctx *mctx = crypto_tfm_ctx(tfm); @@ -127,35 +159,53 @@ static int crc32c_cra_init(struct crypto_tfm *tfm) return 0; } -static struct shash_alg alg = { - .digestsize = CHKSUM_DIGEST_SIZE, - .setkey = chksum_setkey, - .init = chksum_init, - .update = chksum_update, - .final = chksum_final, - .finup = chksum_finup, - .digest = chksum_digest, - .descsize = sizeof(struct chksum_desc_ctx), - .base = { - .cra_name = "crc32c", - .cra_driver_name = "crc32c-generic", - .cra_priority = 100, - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct chksum_ctx), - .cra_module = THIS_MODULE, - .cra_init = crc32c_cra_init, - } -}; +static struct shash_alg algs[] = {{ + .digestsize = CHKSUM_DIGEST_SIZE, + .setkey = chksum_setkey, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + + .base.cra_name = "crc32c", + .base.cra_driver_name = "crc32c-generic", + .base.cra_priority = 100, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_blocksize = CHKSUM_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct chksum_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_init = crc32c_cra_init, +}, { + .digestsize = CHKSUM_DIGEST_SIZE, + .setkey = chksum_setkey, + .init = chksum_init, + .update = chksum_update_arch, + .final = chksum_final, + .finup = chksum_finup_arch, + .digest = chksum_digest_arch, + .descsize = sizeof(struct chksum_desc_ctx), + + .base.cra_name = "crc32c", + .base.cra_driver_name = "crc32c-" __stringify(ARCH), + .base.cra_priority = 150, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_blocksize = CHKSUM_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct chksum_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_init = crc32c_cra_init, +}}; static int __init crc32c_mod_init(void) { - return crypto_register_shash(&alg); + /* register the arch flavor only if it differs from the generic one */ + return crypto_register_shashes(algs, 1 + (&__crc32c_le != &__crc32c_le_base)); } static void __exit crc32c_mod_fini(void) { - crypto_unregister_shash(&alg); + crypto_unregister_shashes(algs, 1 + (&__crc32c_le != &__crc32c_le_base)); } subsys_initcall(crc32c_mod_init); diff --git a/lib/crc32.c b/lib/crc32.c index a54ba87b7073..ff587fee3893 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -208,6 +208,8 @@ u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le); EXPORT_SYMBOL(crc32_le_base); u32 __pure __crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le); +EXPORT_SYMBOL(__crc32c_le_base); + u32 __pure crc32_be_base(u32, unsigned char const *, size_t) __alias(crc32_be); /* -- cgit v1.2.3 From 4964a1d91cd186b423666aac6d4ad3a61cf88b54 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Oct 2024 16:53:43 -0700 Subject: crypto: api - move crypto_simd_disabled_for_test to lib Move crypto_simd_disabled_for_test to lib/ so that crypto_simd_usable() can be used by library code. This was discussed previously (https://lore.kernel.org/linux-crypto/20220716062920.210381-4-ebiggers@kernel.org/) but was not done because there was no use case yet. However, this is now needed for the arm64 CRC32 library code. Tested with: export ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- echo CONFIG_CRC32=y > .config echo CONFIG_MODULES=y >> .config echo CONFIG_CRYPTO=m >> .config echo CONFIG_DEBUG_KERNEL=y >> .config echo CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=n >> .config echo CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y >> .config make olddefconfig make -j$(nproc) Signed-off-by: Eric Biggers Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/algapi.c | 6 ------ lib/crypto/Makefile | 2 ++ lib/crypto/simd.c | 11 +++++++++++ 3 files changed, 13 insertions(+), 6 deletions(-) create mode 100644 lib/crypto/simd.c (limited to 'lib') diff --git a/crypto/algapi.c b/crypto/algapi.c index 74e2261c184c..429a832f90fe 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -6,7 +6,6 @@ */ #include -#include #include #include #include @@ -23,11 +22,6 @@ static LIST_HEAD(crypto_template_list); -#ifdef CONFIG_CRYPTO_MANAGER_EXTRA_TESTS -DEFINE_PER_CPU(bool, crypto_simd_disabled_for_test); -EXPORT_PER_CPU_SYMBOL_GPL(crypto_simd_disabled_for_test); -#endif - static inline void crypto_check_module_sig(struct module *mod) { if (fips_enabled && mod && !module_sig_ok(mod)) diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 969baab8c805..01fac1cd05a1 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -58,3 +58,5 @@ libcurve25519-y += curve25519-selftest.o endif obj-$(CONFIG_MPILIB) += mpi/ + +obj-$(CONFIG_CRYPTO_MANAGER_EXTRA_TESTS) += simd.o diff --git a/lib/crypto/simd.c b/lib/crypto/simd.c new file mode 100644 index 000000000000..9c36cb3bb49c --- /dev/null +++ b/lib/crypto/simd.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SIMD testing utility functions + * + * Copyright 2024 Google LLC + */ + +#include + +DEFINE_PER_CPU(bool, crypto_simd_disabled_for_test); +EXPORT_PER_CPU_SYMBOL_GPL(crypto_simd_disabled_for_test); -- cgit v1.2.3 From 8e7f07e608864dcf7cabc9c252ca02b6ca9ff0d4 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 25 Oct 2024 19:46:53 -0500 Subject: test printf: Add very basic struct resource tests The printf tests for struct resource were stubbed out. struct range printing will leverage the struct resource implementation. To prevent regression add some basic sanity tests for struct resource. Reviewed-by: Andy Shevchenko Reviewed-by: Jonathan Cameron Reviewed-by: Fan Ni Tested-by: Fan Ni Acked-by: Petr Mladek Link: https://patch.msgid.link/20241007-dcd-type2-upstream-v4-1-c261ee6eeded@intel.com Tested-by: Petr Mladek Signed-off-by: Ira Weiny Link: https://patch.msgid.link/20241025-cxl-pra-v2-1-123a825daba2@intel.com Signed-off-by: Dave Jiang --- lib/test_printf.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'lib') diff --git a/lib/test_printf.c b/lib/test_printf.c index 8448b6d02bd9..5afdf5efc627 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -386,6 +386,50 @@ kernel_ptr(void) static void __init struct_resource(void) { + struct resource test_resource = { + .start = 0xc0ffee00, + .end = 0xc0ffee00, + .flags = IORESOURCE_MEM, + }; + + test("[mem 0xc0ffee00 flags 0x200]", + "%pr", &test_resource); + + test_resource = (struct resource) { + .start = 0xc0ffee, + .end = 0xba5eba11, + .flags = IORESOURCE_MEM, + }; + test("[mem 0x00c0ffee-0xba5eba11 flags 0x200]", + "%pr", &test_resource); + + test_resource = (struct resource) { + .start = 0xba5eba11, + .end = 0xc0ffee, + .flags = IORESOURCE_MEM, + }; + test("[mem 0xba5eba11-0x00c0ffee flags 0x200]", + "%pr", &test_resource); + + test_resource = (struct resource) { + .start = 0xba5eba11, + .end = 0xba5eca11, + .flags = IORESOURCE_MEM, + }; + + test("[mem 0xba5eba11-0xba5eca11 flags 0x200]", + "%pr", &test_resource); + + test_resource = (struct resource) { + .start = 0xba11, + .end = 0xca10, + .flags = IORESOURCE_IO | + IORESOURCE_DISABLED | + IORESOURCE_UNSET, + }; + + test("[io size 0x1000 disabled]", + "%pR", &test_resource); } static void __init -- cgit v1.2.3 From 4261974701851630951e9ab31f0de4ade0faea53 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 25 Oct 2024 19:46:55 -0500 Subject: printf: Add print format (%pra) for struct range The use of struct range in the CXL subsystem is growing. In particular, the addition of Dynamic Capacity devices uses struct range in a number of places which are reported in debug and error messages. To wit requiring the printing of the start/end fields in each print became cumbersome. Dan Williams mentions in [1] that it might be time to have a print specifier for struct range similar to struct resource. A few alternatives were considered including '%par', '%r', and '%pn'. %pra follows that struct range is similar to struct resource (%p[rR]) but needs to be different. Based on discussions with Petr and Andy '%pra' was chosen.[2] Andy also suggested to keep the range prints similar to struct resource though combined code. Add hex_range() to handle printing for both pointer types. Finally introduce DEFINE_RANGE() as a parallel to DEFINE_RES_*() and use it in the tests. Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: open list Link: https://lore.kernel.org/all/663922b475e50_d54d72945b@dwillia2-xfh.jf.intel.com.notmuch/ [1] Link: https://lore.kernel.org/all/66cea3bf3332f_f937b29424@iweiny-mobl.notmuch/ [2] Suggested-by: Dan Williams Signed-off-by: Ira Weiny Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20241025-cxl-pra-v2-3-123a825daba2@intel.com Signed-off-by: Dave Jiang --- Documentation/core-api/printk-formats.rst | 13 +++++++ include/linux/range.h | 6 ++++ lib/test_printf.c | 17 +++++++++ lib/vsprintf.c | 57 +++++++++++++++++++++++++++---- 4 files changed, 87 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 552f51046cf3..ecccc0473da9 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -236,6 +236,19 @@ width of the CPU data path. Passed by reference. +Struct Range +------------ + +:: + + %pra [range 0x0000000060000000-0x000000006fffffff] or + [range 0x0000000060000000] + +For printing struct range. struct range holds an arbitrary range of u64 +values. If start is equal to end only print the start value. + +Passed by reference. + DMA address types dma_addr_t ---------------------------- diff --git a/include/linux/range.h b/include/linux/range.h index 6ad0b73cb7ad..1358d4b1807a 100644 --- a/include/linux/range.h +++ b/include/linux/range.h @@ -31,4 +31,10 @@ int clean_sort_range(struct range *range, int az); void sort_range(struct range *range, int nr_range); +#define DEFINE_RANGE(_start, _end) \ +(struct range) { \ + .start = (_start), \ + .end = (_end), \ + } + #endif diff --git a/lib/test_printf.c b/lib/test_printf.c index 5afdf5efc627..59dbe4f9a4cb 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -432,6 +432,22 @@ struct_resource(void) "%pR", &test_resource); } +static void __init +struct_range(void) +{ + struct range test_range = DEFINE_RANGE(0xc0ffee00ba5eba11, + 0xc0ffee00ba5eba11); + test("[range 0xc0ffee00ba5eba11]", "%pra", &test_range); + + test_range = DEFINE_RANGE(0xc0ffee, 0xba5eba11); + test("[range 0x0000000000c0ffee-0x00000000ba5eba11]", + "%pra", &test_range); + + test_range = DEFINE_RANGE(0xba5eba11, 0xc0ffee); + test("[range 0x00000000ba5eba11-0x0000000000c0ffee]", + "%pra", &test_range); +} + static void __init addr(void) { @@ -807,6 +823,7 @@ test_pointer(void) symbol_ptr(); kernel_ptr(); struct_resource(); + struct_range(); addr(); escaped_str(); hex_string(); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index c5e2ec9303c5..6ac02bbb7df1 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1039,6 +1039,20 @@ static const struct printf_spec default_dec04_spec = { .flags = ZEROPAD, }; +static noinline_for_stack +char *hex_range(char *buf, char *end, u64 start_val, u64 end_val, + struct printf_spec spec) +{ + buf = number(buf, end, start_val, spec); + if (start_val == end_val) + return buf; + + if (buf < end) + *buf = '-'; + ++buf; + return number(buf, end, end_val, spec); +} + static noinline_for_stack char *resource_string(char *buf, char *end, struct resource *res, struct printf_spec spec, const char *fmt) @@ -1115,11 +1129,7 @@ char *resource_string(char *buf, char *end, struct resource *res, p = string_nocheck(p, pend, "size ", str_spec); p = number(p, pend, resource_size(res), *specp); } else { - p = number(p, pend, res->start, *specp); - if (res->start != res->end) { - *p++ = '-'; - p = number(p, pend, res->end, *specp); - } + p = hex_range(p, pend, res->start, res->end, *specp); } if (decode) { if (res->flags & IORESOURCE_MEM_64) @@ -1140,6 +1150,31 @@ char *resource_string(char *buf, char *end, struct resource *res, return string_nocheck(buf, end, sym, spec); } +static noinline_for_stack +char *range_string(char *buf, char *end, const struct range *range, + struct printf_spec spec, const char *fmt) +{ + char sym[sizeof("[range 0x0123456789abcdef-0x0123456789abcdef]")]; + char *p = sym, *pend = sym + sizeof(sym); + + struct printf_spec range_spec = { + .field_width = 2 + 2 * sizeof(range->start), /* 0x + 2 * 8 */ + .flags = SPECIAL | SMALL | ZEROPAD, + .base = 16, + .precision = -1, + }; + + if (check_pointer(&buf, end, range, spec)) + return buf; + + p = string_nocheck(p, pend, "[range ", default_str_spec); + p = hex_range(p, pend, range->start, range->end, range_spec); + *p++ = ']'; + *p = '\0'; + + return string_nocheck(buf, end, sym, spec); +} + static noinline_for_stack char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) @@ -2229,6 +2264,15 @@ char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode, return widen_string(buf, buf - buf_start, end, spec); } +static noinline_for_stack +char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr, + struct printf_spec spec) +{ + if (*fmt == 'r' && fmt[1] == 'a') + return range_string(buf, end, ptr, spec, fmt); + return resource_string(buf, end, ptr, spec, fmt); +} + int __init no_hash_pointers_enable(char *str) { if (no_hash_pointers) @@ -2277,6 +2321,7 @@ char *rust_fmt_argument(char *buf, char *end, void *ptr); * - 'Bb' as above with module build ID (for use in backtraces) * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref] * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201] + * - 'ra' For struct ranges, e.g., [range 0x0000000000000000 - 0x00000000000000ff] * - 'b[l]' For a bitmap, the number of bits is determined by the field * width which must be explicitly specified either as part of the * format string '%32b[l]' or through '%*b[l]', [l] selects @@ -2401,7 +2446,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, return symbol_string(buf, end, ptr, spec, fmt); case 'R': case 'r': - return resource_string(buf, end, ptr, spec, fmt); + return resource_or_range(fmt, buf, end, ptr, spec); case 'h': return hex_string(buf, end, ptr, spec, fmt); case 'b': -- cgit v1.2.3 From 1dc82675cb79200d5e140520efd7ce88b38ea56d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 3 Oct 2024 17:16:13 -0400 Subject: lib/math/test_div64: add some edge cases relevant to __div64_const32() Be sure to test the extreme cases with and without bias. Signed-off-by: Nicolas Pitre Signed-off-by: Arnd Bergmann --- lib/math/test_div64.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 83 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/math/test_div64.c b/lib/math/test_div64.c index c15edd688dd2..3cd699b654d9 100644 --- a/lib/math/test_div64.c +++ b/lib/math/test_div64.c @@ -26,6 +26,9 @@ static const u64 test_div64_dividends[] = { 0x0072db27380dd689, 0x0842f488162e2284, 0xf66745411d8ab063, + 0xfffffffffffffffb, + 0xfffffffffffffffc, + 0xffffffffffffffff, }; #define SIZE_DIV64_DIVIDENDS ARRAY_SIZE(test_div64_dividends) @@ -37,7 +40,10 @@ static const u64 test_div64_dividends[] = { #define TEST_DIV64_DIVISOR_5 0x0008a880 #define TEST_DIV64_DIVISOR_6 0x003fd3ae #define TEST_DIV64_DIVISOR_7 0x0b658fac -#define TEST_DIV64_DIVISOR_8 0xdc08b349 +#define TEST_DIV64_DIVISOR_8 0x80000001 +#define TEST_DIV64_DIVISOR_9 0xdc08b349 +#define TEST_DIV64_DIVISOR_A 0xfffffffe +#define TEST_DIV64_DIVISOR_B 0xffffffff static const u32 test_div64_divisors[] = { TEST_DIV64_DIVISOR_0, @@ -49,13 +55,16 @@ static const u32 test_div64_divisors[] = { TEST_DIV64_DIVISOR_6, TEST_DIV64_DIVISOR_7, TEST_DIV64_DIVISOR_8, + TEST_DIV64_DIVISOR_9, + TEST_DIV64_DIVISOR_A, + TEST_DIV64_DIVISOR_B, }; #define SIZE_DIV64_DIVISORS ARRAY_SIZE(test_div64_divisors) static const struct { u64 quotient; u32 remainder; -} test_div64_results[SIZE_DIV64_DIVISORS][SIZE_DIV64_DIVIDENDS] = { +} test_div64_results[SIZE_DIV64_DIVIDENDS][SIZE_DIV64_DIVISORS] = { { { 0x0000000013045e47, 0x00000001 }, { 0x000000000161596c, 0x00000030 }, @@ -65,6 +74,9 @@ static const struct { { 0x00000000000013c4, 0x0004ce80 }, { 0x00000000000002ae, 0x001e143c }, { 0x000000000000000f, 0x0033e56c }, + { 0x0000000000000001, 0x2b27507f }, + { 0x0000000000000000, 0xab275080 }, + { 0x0000000000000000, 0xab275080 }, { 0x0000000000000000, 0xab275080 }, }, { { 0x00000001c45c02d1, 0x00000000 }, @@ -75,7 +87,10 @@ static const struct { { 0x000000000001d637, 0x0004e5d9 }, { 0x0000000000003fc9, 0x000713bb }, { 0x0000000000000165, 0x029abe7d }, + { 0x000000000000001f, 0x673c193a }, { 0x0000000000000012, 0x6e9f7e37 }, + { 0x000000000000000f, 0xe73c1977 }, + { 0x000000000000000f, 0xe73c1968 }, }, { { 0x000000197a3a0cf7, 0x00000002 }, { 0x00000001d9632e5c, 0x00000021 }, @@ -85,7 +100,10 @@ static const struct { { 0x00000000001a7bb3, 0x00072331 }, { 0x00000000000397ad, 0x0002c61b }, { 0x000000000000141e, 0x06ea2e89 }, + { 0x00000000000001ca, 0x4c0a72e7 }, { 0x000000000000010a, 0xab002ad7 }, + { 0x00000000000000e5, 0x4c0a767b }, + { 0x00000000000000e5, 0x4c0a7596 }, }, { { 0x0000017949e37538, 0x00000001 }, { 0x0000001b62441f37, 0x00000055 }, @@ -95,7 +113,10 @@ static const struct { { 0x0000000001882ec6, 0x0005cbf9 }, { 0x000000000035333b, 0x0017abdf }, { 0x00000000000129f1, 0x0ab4520d }, + { 0x0000000000001a87, 0x18ff0472 }, { 0x0000000000000f6e, 0x8ac0ce9b }, + { 0x0000000000000d43, 0x98ff397f }, + { 0x0000000000000d43, 0x98ff2c3c }, }, { { 0x000011f321a74e49, 0x00000006 }, { 0x0000014d8481d211, 0x0000005b }, @@ -105,7 +126,10 @@ static const struct { { 0x0000000012a88828, 0x00036c97 }, { 0x000000000287f16f, 0x002c2a25 }, { 0x00000000000e2cc7, 0x02d581e3 }, + { 0x0000000000014318, 0x2ee07d7f }, { 0x000000000000bbf4, 0x1ba08c03 }, + { 0x000000000000a18c, 0x2ee303af }, + { 0x000000000000a18c, 0x2ee26223 }, }, { { 0x0000d8db8f72935d, 0x00000005 }, { 0x00000fbd5aed7a2e, 0x00000002 }, @@ -115,7 +139,10 @@ static const struct { { 0x00000000e16b20fa, 0x0002a14a }, { 0x000000001e940d22, 0x00353b2e }, { 0x0000000000ab40ac, 0x06fba6ba }, + { 0x00000000000f3f70, 0x0af7eeda }, { 0x000000000008debd, 0x72d98365 }, + { 0x0000000000079fb8, 0x0b166dba }, + { 0x0000000000079fb8, 0x0b0ece02 }, }, { { 0x000cc3045b8fc281, 0x00000000 }, { 0x0000ed1f48b5c9fc, 0x00000079 }, @@ -125,7 +152,10 @@ static const struct { { 0x0000000d43fce827, 0x00082b09 }, { 0x00000001ccaba11a, 0x0037e8dd }, { 0x000000000a13f729, 0x0566dffd }, + { 0x0000000000e5b64e, 0x3728203b }, { 0x000000000085a14b, 0x23d36726 }, + { 0x000000000072db27, 0x38f38cd7 }, + { 0x000000000072db27, 0x3880b1b0 }, }, { { 0x00eafeb9c993592b, 0x00000001 }, { 0x00110e5befa9a991, 0x00000048 }, @@ -135,7 +165,10 @@ static const struct { { 0x000000f4459740fc, 0x00084484 }, { 0x0000002122c47bf9, 0x002ca446 }, { 0x00000000b9936290, 0x004979c4 }, + { 0x000000001085e910, 0x05a83974 }, { 0x00000000099ca89d, 0x9db446bf }, + { 0x000000000842f488, 0x26b40b94 }, + { 0x000000000842f488, 0x1e71170c }, }, { { 0x1b60cece589da1d2, 0x00000001 }, { 0x01fcb42be1453f5b, 0x0000004f }, @@ -145,7 +178,49 @@ static const struct { { 0x00001c757dfab350, 0x00048863 }, { 0x000003dc4979c652, 0x00224ea7 }, { 0x000000159edc3144, 0x06409ab3 }, + { 0x00000001ecce8a7e, 0x30bc25e5 }, { 0x000000011eadfee3, 0xa99c48a8 }, + { 0x00000000f6674543, 0x0a593ae9 }, + { 0x00000000f6674542, 0x13f1f5a5 }, + }, { + { 0x1c71c71c71c71c71, 0x00000002 }, + { 0x0210842108421084, 0x0000000b }, + { 0x007f01fc07f01fc0, 0x000000fb }, + { 0x00014245eabf1f9a, 0x0000a63d }, + { 0x0000ffffffffffff, 0x0000fffb }, + { 0x00001d913cecc509, 0x0007937b }, + { 0x00000402c70c678f, 0x0005bfc9 }, + { 0x00000016766cb70b, 0x045edf97 }, + { 0x00000001fffffffb, 0x80000000 }, + { 0x0000000129d84b3a, 0xa2e8fe71 }, + { 0x0000000100000001, 0xfffffffd }, + { 0x0000000100000000, 0xfffffffb }, + }, { + { 0x1c71c71c71c71c71, 0x00000003 }, + { 0x0210842108421084, 0x0000000c }, + { 0x007f01fc07f01fc0, 0x000000fc }, + { 0x00014245eabf1f9a, 0x0000a63e }, + { 0x0000ffffffffffff, 0x0000fffc }, + { 0x00001d913cecc509, 0x0007937c }, + { 0x00000402c70c678f, 0x0005bfca }, + { 0x00000016766cb70b, 0x045edf98 }, + { 0x00000001fffffffc, 0x00000000 }, + { 0x0000000129d84b3a, 0xa2e8fe72 }, + { 0x0000000100000002, 0x00000000 }, + { 0x0000000100000000, 0xfffffffc }, + }, { + { 0x1c71c71c71c71c71, 0x00000006 }, + { 0x0210842108421084, 0x0000000f }, + { 0x007f01fc07f01fc0, 0x000000ff }, + { 0x00014245eabf1f9a, 0x0000a641 }, + { 0x0000ffffffffffff, 0x0000ffff }, + { 0x00001d913cecc509, 0x0007937f }, + { 0x00000402c70c678f, 0x0005bfcd }, + { 0x00000016766cb70b, 0x045edf9b }, + { 0x00000001fffffffc, 0x00000003 }, + { 0x0000000129d84b3a, 0xa2e8fe75 }, + { 0x0000000100000002, 0x00000003 }, + { 0x0000000100000001, 0x00000000 }, }, }; @@ -208,6 +283,12 @@ static bool __init test_div64(void) return false; if (!test_div64_one(dividend, TEST_DIV64_DIVISOR_8, i, 8)) return false; + if (!test_div64_one(dividend, TEST_DIV64_DIVISOR_9, i, 9)) + return false; + if (!test_div64_one(dividend, TEST_DIV64_DIVISOR_A, i, 10)) + return false; + if (!test_div64_one(dividend, TEST_DIV64_DIVISOR_B, i, 11)) + return false; for (j = 0; j < SIZE_DIV64_DIVISORS; j++) { if (!test_div64_one(dividend, test_div64_divisors[j], i, j)) -- cgit v1.2.3 From b660d0a2acb9053d627befbb259a397d26aa9582 Mon Sep 17 00:00:00 2001 From: Julian Vetter Date: Mon, 28 Oct 2024 14:42:24 +0100 Subject: New implementation for IO memcpy and IO memset The IO memcpy and IO memset functions in asm-generic/io.h simply call memcpy and memset. This can lead to alignment problems or faults on architectures that do not define their own version and fall back to these defaults. This patch introduces new implementations for IO memcpy and IO memset, that use read{l,q} accessor functions, align accesses to machine word size, and resort to byte accesses when the target memory is not aligned. For new architectures and existing ones that were using the old fallbacks these functions are save to use, because IO memory constraints are taken into account. Moreover, architectures with similar implementations can now use these new versions, not needing to implement their own. Reviewed-by: Yann Sionneau Signed-off-by: Julian Vetter Signed-off-by: Arnd Bergmann --- include/asm-generic/io.h | 22 ++------ lib/Makefile | 2 +- lib/iomem_copy.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 140 insertions(+), 20 deletions(-) create mode 100644 lib/iomem_copy.c (limited to 'lib') diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 1027be6a62bc..a5cbbf3e26ec 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1211,7 +1211,6 @@ static inline void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) #endif #ifndef memset_io -#define memset_io memset_io /** * memset_io Set a range of I/O memory to a constant value * @addr: The beginning of the I/O-memory range to set @@ -1220,15 +1219,10 @@ static inline void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) * * Set a range of I/O memory to a given value. */ -static inline void memset_io(volatile void __iomem *addr, int value, - size_t size) -{ - memset(__io_virt(addr), value, size); -} +void memset_io(volatile void __iomem *addr, int val, size_t count); #endif #ifndef memcpy_fromio -#define memcpy_fromio memcpy_fromio /** * memcpy_fromio Copy a block of data from I/O memory * @dst: The (RAM) destination for the copy @@ -1237,16 +1231,10 @@ static inline void memset_io(volatile void __iomem *addr, int value, * * Copy a block of data from I/O memory. */ -static inline void memcpy_fromio(void *buffer, - const volatile void __iomem *addr, - size_t size) -{ - memcpy(buffer, __io_virt(addr), size); -} +void memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count); #endif #ifndef memcpy_toio -#define memcpy_toio memcpy_toio /** * memcpy_toio Copy a block of data into I/O memory * @dst: The (I/O memory) destination for the copy @@ -1255,11 +1243,7 @@ static inline void memcpy_fromio(void *buffer, * * Copy a block of data to I/O memory. */ -static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer, - size_t size) -{ - memcpy(__io_virt(addr), buffer, size); -} +void memcpy_toio(volatile void __iomem *dst, const void *src, size_t count); #endif extern int devmem_is_allowed(unsigned long pfn); diff --git a/lib/Makefile b/lib/Makefile index 773adf88af41..db4717538fad 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -35,7 +35,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ nmi_backtrace.o win_minmax.o memcat_p.o \ - buildid.o objpool.o union_find.o + buildid.o objpool.o union_find.o iomem_copy.o lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/iomem_copy.c b/lib/iomem_copy.c new file mode 100644 index 000000000000..24c2b59a0ad2 --- /dev/null +++ b/lib/iomem_copy.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2024 Kalray, Inc. All Rights Reserved. + */ + +#include +#include +#include +#include +#include + +#ifndef memset_io +/** + * memset_io Set a range of I/O memory to a constant value + * @addr: The beginning of the I/O-memory range to set + * @val: The value to set the memory to + * @count: The number of bytes to set + * + * Set a range of I/O memory to a given value. + */ +void memset_io(volatile void __iomem *addr, int val, size_t count) +{ + long qc = (u8)val; + + qc *= ~0UL / 0xff; + + while (count && !IS_ALIGNED((long)addr, sizeof(long))) { + __raw_writeb(val, addr); + addr++; + count--; + } + + while (count >= sizeof(long)) { +#ifdef CONFIG_64BIT + __raw_writeq(qc, addr); +#else + __raw_writel(qc, addr); +#endif + + addr += sizeof(long); + count -= sizeof(long); + } + + while (count) { + __raw_writeb(val, addr); + addr++; + count--; + } +} +EXPORT_SYMBOL(memset_io); +#endif + +#ifndef memcpy_fromio +/** + * memcpy_fromio Copy a block of data from I/O memory + * @dst: The (RAM) destination for the copy + * @src: The (I/O memory) source for the data + * @count: The number of bytes to copy + * + * Copy a block of data from I/O memory. + */ +void memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) +{ + while (count && !IS_ALIGNED((long)src, sizeof(long))) { + *(u8 *)dst = __raw_readb(src); + src++; + dst++; + count--; + } + + while (count >= sizeof(long)) { +#ifdef CONFIG_64BIT + long val = __raw_readq(src); +#else + long val = __raw_readl(src); +#endif + put_unaligned(val, (long *)dst); + + + src += sizeof(long); + dst += sizeof(long); + count -= sizeof(long); + } + + while (count) { + *(u8 *)dst = __raw_readb(src); + src++; + dst++; + count--; + } +} +EXPORT_SYMBOL(memcpy_fromio); +#endif + +#ifndef memcpy_toio +/** + * memcpy_toio Copy a block of data into I/O memory + * @dst: The (I/O memory) destination for the copy + * @src: The (RAM) source for the data + * @count: The number of bytes to copy + * + * Copy a block of data to I/O memory. + */ +void memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) +{ + while (count && !IS_ALIGNED((long)dst, sizeof(long))) { + __raw_writeb(*(u8 *)src, dst); + src++; + dst++; + count--; + } + + while (count >= sizeof(long)) { + long val = get_unaligned((long *)src); +#ifdef CONFIG_64BIT + __raw_writeq(val, dst); +#else + __raw_writel(val, dst); +#endif + + src += sizeof(long); + dst += sizeof(long); + count -= sizeof(long); + } + + while (count) { + __raw_writeb(*(u8 *)src, dst); + src++; + dst++; + count--; + } +} +EXPORT_SYMBOL(memcpy_toio); +#endif + + -- cgit v1.2.3 From af0847537031c0c7cc0a168776ac21e29fa74c5c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 28 Oct 2024 16:29:54 +0000 Subject: selftests: kallsyms: add MODULE_DESCRIPTION The newly added test script creates modules that are lacking a description line in order to build cleanly: WARNING: modpost: missing MODULE_DESCRIPTION() in lib/tests/module/test_kallsyms_a.o WARNING: modpost: missing MODULE_DESCRIPTION() in lib/tests/module/test_kallsyms_b.o WARNING: modpost: missing MODULE_DESCRIPTION() in lib/tests/module/test_kallsyms_c.o WARNING: modpost: missing MODULE_DESCRIPTION() in lib/tests/module/test_kallsyms_d.o Fixes: 84b4a51fce4c ("selftests: add new kallsyms selftests") Signed-off-by: Arnd Bergmann Signed-off-by: Luis Chamberlain --- lib/tests/module/gen_test_kallsyms.sh | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/tests/module/gen_test_kallsyms.sh b/lib/tests/module/gen_test_kallsyms.sh index e85f10dc11bd..ae5966f1f904 100755 --- a/lib/tests/module/gen_test_kallsyms.sh +++ b/lib/tests/module/gen_test_kallsyms.sh @@ -99,6 +99,7 @@ module_exit(auto_test_module_exit); MODULE_AUTHOR("Luis Chamberlain "); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Test module for kallsyms"); ____END_MODULE } -- cgit v1.2.3 From 5a8b4b4001252fd64f4f0756d4f8cdfd61eccb77 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 29 Oct 2024 07:14:29 +0000 Subject: lib/iomem_copy: fix kerneldoc format style The newly added file did not quite get the punctuation right: lib/iomem_copy.c:14: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410290907.0mDZVYPK-lkp@intel.com/ Signed-off-by: Arnd Bergmann --- lib/iomem_copy.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/iomem_copy.c b/lib/iomem_copy.c index 24c2b59a0ad2..dec7eaea60e0 100644 --- a/lib/iomem_copy.c +++ b/lib/iomem_copy.c @@ -11,10 +11,10 @@ #ifndef memset_io /** - * memset_io Set a range of I/O memory to a constant value - * @addr: The beginning of the I/O-memory range to set - * @val: The value to set the memory to - * @count: The number of bytes to set + * memset_io() - Set a range of I/O memory to a constant value + * @addr: The beginning of the I/O-memory range to set + * @val: The value to set the memory to + * @count: The number of bytes to set * * Set a range of I/O memory to a given value. */ @@ -52,10 +52,10 @@ EXPORT_SYMBOL(memset_io); #ifndef memcpy_fromio /** - * memcpy_fromio Copy a block of data from I/O memory - * @dst: The (RAM) destination for the copy - * @src: The (I/O memory) source for the data - * @count: The number of bytes to copy + * memcpy_fromio() - Copy a block of data from I/O memory + * @dst: The (RAM) destination for the copy + * @src: The (I/O memory) source for the data + * @count: The number of bytes to copy * * Copy a block of data from I/O memory. */ @@ -94,10 +94,10 @@ EXPORT_SYMBOL(memcpy_fromio); #ifndef memcpy_toio /** - * memcpy_toio Copy a block of data into I/O memory - * @dst: The (I/O memory) destination for the copy - * @src: The (RAM) source for the data - * @count: The number of bytes to copy + * memcpy_toio() -Copy a block of data into I/O memory + * @dst: The (I/O memory) destination for the copy + * @src: The (RAM) source for the data + * @count: The number of bytes to copy * * Copy a block of data to I/O memory. */ -- cgit v1.2.3 From e4e535bff2bc82bb49a633775f9834beeaa527db Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 24 Oct 2024 07:00:15 +0200 Subject: iov_iter: don't require contiguous pages in iov_iter_extract_bvec_pages The iov_iter_extract_pages interface allows to return physically discontiguous pages, as long as all but the first and last page in the array are page aligned and page size. Rewrite iov_iter_extract_bvec_pages to take advantage of that instead of only returning ranges of physically contiguous pages. Signed-off-by: Ming Lei [hch: minor cleanups, new commit log] Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241024050021.627350-1-hch@lst.de Signed-off-by: Jens Axboe --- lib/iov_iter.c | 67 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 22 deletions(-) (limited to 'lib') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 1abb32c0da50..9fc06f5fb748 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1677,8 +1677,8 @@ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, } /* - * Extract a list of contiguous pages from an ITER_BVEC iterator. This does - * not get references on the pages, nor does it get a pin on them. + * Extract a list of virtually contiguous pages from an ITER_BVEC iterator. + * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, @@ -1686,35 +1686,58 @@ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, iov_iter_extraction_t extraction_flags, size_t *offset0) { - struct page **p, *page; - size_t skip = i->iov_offset, offset, size; - int k; + size_t skip = i->iov_offset, size = 0; + struct bvec_iter bi; + int k = 0; - for (;;) { - if (i->nr_segs == 0) - return 0; - size = min(maxsize, i->bvec->bv_len - skip); - if (size) - break; + if (i->nr_segs == 0) + return 0; + + if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->nr_segs--; i->bvec++; skip = 0; } + bi.bi_size = maxsize + skip; + bi.bi_bvec_done = skip; + + maxpages = want_pages_array(pages, maxsize, skip, maxpages); + + while (bi.bi_size && bi.bi_idx < i->nr_segs) { + struct bio_vec bv = bvec_iter_bvec(i->bvec, bi); + + /* + * The iov_iter_extract_pages interface only allows an offset + * into the first page. Break out of the loop if we see an + * offset into subsequent pages, the caller will have to call + * iov_iter_extract_pages again for the reminder. + */ + if (k) { + if (bv.bv_offset) + break; + } else { + *offset0 = bv.bv_offset; + } - skip += i->bvec->bv_offset; - page = i->bvec->bv_page + skip / PAGE_SIZE; - offset = skip % PAGE_SIZE; - *offset0 = offset; + (*pages)[k++] = bv.bv_page; + size += bv.bv_len; - maxpages = want_pages_array(pages, size, offset, maxpages); - if (!maxpages) - return -ENOMEM; - p = *pages; - for (k = 0; k < maxpages; k++) - p[k] = page + k; + if (k >= maxpages) + break; + + /* + * We are done when the end of the bvec doesn't align to a page + * boundary as that would create a hole in the returned space. + * The caller will handle this with another call to + * iov_iter_extract_pages. + */ + if (bv.bv_offset + bv.bv_len != PAGE_SIZE) + break; + + bvec_iter_advance_single(i->bvec, &bi, bv.bv_len); + } - size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); iov_iter_advance(i, size); return size; } -- cgit v1.2.3 From db71aae70e3e646d8ba4cb50e4bd4c281a91c804 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Sat, 26 Oct 2024 12:53:36 +0000 Subject: net: checksum: Move from32to16() to generic header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit from32to16() is used by lib/checksum.c and also by arch/parisc/lib/checksum.c. The next patch will use it in the bpf_csum_diff helper. Move from32to16() to the include/net/checksum.h as csum_from32to16() and remove other implementations. Signed-off-by: Puranjay Mohan Signed-off-by: Daniel Borkmann Reviewed-by: Toke Høiland-Jørgensen Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20241026125339.26459-2-puranjay@kernel.org --- arch/parisc/lib/checksum.c | 13 ++----------- include/net/checksum.h | 6 ++++++ lib/checksum.c | 11 +---------- 3 files changed, 9 insertions(+), 21 deletions(-) (limited to 'lib') diff --git a/arch/parisc/lib/checksum.c b/arch/parisc/lib/checksum.c index 4818f3db84a5..59d8c15d81bd 100644 --- a/arch/parisc/lib/checksum.c +++ b/arch/parisc/lib/checksum.c @@ -25,15 +25,6 @@ : "=r"(_t) \ : "r"(_r), "0"(_t)); -static inline unsigned short from32to16(unsigned int x) -{ - /* 32 bits --> 16 bits + carry */ - x = (x & 0xffff) + (x >> 16); - /* 16 bits + carry --> 16 bits including carry */ - x = (x & 0xffff) + (x >> 16); - return (unsigned short)x; -} - static inline unsigned int do_csum(const unsigned char * buff, int len) { int odd, count; @@ -85,7 +76,7 @@ static inline unsigned int do_csum(const unsigned char * buff, int len) } if (len & 1) result += le16_to_cpu(*buff); - result = from32to16(result); + result = csum_from32to16(result); if (odd) result = swab16(result); out: @@ -102,7 +93,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum) { unsigned int result = do_csum(buff, len); addc(result, sum); - return (__force __wsum)from32to16(result); + return (__force __wsum)csum_from32to16(result); } EXPORT_SYMBOL(csum_partial); diff --git a/include/net/checksum.h b/include/net/checksum.h index 1338cb92c8e7..243f972267b8 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -151,6 +151,12 @@ static inline void csum_replace(__wsum *csum, __wsum old, __wsum new) *csum = csum_add(csum_sub(*csum, old), new); } +static inline unsigned short csum_from32to16(unsigned int sum) +{ + sum += (sum >> 16) | (sum << 16); + return (unsigned short)(sum >> 16); +} + struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr); diff --git a/lib/checksum.c b/lib/checksum.c index 6860d6b05a17..025ba546e1ec 100644 --- a/lib/checksum.c +++ b/lib/checksum.c @@ -34,15 +34,6 @@ #include #ifndef do_csum -static inline unsigned short from32to16(unsigned int x) -{ - /* add up 16-bit and 16-bit for 16+c bit */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - static unsigned int do_csum(const unsigned char *buff, int len) { int odd; @@ -90,7 +81,7 @@ static unsigned int do_csum(const unsigned char *buff, int len) #else result += (*buff << 8); #endif - result = from32to16(result); + result = csum_from32to16(result); if (odd) result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); out: -- cgit v1.2.3 From 496a51b37143c690a06612a6bd58827ef2341761 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 31 Oct 2024 19:02:24 +0800 Subject: lib/iov_iter.c: initialize bi.bi_idx before iterating over bvec Initialize bi.bi_idx as 0 before iterating over bvec, otherwise garbage data can be used as ->bi_idx. Cc: Christoph Hellwig Reported-and-tested-by: Klara Modin Fixes: e4e535bff2bc ("iov_iter: don't require contiguous pages in iov_iter_extract_bvec_pages") Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- lib/iov_iter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 9fc06f5fb748..c761f6db3cb4 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1699,6 +1699,7 @@ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, i->bvec++; skip = 0; } + bi.bi_idx = 0; bi.bi_size = maxsize + skip; bi.bi_bvec_done = skip; -- cgit v1.2.3 From a911bad094b010e276f072fe9a599b66e59ed5fe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Oct 2024 19:14:25 +0000 Subject: dql: annotate data-races around dql->last_obj_cnt dql->last_obj_cnt is read/written from different contexts, without any lock synchronization. Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing. Signed-off-by: Eric Dumazet Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241029191425.2519085-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/dynamic_queue_limits.h | 2 +- lib/dynamic_queue_limits.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h index 281298e77a15..808b1a5102e7 100644 --- a/include/linux/dynamic_queue_limits.h +++ b/include/linux/dynamic_queue_limits.h @@ -127,7 +127,7 @@ static inline void dql_queued(struct dql *dql, unsigned int count) if (WARN_ON_ONCE(count > DQL_MAX_OBJECT)) return; - dql->last_obj_cnt = count; + WRITE_ONCE(dql->last_obj_cnt, count); /* We want to force a write first, so that cpu do not attempt * to get cache line containing last_obj_cnt, num_queued, adj_limit diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c index e49deddd3de9..c1b7638a594a 100644 --- a/lib/dynamic_queue_limits.c +++ b/lib/dynamic_queue_limits.c @@ -179,7 +179,7 @@ void dql_completed(struct dql *dql, unsigned int count) dql->adj_limit = limit + completed; dql->prev_ovlimit = ovlimit; - dql->prev_last_obj_cnt = dql->last_obj_cnt; + dql->prev_last_obj_cnt = READ_ONCE(dql->last_obj_cnt); dql->num_completed = completed; dql->prev_num_queued = num_queued; -- cgit v1.2.3 From 341468e0ab4bb1b26ad0b0cee7c6db4bf283043a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 2 Nov 2024 09:42:11 +0800 Subject: lib/iov_iter: fix bvec iterator setup .bi_size of bvec iterator should be initialized as real max size for walking, and .bi_bvec_done just counts how many bytes need to be skipped in the 1st bvec, so .bi_size isn't related with .bi_bvec_done. This patch fixes bvec iterator initialization, and the inner `size` check isn't needed any more, so revert Eric Dumazet's commit 7bc802acf193 ("iov-iter: do not return more bytes than requested in iov_iter_extract_bvec_pages()"). Cc: Eric Dumazet Fixes: e4e535bff2bc ("iov_iter: don't require contiguous pages in iov_iter_extract_bvec_pages") Reported-by: syzbot+71abe7ab2b70bca770fd@syzkaller.appspotmail.com Tested-by: syzbot+71abe7ab2b70bca770fd@syzkaller.appspotmail.com Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- lib/iov_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index c761f6db3cb4..4a54c7af62c0 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1700,7 +1700,7 @@ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, skip = 0; } bi.bi_idx = 0; - bi.bi_size = maxsize + skip; + bi.bi_size = maxsize; bi.bi_bvec_done = skip; maxpages = want_pages_array(pages, maxsize, skip, maxpages); -- cgit v1.2.3 From d44d26987bb3df6d76556827097fc9ce17565cb8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Oct 2024 13:04:07 +0100 Subject: timekeeping: Remove CONFIG_DEBUG_TIMEKEEPING Since 135225a363ae timekeeping_cycles_to_ns() handles large offsets which would lead to 64bit multiplication overflows correctly. It's also protected against negative motion of the clocksource unconditionally, which was exclusive to x86 before. timekeeping_advance() handles large offsets already correctly. That means the value of CONFIG_DEBUG_TIMEKEEPING which analyzed these cases is very close to zero. Remove all of it. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241031120328.536010148@linutronix.de --- arch/riscv/configs/defconfig | 1 - include/linux/timekeeper_internal.h | 16 --- kernel/time/timekeeping.c | 108 +-------------------- lib/Kconfig.debug | 13 --- .../testing/selftests/wireguard/qemu/debug.config | 1 - 5 files changed, 3 insertions(+), 136 deletions(-) (limited to 'lib') diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 2341393cfac1..26c01b9e3434 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -301,7 +301,6 @@ CONFIG_DEBUG_MEMORY_INIT=y CONFIG_DEBUG_PER_CPU_MAPS=y CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_WQ_WATCHDOG=y -CONFIG_DEBUG_TIMEKEEPING=y CONFIG_DEBUG_RT_MUTEXES=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index a3b6380a7777..e39d4d563b19 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -76,9 +76,6 @@ struct tk_read_base { * ntp shifted nano seconds. * @ntp_err_mult: Multiplication factor for scaled math conversion * @skip_second_overflow: Flag used to avoid updating NTP twice with same second - * @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING) - * @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING) - * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) * * Note: For timespec(64) based interfaces wall_to_monotonic is what * we need to add to xtime (or xtime corrected for sub jiffy times) @@ -147,19 +144,6 @@ struct timekeeper { u32 ntp_error_shift; u32 ntp_err_mult; u32 skip_second_overflow; - -#ifdef CONFIG_DEBUG_TIMEKEEPING - long last_warning; - /* - * These simple flag variables are managed - * without locks, which is racy, but they are - * ok since we don't really care about being - * super precise about how many events were - * seen, just that a problem was observed. - */ - int underflow_seen; - int overflow_seen; -#endif }; #ifdef CONFIG_GENERIC_TIME_VSYSCALL diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 17cae886ca82..d115adebc418 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -226,97 +226,6 @@ static inline u64 tk_clock_read(const struct tk_read_base *tkr) return clock->read(clock); } -#ifdef CONFIG_DEBUG_TIMEKEEPING -#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ - -static void timekeeping_check_update(struct timekeeper *tk, u64 offset) -{ - - u64 max_cycles = tk->tkr_mono.clock->max_cycles; - const char *name = tk->tkr_mono.clock->name; - - if (offset > max_cycles) { - printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", - offset, name, max_cycles); - printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); - } else { - if (offset > (max_cycles >> 1)) { - printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", - offset, name, max_cycles >> 1); - printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); - } - } - - if (tk->underflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); - tk->last_warning = jiffies; - } - tk->underflow_seen = 0; - } - - if (tk->overflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); - tk->last_warning = jiffies; - } - tk->overflow_seen = 0; - } -} - -static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles); - -static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr) -{ - struct timekeeper *tk = &tk_core.timekeeper; - u64 now, last, mask, max, delta; - unsigned int seq; - - /* - * Since we're called holding a seqcount, the data may shift - * under us while we're doing the calculation. This can cause - * false positives, since we'd note a problem but throw the - * results away. So nest another seqcount here to atomically - * grab the points we are checking with. - */ - do { - seq = read_seqcount_begin(&tk_core.seq); - now = tk_clock_read(tkr); - last = tkr->cycle_last; - mask = tkr->mask; - max = tkr->clock->max_cycles; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - delta = clocksource_delta(now, last, mask); - - /* - * Try to catch underflows by checking if we are seeing small - * mask-relative negative values. - */ - if (unlikely((~delta & mask) < (mask >> 3))) - tk->underflow_seen = 1; - - /* Check for multiplication overflows */ - if (unlikely(delta > max)) - tk->overflow_seen = 1; - - /* timekeeping_cycles_to_ns() handles both under and overflow */ - return timekeeping_cycles_to_ns(tkr, now); -} -#else -static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset) -{ -} -static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr) -{ - BUG(); -} -#endif - /** * tk_setup_internals - Set up internals to use clocksource clock. * @@ -421,19 +330,11 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; } -static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr) +static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr)); } -static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) -{ - if (IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING)) - return timekeeping_debug_get_ns(tkr); - - return __timekeeping_get_ns(tkr); -} - /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update @@ -477,7 +378,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - now += __timekeeping_get_ns(tkr); + now += timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); return now; @@ -593,7 +494,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) tkr = tkf->base + (seq & 0x01); basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); - delta = __timekeeping_get_ns(tkr); + delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); if (mono) @@ -2333,9 +2234,6 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; - /* Do some additional sanity checking */ - timekeeping_check_update(tk, offset); - /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7315f643817a..14977b9fc254 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1328,19 +1328,6 @@ config SCHEDSTATS endmenu -config DEBUG_TIMEKEEPING - bool "Enable extra timekeeping sanity checking" - help - This option will enable additional timekeeping sanity checks - which may be helpful when diagnosing issues where timekeeping - problems are suspected. - - This may include checks in the timekeeping hotpaths, so this - option may have a (very small) performance impact to some - workloads. - - If unsure, say N. - config DEBUG_PREEMPT bool "Debug preemptible kernel" depends on DEBUG_KERNEL && PREEMPTION && TRACE_IRQFLAGS_SUPPORT diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index 9d172210e2c6..139fd9aa8b12 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -31,7 +31,6 @@ CONFIG_SCHED_DEBUG=y CONFIG_SCHED_INFO=y CONFIG_SCHEDSTATS=y CONFIG_SCHED_STACK_END_CHECK=y -CONFIG_DEBUG_TIMEKEEPING=y CONFIG_DEBUG_PREEMPT=y CONFIG_DEBUG_RT_MUTEXES=y CONFIG_DEBUG_SPINLOCK=y -- cgit v1.2.3 From a508ef4b1dcc82227edc594ffae583874dd425d7 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 1 Nov 2024 21:54:53 +0100 Subject: lib: string_helpers: silence snprintf() output truncation warning The output of ".%03u" with the unsigned int in range [0, 4294966295] may get truncated if the target buffer is not 12 bytes. This can't really happen here as the 'remainder' variable cannot exceed 999 but the compiler doesn't know it. To make it happy just increase the buffer to where the warning goes away. Fixes: 3c9f3681d0b4 ("[SCSI] lib: add generic helper to print sizes rounded to the correct SI range") Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Cc: James E.J. Bottomley Cc: Kees Cook Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton Link: https://lore.kernel.org/r/20241101205453.9353-1-brgl@bgdev.pl Signed-off-by: Kees Cook --- lib/string_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 4f887aa62fa0..91fa37b5c510 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -57,7 +57,7 @@ int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, static const unsigned int rounding[] = { 500, 50, 5 }; int i = 0, j; u32 remainder = 0, sf_cap; - char tmp[8]; + char tmp[12]; const char *unit; tmp[0] = '\0'; -- cgit v1.2.3 From a865276872ec4f129f8a582634be82dcc275dc2a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 30 Oct 2024 18:23:25 -0600 Subject: dim: make dim_calc_stats() inputs const pointers Make the start and end arguments to dim_calc_stats() const pointers to clarify that the function does not modify their values. Signed-off-by: Caleb Sander Mateos Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Arthur Kiyanovski Link: https://patch.msgid.link/20241031002326.3426181-1-csander@purestorage.com Signed-off-by: Jakub Kicinski --- include/linux/dim.h | 3 ++- lib/dim/dim.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/include/linux/dim.h b/include/linux/dim.h index 1b581ff25a15..84579a50ae7f 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -351,7 +351,8 @@ void dim_park_tired(struct dim *dim); * Takes into consideration counter wrap-around. * Returned boolean indicates whether curr_stats are reliable. */ -bool dim_calc_stats(struct dim_sample *start, struct dim_sample *end, +bool dim_calc_stats(const struct dim_sample *start, + const struct dim_sample *end, struct dim_stats *curr_stats); /** diff --git a/lib/dim/dim.c b/lib/dim/dim.c index 83b65ac74d73..97c3d084ebf0 100644 --- a/lib/dim/dim.c +++ b/lib/dim/dim.c @@ -54,7 +54,8 @@ void dim_park_tired(struct dim *dim) } EXPORT_SYMBOL(dim_park_tired); -bool dim_calc_stats(struct dim_sample *start, struct dim_sample *end, +bool dim_calc_stats(const struct dim_sample *start, + const struct dim_sample *end, struct dim_stats *curr_stats) { /* u32 holds up to 71 minutes, should be enough */ -- cgit v1.2.3 From 61bf0009a7657d394d942c8ee961b9ea5f2168fe Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 30 Oct 2024 18:23:26 -0600 Subject: dim: pass dim_sample to net_dim() by reference net_dim() is currently passed a struct dim_sample argument by value. struct dim_sample is 24 bytes. Since this is greater 16 bytes, x86-64 passes it on the stack. All callers have already initialized dim_sample on the stack, so passing it by value requires pushing a duplicated copy to the stack. Either witing to the stack and immediately reading it, or perhaps dereferencing addresses relative to the stack pointer in a chain of push instructions, seems to perform quite poorly. In a heavy TCP workload, mlx5e_handle_rx_dim() consumes 3% of CPU time, 94% of which is attributed to the first push instruction to copy dim_sample on the stack for the call to net_dim(): // Call ktime_get() 0.26 |4ead2: call 4ead7 // Pass the address of struct dim in %rdi |4ead7: lea 0x3d0(%rbx),%rdi // Set dim_sample.pkt_ctr |4eade: mov %r13d,0x8(%rsp) // Set dim_sample.byte_ctr |4eae3: mov %r12d,0xc(%rsp) // Set dim_sample.event_ctr 0.15 |4eae8: mov %bp,0x10(%rsp) // Duplicate dim_sample on the stack 94.16 |4eaed: push 0x10(%rsp) 2.79 |4eaf1: push 0x10(%rsp) 0.07 |4eaf5: push %rax // Call net_dim() 0.21 |4eaf6: call 4eafb To allow the caller to reuse the struct dim_sample already on the stack, pass the struct dim_sample by reference to net_dim(). Signed-off-by: Caleb Sander Mateos Reviewed-by: Vladimir Oltean Reviewed-by: Shannon Nelson Reviewed-by: Florian Fainelli Reviewed-by: Arthur Kiyanovski Reviewed-by: Louis Peens Link: https://patch.msgid.link/20241031002326.3426181-2-csander@purestorage.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_dim.rst | 2 +- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- drivers/net/ethernet/broadcom/bcmsysport.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++-- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 2 +- drivers/net/ethernet/freescale/enetc/enetc.c | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_txrx.c | 4 ++-- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 4 ++-- drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c | 2 +- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 4 ++-- drivers/net/ethernet/netronome/nfp/nfd3/dp.c | 4 ++-- drivers/net/ethernet/netronome/nfp/nfdk/dp.c | 4 ++-- drivers/net/ethernet/pensando/ionic/ionic_txrx.c | 2 +- drivers/net/virtio_net.c | 2 +- drivers/soc/fsl/dpio/dpio-service.c | 2 +- include/linux/dim.h | 2 +- lib/dim/net_dim.c | 10 +++++----- 19 files changed, 31 insertions(+), 31 deletions(-) (limited to 'lib') diff --git a/Documentation/networking/net_dim.rst b/Documentation/networking/net_dim.rst index 8908fd7b0a8d..4377998e6826 100644 --- a/Documentation/networking/net_dim.rst +++ b/Documentation/networking/net_dim.rst @@ -156,7 +156,7 @@ usage is not complete but it should make the outline of the usage clear. my_entity->bytes, &dim_sample); /* Call net DIM */ - net_dim(&my_entity->dim, dim_sample); + net_dim(&my_entity->dim, &dim_sample); ... } diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 96df20854eb9..63c8a2328142 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1383,7 +1383,7 @@ static void ena_adjust_adaptive_rx_intr_moderation(struct ena_napi *ena_napi) rx_ring->rx_stats.bytes, &dim_sample); - net_dim(&ena_napi->dim, dim_sample); + net_dim(&ena_napi->dim, &dim_sample); rx_ring->per_napi_packets = 0; } diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index caff6e87a488..031e9e0cca53 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1029,7 +1029,7 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget) if (priv->dim.use_dim) { dim_update_sample(priv->dim.event_ctr, priv->dim.packets, priv->dim.bytes, &dim_sample); - net_dim(&priv->dim.dim, dim_sample); + net_dim(&priv->dim.dim, &dim_sample); } return work_done; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 6dd6541d8619..ca42b81133d7 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3102,7 +3102,7 @@ static int bnxt_poll(struct napi_struct *napi, int budget) cpr->rx_packets, cpr->rx_bytes, &dim_sample); - net_dim(&cpr->dim, dim_sample); + net_dim(&cpr->dim, &dim_sample); } return work_done; } @@ -3233,7 +3233,7 @@ poll_done: cpr_rx->rx_packets, cpr_rx->rx_bytes, &dim_sample); - net_dim(&cpr->dim, dim_sample); + net_dim(&cpr->dim, &dim_sample); } return work_done; } diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 10966ab15373..53a949eb9180 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -2405,7 +2405,7 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget) if (ring->dim.use_dim) { dim_update_sample(ring->dim.event_ctr, ring->dim.packets, ring->dim.bytes, &dim_sample); - net_dim(&ring->dim.dim, dim_sample); + net_dim(&ring->dim.dim, &dim_sample); } return work_done; diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index c09370eab319..05dedea6185a 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -718,7 +718,7 @@ static void enetc_rx_net_dim(struct enetc_int_vector *v) v->rx_ring.stats.packets, v->rx_ring.stats.bytes, &dim_sample); - net_dim(&v->rx_dim, dim_sample); + net_dim(&v->rx_dim, &dim_sample); } static int enetc_bd_ready_count(struct enetc_bdr *tx_ring, int ci) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index b09f0cca34dc..fbfd3ee5648f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -4478,7 +4478,7 @@ static void hns3_update_rx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) dim_update_sample(tqp_vector->event_cnt, rx_group->total_packets, rx_group->total_bytes, &sample); - net_dim(&rx_group->dim, sample); + net_dim(&rx_group->dim, &sample); } static void hns3_update_tx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) @@ -4491,7 +4491,7 @@ static void hns3_update_tx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) dim_update_sample(tqp_vector->event_cnt, tx_group->total_packets, tx_group->total_bytes, &sample); - net_dim(&tx_group->dim, sample); + net_dim(&tx_group->dim, &sample); } static int hns3_nic_common_poll(struct napi_struct *napi, int budget) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 8208055d6e7f..5d2d7736fd5f 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1352,14 +1352,14 @@ static void ice_net_dim(struct ice_q_vector *q_vector) struct dim_sample dim_sample; __ice_update_sample(q_vector, tx, &dim_sample, true); - net_dim(&tx->dim, dim_sample); + net_dim(&tx->dim, &dim_sample); } if (ITR_IS_DYNAMIC(rx)) { struct dim_sample dim_sample; __ice_update_sample(q_vector, rx, &dim_sample, false); - net_dim(&rx->dim, dim_sample); + net_dim(&rx->dim, &dim_sample); } } diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index d4e6f0e10487..da2a5becf62f 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3679,7 +3679,7 @@ static void idpf_net_dim(struct idpf_q_vector *q_vector) idpf_update_dim_sample(q_vector, &dim_sample, &q_vector->tx_dim, packets, bytes); - net_dim(&q_vector->tx_dim, dim_sample); + net_dim(&q_vector->tx_dim, &dim_sample); check_rx_itr: if (!IDPF_ITR_IS_DYNAMIC(q_vector->rx_intr_mode)) @@ -3698,7 +3698,7 @@ check_rx_itr: idpf_update_dim_sample(q_vector, &dim_sample, &q_vector->rx_dim, packets, bytes); - net_dim(&q_vector->rx_dim, dim_sample); + net_dim(&q_vector->rx_dim, &dim_sample); } /** diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index 933e18ba2fb2..7aaf32e9aa95 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -527,7 +527,7 @@ static void otx2_adjust_adaptive_coalese(struct otx2_nic *pfvf, struct otx2_cq_p rx_frames + tx_frames, rx_bytes + tx_bytes, &dim_sample); - net_dim(&cq_poll->dim, dim_sample); + net_dim(&cq_poll->dim, &dim_sample); } int otx2_napi_handler(struct napi_struct *napi, int budget) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index f01ceee5f02d..53485142938c 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -2227,7 +2227,7 @@ rx_done: eth->rx_bytes += bytes; dim_update_sample(eth->rx_events, eth->rx_packets, eth->rx_bytes, &dim_sample); - net_dim(ð->rx_dim, dim_sample); + net_dim(ð->rx_dim, &dim_sample); if (xdp_flush) xdp_do_flush(); @@ -2377,7 +2377,7 @@ static int mtk_poll_tx(struct mtk_eth *eth, int budget) dim_update_sample(eth->tx_events, eth->tx_packets, eth->tx_bytes, &dim_sample); - net_dim(ð->tx_dim, dim_sample); + net_dim(ð->tx_dim, &dim_sample); if (mtk_queue_stopped(eth) && (atomic_read(&ring->free_count) > ring->thresh)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c index 5873fde65c2e..417098f0b2bb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c @@ -55,7 +55,7 @@ static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq) return; dim_update_sample(sq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample); - net_dim(sq->dim, dim_sample); + net_dim(sq->dim, &dim_sample); } static void mlx5e_handle_rx_dim(struct mlx5e_rq *rq) @@ -67,7 +67,7 @@ static void mlx5e_handle_rx_dim(struct mlx5e_rq *rq) return; dim_update_sample(rq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample); - net_dim(rq->dim, dim_sample); + net_dim(rq->dim, &dim_sample); } void mlx5e_trigger_irq(struct mlx5e_icosq *sq) diff --git a/drivers/net/ethernet/netronome/nfp/nfd3/dp.c b/drivers/net/ethernet/netronome/nfp/nfd3/dp.c index d215efc6cad0..f1c6c47564b1 100644 --- a/drivers/net/ethernet/netronome/nfp/nfd3/dp.c +++ b/drivers/net/ethernet/netronome/nfp/nfd3/dp.c @@ -1179,7 +1179,7 @@ int nfp_nfd3_poll(struct napi_struct *napi, int budget) } while (u64_stats_fetch_retry(&r_vec->rx_sync, start)); dim_update_sample(r_vec->event_ctr, pkts, bytes, &dim_sample); - net_dim(&r_vec->rx_dim, dim_sample); + net_dim(&r_vec->rx_dim, &dim_sample); } if (r_vec->nfp_net->tx_coalesce_adapt_on && r_vec->tx_ring) { @@ -1194,7 +1194,7 @@ int nfp_nfd3_poll(struct napi_struct *napi, int budget) } while (u64_stats_fetch_retry(&r_vec->tx_sync, start)); dim_update_sample(r_vec->event_ctr, pkts, bytes, &dim_sample); - net_dim(&r_vec->tx_dim, dim_sample); + net_dim(&r_vec->tx_dim, &dim_sample); } return pkts_polled; diff --git a/drivers/net/ethernet/netronome/nfp/nfdk/dp.c b/drivers/net/ethernet/netronome/nfp/nfdk/dp.c index dae5af7d1845..ebeb6ab4465c 100644 --- a/drivers/net/ethernet/netronome/nfp/nfdk/dp.c +++ b/drivers/net/ethernet/netronome/nfp/nfdk/dp.c @@ -1289,7 +1289,7 @@ int nfp_nfdk_poll(struct napi_struct *napi, int budget) } while (u64_stats_fetch_retry(&r_vec->rx_sync, start)); dim_update_sample(r_vec->event_ctr, pkts, bytes, &dim_sample); - net_dim(&r_vec->rx_dim, dim_sample); + net_dim(&r_vec->rx_dim, &dim_sample); } if (r_vec->nfp_net->tx_coalesce_adapt_on && r_vec->tx_ring) { @@ -1304,7 +1304,7 @@ int nfp_nfdk_poll(struct napi_struct *napi, int budget) } while (u64_stats_fetch_retry(&r_vec->tx_sync, start)); dim_update_sample(r_vec->event_ctr, pkts, bytes, &dim_sample); - net_dim(&r_vec->tx_dim, dim_sample); + net_dim(&r_vec->tx_dim, &dim_sample); } return pkts_polled; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index 0eeda7e502db..2ac59564ded1 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -928,7 +928,7 @@ static void ionic_dim_update(struct ionic_qcq *qcq, int napi_mode) dim_update_sample(qcq->cq.bound_intr->rearm_count, pkts, bytes, &dim_sample); - net_dim(&qcq->dim, dim_sample); + net_dim(&qcq->dim, &dim_sample); } int ionic_tx_napi(struct napi_struct *napi, int budget) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 792e9eadbfc3..869586c17ffd 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2804,7 +2804,7 @@ static void virtnet_rx_dim_update(struct virtnet_info *vi, struct receive_queue u64_stats_read(&rq->stats.bytes), &cur_sample); - net_dim(&rq->dim, cur_sample); + net_dim(&rq->dim, &cur_sample); rq->packets_in_napi = 0; } diff --git a/drivers/soc/fsl/dpio/dpio-service.c b/drivers/soc/fsl/dpio/dpio-service.c index b811446e0fa5..0b60ed16297c 100644 --- a/drivers/soc/fsl/dpio/dpio-service.c +++ b/drivers/soc/fsl/dpio/dpio-service.c @@ -891,7 +891,7 @@ void dpaa2_io_update_net_dim(struct dpaa2_io *d, __u64 frames, __u64 bytes) d->frames += frames; dim_update_sample(d->event_ctr, d->frames, d->bytes, &dim_sample); - net_dim(&d->rx_dim, dim_sample); + net_dim(&d->rx_dim, &dim_sample); spin_unlock(&d->dim_lock); } diff --git a/include/linux/dim.h b/include/linux/dim.h index 84579a50ae7f..06543fd40fcc 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -425,7 +425,7 @@ struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode); * This is the main logic of the algorithm, where data is processed in order * to decide on next required action. */ -void net_dim(struct dim *dim, struct dim_sample end_sample); +void net_dim(struct dim *dim, const struct dim_sample *end_sample); /* RDMA DIM */ diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c index d7e7028e9b19..d6aa09a979b3 100644 --- a/lib/dim/net_dim.c +++ b/lib/dim/net_dim.c @@ -347,7 +347,7 @@ static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim) return dim->profile_ix != prev_ix; } -void net_dim(struct dim *dim, struct dim_sample end_sample) +void net_dim(struct dim *dim, const struct dim_sample *end_sample) { struct dim_stats curr_stats; u16 nevents; @@ -355,11 +355,11 @@ void net_dim(struct dim *dim, struct dim_sample end_sample) switch (dim->state) { case DIM_MEASURE_IN_PROGRESS: nevents = BIT_GAP(BITS_PER_TYPE(u16), - end_sample.event_ctr, + end_sample->event_ctr, dim->start_sample.event_ctr); if (nevents < DIM_NEVENTS) break; - if (!dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats)) + if (!dim_calc_stats(&dim->start_sample, end_sample, &curr_stats)) break; if (net_dim_decision(&curr_stats, dim)) { dim->state = DIM_APPLY_NEW_PROFILE; @@ -368,8 +368,8 @@ void net_dim(struct dim *dim, struct dim_sample end_sample) } fallthrough; case DIM_START_MEASURE: - dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr, - end_sample.byte_ctr, &dim->start_sample); + dim_update_sample(end_sample->event_ctr, end_sample->pkt_ctr, + end_sample->byte_ctr, &dim->start_sample); dim->state = DIM_MEASURE_IN_PROGRESS; break; case DIM_APPLY_NEW_PROFILE: -- cgit v1.2.3 From 8c7904a8cd0dfb061d078545c2d3c4acce1fcfeb Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 11 Sep 2024 14:27:58 +0000 Subject: maple_tree: i is always less than or equal to mas_end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "refine mas_mab_cp()". By analysis of the code, one condition check can be removed and one case would hit a redundant assignment. This patch (of 2): mas_mab_cp() copy range [mas_start, mas_end] inclusively from a maple_node to maple_big_node. This implies mas_start <= mas_end. Based on the relationship of mas_start and mas_end, we can have the following four cases: | mas_start == mas_end | mas_start < mas_end ---------------+----------------------+---------------------- mas_start == 0 | 1 | 2 ---------------+----------------------+---------------------- mas_start != 0 | 3 | 4 We can see in all these four cases, i is always less than or equal to mas_end after finish the loop: Case 1: After assign pivot 0, i is set to 1, which is bigger than mas_end 0. So it jumps to complete and skip the check. Case 2: After assign pivot 0, i is set to 1. ∵ (mas_start < mas_end) && (mas_start == 0) ==> (1 <= mas_end) ∵ (i == 1) && (1 <= mas_end) ==> (i <= mas_end) ∴ Before loop, we have (i <= mas_end). And we still hold this if it skips the loop. For example, (i == mas_end). Now let's see what happens in the loop: ∵ piv_end = min(mas_end, mt_pivots[mt]) ==> (piv_end <= mas_end) ∵ loop condition is (i < piv_end) ==> (i <= piv_end) on finish the loop both normally or break ∵ (i <= piv_end) && (piv_end <= mas_end) ==> (i <= mas_end) ∴ After loop, we still get (i <= mas_end) in this case Case 3: This case would skip both if clause and loop. So when it comes to the check, i is still mas_start which equals to mas_end. Case 4: This case would skip the if clause. ∵ (mas_start < mas_end) && (i == mas_start) ==> (i < mas_end) ∴ Before loop, we have (i < mas_end). The loop process is similar with Case 2, so we get the same result. Now we can conclude in all cases, we get (i <= mas_end) when doing check. Then it is not necessary to do the check. Link: https://lkml.kernel.org/r/20240911142759.20989-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20240911142759.20989-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3619301dda2e..55958cbcc3fa 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1949,8 +1949,7 @@ static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start, goto complete; } - if (likely(i <= mas_end)) - b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt); + b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt); complete: b_node->b_end = ++j; -- cgit v1.2.3 From 1c148069b240a3a65d1aee90c9d5c6997a747a7d Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 11 Sep 2024 14:27:59 +0000 Subject: maple_tree: goto complete directly on a pivot of 0 When we break the loop after assigning a pivot, the index i/j is not changed. Then the following code assign pivot, which means we do the assignment with same i/j by mas_safe_pivot. Since the loop condition is (i < piv_end), from which we can get i is less than mt_pivots[mt]. It implies mas_safe_pivot() return pivot[i] which is the same value we get in loop. Now we can conclude it does a redundant assignment on a pivot of 0. Let's just go to complete to avoid it. Link: https://lkml.kernel.org/r/20240911142759.20989-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 55958cbcc3fa..de883bfb97ef 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1943,7 +1943,7 @@ static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start, for (; i < piv_end; i++, j++) { b_node->pivot[j] = pivots[i]; if (unlikely(!b_node->pivot[j])) - break; + goto complete; if (unlikely(mas->max == b_node->pivot[j])) goto complete; -- cgit v1.2.3 From f36ba810816182953af74d176e0644e38979b723 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 8 Sep 2024 14:05:53 +0000 Subject: maple_tree: remove maple_big_node.parent Patch series "Reduce the space to be cleared for maple_big_node", v2. Found current code may clear maple_big_node redundantly. First we define a field parent, which is never used. After removing this, we reduce the size of memory to be cleared by memset. Then mast_fill_bnode() clears part of the structure twice, since slot and gap share some space. By clearing the whole structure, we can avoid this. This patch (of 2): The member parent of maple_big_node is never used. Let's remove it which could reduce the number of space to be cleared on memset. Link: https://lkml.kernel.org/r/20240908140554.20378-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20240908140554.20378-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 1 - 1 file changed, 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index de883bfb97ef..04cd5ce2a33c 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -120,7 +120,6 @@ static const unsigned char mt_min_slots[] = { #define MAPLE_BIG_NODE_GAPS (MAPLE_ARANGE64_SLOTS * 2 + 1) struct maple_big_node { - struct maple_pnode *parent; unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1]; union { struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS]; -- cgit v1.2.3 From 5059aa6334fcf4b7ddd672255aec5835aecd32b6 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 8 Sep 2024 14:05:54 +0000 Subject: maple_tree: memset maple_big_node as a whole In mast_fill_bnode(), we first clear some fields of maple_big_node and set the 'type' unconditionally before return. This means we won't leverage any information in maple_big_node and it is safe to clear the whole structure. In maple_big_node, we define slot and padding/gap in a union. And based on current definition of MAPLE_BIG_NODE_SLOTS/GAPS, padding is always less than slot and part of the gap is overlapped by slot. For example on 64bit system: MAPLE_BIG_NODE_SLOT is 34 MAPLE_BIG_NODE_GAP is 21 With this knowledge, current code may clear some space by twice. And this could be avoid by clearing the structure as a whole. Link: https://lkml.kernel.org/r/20240908140554.20378-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 04cd5ce2a33c..c5987244ff63 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3157,10 +3157,7 @@ static inline void mast_fill_bnode(struct maple_subtree_state *mast, bool cp = true; unsigned char split; - memset(mast->bn->gap, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->gap)); - memset(mast->bn->slot, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->slot)); - memset(mast->bn->pivot, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->pivot)); - mast->bn->b_end = 0; + memset(mast->bn, 0, sizeof(struct maple_big_node)); if (mte_is_root(mas->node)) { cp = false; -- cgit v1.2.3 From b42166427b46af0d963242283fc99d429623d303 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 6 Oct 2024 06:22:21 +0800 Subject: lib/Kconfig.debug: move int_pow test option to runtime testing section When executing 'make menuconfig' with KUNIT enabled, the int_pow test option appears on the first page of the main menu instead of under the runtime testing section. Relocate the int_pow test configuration to the appropriate runtime testing submenu, ensuring a more organized and logical structure in the menu configuration. Link: https://lkml.kernel.org/r/20241005222221.2154393-1-visitorckw@gmail.com Fixes: 7fcc9b53216c ("lib/math: Add int_pow test suite") Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: David Gow Cc: Luis Felipe Hernandez Cc: Shuah Khan Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7312ae7c3cc5..409dd193c09b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2993,6 +2993,22 @@ config TEST_OBJPOOL If unsure, say N. +config INT_POW_TEST + tristate "Integer exponentiation (int_pow) test" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This option enables the KUnit test suite for the int_pow function, + which performs integer exponentiation. The test suite is designed to + verify that the implementation of int_pow correctly computes the power + of a given base raised to a given exponent. + + Enabling this option will include tests that check various scenarios + and edge cases to ensure the accuracy and reliability of the exponentiation + function. + + If unsure, say N + endif # RUNTIME_TESTING_MENU config ARCH_USE_MEMTEST @@ -3088,19 +3104,3 @@ config RUST_KERNEL_DOCTESTS endmenu # "Rust" endmenu # Kernel hacking - -config INT_POW_TEST - tristate "Integer exponentiation (int_pow) test" if !KUNIT_ALL_TESTS - depends on KUNIT - default KUNIT_ALL_TESTS - help - This option enables the KUnit test suite for the int_pow function, - which performs integer exponentiation. The test suite is designed to - verify that the implementation of int_pow correctly computes the power - of a given base raised to a given exponent. - - Enabling this option will include tests that check various scenarios - and edge cases to ensure the accuracy and reliability of the exponentiation - function. - - If unsure, say N -- cgit v1.2.3 From 5a3c9366cbbf876521f570ce1fb525dc2cb0ed5c Mon Sep 17 00:00:00 2001 From: I Hsin Cheng Date: Tue, 8 Oct 2024 14:52:53 +0800 Subject: list: test: check the size of every lists for list_cut_position*() Check the total number of elements in both resultant lists are correct within list_cut_position*(). Previously, only the first list's size was checked. so additional elements in the second list would not have been caught. Link: https://lkml.kernel.org/r/20241008065253.26673-1-richard120310@gmail.com Signed-off-by: I Hsin Cheng Cc: David Gow Signed-off-by: Andrew Morton --- lib/list-test.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'lib') diff --git a/lib/list-test.c b/lib/list-test.c index e207c4c98d70..9135cdc1bb39 100644 --- a/lib/list-test.c +++ b/lib/list-test.c @@ -412,6 +412,8 @@ static void list_test_list_cut_position(struct kunit *test) KUNIT_EXPECT_PTR_EQ(test, cur, &entries[i]); i++; } + + KUNIT_EXPECT_EQ(test, i, 3); } static void list_test_list_cut_before(struct kunit *test) @@ -440,6 +442,8 @@ static void list_test_list_cut_before(struct kunit *test) KUNIT_EXPECT_PTR_EQ(test, cur, &entries[i]); i++; } + + KUNIT_EXPECT_EQ(test, i, 3); } static void list_test_list_splice(struct kunit *test) -- cgit v1.2.3 From 5d042707089f0d0c49473d05250e4f319a71e1df Mon Sep 17 00:00:00 2001 From: Vinicius Peixoto Date: Sat, 12 Oct 2024 04:43:49 -0300 Subject: lib/crc16_kunit.c: add KUnit tests for crc16 Add Kunit tests for the kernel's implementation of the standard CRC-16 algorithm (). The test data consists of 100 randomly-generated test cases, validated against a naive CRC-16 implementation. This test follows roughly the same logic as lib/crc32test.c, but without the performance measurements. Link: https://lkml.kernel.org/r/20241012-crc16-kunit-v3-1-0ca75cb58ca9@lkcamp.dev Signed-off-by: Vinicius Peixoto Co-developed-by: Enzo Bertoloti Signed-off-by: Enzo Bertoloti Co-developed-by: Fabricio Gasperin Signed-off-by: Fabricio Gasperin Suggested-by: David Laight Cc: Brendan Higgins Cc: David Gow Cc: Rae Moar Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 9 ++++ lib/Makefile | 1 + lib/crc16_kunit.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 165 insertions(+) create mode 100644 lib/crc16_kunit.c (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 409dd193c09b..eda319e9d569 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2850,6 +2850,15 @@ config USERCOPY_KUNIT_TEST on the copy_to/from_user infrastructure, making sure basic user/kernel boundary testing is working. +config CRC16_KUNIT_TEST + tristate "KUnit tests for CRC16" + depends on KUNIT + default KUNIT_ALL_TESTS + select CRC16 + help + Enable this option to run unit tests for the kernel's CRC16 + implementation (). + config TEST_UDELAY tristate "udelay test driver" help diff --git a/lib/Makefile b/lib/Makefile index 773adf88af41..1faed6414a85 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -389,6 +389,7 @@ CFLAGS_fortify_kunit.o += $(DISABLE_STRUCTLEAK_PLUGIN) obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fortify_kunit.o obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o obj-$(CONFIG_USERCOPY_KUNIT_TEST) += usercopy_kunit.o +obj-$(CONFIG_CRC16_KUNIT_TEST) += crc16_kunit.o obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o diff --git a/lib/crc16_kunit.c b/lib/crc16_kunit.c new file mode 100644 index 000000000000..0918c98a96d2 --- /dev/null +++ b/lib/crc16_kunit.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnits tests for CRC16. + * + * Copyright (C) 2024, LKCAMP + * Author: Vinicius Peixoto + * Author: Fabricio Gasperin + * Author: Enzo Bertoloti + */ +#include +#include +#include + +#define CRC16_KUNIT_DATA_SIZE 4096 +#define CRC16_KUNIT_TEST_SIZE 100 +#define CRC16_KUNIT_SEED 0x12345678 + +/** + * struct crc16_test - CRC16 test data + * @crc: initial input value to CRC16 + * @start: Start index within the data buffer + * @length: Length of the data + */ +static struct crc16_test { + u16 crc; + u16 start; + u16 length; +} tests[CRC16_KUNIT_TEST_SIZE]; + +u8 data[CRC16_KUNIT_DATA_SIZE]; + + +/* Naive implementation of CRC16 for validation purposes */ +static inline u16 _crc16_naive_byte(u16 crc, u8 data) +{ + u8 i = 0; + + crc ^= (u16) data; + for (i = 0; i < 8; i++) { + if (crc & 0x01) + crc = (crc >> 1) ^ 0xa001; + else + crc = crc >> 1; + } + + return crc; +} + + +static inline u16 _crc16_naive(u16 crc, u8 *buffer, size_t len) +{ + while (len--) + crc = _crc16_naive_byte(crc, *buffer++); + return crc; +} + + +/* Small helper for generating pseudorandom 16-bit data */ +static inline u16 _rand16(void) +{ + static u32 rand = CRC16_KUNIT_SEED; + + rand = next_pseudo_random32(rand); + return rand & 0xFFFF; +} + + +static int crc16_init_test_data(struct kunit_suite *suite) +{ + size_t i; + + /* Fill the data buffer with random bytes */ + for (i = 0; i < CRC16_KUNIT_DATA_SIZE; i++) + data[i] = _rand16() & 0xFF; + + /* Generate random test data while ensuring the random + * start + length values won't overflow the 4096-byte + * buffer (0x7FF * 2 = 0xFFE < 0x1000) + */ + for (size_t i = 0; i < CRC16_KUNIT_TEST_SIZE; i++) { + tests[i].crc = _rand16(); + tests[i].start = _rand16() & 0x7FF; + tests[i].length = _rand16() & 0x7FF; + } + + return 0; +} + +static void crc16_test_empty(struct kunit *test) +{ + u16 crc; + + /* The result for empty data should be the same as the + * initial crc + */ + crc = crc16(0x00, data, 0); + KUNIT_EXPECT_EQ(test, crc, 0); + crc = crc16(0xFF, data, 0); + KUNIT_EXPECT_EQ(test, crc, 0xFF); +} + +static void crc16_test_correctness(struct kunit *test) +{ + size_t i; + u16 crc, crc_naive; + + for (i = 0; i < CRC16_KUNIT_TEST_SIZE; i++) { + /* Compare results with the naive crc16 implementation */ + crc = crc16(tests[i].crc, data + tests[i].start, + tests[i].length); + crc_naive = _crc16_naive(tests[i].crc, data + tests[i].start, + tests[i].length); + KUNIT_EXPECT_EQ(test, crc, crc_naive); + } +} + + +static void crc16_test_combine(struct kunit *test) +{ + size_t i, j; + u16 crc, crc_naive; + + /* Make sure that combining two consecutive crc16 calculations + * yields the same result as calculating the crc16 for the whole thing + */ + for (i = 0; i < CRC16_KUNIT_TEST_SIZE; i++) { + crc_naive = crc16(tests[i].crc, data + tests[i].start, tests[i].length); + for (j = 0; j < tests[i].length; j++) { + crc = crc16(tests[i].crc, data + tests[i].start, j); + crc = crc16(crc, data + tests[i].start + j, tests[i].length - j); + KUNIT_EXPECT_EQ(test, crc, crc_naive); + } + } +} + + +static struct kunit_case crc16_test_cases[] = { + KUNIT_CASE(crc16_test_empty), + KUNIT_CASE(crc16_test_combine), + KUNIT_CASE(crc16_test_correctness), + {}, +}; + +static struct kunit_suite crc16_test_suite = { + .name = "crc16", + .test_cases = crc16_test_cases, + .suite_init = crc16_init_test_data, +}; +kunit_test_suite(crc16_test_suite); + +MODULE_AUTHOR("Fabricio Gasperin "); +MODULE_AUTHOR("Vinicius Peixoto "); +MODULE_AUTHOR("Enzo Bertoloti "); +MODULE_DESCRIPTION("Unit tests for crc16"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From bf9850f6ea3577a099b0ed43f6e19ca43ef08704 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 11 Oct 2024 22:12:14 +0800 Subject: lib/Makefile: make union-find compilation conditional on CONFIG_CPUSETS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, cpuset is the only user of the union-find implementation. Compiling union-find in all configurations unnecessarily increases the code size when building the kernel without cgroup support. Modify the build system to compile union-find only when CONFIG_CPUSETS is enabled. Link: https://lore.kernel.org/lkml/1ccd6411-5002-4574-bb8e-3e64bba6a757@redhat.com/ Link: https://lkml.kernel.org/r/20241011141214.87096-1-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Suggested-by: Waiman Long Acked-by: Waiman Long Acked-by: Tejun Heo Reviewed-by: Christoph Hellwig Cc: Ching-Chun (Jim) Huang Cc: Johannes Weiner Cc: Michal Koutný Cc: Xavier Cc: Zefan Li Signed-off-by: Andrew Morton --- init/Kconfig | 1 + lib/Kconfig | 3 +++ lib/Makefile | 3 ++- 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/init/Kconfig b/init/Kconfig index c521e1421ad4..5cb9b9490f30 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1157,6 +1157,7 @@ config CGROUP_HUGETLB config CPUSETS bool "Cpuset controller" depends on SMP + select UNION_FIND help This option will let you create and manage CPUSETs which allow dynamically partitioning a system into sets of CPUs and diff --git a/lib/Kconfig b/lib/Kconfig index b38849af6f13..cf303bd91dda 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -777,3 +777,6 @@ config POLYNOMIAL config FIRMWARE_TABLE bool + +config UNION_FIND + bool diff --git a/lib/Makefile b/lib/Makefile index 1faed6414a85..feebed74fc7a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -35,8 +35,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ nmi_backtrace.o win_minmax.o memcat_p.o \ - buildid.o objpool.o union_find.o + buildid.o objpool.o +lib-$(CONFIG_UNION_FIND) += union_find.o lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_SMP) += cpumask.o -- cgit v1.2.3 From 908ef9bb4bd36837c3619109bdcf58f6ab00bfc7 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sat, 12 Oct 2024 12:28:26 +0800 Subject: lib/list_sort: remove unnecessary header includes Patch series "Remove unnecessary header includes from {tools/}lib/list_sort.c". Remove outdated and unnecessary header includes from lib/list_sort.c and tools/lib/list_sort.c. Additionally, update the hunk exceptions checked by check_headers.sh to reflect these changes. This patch (of 3): After commit 043b3f7b6388 ("lib/list_sort: simplify and remove MAX_LIST_LENGTH_BITS"), list_sort.c no longer uses ARRAY_SIZE() (which required kernel.h and bug.h for BUILD_BUG_ON_ZERO via __must_be_array) or memset() (which required string.h). As these headers are no longer needed, removes them. There are no changes to the generated code, as confirmed by 'objdump -d'. Additionally, 'wc -l' shows that the size of lib/.list_sort.o.cmd is reduced from 259 lines to 101 lines. Link: https://lkml.kernel.org/r/20241012042828.471614-1-visitorckw@gmail.com Link: https://lkml.kernel.org/r/20241012042828.471614-2-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: "Liang, Kan" Cc: Mark Rutland Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- lib/list_sort.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'lib') diff --git a/lib/list_sort.c b/lib/list_sort.c index 0fb59e92ca2d..8d3f623536fe 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include #include #include -#include #include #include -- cgit v1.2.3 From 92a8b224b833e82d286d2100432adbac8cf8a2a1 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:01:51 +0800 Subject: lib/min_heap: introduce non-inline versions of min heap API functions Patch series "Enhance min heap API with non-inline functions and optimizations", v2. Add non-inline versions of the min heap API functions in lib/min_heap.c and updates all users outside of kernel/events/core.c to use these non-inline versions. To mitigate the performance impact of indirect function calls caused by the non-inline versions of the swap and compare functions, a builtin swap has been introduced that swaps elements based on their size. Additionally, it micro-optimizes the efficiency of the min heap by pre-scaling the counter, following the same approach as in lib/sort.c. Documentation for the min heap API has also been added to the core-api section. This patch (of 10): All current min heap API functions are marked with '__always_inline'. However, as the number of users increases, inlining these functions everywhere leads to a increase in kernel size. In performance-critical paths, such as when perf events are enabled and min heap functions are called on every context switch, it is important to retain the inline versions for optimal performance. To balance this, the original inline functions are kept, and additional non-inline versions of the functions have been added in lib/min_heap.c. Link: https://lkml.kernel.org/r/20241020040200.939973-1-visitorckw@gmail.com Link: https://lore.kernel.org/20240522161048.8d8bbc7b153b4ecd92c50666@linux-foundation.org Link: https://lkml.kernel.org/r/20241020040200.939973-2-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Suggested-by: Andrew Morton Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: Kuan-Wei Chiu Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- drivers/md/bcache/Kconfig | 1 + drivers/md/dm-vdo/Kconfig | 1 + fs/bcachefs/Kconfig | 1 + include/linux/min_heap.h | 129 ++++++++++++++++++++++++++++++---------------- kernel/events/core.c | 6 +-- lib/Kconfig | 3 ++ lib/Kconfig.debug | 1 + lib/Makefile | 1 + lib/min_heap.c | 70 +++++++++++++++++++++++++ 9 files changed, 167 insertions(+), 46 deletions(-) create mode 100644 lib/min_heap.c (limited to 'lib') diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index b2d10063d35f..d4697e79d5a3 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -5,6 +5,7 @@ config BCACHE select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 select CLOSURES + select MIN_HEAP help Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. diff --git a/drivers/md/dm-vdo/Kconfig b/drivers/md/dm-vdo/Kconfig index 111ecd2c2a24..2400b2bc4bc7 100644 --- a/drivers/md/dm-vdo/Kconfig +++ b/drivers/md/dm-vdo/Kconfig @@ -7,6 +7,7 @@ config DM_VDO select DM_BUFIO select LZ4_COMPRESS select LZ4_DECOMPRESS + select MIN_HEAP help This device mapper target presents a block device with deduplication, compression and thin-provisioning. diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 5bac803ea367..ab6c95b895b3 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -24,6 +24,7 @@ config BCACHEFS_FS select XXHASH select SRCU select SYMBOLIC_ERRNAME + select MIN_HEAP help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 43a7b9dcf15e..0abb21173979 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -40,7 +40,7 @@ struct min_heap_callbacks { /* Initialize a min-heap. */ static __always_inline -void __min_heap_init(min_heap_char *heap, void *data, int size) +void __min_heap_init_inline(min_heap_char *heap, void *data, int size) { heap->nr = 0; heap->size = size; @@ -50,33 +50,33 @@ void __min_heap_init(min_heap_char *heap, void *data, int size) heap->data = heap->preallocated; } -#define min_heap_init(_heap, _data, _size) \ - __min_heap_init((min_heap_char *)_heap, _data, _size) +#define min_heap_init_inline(_heap, _data, _size) \ + __min_heap_init_inline((min_heap_char *)_heap, _data, _size) /* Get the minimum element from the heap. */ static __always_inline -void *__min_heap_peek(struct min_heap_char *heap) +void *__min_heap_peek_inline(struct min_heap_char *heap) { return heap->nr ? heap->data : NULL; } -#define min_heap_peek(_heap) \ - (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) +#define min_heap_peek_inline(_heap) \ + (__minheap_cast(_heap) __min_heap_peek_inline((min_heap_char *)_heap)) /* Check if the heap is full. */ static __always_inline -bool __min_heap_full(min_heap_char *heap) +bool __min_heap_full_inline(min_heap_char *heap) { return heap->nr == heap->size; } -#define min_heap_full(_heap) \ - __min_heap_full((min_heap_char *)_heap) +#define min_heap_full_inline(_heap) \ + __min_heap_full_inline((min_heap_char *)_heap) /* Sift the element at pos down the heap. */ static __always_inline -void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *left, *right; void *data = heap->data; @@ -108,13 +108,14 @@ void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, } } -#define min_heap_sift_down(_heap, _pos, _func, _args) \ - __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) +#define min_heap_sift_down_inline(_heap, _pos, _func, _args) \ + __min_heap_sift_down_inline((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), \ + _func, _args) /* Sift up ith element from the heap, O(log2(nr)). */ static __always_inline -void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, - const struct min_heap_callbacks *func, void *args) +void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; size_t parent; @@ -128,27 +129,28 @@ void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, } } -#define min_heap_sift_up(_heap, _idx, _func, _args) \ - __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) +#define min_heap_sift_up_inline(_heap, _idx, _func, _args) \ + __min_heap_sift_up_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ + _func, _args) /* Floyd's approach to heapification that is O(nr). */ static __always_inline -void __min_heapify_all(min_heap_char *heap, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { int i; for (i = heap->nr / 2 - 1; i >= 0; i--) - __min_heap_sift_down(heap, i, elem_size, func, args); + __min_heap_sift_down_inline(heap, i, elem_size, func, args); } -#define min_heapify_all(_heap, _func, _args) \ - __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heapify_all_inline(_heap, _func, _args) \ + __min_heapify_all_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) /* Remove minimum element from the heap, O(log2(nr)). */ static __always_inline -bool __min_heap_pop(min_heap_char *heap, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +bool __min_heap_pop_inline(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; @@ -158,13 +160,13 @@ bool __min_heap_pop(min_heap_char *heap, size_t elem_size, /* Place last element at the root (position 0) and then sift down. */ heap->nr--; memcpy(data, data + (heap->nr * elem_size), elem_size); - __min_heap_sift_down(heap, 0, elem_size, func, args); + __min_heap_sift_down_inline(heap, 0, elem_size, func, args); return true; } -#define min_heap_pop(_heap, _func, _args) \ - __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop_inline(_heap, _func, _args) \ + __min_heap_pop_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) /* * Remove the minimum element and then push the given element. The @@ -172,22 +174,21 @@ bool __min_heap_pop(min_heap_char *heap, size_t elem_size, * efficient than a pop followed by a push that does 2. */ static __always_inline -void __min_heap_pop_push(min_heap_char *heap, - const void *element, size_t elem_size, - const struct min_heap_callbacks *func, - void *args) +void __min_heap_pop_push_inline(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { memcpy(heap->data, element, elem_size); - __min_heap_sift_down(heap, 0, elem_size, func, args); + __min_heap_sift_down_inline(heap, 0, elem_size, func, args); } -#define min_heap_pop_push(_heap, _element, _func, _args) \ - __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop_push_inline(_heap, _element, _func, _args) \ + __min_heap_pop_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ + _func, _args) /* Push an element on to the heap, O(log2(nr)). */ static __always_inline -bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; int pos; @@ -201,18 +202,19 @@ bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, heap->nr++; /* Sift child at pos up. */ - __min_heap_sift_up(heap, elem_size, pos, func, args); + __min_heap_sift_up_inline(heap, elem_size, pos, func, args); return true; } -#define min_heap_push(_heap, _element, _func, _args) \ - __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) +#define min_heap_push_inline(_heap, _element, _func, _args) \ + __min_heap_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ + _func, _args) /* Remove ith element from the heap, O(log2(nr)). */ static __always_inline -bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, - const struct min_heap_callbacks *func, void *args) +bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; @@ -224,12 +226,53 @@ bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, if (idx == heap->nr) return true; func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args); - __min_heap_sift_up(heap, elem_size, idx, func, args); - __min_heap_sift_down(heap, idx, elem_size, func, args); + __min_heap_sift_up_inline(heap, elem_size, idx, func, args); + __min_heap_sift_down_inline(heap, idx, elem_size, func, args); return true; } +#define min_heap_del_inline(_heap, _idx, _func, _args) \ + __min_heap_del_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ + _func, _args) + +void __min_heap_init(min_heap_char *heap, void *data, int size); +void *__min_heap_peek(struct min_heap_char *heap); +bool __min_heap_full(min_heap_char *heap); +void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args); +void __min_heapify_all(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __min_heap_pop(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +void __min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args); + +#define min_heap_init(_heap, _data, _size) \ + __min_heap_init((min_heap_char *)_heap, _data, _size) +#define min_heap_peek(_heap) \ + (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) +#define min_heap_full(_heap) \ + __min_heap_full((min_heap_char *)_heap) +#define min_heap_sift_down(_heap, _pos, _func, _args) \ + __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) +#define min_heap_sift_up(_heap, _idx, _func, _args) \ + __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) +#define min_heapify_all(_heap, _func, _args) \ + __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop(_heap, _func, _args) \ + __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop_push(_heap, _element, _func, _args) \ + __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ + _func, _args) +#define min_heap_push(_heap, _element, _func, _args) \ + __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) #define min_heap_del(_heap, _idx, _func, _args) \ __min_heap_del((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) diff --git a/kernel/events/core.c b/kernel/events/core.c index df27d08a7232..1b3c1198b2af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3870,7 +3870,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx, perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); } - min_heapify_all(&event_heap, &perf_min_heap, NULL); + min_heapify_all_inline(&event_heap, &perf_min_heap, NULL); while (event_heap.nr) { ret = func(*evt, data); @@ -3879,9 +3879,9 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx, *evt = perf_event_groups_next(*evt, pmu); if (*evt) - min_heap_sift_down(&event_heap, 0, &perf_min_heap, NULL); + min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL); else - min_heap_pop(&event_heap, &perf_min_heap, NULL); + min_heap_pop_inline(&event_heap, &perf_min_heap, NULL); } return 0; diff --git a/lib/Kconfig b/lib/Kconfig index cf303bd91dda..f5a2781669ea 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -780,3 +780,6 @@ config FIRMWARE_TABLE config UNION_FIND bool + +config MIN_HEAP + bool diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index eda319e9d569..2549b64b2280 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2279,6 +2279,7 @@ config TEST_LIST_SORT config TEST_MIN_HEAP tristate "Min heap test" depends on DEBUG_KERNEL || m + select MIN_HEAP help Enable this to turn on min heap function tests. This test is executed only once during system boot (so affects only boot time), diff --git a/lib/Makefile b/lib/Makefile index feebed74fc7a..1eb89962daef 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -40,6 +40,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ lib-$(CONFIG_UNION_FIND) += union_find.o lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_SMP) += cpumask.o +lib-$(CONFIG_MIN_HEAP) += min_heap.o lib-y += kobject.o klist.o obj-y += lockref.o diff --git a/lib/min_heap.c b/lib/min_heap.c new file mode 100644 index 000000000000..4485372ff3b1 --- /dev/null +++ b/lib/min_heap.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +void __min_heap_init(min_heap_char *heap, void *data, int size) +{ + __min_heap_init_inline(heap, data, size); +} +EXPORT_SYMBOL(__min_heap_init); + +void *__min_heap_peek(struct min_heap_char *heap) +{ + return __min_heap_peek_inline(heap); +} +EXPORT_SYMBOL(__min_heap_peek); + +bool __min_heap_full(min_heap_char *heap) +{ + return __min_heap_full_inline(heap); +} +EXPORT_SYMBOL(__min_heap_full); + +void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + __min_heap_sift_down_inline(heap, pos, elem_size, func, args); +} +EXPORT_SYMBOL(__min_heap_sift_down); + +void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + __min_heap_sift_up_inline(heap, elem_size, idx, func, args); +} +EXPORT_SYMBOL(__min_heap_sift_up); + +void __min_heapify_all(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + __min_heapify_all_inline(heap, elem_size, func, args); +} +EXPORT_SYMBOL(__min_heapify_all); + +bool __min_heap_pop(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + return __min_heap_pop_inline(heap, elem_size, func, args); +} +EXPORT_SYMBOL(__min_heap_pop); + +void __min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + __min_heap_pop_push_inline(heap, element, elem_size, func, args); +} +EXPORT_SYMBOL(__min_heap_pop_push); + +bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + return __min_heap_push_inline(heap, element, elem_size, func, args); +} +EXPORT_SYMBOL(__min_heap_push); + +bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + return __min_heap_del_inline(heap, elem_size, idx, func, args); +} +EXPORT_SYMBOL(__min_heap_del); -- cgit v1.2.3 From d559bb2c6deea1fb4650b8a784b27e87ea12f71d Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:01:54 +0800 Subject: lib/test_min_heap: update min_heap_callbacks to use default builtin swap Replace the swp function pointer in the min_heap_callbacks of test_min_heap with NULL, allowing direct usage of the default builtin swap implementation. This modification simplifies the code and improves performance by removing unnecessary function indirection. Link: https://lkml.kernel.org/r/20241020040200.939973-5-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- lib/test_min_heap.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/test_min_heap.c b/lib/test_min_heap.c index 64c877e73b64..e6fbb798558b 100644 --- a/lib/test_min_heap.c +++ b/lib/test_min_heap.c @@ -23,14 +23,6 @@ static __init bool greater_than(const void *lhs, const void *rhs, void __always_ return *(int *)lhs > *(int *)rhs; } -static __init void swap_ints(void *lhs, void *rhs, void __always_unused *args) -{ - int temp = *(int *)lhs; - - *(int *)lhs = *(int *)rhs; - *(int *)rhs = temp; -} - static __init int pop_verify_heap(bool min_heap, struct min_heap_test *heap, const struct min_heap_callbacks *funcs) @@ -72,7 +64,7 @@ static __init int test_heapify_all(bool min_heap) }; struct min_heap_callbacks funcs = { .less = min_heap ? less_than : greater_than, - .swp = swap_ints, + .swp = NULL, }; int i, err; @@ -104,7 +96,7 @@ static __init int test_heap_push(bool min_heap) }; struct min_heap_callbacks funcs = { .less = min_heap ? less_than : greater_than, - .swp = swap_ints, + .swp = NULL, }; int i, temp, err; @@ -136,7 +128,7 @@ static __init int test_heap_pop_push(bool min_heap) }; struct min_heap_callbacks funcs = { .less = min_heap ? less_than : greater_than, - .swp = swap_ints, + .swp = NULL, }; int i, temp, err; @@ -175,7 +167,7 @@ static __init int test_heap_del(bool min_heap) heap.nr = ARRAY_SIZE(values); struct min_heap_callbacks funcs = { .less = min_heap ? less_than : greater_than, - .swp = swap_ints, + .swp = NULL, }; int i, err; -- cgit v1.2.3 From e01caa2b63c88c64ee90b83a92a584ec90feeda5 Mon Sep 17 00:00:00 2001 From: Sui Jingfeng Date: Tue, 29 Oct 2024 02:29:20 +0800 Subject: lib/scatterlist: use sg_phys() helper This shorten the length of code in horizential direction, therefore is easier to read. Link: https://lkml.kernel.org/r/20241028182920.1025819-1-sui.jingfeng@linux.dev Signed-off-by: Sui Jingfeng Signed-off-by: Andrew Morton --- lib/scatterlist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 473b2646f71c..5bb6b8aff232 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -474,14 +474,14 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append, return -EOPNOTSUPP; if (sgt_append->prv) { - unsigned long next_pfn = (page_to_phys(sg_page(sgt_append->prv)) + - sgt_append->prv->offset + sgt_append->prv->length) / PAGE_SIZE; + unsigned long next_pfn; if (WARN_ON(offset)) return -EINVAL; /* Merge contiguous pages into the last SG */ prv_len = sgt_append->prv->length; + next_pfn = (sg_phys(sgt_append->prv) + prv_len) / PAGE_SIZE; if (page_to_pfn(pages[0]) == next_pfn) { last_pg = pfn_to_page(next_pfn - 1); while (n_pages && pages_are_mergeable(pages[0], last_pg)) { -- cgit v1.2.3 From b314e21596a48d21a88b8c6a98ecfea8d7b2d2a1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 7 Oct 2024 12:53:35 +0100 Subject: maple_tree: do not hash pointers on dump in debug mode Many maple tree values output when an mt_validate() or equivalent hits an issue utilise tagged pointers, most notably parent nodes. Also some pivots/slots contain meaningful values, output as pointers, such as the index of the last entry with data for example. All pointer values such as this are destroyed by kernel pointer hashing rendering the debug output obtained from CONFIG_DEBUG_VM_MAPLE_TREE considerably less usable. Update this code to output the raw pointers using %px rather than %p when CONFIG_DEBUG_VM_MAPLE_TREE is defined. This is justified, as the use of this configuration flag indicates that this is a test environment. Userland does not understand %px, so use %p there. In an abundance of caution, if CONFIG_DEBUG_VM_MAPLE_TREE is not set, also use %p to avoid exposing raw kernel pointers except when we are positive a testing mode is enabled. This was inspired by the investigation performed in recent debugging efforts around a maple tree regression [0] where kernel pointer tagging had to be disabled in order to obtain truly meaningful and useful data. [0]:https://lore.kernel.org/all/20241001023402.3374-1-spasswolf@web.de/ Link: https://lkml.kernel.org/r/20241007115335.90104-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- lib/maple_tree.c | 100 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 41 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index c5987244ff63..b3b1d4b8126b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -64,6 +64,21 @@ #define CREATE_TRACE_POINTS #include +/* + * Kernel pointer hashing renders much of the maple tree dump useless as tagged + * pointers get hashed to arbitrary values. + * + * If CONFIG_DEBUG_VM_MAPLE_TREE is set we are in a debug mode where it is + * permissible to bypass this. Otherwise remain cautious and retain the hashing. + * + * Userland doesn't know about %px so also use %p there. + */ +#if defined(__KERNEL__) && defined(CONFIG_DEBUG_VM_MAPLE_TREE) +#define PTR_FMT "%px" +#else +#define PTR_FMT "%p" +#endif + #define MA_ROOT_PARENT 1 /* @@ -5414,7 +5429,8 @@ void *mas_store(struct ma_state *mas, void *entry) trace_ma_write(__func__, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE if (MAS_WARN_ON(mas, mas->index > mas->last)) - pr_err("Error %lX > %lX %p\n", mas->index, mas->last, entry); + pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last, + entry); if (mas->index > mas->last) { mas_set_err(mas, -EINVAL); @@ -7119,14 +7135,14 @@ static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, mt_dump_range(min, max, depth, format); if (xa_is_value(entry)) - pr_cont("value %ld (0x%lx) [%p]\n", xa_to_value(entry), - xa_to_value(entry), entry); + pr_cont("value %ld (0x%lx) [" PTR_FMT "]\n", xa_to_value(entry), + xa_to_value(entry), entry); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else if (mt_is_reserved(entry)) - pr_cont("UNKNOWN ENTRY (%p)\n", entry); + pr_cont("UNKNOWN ENTRY (" PTR_FMT ")\n", entry); else - pr_cont("%p\n", entry); + pr_cont(PTR_FMT "\n", entry); } static void mt_dump_range64(const struct maple_tree *mt, void *entry, @@ -7142,13 +7158,13 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) { switch(format) { case mt_dump_hex: - pr_cont("%p %lX ", node->slot[i], node->pivot[i]); + pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: - pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } - pr_cont("%p\n", node->slot[i]); + pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { unsigned long last = max; @@ -7170,11 +7186,11 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, if (last > max) { switch(format) { case mt_dump_hex: - pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n", + pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: - pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } @@ -7204,13 +7220,13 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) { switch (format) { case mt_dump_hex: - pr_cont("%p %lX ", node->slot[i], node->pivot[i]); + pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: - pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } - pr_cont("%p\n", node->slot[i]); + pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { unsigned long last = max; @@ -7229,11 +7245,11 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, if (last > max) { switch(format) { case mt_dump_hex: - pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n", + pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: - pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } @@ -7251,8 +7267,8 @@ static void mt_dump_node(const struct maple_tree *mt, void *entry, mt_dump_range(min, max, depth, format); - pr_cont("node %p depth %d type %d parent %p", node, depth, type, - node ? node->parent : NULL); + pr_cont("node " PTR_FMT " depth %d type %d parent " PTR_FMT, node, + depth, type, node ? node->parent : NULL); switch (type) { case maple_dense: pr_cont("\n"); @@ -7280,7 +7296,7 @@ void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) { void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt)); - pr_info("maple_tree(%p) flags %X, height %u root %p\n", + pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n", mt, mt->ma_flags, mt_height(mt), entry); if (!xa_is_node(entry)) mt_dump_entry(entry, 0, 0, 0, format); @@ -7332,7 +7348,7 @@ static void mas_validate_gaps(struct ma_state *mas) MT_BUG_ON(mas->tree, !entry); if (gap > p_end - p_start + 1) { - pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n", + pr_err(PTR_FMT "[%u] %lu >= %lu - %lu + 1 (%lu)\n", mas_mn(mas), i, gap, p_end, p_start, p_end - p_start + 1); MT_BUG_ON(mas->tree, gap > p_end - p_start + 1); @@ -7352,19 +7368,19 @@ counted: MT_BUG_ON(mas->tree, !gaps); offset = ma_meta_gap(node); if (offset > i) { - pr_err("gap offset %p[%u] is invalid\n", node, offset); + pr_err("gap offset " PTR_FMT "[%u] is invalid\n", node, offset); MT_BUG_ON(mas->tree, 1); } if (gaps[offset] != max_gap) { - pr_err("gap %p[%u] is not the largest gap %lu\n", + pr_err("gap " PTR_FMT "[%u] is not the largest gap %lu\n", node, offset, max_gap); MT_BUG_ON(mas->tree, 1); } for (i++ ; i < mt_slot_count(mte); i++) { if (gaps[i] != 0) { - pr_err("gap %p[%u] beyond node limit != 0\n", + pr_err("gap " PTR_FMT "[%u] beyond node limit != 0\n", node, i); MT_BUG_ON(mas->tree, 1); } @@ -7378,7 +7394,7 @@ counted: p_mn = mte_parent(mte); MT_BUG_ON(mas->tree, max_gap > mas->max); if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { - pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap); + pr_err("gap " PTR_FMT "[%u] != %lu\n", p_mn, p_slot, max_gap); mt_dump(mas->tree, mt_dump_hex); MT_BUG_ON(mas->tree, 1); } @@ -7408,11 +7424,11 @@ static void mas_validate_parent_slot(struct ma_state *mas) node = mas_slot(mas, slots, i); if (i == p_slot) { if (node != mas->node) - pr_err("parent %p[%u] does not have %p\n", + pr_err("parent " PTR_FMT "[%u] does not have " PTR_FMT "\n", parent, i, mas_mn(mas)); MT_BUG_ON(mas->tree, node != mas->node); } else if (node == mas->node) { - pr_err("Invalid child %p at parent %p[%u] p_slot %u\n", + pr_err("Invalid child " PTR_FMT " at parent " PTR_FMT "[%u] p_slot %u\n", mas_mn(mas), parent, i, p_slot); MT_BUG_ON(mas->tree, node == mas->node); } @@ -7434,20 +7450,20 @@ static void mas_validate_child_slot(struct ma_state *mas) child = mas_slot(mas, slots, i); if (!child) { - pr_err("Non-leaf node lacks child at %p[%u]\n", + pr_err("Non-leaf node lacks child at " PTR_FMT "[%u]\n", mas_mn(mas), i); MT_BUG_ON(mas->tree, 1); } if (mte_parent_slot(child) != i) { - pr_err("Slot error at %p[%u]: child %p has pslot %u\n", + pr_err("Slot error at " PTR_FMT "[%u]: child " PTR_FMT " has pslot %u\n", mas_mn(mas), i, mte_to_node(child), mte_parent_slot(child)); MT_BUG_ON(mas->tree, 1); } if (mte_parent(child) != mte_to_node(mas->node)) { - pr_err("child %p has parent %p not %p\n", + pr_err("child " PTR_FMT " has parent " PTR_FMT " not " PTR_FMT "\n", mte_to_node(child), mte_parent(child), mte_to_node(mas->node)); MT_BUG_ON(mas->tree, 1); @@ -7477,24 +7493,24 @@ static void mas_validate_limits(struct ma_state *mas) piv = mas_safe_pivot(mas, pivots, i, type); if (!piv && (i != 0)) { - pr_err("Missing node limit pivot at %p[%u]", + pr_err("Missing node limit pivot at " PTR_FMT "[%u]", mas_mn(mas), i); MAS_WARN_ON(mas, 1); } if (prev_piv > piv) { - pr_err("%p[%u] piv %lu < prev_piv %lu\n", + pr_err(PTR_FMT "[%u] piv %lu < prev_piv %lu\n", mas_mn(mas), i, piv, prev_piv); MAS_WARN_ON(mas, piv < prev_piv); } if (piv < mas->min) { - pr_err("%p[%u] %lu < %lu\n", mas_mn(mas), i, + pr_err(PTR_FMT "[%u] %lu < %lu\n", mas_mn(mas), i, piv, mas->min); MAS_WARN_ON(mas, piv < mas->min); } if (piv > mas->max) { - pr_err("%p[%u] %lu > %lu\n", mas_mn(mas), i, + pr_err(PTR_FMT "[%u] %lu > %lu\n", mas_mn(mas), i, piv, mas->max); MAS_WARN_ON(mas, piv > mas->max); } @@ -7504,7 +7520,7 @@ static void mas_validate_limits(struct ma_state *mas) } if (mas_data_end(mas) != i) { - pr_err("node%p: data_end %u != the last slot offset %u\n", + pr_err("node" PTR_FMT ": data_end %u != the last slot offset %u\n", mas_mn(mas), mas_data_end(mas), i); MT_BUG_ON(mas->tree, 1); } @@ -7513,8 +7529,8 @@ static void mas_validate_limits(struct ma_state *mas) void *entry = mas_slot(mas, slots, i); if (entry && (i != mt_slots[type] - 1)) { - pr_err("%p[%u] should not have entry %p\n", mas_mn(mas), - i, entry); + pr_err(PTR_FMT "[%u] should not have entry " PTR_FMT "\n", + mas_mn(mas), i, entry); MT_BUG_ON(mas->tree, entry != NULL); } @@ -7524,7 +7540,7 @@ static void mas_validate_limits(struct ma_state *mas) if (!piv) continue; - pr_err("%p[%u] should not have piv %lu\n", + pr_err(PTR_FMT "[%u] should not have piv %lu\n", mas_mn(mas), i, piv); MAS_WARN_ON(mas, i < mt_pivots[type] - 1); } @@ -7549,7 +7565,7 @@ static void mt_validate_nulls(struct maple_tree *mt) do { entry = mas_slot(&mas, slots, offset); if (!last && !entry) { - pr_err("Sequential nulls end at %p[%u]\n", + pr_err("Sequential nulls end at " PTR_FMT "[%u]\n", mas_mn(&mas), offset); } MT_BUG_ON(mt, !last && !entry); @@ -7591,7 +7607,8 @@ void mt_validate(struct maple_tree *mt) end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && (mas.max != ULONG_MAX))) { - pr_err("Invalid size %u of %p\n", end, mas_mn(&mas)); + pr_err("Invalid size %u of " PTR_FMT "\n", + end, mas_mn(&mas)); } mas_validate_parent_slot(&mas); @@ -7607,7 +7624,8 @@ EXPORT_SYMBOL_GPL(mt_validate); void mas_dump(const struct ma_state *mas) { - pr_err("MAS: tree=%p enode=%p ", mas->tree, mas->node); + pr_err("MAS: tree=" PTR_FMT " enode=" PTR_FMT " ", + mas->tree, mas->node); switch (mas->status) { case ma_active: pr_err("(ma_active)"); @@ -7671,7 +7689,7 @@ void mas_dump(const struct ma_state *mas) pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); - pr_err(" min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n", + pr_err(" min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n", mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); if (mas->index > mas->last) pr_err("Check index & last\n"); @@ -7680,7 +7698,7 @@ EXPORT_SYMBOL_GPL(mas_dump); void mas_wr_dump(const struct ma_wr_state *wr_mas) { - pr_err("WR_MAS: node=%p r_min=%lx r_max=%lx\n", + pr_err("WR_MAS: node=" PTR_FMT " r_min=%lx r_max=%lx\n", wr_mas->node, wr_mas->r_min, wr_mas->r_max); pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", wr_mas->type, wr_mas->offset_end, wr_mas->mas->end, -- cgit v1.2.3 From f0c99037a0c6301ca8c3e41162dd0426b5d38abe Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 11 Oct 2024 17:44:51 -0400 Subject: maple_tree: refactor mas_wr_store_type() In mas_wr_store_type(), we check if new_end < mt_slots[wr_mas->type]. If this check fails, we know that ,after this, new_end is >= mt_min_slots. Checking this again when we detect a wr_node_store later in the function is reduntant. Because this check is part of an OR statement, the statement will always evaluate to true, therefore we can just get rid of it. We also refactor mas_wr_store_type() to return the store type rather than set it directly as it greatly cleans up the function. Link: https://lkml.kernel.org/r/20241011214451.7286-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Suggested-by: Liam Howlett Suggested-by: Wei Yang Reviewed-by: Wei Yang Reviewed-by: Liam Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/maple_tree.c | 72 ++++++++++++++++++++------------------------------------ 1 file changed, 25 insertions(+), 47 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b3b1d4b8126b..a5e982e482dd 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4191,24 +4191,22 @@ static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) } /* - * mas_wr_store_type() - Set the store type for a given + * mas_wr_store_type() - Determine the store type for a given * store operation. * @wr_mas: The maple write state + * + * Return: the type of store needed for the operation */ -static inline void mas_wr_store_type(struct ma_wr_state *wr_mas) +static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end; - if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) { - mas->store_type = wr_store_root; - return; - } + if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) + return wr_store_root; - if (unlikely(!mas_wr_walk(wr_mas))) { - mas->store_type = wr_spanning_store; - return; - } + if (unlikely(!mas_wr_walk(wr_mas))) + return wr_spanning_store; /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); @@ -4216,50 +4214,30 @@ static inline void mas_wr_store_type(struct ma_wr_state *wr_mas) mas_wr_extend_null(wr_mas); new_end = mas_wr_new_end(wr_mas); - if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) { - mas->store_type = wr_exact_fit; - return; - } + if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) + return wr_exact_fit; - if (unlikely(!mas->index && mas->last == ULONG_MAX)) { - mas->store_type = wr_new_root; - return; - } + if (unlikely(!mas->index && mas->last == ULONG_MAX)) + return wr_new_root; /* Potential spanning rebalance collapsing a node */ if (new_end < mt_min_slots[wr_mas->type]) { - if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) { - mas->store_type = wr_rebalance; - return; - } - mas->store_type = wr_node_store; - return; + if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) + return wr_rebalance; + return wr_node_store; } - if (new_end >= mt_slots[wr_mas->type]) { - mas->store_type = wr_split_store; - return; - } + if (new_end >= mt_slots[wr_mas->type]) + return wr_split_store; - if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) { - mas->store_type = wr_append; - return; - } + if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) + return wr_append; if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) || - (wr_mas->offset_end - mas->offset == 1))) { - mas->store_type = wr_slot_store; - return; - } - - if (mte_is_root(mas->node) || (new_end >= mt_min_slots[wr_mas->type]) || - (mas->mas_flags & MA_STATE_BULK)) { - mas->store_type = wr_node_store; - return; - } + (wr_mas->offset_end - mas->offset == 1))) + return wr_slot_store; - mas->store_type = wr_invalid; - MAS_WARN_ON(mas, 1); + return wr_node_store; } /** @@ -4274,7 +4252,7 @@ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) int request; mas_wr_prealloc_setup(wr_mas); - mas_wr_store_type(wr_mas); + mas->store_type = mas_wr_store_type(wr_mas); request = mas_prealloc_calc(mas, entry); if (!request) return; @@ -5446,7 +5424,7 @@ void *mas_store(struct ma_state *mas, void *entry) * overwrite multiple entries within a self-balancing B-Tree. */ mas_wr_prealloc_setup(&wr_mas); - mas_wr_store_type(&wr_mas); + mas->store_type = mas_wr_store_type(&wr_mas); if (mas->mas_flags & MA_STATE_PREALLOC) { mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); @@ -5549,7 +5527,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) int request; mas_wr_prealloc_setup(&wr_mas); - mas_wr_store_type(&wr_mas); + mas->store_type = mas_wr_store_type(&wr_mas); request = mas_prealloc_calc(mas, entry); if (!request) return ret; -- cgit v1.2.3 From 5b2100f723bd5c2b5552b27208f3e7d7447910d3 Mon Sep 17 00:00:00 2001 From: Jiazi Li Date: Wed, 26 Jun 2024 12:06:30 -0400 Subject: maple_tree: fix alloc node fail issue In the following code, the second call to the mas_node_count will return -ENOMEM: mas_node_count(mas, MAPLE_ALLOC_SLOTS + 1); mas_node_count(mas, MAPLE_ALLOC_SLOTS * 2 + 2); This is because there may be some full maple_alloc node in current maple state. Use full maple_alloc node will make max_req equal to 0. And it leads to mt_alloc_bulk return 0. As a result, mas_node_count set mas.node to MA_ERROR(-ENOMEM). Find a non-full maple_alloc node, and if necessary, use this non-full node in the next while loop. Link: https://lkml.kernel.org/r/20240626160631.3636515-1-Liam.Howlett@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Jiazi Li Signed-off-by: Liam R. Howlett Suggested-by: Liam R. Howlett Reviewed-by: Wei Yang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index a5e982e482dd..cdac15168405 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1285,7 +1285,10 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) node->node_count += count; allocated += count; - node = node->slot[0]; + /* find a non-full node*/ + do { + node = node->slot[0]; + } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS)); requested -= count; } mas->alloc->total = allocated; -- cgit v1.2.3 From 0cc8d68abe2fdcb7039ece95f784698c0b0dc51e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 13 Sep 2024 06:31:28 +0000 Subject: maple_tree: root node could be handled by !p_slot too For a root node, mte_parent_slot() return 0, this exactly fits the following !p_slot check. So we can remove the special handling for root node. Link: https://lkml.kernel.org/r/20240913063128.27391-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index cdac15168405..c2d3c8d27358 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2155,9 +2155,7 @@ static inline bool mas_prev_sibling(struct ma_state *mas) { unsigned int p_slot = mte_parent_slot(mas->node); - if (mte_is_root(mas->node)) - return false; - + /* For root node, p_slot is set to 0 by mte_parent_slot(). */ if (!p_slot) return false; -- cgit v1.2.3 From e852cb1d00ceb4b0156832c13ba3daf7ed93ac17 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 15 Oct 2024 12:07:44 +0000 Subject: maple_tree: clear request_count for new allocated one Patch series "maple_tree: simplify mas_push_node()", v2. When count is not 0, we know head is valid. So we can put the assignment in if (count) instead of checking the head pointer again. Also count represents current total, we can assign the new total by increasing the count by one. This patch (of 3): If this is not a new allocated one, the request_count has already been cleared in mas_set_alloc_req(). Link: https://lkml.kernel.org/r/20241015120746.15850-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20241015120746.15850-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index c2d3c8d27358..ee7922b19f0a 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1265,11 +1265,11 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) mas->alloc = node; node->total = ++allocated; + node->request_count = 0; requested--; } node = mas->alloc; - node->request_count = 0; while (requested) { max_req = MAPLE_ALLOC_SLOTS - node->node_count; slots = (void **)&node->slot[node->node_count]; -- cgit v1.2.3 From 4223dd93bfc976debededffc0b03cc63d9b73d14 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 15 Oct 2024 12:07:45 +0000 Subject: maple_tree: total is not changed for nomem_one case If it jumps to nomem_one, the total allocated number is not changed. So we don't need to adjust it. For the nomem_bulk case, we know there is a valid mas->alloc. So we don't need to do the check. Link: https://lkml.kernel.org/r/20241015120746.15850-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index ee7922b19f0a..1b80201865d8 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1297,10 +1297,9 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) nomem_bulk: /* Clean up potential freed allocations on bulk failure */ memset(slots, 0, max_req * sizeof(unsigned long)); + mas->alloc->total = allocated; nomem_one: mas_set_alloc_req(mas, requested); - if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) - mas->alloc->total = allocated; mas_set_err(mas, -ENOMEM); } -- cgit v1.2.3 From 908378a30b0972e5bf8fae3cf38affc162fe8e3b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 15 Oct 2024 12:07:46 +0000 Subject: maple_tree: simplify mas_push_node() When count is not 0, we know head is valid. So we can put the assignment in if (count) instead of checking the head pointer again. Also count represents current total, we can assign the new total by increasing the count by one. Link: https://lkml.kernel.org/r/20241015120746.15850-4-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 1b80201865d8..667120a44570 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1207,19 +1207,17 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) reuse->request_count = 0; reuse->node_count = 0; - if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) { - head->slot[head->node_count++] = reuse; - head->total++; - goto done; - } - - reuse->total = 1; - if ((head) && !((unsigned long)head & 0x1)) { + if (count) { + if (head->node_count < MAPLE_ALLOC_SLOTS) { + head->slot[head->node_count++] = reuse; + head->total++; + goto done; + } reuse->slot[0] = head; reuse->node_count = 1; - reuse->total += head->total; } + reuse->total = count + 1; mas->alloc = reuse; done: if (requested > 1) -- cgit v1.2.3 From 4a7bba1df00163ecbdf4994bc42b879ade4aeed2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 16 Oct 2024 21:23:52 +0300 Subject: percpu: add a test case for the specific 64-bit value addition It might be a corner case when we add UINT_MAX as 64-bit unsigned value to the percpu variable as it's not the same as -1 (ULONG_LONG_MAX). Add a test case for that. Link: https://lkml.kernel.org/r/20241016182635.1156168-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Borislav Petkov (AMD) Cc: Christoph Lameter Cc: Dave Hansen Cc: Dennis Zhou Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Ingo Molnar Cc: Tejun Heo Cc: Thomas Gleixner Cc: Uros Bizjak Signed-off-by: Andrew Morton --- lib/percpu_test.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/percpu_test.c b/lib/percpu_test.c index 4a3d70bbc1a0..ce7124b16dab 100644 --- a/lib/percpu_test.c +++ b/lib/percpu_test.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include /* validate @native and @pcp counter values match @expected */ @@ -24,8 +25,9 @@ static int __init percpu_test_init(void) * +ul_one/-ul_one below would replace with inc/dec instructions. */ volatile unsigned int ui_one = 1; - long l = 0; + unsigned long long ull = 0; unsigned long ul = 0; + long l = 0; pr_info("percpu test start\n"); @@ -112,6 +114,13 @@ static int __init percpu_test_init(void) CHECK(ul, ulong_counter, -1); CHECK(ul, ulong_counter, ULONG_MAX); + ul = ull = 0; + __this_cpu_write(ulong_counter, 0); + + ul = ull += UINT_MAX; + __this_cpu_add(ulong_counter, ull); + CHECK(ul, ulong_counter, UINT_MAX); + ul = 3; __this_cpu_write(ulong_counter, 3); -- cgit v1.2.3 From 61e9df7085cca6b62e9d230ed807eb524126a105 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 17 Oct 2024 01:58:08 +0000 Subject: maple_tree: calculate new_end when needed Patch series "Following cleanup after introduce mas_wr_store_type()", v2. Patch 1 postpone new_end calculation when needed. Patch 2 removes a unnecessary sanity check in mas_wr_slot_store(). This patch (of 2): For wr_exact_fit/wr_new_root, we don't need to calculate new_end. Let's postpone it until necessary. Link: https://lkml.kernel.org/r/20241017015809.23392-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20241017015809.23392-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 667120a44570..bc30e99d6cf0 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4211,13 +4211,13 @@ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) if (!wr_mas->entry) mas_wr_extend_null(wr_mas); - new_end = mas_wr_new_end(wr_mas); if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) return wr_exact_fit; if (unlikely(!mas->index && mas->last == ULONG_MAX)) return wr_new_root; + new_end = mas_wr_new_end(wr_mas); /* Potential spanning rebalance collapsing a node */ if (new_end < mt_min_slots[wr_mas->type]) { if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) -- cgit v1.2.3 From 38dc8f495246667b543de4cc646fce2925e4cf3b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 17 Oct 2024 01:58:09 +0000 Subject: maple_tree: remove sanity check from mas_wr_slot_store() After commit 5d659bbb52a2 ("maple_tree: introduce mas_wr_store_type()"), the check here is redundant. Let's remove it. Link: https://lkml.kernel.org/r/20241017015809.23392-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index bc30e99d6cf0..38aa8abf8eb8 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3897,7 +3897,8 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) wr_mas->pivots[offset] = mas->index - 1; mas->offset++; /* Keep mas accurate. */ } - } else if (!mt_in_rcu(mas->tree)) { + } else { + WARN_ON_ONCE(mt_in_rcu(mas->tree)); /* * Expand the range, only partially overwriting the previous and * next ranges @@ -3907,8 +3908,6 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) wr_mas->pivots[offset] = mas->index - 1; wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ - } else { - return; } trace_ma_write(__func__, mas, 0, wr_mas->entry); -- cgit v1.2.3 From 2b37c814aab74130bcf2573a9dc1551301283cea Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 25 Oct 2024 16:14:50 +0200 Subject: lib/Kconfig.debug: Default STRICT_DEVMEM to "y" on s390 virtio-mem currently depends on !DEVMEM | STRICT_DEVMEM. Let's default STRICT_DEVMEM to "y" just like we do for arm64 and x86. There could be ways in the future to filter access to virtio-mem device memory even without STRICT_DEVMEM, but for now let's just keep it simple. Tested-by: Mario Casquero Acked-by: Heiko Carstens Signed-off-by: David Hildenbrand Tested-by: Sumanth Korikkar Acked-by: Christian Borntraeger Link: https://lore.kernel.org/r/20241025141453.1210600-6-david@redhat.com Signed-off-by: Heiko Carstens --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7315f643817a..e7a917540e2a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1905,7 +1905,7 @@ config STRICT_DEVMEM bool "Filter access to /dev/mem" depends on MMU && DEVMEM depends on ARCH_HAS_DEVMEM_IS_ALLOWED || GENERIC_LIB_DEVMEM_IS_ALLOWED - default y if PPC || X86 || ARM64 + default y if PPC || X86 || ARM64 || S390 help If this option is disabled, you allow userspace (root) access to all of memory, including kernel and userspace memory. Accidental -- cgit v1.2.3 From cb6fcef8b4b6c655b6a25cc3a415cd9eb81b3da8 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 28 Oct 2024 12:26:27 +0900 Subject: objpool: fix to make percpu slot allocation more robust Since gfp & GFP_ATOMIC == GFP_ATOMIC is true for GFP_KERNEL | GFP_HIGH, it will use kmalloc if user specifies that combination. Here the reason why combining the __vmalloc_node() and kmalloc_node() is that the vmalloc does not support all GFP flag, especially GFP_ATOMIC. So we should check if gfp & (GFP_ATOMIC | GFP_KERNEL) != GFP_ATOMIC for vmalloc first. This ensures caller can sleep. And for the robustness, even if vmalloc fails, it should retry with kmalloc to allocate it. Link: https://lkml.kernel.org/r/173008598713.1262174.2959179484209897252.stgit@mhiramat.roam.corp.google.com Fixes: aff1871bfc81 ("objpool: fix choosing allocation for percpu slots") Signed-off-by: Masami Hiramatsu (Google) Reported-by: Linus Torvalds Closes: https://lore.kernel.org/all/CAHk-=whO+vSH+XVRio8byJU8idAWES0SPGVZ7KAVdc4qrV0VUA@mail.gmail.com/ Cc: Leo Yan Cc: Linus Torvalds Cc: Matt Wu Cc: Mikel Rychliski Cc: Steven Rostedt (Google) Cc: Viktor Malik Signed-off-by: Andrew Morton --- lib/objpool.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/objpool.c b/lib/objpool.c index fd108fe0d095..b998b720c732 100644 --- a/lib/objpool.c +++ b/lib/objpool.c @@ -74,15 +74,21 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs, * warm caches and TLB hits. in default vmalloc is used to * reduce the pressure of kernel slab system. as we know, * mimimal size of vmalloc is one page since vmalloc would - * always align the requested size to page size + * always align the requested size to page size. + * but if vmalloc fails or it is not available (e.g. GFP_ATOMIC) + * allocate percpu slot with kmalloc. */ - if ((pool->gfp & GFP_ATOMIC) == GFP_ATOMIC) - slot = kmalloc_node(size, pool->gfp, cpu_to_node(i)); - else + slot = NULL; + + if ((pool->gfp & (GFP_ATOMIC | GFP_KERNEL)) != GFP_ATOMIC) slot = __vmalloc_node(size, sizeof(void *), pool->gfp, cpu_to_node(i), __builtin_return_address(0)); - if (!slot) - return -ENOMEM; + + if (!slot) { + slot = kmalloc_node(size, pool->gfp, cpu_to_node(i)); + if (!slot) + return -ENOMEM; + } memset(slot, 0, size); pool->cpu_slots[i] = slot; -- cgit v1.2.3 From 3e09c500bb5b0606282d04438404f67df132835a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:55 -0700 Subject: alloc_tag: introduce shutdown_mem_profiling helper function Implement a helper function to disable memory allocation profiling and use it when creation of /proc/allocinfo fails. Ensure /proc/allocinfo does not get created when memory allocation profiling is disabled. Link: https://lkml.kernel.org/r/20241023170759.999909-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 81e5f9a70f22..435aa837e550 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -8,6 +8,14 @@ #include #include +#define ALLOCINFO_FILE_NAME "allocinfo" + +#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT +static bool mem_profiling_support __meminitdata = true; +#else +static bool mem_profiling_support __meminitdata; +#endif + static struct codetag_type *alloc_tag_cttype; DEFINE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); @@ -144,9 +152,26 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl return nr; } +static void __init shutdown_mem_profiling(void) +{ + if (mem_alloc_profiling_enabled()) + static_branch_disable(&mem_alloc_profiling_key); + + if (!mem_profiling_support) + return; + + mem_profiling_support = false; +} + static void __init procfs_init(void) { - proc_create_seq("allocinfo", 0400, NULL, &allocinfo_seq_op); + if (!mem_profiling_support) + return; + + if (!proc_create_seq(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op)) { + pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME); + shutdown_mem_profiling(); + } } static bool alloc_tag_module_unload(struct codetag_type *cttype, @@ -174,12 +199,6 @@ static bool alloc_tag_module_unload(struct codetag_type *cttype, return module_unused; } -#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT -static bool mem_profiling_support __meminitdata = true; -#else -static bool mem_profiling_support __meminitdata; -#endif - static int __init setup_early_mem_profiling(char *str) { bool enable; -- cgit v1.2.3 From 0db6f8d7820a4b788565dac8eed52bfc2c3216da Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:56 -0700 Subject: alloc_tag: load module tags into separate contiguous memory When a module gets unloaded there is a possibility that some of the allocations it made are still used and therefore the allocation tags corresponding to these allocations are still referenced. As such, the memory for these tags can't be freed. This is currently handled as an abnormal situation and module's data section is not being unloaded. To handle this situation without keeping module's data in memory, allow codetags with longer lifespan than the module to be loaded into their own separate memory. The in-use memory areas and gaps after module unloading in this separate memory are tracked using maple trees. Allocation tags arrange their separate memory so that it is virtually contiguous and that will allow simple allocation tag indexing later on in this patchset. The size of this virtually contiguous memory is set to store up to 100000 allocation tags. [surenb@google.com: fix empty codetag module section handling] Link: https://lkml.kernel.org/r/20241101000017.3856204-1-surenb@google.com [akpm@linux-foundation.org: update comment, per Dan] Link: https://lkml.kernel.org/r/20241023170759.999909-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/asm-generic/codetag.lds.h | 19 +++ include/linux/alloc_tag.h | 13 +- include/linux/codetag.h | 37 +++++- kernel/module/main.c | 84 +++++++++---- lib/alloc_tag.c | 249 +++++++++++++++++++++++++++++++++++--- lib/codetag.c | 100 +++++++++++++-- scripts/module.lds.S | 5 +- 7 files changed, 445 insertions(+), 62 deletions(-) (limited to 'lib') diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h index 64f536b80380..372c320c5043 100644 --- a/include/asm-generic/codetag.lds.h +++ b/include/asm-generic/codetag.lds.h @@ -11,4 +11,23 @@ #define CODETAG_SECTIONS() \ SECTION_WITH_BOUNDARIES(alloc_tags) +/* + * Module codetags which aren't used after module unload, therefore have the + * same lifespan as the module and can be safely unloaded with the module. + */ +#define MOD_CODETAG_SECTIONS() + +#define MOD_SEPARATE_CODETAG_SECTION(_name) \ + .codetag.##_name : { \ + SECTION_WITH_BOUNDARIES(_name) \ + } + +/* + * For codetags which might be used after module unload, therefore might stay + * longer in memory. Each such codetag type has its own section so that we can + * unload them individually once unused. + */ +#define MOD_SEPARATE_CODETAG_SECTIONS() \ + MOD_SEPARATE_CODETAG_SECTION(alloc_tags) + #endif /* __ASM_GENERIC_CODETAG_LDS_H */ diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 941deffc590d..55d30543c4c7 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -30,6 +30,13 @@ struct alloc_tag { struct alloc_tag_counters __percpu *counters; } __aligned(8); +struct alloc_tag_module_section { + unsigned long start_addr; + unsigned long end_addr; + /* used size */ + unsigned long size; +}; + #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG #define CODETAG_EMPTY ((void *)1) @@ -54,6 +61,8 @@ static inline void set_codetag_empty(union codetag_ref *ref) {} #ifdef CONFIG_MEM_ALLOC_PROFILING +#define ALLOC_TAG_SECTION_NAME "alloc_tags" + struct codetag_bytes { struct codetag *ct; s64 bytes; @@ -76,7 +85,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); #define DEFINE_ALLOC_TAG(_alloc_tag) \ static struct alloc_tag _alloc_tag __used __aligned(8) \ - __section("alloc_tags") = { \ + __section(ALLOC_TAG_SECTION_NAME) = { \ .ct = CODE_TAG_INIT, \ .counters = &_shared_alloc_tag }; @@ -85,7 +94,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); #define DEFINE_ALLOC_TAG(_alloc_tag) \ static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \ static struct alloc_tag _alloc_tag __used __aligned(8) \ - __section("alloc_tags") = { \ + __section(ALLOC_TAG_SECTION_NAME) = { \ .ct = CODE_TAG_INIT, \ .counters = &_alloc_tag_cntr }; diff --git a/include/linux/codetag.h b/include/linux/codetag.h index c2a579ccd455..d10bd9810d32 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -35,8 +35,15 @@ struct codetag_type_desc { size_t tag_size; void (*module_load)(struct codetag_type *cttype, struct codetag_module *cmod); - bool (*module_unload)(struct codetag_type *cttype, + void (*module_unload)(struct codetag_type *cttype, struct codetag_module *cmod); +#ifdef CONFIG_MODULES + void (*module_replaced)(struct module *mod, struct module *new_mod); + bool (*needs_section_mem)(struct module *mod, unsigned long size); + void *(*alloc_section_mem)(struct module *mod, unsigned long size, + unsigned int prepend, unsigned long align); + void (*free_section_mem)(struct module *mod, bool used); +#endif }; struct codetag_iterator { @@ -71,11 +78,31 @@ struct codetag_type * codetag_register_type(const struct codetag_type_desc *desc); #if defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) + +bool codetag_needs_module_section(struct module *mod, const char *name, + unsigned long size); +void *codetag_alloc_module_section(struct module *mod, const char *name, + unsigned long size, unsigned int prepend, + unsigned long align); +void codetag_free_module_sections(struct module *mod); +void codetag_module_replaced(struct module *mod, struct module *new_mod); void codetag_load_module(struct module *mod); -bool codetag_unload_module(struct module *mod); -#else +void codetag_unload_module(struct module *mod); + +#else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */ + +static inline bool +codetag_needs_module_section(struct module *mod, const char *name, + unsigned long size) { return false; } +static inline void * +codetag_alloc_module_section(struct module *mod, const char *name, + unsigned long size, unsigned int prepend, + unsigned long align) { return NULL; } +static inline void codetag_free_module_sections(struct module *mod) {} +static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) {} static inline void codetag_load_module(struct module *mod) {} -static inline bool codetag_unload_module(struct module *mod) { return true; } -#endif +static inline void codetag_unload_module(struct module *mod) {} + +#endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */ #endif /* _LINUX_CODETAG_H */ diff --git a/kernel/module/main.c b/kernel/module/main.c index 73b588fe98d4..00c16f5c5568 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1251,22 +1251,17 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) return 0; } -static void module_memory_free(struct module *mod, enum mod_mem_type type, - bool unload_codetags) +static void module_memory_free(struct module *mod, enum mod_mem_type type) { struct module_memory *mem = &mod->mem[type]; - void *ptr = mem->base; if (mem->is_rox) vfree(mem->rw_copy); - if (!unload_codetags && mod_mem_type_is_core_data(type)) - return; - - execmem_free(ptr); + execmem_free(mem->base); } -static void free_mod_mem(struct module *mod, bool unload_codetags) +static void free_mod_mem(struct module *mod) { for_each_mod_mem_type(type) { struct module_memory *mod_mem = &mod->mem[type]; @@ -1277,25 +1272,20 @@ static void free_mod_mem(struct module *mod, bool unload_codetags) /* Free lock-classes; relies on the preceding sync_rcu(). */ lockdep_free_key_range(mod_mem->base, mod_mem->size); if (mod_mem->size) - module_memory_free(mod, type, unload_codetags); + module_memory_free(mod, type); } /* MOD_DATA hosts mod, so free it at last */ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); - module_memory_free(mod, MOD_DATA, unload_codetags); + module_memory_free(mod, MOD_DATA); } /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { - bool unload_codetags; - trace_module_free(mod); - unload_codetags = codetag_unload_module(mod); - if (!unload_codetags) - pr_warn("%s: memory allocation(s) from the module still alive, cannot unload cleanly\n", - mod->name); + codetag_unload_module(mod); mod_sysfs_teardown(mod); @@ -1338,7 +1328,7 @@ static void free_module(struct module *mod) kfree(mod->args); percpu_modfree(mod); - free_mod_mem(mod, unload_codetags); + free_mod_mem(mod); } void *__symbol_get(const char *symbol) @@ -1603,6 +1593,20 @@ static void __layout_sections(struct module *mod, struct load_info *info, bool i if (WARN_ON_ONCE(type == MOD_INVALID)) continue; + /* + * Do not allocate codetag memory as we load it into + * preallocated contiguous memory. + */ + if (codetag_needs_module_section(mod, sname, s->sh_size)) { + /* + * s->sh_entsize won't be used but populate the + * type field to avoid confusion. + */ + s->sh_entsize = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK) + << SH_ENTSIZE_TYPE_SHIFT; + continue; + } + s->sh_entsize = module_get_offset_and_type(mod, type, s, i); pr_debug("\t%s\n", sname); } @@ -2277,6 +2281,7 @@ static int move_module(struct module *mod, struct load_info *info) int i; enum mod_mem_type t = 0; int ret = -ENOMEM; + bool codetag_section_found = false; for_each_mod_mem_type(type) { if (!mod->mem[type].size) { @@ -2288,7 +2293,7 @@ static int move_module(struct module *mod, struct load_info *info) ret = module_memory_alloc(mod, type); if (ret) { t = type; - goto out_enomem; + goto out_err; } } @@ -2297,15 +2302,37 @@ static int move_module(struct module *mod, struct load_info *info) for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; - enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; - unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK; + const char *sname; unsigned long addr; if (!(shdr->sh_flags & SHF_ALLOC)) continue; - addr = (unsigned long)mod->mem[type].base + offset; - dest = mod->mem[type].rw_copy + offset; + sname = info->secstrings + shdr->sh_name; + /* + * Load codetag sections separately as they might still be used + * after module unload. + */ + if (codetag_needs_module_section(mod, sname, shdr->sh_size)) { + dest = codetag_alloc_module_section(mod, sname, shdr->sh_size, + arch_mod_section_prepend(mod, i), shdr->sh_addralign); + if (WARN_ON(!dest)) { + ret = -EINVAL; + goto out_err; + } + if (IS_ERR(dest)) { + ret = PTR_ERR(dest); + goto out_err; + } + addr = (unsigned long)dest; + codetag_section_found = true; + } else { + enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; + unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK; + + addr = (unsigned long)mod->mem[type].base + offset; + dest = mod->mem[type].rw_copy + offset; + } if (shdr->sh_type != SHT_NOBITS) { /* @@ -2317,7 +2344,7 @@ static int move_module(struct module *mod, struct load_info *info) if (i == info->index.mod && (WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) { ret = -ENOEXEC; - goto out_enomem; + goto out_err; } memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); } @@ -2333,9 +2360,12 @@ static int move_module(struct module *mod, struct load_info *info) } return 0; -out_enomem: +out_err: for (t--; t >= 0; t--) - module_memory_free(mod, t, true); + module_memory_free(mod, t); + if (codetag_section_found) + codetag_free_module_sections(mod); + return ret; } @@ -2456,6 +2486,8 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* Module has been copied to its final place now: return it. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; kmemleak_load_module(mod, info); + codetag_module_replaced(info->mod, mod); + return mod; } @@ -2465,7 +2497,7 @@ static void module_deallocate(struct module *mod, struct load_info *info) percpu_modfree(mod); module_arch_freeing_init(mod); - free_mod_mem(mod, true); + free_mod_mem(mod); } int __weak module_finalize(const Elf_Ehdr *hdr, diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 435aa837e550..5f9cd1642d58 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include #include #include #include @@ -9,6 +10,7 @@ #include #define ALLOCINFO_FILE_NAME "allocinfo" +#define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag)) #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT static bool mem_profiling_support __meminitdata = true; @@ -174,31 +176,226 @@ static void __init procfs_init(void) } } -static bool alloc_tag_module_unload(struct codetag_type *cttype, - struct codetag_module *cmod) +#ifdef CONFIG_MODULES + +static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE); +/* A dummy object used to indicate an unloaded module */ +static struct module unloaded_mod; +/* A dummy object used to indicate a module prepended area */ +static struct module prepend_mod; + +static struct alloc_tag_module_section module_tags; + +static bool needs_section_mem(struct module *mod, unsigned long size) { - struct codetag_iterator iter = codetag_get_ct_iter(cttype); - struct alloc_tag_counters counter; - bool module_unused = true; - struct alloc_tag *tag; - struct codetag *ct; + return size >= sizeof(struct alloc_tag); +} + +static struct alloc_tag *find_used_tag(struct alloc_tag *from, struct alloc_tag *to) +{ + while (from <= to) { + struct alloc_tag_counters counter; - for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { - if (iter.cmod != cmod) + counter = alloc_tag_read(from); + if (counter.bytes) + return from; + from++; + } + + return NULL; +} + +/* Called with mod_area_mt locked */ +static void clean_unused_module_areas_locked(void) +{ + MA_STATE(mas, &mod_area_mt, 0, module_tags.size); + struct module *val; + + mas_for_each(&mas, val, module_tags.size) { + if (val != &unloaded_mod) continue; - tag = ct_to_alloc_tag(ct); - counter = alloc_tag_read(tag); + /* Release area if all tags are unused */ + if (!find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index), + (struct alloc_tag *)(module_tags.start_addr + mas.last))) + mas_erase(&mas); + } +} + +/* Called with mod_area_mt locked */ +static bool find_aligned_area(struct ma_state *mas, unsigned long section_size, + unsigned long size, unsigned int prepend, unsigned long align) +{ + bool cleanup_done = false; + +repeat: + /* Try finding exact size and hope the start is aligned */ + if (!mas_empty_area(mas, 0, section_size - 1, prepend + size)) { + if (IS_ALIGNED(mas->index + prepend, align)) + return true; + + /* Try finding larger area to align later */ + mas_reset(mas); + if (!mas_empty_area(mas, 0, section_size - 1, + size + prepend + align - 1)) + return true; + } + + /* No free area, try cleanup stale data and repeat the search once */ + if (!cleanup_done) { + clean_unused_module_areas_locked(); + cleanup_done = true; + mas_reset(mas); + goto repeat; + } + + return false; +} + +static void *reserve_module_tags(struct module *mod, unsigned long size, + unsigned int prepend, unsigned long align) +{ + unsigned long section_size = module_tags.end_addr - module_tags.start_addr; + MA_STATE(mas, &mod_area_mt, 0, section_size - 1); + unsigned long offset; + void *ret = NULL; + + /* If no tags return error */ + if (size < sizeof(struct alloc_tag)) + return ERR_PTR(-EINVAL); + + /* + * align is always power of 2, so we can use IS_ALIGNED and ALIGN. + * align 0 or 1 means no alignment, to simplify set to 1. + */ + if (!align) + align = 1; + + mas_lock(&mas); + if (!find_aligned_area(&mas, section_size, size, prepend, align)) { + ret = ERR_PTR(-ENOMEM); + goto unlock; + } + + /* Mark found area as reserved */ + offset = mas.index; + offset += prepend; + offset = ALIGN(offset, align); + if (offset != mas.index) { + unsigned long pad_start = mas.index; + + mas.last = offset - 1; + mas_store(&mas, &prepend_mod); + if (mas_is_err(&mas)) { + ret = ERR_PTR(xa_err(mas.node)); + goto unlock; + } + mas.index = offset; + mas.last = offset + size - 1; + mas_store(&mas, mod); + if (mas_is_err(&mas)) { + mas.index = pad_start; + mas_erase(&mas); + ret = ERR_PTR(xa_err(mas.node)); + } + } else { + mas.last = offset + size - 1; + mas_store(&mas, mod); + if (mas_is_err(&mas)) + ret = ERR_PTR(xa_err(mas.node)); + } +unlock: + mas_unlock(&mas); + + if (IS_ERR(ret)) + return ret; - if (WARN(counter.bytes, - "%s:%u module %s func:%s has %llu allocated at module unload", - ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) - module_unused = false; + if (module_tags.size < offset + size) + module_tags.size = offset + size; + + return (struct alloc_tag *)(module_tags.start_addr + offset); +} + +static void release_module_tags(struct module *mod, bool used) +{ + MA_STATE(mas, &mod_area_mt, module_tags.size, module_tags.size); + struct alloc_tag *tag; + struct module *val; + + mas_lock(&mas); + mas_for_each_rev(&mas, val, 0) + if (val == mod) + break; + + if (!val) /* module not found */ + goto out; + + if (!used) + goto release_area; + + /* Find out if the area is used */ + tag = find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index), + (struct alloc_tag *)(module_tags.start_addr + mas.last)); + if (tag) { + struct alloc_tag_counters counter = alloc_tag_read(tag); + + pr_info("%s:%u module %s func:%s has %llu allocated at module unload\n", + tag->ct.filename, tag->ct.lineno, tag->ct.modname, + tag->ct.function, counter.bytes); + } else { + used = false; + } +release_area: + mas_store(&mas, used ? &unloaded_mod : NULL); + val = mas_prev_range(&mas, 0); + if (val == &prepend_mod) + mas_store(&mas, NULL); +out: + mas_unlock(&mas); +} + +static void replace_module(struct module *mod, struct module *new_mod) +{ + MA_STATE(mas, &mod_area_mt, 0, module_tags.size); + struct module *val; + + mas_lock(&mas); + mas_for_each(&mas, val, module_tags.size) { + if (val != mod) + continue; + + mas_store_gfp(&mas, new_mod, GFP_KERNEL); + break; } + mas_unlock(&mas); +} + +static int __init alloc_mod_tags_mem(void) +{ + /* Allocate space to copy allocation tags */ + module_tags.start_addr = (unsigned long)execmem_alloc(EXECMEM_MODULE_DATA, + MODULE_ALLOC_TAG_VMAP_SIZE); + if (!module_tags.start_addr) + return -ENOMEM; + + module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE; + + return 0; +} - return module_unused; +static void __init free_mod_tags_mem(void) +{ + execmem_free((void *)module_tags.start_addr); + module_tags.start_addr = 0; } +#else /* CONFIG_MODULES */ + +static inline int alloc_mod_tags_mem(void) { return 0; } +static inline void free_mod_tags_mem(void) {} + +#endif /* CONFIG_MODULES */ + static int __init setup_early_mem_profiling(char *str) { bool enable; @@ -274,14 +471,26 @@ static inline void sysctl_init(void) {} static int __init alloc_tag_init(void) { const struct codetag_type_desc desc = { - .section = "alloc_tags", - .tag_size = sizeof(struct alloc_tag), - .module_unload = alloc_tag_module_unload, + .section = ALLOC_TAG_SECTION_NAME, + .tag_size = sizeof(struct alloc_tag), +#ifdef CONFIG_MODULES + .needs_section_mem = needs_section_mem, + .alloc_section_mem = reserve_module_tags, + .free_section_mem = release_module_tags, + .module_replaced = replace_module, +#endif }; + int res; + + res = alloc_mod_tags_mem(); + if (res) + return res; alloc_tag_cttype = codetag_register_type(&desc); - if (IS_ERR(alloc_tag_cttype)) + if (IS_ERR(alloc_tag_cttype)) { + free_mod_tags_mem(); return PTR_ERR(alloc_tag_cttype); + } sysctl_init(); procfs_init(); diff --git a/lib/codetag.c b/lib/codetag.c index d1fbbb7c2ec3..7455b966cae4 100644 --- a/lib/codetag.c +++ b/lib/codetag.c @@ -207,6 +207,94 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) } #ifdef CONFIG_MODULES +#define CODETAG_SECTION_PREFIX ".codetag." + +/* Some codetag types need a separate module section */ +bool codetag_needs_module_section(struct module *mod, const char *name, + unsigned long size) +{ + const char *type_name; + struct codetag_type *cttype; + bool ret = false; + + if (strncmp(name, CODETAG_SECTION_PREFIX, strlen(CODETAG_SECTION_PREFIX))) + return false; + + type_name = name + strlen(CODETAG_SECTION_PREFIX); + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + if (strcmp(type_name, cttype->desc.section) == 0) { + if (!cttype->desc.needs_section_mem) + break; + + down_write(&cttype->mod_lock); + ret = cttype->desc.needs_section_mem(mod, size); + up_write(&cttype->mod_lock); + break; + } + } + mutex_unlock(&codetag_lock); + + return ret; +} + +void *codetag_alloc_module_section(struct module *mod, const char *name, + unsigned long size, unsigned int prepend, + unsigned long align) +{ + const char *type_name = name + strlen(CODETAG_SECTION_PREFIX); + struct codetag_type *cttype; + void *ret = ERR_PTR(-EINVAL); + + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + if (strcmp(type_name, cttype->desc.section) == 0) { + if (WARN_ON(!cttype->desc.alloc_section_mem)) + break; + + down_write(&cttype->mod_lock); + ret = cttype->desc.alloc_section_mem(mod, size, prepend, align); + up_write(&cttype->mod_lock); + break; + } + } + mutex_unlock(&codetag_lock); + + return ret; +} + +void codetag_free_module_sections(struct module *mod) +{ + struct codetag_type *cttype; + + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + if (!cttype->desc.free_section_mem) + continue; + + down_write(&cttype->mod_lock); + cttype->desc.free_section_mem(mod, false); + up_write(&cttype->mod_lock); + } + mutex_unlock(&codetag_lock); +} + +void codetag_module_replaced(struct module *mod, struct module *new_mod) +{ + struct codetag_type *cttype; + + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + if (!cttype->desc.module_replaced) + continue; + + down_write(&cttype->mod_lock); + cttype->desc.module_replaced(mod, new_mod); + up_write(&cttype->mod_lock); + } + mutex_unlock(&codetag_lock); +} + void codetag_load_module(struct module *mod) { struct codetag_type *cttype; @@ -220,13 +308,12 @@ void codetag_load_module(struct module *mod) mutex_unlock(&codetag_lock); } -bool codetag_unload_module(struct module *mod) +void codetag_unload_module(struct module *mod) { struct codetag_type *cttype; - bool unload_ok = true; if (!mod) - return true; + return; /* await any module's kfree_rcu() operations to complete */ kvfree_rcu_barrier(); @@ -246,18 +333,17 @@ bool codetag_unload_module(struct module *mod) } if (found) { if (cttype->desc.module_unload) - if (!cttype->desc.module_unload(cttype, cmod)) - unload_ok = false; + cttype->desc.module_unload(cttype, cmod); cttype->count -= range_size(cttype, &cmod->range); idr_remove(&cttype->mod_idr, mod_id); kfree(cmod); } up_write(&cttype->mod_lock); + if (found && cttype->desc.free_section_mem) + cttype->desc.free_section_mem(mod, true); } mutex_unlock(&codetag_lock); - - return unload_ok; } #endif /* CONFIG_MODULES */ diff --git a/scripts/module.lds.S b/scripts/module.lds.S index 3f43edef813c..711c6e029936 100644 --- a/scripts/module.lds.S +++ b/scripts/module.lds.S @@ -50,7 +50,7 @@ SECTIONS { .data : { *(.data .data.[0-9a-zA-Z_]*) *(.data..L*) - CODETAG_SECTIONS() + MOD_CODETAG_SECTIONS() } .rodata : { @@ -59,9 +59,10 @@ SECTIONS { } #else .data : { - CODETAG_SECTIONS() + MOD_CODETAG_SECTIONS() } #endif + MOD_SEPARATE_CODETAG_SECTIONS() } /* bring in arch-specific sections */ -- cgit v1.2.3 From 0f9b685626daa2f8e19a9788625c9b624c223e45 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:57 -0700 Subject: alloc_tag: populate memory for module tags as needed The memory reserved for module tags does not need to be backed by physical pages until there are tags to store there. Change the way we reserve this memory to allocate only virtual area for the tags and populate it with physical pages as needed when we load a module. [surenb@google.com: avoid execmem_vmap() when !MMU] Link: https://lkml.kernel.org/r/20241031233611.3833002-1-surenb@google.com Link: https://lkml.kernel.org/r/20241023170759.999909-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/execmem.h | 12 ++++++++ include/linux/vmalloc.h | 3 ++ lib/Kconfig.debug | 1 + lib/alloc_tag.c | 73 +++++++++++++++++++++++++++++++++++++++++++------ mm/execmem.c | 16 +++++++++++ mm/internal.h | 6 ++++ mm/vmalloc.c | 4 +-- 7 files changed, 104 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 1517fa196bf7..64130ae19690 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -139,6 +139,18 @@ void *execmem_alloc(enum execmem_type type, size_t size); */ void execmem_free(void *ptr); +#ifdef CONFIG_MMU +/** + * execmem_vmap - create virtual mapping for EXECMEM_MODULE_DATA memory + * @size: size of the virtual mapping in bytes + * + * Maps virtually contiguous area in the range suitable for EXECMEM_MODULE_DATA. + * + * Return: the area descriptor on success or %NULL on failure. + */ +struct vm_struct *execmem_vmap(size_t size); +#endif + /** * execmem_update_copy - copy an update to executable memory * @dst: destination address to update diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 27408f21e501..31e9ffd936e3 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -202,6 +202,9 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); +int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, + struct page **pages, unsigned int page_shift); + /* * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7312ae7c3cc5..6798bbbcbd32 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -993,6 +993,7 @@ config CODE_TAGGING config MEM_ALLOC_PROFILING bool "Enable memory allocation profiling" default n + depends on MMU depends on PROC_FS depends on !DEBUG_FORCE_WEAK_PER_CPU select CODE_TAGGING diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 5f9cd1642d58..4a7fc081b789 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -8,14 +8,15 @@ #include #include #include +#include #define ALLOCINFO_FILE_NAME "allocinfo" #define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag)) #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT -static bool mem_profiling_support __meminitdata = true; +static bool mem_profiling_support = true; #else -static bool mem_profiling_support __meminitdata; +static bool mem_profiling_support; #endif static struct codetag_type *alloc_tag_cttype; @@ -154,7 +155,7 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl return nr; } -static void __init shutdown_mem_profiling(void) +static void shutdown_mem_profiling(void) { if (mem_alloc_profiling_enabled()) static_branch_disable(&mem_alloc_profiling_key); @@ -179,6 +180,7 @@ static void __init procfs_init(void) #ifdef CONFIG_MODULES static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE); +static struct vm_struct *vm_module_tags; /* A dummy object used to indicate an unloaded module */ static struct module unloaded_mod; /* A dummy object used to indicate a module prepended area */ @@ -252,6 +254,33 @@ repeat: return false; } +static int vm_module_tags_populate(void) +{ + unsigned long phys_size = vm_module_tags->nr_pages << PAGE_SHIFT; + + if (phys_size < module_tags.size) { + struct page **next_page = vm_module_tags->pages + vm_module_tags->nr_pages; + unsigned long addr = module_tags.start_addr + phys_size; + unsigned long more_pages; + unsigned long nr; + + more_pages = ALIGN(module_tags.size - phys_size, PAGE_SIZE) >> PAGE_SHIFT; + nr = alloc_pages_bulk_array_node(GFP_KERNEL | __GFP_NOWARN, + NUMA_NO_NODE, more_pages, next_page); + if (nr < more_pages || + vmap_pages_range(addr, addr + (nr << PAGE_SHIFT), PAGE_KERNEL, + next_page, PAGE_SHIFT) < 0) { + /* Clean up and error out */ + for (int i = 0; i < nr; i++) + __free_page(next_page[i]); + return -ENOMEM; + } + vm_module_tags->nr_pages += nr; + } + + return 0; +} + static void *reserve_module_tags(struct module *mod, unsigned long size, unsigned int prepend, unsigned long align) { @@ -310,8 +339,18 @@ unlock: if (IS_ERR(ret)) return ret; - if (module_tags.size < offset + size) + if (module_tags.size < offset + size) { + int grow_res; + module_tags.size = offset + size; + grow_res = vm_module_tags_populate(); + if (grow_res) { + shutdown_mem_profiling(); + pr_err("Failed to allocate memory for allocation tags in the module %s. Memory allocation profiling is disabled!\n", + mod->name); + return ERR_PTR(grow_res); + } + } return (struct alloc_tag *)(module_tags.start_addr + offset); } @@ -372,12 +411,23 @@ static void replace_module(struct module *mod, struct module *new_mod) static int __init alloc_mod_tags_mem(void) { - /* Allocate space to copy allocation tags */ - module_tags.start_addr = (unsigned long)execmem_alloc(EXECMEM_MODULE_DATA, - MODULE_ALLOC_TAG_VMAP_SIZE); - if (!module_tags.start_addr) + /* Map space to copy allocation tags */ + vm_module_tags = execmem_vmap(MODULE_ALLOC_TAG_VMAP_SIZE); + if (!vm_module_tags) { + pr_err("Failed to map %lu bytes for module allocation tags\n", + MODULE_ALLOC_TAG_VMAP_SIZE); + module_tags.start_addr = 0; return -ENOMEM; + } + vm_module_tags->pages = kmalloc_array(get_vm_area_size(vm_module_tags) >> PAGE_SHIFT, + sizeof(struct page *), GFP_KERNEL | __GFP_ZERO); + if (!vm_module_tags->pages) { + free_vm_area(vm_module_tags); + return -ENOMEM; + } + + module_tags.start_addr = (unsigned long)vm_module_tags->addr; module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE; return 0; @@ -385,8 +435,13 @@ static int __init alloc_mod_tags_mem(void) static void __init free_mod_tags_mem(void) { - execmem_free((void *)module_tags.start_addr); + int i; + module_tags.start_addr = 0; + for (i = 0; i < vm_module_tags->nr_pages; i++) + __free_page(vm_module_tags->pages[i]); + kfree(vm_module_tags->pages); + free_vm_area(vm_module_tags); } #else /* CONFIG_MODULES */ diff --git a/mm/execmem.c b/mm/execmem.c index 576a57e2161f..317b6a8d35be 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -64,6 +64,22 @@ static void *execmem_vmalloc(struct execmem_range *range, size_t size, return p; } + +struct vm_struct *execmem_vmap(size_t size) +{ + struct execmem_range *range = &execmem_info->ranges[EXECMEM_MODULE_DATA]; + struct vm_struct *area; + + area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC, + range->start, range->end, NUMA_NO_NODE, + GFP_KERNEL, __builtin_return_address(0)); + if (!area && range->fallback_start) + area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC, + range->fallback_start, range->fallback_end, + NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); + + return area; +} #else static void *execmem_vmalloc(struct execmem_range *range, size_t size, pgprot_t pgprot, unsigned long vm_flags) diff --git a/mm/internal.h b/mm/internal.h index 3dc745ba76dd..cd96848be245 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1263,6 +1263,12 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_folio(struct folio *folio); +struct vm_struct *__get_vm_area_node(unsigned long size, + unsigned long align, unsigned long shift, + unsigned long flags, unsigned long start, + unsigned long end, int node, gfp_t gfp_mask, + const void *caller); + /* * mm/gup.c */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 74c0a5eae210..7ed39d104201 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -653,7 +653,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, * RETURNS: * 0 on success, -errno on failure. */ -static int vmap_pages_range(unsigned long addr, unsigned long end, +int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { int err; @@ -3106,7 +3106,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm) vm->flags &= ~VM_UNINITIALIZED; } -static struct vm_struct *__get_vm_area_node(unsigned long size, +struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller) -- cgit v1.2.3 From 4835f747d3ed181bf2c67930fe06b2c01a5d2323 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:59 -0700 Subject: alloc_tag: support for page allocation tag compression Implement support for storing page allocation tag references directly in the page flags instead of page extensions. sysctl.vm.mem_profiling boot parameter it extended to provide a way for a user to request this mode. Enabling compression eliminates memory overhead caused by page_ext and results in better performance for page allocations. However this mode will not work if the number of available page flag bits is insufficient to address all kernel allocations. Such condition can happen during boot or when loading a module. If this condition is detected, memory allocation profiling gets disabled with an appropriate warning. By default compression mode is disabled. Link: https://lkml.kernel.org/r/20241023170759.999909-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/mm/allocation-profiling.rst | 7 +- include/linux/alloc_tag.h | 10 ++- include/linux/codetag.h | 3 + include/linux/page-flags-layout.h | 7 ++ include/linux/pgalloc_tag.h | 145 ++++++++++++++++++++++++++---- lib/alloc_tag.c | 142 +++++++++++++++++++++++++++-- lib/codetag.c | 4 +- mm/mm_init.c | 5 +- 8 files changed, 290 insertions(+), 33 deletions(-) (limited to 'lib') diff --git a/Documentation/mm/allocation-profiling.rst b/Documentation/mm/allocation-profiling.rst index ffd6655b7be2..316311240e6a 100644 --- a/Documentation/mm/allocation-profiling.rst +++ b/Documentation/mm/allocation-profiling.rst @@ -18,12 +18,17 @@ kconfig options: missing annotation Boot parameter: - sysctl.vm.mem_profiling=0|1|never + sysctl.vm.mem_profiling={0|1|never}[,compressed] When set to "never", memory allocation profiling overhead is minimized and it cannot be enabled at runtime (sysctl becomes read-only). When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y, default value is "1". When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n, default value is "never". + "compressed" optional parameter will try to store page tag references in a + compact format, avoiding page extensions. This results in improved performance + and memory consumption, however it might fail depending on system configuration. + If compression fails, a warning is issued and memory allocation profiling gets + disabled. sysctl: /proc/sys/vm/mem_profiling diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 55d30543c4c7..7c0786bdf9af 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -30,8 +30,16 @@ struct alloc_tag { struct alloc_tag_counters __percpu *counters; } __aligned(8); +struct alloc_tag_kernel_section { + struct alloc_tag *first_tag; + unsigned long count; +}; + struct alloc_tag_module_section { - unsigned long start_addr; + union { + unsigned long start_addr; + struct alloc_tag *first_tag; + }; unsigned long end_addr; /* used size */ unsigned long size; diff --git a/include/linux/codetag.h b/include/linux/codetag.h index d10bd9810d32..d14dbd26b370 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -13,6 +13,9 @@ struct codetag_module; struct seq_buf; struct module; +#define CODETAG_SECTION_START_PREFIX "__start_" +#define CODETAG_SECTION_STOP_PREFIX "__stop_" + /* * An instance of this structure is created in a special ELF section at every * code location being tagged. At runtime, the special section is treated as diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 7d79818dc065..4f5c9e979bb9 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -111,5 +111,12 @@ ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \ NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH) +#define NR_NON_PAGEFLAG_BITS (SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH + \ + LAST_CPUPID_SHIFT + KASAN_TAG_WIDTH + \ + LRU_GEN_WIDTH + LRU_REFS_WIDTH) + +#define NR_UNUSED_PAGEFLAG_BITS (BITS_PER_LONG - \ + (NR_NON_PAGEFLAG_BITS + NR_PAGEFLAGS)) + #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index b13cd3313a88..1fe63b52e5e5 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -11,29 +11,118 @@ #include +extern struct page_ext_operations page_alloc_tagging_ops; +extern unsigned long alloc_tag_ref_mask; +extern int alloc_tag_ref_offs; +extern struct alloc_tag_kernel_section kernel_tags; + +DECLARE_STATIC_KEY_FALSE(mem_profiling_compressed); + +typedef u16 pgalloc_tag_idx; + union pgtag_ref_handle { union codetag_ref *ref; /* reference in page extension */ + struct page *page; /* reference in page flags */ }; -extern struct page_ext_operations page_alloc_tagging_ops; +/* Reserved indexes */ +#define CODETAG_ID_NULL 0 +#define CODETAG_ID_EMPTY 1 +#define CODETAG_ID_FIRST 2 + +#ifdef CONFIG_MODULES + +extern struct alloc_tag_module_section module_tags; + +static inline struct alloc_tag *module_idx_to_tag(pgalloc_tag_idx idx) +{ + return &module_tags.first_tag[idx - kernel_tags.count]; +} + +static inline pgalloc_tag_idx module_tag_to_idx(struct alloc_tag *tag) +{ + return CODETAG_ID_FIRST + kernel_tags.count + (tag - module_tags.first_tag); +} + +#else /* CONFIG_MODULES */ + +static inline struct alloc_tag *module_idx_to_tag(pgalloc_tag_idx idx) +{ + pr_warn("invalid page tag reference %lu\n", (unsigned long)idx); + return NULL; +} + +static inline pgalloc_tag_idx module_tag_to_idx(struct alloc_tag *tag) +{ + pr_warn("invalid page tag 0x%lx\n", (unsigned long)tag); + return CODETAG_ID_NULL; +} + +#endif /* CONFIG_MODULES */ + +static inline void idx_to_ref(pgalloc_tag_idx idx, union codetag_ref *ref) +{ + switch (idx) { + case (CODETAG_ID_NULL): + ref->ct = NULL; + break; + case (CODETAG_ID_EMPTY): + set_codetag_empty(ref); + break; + default: + idx -= CODETAG_ID_FIRST; + ref->ct = idx < kernel_tags.count ? + &kernel_tags.first_tag[idx].ct : + &module_idx_to_tag(idx)->ct; + break; + } +} + +static inline pgalloc_tag_idx ref_to_idx(union codetag_ref *ref) +{ + struct alloc_tag *tag; + + if (!ref->ct) + return CODETAG_ID_NULL; + + if (is_codetag_empty(ref)) + return CODETAG_ID_EMPTY; + + tag = ct_to_alloc_tag(ref->ct); + if (tag >= kernel_tags.first_tag && tag < kernel_tags.first_tag + kernel_tags.count) + return CODETAG_ID_FIRST + (tag - kernel_tags.first_tag); + + return module_tag_to_idx(tag); +} + + /* Should be called only if mem_alloc_profiling_enabled() */ static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref, union pgtag_ref_handle *handle) { - struct page_ext *page_ext; - union codetag_ref *tmp; - if (!page) return false; - page_ext = page_ext_get(page); - if (!page_ext) - return false; + if (static_key_enabled(&mem_profiling_compressed)) { + pgalloc_tag_idx idx; + + idx = (page->flags >> alloc_tag_ref_offs) & alloc_tag_ref_mask; + idx_to_ref(idx, ref); + handle->page = page; + } else { + struct page_ext *page_ext; + union codetag_ref *tmp; + + page_ext = page_ext_get(page); + if (!page_ext) + return false; + + tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); + ref->ct = tmp->ct; + handle->ref = tmp; + } - tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); - ref->ct = tmp->ct; - handle->ref = tmp; return true; } @@ -42,16 +131,35 @@ static inline void put_page_tag_ref(union pgtag_ref_handle handle) if (WARN_ON(!handle.ref)) return; - page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset); + if (!static_key_enabled(&mem_profiling_compressed)) + page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset); } -static inline void update_page_tag_ref(union pgtag_ref_handle handle, - union codetag_ref *ref) +static inline void update_page_tag_ref(union pgtag_ref_handle handle, union codetag_ref *ref) { - if (WARN_ON(!handle.ref || !ref)) - return; - - handle.ref->ct = ref->ct; + if (static_key_enabled(&mem_profiling_compressed)) { + struct page *page = handle.page; + unsigned long old_flags; + unsigned long flags; + unsigned long idx; + + if (WARN_ON(!page || !ref)) + return; + + idx = (unsigned long)ref_to_idx(ref); + idx = (idx & alloc_tag_ref_mask) << alloc_tag_ref_offs; + do { + old_flags = READ_ONCE(page->flags); + flags = old_flags; + flags &= ~(alloc_tag_ref_mask << alloc_tag_ref_offs); + flags |= idx; + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } else { + if (WARN_ON(!handle.ref || !ref)) + return; + + handle.ref->ct = ref->ct; + } } static inline void clear_page_tag_ref(struct page *page) @@ -122,6 +230,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } +void __init alloc_tag_sec_init(void); + #else /* CONFIG_MEM_ALLOC_PROFILING */ static inline void clear_page_tag_ref(struct page *page) {} @@ -130,6 +240,7 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} +static inline void alloc_tag_sec_init(void) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 4a7fc081b789..d38a4b2a551d 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -12,6 +13,8 @@ #define ALLOCINFO_FILE_NAME "allocinfo" #define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag)) +#define SECTION_START(NAME) (CODETAG_SECTION_START_PREFIX NAME) +#define SECTION_STOP(NAME) (CODETAG_SECTION_STOP_PREFIX NAME) #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT static bool mem_profiling_support = true; @@ -26,6 +29,11 @@ EXPORT_SYMBOL(_shared_alloc_tag); DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, mem_alloc_profiling_key); +DEFINE_STATIC_KEY_FALSE(mem_profiling_compressed); + +struct alloc_tag_kernel_section kernel_tags = { NULL, 0 }; +unsigned long alloc_tag_ref_mask; +int alloc_tag_ref_offs; struct allocinfo_private { struct codetag_iterator iter; @@ -155,7 +163,7 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl return nr; } -static void shutdown_mem_profiling(void) +static void shutdown_mem_profiling(bool remove_file) { if (mem_alloc_profiling_enabled()) static_branch_disable(&mem_alloc_profiling_key); @@ -163,6 +171,8 @@ static void shutdown_mem_profiling(void) if (!mem_profiling_support) return; + if (remove_file) + remove_proc_entry(ALLOCINFO_FILE_NAME, NULL); mem_profiling_support = false; } @@ -173,10 +183,40 @@ static void __init procfs_init(void) if (!proc_create_seq(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op)) { pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME); - shutdown_mem_profiling(); + shutdown_mem_profiling(false); } } +void __init alloc_tag_sec_init(void) +{ + struct alloc_tag *last_codetag; + + if (!mem_profiling_support) + return; + + if (!static_key_enabled(&mem_profiling_compressed)) + return; + + kernel_tags.first_tag = (struct alloc_tag *)kallsyms_lookup_name( + SECTION_START(ALLOC_TAG_SECTION_NAME)); + last_codetag = (struct alloc_tag *)kallsyms_lookup_name( + SECTION_STOP(ALLOC_TAG_SECTION_NAME)); + kernel_tags.count = last_codetag - kernel_tags.first_tag; + + /* Check if kernel tags fit into page flags */ + if (kernel_tags.count > (1UL << NR_UNUSED_PAGEFLAG_BITS)) { + shutdown_mem_profiling(false); /* allocinfo file does not exist yet */ + pr_err("%lu allocation tags cannot be references using %d available page flag bits. Memory allocation profiling is disabled!\n", + kernel_tags.count, NR_UNUSED_PAGEFLAG_BITS); + return; + } + + alloc_tag_ref_offs = (LRU_REFS_PGOFF - NR_UNUSED_PAGEFLAG_BITS); + alloc_tag_ref_mask = ((1UL << NR_UNUSED_PAGEFLAG_BITS) - 1); + pr_debug("Memory allocation profiling compression is using %d page flag bits!\n", + NR_UNUSED_PAGEFLAG_BITS); +} + #ifdef CONFIG_MODULES static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE); @@ -186,10 +226,59 @@ static struct module unloaded_mod; /* A dummy object used to indicate a module prepended area */ static struct module prepend_mod; -static struct alloc_tag_module_section module_tags; +struct alloc_tag_module_section module_tags; + +static inline unsigned long alloc_tag_align(unsigned long val) +{ + if (!static_key_enabled(&mem_profiling_compressed)) { + /* No alignment requirements when we are not indexing the tags */ + return val; + } + + if (val % sizeof(struct alloc_tag) == 0) + return val; + return ((val / sizeof(struct alloc_tag)) + 1) * sizeof(struct alloc_tag); +} + +static bool ensure_alignment(unsigned long align, unsigned int *prepend) +{ + if (!static_key_enabled(&mem_profiling_compressed)) { + /* No alignment requirements when we are not indexing the tags */ + return true; + } + + /* + * If alloc_tag size is not a multiple of required alignment, tag + * indexing does not work. + */ + if (!IS_ALIGNED(sizeof(struct alloc_tag), align)) + return false; + + /* Ensure prepend consumes multiple of alloc_tag-sized blocks */ + if (*prepend) + *prepend = alloc_tag_align(*prepend); + + return true; +} + +static inline bool tags_addressable(void) +{ + unsigned long tag_idx_count; + + if (!static_key_enabled(&mem_profiling_compressed)) + return true; /* with page_ext tags are always addressable */ + + tag_idx_count = CODETAG_ID_FIRST + kernel_tags.count + + module_tags.size / sizeof(struct alloc_tag); + + return tag_idx_count < (1UL << NR_UNUSED_PAGEFLAG_BITS); +} static bool needs_section_mem(struct module *mod, unsigned long size) { + if (!mem_profiling_support) + return false; + return size >= sizeof(struct alloc_tag); } @@ -300,6 +389,13 @@ static void *reserve_module_tags(struct module *mod, unsigned long size, if (!align) align = 1; + if (!ensure_alignment(align, &prepend)) { + shutdown_mem_profiling(true); + pr_err("%s: alignment %lu is incompatible with allocation tag indexing. Memory allocation profiling is disabled!\n", + mod->name, align); + return ERR_PTR(-EINVAL); + } + mas_lock(&mas); if (!find_aligned_area(&mas, section_size, size, prepend, align)) { ret = ERR_PTR(-ENOMEM); @@ -343,9 +439,15 @@ unlock: int grow_res; module_tags.size = offset + size; + if (mem_alloc_profiling_enabled() && !tags_addressable()) { + shutdown_mem_profiling(true); + pr_warn("With module %s there are too many tags to fit in %d page flag bits. Memory allocation profiling is disabled!\n", + mod->name, NR_UNUSED_PAGEFLAG_BITS); + } + grow_res = vm_module_tags_populate(); if (grow_res) { - shutdown_mem_profiling(); + shutdown_mem_profiling(true); pr_err("Failed to allocate memory for allocation tags in the module %s. Memory allocation profiling is disabled!\n", mod->name); return ERR_PTR(grow_res); @@ -429,6 +531,8 @@ static int __init alloc_mod_tags_mem(void) module_tags.start_addr = (unsigned long)vm_module_tags->addr; module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE; + /* Ensure the base is alloc_tag aligned when required for indexing */ + module_tags.start_addr = alloc_tag_align(module_tags.start_addr); return 0; } @@ -451,8 +555,10 @@ static inline void free_mod_tags_mem(void) {} #endif /* CONFIG_MODULES */ +/* See: Documentation/mm/allocation-profiling.rst */ static int __init setup_early_mem_profiling(char *str) { + bool compressed = false; bool enable; if (!str || !str[0]) @@ -461,22 +567,37 @@ static int __init setup_early_mem_profiling(char *str) if (!strncmp(str, "never", 5)) { enable = false; mem_profiling_support = false; + pr_info("Memory allocation profiling is disabled!\n"); } else { - int res; + char *token = strsep(&str, ","); + + if (kstrtobool(token, &enable)) + return -EINVAL; - res = kstrtobool(str, &enable); - if (res) - return res; + if (str) { + if (strcmp(str, "compressed")) + return -EINVAL; + + compressed = true; + } mem_profiling_support = true; + pr_info("Memory allocation profiling is enabled %s compression and is turned %s!\n", + compressed ? "with" : "without", enable ? "on" : "off"); } - if (enable != static_key_enabled(&mem_alloc_profiling_key)) { + if (enable != mem_alloc_profiling_enabled()) { if (enable) static_branch_enable(&mem_alloc_profiling_key); else static_branch_disable(&mem_alloc_profiling_key); } + if (compressed != static_key_enabled(&mem_profiling_compressed)) { + if (compressed) + static_branch_enable(&mem_profiling_compressed); + else + static_branch_disable(&mem_profiling_compressed); + } return 0; } @@ -484,6 +605,9 @@ early_param("sysctl.vm.mem_profiling", setup_early_mem_profiling); static __init bool need_page_alloc_tagging(void) { + if (static_key_enabled(&mem_profiling_compressed)) + return false; + return mem_profiling_support; } diff --git a/lib/codetag.c b/lib/codetag.c index 7455b966cae4..42aadd6c1454 100644 --- a/lib/codetag.c +++ b/lib/codetag.c @@ -149,8 +149,8 @@ static struct codetag_range get_section_range(struct module *mod, const char *section) { return (struct codetag_range) { - get_symbol(mod, "__start_", section), - get_symbol(mod, "__stop_", section), + get_symbol(mod, CODETAG_SECTION_START_PREFIX, section), + get_symbol(mod, CODETAG_SECTION_STOP_PREFIX, section), }; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 4ba5607aaf19..1c205b0a86ed 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -83,8 +83,7 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = BITS_PER_LONG; - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; + width = shift - NR_NON_PAGEFLAG_BITS; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", SECTIONS_WIDTH, @@ -2639,7 +2638,7 @@ void __init mm_core_init(void) BUILD_BUG_ON(MAX_ZONELISTS > 2); build_all_zonelists(NULL); page_alloc_init_cpuhp(); - + alloc_tag_sec_init(); /* * page_ext requires contiguous pages, * bigger than MAX_PAGE_ORDER unless SPARSEMEM. -- cgit v1.2.3 From b7fc16a16b0850e41b88eae0edfa4c085c012347 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 24 Oct 2024 09:23:18 -0700 Subject: mm/codetag: uninline and move pgalloc_tag_copy and pgalloc_tag_split pgalloc_tag_copy() and pgalloc_tag_split() are sizable and outside of any performance-critical paths, so it should be fine to uninline them. Also move their declarations into pgalloc_tag.h which seems like a more appropriate place for them. No functional changes other than uninlining. Link: https://lkml.kernel.org/r/20241024162318.1640781-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Andrew Morton Acked-by: Yu Zhao Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Signed-off-by: Andrew Morton --- include/linux/mm.h | 58 --------------------------------------------- include/linux/pgalloc_tag.h | 5 ++++ lib/alloc_tag.c | 48 +++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 58 deletions(-) (limited to 'lib') diff --git a/include/linux/mm.h b/include/linux/mm.h index f9120ac6d901..08e487e5850a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4166,62 +4166,4 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla } #endif -#ifdef CONFIG_MEM_ALLOC_PROFILING -static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) -{ - int i; - struct alloc_tag *tag; - unsigned int nr_pages = 1 << new_order; - - if (!mem_alloc_profiling_enabled()) - return; - - tag = pgalloc_tag_get(&folio->page); - if (!tag) - return; - - for (i = nr_pages; i < (1 << old_order); i += nr_pages) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) { - /* Set new reference to point to the original tag */ - alloc_tag_ref_set(&ref, tag); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} - -static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) -{ - union pgtag_ref_handle handle; - union codetag_ref ref; - struct alloc_tag *tag; - - tag = pgalloc_tag_get(&old->page); - if (!tag) - return; - - if (!get_page_tag_ref(&new->page, &ref, &handle)) - return; - - /* Clear the old ref to the original allocation tag. */ - clear_page_tag_ref(&old->page); - /* Decrement the counters of the tag on get_new_folio. */ - alloc_tag_sub(&ref, folio_size(new)); - __alloc_tag_ref_set(&ref, tag); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); -} -#else /* !CONFIG_MEM_ALLOC_PROFILING */ -static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) -{ -} - -static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) -{ -} -#endif /* CONFIG_MEM_ALLOC_PROFILING */ - #endif /* _LINUX_MM_H */ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 1fe63b52e5e5..0e43ab653ab6 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -230,6 +230,9 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } +void pgalloc_tag_split(struct folio *folio, int old_order, int new_order); +void pgalloc_tag_copy(struct folio *new, struct folio *old); + void __init alloc_tag_sec_init(void); #else /* CONFIG_MEM_ALLOC_PROFILING */ @@ -241,6 +244,8 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} +static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} +static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index d38a4b2a551d..2414a7ee7ec7 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -163,6 +163,54 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl return nr; } +void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) +{ + int i; + struct alloc_tag *tag; + unsigned int nr_pages = 1 << new_order; + + if (!mem_alloc_profiling_enabled()) + return; + + tag = pgalloc_tag_get(&folio->page); + if (!tag) + return; + + for (i = nr_pages; i < (1 << old_order); i += nr_pages) { + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) { + /* Set new reference to point to the original tag */ + alloc_tag_ref_set(&ref, tag); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); + } + } +} + +void pgalloc_tag_copy(struct folio *new, struct folio *old) +{ + union pgtag_ref_handle handle; + union codetag_ref ref; + struct alloc_tag *tag; + + tag = pgalloc_tag_get(&old->page); + if (!tag) + return; + + if (!get_page_tag_ref(&new->page, &ref, &handle)) + return; + + /* Clear the old ref to the original allocation tag. */ + clear_page_tag_ref(&old->page); + /* Decrement the counters of the tag on get_new_folio. */ + alloc_tag_sub(&ref, folio_size(new)); + __alloc_tag_ref_set(&ref, tag); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); +} + static void shutdown_mem_profiling(bool remove_file) { if (mem_alloc_profiling_enabled()) -- cgit v1.2.3 From 2466b31201424ccb7eda00277222302a4d6576cb Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 6 Nov 2024 00:17:21 +0000 Subject: tests/module/gen_test_kallsyms.sh: use 0 value for variables Use 0 for the values as we use them for the return value on init to keep the test modules simple. This fixes a splat reported do_init_module: 'test_kallsyms_b'->init suspiciously returned 255, it should follow 0/-E convention do_init_module: loading module anyway... CPU: 5 UID: 0 PID: 1873 Comm: modprobe Not tainted 6.12.0-rc3+ #4 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 2024.08-1 09/18/2024 Call Trace: dump_stack_lvl+0x56/0x80 do_init_module.cold+0x21/0x26 init_module_from_file+0x88/0xf0 idempotent_init_module+0x108/0x300 __x64_sys_finit_module+0x5a/0xb0 do_syscall_64+0x4b/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f4f3a718839 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff> RSP: 002b:00007fff97d1a9e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 000055b94001ab90 RCX: 00007f4f3a718839 RDX: 0000000000000000 RSI: 000055b910e68a10 RDI: 0000000000000004 RBP: 0000000000000000 R08: 00007f4f3a7f1b20 R09: 000055b94001c5b0 R10: 0000000000000040 R11: 0000000000000246 R12: 000055b910e68a10 R13: 0000000000040000 R14: 000055b94001ad60 R15: 0000000000000000 do_init_module: 'test_kallsyms_b'->init suspiciously returned 255, it should follow 0/-E convention do_init_module: loading module anyway... CPU: 1 UID: 0 PID: 1884 Comm: modprobe Not tainted 6.12.0-rc3+ #4 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 2024.08-1 09/18/2024 Call Trace: dump_stack_lvl+0x56/0x80 do_init_module.cold+0x21/0x26 init_module_from_file+0x88/0xf0 idempotent_init_module+0x108/0x300 __x64_sys_finit_module+0x5a/0xb0 do_syscall_64+0x4b/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7ffaa5d18839 Reported-by: Sami Tolvanen Reviewed-by: Sami Tolvanen Signed-off-by: Luis Chamberlain --- lib/tests/module/gen_test_kallsyms.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/tests/module/gen_test_kallsyms.sh b/lib/tests/module/gen_test_kallsyms.sh index ae5966f1f904..3f2c626350ad 100755 --- a/lib/tests/module/gen_test_kallsyms.sh +++ b/lib/tests/module/gen_test_kallsyms.sh @@ -32,7 +32,7 @@ gen_num_syms() PREFIX=$1 NUM=$2 for i in $(seq 1 $NUM); do - printf "int auto_test_%s_%010d = 0xff;\n" $PREFIX $i + printf "int auto_test_%s_%010d = 0;\n" $PREFIX $i printf "EXPORT_SYMBOL_GPL(auto_test_%s_%010d);\n" $PREFIX $i done echo -- cgit v1.2.3 From ae193dd79398970ee760e0c8129ac42ef8f5c6ff Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Wed, 16 Oct 2024 18:18:00 +0500 Subject: kasan: move checks to do_strncpy_from_user Patch series "kasan: migrate the last module test to kunit", v4. copy_user_test() is the last KUnit-incompatible test with CONFIG_KASAN_MODULE_TEST requirement, which we are going to migrate to KUnit framework and delete the former test and Kconfig as well. In this patch series: - [1/3] move kasan_check_write() and check_object_size() to do_strncpy_from_user() to cover with KASAN checks with multiple conditions in strncpy_from_user(). - [2/3] migrated copy_user_test() to KUnit, where we can also test strncpy_from_user() due to [1/4]. KUnits have been tested on: - x86_64 with CONFIG_KASAN_GENERIC. Passed - arm64 with CONFIG_KASAN_SW_TAGS. 1 fail. See [1] - arm64 with CONFIG_KASAN_HW_TAGS. 1 fail. See [1] [1] https://lore.kernel.org/linux-mm/CACzwLxj21h7nCcS2-KA_q7ybe+5pxH0uCDwu64q_9pPsydneWQ@mail.gmail.com/ - [3/3] delete CONFIG_KASAN_MODULE_TEST and documentation occurrences. This patch (of 3): Since in the commit 2865baf54077("x86: support user address masking instead of non-speculative conditional") do_strncpy_from_user() is called from multiple places, we should sanitize the kernel *dst memory and size which were done in strncpy_from_user() previously. Link: https://lkml.kernel.org/r/20241016131802.3115788-1-snovitoll@gmail.com Link: https://lkml.kernel.org/r/20241016131802.3115788-2-snovitoll@gmail.com Fixes: 2865baf54077 ("x86: support user address masking instead of non-speculative conditional") Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Alex Shi Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Marco Elver Cc: Vincenzo Frascino Cc: Yanteng Si Signed-off-by: Andrew Morton --- lib/strncpy_from_user.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 989a12a67872..6dc234913dd5 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -120,6 +120,9 @@ long strncpy_from_user(char *dst, const char __user *src, long count) if (unlikely(count <= 0)) return 0; + kasan_check_write(dst, count); + check_object_size(dst, count, false); + if (can_do_masked_user_access()) { long retval; @@ -142,8 +145,6 @@ long strncpy_from_user(char *dst, const char __user *src, long count) if (max > count) max = count; - kasan_check_write(dst, count); - check_object_size(dst, count, false); if (user_read_access_begin(src, max)) { retval = do_strncpy_from_user(dst, src, count, max); user_read_access_end(); -- cgit v1.2.3 From 4e4d9c72c946b77f0278988d0bf1207fa1b2cd0f Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Wed, 16 Oct 2024 18:18:02 +0500 Subject: kasan: delete CONFIG_KASAN_MODULE_TEST Since we've migrated all tests to the KUnit framework, we can delete CONFIG_KASAN_MODULE_TEST and mentioning of it in the documentation as well. I've used the online translator to modify the non-English documentation. [snovitoll@gmail.com: fix indentation in translation] Link: https://lkml.kernel.org/r/20241020042813.3223449-1-snovitoll@gmail.com Link: https://lkml.kernel.org/r/20241016131802.3115788-4-snovitoll@gmail.com Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Alex Shi Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Marco Elver Cc: Vincenzo Frascino Cc: Yanteng Si Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 23 +++++++++------------- .../translations/zh_CN/dev-tools/kasan.rst | 20 ++++++++----------- .../translations/zh_TW/dev-tools/kasan.rst | 21 +++++++++----------- lib/Kconfig.kasan | 7 ------- mm/kasan/kasan.h | 2 +- mm/kasan/report.c | 2 +- 6 files changed, 28 insertions(+), 47 deletions(-) (limited to 'lib') diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index d7de44f5339d..0a1418ab72fd 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -511,19 +511,14 @@ Tests ~~~~~ There are KASAN tests that allow verifying that KASAN works and can detect -certain types of memory corruptions. The tests consist of two parts: +certain types of memory corruptions. -1. Tests that are integrated with the KUnit Test Framework. Enabled with -``CONFIG_KASAN_KUNIT_TEST``. These tests can be run and partially verified +All KASAN tests are integrated with the KUnit Test Framework and can be enabled +via ``CONFIG_KASAN_KUNIT_TEST``. The tests can be run and partially verified automatically in a few different ways; see the instructions below. -2. Tests that are currently incompatible with KUnit. Enabled with -``CONFIG_KASAN_MODULE_TEST`` and can only be run as a module. These tests can -only be verified manually by loading the kernel module and inspecting the -kernel log for KASAN reports. - -Each KUnit-compatible KASAN test prints one of multiple KASAN reports if an -error is detected. Then the test prints its number and status. +Each KASAN test prints one of multiple KASAN reports if an error is detected. +Then the test prints its number and status. When a test passes:: @@ -550,16 +545,16 @@ Or, if one of the tests failed:: not ok 1 - kasan -There are a few ways to run KUnit-compatible KASAN tests. +There are a few ways to run the KASAN tests. 1. Loadable module - With ``CONFIG_KUNIT`` enabled, KASAN-KUnit tests can be built as a loadable - module and run by loading ``kasan_test.ko`` with ``insmod`` or ``modprobe``. + With ``CONFIG_KUNIT`` enabled, the tests can be built as a loadable module + and run by loading ``kasan_test.ko`` with ``insmod`` or ``modprobe``. 2. Built-In - With ``CONFIG_KUNIT`` built-in, KASAN-KUnit tests can be built-in as well. + With ``CONFIG_KUNIT`` built-in, the tests can be built-in as well. In this case, the tests will run at boot as a late-init call. 3. Using kunit_tool diff --git a/Documentation/translations/zh_CN/dev-tools/kasan.rst b/Documentation/translations/zh_CN/dev-tools/kasan.rst index 4491ad2830ed..fd2e3afbdfad 100644 --- a/Documentation/translations/zh_CN/dev-tools/kasan.rst +++ b/Documentation/translations/zh_CN/dev-tools/kasan.rst @@ -422,16 +422,12 @@ KASAN连接到vmap基础架构以懒清理未使用的影子内存。 ~~~~ 有一些KASAN测试可以验证KASAN是否正常工作并可以检测某些类型的内存损坏。 -测试由两部分组成: -1. 与KUnit测试框架集成的测试。使用 ``CONFIG_KASAN_KUNIT_TEST`` 启用。 -这些测试可以通过几种不同的方式自动运行和部分验证;请参阅下面的说明。 +所有 KASAN 测试都与 KUnit 测试框架集成,可通过 ``CONFIG_KASAN_KUNIT_TEST`` 启用。 +测试可以通过几种不同的方式自动运行和部分验证;请参阅以下说明。 -2. 与KUnit不兼容的测试。使用 ``CONFIG_KASAN_MODULE_TEST`` 启用并且只能作为模块 -运行。这些测试只能通过加载内核模块并检查内核日志以获取KASAN报告来手动验证。 - -如果检测到错误,每个KUnit兼容的KASAN测试都会打印多个KASAN报告之一,然后测试打印 -其编号和状态。 +如果检测到错误,每个 KASAN 测试都会打印多份 KASAN 报告中的一份。 +然后测试会打印其编号和状态。 当测试通过:: @@ -458,16 +454,16 @@ KASAN连接到vmap基础架构以懒清理未使用的影子内存。 not ok 1 - kasan -有几种方法可以运行与KUnit兼容的KASAN测试。 +有几种方法可以运行 KASAN 测试。 1. 可加载模块 - 启用 ``CONFIG_KUNIT`` 后,KASAN-KUnit测试可以构建为可加载模块,并通过使用 - ``insmod`` 或 ``modprobe`` 加载 ``kasan_test.ko`` 来运行。 + 启用 ``CONFIG_KUNIT`` 后,可以将测试构建为可加载模块 + 并通过使用 ``insmod`` 或 ``modprobe`` 加载 ``kasan_test.ko`` 来运行。 2. 内置 - 通过内置 ``CONFIG_KUNIT`` ,也可以内置KASAN-KUnit测试。在这种情况下, + 通过内置 ``CONFIG_KUNIT``,测试也可以内置。 测试将在启动时作为后期初始化调用运行。 3. 使用kunit_tool diff --git a/Documentation/translations/zh_TW/dev-tools/kasan.rst b/Documentation/translations/zh_TW/dev-tools/kasan.rst index ed342e67d8ed..27fb7645174d 100644 --- a/Documentation/translations/zh_TW/dev-tools/kasan.rst +++ b/Documentation/translations/zh_TW/dev-tools/kasan.rst @@ -404,16 +404,13 @@ KASAN連接到vmap基礎架構以懶清理未使用的影子內存。 ~~~~ 有一些KASAN測試可以驗證KASAN是否正常工作並可以檢測某些類型的內存損壞。 -測試由兩部分組成: -1. 與KUnit測試框架集成的測試。使用 ``CONFIG_KASAN_KUNIT_TEST`` 啓用。 -這些測試可以通過幾種不同的方式自動運行和部分驗證;請參閱下面的說明。 +所有 KASAN 測試均與 KUnit 測試框架集成,並且可以啟用 +透過 ``CONFIG_KASAN_KUNIT_TEST``。可以運行測試並進行部分驗證 +以幾種不同的方式自動進行;請參閱下面的說明。 -2. 與KUnit不兼容的測試。使用 ``CONFIG_KASAN_MODULE_TEST`` 啓用並且只能作爲模塊 -運行。這些測試只能通過加載內核模塊並檢查內核日誌以獲取KASAN報告來手動驗證。 - -如果檢測到錯誤,每個KUnit兼容的KASAN測試都會打印多個KASAN報告之一,然後測試打印 -其編號和狀態。 +如果偵測到錯誤,每個 KASAN 測試都會列印多個 KASAN 報告之一。 +然後測試列印其編號和狀態。 當測試通過:: @@ -440,16 +437,16 @@ KASAN連接到vmap基礎架構以懶清理未使用的影子內存。 not ok 1 - kasan -有幾種方法可以運行與KUnit兼容的KASAN測試。 +有幾種方法可以執行 KASAN 測試。 1. 可加載模塊 - 啓用 ``CONFIG_KUNIT`` 後,KASAN-KUnit測試可以構建爲可加載模塊,並通過使用 - ``insmod`` 或 ``modprobe`` 加載 ``kasan_test.ko`` 來運行。 + 啟用 ``CONFIG_KUNIT`` 後,測試可以建置為可載入模組 + 並且透過使用 ``insmod`` 或 ``modprobe`` 來載入 ``kasan_test.ko`` 來運作。 2. 內置 - 通過內置 ``CONFIG_KUNIT`` ,也可以內置KASAN-KUnit測試。在這種情況下, + 透過內建 ``CONFIG_KUNIT``,測試也可以內建。 測試將在啓動時作爲後期初始化調用運行。 3. 使用kunit_tool diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 98016e137b7f..f82889a830fa 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -195,13 +195,6 @@ config KASAN_KUNIT_TEST For more information on KUnit and unit tests in general, please refer to the KUnit documentation in Documentation/dev-tools/kunit/. -config KASAN_MODULE_TEST - tristate "KUnit-incompatible tests of KASAN bug detection capabilities" - depends on m && KASAN && !KASAN_HW_TAGS - help - A part of the KASAN test suite that is not integrated with KUnit. - Incompatible with Hardware Tag-Based KASAN. - config KASAN_EXTRA_INFO bool "Record and report more information" depends on KASAN diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index f438a6cdc964..b7e4b81421b3 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -568,7 +568,7 @@ static inline void kasan_kunit_test_suite_end(void) { } #endif /* CONFIG_KASAN_KUNIT_TEST */ -#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) bool kasan_save_enable_multi_shot(void); void kasan_restore_multi_shot(bool enabled); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index b48c768acc84..3e48668c3e40 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -132,7 +132,7 @@ static bool report_enabled(void) return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } -#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) bool kasan_save_enable_multi_shot(void) { -- cgit v1.2.3 From 04dafdd2082c601f267d68bd48b15b8189d63c29 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 31 Oct 2024 23:16:23 +0000 Subject: maple_tree: print empty for an empty tree on mt_dump() Patch series "refine storing null", v5. When overwriting the whole range with NULL, current behavior is not correct. An empty tree is represented by having the tree point to NULL directly. An empty tree indicates the entire range (0-ULONG_MAX) is NULL. A store operation into an existing node that causes 0 - ULONG_MAX to be equal to NULL may not be restored to an empty state - a node is used to store the single range instead. This is wasteful and different from the initial setup of the tree. Once the tree is using a single node to store 0 - ULONG_MAX, problems may arise when storing more values into a tree with the unexpected state of 0 - ULONG being a single range in a node. User visible issues may mean a corrupt tree and incorrect storage of information within the tree. This would be limited to users who create and then empty a tree by overwriting all values, then try to store more NULLs into the empty tree. I cannot come up with an example of any user doing this (users usually destroy the tree and generally don't keep trying to store NULLs over NULLs), but patch 4/5 "maple_tree: refine mas_store_root() on storing NULL" should be backported just in case. This patch (of 5): Currently for an empty tree, it would print: maple_tree(0x7ffcd02c6ee0) flags 1, height 0 root (nil) 0: (nil) This is a little misleading. Let's print (empty) for an empty tree. Link: https://lkml.kernel.org/r/20241031231627.14316-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20241031231627.14316-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 38aa8abf8eb8..523355fb2bbe 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7273,10 +7273,12 @@ void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n", mt, mt->ma_flags, mt_height(mt), entry); - if (!xa_is_node(entry)) - mt_dump_entry(entry, 0, 0, 0, format); - else if (entry) + if (xa_is_node(entry)) mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format); + else if (entry) + mt_dump_entry(entry, 0, 0, 0, format); + else + pr_info("(empty)\n"); } EXPORT_SYMBOL_GPL(mt_dump); -- cgit v1.2.3 From cefbcf206f6d92dc0076e3fda06e2b9331b77868 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 31 Oct 2024 23:16:24 +0000 Subject: maple_tree: the return value of mas_root_expand() is not used No user of the return value now, just remove it. Link: https://lkml.kernel.org/r/20241031231627.14316-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 523355fb2bbe..071d3055f1fa 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3408,7 +3408,7 @@ static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas, * @mas: The maple state * @entry: The entry to store into the tree */ -static inline int mas_root_expand(struct ma_state *mas, void *entry) +static inline void mas_root_expand(struct ma_state *mas, void *entry) { void *contents = mas_root_locked(mas); enum maple_type type = maple_leaf_64; @@ -3444,7 +3444,7 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry) ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); - return slot; + return; } static inline void mas_store_root(struct ma_state *mas, void *entry) -- cgit v1.2.3 From 8c836f1712d750163fb00b6cc3a730149c215979 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 31 Oct 2024 23:16:25 +0000 Subject: maple_tree: not necessary to check index/last again Before calling mas_new_root(), the range has been checked. Link: https://lkml.kernel.org/r/20241031231627.14316-4-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 071d3055f1fa..4900f182e99d 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3670,7 +3670,9 @@ static inline void mas_new_root(struct ma_state *mas, void *entry) void __rcu **slots; unsigned long *pivots; - if (!entry && !mas->index && mas->last == ULONG_MAX) { + WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX); + + if (!entry) { mas->depth = 0; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, entry); -- cgit v1.2.3 From 0ea120b278ad7f7cfeeb606e150ad04b192df60b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 31 Oct 2024 23:16:26 +0000 Subject: maple_tree: refine mas_store_root() on storing NULL Currently, when storing NULL on mas_store_root(), the behavior could be improved. Storing NULLs over the entire tree may result in a node being used to store a single range. Further stores of NULL may cause the node and tree to be corrupt and cause incorrect behaviour. Fixing the store to the root null fixes the issue by ensuring that a range of 0 - ULONG_MAX results in an empty tree. Users of the tree may experience incorrect values returned if the tree was expanded to store values, then overwritten by all NULLS, then continued to store NULLs over the empty area. For example possible cases are: * store NULL at any range result a new node * store NULL at range [m, n] where m > 0 to a single entry tree result a new node with range [m, n] set to NULL * store NULL at range [m, n] where m > 0 to an empty tree result consecutive NULL slot * it allows for multiple NULL entries by expanding root to store NULLs to an empty tree This patch tries to improve in: * memory efficient by setting to empty tree instead of using a node * remove the possibility of consecutive NULL slot which will prohibit extended null in later operation Link: https://lkml.kernel.org/r/20241031231627.14316-5-richard.weiyang@gmail.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4900f182e99d..d0ae808f3a14 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3447,9 +3447,20 @@ static inline void mas_root_expand(struct ma_state *mas, void *entry) return; } +/* + * mas_store_root() - Storing value into root. + * @mas: The maple state + * @entry: The entry to store. + * + * There is no root node now and we are storing a value into the root - this + * function either assigns the pointer or expands into a node. + */ static inline void mas_store_root(struct ma_state *mas, void *entry) { - if (likely((mas->last != 0) || (mas->index != 0))) + if (!entry) { + if (!mas->index) + rcu_assign_pointer(mas->tree->ma_root, NULL); + } else if (likely((mas->last != 0) || (mas->index != 0))) mas_root_expand(mas, entry); else if (((unsigned long) (entry) & 3) == 2) mas_root_expand(mas, entry); -- cgit v1.2.3 From 431e10601913f8f2006f3ed607e73eedf264b426 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 31 Oct 2024 23:16:27 +0000 Subject: maple_tree: add a test checking storing null Add a test to assert that, when storing null to am empty tree or a single entry tree it will not result into: * a root node with range [0, ULONG_MAX] set to NULL * a root node with consecutive slot set to NULL [akpm@linux-foundation.org: work around build error (mas_root)] Link: https://lkml.kernel.org/r/20241031231627.14316-6-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) (limited to 'lib') diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 31561e0e1a0d..704cb1093ae8 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1387,6 +1387,92 @@ static noinline void __init check_prev_entry(struct maple_tree *mt) mas_unlock(&mas); } +static noinline void __init check_store_null(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, ULONG_MAX); + + /* + * Store NULL at range [0, ULONG_MAX] to an empty tree should result + * in an empty tree + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, !mtree_empty(mt)); + mas_unlock(&mas); + mtree_destroy(mt); + + /* + * Store NULL at any range to an empty tree should result in an empty + * tree + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + mas_set_range(&mas, 3, 10); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, !mtree_empty(mt)); + mas_unlock(&mas); + mtree_destroy(mt); + + /* + * Store NULL at range [0, ULONG_MAX] to a single entry tree should + * result in an empty tree + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + mas_set(&mas, 0); + mas_store_gfp(&mas, &mas, GFP_KERNEL); + mas_set_range(&mas, 0, ULONG_MAX); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, !mtree_empty(mt)); + mas_unlock(&mas); + mtree_destroy(mt); + + /* + * Store NULL at range [0, n] to a single entry tree should + * result in an empty tree + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + mas_set(&mas, 0); + mas_store_gfp(&mas, &mas, GFP_KERNEL); + mas_set_range(&mas, 0, 5); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, !mtree_empty(mt)); + mas_unlock(&mas); + mtree_destroy(mt); + + /* + * Store NULL at range [m, n] where m > 0 to a single entry tree + * should still be a single entry tree + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + mas_set(&mas, 0); + mas_store_gfp(&mas, &mas, GFP_KERNEL); + mas_set_range(&mas, 2, 5); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, mtree_empty(mt)); +// MT_BUG_ON(mt, xa_is_node(mas_root(&mas))); + mas_unlock(&mas); + mtree_destroy(mt); + + /* + * Store NULL at range [0, ULONG_MAX] to a tree with node should + * result in an empty tree + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + mas_set_range(&mas, 1, 3); + mas_store_gfp(&mas, &mas, GFP_KERNEL); +// MT_BUG_ON(mt, !xa_is_node(mas_root(&mas))); + mas_set_range(&mas, 0, ULONG_MAX); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, !mtree_empty(mt)); + mas_unlock(&mas); + mtree_destroy(mt); +} + static noinline void __init check_root_expand(struct maple_tree *mt) { MA_STATE(mas, mt, 0, 0); @@ -3710,6 +3796,10 @@ static int __init maple_tree_seed(void) goto skip; #endif + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_store_null(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_root_expand(&tree); mtree_destroy(&tree); -- cgit v1.2.3 From 111314157f7891da7a51a8f95df42eeb22f4268a Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Tue, 5 Nov 2024 16:54:06 +0200 Subject: lib: util_macros_kunit: add kunit test for util_macros.h A bug was found in the find_closest() (find_closest_descending() is also affected after some testing), where for certain values with small progressions of 1, 2 & 3, the rounding (done by averaging 2 values) causes an incorrect index to be returned. The bug is described in more detail in the commit which fixes the bug. This commit adds a kunit test to validate that the fix works correctly. This kunit test adds some of the arrays (from the driver-sphere) that seem to produce issues with the 'find_closest()' macro. Specifically the one from ad7606 driver (with which the bug was found) and from the ina2xx drivers, which shows the quirk with 'find_closest()' with elements in a array that have an interval of 3. For the find_closest_descending() tests, the same arrays are used as for the find_closest(), but in reverse; the idea is that 'find_closest_descending()' should return the sames indices as 'find_closest()' but in reverse. For testing both macros, there are 4 special arrays created, one for testing find_closest{_descending}() for arrays of progressions 1, 2, 3 and 4. The idea is to show that (for progressions of 1, 2 & 3) the fix works as expected. When removing the fix, the issues should start to show up. Then an extra array of negative and positive values is added. There are currently no such arrays within drivers, but one could expect that these macros behave correctly even for such arrays. To run this kunit: ./tools/testing/kunit/kunit.py run "*util_macros*" Link: https://lkml.kernel.org/r/20241105145406.554365-2-aardelean@baylibre.com Signed-off-by: Alexandru Ardelean Cc: Bartosz Golaszewski Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 17 ++++ lib/Makefile | 1 + lib/util_macros_kunit.c | 240 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 258 insertions(+) create mode 100644 lib/util_macros_kunit.c (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2549b64b2280..d7dd38f14333 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2630,6 +2630,23 @@ config CHECKSUM_KUNIT If unsure, say N. +config UTIL_MACROS_KUNIT + tristate "KUnit test util_macros.h functions at runtime" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + Enable this option to test the util_macros.h function at boot. + + KUnit tests run during boot and output the results to the debug log + in TAP format (http://testanything.org/). Only useful for kernel devs + running the KUnit test harness, and not intended for inclusion into a + production build. + + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. + config HASH_KUNIT_TEST tristate "KUnit Test for integer hash functions" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile index 1eb89962daef..cc26f81722a5 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -372,6 +372,7 @@ obj-$(CONFIG_PLDMFW) += pldmfw/ CFLAGS_bitfield_kunit.o := $(DISABLE_STRUCTLEAK_PLUGIN) obj-$(CONFIG_BITFIELD_KUNIT) += bitfield_kunit.o obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o +obj-$(CONFIG_UTIL_MACROS_KUNIT) += util_macros_kunit.o obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o obj-$(CONFIG_HASHTABLE_KUNIT_TEST) += hashtable_test.o obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o diff --git a/lib/util_macros_kunit.c b/lib/util_macros_kunit.c new file mode 100644 index 000000000000..94cc9f0de50a --- /dev/null +++ b/lib/util_macros_kunit.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Test cases for bitfield helpers. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +#define FIND_CLOSEST_RANGE_CHECK(from, to, array, exp_idx) \ +{ \ + int i; \ + for (i = from; i <= to; i++) { \ + int found = find_closest(i, array, ARRAY_SIZE(array)); \ + KUNIT_ASSERT_EQ(ctx, exp_idx, found); \ + } \ +} + +static void test_find_closest(struct kunit *ctx) +{ + /* This will test a few arrays that are found in drivers */ + static const int ina226_avg_tab[] = { 1, 4, 16, 64, 128, 256, 512, 1024 }; + static const unsigned int ad7616_oversampling_avail[] = { + 1, 2, 4, 8, 16, 32, 64, 128, + }; + static u32 wd_timeout_table[] = { 2, 4, 6, 8, 16, 32, 48, 64 }; + static int array_prog1a[] = { 1, 2, 3, 4, 5 }; + static u32 array_prog1b[] = { 2, 3, 4, 5, 6 }; + static int array_prog1mix[] = { -2, -1, 0, 1, 2 }; + static int array_prog2a[] = { 1, 3, 5, 7 }; + static u32 array_prog2b[] = { 2, 4, 6, 8 }; + static int array_prog3a[] = { 1, 4, 7, 10 }; + static u32 array_prog3b[] = { 2, 5, 8, 11 }; + static int array_prog4a[] = { 1, 5, 9, 13 }; + static u32 array_prog4b[] = { 2, 6, 10, 14 }; + + FIND_CLOSEST_RANGE_CHECK(-3, 2, ina226_avg_tab, 0); + FIND_CLOSEST_RANGE_CHECK(3, 10, ina226_avg_tab, 1); + FIND_CLOSEST_RANGE_CHECK(11, 40, ina226_avg_tab, 2); + FIND_CLOSEST_RANGE_CHECK(41, 96, ina226_avg_tab, 3); + FIND_CLOSEST_RANGE_CHECK(97, 192, ina226_avg_tab, 4); + FIND_CLOSEST_RANGE_CHECK(193, 384, ina226_avg_tab, 5); + FIND_CLOSEST_RANGE_CHECK(385, 768, ina226_avg_tab, 6); + FIND_CLOSEST_RANGE_CHECK(769, 2048, ina226_avg_tab, 7); + + /* The array that found the bug that caused this kunit to exist */ + FIND_CLOSEST_RANGE_CHECK(-3, 1, ad7616_oversampling_avail, 0); + FIND_CLOSEST_RANGE_CHECK(2, 3, ad7616_oversampling_avail, 1); + FIND_CLOSEST_RANGE_CHECK(4, 6, ad7616_oversampling_avail, 2); + FIND_CLOSEST_RANGE_CHECK(7, 12, ad7616_oversampling_avail, 3); + FIND_CLOSEST_RANGE_CHECK(13, 24, ad7616_oversampling_avail, 4); + FIND_CLOSEST_RANGE_CHECK(25, 48, ad7616_oversampling_avail, 5); + FIND_CLOSEST_RANGE_CHECK(49, 96, ad7616_oversampling_avail, 6); + FIND_CLOSEST_RANGE_CHECK(97, 256, ad7616_oversampling_avail, 7); + + FIND_CLOSEST_RANGE_CHECK(-3, 3, wd_timeout_table, 0); + FIND_CLOSEST_RANGE_CHECK(4, 5, wd_timeout_table, 1); + FIND_CLOSEST_RANGE_CHECK(6, 7, wd_timeout_table, 2); + FIND_CLOSEST_RANGE_CHECK(8, 12, wd_timeout_table, 3); + FIND_CLOSEST_RANGE_CHECK(13, 24, wd_timeout_table, 4); + FIND_CLOSEST_RANGE_CHECK(25, 40, wd_timeout_table, 5); + FIND_CLOSEST_RANGE_CHECK(41, 56, wd_timeout_table, 6); + FIND_CLOSEST_RANGE_CHECK(57, 128, wd_timeout_table, 7); + + /* One could argue that find_closest() should not be used for monotonic + * arrays (like 1,2,3,4,5), but even so, it should work as long as the + * array is sorted ascending. */ + FIND_CLOSEST_RANGE_CHECK(-3, 1, array_prog1a, 0); + FIND_CLOSEST_RANGE_CHECK(2, 2, array_prog1a, 1); + FIND_CLOSEST_RANGE_CHECK(3, 3, array_prog1a, 2); + FIND_CLOSEST_RANGE_CHECK(4, 4, array_prog1a, 3); + FIND_CLOSEST_RANGE_CHECK(5, 8, array_prog1a, 4); + + FIND_CLOSEST_RANGE_CHECK(-3, 2, array_prog1b, 0); + FIND_CLOSEST_RANGE_CHECK(3, 3, array_prog1b, 1); + FIND_CLOSEST_RANGE_CHECK(4, 4, array_prog1b, 2); + FIND_CLOSEST_RANGE_CHECK(5, 5, array_prog1b, 3); + FIND_CLOSEST_RANGE_CHECK(6, 8, array_prog1b, 4); + + FIND_CLOSEST_RANGE_CHECK(-4, -2, array_prog1mix, 0); + FIND_CLOSEST_RANGE_CHECK(-1, -1, array_prog1mix, 1); + FIND_CLOSEST_RANGE_CHECK(0, 0, array_prog1mix, 2); + FIND_CLOSEST_RANGE_CHECK(1, 1, array_prog1mix, 3); + FIND_CLOSEST_RANGE_CHECK(2, 5, array_prog1mix, 4); + + FIND_CLOSEST_RANGE_CHECK(-3, 2, array_prog2a, 0); + FIND_CLOSEST_RANGE_CHECK(3, 4, array_prog2a, 1); + FIND_CLOSEST_RANGE_CHECK(5, 6, array_prog2a, 2); + FIND_CLOSEST_RANGE_CHECK(7, 10, array_prog2a, 3); + + FIND_CLOSEST_RANGE_CHECK(-3, 3, array_prog2b, 0); + FIND_CLOSEST_RANGE_CHECK(4, 5, array_prog2b, 1); + FIND_CLOSEST_RANGE_CHECK(6, 7, array_prog2b, 2); + FIND_CLOSEST_RANGE_CHECK(8, 10, array_prog2b, 3); + + FIND_CLOSEST_RANGE_CHECK(-3, 2, array_prog3a, 0); + FIND_CLOSEST_RANGE_CHECK(3, 5, array_prog3a, 1); + FIND_CLOSEST_RANGE_CHECK(6, 8, array_prog3a, 2); + FIND_CLOSEST_RANGE_CHECK(9, 20, array_prog3a, 3); + + FIND_CLOSEST_RANGE_CHECK(-3, 3, array_prog3b, 0); + FIND_CLOSEST_RANGE_CHECK(4, 6, array_prog3b, 1); + FIND_CLOSEST_RANGE_CHECK(7, 9, array_prog3b, 2); + FIND_CLOSEST_RANGE_CHECK(10, 20, array_prog3b, 3); + + FIND_CLOSEST_RANGE_CHECK(-3, 3, array_prog4a, 0); + FIND_CLOSEST_RANGE_CHECK(4, 7, array_prog4a, 1); + FIND_CLOSEST_RANGE_CHECK(8, 11, array_prog4a, 2); + FIND_CLOSEST_RANGE_CHECK(12, 20, array_prog4a, 3); + + FIND_CLOSEST_RANGE_CHECK(-3, 4, array_prog4b, 0); + FIND_CLOSEST_RANGE_CHECK(5, 8, array_prog4b, 1); + FIND_CLOSEST_RANGE_CHECK(9, 12, array_prog4b, 2); + FIND_CLOSEST_RANGE_CHECK(13, 20, array_prog4b, 3); +} + +#define FIND_CLOSEST_DESC_RANGE_CHECK(from, to, array, exp_idx) \ +{ \ + int i; \ + for (i = from; i <= to; i++) { \ + int found = find_closest_descending(i, array, \ + ARRAY_SIZE(array)); \ + KUNIT_ASSERT_EQ(ctx, exp_idx, found); \ + } \ +} + +static void test_find_closest_descending(struct kunit *ctx) +{ + /* Same arrays as 'test_find_closest' but reversed */ + static const int ina226_avg_tab[] = { 1024, 512, 256, 128, 64, 16, 4, 1 }; + static const unsigned int ad7616_oversampling_avail[] = { + 128, 64, 32, 16, 8, 4, 2, 1 + }; + static u32 wd_timeout_table[] = { 64, 48, 32, 16, 8, 6, 4, 2 }; + static int array_prog1a[] = { 5, 4, 3, 2, 1 }; + static u32 array_prog1b[] = { 6, 5, 4, 3, 2 }; + static int array_prog1mix[] = { 2, 1, 0, -1, -2 }; + static int array_prog2a[] = { 7, 5, 3, 1 }; + static u32 array_prog2b[] = { 8, 6, 4, 2 }; + static int array_prog3a[] = { 10, 7, 4, 1 }; + static u32 array_prog3b[] = { 11, 8, 5, 2 }; + static int array_prog4a[] = { 13, 9, 5, 1 }; + static u32 array_prog4b[] = { 14, 10, 6, 2 }; + + FIND_CLOSEST_DESC_RANGE_CHECK(-3, 2, ina226_avg_tab, 7); + FIND_CLOSEST_DESC_RANGE_CHECK(3, 10, ina226_avg_tab, 6); + FIND_CLOSEST_DESC_RANGE_CHECK(11, 40, ina226_avg_tab, 5); + FIND_CLOSEST_DESC_RANGE_CHECK(41, 96, ina226_avg_tab, 4); + FIND_CLOSEST_DESC_RANGE_CHECK(97, 192, ina226_avg_tab, 3); + FIND_CLOSEST_DESC_RANGE_CHECK(193, 384, ina226_avg_tab, 2); + FIND_CLOSEST_DESC_RANGE_CHECK(385, 768, ina226_avg_tab, 1); + FIND_CLOSEST_DESC_RANGE_CHECK(769, 2048, ina226_avg_tab, 0); + + FIND_CLOSEST_DESC_RANGE_CHECK(-3, 1, ad7616_oversampling_avail, 7); + FIND_CLOSEST_DESC_RANGE_CHECK(2, 3, ad7616_oversampling_avail, 6); + FIND_CLOSEST_DESC_RANGE_CHECK(4, 6, ad7616_oversampling_avail, 5); + FIND_CLOSEST_DESC_RANGE_CHECK(7, 12, ad7616_oversampling_avail, 4); + FIND_CLOSEST_DESC_RANGE_CHECK(13, 24, ad7616_oversampling_avail, 3); + FIND_CLOSEST_DESC_RANGE_CHECK(25, 48, ad7616_oversampling_avail, 2); + FIND_CLOSEST_DESC_RANGE_CHECK(49, 96, ad7616_oversampling_avail, 1); + FIND_CLOSEST_DESC_RANGE_CHECK(97, 256, ad7616_oversampling_avail, 0); + + FIND_CLOSEST_DESC_RANGE_CHECK(-3, 3, wd_timeout_table, 7); + FIND_CLOSEST_DESC_RANGE_CHECK(4, 5, wd_timeout_table, 6); + FIND_CLOSEST_DESC_RANGE_CHECK(6, 7, wd_timeout_table, 5); + FIND_CLOSEST_DESC_RANGE_CHECK(8, 12, wd_timeout_table, 4); + FIND_CLOSEST_DESC_RANGE_CHECK(13, 24, wd_timeout_table, 3); + FIND_CLOSEST_DESC_RANGE_CHECK(25, 40, wd_timeout_table, 2); + FIND_CLOSEST_DESC_RANGE_CHECK(41, 56, wd_timeout_table, 1); + FIND_CLOSEST_DESC_RANGE_CHECK(57, 128, wd_timeout_table, 0); + + /* One could argue that find_closest_descending() should not be used + * for monotonic arrays (like 5,4,3,2,1), but even so, it should still + * it should work as long as the array is sorted descending. */ + FIND_CLOSEST_DESC_RANGE_CHECK(-3, 1, array_prog1a, 4); + FIND_CLOSEST_DESC_RANGE_CHECK(2, 2, array_prog1a, 3); + FIND_CLOSEST_DESC_RANGE_CHECK(3, 3, array_prog1a, 2); + FIND_CLOSEST_DESC_RANGE_CHECK(4, 4, array_prog1a, 1); + FIND_CLOSEST_DESC_RANGE_CHECK(5, 8, array_prog1a, 0); + + FIND_CLOSEST_DESC_RANGE_CHECK(-3, 2, array_prog1b, 4); + FIND_CLOSEST_DESC_RANGE_CHECK(3, 3, array_prog1b, 3); + FIND_CLOSEST_DESC_RANGE_CHECK(4, 4, array_prog1b, 2); + FIND_CLOSEST_DESC_RANGE_CHECK(5, 5, array_prog1b, 1); + FIND_CLOSEST_DESC_RANGE_CHECK(6, 8, array_prog1b, 0); + + FIND_CLOSEST_DESC_RANGE_CHECK(-4, -2, array_prog1mix, 4); + FIND_CLOSEST_DESC_RANGE_CHECK(-1, -1, array_prog1mix, 3); + FIND_CLOSEST_DESC_RANGE_CHECK(0, 0, array_prog1mix, 2); + FIND_CLOSEST_DESC_RANGE_CHECK(1, 1, array_prog1mix, 1); + FIND_CLOSEST_DESC_RANGE_CHECK(2, 5, array_prog1mix, 0); + + FIND_CLOSEST_DESC_RANGE_CHECK(-3, 2, array_prog2a, 3); + FIND_CLOSEST_DESC_RANGE_CHECK(3, 4, array_prog2a, 2); + FIND_CLOSEST_DESC_RANGE_CHECK(5, 6, array_prog2a, 1); + FIND_CLOSEST_DESC_RANGE_CHECK(7, 10, array_prog2a, 0); + + FIND_CLOSEST_DESC_RANGE_CHECK(-3, 3, array_prog2b, 3); + FIND_CLOSEST_DESC_RANGE_CHECK(4, 5, array_prog2b, 2); + FIND_CLOSEST_DESC_RANGE_CHECK(6, 7, array_prog2b, 1); + FIND_CLOSEST_DESC_RANGE_CHECK(8, 10, array_prog2b, 0); + + FIND_CLOSEST_DESC_RANGE_CHECK(-3, 2, array_prog3a, 3); + FIND_CLOSEST_DESC_RANGE_CHECK(3, 5, array_prog3a, 2); + FIND_CLOSEST_DESC_RANGE_CHECK(6, 8, array_prog3a, 1); + FIND_CLOSEST_DESC_RANGE_CHECK(9, 20, array_prog3a, 0); + + FIND_CLOSEST_DESC_RANGE_CHECK(-3, 3, array_prog3b, 3); + FIND_CLOSEST_DESC_RANGE_CHECK(4, 6, array_prog3b, 2); + FIND_CLOSEST_DESC_RANGE_CHECK(7, 9, array_prog3b, 1); + FIND_CLOSEST_DESC_RANGE_CHECK(10, 20, array_prog3b, 0); + + FIND_CLOSEST_DESC_RANGE_CHECK(-3, 3, array_prog4a, 3); + FIND_CLOSEST_DESC_RANGE_CHECK(4, 7, array_prog4a, 2); + FIND_CLOSEST_DESC_RANGE_CHECK(8, 11, array_prog4a, 1); + FIND_CLOSEST_DESC_RANGE_CHECK(12, 20, array_prog4a, 0); + + FIND_CLOSEST_DESC_RANGE_CHECK(-3, 4, array_prog4b, 3); + FIND_CLOSEST_DESC_RANGE_CHECK(5, 8, array_prog4b, 2); + FIND_CLOSEST_DESC_RANGE_CHECK(9, 12, array_prog4b, 1); + FIND_CLOSEST_DESC_RANGE_CHECK(13, 20, array_prog4b, 0); +} + +static struct kunit_case __refdata util_macros_test_cases[] = { + KUNIT_CASE(test_find_closest), + KUNIT_CASE(test_find_closest_descending), + {} +}; + +static struct kunit_suite util_macros_test_suite = { + .name = "util_macros.h", + .test_cases = util_macros_test_cases, +}; + +kunit_test_suites(&util_macros_test_suite); + +MODULE_AUTHOR("Alexandru Ardelean "); +MODULE_DESCRIPTION("Test cases for util_macros.h helpers"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 12079a59ce52e72a342c49cfacf0281213fd6f32 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 7 Nov 2024 08:11:44 -0800 Subject: net: Implement fault injection forcing skb reallocation Introduce a fault injection mechanism to force skb reallocation. The primary goal is to catch bugs related to pointer invalidation after potential skb reallocation. The fault injection mechanism aims to identify scenarios where callers retain pointers to various headers in the skb but fail to reload these pointers after calling a function that may reallocate the data. This type of bug can lead to memory corruption or crashes if the old, now-invalid pointers are used. By forcing reallocation through fault injection, we can stress-test code paths and ensure proper pointer management after potential skb reallocations. Add a hook for fault injection in the following functions: * pskb_trim_rcsum() * pskb_may_pull_reason() * pskb_trim() As the other fault injection mechanism, protect it under a debug Kconfig called CONFIG_FAIL_SKB_REALLOC. This patch was *heavily* inspired by Jakub's proposal from: https://lore.kernel.org/all/20240719174140.47a868e6@kernel.org/ CC: Akinobu Mita Suggested-by: Jakub Kicinski Signed-off-by: Breno Leitao Reviewed-by: Akinobu Mita Acked-by: Paolo Abeni Acked-by: Guillaume Nault Link: https://patch.msgid.link/20241107-fault_v6-v6-1-1b82cb6ecacd@debian.org Signed-off-by: Paolo Abeni --- Documentation/admin-guide/kernel-parameters.txt | 1 + Documentation/fault-injection/fault-injection.rst | 40 ++++++++ include/linux/skbuff.h | 9 ++ lib/Kconfig.debug | 10 ++ net/core/Makefile | 1 + net/core/skb_fault_injection.c | 106 ++++++++++++++++++++++ 6 files changed, 167 insertions(+) create mode 100644 net/core/skb_fault_injection.c (limited to 'lib') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1518343bbe22..2fb830453dcc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1546,6 +1546,7 @@ failslab= fail_usercopy= fail_page_alloc= + fail_skb_realloc= fail_make_request=[KNL] General fault injection mechanism. Format: ,,, diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index 8b8aeea71c68..1c14ba08fbfc 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -45,6 +45,32 @@ Available fault injection capabilities ALLOW_ERROR_INJECTION() macro, by setting debugfs entries under /sys/kernel/debug/fail_function. No boot option supported. +- fail_skb_realloc + + inject skb (socket buffer) reallocation events into the network path. The + primary goal is to identify and prevent issues related to pointer + mismanagement in the network subsystem. By forcing skb reallocation at + strategic points, this feature creates scenarios where existing pointers to + skb headers become invalid. + + When the fault is injected and the reallocation is triggered, cached pointers + to skb headers and data no longer reference valid memory locations. This + deliberate invalidation helps expose code paths where proper pointer updating + is neglected after a reallocation event. + + By creating these controlled fault scenarios, the system can catch instances + where stale pointers are used, potentially leading to memory corruption or + system instability. + + To select the interface to act on, write the network name to + /sys/kernel/debug/fail_skb_realloc/devname. + If this field is left empty (which is the default value), skb reallocation + will be forced on all network interfaces. + + The effectiveness of this fault detection is enhanced when KASAN is + enabled, as it helps identify invalid memory references and use-after-free + (UAF) issues. + - NVMe fault injection inject NVMe status code and retry flag on devices permitted by setting @@ -216,6 +242,19 @@ configuration of fault-injection capabilities. use a negative errno, you better use 'printf' instead of 'echo', e.g.: $ printf %#x -12 > retval +- /sys/kernel/debug/fail_skb_realloc/devname: + + Specifies the network interface on which to force SKB reallocation. If + left empty, SKB reallocation will be applied to all network interfaces. + + Example usage:: + + # Force skb reallocation on eth0 + echo "eth0" > /sys/kernel/debug/fail_skb_realloc/devname + + # Clear the selection and force skb reallocation on all interfaces + echo "" > /sys/kernel/debug/fail_skb_realloc/devname + Boot option ^^^^^^^^^^^ @@ -227,6 +266,7 @@ use the boot option:: fail_usercopy= fail_make_request= fail_futex= + fail_skb_realloc= mmc_core.fail_request=,,, proc entries diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 60535c706851..58009fa66102 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2682,6 +2682,12 @@ static inline void skb_assert_len(struct sk_buff *skb) #endif /* CONFIG_DEBUG_NET */ } +#if defined(CONFIG_FAIL_SKB_REALLOC) +void skb_might_realloc(struct sk_buff *skb); +#else +static inline void skb_might_realloc(struct sk_buff *skb) {} +#endif + /* * Add data to an sk_buff */ @@ -2782,6 +2788,7 @@ static inline enum skb_drop_reason pskb_may_pull_reason(struct sk_buff *skb, unsigned int len) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); + skb_might_realloc(skb); if (likely(len <= skb_headlen(skb))) return SKB_NOT_DROPPED_YET; @@ -3240,6 +3247,7 @@ static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) static inline int pskb_trim(struct sk_buff *skb, unsigned int len) { + skb_might_realloc(skb); return (len < skb->len) ? __pskb_trim(skb, len) : 0; } @@ -3994,6 +4002,7 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len); static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) { + skb_might_realloc(skb); if (likely(len >= skb->len)) return 0; return pskb_trim_rcsum_slow(skb, len); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7312ae7c3cc5..67b669d2e70e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2115,6 +2115,16 @@ config FAIL_SUNRPC Provide fault-injection capability for SunRPC and its consumers. +config FAIL_SKB_REALLOC + bool "Fault-injection capability forcing skb to reallocate" + depends on FAULT_INJECTION_DEBUG_FS + help + Provide fault-injection capability that forces the skb to be + reallocated, catching possible invalid pointers to the skb. + + For more information, check + Documentation/dev-tools/fault-injection/fault-injection.rst + config FAULT_INJECTION_CONFIGFS bool "Configfs interface for fault-injection capabilities" depends on FAULT_INJECTION diff --git a/net/core/Makefile b/net/core/Makefile index 5a72a87ee0f1..d9326600e289 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -46,3 +46,4 @@ obj-$(CONFIG_OF) += of_net.o obj-$(CONFIG_NET_TEST) += net_test.o obj-$(CONFIG_NET_DEVMEM) += devmem.o obj-$(CONFIG_DEBUG_NET_SMALL_RTNL) += rtnl_net_debug.o +obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o diff --git a/net/core/skb_fault_injection.c b/net/core/skb_fault_injection.c new file mode 100644 index 000000000000..4235db6bdfad --- /dev/null +++ b/net/core/skb_fault_injection.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +static struct { + struct fault_attr attr; + char devname[IFNAMSIZ]; + bool filtered; +} skb_realloc = { + .attr = FAULT_ATTR_INITIALIZER, + .filtered = false, +}; + +static bool should_fail_net_realloc_skb(struct sk_buff *skb) +{ + struct net_device *net = skb->dev; + + if (skb_realloc.filtered && + strncmp(net->name, skb_realloc.devname, IFNAMSIZ)) + /* device name filter set, but names do not match */ + return false; + + if (!should_fail(&skb_realloc.attr, 1)) + return false; + + return true; +} +ALLOW_ERROR_INJECTION(should_fail_net_realloc_skb, TRUE); + +void skb_might_realloc(struct sk_buff *skb) +{ + if (!should_fail_net_realloc_skb(skb)) + return; + + pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_might_realloc); + +static int __init fail_skb_realloc_setup(char *str) +{ + return setup_fault_attr(&skb_realloc.attr, str); +} +__setup("fail_skb_realloc=", fail_skb_realloc_setup); + +static void reset_settings(void) +{ + skb_realloc.filtered = false; + memset(&skb_realloc.devname, 0, IFNAMSIZ); +} + +static ssize_t devname_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + ssize_t ret; + + reset_settings(); + ret = simple_write_to_buffer(&skb_realloc.devname, IFNAMSIZ, + ppos, buffer, count); + if (ret < 0) + return ret; + + skb_realloc.devname[IFNAMSIZ - 1] = '\0'; + /* Remove a possible \n at the end of devname */ + strim(skb_realloc.devname); + + if (strnlen(skb_realloc.devname, IFNAMSIZ)) + skb_realloc.filtered = true; + + return count; +} + +static ssize_t devname_read(struct file *file, + char __user *buffer, + size_t size, loff_t *ppos) +{ + if (!skb_realloc.filtered) + return 0; + + return simple_read_from_buffer(buffer, size, ppos, &skb_realloc.devname, + strlen(skb_realloc.devname)); +} + +static const struct file_operations devname_ops = { + .write = devname_write, + .read = devname_read, +}; + +static int __init fail_skb_realloc_debugfs(void) +{ + umode_t mode = S_IFREG | 0600; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_skb_realloc", NULL, + &skb_realloc.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + debugfs_create_file("devname", mode, dir, NULL, &devname_ops); + + return 0; +} + +late_initcall(fail_skb_realloc_debugfs); -- cgit v1.2.3 From 0594ad6184598b5b9a6eb5619785f37f825e6ffd Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 7 Nov 2024 13:37:35 +0800 Subject: crypto: lib/mpi - Export mpi_set_bit This function is part of the exposed API and should be exported. Otherwise a modular user would fail to build, e.g., crypto/rsa. Signed-off-by: Herbert Xu --- lib/crypto/mpi/mpi-bit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/crypto/mpi/mpi-bit.c b/lib/crypto/mpi/mpi-bit.c index 835a2f0622a0..934d81311360 100644 --- a/lib/crypto/mpi/mpi-bit.c +++ b/lib/crypto/mpi/mpi-bit.c @@ -95,6 +95,7 @@ int mpi_set_bit(MPI a, unsigned int n) a->d[limbno] |= (A_LIMB_1< Date: Wed, 16 Oct 2024 23:41:52 +0800 Subject: mm/slub, kunit: Add testcase for krealloc redzone and zeroing Danilo Krummrich raised issue about krealloc+GFP_ZERO [1], and Vlastimil suggested to add some test case which can sanity test the kmalloc-redzone and zeroing by utilizing the kmalloc's 'orig_size' debug feature. It covers the grow and shrink case of krealloc() re-using current kmalloc object, and the case of re-allocating a new bigger object. [1]. https://lore.kernel.org/lkml/20240812223707.32049-1-dakr@kernel.org/ Suggested-by: Vlastimil Babka Signed-off-by: Feng Tang Signed-off-by: Vlastimil Babka --- lib/slub_kunit.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'lib') diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c index 33564f965958..f11691315c2f 100644 --- a/lib/slub_kunit.c +++ b/lib/slub_kunit.c @@ -192,6 +192,47 @@ static void test_leak_destroy(struct kunit *test) KUNIT_EXPECT_EQ(test, 2, slab_errors); } +static void test_krealloc_redzone_zeroing(struct kunit *test) +{ + u8 *p; + int i; + struct kmem_cache *s = test_kmem_cache_create("TestSlub_krealloc", 64, + SLAB_KMALLOC|SLAB_STORE_USER|SLAB_RED_ZONE); + + p = alloc_hooks(__kmalloc_cache_noprof(s, GFP_KERNEL, 48)); + memset(p, 0xff, 48); + + kasan_disable_current(); + OPTIMIZER_HIDE_VAR(p); + + /* Test shrink */ + p = krealloc(p, 40, GFP_KERNEL | __GFP_ZERO); + for (i = 40; i < 64; i++) + KUNIT_EXPECT_EQ(test, p[i], SLUB_RED_ACTIVE); + + /* Test grow within the same 64B kmalloc object */ + p = krealloc(p, 56, GFP_KERNEL | __GFP_ZERO); + for (i = 40; i < 56; i++) + KUNIT_EXPECT_EQ(test, p[i], 0); + for (i = 56; i < 64; i++) + KUNIT_EXPECT_EQ(test, p[i], SLUB_RED_ACTIVE); + + validate_slab_cache(s); + KUNIT_EXPECT_EQ(test, 0, slab_errors); + + memset(p, 0xff, 56); + /* Test grow with allocating a bigger 128B object */ + p = krealloc(p, 112, GFP_KERNEL | __GFP_ZERO); + for (i = 0; i < 56; i++) + KUNIT_EXPECT_EQ(test, p[i], 0xff); + for (i = 56; i < 112; i++) + KUNIT_EXPECT_EQ(test, p[i], 0); + + kfree(p); + kasan_enable_current(); + kmem_cache_destroy(s); +} + static int test_init(struct kunit *test) { slab_errors = 0; @@ -214,6 +255,7 @@ static struct kunit_case test_cases[] = { KUNIT_CASE(test_kmalloc_redzone_access), KUNIT_CASE(test_kfree_rcu), KUNIT_CASE(test_leak_destroy), + KUNIT_CASE(test_krealloc_redzone_zeroing), {} }; -- cgit v1.2.3 From f06e108a3dc53c0f5234d18de0bd224753db5019 Mon Sep 17 00:00:00 2001 From: Jan Hendrik Farr Date: Tue, 29 Oct 2024 15:00:36 +0100 Subject: Compiler Attributes: disable __counted_by for clang < 19.1.3 This patch disables __counted_by for clang versions < 19.1.3 because of the two issues listed below. It does this by introducing CONFIG_CC_HAS_COUNTED_BY. 1. clang < 19.1.2 has a bug that can lead to __bdos returning 0: https://github.com/llvm/llvm-project/pull/110497 2. clang < 19.1.3 has a bug that can lead to __bdos being off by 4: https://github.com/llvm/llvm-project/pull/112636 Fixes: c8248faf3ca2 ("Compiler Attributes: counted_by: Adjust name and identifier expansion") Cc: stable@vger.kernel.org # 6.6.x: 16c31dd7fdf6: Compiler Attributes: counted_by: bump min gcc version Cc: stable@vger.kernel.org # 6.6.x: 2993eb7a8d34: Compiler Attributes: counted_by: fixup clang URL Cc: stable@vger.kernel.org # 6.6.x: 231dc3f0c936: lkdtm/bugs: Improve warning message for compilers without counted_by support Cc: stable@vger.kernel.org # 6.6.x Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/all/20240913164630.GA4091534@thelio-3990X/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202409260949.a1254989-oliver.sang@intel.com Link: https://lore.kernel.org/all/Zw8iawAF5W2uzGuh@archlinux/T/#m204c09f63c076586a02d194b87dffc7e81b8de7b Suggested-by: Nathan Chancellor Signed-off-by: Jan Hendrik Farr Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Reviewed-by: Miguel Ojeda Reviewed-by: Thorsten Blum Link: https://lore.kernel.org/r/20241029140036.577804-2-kernel@jfarr.cc Signed-off-by: Kees Cook --- drivers/misc/lkdtm/bugs.c | 2 +- include/linux/compiler_attributes.h | 13 ------------- include/linux/compiler_types.h | 19 +++++++++++++++++++ init/Kconfig | 9 +++++++++ lib/overflow_kunit.c | 2 +- 5 files changed, 30 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c index 62ba01525479..376047beea3d 100644 --- a/drivers/misc/lkdtm/bugs.c +++ b/drivers/misc/lkdtm/bugs.c @@ -445,7 +445,7 @@ static void lkdtm_FAM_BOUNDS(void) pr_err("FAIL: survived access of invalid flexible array member index!\n"); - if (!__has_attribute(__counted_by__)) + if (!IS_ENABLED(CONFIG_CC_HAS_COUNTED_BY)) pr_warn("This is expected since this %s was built with a compiler that does not support __counted_by\n", lkdtm_kernel_info); else if (IS_ENABLED(CONFIG_UBSAN_BOUNDS)) diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 32284cd26d52..c16d4199bf92 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -94,19 +94,6 @@ # define __copy(symbol) #endif -/* - * Optional: only supported since gcc >= 15 - * Optional: only supported since clang >= 18 - * - * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896 - * clang: https://github.com/llvm/llvm-project/pull/76348 - */ -#if __has_attribute(__counted_by__) -# define __counted_by(member) __attribute__((__counted_by__(member))) -#else -# define __counted_by(member) -#endif - /* * Optional: not supported by gcc * Optional: only supported since clang >= 14.0 diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 1a957ea2f4fe..639be0f30b45 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -323,6 +323,25 @@ struct ftrace_likely_data { #define __no_sanitize_or_inline __always_inline #endif +/* + * Optional: only supported since gcc >= 15 + * Optional: only supported since clang >= 18 + * + * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896 + * clang: https://github.com/llvm/llvm-project/pull/76348 + * + * __bdos on clang < 19.1.2 can erroneously return 0: + * https://github.com/llvm/llvm-project/pull/110497 + * + * __bdos on clang < 19.1.3 can be off by 4: + * https://github.com/llvm/llvm-project/pull/112636 + */ +#ifdef CONFIG_CC_HAS_COUNTED_BY +# define __counted_by(member) __attribute__((__counted_by__(member))) +#else +# define __counted_by(member) +#endif + /* * Apply __counted_by() when the Endianness matches to increase test coverage. */ diff --git a/init/Kconfig b/init/Kconfig index 530a382ee0fe..92f106cf5572 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -116,6 +116,15 @@ config CC_HAS_ASM_INLINE config CC_HAS_NO_PROFILE_FN_ATTR def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror) +config CC_HAS_COUNTED_BY + # TODO: when gcc 15 is released remove the build test and add + # a gcc version check + def_bool $(success,echo 'struct flex { int count; int array[] __attribute__((__counted_by__(count))); };' | $(CC) $(CLANG_FLAGS) -x c - -c -o /dev/null -Werror) + # clang needs to be at least 19.1.3 to avoid __bdos miscalculations + # https://github.com/llvm/llvm-project/pull/110497 + # https://github.com/llvm/llvm-project/pull/112636 + depends on !(CC_IS_CLANG && CLANG_VERSION < 190103) + config PAHOLE_VERSION int default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE)) diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c index 2abc78367dd1..5222c6393f11 100644 --- a/lib/overflow_kunit.c +++ b/lib/overflow_kunit.c @@ -1187,7 +1187,7 @@ static void DEFINE_FLEX_test(struct kunit *test) { /* Using _RAW_ on a __counted_by struct will initialize "counter" to zero */ DEFINE_RAW_FLEX(struct foo, two_but_zero, array, 2); -#if __has_attribute(__counted_by__) +#ifdef CONFIG_CC_HAS_COUNTED_BY int expected_raw_size = sizeof(struct foo); #else int expected_raw_size = sizeof(struct foo) + 2 * sizeof(s16); -- cgit v1.2.3 From 39e21403c978862846fa68b7f6d06f9cca235194 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Tue, 12 Nov 2024 16:03:14 +0800 Subject: kunit: string-stream: Fix a UAF bug in kunit_init_suite() In kunit_debugfs_create_suite(), if alloc_string_stream() fails in the kunit_suite_for_each_test_case() loop, the "suite->log = stream" has assigned before, and the error path only free the suite->log's stream memory but not set it to NULL, so the later string_stream_clear() of suite->log in kunit_init_suite() will cause below UAF bug. Set stream pointer to NULL after free to fix it. Unable to handle kernel paging request at virtual address 006440150000030d Mem abort info: ESR = 0x0000000096000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault Data abort info: ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [006440150000030d] address between user and kernel address ranges Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: iio_test_gts industrialio_gts_helper cfg80211 rfkill ipv6 [last unloaded: iio_test_gts] CPU: 5 UID: 0 PID: 6253 Comm: modprobe Tainted: G B W N 6.12.0-rc4+ #458 Tainted: [B]=BAD_PAGE, [W]=WARN, [N]=TEST Hardware name: linux,dummy-virt (DT) pstate: 40000005 (nZcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : string_stream_clear+0x54/0x1ac lr : string_stream_clear+0x1a8/0x1ac sp : ffffffc080b47410 x29: ffffffc080b47410 x28: 006440550000030d x27: ffffff80c96b5e98 x26: ffffff80c96b5e80 x25: ffffffe461b3f6c0 x24: 0000000000000003 x23: ffffff80c96b5e88 x22: 1ffffff019cdf4fc x21: dfffffc000000000 x20: ffffff80ce6fa7e0 x19: 032202a80000186d x18: 0000000000001840 x17: 0000000000000000 x16: 0000000000000000 x15: ffffffe45c355cb4 x14: ffffffe45c35589c x13: ffffffe45c03da78 x12: ffffffb810168e75 x11: 1ffffff810168e74 x10: ffffffb810168e74 x9 : dfffffc000000000 x8 : 0000000000000004 x7 : 0000000000000003 x6 : 0000000000000001 x5 : ffffffc080b473a0 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 0000000000000001 x1 : ffffffe462fbf620 x0 : dfffffc000000000 Call trace: string_stream_clear+0x54/0x1ac __kunit_test_suites_init+0x108/0x1d8 kunit_exec_run_tests+0xb8/0x100 kunit_module_notify+0x400/0x55c notifier_call_chain+0xfc/0x3b4 blocking_notifier_call_chain+0x68/0x9c do_init_module+0x24c/0x5c8 load_module+0x4acc/0x4e90 init_module_from_file+0xd4/0x128 idempotent_init_module+0x2d4/0x57c __arm64_sys_finit_module+0xac/0x100 invoke_syscall+0x6c/0x258 el0_svc_common.constprop.0+0x160/0x22c do_el0_svc+0x44/0x5c el0_svc+0x48/0xb8 el0t_64_sync_handler+0x13c/0x158 el0t_64_sync+0x190/0x194 Code: f9400753 d2dff800 f2fbffe0 d343fe7c (38e06b80) ---[ end trace 0000000000000000 ]--- Kernel panic - not syncing: Oops: Fatal exception Link: https://lore.kernel.org/r/20241112080314.407966-1-ruanjinjie@huawei.com Cc: stable@vger.kernel.org Fixes: a3fdf784780c ("kunit: string-stream: Decouple string_stream from kunit") Suggested-by: Kuan-Wei Chiu Signed-off-by: Jinjie Ruan Reviewed-by: Kuan-Wei Chiu Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/debugfs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kunit/debugfs.c b/lib/kunit/debugfs.c index d548750a325a..b25d214b93e1 100644 --- a/lib/kunit/debugfs.c +++ b/lib/kunit/debugfs.c @@ -212,8 +212,11 @@ void kunit_debugfs_create_suite(struct kunit_suite *suite) err: string_stream_destroy(suite->log); - kunit_suite_for_each_test_case(suite, test_case) + suite->log = NULL; + kunit_suite_for_each_test_case(suite, test_case) { string_stream_destroy(test_case->log); + test_case->log = NULL; + } } void kunit_debugfs_destroy_suite(struct kunit_suite *suite) -- cgit v1.2.3 From 435c20eed572a95709b1536ff78832836b2f91b1 Mon Sep 17 00:00:00 2001 From: Zichen Xie Date: Thu, 14 Nov 2024 23:43:36 -0600 Subject: kunit: Fix potential null dereference in kunit_device_driver_test() kunit_kzalloc() may return a NULL pointer, dereferencing it without NULL check may lead to NULL dereference. Add a NULL check for test_state. Link: https://lore.kernel.org/r/20241115054335.21673-1-zichenxie0106@gmail.com Fixes: d03c720e03bd ("kunit: Add APIs for managing devices") Signed-off-by: Zichen Xie Cc: stable@vger.kernel.org Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/kunit-test.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'lib') diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c index 37e02be1e710..d9c781c859fd 100644 --- a/lib/kunit/kunit-test.c +++ b/lib/kunit/kunit-test.c @@ -805,6 +805,8 @@ static void kunit_device_driver_test(struct kunit *test) struct device *test_device; struct driver_test_state *test_state = kunit_kzalloc(test, sizeof(*test_state), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, test_state); + test->priv = test_state; test_driver = kunit_driver_create(test, "my_driver"); -- cgit v1.2.3 From 95b6d723a00710f129b81556c93fdd994c105ab1 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 15 Nov 2024 00:55:58 +0800 Subject: kunit: debugfs: Use IS_ERR() for alloc_string_stream() error check The alloc_string_stream() function only returns ERR_PTR(-ENOMEM) on failure and never returns NULL. Therefore, switching the error check in the caller from IS_ERR_OR_NULL to IS_ERR improves clarity, indicating that this function will return an error pointer (not NULL) when an error occurs. This change avoids any ambiguity regarding the function's return behavior. Link: https://lore.kernel.org/lkml/Zy9deU5VK3YR+r9N@visitorckw-System-Product-Name Signed-off-by: Kuan-Wei Chiu Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/kunit/debugfs.c b/lib/kunit/debugfs.c index b25d214b93e1..af71911f4a07 100644 --- a/lib/kunit/debugfs.c +++ b/lib/kunit/debugfs.c @@ -181,7 +181,7 @@ void kunit_debugfs_create_suite(struct kunit_suite *suite) * successfully. */ stream = alloc_string_stream(GFP_KERNEL); - if (IS_ERR_OR_NULL(stream)) + if (IS_ERR(stream)) return; string_stream_set_append_newlines(stream, true); @@ -189,7 +189,7 @@ void kunit_debugfs_create_suite(struct kunit_suite *suite) kunit_suite_for_each_test_case(suite, test_case) { stream = alloc_string_stream(GFP_KERNEL); - if (IS_ERR_OR_NULL(stream)) + if (IS_ERR(stream)) goto err; string_stream_set_append_newlines(stream, true); -- cgit v1.2.3 From 7ea13556f7d287fb55f7cacc316ff7647550c702 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 27 Nov 2024 14:10:57 -0800 Subject: selftests: kallsyms: fix double build stupidity The current arrangement will have the test modules rebuilt on any make without having the script or code actually change. Take Masahiro Yamada's suggested fix and cleanups on the Makefile to fix this. Suggested-by: Masahiro Yamada Reported-by: Linus Torvalds Fixes: 84b4a51fce4ccc66 ("selftests: add new kallsyms selftests") Closes: https://lore.kernel.org/all/CAK7LNATRDODmfz1tE=inV-DQqPA4G9vKH+38zMbaGdpTuFWZFw@mail.gmail.com/T/#me6c8f98e82acbee6e75a31b34bbb543eb4940b15 Signed-off-by: Luis Chamberlain --- lib/tests/module/Makefile | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/tests/module/Makefile b/lib/tests/module/Makefile index af5c27b996cb..2f3e1a772c2c 100644 --- a/lib/tests/module/Makefile +++ b/lib/tests/module/Makefile @@ -3,13 +3,12 @@ obj-$(CONFIG_TEST_KALLSYMS_B) += test_kallsyms_b.o obj-$(CONFIG_TEST_KALLSYMS_C) += test_kallsyms_c.o obj-$(CONFIG_TEST_KALLSYMS_D) += test_kallsyms_d.o -$(obj)/%.c: FORCE - @$(kecho) " GEN $@" - $(Q)$(srctree)/lib/tests/module/gen_test_kallsyms.sh $@\ - $(CONFIG_TEST_KALLSYMS_NUMSYMS) \ - $(CONFIG_TEST_KALLSYMS_SCALE_FACTOR) +quiet_cmd_gen_test_kallsyms = GEN $@ + cmd_gen_test_kallsyms = $< $@ \ + $(CONFIG_TEST_KALLSYMS_NUMSYMS) \ + $(CONFIG_TEST_KALLSYMS_SCALE_FACTOR) -clean-files += test_kallsyms_a.c -clean-files += test_kallsyms_b.c -clean-files += test_kallsyms_c.c -clean-files += test_kallsyms_d.c +$(obj)/%.c: $(src)/gen_test_kallsyms.sh FORCE + $(call if_changed,gen_test_kallsyms) + +targets += $(foreach x, a b c d, test_kallsyms_$(x).c) -- cgit v1.2.3 From 3e1d95b63c97506d0d98c75fc72a60662981a3c6 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 27 Nov 2024 19:06:03 -0800 Subject: selftests: kallsyms: fix and clarify current test boundaries Provide and clarify the existing ranges and what you should expect. Fix the gen_test_kallsyms.sh script to accept different ranges. Fixes: 84b4a51fce4ccc66 ("selftests: add new kallsyms selftests") Signed-off-by: Luis Chamberlain --- lib/Kconfig.debug | 32 +++++++++++++++++++++++++++++++- lib/tests/module/gen_test_kallsyms.sh | 9 +++++++-- 2 files changed, 38 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f340017585c5..f3d723705879 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -3003,9 +3003,39 @@ config TEST_KALLSYMS_D tristate depends on m +choice + prompt "Kallsym test range" + default TEST_KALLSYMS_LARGE + help + Selecting something other than "Fast" will enable tests which slow + down the build and may crash your build. + +config TEST_KALLSYMS_FAST + bool "Fast builds" + help + You won't really be testing kallsysms, so this just helps fast builds + when allmodconfig is used.. + +config TEST_KALLSYMS_LARGE + bool "Enable testing kallsyms with large exports" + help + This will enable larger number of symbols. This will slow down + your build considerably. + +config TEST_KALLSYMS_MAX + bool "Known kallsysms limits" + help + This will enable exports to the point we know we'll start crashing + builds. + +endchoice + config TEST_KALLSYMS_NUMSYMS int "test kallsyms number of symbols" - default 100 + range 2 10000 + default 2 if TEST_KALLSYMS_FAST + default 100 if TEST_KALLSYMS_LARGE + default 10000 if TEST_KALLSYMS_MAX help The number of symbols to create on TEST_KALLSYMS_A, only one of which module TEST_KALLSYMS_B will use. This also will be used diff --git a/lib/tests/module/gen_test_kallsyms.sh b/lib/tests/module/gen_test_kallsyms.sh index 3f2c626350ad..561dcac0f359 100755 --- a/lib/tests/module/gen_test_kallsyms.sh +++ b/lib/tests/module/gen_test_kallsyms.sh @@ -7,6 +7,11 @@ NUM_SYMS=$2 SCALE_FACTOR=$3 TEST_TYPE=$(echo $TARGET | sed -e 's|lib/tests/module/test_kallsyms_||g') TEST_TYPE=$(echo $TEST_TYPE | sed -e 's|.c||g') +FIRST_B_LOOKUP=1 + +if [[ $NUM_SYMS -gt 2 ]]; then + FIRST_B_LOOKUP=$((NUM_SYMS/2)) +fi gen_template_module_header() { @@ -52,10 +57,10 @@ ____END_MODULE gen_template_module_data_b() { - printf "\nextern int auto_test_a_%010d;\n\n" 28 + printf "\nextern int auto_test_a_%010d;\n\n" $FIRST_B_LOOKUP echo "static int auto_runtime_test(void)" echo "{" - printf "\nreturn auto_test_a_%010d;\n" 28 + printf "\nreturn auto_test_a_%010d;\n" $FIRST_B_LOOKUP echo "}" } -- cgit v1.2.3 From f69e63756f7822fcdad8a34f9967e8b243e883ee Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 2 Oct 2024 18:31:47 +0100 Subject: printf: Remove unused 'bprintf' bprintf() is unused. Remove it. It was added in the commit 4370aa4aa753 ("vsprintf: add binary printf") but as far as I can see was never used, unlike the other two functions in that patch. Link: https://lore.kernel.org/20241002173147.210107-1-linux@treblig.org Reviewed-by: Andy Shevchenko Acked-by: Petr Mladek Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Steven Rostedt (Google) --- include/linux/string.h | 1 - lib/vsprintf.c | 23 ----------------------- 2 files changed, 24 deletions(-) (limited to 'lib') diff --git a/include/linux/string.h b/include/linux/string.h index 0dd27afcfaf7..493ac4862c77 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -335,7 +335,6 @@ int __sysfs_match_string(const char * const *array, size_t n, const char *s); #ifdef CONFIG_BINARY_PRINTF int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); -int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 6ac02bbb7df1..9d3dac38a3f4 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -3428,29 +3428,6 @@ out: } EXPORT_SYMBOL_GPL(bstr_printf); -/** - * bprintf - Parse a format string and place args' binary value in a buffer - * @bin_buf: The buffer to place args' binary value - * @size: The size of the buffer(by words(32bits), not characters) - * @fmt: The format string to use - * @...: Arguments for the format string - * - * The function returns the number of words(u32) written - * into @bin_buf. - */ -int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) -{ - va_list args; - int ret; - - va_start(args, fmt); - ret = vbin_printf(bin_buf, size, fmt, args); - va_end(args); - - return ret; -} -EXPORT_SYMBOL_GPL(bprintf); - #endif /* CONFIG_BINARY_PRINTF */ /** -- cgit v1.2.3 From 9022ed0e7e65734d83a0648648589b9fbea8e8c9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Dec 2024 09:23:33 -0800 Subject: strscpy: write destination buffer only once The point behind strscpy() was to once and for all avoid all the problems with 'strncpy()' and later broken "fixed" versions like strlcpy() that just made things worse. So strscpy not only guarantees NUL-termination (unlike strncpy), it also doesn't do unnecessary padding at the destination. But at the same time also avoids byte-at-a-time reads and writes by _allowing_ some extra NUL writes - within the size, of course - so that the whole copy can be done with word operations. It is also stable in the face of a mutable source string: it explicitly does not read the source buffer multiple times (so an implementation using "strnlen()+memcpy()" would be wrong), and does not read the source buffer past the size (like the mis-design that is strlcpy does). Finally, the return value is designed to be simple and unambiguous: if the string cannot be copied fully, it returns an actual negative error, making error handling clearer and simpler (and the caller already knows the size of the buffer). Otherwise it returns the string length of the result. However, there was one final stability issue that can be important to callers: the stability of the destination buffer. In particular, the same way we shouldn't read the source buffer more than once, we should avoid doing multiple writes to the destination buffer: first writing a potentially non-terminated string, and then terminating it with NUL at the end does not result in a stable result buffer. Yes, it gives the right result in the end, but if the rule for the destination buffer was that it is _always_ NUL-terminated even when accessed concurrently with updates, the final byte of the buffer needs to always _stay_ as a NUL byte. [ Note that "final byte is NUL" here is literally about the final byte in the destination array, not the terminating NUL at the end of the string itself. There is no attempt to try to make concurrent reads and writes give any kind of consistent string length or contents, but we do want to guarantee that there is always at least that final terminating NUL character at the end of the destination array if it existed before ] This is relevant in the kernel for the tsk->comm[] array, for example. Even without locking (for either readers or writers), we want to know that while the buffer contents may be garbled, it is always a valid C string and always has a NUL character at 'comm[TASK_COMM_LEN-1]' (and never has any "out of thin air" data). So avoid any "copy possibly non-terminated string, and terminate later" behavior, and write the destination buffer only once. Signed-off-by: Linus Torvalds --- lib/string.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/string.c b/lib/string.c index 76327b51e36f..eb4486ed40d2 100644 --- a/lib/string.c +++ b/lib/string.c @@ -104,6 +104,12 @@ char *strncpy(char *dest, const char *src, size_t count) EXPORT_SYMBOL(strncpy); #endif +#ifdef __BIG_ENDIAN +# define ALLBUTLAST_BYTE_MASK (~255ul) +#else +# define ALLBUTLAST_BYTE_MASK (~0ul >> 8) +#endif + ssize_t sized_strscpy(char *dest, const char *src, size_t count) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; @@ -147,13 +153,18 @@ ssize_t sized_strscpy(char *dest, const char *src, size_t count) *(unsigned long *)(dest+res) = c & zero_bytemask(data); return res + find_zero(data); } + count -= sizeof(unsigned long); + if (unlikely(!count)) { + c &= ALLBUTLAST_BYTE_MASK; + *(unsigned long *)(dest+res) = c; + return -E2BIG; + } *(unsigned long *)(dest+res) = c; res += sizeof(unsigned long); - count -= sizeof(unsigned long); max -= sizeof(unsigned long); } - while (count) { + while (count > 1) { char c; c = src[res]; @@ -164,11 +175,11 @@ ssize_t sized_strscpy(char *dest, const char *src, size_t count) count--; } - /* Hit buffer length without finding a NUL; force NUL-termination. */ - if (res) - dest[res-1] = '\0'; + /* Force NUL-termination. */ + dest[res] = '\0'; - return -E2BIG; + /* Return E2BIG if the source didn't stop */ + return src[res] ? -E2BIG : res; } EXPORT_SYMBOL(sized_strscpy); -- cgit v1.2.3